diff options
Diffstat (limited to 'proto/bgp/bgp.c')
-rw-r--r-- | proto/bgp/bgp.c | 1511 |
1 files changed, 947 insertions, 564 deletions
diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 0f1c9446..5e95e6b4 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -2,6 +2,8 @@ * BIRD -- The Border Gateway Protocol * * (c) 2000 Martin Mares <mj@ucw.cz> + * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2008--2016 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -9,48 +11,52 @@ /** * DOC: Border Gateway Protocol * - * The BGP protocol is implemented in three parts: |bgp.c| which takes care of the - * connection and most of the interface with BIRD core, |packets.c| handling + * The BGP protocol is implemented in three parts: |bgp.c| which takes care of + * the connection and most of the interface with BIRD core, |packets.c| handling * both incoming and outgoing BGP packets and |attrs.c| containing functions for * manipulation with BGP attribute lists. * - * As opposed to the other existing routing daemons, BIRD has a sophisticated core - * architecture which is able to keep all the information needed by BGP in the - * primary routing table, therefore no complex data structures like a central - * BGP table are needed. This increases memory footprint of a BGP router with - * many connections, but not too much and, which is more important, it makes - * BGP much easier to implement. + * As opposed to the other existing routing daemons, BIRD has a sophisticated + * core architecture which is able to keep all the information needed by BGP in + * the primary routing table, therefore no complex data structures like a + * central BGP table are needed. This increases memory footprint of a BGP router + * with many connections, but not too much and, which is more important, it + * makes BGP much easier to implement. * - * Each instance of BGP (corresponding to a single BGP peer) is described by a &bgp_proto - * structure to which are attached individual connections represented by &bgp_connection - * (usually, there exists only one connection, but during BGP session setup, there - * can be more of them). The connections are handled according to the BGP state machine - * defined in the RFC with all the timers and all the parameters configurable. + * Each instance of BGP (corresponding to a single BGP peer) is described by a + * &bgp_proto structure to which are attached individual connections represented + * by &bgp_connection (usually, there exists only one connection, but during BGP + * session setup, there can be more of them). The connections are handled + * according to the BGP state machine defined in the RFC with all the timers and + * all the parameters configurable. * - * In incoming direction, we listen on the connection's socket and each time we receive - * some input, we pass it to bgp_rx(). It decodes packet headers and the markers and - * passes complete packets to bgp_rx_packet() which distributes the packet according - * to its type. + * In incoming direction, we listen on the connection's socket and each time we + * receive some input, we pass it to bgp_rx(). It decodes packet headers and the + * markers and passes complete packets to bgp_rx_packet() which distributes the + * packet according to its type. * - * In outgoing direction, we gather all the routing updates and sort them to buckets - * (&bgp_bucket) according to their attributes (we keep a hash table for fast comparison - * of &rta's and a &fib which helps us to find if we already have another route for - * the same destination queued for sending, so that we can replace it with the new one - * immediately instead of sending both updates). There also exists a special bucket holding - * all the route withdrawals which cannot be queued anywhere else as they don't have any - * attributes. If we have any packet to send (due to either new routes or the connection - * tracking code wanting to send a Open, Keepalive or Notification message), we call - * bgp_schedule_packet() which sets the corresponding bit in a @packet_to_send - * bit field in &bgp_conn and as soon as the transmit socket buffer becomes empty, - * we call bgp_fire_tx(). It inspects state of all the packet type bits and calls - * the corresponding bgp_create_xx() functions, eventually rescheduling the same packet - * type if we have more data of the same type to send. + * In outgoing direction, we gather all the routing updates and sort them to + * buckets (&bgp_bucket) according to their attributes (we keep a hash table for + * fast comparison of &rta's and a &fib which helps us to find if we already + * have another route for the same destination queued for sending, so that we + * can replace it with the new one immediately instead of sending both + * updates). There also exists a special bucket holding all the route + * withdrawals which cannot be queued anywhere else as they don't have any + * attributes. If we have any packet to send (due to either new routes or the + * connection tracking code wanting to send a Open, Keepalive or Notification + * message), we call bgp_schedule_packet() which sets the corresponding bit in a + * @packet_to_send bit field in &bgp_conn and as soon as the transmit socket + * buffer becomes empty, we call bgp_fire_tx(). It inspects state of all the + * packet type bits and calls the corresponding bgp_create_xx() functions, + * eventually rescheduling the same packet type if we have more data of the same + * type to send. * - * The processing of attributes consists of two functions: bgp_decode_attrs() for checking - * of the attribute blocks and translating them to the language of BIRD's extended attributes - * and bgp_encode_attrs() which does the converse. Both functions are built around a - * @bgp_attr_table array describing all important characteristics of all known attributes. - * Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams. + * The processing of attributes consists of two functions: bgp_decode_attrs() + * for checking of the attribute blocks and translating them to the language of + * BIRD's extended attributes and bgp_encode_attrs() which does the + * converse. Both functions are built around a @bgp_attr_table array describing + * all important characteristics of all known attributes. Unknown transitive + * attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams. * * BGP protocol implements graceful restart in both restarting (local restart) * and receiving (neighbor restart) roles. The first is handled mostly by the @@ -80,70 +86,143 @@ struct linpool *bgp_linpool; /* Global temporary pool */ -static sock *bgp_listen_sk; /* Global listening socket */ -static int bgp_counter; /* Number of protocol instances using the listening socket */ +static list bgp_sockets; /* Global list of listening sockets */ + -static void bgp_close(struct bgp_proto *p, int apply_md5); static void bgp_connect(struct bgp_proto *p); static void bgp_active(struct bgp_proto *p); -static sock *bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags); static void bgp_update_bfd(struct bgp_proto *p, int use_bfd); +static int bgp_incoming_connection(sock *sk, uint dummy UNUSED); +static void bgp_listen_sock_err(sock *sk UNUSED, int err); /** * bgp_open - open a BGP instance * @p: BGP instance * - * This function allocates and configures shared BGP resources. - * Should be called as the last step during initialization - * (when lock is acquired and neighbor is ready). - * When error, state changed to PS_DOWN, -1 is returned and caller - * should return immediately. + * This function allocates and configures shared BGP resources, mainly listening + * sockets. Should be called as the last step during initialization (when lock + * is acquired and neighbor is ready). When error, caller should change state to + * PS_DOWN and return immediately. */ static int bgp_open(struct bgp_proto *p) { - struct config *cfg = p->cf->c.global; - int errcode; + struct bgp_socket *bs = NULL; + struct iface *ifa = p->cf->strict_bind ? p->cf->iface : NULL; + ip_addr addr = p->cf->strict_bind ? p->cf->local_ip : + (ipa_is_ip4(p->cf->remote_ip) ? IPA_NONE4 : IPA_NONE6); + uint port = p->cf->local_port; + + /* FIXME: Add some global init? */ + if (!bgp_linpool) + init_list(&bgp_sockets); - if (!bgp_listen_sk) - bgp_listen_sk = bgp_setup_listen_sk(cfg->listen_bgp_addr, cfg->listen_bgp_port, cfg->listen_bgp_flags); + /* We assume that cf->iface is defined iff cf->local_ip is link-local */ - if (!bgp_listen_sk) + WALK_LIST(bs, bgp_sockets) + if (ipa_equal(bs->sk->saddr, addr) && (bs->sk->iface == ifa) && (bs->sk->sport == port)) { - errcode = BEM_NO_SOCKET; - goto err; + bs->uc++; + p->sock = bs; + return 0; } - if (!bgp_linpool) - bgp_linpool = lp_new(&root_pool, 4080); + sock *sk = sk_new(proto_pool); + sk->type = SK_TCP_PASSIVE; + sk->ttl = 255; + sk->saddr = addr; + sk->sport = port; + sk->flags = 0; + sk->tos = IP_PREC_INTERNET_CONTROL; + sk->rbsize = BGP_RX_BUFFER_SIZE; + sk->tbsize = BGP_TX_BUFFER_SIZE; + sk->rx_hook = bgp_incoming_connection; + sk->err_hook = bgp_listen_sock_err; + + if (sk_open(sk) < 0) + goto err; - bgp_counter++; + bs = mb_allocz(proto_pool, sizeof(struct bgp_socket)); + bs->sk = sk; + bs->uc = 1; + p->sock = bs; - if (p->cf->password) - if (sk_set_md5_auth(bgp_listen_sk, p->cf->source_addr, p->cf->remote_ip, - p->cf->iface, p->cf->password, p->cf->setkey) < 0) - { - sk_log_error(bgp_listen_sk, p->p.name); - bgp_close(p, 0); - errcode = BEM_INVALID_MD5; - goto err; - } + add_tail(&bgp_sockets, &bs->n); + + if (!bgp_linpool) + bgp_linpool = lp_new(proto_pool, 4080); return 0; err: - p->p.disabled = 1; - bgp_store_error(p, NULL, BE_MISC, errcode); - proto_notify_state(&p->p, PS_DOWN); + sk_log_error(sk, p->p.name); + log(L_ERR "%s: Cannot open listening socket", p->p.name); + rfree(sk); return -1; } +/** + * bgp_close - close a BGP instance + * @p: BGP instance + * + * This function frees and deconfigures shared BGP resources. + */ +static void +bgp_close(struct bgp_proto *p) +{ + struct bgp_socket *bs = p->sock; + + ASSERT(bs && bs->uc); + + if (--bs->uc) + return; + + rfree(bs->sk); + rem_node(&bs->n); + mb_free(bs); + + if (!EMPTY_LIST(bgp_sockets)) + return; + + rfree(bgp_linpool); + bgp_linpool = NULL; +} + +static inline int +bgp_setup_auth(struct bgp_proto *p, int enable) +{ + if (p->cf->password) + { + int rv = sk_set_md5_auth(p->sock->sk, + p->cf->local_ip, p->cf->remote_ip, p->cf->iface, + enable ? p->cf->password : NULL, p->cf->setkey); + + if (rv < 0) + sk_log_error(p->sock->sk, p->p.name); + + return rv; + } + else + return 0; +} + +static inline struct bgp_channel * +bgp_find_channel(struct bgp_proto *p, u32 afi) +{ + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + if (c->afi == afi) + return c; + + return NULL; +} + static void bgp_startup(struct bgp_proto *p) { BGP_TRACE(D_EVENTS, "Started"); - p->start_state = p->cf->capabilities ? BSS_CONNECT : BSS_CONNECT_NOCAP; + p->start_state = BSS_CONNECT; if (!p->cf->passive) bgp_active(p); @@ -159,50 +238,36 @@ bgp_startup_timeout(timer *t) static void bgp_initiate(struct bgp_proto *p) { - int rv = bgp_open(p); - if (rv < 0) - return; + int err_val; + + if (bgp_open(p) < 0) + { err_val = BEM_NO_SOCKET; goto err1; } + + if (bgp_setup_auth(p, 1) < 0) + { err_val = BEM_INVALID_MD5; goto err2; } if (p->cf->bfd) bgp_update_bfd(p, p->cf->bfd); if (p->startup_delay) - { - p->start_state = BSS_DELAY; - BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay); - bgp_start_timer(p->startup_timer, p->startup_delay); - } + { + p->start_state = BSS_DELAY; + BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay); + bgp_start_timer(p->startup_timer, p->startup_delay); + } else bgp_startup(p); -} -/** - * bgp_close - close a BGP instance - * @p: BGP instance - * @apply_md5: 0 to disable unsetting MD5 auth - * - * This function frees and deconfigures shared BGP resources. - * @apply_md5 is set to 0 when bgp_close is called as a cleanup - * from failed bgp_open(). - */ -static void -bgp_close(struct bgp_proto *p, int apply_md5) -{ - ASSERT(bgp_counter); - bgp_counter--; + return; - if (p->cf->password && apply_md5) - if (sk_set_md5_auth(bgp_listen_sk, p->cf->source_addr, p->cf->remote_ip, - p->cf->iface, NULL, p->cf->setkey) < 0) - sk_log_error(bgp_listen_sk, p->p.name); +err2: + bgp_close(p); +err1: + p->p.disabled = 1; + bgp_store_error(p, NULL, BE_MISC, err_val); + proto_notify_state(&p->p, PS_DOWN); - if (!bgp_counter) - { - rfree(bgp_listen_sk); - bgp_listen_sk = NULL; - rfree(bgp_linpool); - bgp_linpool = NULL; - } + return; } /** @@ -210,19 +275,19 @@ bgp_close(struct bgp_proto *p, int apply_md5) * @t: timer * @value: time to fire (0 to disable the timer) * - * This functions calls tm_start() on @t with time @value and the - * amount of randomization suggested by the BGP standard. Please use - * it for all BGP timers. + * This functions calls tm_start() on @t with time @value and the amount of + * randomization suggested by the BGP standard. Please use it for all BGP + * timers. */ void bgp_start_timer(timer *t, int value) { if (value) - { - /* The randomization procedure is specified in RFC 1771: 9.2.3.3 */ - t->randomize = value / 4; - tm_start(t, value - t->randomize); - } + { + /* The randomization procedure is specified in RFC 1771: 9.2.3.3 */ + t->randomize = value / 4; + tm_start(t, value - t->randomize); + } else tm_stop(t); } @@ -231,8 +296,8 @@ bgp_start_timer(timer *t, int value) * bgp_close_conn - close a BGP connection * @conn: connection to close * - * This function takes a connection described by the &bgp_conn structure, - * closes its socket and frees all resources associated with it. + * This function takes a connection described by the &bgp_conn structure, closes + * its socket and frees all resources associated with it. */ void bgp_close_conn(struct bgp_conn *conn) @@ -241,16 +306,22 @@ bgp_close_conn(struct bgp_conn *conn) DBG("BGP: Closing connection\n"); conn->packets_to_send = 0; - rfree(conn->connect_retry_timer); - conn->connect_retry_timer = NULL; + conn->channels_to_send = 0; + rfree(conn->connect_timer); + conn->connect_timer = NULL; rfree(conn->keepalive_timer); conn->keepalive_timer = NULL; rfree(conn->hold_timer); conn->hold_timer = NULL; - rfree(conn->sk); - conn->sk = NULL; rfree(conn->tx_ev); conn->tx_ev = NULL; + rfree(conn->sk); + conn->sk = NULL; + + mb_free(conn->local_caps); + conn->local_caps = NULL; + mb_free(conn->remote_caps); + conn->remote_caps = NULL; } @@ -258,9 +329,9 @@ bgp_close_conn(struct bgp_conn *conn) * bgp_update_startup_delay - update a startup delay * @p: BGP instance * - * This function updates a startup delay that is used to postpone next BGP connect. - * It also handles disable_after_error and might stop BGP instance when error - * happened and disable_after_error is on. + * This function updates a startup delay that is used to postpone next BGP + * connect. It also handles disable_after_error and might stop BGP instance + * when error happened and disable_after_error is on. * * It should be called when BGP protocol error happened. */ @@ -277,11 +348,11 @@ bgp_update_startup_delay(struct bgp_proto *p) p->last_proto_error = now; if (cf->disable_after_error) - { - p->startup_delay = 0; - p->p.disabled = 1; - return; - } + { + p->startup_delay = 0; + p->p.disabled = 1; + return; + } if (!p->startup_delay) p->startup_delay = cf->error_delay_time_min; @@ -290,32 +361,38 @@ bgp_update_startup_delay(struct bgp_proto *p) } static void -bgp_graceful_close_conn(struct bgp_conn *conn, unsigned subcode) +bgp_graceful_close_conn(struct bgp_conn *conn, uint subcode) { switch (conn->state) - { - case BS_IDLE: - case BS_CLOSE: - return; - case BS_CONNECT: - case BS_ACTIVE: - bgp_conn_enter_idle_state(conn); - return; - case BS_OPENSENT: - case BS_OPENCONFIRM: - case BS_ESTABLISHED: - bgp_error(conn, 6, subcode, NULL, 0); - return; - default: - bug("bgp_graceful_close_conn: Unknown state %d", conn->state); - } + { + case BS_IDLE: + case BS_CLOSE: + return; + + case BS_CONNECT: + case BS_ACTIVE: + bgp_conn_enter_idle_state(conn); + return; + + case BS_OPENSENT: + case BS_OPENCONFIRM: + case BS_ESTABLISHED: + bgp_error(conn, 6, subcode, NULL, 0); + return; + + default: + bug("bgp_graceful_close_conn: Unknown state %d", conn->state); + } } static void bgp_down(struct bgp_proto *p) { if (p->start_state > BSS_PREPARE) - bgp_close(p, 1); + { + bgp_setup_auth(p, 0); + bgp_close(p); + } BGP_TRACE(D_EVENTS, "Down"); proto_notify_state(&p->p, PS_DOWN); @@ -327,20 +404,20 @@ bgp_decision(void *vp) struct bgp_proto *p = vp; DBG("BGP: Decision start\n"); - if ((p->p.proto_state == PS_START) - && (p->outgoing_conn.state == BS_IDLE) - && (p->incoming_conn.state != BS_OPENCONFIRM) - && (!p->cf->passive)) + if ((p->p.proto_state == PS_START) && + (p->outgoing_conn.state == BS_IDLE) && + (p->incoming_conn.state != BS_OPENCONFIRM) && + !p->cf->passive) bgp_active(p); - if ((p->p.proto_state == PS_STOP) - && (p->outgoing_conn.state == BS_IDLE) - && (p->incoming_conn.state == BS_IDLE)) + if ((p->p.proto_state == PS_STOP) && + (p->outgoing_conn.state == BS_IDLE) && + (p->incoming_conn.state == BS_IDLE)) bgp_down(p); } void -bgp_stop(struct bgp_proto *p, unsigned subcode) +bgp_stop(struct bgp_proto *p, uint subcode) { proto_notify_state(&p->p, PS_STOP); bgp_graceful_close_conn(&p->outgoing_conn, subcode); @@ -349,7 +426,7 @@ bgp_stop(struct bgp_proto *p, unsigned subcode) } static inline void -bgp_conn_set_state(struct bgp_conn *conn, unsigned new_state) +bgp_conn_set_state(struct bgp_conn *conn, uint new_state) { if (conn->bgp->p.mrtdump & MD_STATES) mrt_dump_bgp_state_change(conn, conn->state, new_state); @@ -364,13 +441,17 @@ bgp_conn_enter_openconfirm_state(struct bgp_conn *conn) bgp_conn_set_state(conn, BS_OPENCONFIRM); } +static const struct bgp_af_caps dummy_af_caps = { }; + void bgp_conn_enter_established_state(struct bgp_conn *conn) { struct bgp_proto *p = conn->bgp; + struct bgp_caps *local = conn->local_caps; + struct bgp_caps *peer = conn->remote_caps; + struct bgp_channel *c; BGP_TRACE(D_EVENTS, "BGP session established"); - DBG("BGP: UP!!!\n"); /* For multi-hop BGP sessions */ if (ipa_zero(p->source_addr)) @@ -381,30 +462,92 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) p->conn = conn; p->last_error_class = 0; p->last_error_code = 0; - p->feed_state = BFS_NONE; - p->load_state = BFS_NONE; - bgp_init_bucket_table(p); - bgp_init_prefix_table(p, 8); - int peer_gr_ready = conn->peer_gr_aware && !(conn->peer_gr_flags & BGP_GRF_RESTART); + p->as4_session = conn->as4_session; + + p->route_refresh = peer->route_refresh; + p->enhanced_refresh = local->enhanced_refresh && peer->enhanced_refresh; - if (p->p.gr_recovery && !peer_gr_ready) - proto_graceful_restart_unlock(&p->p); + /* Whether we may handle possible GR of peer (it has some AF GR-able) */ + p->gr_ready = 0; /* Updated later */ - if (p->p.gr_recovery && (p->cf->gr_mode == BGP_GR_ABLE) && peer_gr_ready) - p->p.gr_wait = 1; + /* Whether peer is ready to handle our GR recovery */ + int peer_gr_ready = peer->gr_aware && !(peer->gr_flags & BGP_GRF_RESTART); - if (p->gr_active) + if (p->gr_active_num) tm_stop(p->gr_timer); - if (p->gr_active && (!conn->peer_gr_able || !(conn->peer_gr_aflags & BGP_GRF_FORWARDING))) - bgp_graceful_restart_done(p); + /* Number of active channels */ + int num = 0; + + WALK_LIST(c, p->p.channels) + { + const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi); + const struct bgp_af_caps *rem = bgp_find_af_caps(peer, c->afi); + + /* Ignore AFIs that were not announced in multiprotocol capability */ + if (!loc || !loc->ready) + loc = &dummy_af_caps; + + if (!rem || !rem->ready) + rem = &dummy_af_caps; + + int active = loc->ready && rem->ready; + c->c.disabled = !active; + c->c.reloadable = p->route_refresh; + + c->index = active ? num++ : 0; + + c->feed_state = BFS_NONE; + c->load_state = BFS_NONE; + + /* Channels where peer may do GR */ + c->gr_ready = active && local->gr_aware && rem->gr_able; + p->gr_ready = p->gr_ready || c->gr_ready; + + /* Channels not able to recover gracefully */ + if (p->p.gr_recovery && (!active || !peer_gr_ready)) + channel_graceful_restart_unlock(&c->c); - /* GR capability implies that neighbor will send End-of-RIB */ - if (conn->peer_gr_aware) - p->load_state = BFS_LOADING; + /* Channels waiting for local convergence */ + if (p->p.gr_recovery && loc->gr_able && peer_gr_ready) + c->c.gr_wait = 1; - /* proto_notify_state() will likely call bgp_feed_begin(), setting p->feed_state */ + /* Channels where peer is not able to recover gracefully */ + if (c->gr_active && ! (c->gr_ready && (rem->gr_af_flags & BGP_GRF_FORWARDING))) + bgp_graceful_restart_done(c); + + /* GR capability implies that neighbor will send End-of-RIB */ + if (peer->gr_aware) + c->load_state = BFS_LOADING; + + c->ext_next_hop = c->cf->ext_next_hop && (bgp_channel_is_ipv6(c) || rem->ext_next_hop); + c->add_path_rx = (loc->add_path & BGP_ADD_PATH_RX) && (rem->add_path & BGP_ADD_PATH_TX); + c->add_path_tx = (loc->add_path & BGP_ADD_PATH_TX) && (rem->add_path & BGP_ADD_PATH_RX); + + /* Update RA mode */ + if (c->add_path_tx) + c->c.ra_mode = RA_ANY; + else if (c->cf->secondary) + c->c.ra_mode = RA_ACCEPTED; + else + c->c.ra_mode = RA_OPTIMAL; + } + + p->afi_map = mb_alloc(p->p.pool, num * sizeof(u32)); + p->channel_map = mb_alloc(p->p.pool, num * sizeof(void *)); + p->channel_count = num; + + WALK_LIST(c, p->p.channels) + { + if (c->c.disabled) + continue; + + p->afi_map[c->index] = c->afi; + p->channel_map[c->index] = c; + } + + /* proto_notify_state() will likely call bgp_feed_begin(), setting c->feed_state */ bgp_conn_set_state(conn, BS_ESTABLISHED); proto_notify_state(&p->p, PS_UP); @@ -416,8 +559,9 @@ bgp_conn_leave_established_state(struct bgp_proto *p) BGP_TRACE(D_EVENTS, "BGP session closed"); p->conn = NULL; - bgp_free_prefix_table(p); - bgp_free_bucket_table(p); + // XXXX free these tables to avoid memory leak during graceful restart + // bgp_free_prefix_table(p); + // bgp_free_bucket_table(p); if (p->p.proto_state == PS_UP) bgp_stop(p, 0); @@ -471,34 +615,57 @@ bgp_handle_graceful_restart(struct bgp_proto *p) ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready); BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s", - p->gr_active ? " - already pending" : ""); - proto_notify_state(&p->p, PS_START); + p->gr_active_num ? " - already pending" : ""); - if (p->gr_active) - rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); + p->gr_active_num = 0; - p->gr_active = 1; - bgp_start_timer(p->gr_timer, p->conn->peer_gr_time); - rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook); + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + { + if (c->gr_ready) + { + if (c->gr_active) + rt_refresh_end(c->c.table, &c->c); + + c->gr_active = 1; + p->gr_active_num++; + rt_refresh_begin(c->c.table, &c->c); + } + else + { + /* Just flush the routes */ + rt_refresh_begin(c->c.table, &c->c); + rt_refresh_end(c->c.table, &c->c); + } + } + + proto_notify_state(&p->p, PS_START); + bgp_start_timer(p->gr_timer, p->conn->local_caps->gr_time); } /** * bgp_graceful_restart_done - finish active BGP graceful restart - * @p: BGP instance + * @c: BGP channel * * This function is called when the active BGP graceful restart of the neighbor - * should be finished - either successfully (the neighbor sends all paths and - * reports end-of-RIB on the new session) or unsuccessfully (the neighbor does - * not support BGP graceful restart on the new session). The function ends - * routing table refresh cycle and stops BGP restart timer. + * should be finished for channel @c - either successfully (the neighbor sends + * all paths and reports end-of-RIB for given AFI/SAFI on the new session) or + * unsuccessfully (the neighbor does not support BGP graceful restart on the new + * session). The function ends the routing table refresh cycle. */ void -bgp_graceful_restart_done(struct bgp_proto *p) +bgp_graceful_restart_done(struct bgp_channel *c) { - BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); - p->gr_active = 0; - tm_stop(p->gr_timer); - rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); + struct bgp_proto *p = (void *) c->c.proto; + + ASSERT(c->gr_active); + c->gr_active = 0; + p->gr_active_num--; + + if (!p->gr_active_num) + BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); + + rt_refresh_end(c->c.table, &c->c); } /** @@ -522,7 +689,7 @@ bgp_graceful_restart_timeout(timer *t) /** * bgp_refresh_begin - start incoming enhanced route refresh sequence - * @p: BGP instance + * @c: BGP channel * * This function is called when an incoming enhanced route refresh sequence is * started by the neighbor, demarcated by the BoRR packet. The function updates @@ -531,18 +698,20 @@ bgp_graceful_restart_timeout(timer *t) * ensure that these two sequences do not overlap. */ void -bgp_refresh_begin(struct bgp_proto *p) +bgp_refresh_begin(struct bgp_channel *c) { - if (p->load_state == BFS_LOADING) - { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } + struct bgp_proto *p = (void *) c->c.proto; + + if (c->load_state == BFS_LOADING) + { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } - p->load_state = BFS_REFRESHING; - rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook); + c->load_state = BFS_REFRESHING; + rt_refresh_begin(c->c.table, &c->c); } /** * bgp_refresh_end - finish incoming enhanced route refresh sequence - * @p: BGP instance + * @c: BGP channel * * This function is called when an incoming enhanced route refresh sequence is * finished by the neighbor, demarcated by the EoRR packet. The function updates @@ -550,39 +719,26 @@ bgp_refresh_begin(struct bgp_proto *p) * during the sequence are removed by the nest. */ void -bgp_refresh_end(struct bgp_proto *p) +bgp_refresh_end(struct bgp_channel *c) { - if (p->load_state != BFS_REFRESHING) - { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } + struct bgp_proto *p = (void *) c->c.proto; + + if (c->load_state != BFS_REFRESHING) + { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } - p->load_state = BFS_NONE; - rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); + c->load_state = BFS_NONE; + rt_refresh_end(c->c.table, &c->c); } static void bgp_send_open(struct bgp_conn *conn) { - conn->start_state = conn->bgp->start_state; - - // Default values, possibly changed by receiving capabilities. - conn->advertised_as = 0; - conn->peer_refresh_support = 0; - conn->peer_as4_support = 0; - conn->peer_add_path = 0; - conn->peer_enhanced_refresh_support = 0; - conn->peer_gr_aware = 0; - conn->peer_gr_able = 0; - conn->peer_gr_time = 0; - conn->peer_gr_flags = 0; - conn->peer_gr_aflags = 0; - conn->peer_ext_messages_support = 0; - DBG("BGP: Sending open\n"); conn->sk->rx_hook = bgp_rx; conn->sk->tx_hook = bgp_tx; - tm_stop(conn->connect_retry_timer); - bgp_schedule_packet(conn, PKT_OPEN); + tm_stop(conn->connect_timer); + bgp_schedule_packet(conn, NULL, PKT_OPEN); bgp_conn_set_state(conn, BS_OPENSENT); bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time); } @@ -605,10 +761,10 @@ bgp_connect_timeout(timer *t) DBG("BGP: connect_timeout\n"); if (p->p.proto_state == PS_START) - { - bgp_close_conn(conn); - bgp_connect(p); - } + { + bgp_close_conn(conn); + bgp_connect(p); + } else bgp_conn_enter_idle_state(conn); } @@ -672,7 +828,7 @@ bgp_keepalive_timeout(timer *t) struct bgp_conn *conn = t->data; DBG("BGP: Keepalive timer\n"); - bgp_schedule_packet(conn, PKT_KEEPALIVE); + bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE); /* Kick TX a bit faster */ if (ev_active(conn->tx_ev)) @@ -682,21 +838,18 @@ bgp_keepalive_timeout(timer *t) static void bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn) { - timer *t; - conn->sk = NULL; conn->bgp = p; + conn->packets_to_send = 0; + conn->channels_to_send = 0; + conn->last_channel = 0; + conn->last_channel_count = 0; + + conn->connect_timer = tm_new_set(p->p.pool, bgp_connect_timeout, conn, 0, 0); + conn->hold_timer = tm_new_set(p->p.pool, bgp_hold_timeout, conn, 0, 0); + conn->keepalive_timer = tm_new_set(p->p.pool, bgp_keepalive_timeout, conn, 0, 0); - t = conn->connect_retry_timer = tm_new(p->p.pool); - t->hook = bgp_connect_timeout; - t->data = conn; - t = conn->hold_timer = tm_new(p->p.pool); - t->hook = bgp_hold_timeout; - t->data = conn; - t = conn->keepalive_timer = tm_new(p->p.pool); - t->hook = bgp_keepalive_timeout; - t->data = conn; conn->tx_ev = ev_new(p->p.pool); conn->tx_ev->hook = bgp_kick_tx; conn->tx_ev->data = conn; @@ -720,7 +873,7 @@ bgp_active(struct bgp_proto *p) BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay); bgp_setup_conn(p, conn); bgp_conn_set_state(conn, BS_ACTIVE); - bgp_start_timer(conn->connect_retry_timer, delay); + bgp_start_timer(conn->connect_timer, delay); } /** @@ -734,12 +887,11 @@ bgp_active(struct bgp_proto *p) static void bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing connection */ { - sock *s; struct bgp_conn *conn = &p->outgoing_conn; int hops = p->cf->multihop ? : 1; DBG("BGP: Connecting\n"); - s = sk_new(p->p.pool); + sock *s = sk_new(p->p.pool); s->type = SK_TCP_ACTIVE; s->saddr = p->source_addr; s->daddr = p->cf->remote_ip; @@ -766,10 +918,10 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c goto err; DBG("BGP: Waiting for connect success\n"); - bgp_start_timer(conn->connect_retry_timer, p->cf->connect_retry_time); + bgp_start_timer(conn->connect_timer, p->cf->connect_retry_time); return; - err: +err: sk_log_error(s, p->p.name); bgp_sock_err(s, 0); return; @@ -783,16 +935,15 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c static struct bgp_proto * bgp_find_proto(sock *sk) { - struct proto_config *pc; + struct bgp_proto *p; - WALK_LIST(pc, config->protos) - if ((pc->protocol == &proto_bgp) && pc->proto) - { - struct bgp_proto *p = (struct bgp_proto *) pc->proto; - if (ipa_equal(p->cf->remote_ip, sk->daddr) && - (!ipa_is_link_local(sk->daddr) || (p->cf->iface == sk->iface))) - return p; - } + WALK_LIST(p, proto_list) + if ((p->p.proto == &proto_bgp) && + ipa_equal(p->cf->remote_ip, sk->daddr) && + (!ipa_is_link_local(sk->daddr) || (p->cf->iface == sk->iface)) && + (ipa_zero(p->cf->local_ip) || ipa_equal(p->cf->local_ip, sk->saddr)) && + (p->cf->local_port == sk->sport)) + return p; return NULL; } @@ -818,12 +969,12 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport); p = bgp_find_proto(sk); if (!p) - { - log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)", - sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport); - rfree(sk); - return 0; - } + { + log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)", + sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport); + rfree(sk); + return 0; + } /* * BIRD should keep multiple incoming connections in OpenSent state (for @@ -836,26 +987,26 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk); if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready) - { - bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART); - bgp_handle_graceful_restart(p); - bgp_conn_enter_idle_state(p->conn); - acc = 1; - - /* There might be separate incoming connection in OpenSent state */ - if (p->incoming_conn.state > BS_ACTIVE) - bgp_close_conn(&p->incoming_conn); - } + { + bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART); + bgp_handle_graceful_restart(p); + bgp_conn_enter_idle_state(p->conn); + acc = 1; + + /* There might be separate incoming connection in OpenSent state */ + if (p->incoming_conn.state > BS_ACTIVE) + bgp_close_conn(&p->incoming_conn); + } BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s", sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport, acc ? "accepted" : "rejected"); if (!acc) - { - rfree(sk); - return 0; - } + { + rfree(sk); + return 0; + } hops = p->cf->multihop ? : 1; @@ -867,11 +1018,11 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) goto err; if (p->cf->enable_extended_messages) - { - sk->rbsize = BGP_RX_BUFFER_EXT_SIZE; - sk->tbsize = BGP_TX_BUFFER_EXT_SIZE; - sk_reallocate(sk); - } + { + sk->rbsize = BGP_RX_BUFFER_EXT_SIZE; + sk->tbsize = BGP_TX_BUFFER_EXT_SIZE; + sk_reallocate(sk); + } bgp_setup_conn(p, &p->incoming_conn); bgp_setup_sk(&p->incoming_conn, sk); @@ -894,34 +1045,6 @@ bgp_listen_sock_err(sock *sk UNUSED, int err) log(L_ERR "BGP: Error on listening socket: %M", err); } -static sock * -bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags) -{ - sock *s = sk_new(&root_pool); - DBG("BGP: Creating listening socket\n"); - s->type = SK_TCP_PASSIVE; - s->ttl = 255; - s->saddr = addr; - s->sport = port ? port : BGP_PORT; - s->flags = flags ? 0 : SKF_V6ONLY; - s->tos = IP_PREC_INTERNET_CONTROL; - s->rbsize = BGP_RX_BUFFER_SIZE; - s->tbsize = BGP_TX_BUFFER_SIZE; - s->rx_hook = bgp_incoming_connection; - s->err_hook = bgp_listen_sock_err; - - if (sk_open(s) < 0) - goto err; - - return s; - - err: - sk_log_error(s, "BGP"); - log(L_ERR "BGP: Cannot open listening socket"); - rfree(s); - return NULL; -} - static void bgp_start_neighbor(struct bgp_proto *p) { @@ -930,23 +1053,22 @@ bgp_start_neighbor(struct bgp_proto *p) if (ipa_zero(p->source_addr)) p->source_addr = p->neigh->ifa->ip; -#ifdef IPV6 + if (ipa_is_link_local(p->source_addr)) + p->link_addr = p->source_addr; + else { + /* Find some link-local address for given iface */ struct ifa *a; - p->local_link = IPA_NONE; + p->link_addr = IPA_NONE; WALK_LIST(a, p->neigh->iface->addrs) if (a->scope == SCOPE_LINK) - { - p->local_link = a->ip; - break; - } - - if (! ipa_nonzero(p->local_link)) - log(L_WARN "%s: Missing link local address on interface %s", p->p.name, p->neigh->iface->name); + { + p->link_addr = a->ip; + break; + } - DBG("BGP: Selected link-level address %I\n", p->local_link); + DBG("%s: Selected link-local address %I\n", p->p.name, p->link_addr); } -#endif bgp_initiate(p); } @@ -966,34 +1088,34 @@ bgp_neigh_notify(neighbor *n) int prepare = (ps == PS_START) && (p->start_state == BSS_PREPARE); if (n->scope <= 0) + { + if (!prepare) { - if (!prepare) - { - BGP_TRACE(D_EVENTS, "Neighbor lost"); - bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST); - /* Perhaps also run bgp_update_startup_delay(p)? */ - bgp_stop(p, 0); - } + BGP_TRACE(D_EVENTS, "Neighbor lost"); + bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST); + /* Perhaps also run bgp_update_startup_delay(p)? */ + bgp_stop(p, 0); } + } else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP)) + { + if (!prepare) { - if (!prepare) - { - BGP_TRACE(D_EVENTS, "Link down"); - bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN); - if (ps == PS_UP) - bgp_update_startup_delay(p); - bgp_stop(p, 0); - } + BGP_TRACE(D_EVENTS, "Link down"); + bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN); + if (ps == PS_UP) + bgp_update_startup_delay(p); + bgp_stop(p, 0); } + } else + { + if (prepare) { - if (prepare) - { - BGP_TRACE(D_EVENTS, "Neighbor ready"); - bgp_start_neighbor(p); - } + BGP_TRACE(D_EVENTS, "Neighbor ready"); + bgp_start_neighbor(p); } + } } static void @@ -1003,13 +1125,13 @@ bgp_bfd_notify(struct bfd_request *req) int ps = p->p.proto_state; if (req->down && ((ps == PS_START) || (ps == PS_UP))) - { - BGP_TRACE(D_EVENTS, "BFD session down"); - bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN); - if (ps == PS_UP) - bgp_update_startup_delay(p); - bgp_stop(p, 0); - } + { + BGP_TRACE(D_EVENTS, "BFD session down"); + bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN); + if (ps == PS_UP) + bgp_update_startup_delay(p); + bgp_stop(p, 0); + } } static void @@ -1021,71 +1143,72 @@ bgp_update_bfd(struct bgp_proto *p, int use_bfd) bgp_bfd_notify, p); if (!use_bfd && p->bfd_req) - { - rfree(p->bfd_req); - p->bfd_req = NULL; - } + { + rfree(p->bfd_req); + p->bfd_req = NULL; + } } -static int -bgp_reload_routes(struct proto *P) +static void +bgp_reload_routes(struct channel *C) { - struct bgp_proto *p = (struct bgp_proto *) P; - if (!p->conn || !p->conn->peer_refresh_support) - return 0; + struct bgp_proto *p = (void *) C->proto; + struct bgp_channel *c = (void *) C; - bgp_schedule_packet(p->conn, PKT_ROUTE_REFRESH); - return 1; + ASSERT(p->conn && p->route_refresh); + + bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); } static void -bgp_feed_begin(struct proto *P, int initial) +bgp_feed_begin(struct channel *C, int initial) { - struct bgp_proto *p = (struct bgp_proto *) P; + struct bgp_proto *p = (void *) C->proto; + struct bgp_channel *c = (void *) C; /* This should not happen */ if (!p->conn) return; if (initial && p->cf->gr_mode) - p->feed_state = BFS_LOADING; + c->feed_state = BFS_LOADING; /* It is refeed and both sides support enhanced route refresh */ - if (!initial && p->cf->enable_refresh && - p->conn->peer_enhanced_refresh_support) - { - /* BoRR must not be sent before End-of-RIB */ - if (p->feed_state == BFS_LOADING || p->feed_state == BFS_LOADED) - return; + if (!initial && p->enhanced_refresh) + { + /* BoRR must not be sent before End-of-RIB */ + if (c->feed_state == BFS_LOADING || c->feed_state == BFS_LOADED) + return; - p->feed_state = BFS_REFRESHING; - bgp_schedule_packet(p->conn, PKT_BEGIN_REFRESH); - } + c->feed_state = BFS_REFRESHING; + bgp_schedule_packet(p->conn, c, PKT_BEGIN_REFRESH); + } } static void -bgp_feed_end(struct proto *P) +bgp_feed_end(struct channel *C) { - struct bgp_proto *p = (struct bgp_proto *) P; + struct bgp_proto *p = (void *) C->proto; + struct bgp_channel *c = (void *) C; /* This should not happen */ if (!p->conn) return; /* Non-demarcated feed ended, nothing to do */ - if (p->feed_state == BFS_NONE) + if (c->feed_state == BFS_NONE) return; /* Schedule End-of-RIB packet */ - if (p->feed_state == BFS_LOADING) - p->feed_state = BFS_LOADED; + if (c->feed_state == BFS_LOADING) + c->feed_state = BFS_LOADED; /* Schedule EoRR packet */ - if (p->feed_state == BFS_REFRESHING) - p->feed_state = BFS_REFRESHED; + if (c->feed_state == BFS_REFRESHING) + c->feed_state = BFS_REFRESHED; /* Kick TX hook */ - bgp_schedule_packet(p->conn, PKT_UPDATE); + bgp_schedule_packet(p->conn, c, PKT_UPDATE); } @@ -1096,30 +1219,30 @@ bgp_start_locked(struct object_lock *lock) struct bgp_config *cf = p->cf; if (p->p.proto_state != PS_START) - { - DBG("BGP: Got lock in different state %d\n", p->p.proto_state); - return; - } + { + DBG("BGP: Got lock in different state %d\n", p->p.proto_state); + return; + } DBG("BGP: Got lock\n"); if (cf->multihop) - { - /* Multi-hop sessions do not use neighbor entries */ - bgp_initiate(p); - return; - } + { + /* Multi-hop sessions do not use neighbor entries */ + bgp_initiate(p); + return; + } neighbor *n = neigh_find2(&p->p, &cf->remote_ip, cf->iface, NEF_STICKY); if (!n) - { - log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface); - /* As we do not start yet, we can just disable protocol */ - p->p.disabled = 1; - bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP); - proto_notify_state(&p->p, PS_DOWN); - return; - } + { + log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface); + /* As we do not start yet, we can just disable protocol */ + p->p.disabled = 1; + bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP); + proto_notify_state(&p->p, PS_DOWN); + return; + } p->neigh = n; @@ -1144,9 +1267,7 @@ bgp_start(struct proto *P) p->neigh = NULL; p->bfd_req = NULL; p->gr_ready = 0; - p->gr_active = 0; - - rt_lock_table(p->igp_table); + p->gr_active_num = 0; p->event = ev_new(p->p.pool); p->event->hook = bgp_decision; @@ -1165,15 +1286,19 @@ bgp_start(struct proto *P) p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id; p->remote_id = 0; - p->source_addr = p->cf->source_addr; + p->source_addr = p->cf->local_ip; + /* XXXX */ if (p->p.gr_recovery && p->cf->gr_mode) - proto_graceful_restart_lock(P); + { + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + channel_graceful_restart_lock(&c->c); + } /* - * Before attempting to create the connection, we need to lock the - * port, so that are sure we're the only instance attempting to talk - * with that neighbor. + * Before attempting to create the connection, we need to lock the port, + * so that we are the only instance attempting to talk with that neighbor. */ lock = p->lock = olock_new(P->pool); @@ -1194,78 +1319,64 @@ static int bgp_shutdown(struct proto *P) { struct bgp_proto *p = (struct bgp_proto *) P; - unsigned subcode = 0; + uint subcode = 0; BGP_TRACE(D_EVENTS, "Shutdown requested"); switch (P->down_code) - { - case PDC_CF_REMOVE: - case PDC_CF_DISABLE: - subcode = 3; // Errcode 6, 3 - peer de-configured - break; - - case PDC_CF_RESTART: - subcode = 6; // Errcode 6, 6 - other configuration change - break; - - case PDC_CMD_DISABLE: - case PDC_CMD_SHUTDOWN: - subcode = 2; // Errcode 6, 2 - administrative shutdown - break; - - case PDC_CMD_RESTART: - subcode = 4; // Errcode 6, 4 - administrative reset - break; - - case PDC_RX_LIMIT_HIT: - case PDC_IN_LIMIT_HIT: - subcode = 1; // Errcode 6, 1 - max number of prefixes reached - /* log message for compatibility */ - log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name); - goto limit; - - case PDC_OUT_LIMIT_HIT: - subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown - - limit: - bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED); - if (proto_restart) - bgp_update_startup_delay(p); - else - p->startup_delay = 0; - goto done; - } + { + case PDC_CF_REMOVE: + case PDC_CF_DISABLE: + subcode = 3; // Errcode 6, 3 - peer de-configured + break; + + case PDC_CF_RESTART: + subcode = 6; // Errcode 6, 6 - other configuration change + break; + + case PDC_CMD_DISABLE: + case PDC_CMD_SHUTDOWN: + subcode = 2; // Errcode 6, 2 - administrative shutdown + break; + + case PDC_CMD_RESTART: + subcode = 4; // Errcode 6, 4 - administrative reset + break; + + case PDC_RX_LIMIT_HIT: + case PDC_IN_LIMIT_HIT: + subcode = 1; // Errcode 6, 1 - max number of prefixes reached + /* log message for compatibility */ + log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name); + goto limit; + + case PDC_OUT_LIMIT_HIT: + subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown + + limit: + bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED); + if (proto_restart) + bgp_update_startup_delay(p); + else + p->startup_delay = 0; + goto done; + } bgp_store_error(p, NULL, BE_MAN_DOWN, 0); p->startup_delay = 0; - done: +done: bgp_stop(p, subcode); return p->p.proto_state; } -static void -bgp_cleanup(struct proto *P) -{ - struct bgp_proto *p = (struct bgp_proto *) P; - rt_unlock_table(p->igp_table); -} - -static rtable * -get_igp_table(struct bgp_config *cf) -{ - return cf->igp_table ? cf->igp_table->table : cf->c.table->table; -} - static struct proto * -bgp_init(struct proto_config *C) +bgp_init(struct proto_config *CF) { - struct proto *P = proto_new(C, sizeof(struct bgp_proto)); - struct bgp_config *c = (struct bgp_config *) C; + struct proto *P = proto_new(CF); struct bgp_proto *p = (struct bgp_proto *) P; + struct bgp_config *cf = (struct bgp_config *) CF; - P->accept_ra_types = c->secondary ? RA_ACCEPTED : RA_OPTIMAL; P->rt_notify = bgp_rt_notify; P->import_control = bgp_import_control; P->neigh_notify = bgp_neigh_notify; @@ -1274,102 +1385,204 @@ bgp_init(struct proto_config *C) P->feed_end = bgp_feed_end; P->rte_better = bgp_rte_better; P->rte_mergable = bgp_rte_mergable; - P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL; - - p->cf = c; - p->local_as = c->local_as; - p->remote_as = c->remote_as; - p->is_internal = (c->local_as == c->remote_as); - p->rs_client = c->rs_client; - p->rr_client = c->rr_client; - p->igp_table = get_igp_table(c); + P->rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL; + + p->cf = cf; + p->local_as = cf->local_as; + p->remote_as = cf->remote_as; + p->public_as = cf->local_as; + p->is_internal = (cf->local_as == cf->remote_as); + p->is_interior = p->is_internal || cf->confederation_member; + p->rs_client = cf->rs_client; + p->rr_client = cf->rr_client; + + /* Confederation ID is used for truly external peers */ + if (cf->confederation && !p->is_interior) + p->public_as = cf->confederation; + + /* Add all channels */ + struct bgp_channel_config *cc; + WALK_LIST(cc, CF->channels) + proto_add_channel(P, &cc->c); return P; } +static inline rtable * +get_igp_table(struct bgp_channel_config *cf) +{ + return cf->igp_table ? cf->igp_table->table : cf->c.table->table; +} + +static void +bgp_channel_init(struct channel *C, struct channel_config *CF) +{ + struct bgp_channel *c = (void *) C; + struct bgp_channel_config *cf = (void *) CF; + + c->cf = cf; + c->afi = cf->afi; + c->desc = bgp_get_af_desc(c->afi); + c->igp_table = get_igp_table(cf); +} + +static int +bgp_channel_start(struct channel *C) +{ + struct bgp_proto *p = (void *) C->proto; + struct bgp_channel *c = (void *) C; + ip_addr src = p->source_addr; + + rt_lock_table(c->igp_table); + + c->pool = p->p.pool; // XXXX + bgp_init_bucket_table(c); + bgp_init_prefix_table(c); + + c->next_hop_addr = c->cf->next_hop_addr; + c->link_addr = IPA_NONE; + c->packets_to_send = 0; + + /* Try to use source address as next hop address */ + if (ipa_zero(c->next_hop_addr)) + { + if (bgp_channel_is_ipv4(c) && ipa_is_ip4(src)) + c->next_hop_addr = src; + + if (bgp_channel_is_ipv6(c) && ipa_is_ip6(src) && !ipa_is_link_local(src)) + c->next_hop_addr = src; + } + + /* Set link-local address for IPv6 single-hop BGP */ + if (bgp_channel_is_ipv6(c) && p->neigh) + { + c->link_addr = p->link_addr; + + if (ipa_zero(c->link_addr)) + log(L_WARN "%s: Missing link-local address", p->p.name); + } + + /* No next hop address is valid on IPv6 link-local BGP */ + if (ipa_zero(c->next_hop_addr) && !ipa_is_link_local(src)) + log(L_WARN "%s: Missing next hop address", p->p.name); + + return 0; /* XXXX: Currently undefined */ +} + +static void +bgp_channel_shutdown(struct channel *C) +{ + struct bgp_channel *c = (void *) C; + + /* XXXX: cleanup bucket and prefix tables */ + + c->next_hop_addr = IPA_NONE; + c->link_addr = IPA_NONE; +} + +static void +bgp_channel_cleanup(struct channel *C) +{ + struct bgp_channel *c = (void *) C; + + rt_unlock_table(c->igp_table); +} void -bgp_check_config(struct bgp_config *c) +bgp_postconfig(struct proto_config *CF) { - int internal = (c->local_as == c->remote_as); + struct bgp_config *cf = (void *) CF; + int internal = (cf->local_as == cf->remote_as); /* Do not check templates at all */ - if (c->c.class == SYM_TEMPLATE) + if (cf->c.class == SYM_TEMPLATE) return; /* EBGP direct by default, IBGP multihop by default */ - if (c->multihop < 0) - c->multihop = internal ? 64 : 0; - - /* Different default for gw_mode */ - if (!c->gw_mode) - c->gw_mode = c->multihop ? GW_RECURSIVE : GW_DIRECT; - - /* Different default based on rs_client */ - if (!c->missing_lladdr) - c->missing_lladdr = c->rs_client ? MLL_IGNORE : MLL_SELF; - - /* Disable after error incompatible with restart limit action */ - if (c->c.in_limit && (c->c.in_limit->action == PLA_RESTART) && c->disable_after_error) - c->c.in_limit->action = PLA_DISABLE; + if (cf->multihop < 0) + cf->multihop = internal ? 64 : 0; - if (!c->local_as) + if (!cf->local_as) cf_error("Local AS number must be set"); - if (ipa_zero(c->remote_ip)) + if (ipa_zero(cf->remote_ip)) cf_error("Neighbor must be configured"); - if (!c->remote_as) + if (!cf->remote_as) cf_error("Remote AS number must be set"); // if (ipa_is_link_local(c->remote_ip) && !c->iface) // cf_error("Link-local neighbor address requires specified interface"); - if (!ipa_is_link_local(c->remote_ip) != !c->iface) + if (!ipa_is_link_local(cf->remote_ip) != !cf->iface) cf_error("Link-local address and interface scope must be used together"); - if (!(c->capabilities && c->enable_as4) && (c->remote_as > 0xFFFF)) + if (!(cf->capabilities && cf->enable_as4) && (cf->remote_as > 0xFFFF)) cf_error("Neighbor AS number out of range (AS4 not available)"); - if (!internal && c->rr_client) + if (!internal && cf->rr_client) cf_error("Only internal neighbor can be RR client"); - if (internal && c->rs_client) + if (internal && cf->rs_client) cf_error("Only external neighbor can be RS client"); - if (c->multihop && (c->gw_mode == GW_DIRECT)) - cf_error("Multihop BGP cannot use direct gateway mode"); + if (!cf->confederation && cf->confederation_member) + cf_error("Confederation ID must be set for member sessions"); - if (c->multihop && (ipa_is_link_local(c->remote_ip) || - ipa_is_link_local(c->source_addr))) + if (cf->multihop && (ipa_is_link_local(cf->local_ip) || + ipa_is_link_local(cf->remote_ip))) cf_error("Multihop BGP cannot be used with link-local addresses"); - if (c->multihop && c->check_link) + if (cf->multihop && cf->check_link) cf_error("Multihop BGP cannot depend on link state"); - if (c->multihop && c->bfd && ipa_zero(c->source_addr)) - cf_error("Multihop BGP with BFD requires specified source address"); + if (cf->multihop && cf->bfd && ipa_zero(cf->local_ip)) + cf_error("Multihop BGP with BFD requires specified local address"); - if ((c->gw_mode == GW_RECURSIVE) && c->c.table->sorted) - cf_error("BGP in recursive mode prohibits sorted table"); - if (c->deterministic_med && c->c.table->sorted) - cf_error("BGP with deterministic MED prohibits sorted table"); + struct bgp_channel_config *cc; + WALK_LIST(cc, CF->channels) + { + /* Disable after error incompatible with restart limit action */ + if ((cc->c.in_limit.action == PLA_RESTART) && cf->disable_after_error) + cc->c.in_limit.action = PLA_DISABLE; + + /* Different default based on rs_client */ + if (!cc->missing_lladdr) + cc->missing_lladdr = cf->rs_client ? MLL_IGNORE : MLL_SELF; + + /* Different default for gw_mode */ + if (!cc->gw_mode) + cc->gw_mode = cf->multihop ? GW_RECURSIVE : GW_DIRECT; + + /* Default based on proto config */ + if (cc->gr_able == 0xff) + cc->gr_able = (cf->gr_mode == BGP_GR_ABLE); + + if (cf->multihop && (cc->gw_mode == GW_DIRECT)) + cf_error("Multihop BGP cannot use direct gateway mode"); + + if ((cc->gw_mode == GW_RECURSIVE) && cc->c.table->sorted) + cf_error("BGP in recursive mode prohibits sorted table"); + + if (cf->deterministic_med && cc->c.table->sorted) + cf_error("BGP with deterministic MED prohibits sorted table"); - if (c->secondary && !c->c.table->sorted) - cf_error("BGP with secondary option requires sorted table"); + if (cc->secondary && !cc->c.table->sorted) + cf_error("BGP with secondary option requires sorted table"); + } } static int -bgp_reconfigure(struct proto *P, struct proto_config *C) +bgp_reconfigure(struct proto *P, struct proto_config *CF) { - struct bgp_config *new = (struct bgp_config *) C; - struct bgp_proto *p = (struct bgp_proto *) P; + struct bgp_proto *p = (void *) P; + struct bgp_config *new = (void *) CF; struct bgp_config *old = p->cf; - if (proto_get_router_id(C) != p->local_id) + if (proto_get_router_id(CF) != p->local_id) return 0; int same = !memcmp(((byte *) old) + sizeof(struct proto_config), @@ -1377,8 +1590,26 @@ bgp_reconfigure(struct proto *P, struct proto_config *C) // password item is last and must be checked separately OFFSETOF(struct bgp_config, password) - sizeof(struct proto_config)) && ((!old->password && !new->password) - || (old->password && new->password && !strcmp(old->password, new->password))) - && (get_igp_table(old) == get_igp_table(new)); + || (old->password && new->password && !strcmp(old->password, new->password))); + + /* FIXME: Move channel reconfiguration to generic protocol code ? */ + struct channel *C, *C2; + struct bgp_channel_config *cc; + + WALK_LIST(C, p->p.channels) + C->stale = 1; + + WALK_LIST(cc, new->c.channels) + { + C = (struct channel *) bgp_find_channel(p, cc->afi); + same = proto_configure_channel(P, &C, &cc->c) && same; + C->stale = 0; + } + + WALK_LIST_DELSAFE(C, C2, p->p.channels) + if (C->stale) + same = proto_configure_channel(P, &C, NULL) && same; + if (same && (p->start_state > BSS_PREPARE)) bgp_update_bfd(p, new->bfd); @@ -1390,11 +1621,30 @@ bgp_reconfigure(struct proto *P, struct proto_config *C) return same; } +static int +bgp_channel_reconfigure(struct channel *C, struct channel_config *CC) +{ + struct bgp_channel *c = (void *) C; + struct bgp_channel_config *new = (void *) CC; + struct bgp_channel_config *old = c->cf; + + if (memcmp(((byte *) old) + sizeof(struct channel_config), + ((byte *) new) + sizeof(struct channel_config), + /* igp_table item is last and must be checked separately */ + OFFSETOF(struct bgp_channel_config, igp_table) - sizeof(struct channel_config))) + return 0; + + if (get_igp_table(old) != get_igp_table(new)) + return 0; + + c->cf = new; + return 1; +} + static void -bgp_copy_config(struct proto_config *dest, struct proto_config *src) +bgp_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUSED) { /* Just a shallow copy */ - proto_copy_rest(dest, src, sizeof(struct bgp_config)); } @@ -1411,14 +1661,14 @@ bgp_copy_config(struct proto_config *dest, struct proto_config *src) * closes the connection. */ void -bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int len) +bgp_error(struct bgp_conn *c, uint code, uint subcode, byte *data, int len) { struct bgp_proto *p = c->bgp; if (c->state == BS_CLOSE) return; - bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, (len > 0) ? len : -len); + bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, ABS(len)); bgp_store_error(p, c, BE_BGP_TX, (code << 16) | subcode); bgp_conn_enter_close_state(c); @@ -1426,13 +1676,13 @@ bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int l c->notify_subcode = subcode; c->notify_data = data; c->notify_size = (len > 0) ? len : 0; - bgp_schedule_packet(c, PKT_NOTIFICATION); + bgp_schedule_packet(c, NULL, PKT_NOTIFICATION); if (code != 6) - { - bgp_update_startup_delay(p); - bgp_stop(p, 0); - } + { + bgp_update_startup_delay(p); + bgp_stop(p, 0); + } } /** @@ -1471,19 +1721,19 @@ static const char * bgp_last_errmsg(struct bgp_proto *p) { switch (p->last_error_class) - { - case BE_MISC: - return bgp_misc_errors[p->last_error_code]; - case BE_SOCKET: - return (p->last_error_code == 0) ? "Connection closed" : strerror(p->last_error_code); - case BE_BGP_RX: - case BE_BGP_TX: - return bgp_error_dsc(p->last_error_code >> 16, p->last_error_code & 0xFF); - case BE_AUTO_DOWN: - return bgp_auto_errors[p->last_error_code]; - default: - return ""; - } + { + case BE_MISC: + return bgp_misc_errors[p->last_error_code]; + case BE_SOCKET: + return (p->last_error_code == 0) ? "Connection closed" : strerror(p->last_error_code); + case BE_BGP_RX: + case BE_BGP_TX: + return bgp_error_dsc(p->last_error_code >> 16, p->last_error_code & 0xFF); + case BE_AUTO_DOWN: + return bgp_auto_errors[p->last_error_code]; + default: + return ""; + } } static const char * @@ -1514,48 +1764,164 @@ bgp_get_status(struct proto *P, byte *buf) } static void +bgp_show_afis(int code, char *s, u32 *afis, uint count) +{ + buffer b; + LOG_BUFFER_INIT(b); + + buffer_puts(&b, s); + + for (u32 *af = afis; af < (afis + count); af++) + { + const struct bgp_af_desc *desc = bgp_get_af_desc(*af); + if (desc) + buffer_print(&b, " %s", desc->name); + else + buffer_print(&b, " <%u/%u>", BGP_AFI(*af), BGP_SAFI(*af)); + } + + if (b.pos == b.end) + strcpy(b.end - 32, " ... <too long>"); + + cli_msg(code, b.start); +} + +static void +bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) +{ + struct bgp_af_caps *ac; + uint any_mp_bgp = 0; + uint any_gr_able = 0; + uint any_add_path = 0; + uint any_ext_next_hop = 0; + u32 *afl1 = alloca(caps->af_count * sizeof(u32)); + u32 *afl2 = alloca(caps->af_count * sizeof(u32)); + uint afn1, afn2; + + WALK_AF_CAPS(caps, ac) + { + any_mp_bgp |= ac->ready; + any_gr_able |= ac->gr_able; + any_add_path |= ac->add_path; + any_ext_next_hop |= ac->ext_next_hop; + } + + if (any_mp_bgp) + { + cli_msg(-1006, " Multiprotocol"); + + afn1 = 0; + WALK_AF_CAPS(caps, ac) + if (ac->ready) + afl1[afn1++] = ac->afi; + + bgp_show_afis(-1006, " AF announced:", afl1, afn1); + } + + if (caps->route_refresh) + cli_msg(-1006, " Route refresh"); + + if (any_ext_next_hop) + { + cli_msg(-1006, " Extended next hop"); + + afn1 = 0; + WALK_AF_CAPS(caps, ac) + if (ac->ext_next_hop) + afl1[afn1++] = ac->afi; + + bgp_show_afis(-1006, " IPv6 nexthop:", afl1, afn1); + } + + if (caps->ext_messages) + cli_msg(-1006, " Extended message"); + + if (caps->gr_aware) + cli_msg(-1006, " Graceful restart"); + + if (any_gr_able) + { + /* Continues from gr_aware */ + cli_msg(-1006, " Restart time: %u", caps->gr_time); + if (caps->gr_flags & BGP_GRF_RESTART) + cli_msg(-1006, " Restart recovery"); + + afn1 = afn2 = 0; + WALK_AF_CAPS(caps, ac) + { + if (ac->gr_able) + afl1[afn1++] = ac->afi; + + if (ac->gr_af_flags & BGP_GRF_FORWARDING) + afl2[afn2++] = ac->afi; + } + + bgp_show_afis(-1006, " AF supported:", afl1, afn1); + bgp_show_afis(-1006, " AF preserved:", afl2, afn2); + } + + if (caps->as4_support) + cli_msg(-1006, " 4-octet AS numbers"); + + if (any_add_path) + { + cli_msg(-1006, " ADD-PATH"); + + afn1 = afn2 = 0; + WALK_AF_CAPS(caps, ac) + { + if (ac->add_path & BGP_ADD_PATH_RX) + afl1[afn1++] = ac->afi; + + if (ac->add_path & BGP_ADD_PATH_TX) + afl2[afn2++] = ac->afi; + } + + bgp_show_afis(-1006, " RX:", afl1, afn1); + bgp_show_afis(-1006, " TX:", afl2, afn2); + } + + if (caps->enhanced_refresh) + cli_msg(-1006, " Enhanced refresh"); +} + +static void bgp_show_proto_info(struct proto *P) { struct bgp_proto *p = (struct bgp_proto *) P; - struct bgp_conn *c = p->conn; - - proto_show_basic_info(P); cli_msg(-1006, " BGP state: %s", bgp_state_dsc(p)); cli_msg(-1006, " Neighbor address: %I%J", p->cf->remote_ip, p->cf->iface); cli_msg(-1006, " Neighbor AS: %u", p->remote_as); - if (p->gr_active) + if (p->gr_active_num) cli_msg(-1006, " Neighbor graceful restart active"); if (P->proto_state == PS_START) - { - struct bgp_conn *oc = &p->outgoing_conn; + { + struct bgp_conn *oc = &p->outgoing_conn; - if ((p->start_state < BSS_CONNECT) && - (p->startup_timer->expires)) - cli_msg(-1006, " Error wait: %d/%d", - p->startup_timer->expires - now, p->startup_delay); + if ((p->start_state < BSS_CONNECT) && + (p->startup_timer->expires)) + cli_msg(-1006, " Error wait: %d/%d", + p->startup_timer->expires - now, p->startup_delay); - if ((oc->state == BS_ACTIVE) && - (oc->connect_retry_timer->expires)) - cli_msg(-1006, " Connect delay: %d/%d", - oc->connect_retry_timer->expires - now, p->cf->connect_delay_time); + if ((oc->state == BS_ACTIVE) && + (oc->connect_timer->expires)) + cli_msg(-1006, " Connect delay: %d/%d", + oc->connect_timer->expires - now, p->cf->connect_delay_time); - if (p->gr_active && p->gr_timer->expires) - cli_msg(-1006, " Restart timer: %d/-", p->gr_timer->expires - now); - } + if (p->gr_active_num && p->gr_timer->expires) + cli_msg(-1006, " Restart timer: %d/-", p->gr_timer->expires - now); + } else if (P->proto_state == PS_UP) - { - cli_msg(-1006, " Neighbor ID: %R", p->remote_id); - cli_msg(-1006, " Neighbor caps: %s%s%s%s%s%s%s", - c->peer_refresh_support ? " refresh" : "", - c->peer_enhanced_refresh_support ? " enhanced-refresh" : "", - c->peer_gr_able ? " restart-able" : (c->peer_gr_aware ? " restart-aware" : ""), - c->peer_as4_support ? " AS4" : "", - (c->peer_add_path & ADD_PATH_RX) ? " add-path-rx" : "", - (c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : "", - c->peer_ext_messages_support ? " ext-messages" : ""); + { + cli_msg(-1006, " Neighbor ID: %R", p->remote_id); + cli_msg(-1006, " Local capabilities"); + bgp_show_capabilities(p, p->conn->local_caps); + cli_msg(-1006, " Neighbor capabilities"); + bgp_show_capabilities(p, p->conn->remote_caps); +/* XXXX cli_msg(-1006, " Session: %s%s%s%s%s%s%s%s", p->is_internal ? "internal" : "external", p->cf->multihop ? " multihop" : "", @@ -1565,35 +1931,52 @@ bgp_show_proto_info(struct proto *P) p->add_path_rx ? " add-path-rx" : "", p->add_path_tx ? " add-path-tx" : "", p->ext_messages ? " ext-messages" : ""); - cli_msg(-1006, " Source address: %I", p->source_addr); - if (P->cf->in_limit) - cli_msg(-1006, " Route limit: %d/%d", - p->p.stats.imp_routes + p->p.stats.filt_routes, P->cf->in_limit->limit); - cli_msg(-1006, " Hold timer: %d/%d", - tm_remains(c->hold_timer), c->hold_time); - cli_msg(-1006, " Keepalive timer: %d/%d", - tm_remains(c->keepalive_timer), c->keepalive_time); - } +*/ + cli_msg(-1006, " Source address: %I", p->source_addr); + cli_msg(-1006, " Hold timer: %d/%d", + tm_remains(p->conn->hold_timer), p->conn->hold_time); + cli_msg(-1006, " Keepalive timer: %d/%d", + tm_remains(p->conn->keepalive_timer), p->conn->keepalive_time); + } if ((p->last_error_class != BE_NONE) && (p->last_error_class != BE_MAN_DOWN)) - { - const char *err1 = bgp_err_classes[p->last_error_class]; - const char *err2 = bgp_last_errmsg(p); - cli_msg(-1006, " Last error: %s%s", err1, err2); - } + { + const char *err1 = bgp_err_classes[p->last_error_class]; + const char *err2 = bgp_last_errmsg(p); + cli_msg(-1006, " Last error: %s%s", err1, err2); + } + + { + /* XXXX ?? */ + struct channel *c; + WALK_LIST(c, p->p.channels) + channel_show_info(c); + } } +struct channel_class channel_bgp = { + .channel_size = sizeof(struct bgp_channel), + .config_size = sizeof(struct bgp_channel_config), + .init = bgp_channel_init, + .start = bgp_channel_start, + .shutdown = bgp_channel_shutdown, + .cleanup = bgp_channel_cleanup, + .reconfigure = bgp_channel_reconfigure, +}; + struct protocol proto_bgp = { .name = "BGP", .template = "bgp%d", .attr_class = EAP_BGP, .preference = DEF_PREF_BGP, + .channel_mask = NB_IP | NB_FLOW4 | NB_FLOW6, + .proto_size = sizeof(struct bgp_proto), .config_size = sizeof(struct bgp_config), + .postconfig = bgp_postconfig, .init = bgp_init, .start = bgp_start, .shutdown = bgp_shutdown, - .cleanup = bgp_cleanup, .reconfigure = bgp_reconfigure, .copy_config = bgp_copy_config, .get_status = bgp_get_status, |