Diffstat (limited to 'proto')
63 files changed, 11085 insertions, 6130 deletions
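The patch below leans heavily on circular sequence-number arithmetic: ge_mod64k() (first babel.c hunk) implements the RFC 1982 style "greater or equal mod 2^16" test as (u16)(a - b) < 0x8000. A minimal standalone sketch of that comparison, kept outside the patch itself and using standard C types in place of BIRD's uint/u16 typedefs; the main() harness and example values are illustrative only:

/* Sketch of the mod-2^16 "greater or equal" test used by babel.c
 * (ge_mod64k); uint16_t replaces BIRD's u16 typedef. */
#include <assert.h>
#include <stdint.h>

static inline int ge_mod64k(unsigned a, unsigned b)
{ return (uint16_t)(a - b) < 0x8000; }

int main(void)
{
  assert(ge_mod64k(5, 3));        /* 5 >= 3 in the ordinary sense */
  assert(!ge_mod64k(3, 5));       /* ... and 3 < 5 */
  assert(ge_mod64k(7, 7));        /* equality counts as "greater or equal" */
  assert(ge_mod64k(2, 0xfff0));   /* 2 is "newer" than 0xfff0 after seqno wraparound */
  return 0;
}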
@@ -4,7 +4,8 @@ C bfd
 C bgp
 C ospf
 C pipe
-C rip
 C radv
+C rip
+C rpki
 C static
 S ../nest/rt-dev.c
diff --git a/proto/babel/Makefile b/proto/babel/Makefile
index 400ffbac..a5b4a13b 100644
--- a/proto/babel/Makefile
+++ b/proto/babel/Makefile
@@ -1,5 +1,6 @@
-source=babel.c packets.c
-root-rel=../../
-dir-name=proto/babel
+src := babel.c packets.c
+obj := $(src-o-files)
+$(all-daemon)
+$(cf-local)

-include ../../Rules
+tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file
diff --git a/proto/babel/babel.c b/proto/babel/babel.c
index 38be6909..aa7e8b68 100644
--- a/proto/babel/babel.c
+++ b/proto/babel/babel.c
@@ -2,6 +2,8 @@
  * BIRD -- The Babel protocol
  *
  * Copyright (c) 2015--2016 Toke Hoiland-Jorgensen
+ * (c) 2016--2017 Ondrej Zajicek <santiago@crfreenet.org>
+ * (c) 2016--2017 CZ.NIC z.s.p.o.
  *
  * Can be freely distributed and used under the terms of the GNU GPL.
  *
@@ -29,17 +31,14 @@
  *
  * The main route selection is done in babel_select_route(). This is called when
  * an entry is updated by receiving updates from the network or when modified by
- * internal timers. It performs feasibility checks on the available routes for
- * the prefix and selects the one with the lowest metric to be announced to the
- * core.
+ * internal timers. The function selects from feasible and reachable routes the
+ * one with the lowest metric to be announced to the core.
  */

 #include <stdlib.h>

 #include "babel.h"

-#define OUR_ROUTE(r) (r->neigh == NULL)
-
 /*
  * Is one number greater or equal than another mod 2^16? This is based on the
  * definition of serial number space in RFC 1982. Note that arguments are of
@@ -48,47 +47,49 @@
 static inline int ge_mod64k(uint a, uint b)
 { return (u16)(a - b) < 0x8000; }

-static void babel_dump_entry(struct babel_entry *e);
-static void babel_dump_route(struct babel_route *r);
-static void babel_select_route(struct babel_entry *e);
-static void babel_send_route_request(struct babel_entry *e, struct babel_neighbor *n);
-static void babel_send_wildcard_request(struct babel_iface *ifa);
-static int babel_cache_seqno_request(struct babel_proto *p, ip_addr prefix, u8 plen,
-                                     u64 router_id, u16 seqno);
-static void babel_trigger_iface_update(struct babel_iface *ifa);
-static void babel_trigger_update(struct babel_proto *p);
-static void babel_send_seqno_request(struct babel_entry *e);
+static void babel_expire_requests(struct babel_proto *p, struct babel_entry *e);
+static void babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_route *mod);
+static inline void babel_announce_retraction(struct babel_proto *p, struct babel_entry *e);
+static void babel_send_route_request(struct babel_proto *p, struct babel_entry *e, struct babel_neighbor *n);
+static void babel_send_seqno_request(struct babel_proto *p, struct babel_entry *e, struct babel_seqno_request *sr);
+static void babel_update_cost(struct babel_neighbor *n);
 static inline void babel_kick_timer(struct babel_proto *p);
 static inline void babel_iface_kick_timer(struct babel_iface *ifa);

+static inline void babel_lock_neighbor(struct babel_neighbor *nbr)
+{ if (nbr) nbr->uc++; }
+
+static inline void babel_unlock_neighbor(struct babel_neighbor *nbr)
+{ if (nbr && !--nbr->uc) mb_free(nbr); }
+
 /*
  * Functions to maintain data structures
  */

 static void
-babel_init_entry(struct fib_node *n)
+babel_init_entry(void *E)
 {
-  struct babel_entry *e = (void *) n;
-  e->proto = NULL;
-  e->selected_in = NULL;
-  e->selected_out = NULL;
-  e->updated = now;
+  struct babel_entry *e = E;
+
+  e->updated = current_time();
+  init_list(&e->requests);
   init_list(&e->sources);
   init_list(&e->routes);
 }

 static inline struct babel_entry *
-babel_find_entry(struct babel_proto *p, ip_addr prefix, u8 plen)
+babel_find_entry(struct babel_proto *p, const net_addr *n)
 {
-  return fib_find(&p->rtable, &prefix, plen);
+  struct fib *rtable = (n->type == NET_IP4) ?
&p->ip4_rtable : &p->ip6_rtable; + return fib_find(rtable, n); } static struct babel_entry * -babel_get_entry(struct babel_proto *p, ip_addr prefix, u8 plen) +babel_get_entry(struct babel_proto *p, const net_addr *n) { - struct babel_entry *e = fib_get(&p->rtable, &prefix, plen); - e->proto = p; + struct fib *rtable = (n->type == NET_IP4) ? &p->ip4_rtable : &p->ip6_rtable; + struct babel_entry *e = fib_get(rtable, n); return e; } @@ -105,9 +106,8 @@ babel_find_source(struct babel_entry *e, u64 router_id) } static struct babel_source * -babel_get_source(struct babel_entry *e, u64 router_id) +babel_get_source(struct babel_proto *p, struct babel_entry *e, u64 router_id) { - struct babel_proto *p = e->proto; struct babel_source *s = babel_find_source(e, router_id); if (s) @@ -115,7 +115,7 @@ babel_get_source(struct babel_entry *e, u64 router_id) s = sl_alloc(p->source_slab); s->router_id = router_id; - s->expires = now + BABEL_GARBAGE_INTERVAL; + s->expires = current_time() + BABEL_GARBAGE_INTERVAL; s->seqno = 0; s->metric = BABEL_INFINITY; add_tail(&e->sources, NODE s); @@ -124,14 +124,14 @@ babel_get_source(struct babel_entry *e, u64 router_id) } static void -babel_expire_sources(struct babel_entry *e) +babel_expire_sources(struct babel_proto *p, struct babel_entry *e) { - struct babel_proto *p = e->proto; struct babel_source *n, *nx; + btime now_ = current_time(); WALK_LIST_DELSAFE(n, nx, e->sources) { - if (n->expires && n->expires <= now) + if (n->expires && n->expires <= now_) { rem_node(NODE n); sl_free(p->source_slab, n); @@ -152,9 +152,8 @@ babel_find_route(struct babel_entry *e, struct babel_neighbor *n) } static struct babel_route * -babel_get_route(struct babel_entry *e, struct babel_neighbor *nbr) +babel_get_route(struct babel_proto *p, struct babel_entry *e, struct babel_neighbor *nbr) { - struct babel_proto *p = e->proto; struct babel_route *r = babel_find_route(e, nbr); if (r) @@ -162,94 +161,91 @@ babel_get_route(struct babel_entry *e, struct babel_neighbor *nbr) r = sl_alloc(p->route_slab); memset(r, 0, sizeof(*r)); + r->e = e; + r->neigh = nbr; add_tail(&e->routes, NODE r); - - if (nbr) - { - r->neigh = nbr; - r->expires = now + BABEL_GARBAGE_INTERVAL; - add_tail(&nbr->routes, NODE &r->neigh_route); - } + add_tail(&nbr->routes, NODE &r->neigh_route); return r; } -static void -babel_flush_route(struct babel_route *r) +static inline void +babel_retract_route(struct babel_proto *p, struct babel_route *r) { - struct babel_proto *p = r->e->proto; + r->metric = r->advert_metric = BABEL_INFINITY; - DBG("Babel: Flush route %I/%d router_id %lR neigh %I\n", - r->e->n.prefix, r->e->n.pxlen, r->router_id, r->neigh ? 
r->neigh->addr : IPA_NONE); - - rem_node(NODE r); + if (r == r->e->selected) + babel_select_route(p, r->e, r); +} - if (r->neigh) - rem_node(&r->neigh_route); +static void +babel_flush_route(struct babel_proto *p, struct babel_route *r) +{ + DBG("Babel: Flush route %N router_id %lR neigh %I\n", + r->e->n.addr, r->router_id, r->neigh->addr); - if (r->e->selected_in == r) - r->e->selected_in = NULL; + rem_node(NODE r); + rem_node(&r->neigh_route); - if (r->e->selected_out == r) - r->e->selected_out = NULL; + if (r->e->selected == r) + r->e->selected = NULL; sl_free(p->route_slab, r); } static void -babel_expire_route(struct babel_route *r) +babel_expire_route(struct babel_proto *p, struct babel_route *r) { - struct babel_proto *p = r->e->proto; - struct babel_entry *e = r->e; + struct babel_config *cf = (void *) p->p.cf; - TRACE(D_EVENTS, "Route expiry timer for %I/%d router-id %lR fired", - e->n.prefix, e->n.pxlen, r->router_id); + TRACE(D_EVENTS, "Route expiry timer for %N router-id %lR fired", + r->e->n.addr, r->router_id); if (r->metric < BABEL_INFINITY) { - r->metric = BABEL_INFINITY; - r->expires = now + r->expiry_interval; + r->metric = r->advert_metric = BABEL_INFINITY; + r->expires = current_time() + cf->hold_time; } else { - babel_flush_route(r); + babel_flush_route(p, r); } } static void -babel_refresh_route(struct babel_route *r) +babel_refresh_route(struct babel_proto *p, struct babel_route *r) { - if (!OUR_ROUTE(r) && (r == r->e->selected_in)) - babel_send_route_request(r->e, r->neigh); + if (r == r->e->selected) + babel_send_route_request(p, r->e, r->neigh); r->refresh_time = 0; } static void -babel_expire_routes(struct babel_proto *p) +babel_expire_routes_(struct babel_proto *p, struct fib *rtable) { - struct babel_entry *e; + struct babel_config *cf = (void *) p->p.cf; struct babel_route *r, *rx; struct fib_iterator fit; + btime now_ = current_time(); - FIB_ITERATE_INIT(&fit, &p->rtable); + FIB_ITERATE_INIT(&fit, rtable); loop: - FIB_ITERATE_START(&p->rtable, &fit, n) + FIB_ITERATE_START(rtable, &fit, struct babel_entry, e) { - e = (struct babel_entry *) n; int changed = 0; WALK_LIST_DELSAFE(r, rx, e->routes) { - if (r->refresh_time && r->refresh_time <= now) - babel_refresh_route(r); + if (r->refresh_time && r->refresh_time <= now_) + babel_refresh_route(p, r); - if (r->expires && r->expires <= now) + if (r->expires && r->expires <= now_) { - babel_expire_route(r); - changed = 1; + changed = changed || (r == e->selected); + babel_expire_route(p, r); } } @@ -258,25 +254,148 @@ loop: /* * We have to restart the iteration because there may be a cascade of * synchronous events babel_select_route() -> nest table change -> - * babel_rt_notify() -> p->rtable change, invalidating hidden variables. + * babel_rt_notify() -> rtable change, invalidating hidden variables. 
*/ + FIB_ITERATE_PUT(&fit); + babel_select_route(p, e, NULL); + goto loop; + } + + /* Clean up stale entries */ + if ((e->valid == BABEL_ENTRY_STALE) && ((e->updated + cf->hold_time) <= now_)) + e->valid = BABEL_ENTRY_DUMMY; - FIB_ITERATE_PUT(&fit, n); - babel_select_route(e); + /* Clean up unreachable route */ + if (e->unreachable && (!e->valid || (e->router_id == p->router_id))) + { + FIB_ITERATE_PUT(&fit); + babel_announce_retraction(p, e); goto loop; } - babel_expire_sources(e); + babel_expire_sources(p, e); + babel_expire_requests(p, e); /* Remove empty entries */ - if (EMPTY_LIST(e->sources) && EMPTY_LIST(e->routes)) + if (!e->valid && EMPTY_LIST(e->routes) && EMPTY_LIST(e->sources) && EMPTY_LIST(e->requests)) { - FIB_ITERATE_PUT(&fit, n); - fib_delete(&p->rtable, e); + FIB_ITERATE_PUT(&fit); + fib_delete(rtable, e); goto loop; } } - FIB_ITERATE_END(n); + FIB_ITERATE_END; +} + +static void +babel_expire_routes(struct babel_proto *p) +{ + babel_expire_routes_(p, &p->ip4_rtable); + babel_expire_routes_(p, &p->ip6_rtable); +} + +static inline int seqno_request_valid(struct babel_seqno_request *sr) +{ return !sr->nbr || sr->nbr->ifa; } + +/* + * Add seqno request to the table of pending requests (RFC 6216 3.2.6) and send + * it to network. Do nothing if it is already in the table. + */ + +static void +babel_add_seqno_request(struct babel_proto *p, struct babel_entry *e, + u64 router_id, u16 seqno, u8 hop_count, + struct babel_neighbor *nbr) +{ + struct babel_seqno_request *sr; + + WALK_LIST(sr, e->requests) + if (sr->router_id == router_id) + { + /* Found matching or newer */ + if (ge_mod64k(sr->seqno, seqno) && seqno_request_valid(sr)) + return; + + /* Found older */ + babel_unlock_neighbor(sr->nbr); + rem_node(NODE sr); + goto found; + } + + /* No entries found */ + sr = sl_alloc(p->seqno_slab); + +found: + sr->router_id = router_id; + sr->seqno = seqno; + sr->hop_count = hop_count; + sr->count = 0; + sr->expires = current_time() + BABEL_SEQNO_REQUEST_EXPIRY; + babel_lock_neighbor(sr->nbr = nbr); + add_tail(&e->requests, NODE sr); + + babel_send_seqno_request(p, e, sr); +} + +static void +babel_remove_seqno_request(struct babel_proto *p, struct babel_seqno_request *sr) +{ + babel_unlock_neighbor(sr->nbr); + rem_node(NODE sr); + sl_free(p->seqno_slab, sr); +} + +static int +babel_satisfy_seqno_request(struct babel_proto *p, struct babel_entry *e, + u64 router_id, u16 seqno) +{ + struct babel_seqno_request *sr; + + WALK_LIST(sr, e->requests) + if ((sr->router_id == router_id) && ge_mod64k(seqno, sr->seqno)) + { + /* Found the request, remove it */ + babel_remove_seqno_request(p, sr); + return 1; + } + + return 0; +} + +static void +babel_expire_requests(struct babel_proto *p, struct babel_entry *e) +{ + struct babel_seqno_request *sr, *srx; + btime now_ = current_time(); + + WALK_LIST_DELSAFE(sr, srx, e->requests) + { + /* Remove seqno requests sent to dead neighbors */ + if (!seqno_request_valid(sr)) + { + babel_remove_seqno_request(p, sr); + continue; + } + + /* Handle expired requests - resend or remove */ + if (sr->expires && sr->expires <= now_) + { + if (sr->count < BABEL_SEQNO_REQUEST_RETRY) + { + sr->count++; + sr->expires += (BABEL_SEQNO_REQUEST_EXPIRY << sr->count); + babel_send_seqno_request(p, e, sr); + } + else + { + TRACE(D_EVENTS, "Seqno request for %N router-id %lR expired", + e->n.addr, sr->router_id); + + babel_remove_seqno_request(p, sr); + continue; + } + } + } } static struct babel_neighbor * @@ -294,61 +413,79 @@ babel_find_neighbor(struct babel_iface *ifa, ip_addr 
addr) static struct babel_neighbor * babel_get_neighbor(struct babel_iface *ifa, ip_addr addr) { + struct babel_proto *p = ifa->proto; struct babel_neighbor *nbr = babel_find_neighbor(ifa, addr); if (nbr) return nbr; + TRACE(D_EVENTS, "New neighbor %I on %s", addr, ifa->iface->name); + nbr = mb_allocz(ifa->pool, sizeof(struct babel_neighbor)); nbr->ifa = ifa; nbr->addr = addr; + nbr->rxcost = BABEL_INFINITY; nbr->txcost = BABEL_INFINITY; + nbr->cost = BABEL_INFINITY; init_list(&nbr->routes); + babel_lock_neighbor(nbr); add_tail(&ifa->neigh_list, NODE nbr); return nbr; } static void -babel_flush_neighbor(struct babel_neighbor *nbr) +babel_flush_neighbor(struct babel_proto *p, struct babel_neighbor *nbr) { - struct babel_proto *p = nbr->ifa->proto; + struct babel_route *r; node *n; - TRACE(D_EVENTS, "Flushing neighbor %I", nbr->addr); + TRACE(D_EVENTS, "Removing neighbor %I on %s", nbr->addr, nbr->ifa->iface->name); WALK_LIST_FIRST(n, nbr->routes) { - struct babel_route *r = SKIP_BACK(struct babel_route, neigh_route, n); - struct babel_entry *e = r->e; - int selected = (r == e->selected_in); - - babel_flush_route(r); - - if (selected) - babel_select_route(e); + r = SKIP_BACK(struct babel_route, neigh_route, n); + babel_retract_route(p, r); + babel_flush_route(p, r); } + nbr->ifa = NULL; rem_node(NODE nbr); - mb_free(nbr); + babel_unlock_neighbor(nbr); } static void -babel_expire_ihu(struct babel_neighbor *nbr) +babel_expire_ihu(struct babel_proto *p, struct babel_neighbor *nbr) { + TRACE(D_EVENTS, "IHU from nbr %I on %s expired", nbr->addr, nbr->ifa->iface->name); + nbr->txcost = BABEL_INFINITY; + nbr->ihu_expiry = 0; + babel_update_cost(nbr); } static void -babel_expire_hello(struct babel_neighbor *nbr) +babel_expire_hello(struct babel_proto *p, struct babel_neighbor *nbr, btime now_) { +again: nbr->hello_map <<= 1; if (nbr->hello_cnt < 16) nbr->hello_cnt++; - if (!nbr->hello_map) - babel_flush_neighbor(nbr); + nbr->hello_expiry += nbr->last_hello_int; + + /* We may expire multiple hellos if last_hello_int is too short */ + if (nbr->hello_map && nbr->hello_expiry <= now_) + goto again; + + TRACE(D_EVENTS, "Hello from nbr %I on %s expired, %d left", + nbr->addr, nbr->ifa->iface->name, u32_popcount(nbr->hello_map)); + + if (nbr->hello_map) + babel_update_cost(nbr); + else + babel_flush_neighbor(p, nbr); } static void @@ -356,16 +493,17 @@ babel_expire_neighbors(struct babel_proto *p) { struct babel_iface *ifa; struct babel_neighbor *nbr, *nbx; + btime now_ = current_time(); WALK_LIST(ifa, p->interfaces) { WALK_LIST_DELSAFE(nbr, nbx, ifa->neigh_list) { - if (nbr->ihu_expiry && nbr->ihu_expiry <= now) - babel_expire_ihu(nbr); + if (nbr->ihu_expiry && nbr->ihu_expiry <= now_) + babel_expire_ihu(p, nbr); - if (nbr->hello_expiry && nbr->hello_expiry <= now) - babel_expire_hello(nbr); + if (nbr->hello_expiry && nbr->hello_expiry <= now_) + babel_expire_hello(p, nbr, now_); } } } @@ -399,66 +537,81 @@ babel_is_feasible(struct babel_source *s, u16 seqno, u16 metric) ((seqno == s->seqno) && (metric < s->metric)); } -static u16 -babel_compute_rxcost(struct babel_neighbor *n) +/* Simple additive metric - Appendix 3.1 in the RFC */ +static inline u16 +babel_compute_metric(struct babel_neighbor *n, uint metric) { - struct babel_iface *ifa = n->ifa; - u8 cnt, missed; - u16 map=n->hello_map; - - if (!map) return BABEL_INFINITY; - cnt = u32_popcount(map); // number of bits set - missed = n->hello_cnt-cnt; + return MIN(metric + n->cost, BABEL_INFINITY); +} - if (ifa->cf->type == BABEL_IFACE_TYPE_WIRELESS) - { 
- /* ETX - Appendix 2.2 in the RFC. +static void +babel_update_cost(struct babel_neighbor *nbr) +{ + struct babel_proto *p = nbr->ifa->proto; + struct babel_iface_config *cf = nbr->ifa->cf; + uint rcv = u32_popcount(nbr->hello_map); // number of bits set + uint max = nbr->hello_cnt; + uint rxcost = BABEL_INFINITY; /* Cost to announce in IHU */ + uint txcost = BABEL_INFINITY; /* Effective cost for route selection */ - beta = prob. of successful transmission. - rxcost = BABEL_RXCOST_WIRELESS/beta + if (!rcv || !nbr->ifa->up) + goto done; - Since: beta = 1-missed/n->hello_cnt = cnt/n->hello_cnt - Then: rxcost = BABEL_RXCOST_WIRELESS * n->hello_cnt / cnt - */ - if (!cnt) return BABEL_INFINITY; - return BABEL_RXCOST_WIRELESS * n->hello_cnt / cnt; - } - else + switch (cf->type) { + case BABEL_IFACE_TYPE_WIRED: /* k-out-of-j selection - Appendix 2.1 in the RFC. */ - DBG("Babel: Missed %d hellos from %I\n", missed, n->addr); - /* Link is bad if more than half the expected hellos were lost */ - return (missed > n->hello_cnt/2) ? BABEL_INFINITY : ifa->cf->rxcost; - } -} + /* Link is bad if less than cf->limit/16 of expected hellos were received */ + if (rcv * 16 < cf->limit * max) + break; -static u16 -babel_compute_cost(struct babel_neighbor *n) -{ - struct babel_iface *ifa = n->ifa; - u16 rxcost = babel_compute_rxcost(n); - if (rxcost == BABEL_INFINITY) return rxcost; - else if (ifa->cf->type == BABEL_IFACE_TYPE_WIRELESS) - { - /* ETX - Appendix 2.2 in the RFC */ - return (MAX(n->txcost, BABEL_RXCOST_WIRELESS) * rxcost)/BABEL_RXCOST_WIRELESS; + rxcost = cf->rxcost; + txcost = nbr->txcost; + break; + + case BABEL_IFACE_TYPE_WIRELESS: + /* + * ETX - Appendix 2.2 in the RFC. + * + * alpha = prob. of successful transmission estimated by the neighbor + * beta = prob. of successful transmission estimated by the router + * rxcost = nominal rxcost of the router / beta + * txcost = nominal rxcost of the neighbor / (alpha * beta) + * = received txcost / beta + * + * Note that received txcost is just neighbor's rxcost. Beta is rcv/max, + * we use inverse values of beta (i.e. max/rcv) to stay in integers. + */ + rxcost = MIN( cf->rxcost * max / rcv, BABEL_INFINITY); + txcost = MIN(nbr->txcost * max / rcv, BABEL_INFINITY); + break; } - else + +done: + /* If RX cost changed, send IHU with next Hello */ + if (rxcost != nbr->rxcost) { - /* k-out-of-j selection - Appendix 2.1 in the RFC. */ - return n->txcost; + nbr->rxcost = rxcost; + nbr->ihu_cnt = 0; } -} -/* Simple additive metric - Appendix 3.1 in the RFC */ -static u16 -babel_compute_metric(struct babel_neighbor *n, uint metric) -{ - metric += babel_compute_cost(n); - return MIN(metric, BABEL_INFINITY); -} + /* If link cost changed, run route selection */ + if (txcost != nbr->cost) + { + TRACE(D_EVENTS, "Cost of nbr %I on %s changed from %u to %u", + nbr->addr, nbr->ifa->iface->name, nbr->cost, txcost); + nbr->cost = txcost; + + struct babel_route *r; node *n; + WALK_LIST2(r, n, nbr->routes, neigh_route) + { + r->metric = babel_compute_metric(nbr, r->advert_metric); + babel_select_route(p, r->e, r); + } + } +} /** * babel_announce_rte - announce selected route to the core @@ -466,123 +619,151 @@ babel_compute_metric(struct babel_neighbor *n, uint metric) * @e: Babel route entry to announce * * This function announces a Babel entry to the core if it has a selected - * incoming path, and retracts it otherwise. If the selected entry has infinite - * metric, the route is announced as unreachable. + * incoming path, and retracts it otherwise. 
If there is no selected route but + * the entry is valid and ours, the unreachable route is announced instead. */ static void babel_announce_rte(struct babel_proto *p, struct babel_entry *e) { - struct babel_route *r = e->selected_in; + struct babel_route *r = e->selected; + struct channel *c = (e->n.addr->type == NET_IP4) ? p->ip4_channel : p->ip6_channel; if (r) { - net *n = net_get(p->p.table, e->n.prefix, e->n.pxlen); - rta A = { + rta a0 = { .src = p->p.main_source, .source = RTS_BABEL, .scope = SCOPE_UNIVERSE, - .cast = RTC_UNICAST, - .dest = r->metric == BABEL_INFINITY ? RTD_UNREACHABLE : RTD_ROUTER, - .flags = 0, + .dest = RTD_UNICAST, .from = r->neigh->addr, - .iface = r->neigh->ifa->iface, + .nh.gw = r->next_hop, + .nh.iface = r->neigh->ifa->iface, }; - if (r->metric < BABEL_INFINITY) - A.gw = r->next_hop; - - rta *a = rta_lookup(&A); + rta *a = rta_lookup(&a0); rte *rte = rte_get_temp(a); + rte->u.babel.seqno = r->seqno; rte->u.babel.metric = r->metric; rte->u.babel.router_id = r->router_id; - rte->net = n; rte->pflags = 0; - rte_update(&p->p, n, rte); + e->unreachable = 0; + rte_update2(c, e->n.addr, rte, p->p.main_source); + } + else if (e->valid && (e->router_id != p->router_id)) + { + /* Unreachable */ + rta a0 = { + .src = p->p.main_source, + .source = RTS_BABEL, + .scope = SCOPE_UNIVERSE, + .dest = RTD_UNREACHABLE, + }; + + rta *a = rta_lookup(&a0); + rte *rte = rte_get_temp(a); + memset(&rte->u.babel, 0, sizeof(rte->u.babel)); + rte->pflags = 0; + rte->pref = 1; + + e->unreachable = 1; + rte_update2(c, e->n.addr, rte, p->p.main_source); } else { /* Retraction */ - net *n = net_find(p->p.table, e->n.prefix, e->n.pxlen); - rte_update(&p->p, n, NULL); + e->unreachable = 0; + rte_update2(c, e->n.addr, NULL, p->p.main_source); } } +/* Special case of babel_announce_rte() just for retraction */ +static inline void +babel_announce_retraction(struct babel_proto *p, struct babel_entry *e) +{ + struct channel *c = (e->n.addr->type == NET_IP4) ? p->ip4_channel : p->ip6_channel; + e->unreachable = 0; + rte_update2(c, e->n.addr, NULL, p->p.main_source); +} + + /** * babel_select_route - select best route for given route entry + * @p: Babel protocol instance * @e: Babel entry to select the best route for + * @mod: Babel route that was modified or NULL if unspecified * - * Select the best feasible route for a given prefix among the routes received - * from peers, and propagate it to the nest. This just selects the feasible - * route with the lowest metric. + * Select the best reachable and feasible route for a given prefix among the + * routes received from peers, and propagate it to the nest. This just selects + * the reachable and feasible route with the lowest metric, but keeps selected + * the old one in case of tie. * * If no feasible route is available for a prefix that previously had a route - * selected, a seqno request is sent to try to get a valid route. In the - * meantime, the route is marked as infeasible in the nest (to blackhole packets - * going to it, as per the RFC). + * selected, a seqno request is sent to try to get a valid route. If the entry + * is valid and not owned by us, the unreachable route is announced to the nest + * (to blackhole packets going to it, as per section 2.8). It is later removed + * by babel_expire_routes(). Otherwise, the route is just removed from the nest. + * + * Argument @mod is used to optimize best route calculation. 
When specified, the + * function can assume that only the @mod route was modified to avoid full best + * route selection and announcement when non-best route was modified in minor + * way. The caller is advised to not call babel_select_route() when no change is + * done (e.g. periodic route updates) to avoid unnecessary announcements of the + * same best route. The caller is not required to call the function in case of a + * retraction of a non-best route. * - * If no feasible route is available, and no previous route is selected, the - * route is removed from the nest entirely. + * Note that the function does not active triggered updates. That is done by + * babel_rt_notify() when the change is propagated back to Babel. */ static void -babel_select_route(struct babel_entry *e) +babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_route *mod) { - struct babel_proto *p = e->proto; - struct babel_route *r, *cur = e->selected_in; + struct babel_route *r, *best = e->selected; - /* try to find the best feasible route */ - WALK_LIST(r, e->routes) - if (!OUR_ROUTE(r) && /* prevent propagating our own routes back to core */ - (!cur || r->metric < cur->metric) && - babel_is_feasible(babel_find_source(e, r->router_id), r->seqno, r->advert_metric)) - cur = r; - - if (cur && !OUR_ROUTE(cur) && - ((!e->selected_in && cur->metric < BABEL_INFINITY) || - (e->selected_in && cur->metric < e->selected_in->metric))) + /* Shortcut if only non-best was modified */ + if (mod && (mod != best)) { - TRACE(D_EVENTS, "Picked new route for prefix %I/%d: router id %lR metric %d", - e->n.prefix, e->n.pxlen, cur->router_id, cur->metric); - - e->selected_in = cur; - e->updated = now; - babel_announce_rte(p, e); + /* Either select modified route, or keep old best route */ + if ((mod->metric < (best ? best->metric : BABEL_INFINITY)) && mod->feasible) + best = mod; + else + return; } - else if (!cur || cur->metric == BABEL_INFINITY) + else { - /* Couldn't find a feasible route. If we have a selected route, that means - it just became infeasible; so set it's metric to infinite and install it - (as unreachable), then send a seqno request. - - babel_build_rte() will set the unreachable flag if the metric is BABEL_INFINITY.*/ - if (e->selected_in) - { - TRACE(D_EVENTS, "Lost feasible route for prefix %I/%d", - e->n.prefix, e->n.pxlen); - - e->selected_in->metric = BABEL_INFINITY; - e->updated = now; + /* Selected route may be modified and no longer admissible */ + if (!best || (best->metric == BABEL_INFINITY) || !best->feasible) + best = NULL; + + /* Find the best feasible route from all routes */ + WALK_LIST(r, e->routes) + if ((r->metric < (best ? best->metric : BABEL_INFINITY)) && r->feasible) + best = r; + } - babel_send_seqno_request(e); - babel_announce_rte(p, e); + if (best) + { + if (best != e->selected) + TRACE(D_EVENTS, "Picked new route for prefix %N: router-id %lR metric %d", + e->n.addr, best->router_id, best->metric); + } + else if (e->selected) + { + /* + * We have lost all feasible routes. We have to broadcast seqno request + * (Section 3.8.2.1) and keep unreachable route for a while (section 2.8). + * The later is done automatically by babel_announce_rte(). + */ - /* Section 3.6 of the RFC forbids an infeasible from being selected. This - is cleared after announcing the route to the core to make sure an - unreachable route is propagated first. 
*/ - e->selected_in = NULL; - } - else - { - /* No route currently selected, and no new one selected; this means we - don't have a route to this destination anymore (and were probably - called from an expiry timer). Remove the route from the nest. */ - TRACE(D_EVENTS, "Flushing route for prefix %I/%d", e->n.prefix, e->n.pxlen); - - e->selected_in = NULL; - e->updated = now; - babel_announce_rte(p, e); - } + TRACE(D_EVENTS, "Lost feasible route for prefix %N", e->n.addr); + if (e->valid && (e->selected->router_id == e->router_id)) + babel_add_seqno_request(p, e, e->selected->router_id, e->selected->seqno + 1, 0, NULL); } + else + return; + + e->selected = best; + babel_announce_rte(p, e); } /* @@ -610,11 +791,11 @@ babel_build_ihu(union babel_msg *msg, struct babel_iface *ifa, struct babel_neig msg->type = BABEL_TLV_IHU; msg->ihu.addr = n->addr; - msg->ihu.rxcost = babel_compute_rxcost(n); + msg->ihu.rxcost = n->rxcost; msg->ihu.interval = ifa->cf->ihu_interval; - TRACE(D_PACKETS, "Sending IHU for %I with rxcost %d interval %d", - msg->ihu.addr, msg->ihu.rxcost, msg->ihu.interval); + TRACE(D_PACKETS, "Sending IHU for %I with rxcost %d interval %t", + msg->ihu.addr, msg->ihu.rxcost, (btime) msg->ihu.interval); } static void @@ -623,6 +804,7 @@ babel_send_ihu(struct babel_iface *ifa, struct babel_neighbor *n) union babel_msg msg = {}; babel_build_ihu(&msg, ifa, n); babel_send_unicast(&msg, ifa, n->addr); + n->ihu_cnt = BABEL_IHU_INTERVAL_FACTOR; } static void @@ -631,14 +813,18 @@ babel_send_ihus(struct babel_iface *ifa) struct babel_neighbor *n; WALK_LIST(n, ifa->neigh_list) { - union babel_msg msg = {}; - babel_build_ihu(&msg, ifa, n); - babel_enqueue(&msg, ifa); + if (n->hello_cnt && (--n->ihu_cnt <= 0)) + { + union babel_msg msg = {}; + babel_build_ihu(&msg, ifa, n); + babel_enqueue(&msg, ifa); + n->ihu_cnt = BABEL_IHU_INTERVAL_FACTOR; + } } } static void -babel_send_hello(struct babel_iface *ifa, u8 send_ihu) +babel_send_hello(struct babel_iface *ifa) { struct babel_proto *p = ifa->proto; union babel_msg msg = {}; @@ -647,30 +833,26 @@ babel_send_hello(struct babel_iface *ifa, u8 send_ihu) msg.hello.seqno = ifa->hello_seqno++; msg.hello.interval = ifa->cf->hello_interval; - TRACE(D_PACKETS, "Sending hello on %s with seqno %d interval %d", - ifa->ifname, msg.hello.seqno, msg.hello.interval); + TRACE(D_PACKETS, "Sending hello on %s with seqno %d interval %t", + ifa->ifname, msg.hello.seqno, (btime) msg.hello.interval); babel_enqueue(&msg, ifa); - if (send_ihu) - babel_send_ihus(ifa); + babel_send_ihus(ifa); } static void -babel_send_route_request(struct babel_entry *e, struct babel_neighbor *n) +babel_send_route_request(struct babel_proto *p, struct babel_entry *e, struct babel_neighbor *n) { - struct babel_proto *p = e->proto; - struct babel_iface *ifa = n->ifa; union babel_msg msg = {}; - TRACE(D_PACKETS, "Sending route request for %I/%d to %I", - e->n.prefix, e->n.pxlen, n->addr); + TRACE(D_PACKETS, "Sending route request for %N to %I", + e->n.addr, n->addr); msg.type = BABEL_TLV_ROUTE_REQUEST; - msg.route_request.prefix = e->n.prefix; - msg.route_request.plen = e->n.pxlen; + net_copy(&msg.route_request.net, e->n.addr); - babel_send_unicast(&msg, ifa, n->addr); + babel_send_unicast(&msg, n->ifa, n->addr); } static void @@ -689,56 +871,32 @@ babel_send_wildcard_request(struct babel_iface *ifa) } static void -babel_send_seqno_request(struct babel_entry *e) +babel_send_seqno_request(struct babel_proto *p, struct babel_entry *e, struct babel_seqno_request *sr) { - struct babel_proto *p = 
e->proto; - struct babel_route *r = e->selected_in; - struct babel_iface *ifa = NULL; - struct babel_source *s = NULL; union babel_msg msg = {}; - s = babel_find_source(e, r->router_id); - if (!s || !babel_cache_seqno_request(p, e->n.prefix, e->n.pxlen, r->router_id, s->seqno + 1)) - return; - - TRACE(D_PACKETS, "Sending seqno request for %I/%d router-id %lR seqno %d", - e->n.prefix, e->n.pxlen, r->router_id, s->seqno + 1); - msg.type = BABEL_TLV_SEQNO_REQUEST; - msg.seqno_request.plen = e->n.pxlen; - msg.seqno_request.seqno = s->seqno + 1; - msg.seqno_request.hop_count = BABEL_INITIAL_HOP_COUNT; - msg.seqno_request.router_id = r->router_id; - msg.seqno_request.prefix = e->n.prefix; - - WALK_LIST(ifa, p->interfaces) - babel_enqueue(&msg, ifa); -} + msg.seqno_request.hop_count = sr->hop_count ?: BABEL_INITIAL_HOP_COUNT; + msg.seqno_request.seqno = sr->seqno; + msg.seqno_request.router_id = sr->router_id; + net_copy(&msg.seqno_request.net, e->n.addr); -static void -babel_unicast_seqno_request(struct babel_route *r) -{ - struct babel_entry *e = r->e; - struct babel_proto *p = e->proto; - struct babel_iface *ifa = r->neigh->ifa; - struct babel_source *s = NULL; - union babel_msg msg = {}; - - s = babel_find_source(e, r->router_id); - if (!s || !babel_cache_seqno_request(p, e->n.prefix, e->n.pxlen, r->router_id, s->seqno + 1)) - return; - - TRACE(D_PACKETS, "Sending seqno request for %I/%d router-id %lR seqno %d", - e->n.prefix, e->n.pxlen, r->router_id, s->seqno + 1); + if (sr->nbr) + { + TRACE(D_PACKETS, "Sending seqno request for %N router-id %lR seqno %d to %I on %s", + e->n.addr, sr->router_id, sr->seqno, sr->nbr->addr, sr->nbr->ifa->ifname); - msg.type = BABEL_TLV_SEQNO_REQUEST; - msg.seqno_request.plen = e->n.pxlen; - msg.seqno_request.seqno = s->seqno + 1; - msg.seqno_request.hop_count = BABEL_INITIAL_HOP_COUNT; - msg.seqno_request.router_id = r->router_id; - msg.seqno_request.prefix = e->n.prefix; + babel_send_unicast(&msg, sr->nbr->ifa, sr->nbr->addr); + } + else + { + TRACE(D_PACKETS, "Sending broadcast seqno request for %N router-id %lR seqno %d", + e->n.addr, sr->router_id, sr->seqno); - babel_send_unicast(&msg, ifa, r->neigh->addr); + struct babel_iface *ifa; + WALK_LIST(ifa, p->interfaces) + babel_enqueue(&msg, ifa); + } } /** @@ -752,49 +910,55 @@ babel_unicast_seqno_request(struct babel_route *r) * transmitted entry is updated. */ static void -babel_send_update(struct babel_iface *ifa, bird_clock_t changed) +babel_send_update_(struct babel_iface *ifa, btime changed, struct fib *rtable) { struct babel_proto *p = ifa->proto; - FIB_WALK(&p->rtable, n) + /* Update increase was requested */ + if (p->update_seqno_inc) { - struct babel_entry *e = (void *) n; - struct babel_route *r = e->selected_out; + p->update_seqno++; + p->update_seqno_inc = 0; + } - if (!r) + FIB_WALK(rtable, struct babel_entry, e) + { + if (!e->valid) continue; /* Our own seqno might have changed, in which case we update the routes we originate. 
*/ - if ((r->router_id == p->router_id) && (r->seqno < p->update_seqno)) + if ((e->router_id == p->router_id) && (e->seqno < p->update_seqno)) { - r->seqno = p->update_seqno; - e->updated = now; + e->seqno = p->update_seqno; + e->updated = current_time(); } /* Skip routes that weren't updated since 'changed' time */ if (e->updated < changed) continue; - TRACE(D_PACKETS, "Sending update for %I/%d router-id %lR seqno %d metric %d", - e->n.prefix, e->n.pxlen, r->router_id, r->seqno, r->metric); + TRACE(D_PACKETS, "Sending update for %N router-id %lR seqno %d metric %d", + e->n.addr, e->router_id, e->seqno, e->metric); union babel_msg msg = {}; msg.type = BABEL_TLV_UPDATE; - msg.update.plen = e->n.pxlen; msg.update.interval = ifa->cf->update_interval; - msg.update.seqno = r->seqno; - msg.update.metric = r->metric; - msg.update.prefix = e->n.prefix; - msg.update.router_id = r->router_id; + msg.update.seqno = e->seqno; + msg.update.metric = e->metric; + msg.update.router_id = e->router_id; + net_copy(&msg.update.net, e->n.addr); + + msg.update.next_hop = ((e->n.addr->type == NET_IP4) ? + ifa->next_hop_ip4 : ifa->next_hop_ip6); babel_enqueue(&msg, ifa); /* Update feasibility distance for redistributed routes */ - if (!OUR_ROUTE(r)) + if (e->router_id != p->router_id) { - struct babel_source *s = babel_get_source(e, r->router_id); - s->expires = now + BABEL_GARBAGE_INTERVAL; + struct babel_source *s = babel_get_source(p, e, e->router_id); + s->expires = current_time() + BABEL_GARBAGE_INTERVAL; if ((msg.update.seqno > s->seqno) || ((msg.update.seqno == s->seqno) && (msg.update.metric < s->metric))) @@ -808,6 +972,15 @@ babel_send_update(struct babel_iface *ifa, bird_clock_t changed) } static void +babel_send_update(struct babel_iface *ifa, btime changed) +{ + struct babel_proto *p = ifa->proto; + + babel_send_update_(ifa, changed, &p->ip4_rtable); + babel_send_update_(ifa, changed, &p->ip6_rtable); +} + +static void babel_trigger_iface_update(struct babel_iface *ifa) { struct babel_proto *p = ifa->proto; @@ -819,7 +992,7 @@ babel_trigger_iface_update(struct babel_iface *ifa) TRACE(D_EVENTS, "Scheduling triggered updates for %s seqno %d", ifa->iface->name, p->update_seqno); - ifa->want_triggered = now; + ifa->want_triggered = current_time(); babel_iface_kick_timer(ifa); } @@ -839,20 +1012,18 @@ babel_trigger_update(struct babel_proto *p) /* A retraction is an update with an infinite metric */ static void -babel_send_retraction(struct babel_iface *ifa, ip_addr prefix, int plen) +babel_send_retraction(struct babel_iface *ifa, net_addr *n) { struct babel_proto *p = ifa->proto; union babel_msg msg = {}; - TRACE(D_PACKETS, "Sending retraction for %I/%d seqno %d", - prefix, plen, p->update_seqno); + TRACE(D_PACKETS, "Sending retraction for %N seqno %d", n, p->update_seqno); msg.type = BABEL_TLV_UPDATE; - msg.update.plen = plen; msg.update.interval = ifa->cf->update_interval; msg.update.seqno = p->update_seqno; msg.update.metric = BABEL_INFINITY; - msg.update.prefix = prefix; + msg.update.net = *n; babel_enqueue(&msg, ifa); } @@ -881,7 +1052,7 @@ babel_send_wildcard_retraction(struct babel_iface *ifa) /* Update hello history according to Appendix A1 of the RFC */ static void -babel_update_hello_history(struct babel_neighbor *n, u16 seqno, u16 interval) +babel_update_hello_history(struct babel_neighbor *n, u16 seqno, uint interval) { /* * Compute the difference between expected and received seqno (modulo 2^16). 
@@ -892,7 +1063,7 @@ babel_update_hello_history(struct babel_neighbor *n, u16 seqno, u16 interval) u16 delta = ((uint) seqno - (uint) n->next_hello_seqno); - if (delta == 0) + if ((delta == 0) || (n->hello_cnt == 0)) { /* Do nothing */ } @@ -919,84 +1090,10 @@ babel_update_hello_history(struct babel_neighbor *n, u16 seqno, u16 interval) n->hello_map = (n->hello_map << 1) | 1; n->next_hello_seqno = seqno+1; if (n->hello_cnt < 16) n->hello_cnt++; - n->hello_expiry = now + BABEL_HELLO_EXPIRY_FACTOR(interval); -} - -static void -babel_expire_seqno_requests(struct babel_proto *p) -{ - struct babel_seqno_request *n, *nx; - WALK_LIST_DELSAFE(n, nx, p->seqno_cache) - { - if ((n->updated + BABEL_SEQNO_REQUEST_EXPIRY) <= now) - { - rem_node(NODE n); - sl_free(p->seqno_slab, n); - } - } -} - -/* - * Checks the seqno request cache for a matching request and returns failure if - * found. Otherwise, a new entry is stored in the cache. - */ -static int -babel_cache_seqno_request(struct babel_proto *p, ip_addr prefix, u8 plen, - u64 router_id, u16 seqno) -{ - struct babel_seqno_request *r; - - WALK_LIST(r, p->seqno_cache) - { - if (ipa_equal(r->prefix, prefix) && (r->plen == plen) && - (r->router_id == router_id) && (r->seqno == seqno)) - return 0; - } - - /* no entries found */ - r = sl_alloc(p->seqno_slab); - r->prefix = prefix; - r->plen = plen; - r->router_id = router_id; - r->seqno = seqno; - r->updated = now; - add_tail(&p->seqno_cache, NODE r); - - return 1; -} - -static void -babel_forward_seqno_request(struct babel_entry *e, - struct babel_msg_seqno_request *in, - ip_addr sender) -{ - struct babel_proto *p = e->proto; - struct babel_route *r; - - TRACE(D_PACKETS, "Forwarding seqno request for %I/%d router-id %lR seqno %d", - e->n.prefix, e->n.pxlen, in->router_id, in->seqno); - - WALK_LIST(r, e->routes) - { - if ((r->router_id == in->router_id) && - !OUR_ROUTE(r) && - !ipa_equal(r->neigh->addr, sender)) - { - if (!babel_cache_seqno_request(p, e->n.prefix, e->n.pxlen, in->router_id, in->seqno)) - return; - union babel_msg msg = {}; - msg.type = BABEL_TLV_SEQNO_REQUEST; - msg.seqno_request.plen = in->plen; - msg.seqno_request.seqno = in->seqno; - msg.seqno_request.hop_count = in->hop_count-1; - msg.seqno_request.router_id = in->router_id; - msg.seqno_request.prefix = e->n.prefix; - - babel_send_unicast(&msg, r->neigh->ifa, r->neigh->addr); - return; - } - } + /* Update expiration */ + n->hello_expiry = current_time() + BABEL_HELLO_EXPIRY_FACTOR(interval); + n->last_hello_int = interval; } @@ -1010,8 +1107,8 @@ babel_handle_ack_req(union babel_msg *m, struct babel_iface *ifa) struct babel_proto *p = ifa->proto; struct babel_msg_ack_req *msg = &m->ack_req; - TRACE(D_PACKETS, "Handling ACK request nonce %d interval %d", - msg->nonce, msg->interval); + TRACE(D_PACKETS, "Handling ACK request nonce %d interval %t", + msg->nonce, (btime) msg->interval); babel_send_ack(ifa, msg->sender, msg->nonce); } @@ -1022,12 +1119,17 @@ babel_handle_hello(union babel_msg *m, struct babel_iface *ifa) struct babel_proto *p = ifa->proto; struct babel_msg_hello *msg = &m->hello; - TRACE(D_PACKETS, "Handling hello seqno %d interval %d", - msg->seqno, msg->interval); + TRACE(D_PACKETS, "Handling hello seqno %d interval %t", + msg->seqno, (btime) msg->interval); struct babel_neighbor *n = babel_get_neighbor(ifa, msg->sender); + int first_hello = !n->hello_cnt; + babel_update_hello_history(n, msg->seqno, msg->interval); - if (ifa->cf->type == BABEL_IFACE_TYPE_WIRELESS) + babel_update_cost(n); + + /* Speed up session 
establishment by sending IHU immediately */ + if (first_hello) babel_send_ihu(ifa, n); } @@ -1041,12 +1143,13 @@ babel_handle_ihu(union babel_msg *m, struct babel_iface *ifa) if ((msg->ae != BABEL_AE_WILDCARD) && !ipa_equal(msg->addr, ifa->addr)) return; - TRACE(D_PACKETS, "Handling IHU rxcost %d interval %d", - msg->rxcost, msg->interval); + TRACE(D_PACKETS, "Handling IHU rxcost %d interval %t", + msg->rxcost, (btime) msg->interval); struct babel_neighbor *n = babel_get_neighbor(ifa, msg->sender); n->txcost = msg->rxcost; - n->ihu_expiry = now + BABEL_IHU_EXPIRY_FACTOR(msg->interval); + n->ihu_expiry = current_time() + BABEL_IHU_EXPIRY_FACTOR(msg->interval); + babel_update_cost(n); } /** @@ -1069,12 +1172,15 @@ babel_handle_update(union babel_msg *m, struct babel_iface *ifa) struct babel_neighbor *nbr; struct babel_entry *e; struct babel_source *s; - struct babel_route *r; + struct babel_route *r, *best; node *n; - int feasible; + int feasible, metric; - TRACE(D_PACKETS, "Handling update for %I/%d with seqno %d metric %d", - msg->prefix, msg->plen, msg->seqno, msg->metric); + if (msg->wildcard) + TRACE(D_PACKETS, "Handling wildcard retraction", msg->seqno); + else + TRACE(D_PACKETS, "Handling update for %N with seqno %d metric %d", + &msg->net, msg->seqno, msg->metric); nbr = babel_find_neighbor(ifa, msg->sender); if (!nbr) @@ -1089,38 +1195,12 @@ babel_handle_update(union babel_msg *m, struct babel_iface *ifa) return; } - /* - * RFC section 3.5.4: - * - * When a Babel node receives an update (id, prefix, seqno, metric) from a - * neighbour neigh with a link cost value equal to cost, it checks whether it - * already has a routing table entry indexed by (neigh, id, prefix). - * - * If no such entry exists: - * - * o if the update is unfeasible, it is ignored; - * - * o if the metric is infinite (the update is a retraction), the update is - * ignored; - * - * o otherwise, a new route table entry is created, indexed by (neigh, id, - * prefix), with seqno equal to seqno and an advertised metric equal to the - * metric carried by the update. - * - * If such an entry exists: - * - * o if the entry is currently installed and the update is unfeasible, then - * the behaviour depends on whether the router-ids of the two entries match. - * If the router-ids are different, the update is treated as though it were - * a retraction (i.e., as though the metric were FFFF hexadecimal). If the - * router-ids are equal, the update is ignored; - * - * o otherwise (i.e., if either the update is feasible or the entry is not - * currently installed), then the entry's sequence number, advertised - * metric, metric, and router-id are updated and, unless the advertised - * metric is infinite, the route's expiry timer is reset to a small multiple - * of the Interval value included in the update. - */ + struct channel *c = (msg->net.type == NET_IP4) ? 
p->ip4_channel : p->ip6_channel; + if (!c || (c->channel_state != CS_UP)) + { + DBG("Babel: Ignoring update for inactive address family.\n"); + return; + } /* Retraction */ if (msg->metric == BABEL_INFINITY) @@ -1134,13 +1214,12 @@ babel_handle_update(union babel_msg *m, struct babel_iface *ifa) WALK_LIST(n, nbr->routes) { r = SKIP_BACK(struct babel_route, neigh_route, n); - r->metric = BABEL_INFINITY; - babel_select_route(r->e); + babel_retract_route(p, r); } } else { - e = babel_find_entry(p, msg->prefix, msg->plen); + e = babel_find_entry(p, &msg->net); if (!e) return; @@ -1151,68 +1230,56 @@ babel_handle_update(union babel_msg *m, struct babel_iface *ifa) if (!r) return; - r->metric = BABEL_INFINITY; - babel_select_route(e); + /* Router-id, next-hop and seqno are ignored for retractions */ + babel_retract_route(p, r); } /* Done with retractions */ return; } - e = babel_get_entry(p, msg->prefix, msg->plen); - r = babel_find_route(e, nbr); /* the route entry indexed by neighbour */ + /* Regular update */ + e = babel_get_entry(p, &msg->net); + r = babel_get_route(p, e, nbr); /* the route entry indexed by neighbour */ s = babel_find_source(e, msg->router_id); /* for feasibility */ feasible = babel_is_feasible(s, msg->seqno, msg->metric); + metric = babel_compute_metric(nbr, msg->metric); + best = e->selected; - if (!r) - { - if (!feasible) - return; + /* RFC section 3.8.2.2 - Dealing with unfeasible updates */ + if (!feasible && (metric != BABEL_INFINITY) && + (!best || (r == best) || (metric < best->metric))) + babel_add_seqno_request(p, e, s->router_id, s->seqno + 1, 0, nbr); - r = babel_get_route(e, nbr); - r->advert_metric = msg->metric; - r->router_id = msg->router_id; - r->metric = babel_compute_metric(nbr, msg->metric); - r->next_hop = msg->next_hop; - r->seqno = msg->seqno; - } - else if (r == r->e->selected_in && !feasible) - { - /* - * Route is installed and update is infeasible - we may lose the route, - * so send a unicast seqno request (section 3.8.2.2 second paragraph). 
- */ - babel_unicast_seqno_request(r); + /* Special case - ignore unfeasible update to best route */ + if (r == best && !feasible && (msg->router_id == r->router_id)) + return; - if (msg->router_id == r->router_id) - return; + r->expires = current_time() + BABEL_ROUTE_EXPIRY_FACTOR(msg->interval); + r->refresh_time = current_time() + BABEL_ROUTE_REFRESH_FACTOR(msg->interval); - /* Treat as retraction */ - r->metric = BABEL_INFINITY; - } - else + /* No further processing if there is no change */ + if ((r->feasible == feasible) && (r->seqno == msg->seqno) && + (r->metric == metric) && (r->advert_metric == msg->metric) && + (r->router_id == msg->router_id) && ipa_equal(r->next_hop, msg->next_hop)) + return; + + /* Last paragraph above - update the entry */ + r->feasible = feasible; + r->seqno = msg->seqno; + r->metric = metric; + r->advert_metric = msg->metric; + r->router_id = msg->router_id; + r->next_hop = msg->next_hop; + + /* If received update satisfies seqno request, we send triggered updates */ + if (babel_satisfy_seqno_request(p, e, msg->router_id, msg->seqno)) { - /* Last paragraph above - update the entry */ - r->advert_metric = msg->metric; - r->metric = babel_compute_metric(nbr, msg->metric); - r->next_hop = msg->next_hop; - - r->router_id = msg->router_id; - r->seqno = msg->seqno; - - r->expiry_interval = BABEL_ROUTE_EXPIRY_FACTOR(msg->interval); - r->expires = now + r->expiry_interval; - if (r->expiry_interval > BABEL_ROUTE_REFRESH_INTERVAL) - r->refresh_time = now + r->expiry_interval - BABEL_ROUTE_REFRESH_INTERVAL; - - /* If the route is not feasible at this point, it means it is from another - neighbour than the one currently selected; so send a unicast seqno - request to try to get a better route (section 3.8.2.2 last paragraph). */ - if (!feasible) - babel_unicast_seqno_request(r); + babel_trigger_update(p); + e->updated = current_time(); } - babel_select_route(e); + babel_select_route(p, e, r); } void @@ -1231,23 +1298,22 @@ babel_handle_route_request(union babel_msg *m, struct babel_iface *ifa) return; } - TRACE(D_PACKETS, "Handling route request for %I/%d", msg->prefix, msg->plen); + TRACE(D_PACKETS, "Handling route request for %N", &msg->net); /* Non-wildcard request - see if we have an entry for the route. If not, send a retraction, otherwise send an update. 
*/ - struct babel_entry *e = babel_find_entry(p, msg->prefix, msg->plen); + struct babel_entry *e = babel_find_entry(p, &msg->net); if (!e) { - babel_send_retraction(ifa, msg->prefix, msg->plen); + babel_send_retraction(ifa, &msg->net); } else { babel_trigger_iface_update(ifa); - e->updated = now; + e->updated = current_time(); } } - void babel_handle_seqno_request(union babel_msg *m, struct babel_iface *ifa) { @@ -1256,36 +1322,54 @@ babel_handle_seqno_request(union babel_msg *m, struct babel_iface *ifa) /* RFC 6126 3.8.1.2 */ - TRACE(D_PACKETS, "Handling seqno request for %I/%d router-id %lR seqno %d hop count %d", - msg->prefix, msg->plen, msg->router_id, msg->seqno, msg->hop_count); + TRACE(D_PACKETS, "Handling seqno request for %N router-id %lR seqno %d hop count %d", + &msg->net, msg->router_id, msg->seqno, msg->hop_count); /* Ignore if we have no such entry or entry has infinite metric */ - struct babel_entry *e = babel_find_entry(p, msg->prefix, msg->plen); - if (!e || !e->selected_out || (e->selected_out->metric == BABEL_INFINITY)) + struct babel_entry *e = babel_find_entry(p, &msg->net); + if (!e || !e->valid || (e->metric == BABEL_INFINITY)) return; /* Trigger update on incoming interface if we have a selected route with different router id or seqno no smaller than requested */ - struct babel_route *r = e->selected_out; - if ((r->router_id != msg->router_id) || ge_mod64k(r->seqno, msg->seqno)) + if ((e->router_id != msg->router_id) || ge_mod64k(e->seqno, msg->seqno)) { babel_trigger_iface_update(ifa); - e->updated = now; + e->updated = current_time(); return; } /* Seqno is larger; check if we own the router id */ if (msg->router_id == p->router_id) { - /* Ours; update seqno and trigger global update */ - p->update_seqno++; + /* Ours; seqno increase and trigger global update */ + p->update_seqno_inc = 1; babel_trigger_update(p); } - else + else if (msg->hop_count > 1) { /* Not ours; forward if TTL allows it */ - if (msg->hop_count > 1) - babel_forward_seqno_request(e, msg, msg->sender); + + /* Find best admissible route */ + struct babel_route *r, *best1 = NULL, *best2 = NULL; + WALK_LIST(r, e->routes) + if ((r->router_id == msg->router_id) && !ipa_equal(r->neigh->addr, msg->sender)) + { + /* Find best feasible route */ + if ((!best1 || r->metric < best1->metric) && r->feasible) + best1 = r; + + /* Find best not necessary feasible route */ + if (!best2 || r->metric < best2->metric) + best2 = r; + } + + /* If no route is found, do nothing */ + r = best1 ?: best2; + if (!r) + return; + + babel_add_seqno_request(p, e, msg->router_id, msg->seqno, msg->hop_count-1, r->neigh); } } @@ -1320,42 +1404,43 @@ babel_iface_timer(timer *t) { struct babel_iface *ifa = t->data; struct babel_proto *p = ifa->proto; - bird_clock_t hello_period = ifa->cf->hello_interval; - bird_clock_t update_period = ifa->cf->update_interval; + btime hello_period = ifa->cf->hello_interval; + btime update_period = ifa->cf->update_interval; + btime now_ = current_time(); - if (now >= ifa->next_hello) + if (now_ >= ifa->next_hello) { - babel_send_hello(ifa, (ifa->cf->type == BABEL_IFACE_TYPE_WIRELESS || - ifa->hello_seqno % BABEL_IHU_INTERVAL_FACTOR == 0)); - ifa->next_hello += hello_period * (1 + (now - ifa->next_hello) / hello_period); + babel_send_hello(ifa); + ifa->next_hello += hello_period * (1 + (now_ - ifa->next_hello) / hello_period); } - if (now >= ifa->next_regular) + if (now_ >= ifa->next_regular) { TRACE(D_EVENTS, "Sending regular updates on %s", ifa->ifname); babel_send_update(ifa, 0); - 
ifa->next_regular += update_period * (1 + (now - ifa->next_regular) / update_period); + ifa->next_regular += update_period * (1 + (now_ - ifa->next_regular) / update_period); ifa->want_triggered = 0; p->triggered = 0; } - else if (ifa->want_triggered && (now >= ifa->next_triggered)) + else if (ifa->want_triggered && (now_ >= ifa->next_triggered)) { TRACE(D_EVENTS, "Sending triggered updates on %s", ifa->ifname); babel_send_update(ifa, ifa->want_triggered); - ifa->next_triggered = now + MIN(5, update_period / 2 + 1); + ifa->next_triggered = now_ + MIN(1 S, update_period / 2); ifa->want_triggered = 0; p->triggered = 0; } - bird_clock_t next_event = MIN(ifa->next_hello, ifa->next_regular); - tm_start(ifa->timer, ifa->want_triggered ? 1 : (next_event - now)); + btime next_event = MIN(ifa->next_hello, ifa->next_regular); + if (ifa->want_triggered) next_event = MIN(next_event, ifa->next_triggered); + tm_set(ifa->timer, next_event); } static inline void babel_iface_kick_timer(struct babel_iface *ifa) { - if (ifa->timer->expires > (now + 1)) - tm_start(ifa->timer, 1); + if (ifa->timer->expires > (current_time() + 100 MS)) + tm_start(ifa->timer, 100 MS); } static void @@ -1365,14 +1450,14 @@ babel_iface_start(struct babel_iface *ifa) TRACE(D_EVENTS, "Starting interface %s", ifa->ifname); - ifa->next_hello = now + (random() % ifa->cf->hello_interval) + 1; - ifa->next_regular = now + (random() % ifa->cf->update_interval) + 1; - ifa->next_triggered = now + MIN(5, ifa->cf->update_interval / 2 + 1); + ifa->next_hello = current_time() + (random() % ifa->cf->hello_interval); + ifa->next_regular = current_time() + (random() % ifa->cf->update_interval); + ifa->next_triggered = current_time() + MIN(1 S, ifa->cf->update_interval / 2); ifa->want_triggered = 0; /* We send an immediate update (below) */ - tm_start(ifa->timer, 1); + tm_start(ifa->timer, 100 MS); ifa->up = 1; - babel_send_hello(ifa, 0); + babel_send_hello(ifa); babel_send_wildcard_retraction(ifa); babel_send_wildcard_request(ifa); babel_send_update(ifa, 0); /* Full update */ @@ -1398,9 +1483,7 @@ babel_iface_stop(struct babel_iface *ifa) WALK_LIST(n, nbr->routes) { r = SKIP_BACK(struct babel_route, neigh_route, n); - r->metric = BABEL_INFINITY; - r->expires = now + r->expiry_interval; - babel_select_route(r->e); + babel_retract_route(p, r); } } @@ -1488,21 +1571,21 @@ babel_add_iface(struct babel_proto *p, struct iface *new, struct babel_iface_con ifa->cf = ic; ifa->pool = pool; ifa->ifname = new->name; + ifa->addr = new->llv6->ip; add_tail(&p->interfaces, NODE ifa); - struct ifa *addr; - WALK_LIST(addr, new->addrs) - if (ipa_is_link_local(addr->ip)) - ifa->addr = addr->ip; + ip_addr addr4 = new->addr4 ? new->addr4->ip : IPA_NONE; + ifa->next_hop_ip4 = ipa_nonzero(ic->next_hop_ip4) ? ic->next_hop_ip4 : addr4; + ifa->next_hop_ip6 = ipa_nonzero(ic->next_hop_ip6) ? 
ic->next_hop_ip6 : ifa->addr; - if (ipa_zero(ifa->addr)) - log(L_WARN "%s: Cannot find link-local addr on %s", p->p.name, new->name); + if (ipa_zero(ifa->next_hop_ip4) && p->ip4_channel) + log(L_WARN "%s: Cannot find IPv4 next hop addr on %s", p->p.name, new->name); init_list(&ifa->neigh_list); ifa->hello_seqno = 1; - ifa->timer = tm_new_set(ifa->pool, babel_iface_timer, ifa, 0, 0); + ifa->timer = tm_new_init(ifa->pool, babel_iface_timer, ifa, 0, 0); init_list(&ifa->msg_queue); ifa->send_event = ev_new(ifa->pool); @@ -1527,7 +1610,7 @@ babel_remove_iface(struct babel_proto *p, struct babel_iface *ifa) struct babel_neighbor *n; WALK_LIST_FIRST(n, ifa->neigh_list) - babel_flush_neighbor(n); + babel_flush_neighbor(p, n); rem_node(NODE ifa); @@ -1545,12 +1628,16 @@ babel_if_notify(struct proto *P, unsigned flags, struct iface *iface) if (flags & IF_CHANGE_UP) { - struct babel_iface_config *ic = (void *) iface_patt_find(&cf->iface_list, iface, iface->addr); + struct babel_iface_config *ic = (void *) iface_patt_find(&cf->iface_list, iface, NULL); /* we only speak multicast */ if (!(iface->flags & IF_MULTICAST)) return; + /* Ignore ifaces without link-local address */ + if (!iface->llv6) + return; + if (ic) babel_add_iface(p, iface, ic); @@ -1590,11 +1677,18 @@ babel_reconfigure_iface(struct babel_proto *p, struct babel_iface *ifa, struct b ifa->cf = new; - if (ifa->next_hello > (now + new->hello_interval)) - ifa->next_hello = now + (random() % new->hello_interval) + 1; + ip_addr addr4 = ifa->iface->addr4 ? ifa->iface->addr4->ip : IPA_NONE; + ifa->next_hop_ip4 = ipa_nonzero(new->next_hop_ip4) ? new->next_hop_ip4 : addr4; + ifa->next_hop_ip6 = ipa_nonzero(new->next_hop_ip6) ? new->next_hop_ip6 : ifa->addr; + + if (ipa_zero(ifa->next_hop_ip4) && p->ip4_channel) + log(L_WARN "%s: Cannot find IPv4 next hop addr on %s", p->p.name, ifa->ifname); - if (ifa->next_regular > (now + new->update_interval)) - ifa->next_regular = now + (random() % new->update_interval) + 1; + if (ifa->next_hello > (current_time() + new->hello_interval)) + ifa->next_hello = current_time() + (random() % new->hello_interval); + + if (ifa->next_regular > (current_time() + new->update_interval)) + ifa->next_regular = current_time() + (random() % new->update_interval); if ((new->tx_length != old->tx_length) || (new->rx_buffer != old->rx_buffer)) babel_iface_update_buffers(ifa); @@ -1615,7 +1709,15 @@ babel_reconfigure_ifaces(struct babel_proto *p, struct babel_config *cf) WALK_LIST(iface, iface_list) { - if (! (iface->flags & IF_UP)) + if (!(iface->flags & IF_UP)) + continue; + + /* Ignore non-multicast ifaces */ + if (!(iface->flags & IF_MULTICAST)) + continue; + + /* Ignore ifaces without link-local address */ + if (!iface->llv6) continue; struct babel_iface *ifa = babel_find_iface(p, iface); @@ -1648,18 +1750,17 @@ babel_reconfigure_ifaces(struct babel_proto *p, struct babel_config *cf) static void babel_dump_source(struct babel_source *s) { - debug("Source router_id %lR seqno %d metric %d expires %d\n", - s->router_id, s->seqno, s->metric, s->expires ? s->expires-now : 0); + debug("Source router_id %lR seqno %d metric %d expires %t\n", + s->router_id, s->seqno, s->metric, + s->expires ? s->expires - current_time() : 0); } static void babel_dump_route(struct babel_route *r) { - debug("Route neigh %I if %s seqno %d metric %d/%d router_id %lR expires %d\n", - r->neigh ? r->neigh->addr : IPA_NONE, - r->neigh ? r->neigh->ifa->ifname : "(none)", - r->seqno, r->advert_metric, r->metric, - r->router_id, r->expires ? 
r->expires-now : 0); + debug("Route neigh %I if %s seqno %d metric %d/%d router_id %lR expires %t\n", + r->neigh->addr, r->neigh->ifa->ifname, r->seqno, r->advert_metric, r->metric, + r->router_id, r->expires ? r->expires - current_time() : 0); } static void @@ -1668,7 +1769,7 @@ babel_dump_entry(struct babel_entry *e) struct babel_source *s; struct babel_route *r; - debug("Babel: Entry %I/%d:\n", e->n.prefix, e->n.pxlen); + debug("Babel: Entry %N:\n", e->n.addr); WALK_LIST(s,e->sources) { debug(" "); babel_dump_source(s); } @@ -1676,8 +1777,7 @@ babel_dump_entry(struct babel_entry *e) WALK_LIST(r,e->routes) { debug(" "); - if (r == e->selected_out) debug("*"); - if (r == e->selected_in) debug("+"); + if (r == e->selected) debug("*"); babel_dump_route(r); } } @@ -1685,10 +1785,10 @@ babel_dump_entry(struct babel_entry *e) static void babel_dump_neighbor(struct babel_neighbor *n) { - debug("Neighbor %I txcost %d hello_map %x next seqno %d expires %d/%d\n", + debug("Neighbor %I txcost %d hello_map %x next seqno %d expires %t/%t\n", n->addr, n->txcost, n->hello_map, n->next_hello_seqno, - n->hello_expiry ? n->hello_expiry - now : 0, - n->ihu_expiry ? n->ihu_expiry - now : 0); + n->hello_expiry ? n->hello_expiry - current_time() : 0, + n->ihu_expiry ? n->ihu_expiry - current_time() : 0); } static void @@ -1696,9 +1796,10 @@ babel_dump_iface(struct babel_iface *ifa) { struct babel_neighbor *n; - debug("Babel: Interface %s addr %I rxcost %d type %d hello seqno %d intervals %d %d\n", + debug("Babel: Interface %s addr %I rxcost %d type %d hello seqno %d intervals %t %t", ifa->ifname, ifa->addr, ifa->cf->rxcost, ifa->cf->type, ifa->hello_seqno, ifa->cf->hello_interval, ifa->cf->update_interval); + debug(" next hop v4 %I next hop v6 %I\n", ifa->next_hop_ip4, ifa->next_hop_ip6); WALK_LIST(n, ifa->neigh_list) { debug(" "); babel_dump_neighbor(n); } @@ -1715,9 +1816,14 @@ babel_dump(struct proto *P) WALK_LIST(ifa, p->interfaces) babel_dump_iface(ifa); - FIB_WALK(&p->rtable, n) + FIB_WALK(&p->ip4_rtable, struct babel_entry, e) { - babel_dump_entry((struct babel_entry *) n); + babel_dump_entry(e); + } + FIB_WALK_END; + FIB_WALK(&p->ip6_rtable, struct babel_entry, e) + { + babel_dump_entry(e); } FIB_WALK_END; } @@ -1765,8 +1871,9 @@ babel_show_interfaces(struct proto *P, char *iff) } cli_msg(-1023, "%s:", p->p.name); - cli_msg(-1023, "%-10s %-6s %7s %6s %6s", - "Interface", "State", "RX cost", "Nbrs", "Timer"); + cli_msg(-1023, "%-10s %-6s %7s %6s %7s %-15s %s", + "Interface", "State", "RX cost", "Nbrs", "Timer", + "Next hop (v4)", "Next hop (v6)"); WALK_LIST(ifa, p->interfaces) { @@ -1777,9 +1884,11 @@ babel_show_interfaces(struct proto *P, char *iff) WALK_LIST(nbr, ifa->neigh_list) nbrs++; - int timer = MIN(ifa->next_regular, ifa->next_hello) - now; - cli_msg(-1023, "%-10s %-6s %7u %6u %6u", - ifa->iface->name, (ifa->up ? "Up" : "Down"), ifa->cf->rxcost, nbrs, MAX(timer, 0)); + btime timer = MIN(ifa->next_regular, ifa->next_hello) - current_time(); + cli_msg(-1023, "%-10s %-6s %7u %6u %7t %-15I %I", + ifa->iface->name, (ifa->up ? 
"Up" : "Down"), + ifa->cf->rxcost, nbrs, MAX(timer, 0), + ifa->next_hop_ip4, ifa->next_hop_ip6); } cli_msg(0, ""); @@ -1801,8 +1910,8 @@ babel_show_neighbors(struct proto *P, char *iff) } cli_msg(-1024, "%s:", p->p.name); - cli_msg(-1024, "%-25s %-10s %6s %6s %10s", - "IP address", "Interface", "Metric", "Routes", "Next hello"); + cli_msg(-1024, "%-25s %-10s %6s %6s %6s %7s", + "IP address", "Interface", "Metric", "Routes", "Hellos", "Expires"); WALK_LIST(ifa, p->interfaces) { @@ -1815,25 +1924,48 @@ babel_show_neighbors(struct proto *P, char *iff) WALK_LIST(r, n->routes) rts++; - int timer = n->hello_expiry - now; - cli_msg(-1024, "%-25I %-10s %6u %6u %10u", - n->addr, ifa->iface->name, n->txcost, rts, MAX(timer, 0)); + uint hellos = u32_popcount(n->hello_map); + btime timer = n->hello_expiry - current_time(); + cli_msg(-1024, "%-25I %-10s %6u %6u %6u %7t", + n->addr, ifa->iface->name, n->cost, rts, hellos, MAX(timer, 0)); } } cli_msg(0, ""); } +static void +babel_show_entries_(struct babel_proto *p UNUSED, struct fib *rtable) +{ + FIB_WALK(rtable, struct babel_entry, e) + { + struct babel_route *r = NULL; + uint rts = 0, srcs = 0; + node *n; + + WALK_LIST(n, e->routes) + rts++; + + WALK_LIST(n, e->sources) + srcs++; + + if (e->valid) + cli_msg(-1025, "%-24N %-23lR %6u %5u %7u %7u", + e->n.addr, e->router_id, e->metric, e->seqno, rts, srcs); + else if (r = e->selected) + cli_msg(-1025, "%-24N %-23lR %6u %5u %7u %7u", + e->n.addr, r->router_id, r->metric, r->seqno, rts, srcs); + else + cli_msg(-1025, "%-24N %-23s %6s %5s %7u %7u", + e->n.addr, "<none>", "-", "-", rts, srcs); + } + FIB_WALK_END; +} + void babel_show_entries(struct proto *P) { struct babel_proto *p = (void *) P; - struct babel_entry *e = NULL; - struct babel_source *s = NULL; - struct babel_route *r = NULL; - - char ipbuf[STD_ADDRESS_P_LENGTH+5]; - char ridbuf[ROUTER_ID_64_LENGTH+1]; if (p->p.proto_state != PS_UP) { @@ -1843,37 +1975,51 @@ babel_show_entries(struct proto *P) } cli_msg(-1025, "%s:", p->p.name); - cli_msg(-1025, "%-29s %-23s %6s %5s %7s %7s", - "Prefix", "Router ID", "Metric", "Seqno", "Expires", "Sources"); - - FIB_WALK(&p->rtable, n) - { - e = (struct babel_entry *) n; - r = e->selected_in ? e->selected_in : e->selected_out; - - int srcs = 0; - WALK_LIST(s, e->sources) - srcs++; + cli_msg(-1025, "%-24s %-23s %6s %5s %7s %7s", + "Prefix", "Router ID", "Metric", "Seqno", "Routes", "Sources"); - bsprintf(ipbuf, "%I/%u", e->n.prefix, e->n.pxlen); + babel_show_entries_(p, &p->ip4_rtable); + babel_show_entries_(p, &p->ip6_rtable); - if (r) - { - if (r->router_id == p->router_id) - bsprintf(ridbuf, "%s", "<self>"); - else - bsprintf(ridbuf, "%lR", r->router_id); + cli_msg(0, ""); +} - int time = r->expires ? r->expires - now : 0; - cli_msg(-1025, "%-29s %-23s %6u %5u %7u %7u", - ipbuf, ridbuf, r->metric, r->seqno, MAX(time, 0), srcs); - } - else +static void +babel_show_routes_(struct babel_proto *p UNUSED, struct fib *rtable) +{ + FIB_WALK(rtable, struct babel_entry, e) + { + struct babel_route *r; + WALK_LIST(r, e->routes) { - cli_msg(-1025, "%-29s %-44s %7u", ipbuf, "<pending>", srcs); + char c = (r == e->selected) ? '*' : (r->feasible ? '+' : ' '); + btime time = r->expires ? 
r->expires - current_time() : 0; + cli_msg(-1025, "%-24N %-25I %-10s %5u %c %5u %7t", + e->n.addr, r->next_hop, r->neigh->ifa->ifname, + r->metric, c, r->seqno, MAX(time, 0)); } } FIB_WALK_END; +} + +void +babel_show_routes(struct proto *P) +{ + struct babel_proto *p = (void *) P; + + if (p->p.proto_state != PS_UP) + { + cli_msg(-1025, "%s: is not up", p->p.name); + cli_msg(0, ""); + return; + } + + cli_msg(-1025, "%s:", p->p.name); + cli_msg(-1025, "%-24s %-25s %-9s %6s F %5s %7s", + "Prefix", "Nexthop", "Interface", "Metric", "Seqno", "Expires"); + + babel_show_routes_(p, &p->ip4_rtable); + babel_show_routes_(p, &p->ip6_rtable); cli_msg(0, ""); } @@ -1897,15 +2043,14 @@ babel_timer(timer *t) struct babel_proto *p = t->data; babel_expire_routes(p); - babel_expire_seqno_requests(p); babel_expire_neighbors(p); } static inline void babel_kick_timer(struct babel_proto *p) { - if (p->timer->expires > (now + 1)) - tm_start(p->timer, 1); + if (p->timer->expires > (current_time() + 100 MS)) + tm_start(p->timer, 100 MS); } @@ -1936,12 +2081,18 @@ babel_prepare_attrs(struct linpool *pool, ea_list *next, uint metric, u64 router static int -babel_import_control(struct proto *P, struct rte **rt, struct ea_list **attrs, struct linpool *pool) +babel_import_control(struct proto *P, struct rte **new, struct ea_list **attrs, struct linpool *pool) { struct babel_proto *p = (void *) P; + rte *rt = *new; + + /* Reject our own unreachable routes */ + if ((rt->attrs->dest == RTD_UNREACHABLE) && (rt->attrs->src->proto == P)) + return -1; + /* Prepare attributes with initial values */ - if ((*rt)->attrs->source != RTS_BABEL) + if (rt->attrs->source != RTS_BABEL) *attrs = babel_prepare_attrs(pool, NULL, 0, p->router_id); return 0; @@ -1964,70 +2115,55 @@ babel_store_tmp_attrs(struct rte *rt, struct ea_list *attrs) * so store it into our data structures. */ static void -babel_rt_notify(struct proto *P, struct rtable *table UNUSED, struct network *net, +babel_rt_notify(struct proto *P, struct channel *c UNUSED, struct network *net, struct rte *new, struct rte *old UNUSED, struct ea_list *attrs UNUSED) { struct babel_proto *p = (void *) P; struct babel_entry *e; - struct babel_route *r; if (new) { /* Update */ - e = babel_get_entry(p, net->n.prefix, net->n.pxlen); + uint internal = (new->attrs->src->proto == P); + uint rt_seqno = internal ? new->u.babel.seqno : p->update_seqno; + uint rt_metric = ea_get_int(attrs, EA_BABEL_METRIC, 0); + uint rt_router_id = internal ? 
new->u.babel.router_id : p->router_id; - if (new->attrs->src->proto != P) + if (rt_metric > BABEL_INFINITY) { - r = babel_get_route(e, NULL); - r->seqno = p->update_seqno; - r->router_id = p->router_id; - r->metric = 0; /* FIXME: should be selectable */ + log(L_WARN "%s: Invalid babel_metric value %u for route %N", + p->p.name, rt_metric, net->n.addr); + rt_metric = BABEL_INFINITY; } - else - r = e->selected_in; - if (r != e->selected_out) + e = babel_get_entry(p, net->n.addr); + + /* Activate triggered updates */ + if ((e->valid |= BABEL_ENTRY_VALID) || + (e->router_id != rt_router_id)) { - e->selected_out = r; - e->updated = now; babel_trigger_update(p); + e->updated = current_time(); } + + e->valid = BABEL_ENTRY_VALID; + e->seqno = rt_seqno; + e->metric = rt_metric; + e->router_id = rt_router_id; } else { /* Withdraw */ - e = babel_find_entry(p, net->n.prefix, net->n.pxlen); - if (!e || !e->selected_out) + e = babel_find_entry(p, net->n.addr); + + if (!e || e->valid != BABEL_ENTRY_VALID) return; - if (OUR_ROUTE(e->selected_out)) - { - /* - * We originate this route, so set its metric to infinity and set an - * expiry time. This causes a retraction to be sent, and later the route - * to be flushed once the hold time has passed. - */ - e->selected_out->metric = BABEL_INFINITY; - e->selected_out->expires = now + BABEL_HOLD_TIME; - e->updated = now; - babel_trigger_update(p); - } - else - { - /* - * This is a route originating from someone else that was lost; presumably - * because an export filter was updated to filter it. This means we can't - * set the metric to infinity (it would be overridden on subsequent - * updates from the peer originating the route), so just clear the - * exported route. - * - * This causes peers to expire the route after a while (like if we just - * shut down), but it's the best we can do in these circumstances; and - * since export filters presumably aren't updated that often this is - * acceptable. 
- */ - e->selected_out = NULL; - } + e->valid = BABEL_ENTRY_STALE; + e->metric = BABEL_INFINITY; + + babel_trigger_update(p); + e->updated = current_time(); } } @@ -2040,17 +2176,21 @@ babel_rte_better(struct rte *new, struct rte *old) static int babel_rte_same(struct rte *new, struct rte *old) { - return ((new->u.babel.router_id == old->u.babel.router_id) && - (new->u.babel.metric == old->u.babel.metric)); + return ((new->u.babel.seqno == old->u.babel.seqno) && + (new->u.babel.metric == old->u.babel.metric) && + (new->u.babel.router_id == old->u.babel.router_id)); } static struct proto * -babel_init(struct proto_config *cfg) +babel_init(struct proto_config *CF) { - struct proto *P = proto_new(cfg, sizeof(struct babel_proto)); + struct proto *P = proto_new(CF); + struct babel_proto *p = (void *) P; + + proto_configure_channel(P, &p->ip4_channel, proto_cf_find_channel(CF, NET_IP4)); + proto_configure_channel(P, &p->ip6_channel, proto_cf_find_channel(CF, NET_IP6)); - P->accept_ra_types = RA_OPTIMAL; P->if_notify = babel_if_notify; P->rt_notify = babel_rt_notify; P->import_control = babel_import_control; @@ -2068,10 +2208,14 @@ babel_start(struct proto *P) struct babel_proto *p = (void *) P; struct babel_config *cf = (void *) P->cf; - fib_init(&p->rtable, P->pool, sizeof(struct babel_entry), 0, babel_init_entry); + fib_init(&p->ip4_rtable, P->pool, NET_IP4, sizeof(struct babel_entry), + OFFSETOF(struct babel_entry, n), 0, babel_init_entry); + fib_init(&p->ip6_rtable, P->pool, NET_IP6, sizeof(struct babel_entry), + OFFSETOF(struct babel_entry, n), 0, babel_init_entry); + init_list(&p->interfaces); - p->timer = tm_new_set(P->pool, babel_timer, p, 0, 1); - tm_start(p->timer, 2); + p->timer = tm_new_init(P->pool, babel_timer, p, 1 S, 0); + tm_start(p->timer, 1 S); p->update_seqno = 1; p->router_id = proto_get_router_id(&cf->c); @@ -2079,7 +2223,6 @@ babel_start(struct proto *P) p->source_slab = sl_new(P->pool, sizeof(struct babel_source)); p->msg_slab = sl_new(P->pool, sizeof(struct babel_msg_node)); p->seqno_slab = sl_new(P->pool, sizeof(struct babel_seqno_request)); - init_list(&p->seqno_cache); p->log_pkt_tbf = (struct tbf){ .rate = 1, .burst = 5 }; @@ -2111,14 +2254,18 @@ babel_shutdown(struct proto *P) } static int -babel_reconfigure(struct proto *P, struct proto_config *c) +babel_reconfigure(struct proto *P, struct proto_config *CF) { struct babel_proto *p = (void *) P; - struct babel_config *new = (void *) c; + struct babel_config *new = (void *) CF; TRACE(D_EVENTS, "Reconfiguring"); - p->p.cf = c; + if (!proto_configure_channel(P, &p->ip4_channel, proto_cf_find_channel(CF, NET_IP4)) || + !proto_configure_channel(P, &p->ip6_channel, proto_cf_find_channel(CF, NET_IP6))) + return 0; + + p->p.cf = CF; babel_reconfigure_ifaces(p, new); babel_trigger_update(p); @@ -2133,6 +2280,8 @@ struct protocol proto_babel = { .template = "babel%d", .attr_class = EAP_BABEL, .preference = DEF_PREF_BABEL, + .channel_mask = NB_IP, + .proto_size = sizeof(struct babel_proto), .config_size = sizeof(struct babel_config), .init = babel_init, .dump = babel_dump, diff --git a/proto/babel/babel.h b/proto/babel/babel.h index 6a95d82f..1128d261 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -2,6 +2,8 @@ * BIRD -- The Babel protocol * * Copyright (c) 2015--2016 Toke Hoiland-Jorgensen + * (c) 2016--2017 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2016--2017 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. 
* @@ -23,10 +25,6 @@ #include "lib/string.h" #include "lib/timer.h" -#ifndef IPV6 -#error "The Babel protocol only speaks IPv6" -#endif - #define EA_BABEL_METRIC EA_CODE(EAP_BABEL, 0) #define EA_BABEL_ROUTER_ID EA_CODE(EAP_BABEL, 1) @@ -36,27 +34,30 @@ #define BABEL_INFINITY 0xFFFF -#define BABEL_HELLO_INTERVAL_WIRED 4 /* Default hello intervals in seconds */ -#define BABEL_HELLO_INTERVAL_WIRELESS 4 +#define BABEL_HELLO_INTERVAL_WIRED (4 S_) /* Default hello intervals in seconds */ +#define BABEL_HELLO_INTERVAL_WIRELESS (4 S_) +#define BABEL_HELLO_LIMIT 12 #define BABEL_UPDATE_INTERVAL_FACTOR 4 #define BABEL_IHU_INTERVAL_FACTOR 3 -#define BABEL_IHU_EXPIRY_FACTOR(X) ((X)*3/2) /* 1.5 */ -#define BABEL_HELLO_EXPIRY_FACTOR(X) ((X)*3/2) /* 1.5 */ -#define BABEL_ROUTE_EXPIRY_FACTOR(X) ((X)*7/2) /* 3.5 */ -#define BABEL_ROUTE_REFRESH_INTERVAL 2 /* Seconds before route expiry to send route request */ -#define BABEL_HOLD_TIME 10 /* Expiry time for our own routes */ +#define BABEL_HOLD_TIME_FACTOR 4 /* How long we keep unreachable route relative to update interval */ +#define BABEL_IHU_EXPIRY_FACTOR(X) ((btime)(X)*7/2) /* 3.5 */ +#define BABEL_HELLO_EXPIRY_FACTOR(X) ((btime)(X)*3/2) /* 1.5 */ +#define BABEL_ROUTE_EXPIRY_FACTOR(X) ((btime)(X)*7/2) /* 3.5 */ +#define BABEL_ROUTE_REFRESH_FACTOR(X) ((btime)(X)*5/2) /* 2.5 */ +#define BABEL_SEQNO_REQUEST_RETRY 4 +#define BABEL_SEQNO_REQUEST_EXPIRY (2 S_) +#define BABEL_GARBAGE_INTERVAL (300 S_) #define BABEL_RXCOST_WIRED 96 #define BABEL_RXCOST_WIRELESS 256 #define BABEL_INITIAL_HOP_COUNT 255 -#define BABEL_MAX_SEND_INTERVAL 5 -#define BABEL_TIME_UNITS 100 /* On-wire times are counted in centiseconds */ -#define BABEL_SEQNO_REQUEST_EXPIRY 60 -#define BABEL_GARBAGE_INTERVAL 300 +#define BABEL_MAX_SEND_INTERVAL 5 /* Unused ? 
*/ /* Max interval that will not overflow when carried as 16-bit centiseconds */ -#define BABEL_MAX_INTERVAL (0xFFFF/BABEL_TIME_UNITS) +#define BABEL_TIME_UNITS 10000 /* On-wire times are counted in centiseconds */ +#define BABEL_MIN_INTERVAL (0x0001 * BABEL_TIME_UNITS) +#define BABEL_MAX_INTERVAL (0xFFFF * BABEL_TIME_UNITS) -#define BABEL_OVERHEAD (SIZE_OF_IP_HEADER+UDP_HEADER_LENGTH) +#define BABEL_OVERHEAD (IP6_HEADER_LENGTH+UDP_HEADER_LENGTH) #define BABEL_MIN_MTU (512 + BABEL_OVERHEAD) @@ -82,6 +83,11 @@ enum babel_tlv_type { BABEL_TLV_MAX }; +enum babel_subtlv_type { + BABEL_SUBTLV_PAD1 = 0, + BABEL_SUBTLV_PADN = 1 +}; + enum babel_iface_type { /* In practice, UNDEF and WIRED give equivalent behaviour */ BABEL_IFACE_TYPE_UNDEF = 0, @@ -101,8 +107,8 @@ enum babel_ae_type { struct babel_config { struct proto_config c; - - list iface_list; /* Patterns configured -- keep it first; see babel_reconfigure why */ + list iface_list; /* List of iface configs (struct babel_iface_config) */ + uint hold_time; /* Time to hold stale entries and unreachable routes */ }; struct babel_iface_config { @@ -110,33 +116,41 @@ struct babel_iface_config { u16 rxcost; u8 type; + u8 limit; /* Minimum number of Hellos to keep link up */ u8 check_link; uint port; - u16 hello_interval; - u16 ihu_interval; - u16 update_interval; + uint hello_interval; /* Hello interval, in us */ + uint ihu_interval; /* IHU interval, in us */ + uint update_interval; /* Update interval, in us */ u16 rx_buffer; /* RX buffer size, 0 for MTU */ u16 tx_length; /* TX packet length limit (including headers), 0 for MTU */ int tx_tos; int tx_priority; + + ip_addr next_hop_ip4; + ip_addr next_hop_ip6; }; struct babel_proto { struct proto p; timer *timer; - struct fib rtable; + struct fib ip4_rtable; + struct fib ip6_rtable; + + struct channel *ip4_channel; + struct channel *ip6_channel; + list interfaces; /* Interfaces we really know about (struct babel_iface) */ u64 router_id; u16 update_seqno; /* To be increased on request */ + u8 update_seqno_inc; /* Request for update_seqno increase */ u8 triggered; /* For triggering global updates */ slab *route_slab; slab *source_slab; slab *msg_slab; - slab *seqno_slab; - list seqno_cache; /* Seqno requests in the cache (struct babel_seqno_request) */ struct tbf log_pkt_tbf; /* TBF for packet messages */ }; @@ -155,16 +169,18 @@ struct babel_iface { char *ifname; sock *sk; ip_addr addr; + ip_addr next_hop_ip4; + ip_addr next_hop_ip6; int tx_length; list neigh_list; /* List of neighbors seen on this iface (struct babel_neighbor) */ list msg_queue; u16 hello_seqno; /* To be increased on each hello */ - bird_clock_t next_hello; - bird_clock_t next_regular; - bird_clock_t next_triggered; - bird_clock_t want_triggered; + btime next_hello; + btime next_regular; + btime next_triggered; + btime want_triggered; timer *timer; event *send_event; @@ -175,13 +191,18 @@ struct babel_neighbor { struct babel_iface *ifa; ip_addr addr; - u16 txcost; + uint uc; /* Reference counter for seqno requests */ + u16 rxcost; /* Sent in last IHU */ + u16 txcost; /* Received in last IHU */ + u16 cost; /* Computed neighbor cost */ + s8 ihu_cnt; /* IHU countdown, 0 to send it */ u8 hello_cnt; u16 hello_map; u16 next_hello_seqno; + uint last_hello_int; /* expiry timers */ - bird_clock_t hello_expiry; - bird_clock_t ihu_expiry; + btime hello_expiry; + btime ihu_expiry; list routes; /* Routes this neighbour has sent us (struct babel_route) */ }; @@ -192,7 +213,7 @@ struct babel_source { u64 router_id; u16 seqno; u16 metric; - 
bird_clock_t expires; + btime expires; }; struct babel_route { @@ -201,38 +222,47 @@ struct babel_route { struct babel_entry *e; struct babel_neighbor *neigh; + u8 feasible; u16 seqno; - u16 advert_metric; u16 metric; + u16 advert_metric; u64 router_id; ip_addr next_hop; - bird_clock_t refresh_time; - bird_clock_t expires; - u16 expiry_interval; + btime refresh_time; + btime expires; }; -struct babel_entry { - struct fib_node n; - struct babel_proto *proto; - struct babel_route *selected_in; - struct babel_route *selected_out; - - bird_clock_t updated; - - list sources; /* Source entries for this prefix (struct babel_source). */ - list routes; /* Routes for this prefix (struct babel_route) */ -}; - -/* Stores forwarded seqno requests for duplicate suppression. */ struct babel_seqno_request { node n; - ip_addr prefix; - u8 plen; u64 router_id; u16 seqno; - bird_clock_t updated; + u8 hop_count; + u8 count; + btime expires; + struct babel_neighbor *nbr; +}; + +struct babel_entry { + struct babel_route *selected; + + list routes; /* Routes for this prefix (struct babel_route) */ + list sources; /* Source entries for this prefix (struct babel_source). */ + list requests; + + u8 valid; /* Entry validity state (BABEL_ENTRY_*) */ + u8 unreachable; /* Unreachable route is announced */ + u16 seqno; /* Outgoing seqno */ + u16 metric; /* Outgoing metric */ + u64 router_id; /* Outgoing router ID */ + btime updated; /* Last change of outgoing rte, for triggered updates */ + + struct fib_node n; }; +#define BABEL_ENTRY_DUMMY 0 /* No outgoing route */ +#define BABEL_ENTRY_VALID 1 /* Valid outgoing route */ +#define BABEL_ENTRY_STALE 2 /* Stale outgoing route, waiting for GC */ + /* * Internal TLV messages @@ -241,7 +271,7 @@ struct babel_seqno_request { struct babel_msg_ack_req { u8 type; u16 nonce; - u16 interval; + uint interval; ip_addr sender; }; @@ -253,7 +283,7 @@ struct babel_msg_ack { struct babel_msg_hello { u8 type; u16 seqno; - u16 interval; + uint interval; ip_addr sender; }; @@ -261,7 +291,7 @@ struct babel_msg_ihu { u8 type; u8 ae; u16 rxcost; - u16 interval; + uint interval; ip_addr addr; ip_addr sender; }; @@ -269,12 +299,11 @@ struct babel_msg_ihu { struct babel_msg_update { u8 type; u8 wildcard; - u8 plen; - u16 interval; + uint interval; u16 seqno; u16 metric; - ip_addr prefix; u64 router_id; + net_addr net; ip_addr next_hop; ip_addr sender; }; @@ -282,17 +311,15 @@ struct babel_msg_update { struct babel_msg_route_request { u8 type; u8 full; - u8 plen; - ip_addr prefix; + net_addr net; }; struct babel_msg_seqno_request { u8 type; - u8 plen; - u16 seqno; u8 hop_count; + u16 seqno; u64 router_id; - ip_addr prefix; + net_addr net; ip_addr sender; }; @@ -326,6 +353,7 @@ void babel_handle_seqno_request(union babel_msg *msg, struct babel_iface *ifa); void babel_show_interfaces(struct proto *P, char *iff); void babel_show_neighbors(struct proto *P, char *iff); void babel_show_entries(struct proto *P); +void babel_show_routes(struct proto *P); /* packets.c */ void babel_enqueue(union babel_msg *msg, struct babel_iface *ifa); diff --git a/proto/babel/config.Y b/proto/babel/config.Y index b6170852..25ce5ba0 100644 --- a/proto/babel/config.Y +++ b/proto/babel/config.Y @@ -2,6 +2,8 @@ * BIRD -- Babel Configuration * * Copyright (c) 2015-2016 Toke Hoiland-Jorgensen + * (c) 2016--2017 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2016--2017 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. 
*/ @@ -21,7 +23,8 @@ CF_DEFINES CF_DECLS CF_KEYWORDS(BABEL, METRIC, RXCOST, HELLO, UPDATE, INTERVAL, PORT, WIRED, -WIRELESS, RX, TX, BUFFER, LENGTH, CHECK, LINK, BABEL_METRIC) + WIRELESS, RX, TX, BUFFER, LENGTH, CHECK, LINK, BABEL_METRIC, NEXT, HOP, + IPV4, IPV6) CF_GRAMMAR @@ -31,10 +34,12 @@ babel_proto_start: proto_start BABEL { this_proto = proto_config_new(&proto_babel, $1); init_list(&BABEL_CFG->iface_list); + BABEL_CFG->hold_time = 1 S_; }; babel_proto_item: proto_item + | proto_channel | INTERFACE babel_iface ; @@ -54,6 +59,7 @@ babel_iface_start: init_list(&this_ipatt->ipn_list); BABEL_IFACE->port = BABEL_PORT; BABEL_IFACE->type = BABEL_IFACE_TYPE_WIRED; + BABEL_IFACE->limit = BABEL_HELLO_LIMIT; BABEL_IFACE->tx_tos = IP_PREC_INTERNET_CONTROL; BABEL_IFACE->tx_priority = sk_priority_control; BABEL_IFACE->check_link = 1; @@ -81,21 +87,26 @@ babel_iface_finish: if (!BABEL_IFACE->update_interval) BABEL_IFACE->update_interval = MIN_(BABEL_IFACE->hello_interval*BABEL_UPDATE_INTERVAL_FACTOR, BABEL_MAX_INTERVAL); BABEL_IFACE->ihu_interval = MIN_(BABEL_IFACE->hello_interval*BABEL_IHU_INTERVAL_FACTOR, BABEL_MAX_INTERVAL); + + BABEL_CFG->hold_time = MAX_(BABEL_CFG->hold_time, BABEL_IFACE->update_interval*BABEL_HOLD_TIME_FACTOR); }; babel_iface_item: | PORT expr { BABEL_IFACE->port = $2; if (($2<1) || ($2>65535)) cf_error("Invalid port number"); } | RXCOST expr { BABEL_IFACE->rxcost = $2; if (($2<1) || ($2>65535)) cf_error("Invalid rxcost"); } - | HELLO INTERVAL expr { BABEL_IFACE->hello_interval = $3; if (($3<1) || ($3>BABEL_MAX_INTERVAL)) cf_error("Invalid hello interval"); } - | UPDATE INTERVAL expr { BABEL_IFACE->update_interval = $3; if (($3<1) || ($3>BABEL_MAX_INTERVAL)) cf_error("Invalid update interval"); } + | LIMIT expr { BABEL_IFACE->limit = $2; if (($2<1) || ($2>16)) cf_error("Limit must be in range 1-16"); } | TYPE WIRED { BABEL_IFACE->type = BABEL_IFACE_TYPE_WIRED; } | TYPE WIRELESS { BABEL_IFACE->type = BABEL_IFACE_TYPE_WIRELESS; } + | HELLO INTERVAL expr_us { BABEL_IFACE->hello_interval = $3; if (($3<BABEL_MIN_INTERVAL) || ($3>BABEL_MAX_INTERVAL)) cf_error("Hello interval must be in range 10 ms - 655 s"); } + | UPDATE INTERVAL expr_us { BABEL_IFACE->update_interval = $3; if (($3<BABEL_MIN_INTERVAL) || ($3>BABEL_MAX_INTERVAL)) cf_error("Update interval must be in range 10 ms - 655 s"); } | RX BUFFER expr { BABEL_IFACE->rx_buffer = $3; if (($3<256) || ($3>65535)) cf_error("RX buffer must be in range 256-65535"); } | TX LENGTH expr { BABEL_IFACE->tx_length = $3; if (($3<256) || ($3>65535)) cf_error("TX length must be in range 256-65535"); } | TX tos { BABEL_IFACE->tx_tos = $2; } | TX PRIORITY expr { BABEL_IFACE->tx_priority = $3; } | CHECK LINK bool { BABEL_IFACE->check_link = $3; } + | NEXT HOP IPV4 ipa { BABEL_IFACE->next_hop_ip4 = $4; if (!ipa_is_ip4($4)) cf_error("Must be an IPv4 address"); } + | NEXT HOP IPV6 ipa { BABEL_IFACE->next_hop_ip6 = $4; if (!ipa_is_ip6($4)) cf_error("Must be an IPv6 address"); } ; babel_iface_opts: @@ -125,6 +136,9 @@ CF_CLI(SHOW BABEL NEIGHBORS, optsym opttext, [<name>] [\"<interface>\"], [[Show CF_CLI(SHOW BABEL ENTRIES, optsym opttext, [<name>], [[Show information about Babel prefix entries]]) { babel_show_entries(proto_get_named($4, &proto_babel)); }; +CF_CLI(SHOW BABEL ROUTES, optsym opttext, [<name>], [[Show information about Babel route entries]]) +{ babel_show_routes(proto_get_named($4, &proto_babel)); }; + CF_CODE CF_END diff --git a/proto/babel/packets.c b/proto/babel/packets.c index 08054832..4abcf7e4 100644 --- a/proto/babel/packets.c 
+++ b/proto/babel/packets.c @@ -2,6 +2,8 @@ * BIRD -- The Babel protocol * * Copyright (c) 2015--2016 Toke Hoiland-Jorgensen + * (c) 2016--2017 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2016--2017 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. * @@ -112,13 +114,15 @@ struct babel_parse_state { struct babel_proto *proto; struct babel_iface *ifa; ip_addr saddr; - ip_addr next_hop; + ip_addr next_hop_ip4; + ip_addr next_hop_ip6; u64 router_id; /* Router ID used in subsequent updates */ u8 def_ip6_prefix[16]; /* Implicit IPv6 prefix in network order */ u8 def_ip4_prefix[4]; /* Implicit IPv4 prefix in network order */ u8 router_id_seen; /* router_id field is valid */ u8 def_ip6_prefix_seen; /* def_ip6_prefix is valid */ u8 def_ip4_prefix_seen; /* def_ip4_prefix is valid */ + u8 current_tlv_endpos; /* End of self-terminating TLVs (offset from start) */ }; enum parse_result { @@ -130,7 +134,10 @@ enum parse_result { struct babel_write_state { u64 router_id; u8 router_id_seen; -// ip_addr next_hop; + ip_addr next_hop_ip4; + ip_addr next_hop_ip6; + u8 def_ip6_prefix[16]; /* Implicit IPv6 prefix in network order */ + u8 def_ip6_pxlen; }; @@ -146,34 +153,58 @@ struct babel_write_state { #define TLV_HDR(tlv,t,l) ({ tlv->type = t; tlv->length = l - sizeof(struct babel_tlv); }) #define TLV_HDR0(tlv,t) TLV_HDR(tlv, t, tlv_data[t].min_length) -#define BYTES(n) ((((uint) n) + 7) / 8) +#define NET_SIZE(n) BYTES(net_pxlen(n)) -static inline u16 +static inline uint +bytes_equal(u8 *b1, u8 *b2, uint maxlen) +{ + uint i; + for (i = 0; (i < maxlen) && (*b1 == *b2); i++, b1++, b2++) + ; + return i; +} + +static inline uint get_time16(const void *p) { - u16 v = get_u16(p) / BABEL_TIME_UNITS; - return MAX(1, v); + uint v = get_u16(p) * BABEL_TIME_UNITS; + return MAX(BABEL_MIN_INTERVAL, v); } static inline void -put_time16(void *p, u16 v) +put_time16(void *p, uint v) { - put_u16(p, v * BABEL_TIME_UNITS); + put_u16(p, v / BABEL_TIME_UNITS); } -static inline ip6_addr -get_ip6_px(const void *p, uint plen) +static inline void +read_ip4_px(net_addr *n, const void *p, uint plen) +{ + ip4_addr addr = {0}; + memcpy(&addr, p, BYTES(plen)); + net_fill_ip4(n, ip4_ntoh(addr), plen); +} + +static inline void +put_ip4_px(void *p, net_addr *n) +{ + ip4_addr addr = ip4_hton(net4_prefix(n)); + memcpy(p, &addr, NET_SIZE(n)); +} + +static inline void +read_ip6_px(net_addr *n, const void *p, uint plen) { ip6_addr addr = IPA_NONE; memcpy(&addr, p, BYTES(plen)); - return ip6_ntoh(addr); + net_fill_ip6(n, ip6_ntoh(addr), plen); } static inline void -put_ip6_px(void *p, ip6_addr addr, uint plen) +put_ip6_px(void *p, net_addr *n) { - addr = ip6_hton(addr); - memcpy(p, &addr, BYTES(plen)); + ip6_addr addr = ip6_hton(net6_prefix(n)); + memcpy(p, &addr, NET_SIZE(n)); } static inline ip6_addr @@ -351,14 +382,33 @@ babel_read_ihu(struct babel_tlv *hdr, union babel_msg *m, if (msg->ae >= BABEL_AE_MAX) return PARSE_IGNORE; - // We handle link-local IPs. In every other case, the addr field will be 0 but - // validation will succeed. The handler takes care of these cases. - if (msg->ae == BABEL_AE_IP6_LL) + /* + * We only actually read link-local IPs. In every other case, the addr field + * will be 0 but validation will succeed. The handler takes care of these + * cases. We handle them here anyway because we need the length for parsing + * subtlvs. 
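A standalone sketch of the interval handling introduced above: on the wire, Babel intervals remain 16-bit centisecond values, while get_time16()/put_time16() now scale them by BABEL_TIME_UNITS into the microsecond-based btime. The typedef and constants below are copied or assumed purely for illustration; this is not the BIRD code itself.

```c
/* Standalone illustration only -- not the BIRD sources. btime is assumed
 * to be a 64-bit count of microseconds, matching the new timer code. */
#include <stdio.h>
#include <stdint.h>

typedef int64_t btime_us;                     /* stand-in for BIRD's btime */

#define BABEL_TIME_UNITS   10000              /* 1 centisecond = 10000 us */
#define BABEL_MIN_INTERVAL (0x0001 * BABEL_TIME_UNITS)

/* Wire format carries u16 centiseconds; internally intervals are btime (us). */
static btime_us wire_to_btime(uint16_t cs)
{
  btime_us v = (btime_us) cs * BABEL_TIME_UNITS;
  return (v > BABEL_MIN_INTERVAL) ? v : BABEL_MIN_INTERVAL;
}

static uint16_t btime_to_wire(btime_us v)
{
  return (uint16_t) (v / BABEL_TIME_UNITS);
}

int main(void)
{
  /* A 4 s hello interval travels as 400 centiseconds. */
  btime_us v = wire_to_btime(400);
  printf("%d cs -> %lld us -> %d cs\n", 400, (long long) v, (int) btime_to_wire(v));
  return 0;
}
```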
+ */ + switch (msg->ae) { + case BABEL_AE_IP4: + if (TLV_OPT_LENGTH(tlv) < 4) + return PARSE_ERROR; + state->current_tlv_endpos += 4; + break; + + case BABEL_AE_IP6: + if (TLV_OPT_LENGTH(tlv) < 16) + return PARSE_ERROR; + state->current_tlv_endpos += 16; + break; + + case BABEL_AE_IP6_LL: if (TLV_OPT_LENGTH(tlv) < 8) return PARSE_ERROR; msg->addr = ipa_from_ip6(get_ip6_ll(&tlv->addr)); + state->current_tlv_endpos += 8; + break; } return PARSE_SUCCESS; @@ -431,21 +481,27 @@ babel_read_next_hop(struct babel_tlv *hdr, union babel_msg *m UNUSED, return PARSE_ERROR; case BABEL_AE_IP4: - /* TODO */ + if (TLV_OPT_LENGTH(tlv) < sizeof(ip4_addr)) + return PARSE_ERROR; + + state->next_hop_ip4 = ipa_from_ip4(get_ip4(&tlv->addr)); + state->current_tlv_endpos += sizeof(ip4_addr); return PARSE_IGNORE; case BABEL_AE_IP6: if (TLV_OPT_LENGTH(tlv) < sizeof(ip6_addr)) return PARSE_ERROR; - state->next_hop = ipa_from_ip6(get_ip6(&tlv->addr)); + state->next_hop_ip6 = ipa_from_ip6(get_ip6(&tlv->addr)); + state->current_tlv_endpos += sizeof(ip6_addr); return PARSE_IGNORE; case BABEL_AE_IP6_LL: if (TLV_OPT_LENGTH(tlv) < 8) return PARSE_ERROR; - state->next_hop = ipa_from_ip6(get_ip6_ll(&tlv->addr)); + state->next_hop_ip6 = ipa_from_ip6(get_ip6_ll(&tlv->addr)); + state->current_tlv_endpos += 8; return PARSE_IGNORE; default: @@ -455,6 +511,51 @@ babel_read_next_hop(struct babel_tlv *hdr, union babel_msg *m UNUSED, return PARSE_IGNORE; } +/* This is called directly from babel_write_update() and returns -1 if a next + hop should be written but there is not enough space. */ +static int +babel_write_next_hop(struct babel_tlv *hdr, ip_addr addr, + struct babel_write_state *state, uint max_len) +{ + struct babel_tlv_next_hop *tlv = (void *) hdr; + + if (ipa_zero(addr)) + { + /* Should not happen */ + return 0; + } + else if (ipa_is_ip4(addr) && !ipa_equal(addr, state->next_hop_ip4)) + { + uint len = sizeof(struct babel_tlv_next_hop) + sizeof(ip4_addr); + if (len > max_len) + return -1; + + TLV_HDR(tlv, BABEL_TLV_NEXT_HOP, len); + + tlv->ae = BABEL_AE_IP4; + put_ip4(&tlv->addr, ipa_to_ip4(addr)); + state->next_hop_ip4 = addr; + + return len; + } + else if (ipa_is_ip6(addr) && !ipa_equal(addr, state->next_hop_ip6)) + { + uint len = sizeof(struct babel_tlv_next_hop) + sizeof(ip6_addr); + if (len > max_len) + return -1; + + TLV_HDR(tlv, BABEL_TLV_NEXT_HOP, len); + + tlv->ae = BABEL_AE_IP6; + put_ip6(&tlv->addr, ipa_to_ip6(addr)); + state->next_hop_ip6 = addr; + + return len; + } + + return 0; +} + static int babel_read_update(struct babel_tlv *hdr, union babel_msg *m, struct babel_parse_state *state) @@ -480,15 +581,43 @@ babel_read_update(struct babel_tlv *hdr, union babel_msg *m, if (tlv->plen > 0) return PARSE_ERROR; + if (msg->metric != 65535) + return PARSE_ERROR; + msg->wildcard = 1; break; case BABEL_AE_IP4: - /* TODO */ - return PARSE_IGNORE; + if (tlv->plen > IP4_MAX_PREFIX_LENGTH) + return PARSE_ERROR; + + /* Cannot omit data if there is no saved prefix */ + if (tlv->omitted && !state->def_ip4_prefix_seen) + return PARSE_ERROR; + + /* Update must have next hop, unless it is retraction */ + if (ipa_zero(state->next_hop_ip4) && (msg->metric != BABEL_INFINITY)) + return PARSE_ERROR; + + /* Merge saved prefix and received prefix parts */ + memcpy(buf, state->def_ip4_prefix, tlv->omitted); + memcpy(buf + tlv->omitted, tlv->addr, len); + + ip4_addr prefix4 = get_ip4(buf); + net_fill_ip4(&msg->net, prefix4, tlv->plen); + + if (tlv->flags & BABEL_FLAG_DEF_PREFIX) + { + put_ip4(state->def_ip4_prefix, prefix4); + 
state->def_ip4_prefix_seen = 1; + } + + msg->next_hop = state->next_hop_ip4; + + break; case BABEL_AE_IP6: - if (tlv->plen > MAX_PREFIX_LENGTH) + if (tlv->plen > IP6_MAX_PREFIX_LENGTH) return PARSE_ERROR; /* Cannot omit data if there is no saved prefix */ @@ -499,20 +628,23 @@ babel_read_update(struct babel_tlv *hdr, union babel_msg *m, memcpy(buf, state->def_ip6_prefix, tlv->omitted); memcpy(buf + tlv->omitted, tlv->addr, len); - msg->plen = tlv->plen; - msg->prefix = ipa_from_ip6(get_ip6(buf)); + ip6_addr prefix6 = get_ip6(buf); + net_fill_ip6(&msg->net, prefix6, tlv->plen); if (tlv->flags & BABEL_FLAG_DEF_PREFIX) { - put_ip6(state->def_ip6_prefix, msg->prefix); + put_ip6(state->def_ip6_prefix, prefix6); state->def_ip6_prefix_seen = 1; } if (tlv->flags & BABEL_FLAG_ROUTER_ID) { - state->router_id = ((u64) _I2(msg->prefix)) << 32 | _I3(msg->prefix); + state->router_id = ((u64) _I2(prefix6)) << 32 | _I3(prefix6); state->router_id_seen = 1; } + + msg->next_hop = state->next_hop_ip6; + break; case BABEL_AE_IP6_LL: @@ -531,8 +663,8 @@ babel_read_update(struct babel_tlv *hdr, union babel_msg *m, } msg->router_id = state->router_id; - msg->next_hop = state->next_hop; msg->sender = state->saddr; + state->current_tlv_endpos += len; return PARSE_SUCCESS; } @@ -541,7 +673,6 @@ static uint babel_write_update(struct babel_tlv *hdr, union babel_msg *m, struct babel_write_state *state, uint max_len) { - struct babel_tlv_update *tlv = (void *) hdr; struct babel_msg_update *msg = &m->update; uint len0 = 0; @@ -550,16 +681,35 @@ babel_write_update(struct babel_tlv *hdr, union babel_msg *m, * both of them. There is enough space for the Router-ID TLV, because * sizeof(struct babel_tlv_router_id) == sizeof(struct babel_tlv_update). * - * Router ID is not used for retractions, so do not us it in such case. + * Router ID is not used for retractions, so do not use it in such case. */ if ((msg->metric < BABEL_INFINITY) && (!state->router_id_seen || (msg->router_id != state->router_id))) { len0 = babel_write_router_id(hdr, msg->router_id, state, max_len); - tlv = (struct babel_tlv_update *) NEXT_TLV(tlv); + hdr = NEXT_TLV(hdr); } - uint len = sizeof(struct babel_tlv_update) + BYTES(msg->plen); + /* + * We also may add Next Hop TLV for regular updates. It may fail for not + * enough space or it may be unnecessary as the next hop is the same as the + * last one already announced. So we handle all three cases. 
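The three possible outcomes described above (written, unnecessary because unchanged, or out of space) can be shown with a minimal standalone toy; the struct and the fixed size below are invented for the example and do not match the real babel_write_state or TLV layout.

```c
/* Standalone toy of the three-way convention; the types and the fixed TLV
 * size are invented for the example and do not match the real structures. */
#include <stdio.h>
#include <string.h>

struct wstate { unsigned last_hop; };          /* stands in for babel_write_state */

/* Returns -1 if there is not enough space, 0 if the next hop is unchanged
 * (nothing to write), or the number of bytes appended. */
static int write_next_hop(unsigned hop, struct wstate *s,
                          unsigned char *buf, unsigned max_len)
{
  const unsigned len = 8;                      /* pretend TLV size */

  if (hop == s->last_hop)
    return 0;                                  /* same as last announced */

  if (len > max_len)
    return -1;                                 /* caller defers to next packet */

  memset(buf, 0, len);                         /* stands in for TLV encoding */
  s->last_hop = hop;
  return len;
}

int main(void)
{
  unsigned char buf[64];
  struct wstate s = { .last_hop = 0 };

  int a = write_next_hop(1, &s, buf, sizeof(buf)); /* 8: written            */
  int b = write_next_hop(1, &s, buf, sizeof(buf)); /* 0: unchanged, skipped */
  int c = write_next_hop(2, &s, buf, 4);           /* -1: not enough room   */

  printf("%d %d %d\n", a, b, c);
  return 0;
}
```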
+ */ + if (msg->metric < BABEL_INFINITY) + { + int l = babel_write_next_hop(hdr, msg->next_hop, state, max_len - len0); + if (l < 0) + return 0; + + if (l) + { + len0 += l; + hdr = NEXT_TLV(hdr); + } + } + + struct babel_tlv_update *tlv = (void *) hdr; + uint len = sizeof(struct babel_tlv_update) + NET_SIZE(&msg->net); if (len0 + len > max_len) return 0; @@ -572,11 +722,39 @@ babel_write_update(struct babel_tlv *hdr, union babel_msg *m, tlv->ae = BABEL_AE_WILDCARD; tlv->plen = 0; } + else if (msg->net.type == NET_IP4) + { + tlv->ae = BABEL_AE_IP4; + tlv->plen = net4_pxlen(&msg->net); + put_ip4_px(tlv->addr, &msg->net); + } else { tlv->ae = BABEL_AE_IP6; - tlv->plen = msg->plen; - put_ip6_px(tlv->addr, msg->prefix, msg->plen); + tlv->plen = net6_pxlen(&msg->net); + + /* Address compression - omit initial matching bytes */ + u8 buf[16], omit; + put_ip6(buf, net6_prefix(&msg->net)); + omit = bytes_equal(buf, state->def_ip6_prefix, + MIN(tlv->plen, state->def_ip6_pxlen) / 8); + + if (omit > 0) + { + memcpy(tlv->addr, buf + omit, NET_SIZE(&msg->net) - omit); + + tlv->omitted = omit; + tlv->length -= omit; + len -= omit; + } + else + { + put_ip6_px(tlv->addr, &msg->net); + tlv->flags |= BABEL_FLAG_DEF_PREFIX; + + put_ip6(state->def_ip6_prefix, net6_prefix(&msg->net)); + state->def_ip6_pxlen = tlv->plen; + } } put_time16(&tlv->interval, msg->interval); @@ -606,18 +784,25 @@ babel_read_route_request(struct babel_tlv *hdr, union babel_msg *m, return PARSE_SUCCESS; case BABEL_AE_IP4: - /* TODO */ - return PARSE_IGNORE; + if (tlv->plen > IP4_MAX_PREFIX_LENGTH) + return PARSE_ERROR; + + if (TLV_OPT_LENGTH(tlv) < BYTES(tlv->plen)) + return PARSE_ERROR; + + read_ip4_px(&msg->net, tlv->addr, tlv->plen); + state->current_tlv_endpos += BYTES(tlv->plen); + return PARSE_SUCCESS; case BABEL_AE_IP6: - if (tlv->plen > MAX_PREFIX_LENGTH) + if (tlv->plen > IP6_MAX_PREFIX_LENGTH) return PARSE_ERROR; if (TLV_OPT_LENGTH(tlv) < BYTES(tlv->plen)) return PARSE_ERROR; - msg->plen = tlv->plen; - msg->prefix = get_ip6_px(tlv->addr, tlv->plen); + read_ip6_px(&msg->net, tlv->addr, tlv->plen); + state->current_tlv_endpos += BYTES(tlv->plen); return PARSE_SUCCESS; case BABEL_AE_IP6_LL: @@ -637,7 +822,7 @@ babel_write_route_request(struct babel_tlv *hdr, union babel_msg *m, struct babel_tlv_route_request *tlv = (void *) hdr; struct babel_msg_route_request *msg = &m->route_request; - uint len = sizeof(struct babel_tlv_route_request) + BYTES(msg->plen); + uint len = sizeof(struct babel_tlv_route_request) + NET_SIZE(&msg->net); if (len > max_len) return 0; @@ -649,11 +834,17 @@ babel_write_route_request(struct babel_tlv *hdr, union babel_msg *m, tlv->ae = BABEL_AE_WILDCARD; tlv->plen = 0; } + else if (msg->net.type == NET_IP4) + { + tlv->ae = BABEL_AE_IP4; + tlv->plen = net4_pxlen(&msg->net); + put_ip4_px(tlv->addr, &msg->net); + } else { tlv->ae = BABEL_AE_IP6; - tlv->plen = msg->plen; - put_ip6_px(tlv->addr, msg->prefix, msg->plen); + tlv->plen = net6_pxlen(&msg->net); + put_ip6_px(tlv->addr, &msg->net); } return len; @@ -681,18 +872,25 @@ babel_read_seqno_request(struct babel_tlv *hdr, union babel_msg *m, return PARSE_ERROR; case BABEL_AE_IP4: - /* TODO */ - return PARSE_IGNORE; + if (tlv->plen > IP4_MAX_PREFIX_LENGTH) + return PARSE_ERROR; + + if (TLV_OPT_LENGTH(tlv) < BYTES(tlv->plen)) + return PARSE_ERROR; + + read_ip4_px(&msg->net, tlv->addr, tlv->plen); + state->current_tlv_endpos += BYTES(tlv->plen); + return PARSE_SUCCESS; case BABEL_AE_IP6: - if (tlv->plen > MAX_PREFIX_LENGTH) + if (tlv->plen > IP6_MAX_PREFIX_LENGTH) 
return PARSE_ERROR; if (TLV_OPT_LENGTH(tlv) < BYTES(tlv->plen)) return PARSE_ERROR; - msg->plen = tlv->plen; - msg->prefix = get_ip6_px(tlv->addr, tlv->plen); + read_ip6_px(&msg->net, tlv->addr, tlv->plen); + state->current_tlv_endpos += BYTES(tlv->plen); return PARSE_SUCCESS; case BABEL_AE_IP6_LL: @@ -712,23 +910,70 @@ babel_write_seqno_request(struct babel_tlv *hdr, union babel_msg *m, struct babel_tlv_seqno_request *tlv = (void *) hdr; struct babel_msg_seqno_request *msg = &m->seqno_request; - uint len = sizeof(struct babel_tlv_seqno_request) + BYTES(msg->plen); + uint len = sizeof(struct babel_tlv_seqno_request) + NET_SIZE(&msg->net); if (len > max_len) return 0; TLV_HDR(tlv, BABEL_TLV_SEQNO_REQUEST, len); - tlv->ae = BABEL_AE_IP6; - tlv->plen = msg->plen; + + if (msg->net.type == NET_IP4) + { + tlv->ae = BABEL_AE_IP4; + tlv->plen = net4_pxlen(&msg->net); + put_ip4_px(tlv->addr, &msg->net); + } + else + { + tlv->ae = BABEL_AE_IP6; + tlv->plen = net6_pxlen(&msg->net); + put_ip6_px(tlv->addr, &msg->net); + } + put_u16(&tlv->seqno, msg->seqno); tlv->hop_count = msg->hop_count; put_u64(&tlv->router_id, msg->router_id); - put_ip6_px(tlv->addr, msg->prefix, msg->plen); return len; } static inline int +babel_read_subtlvs(struct babel_tlv *hdr, + union babel_msg *msg UNUSED, + struct babel_parse_state *state) +{ + struct babel_tlv *tlv; + + for (tlv = (void *) hdr + state->current_tlv_endpos; + (void *) tlv < (void *) hdr + TLV_LENGTH(hdr); + tlv = NEXT_TLV(tlv)) + { + /* + * The subtlv type space is non-contiguous (due to the mandatory bit), so + * use a switch for dispatch instead of the mapping array we use for TLVs + */ + switch (tlv->type) + { + case BABEL_SUBTLV_PAD1: + case BABEL_SUBTLV_PADN: + /* FIXME: Framing errors in PADN are silently ignored, see babel_process_packet() */ + break; + + default: + /* Unknown mandatory subtlv; PARSE_IGNORE ignores the whole TLV */ + if (tlv->type > 128) + { + DBG("Babel: Mandatory subtlv %d found; skipping TLV\n", tlv->type); + return PARSE_IGNORE; + } + break; + } + } + + return PARSE_SUCCESS; +} + +static inline int babel_read_tlv(struct babel_tlv *hdr, union babel_msg *msg, struct babel_parse_state *state) @@ -741,8 +986,14 @@ babel_read_tlv(struct babel_tlv *hdr, if (TLV_LENGTH(hdr) < tlv_data[hdr->type].min_length) return PARSE_ERROR; + state->current_tlv_endpos = tlv_data[hdr->type].min_length; memset(msg, 0, sizeof(*msg)); - return tlv_data[hdr->type].read_tlv(hdr, msg, state); + + int res = tlv_data[hdr->type].read_tlv(hdr, msg, state); + if (res != PARSE_SUCCESS) + return res; + + return babel_read_subtlvs(hdr, msg, state); } static uint @@ -797,7 +1048,7 @@ static uint babel_write_queue(struct babel_iface *ifa, list *queue) { struct babel_proto *p = ifa->proto; - struct babel_write_state state = {}; + struct babel_write_state state = { .next_hop_ip6 = ifa->addr }; if (EMPTY_LIST(*queue)) return 0; @@ -933,10 +1184,10 @@ babel_process_packet(struct babel_pkt_header *pkt, int len, byte *end = (byte *)pkt + plen; struct babel_parse_state state = { - .proto = p, - .ifa = ifa, - .saddr = saddr, - .next_hop = saddr, + .proto = p, + .ifa = ifa, + .saddr = saddr, + .next_hop_ip6 = saddr, }; if ((pkt->magic != BABEL_MAGIC) || (pkt->version != BABEL_VERSION)) @@ -1045,7 +1296,7 @@ babel_rx_hook(sock *sk, uint len) sk->iface->name, sk->faddr, sk->laddr); /* Silently ignore my own packets */ - if (ipa_equal(ifa->iface->addr->ip, sk->faddr)) + if (ipa_equal(sk->faddr, sk->saddr)) return 1; if (!ipa_is_link_local(sk->faddr)) @@ -1080,6 +1331,7 @@ 
babel_open_socket(struct babel_iface *ifa) sk->sport = ifa->cf->port; sk->dport = ifa->cf->port; sk->iface = ifa->iface; + sk->saddr = ifa->addr; sk->rx_hook = babel_rx_hook; sk->tx_hook = babel_tx_hook; diff --git a/proto/bfd/Makefile b/proto/bfd/Makefile index c28cedec..402122fc 100644 --- a/proto/bfd/Makefile +++ b/proto/bfd/Makefile @@ -1,5 +1,6 @@ -source=bfd.c packets.c io.c -root-rel=../../ -dir-name=proto/bfd +src := bfd.c io.c packets.c +obj := $(src-o-files) +$(all-daemon) +$(cf-local) -include ../../Rules +tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file diff --git a/proto/bfd/bfd.c b/proto/bfd/bfd.c index 79135fae..67ec2270 100644 --- a/proto/bfd/bfd.c +++ b/proto/bfd/bfd.c @@ -64,16 +64,15 @@ * ready, the protocol just creates a BFD request like any other protocol. * * The protocol uses a new generic event loop (structure &birdloop) from |io.c|, - * which supports sockets, timers and events like the main loop. Timers - * (structure &timer2) are new microsecond based timers, while sockets and - * events are the same. A birdloop is associated with a thread (field @thread) - * in which event hooks are executed. Most functions for setting event sources - * (like sk_start() or tm2_start()) must be called from the context of that - * thread. Birdloop allows to temporarily acquire the context of that thread for - * the main thread by calling birdloop_enter() and then birdloop_leave(), which - * also ensures mutual exclusion with all event hooks. Note that resources - * associated with a birdloop (like timers) should be attached to the - * independent resource pool, detached from the main resource tree. + * which supports sockets, timers and events like the main loop. A birdloop is + * associated with a thread (field @thread) in which event hooks are executed. + * Most functions for setting event sources (like sk_start() or tm_start()) must + * be called from the context of that thread. Birdloop allows to temporarily + * acquire the context of that thread for the main thread by calling + * birdloop_enter() and then birdloop_leave(), which also ensures mutual + * exclusion with all event hooks. Note that resources associated with a + * birdloop (like timers) should be attached to the independent resource pool, + * detached from the main resource tree. * * There are two kinds of interaction between the BFD core (running in the BFD * thread) and the rest of BFD (running in the main thread). 
The first kind are @@ -112,7 +111,7 @@ #define HASH_IP_KEY(n) n->addr #define HASH_IP_NEXT(n) n->next_ip #define HASH_IP_EQ(a,b) ipa_equal(a,b) -#define HASH_IP_FN(k) ipa_hash32(k) +#define HASH_IP_FN(k) ipa_hash(k) static list bfd_proto_list; static list bfd_wait_list; @@ -145,6 +144,7 @@ bfd_session_update_state(struct bfd_session *s, uint state, uint diag) bfd_lock_sessions(p); s->loc_state = state; s->loc_diag = diag; + s->last_state_change = current_time(); notify = !NODE_VALID(&s->n); if (notify) @@ -176,7 +176,7 @@ bfd_session_update_tx_interval(struct bfd_session *s) return; /* Set timer relative to last tx_timer event */ - tm2_set(s->tx_timer, s->last_tx + tx_int_l); + tm_set(s->tx_timer, s->last_tx + tx_int_l); } static void @@ -190,7 +190,7 @@ bfd_session_update_detection_time(struct bfd_session *s, int kick) if (!s->last_rx) return; - tm2_set(s->hold_timer, s->last_rx + timeout); + tm_set(s->hold_timer, s->last_rx + timeout); } static void @@ -211,16 +211,16 @@ bfd_session_control_tx_timer(struct bfd_session *s, int reset) goto stop; /* So TX timer should run */ - if (reset || !tm2_active(s->tx_timer)) + if (reset || !tm_active(s->tx_timer)) { s->last_tx = 0; - tm2_start(s->tx_timer, 0); + tm_start(s->tx_timer, 0); } return; stop: - tm2_stop(s->tx_timer); + tm_stop(s->tx_timer); s->last_tx = 0; } @@ -379,7 +379,7 @@ bfd_find_session_by_addr(struct bfd_proto *p, ip_addr addr) } static void -bfd_tx_timer_hook(timer2 *t) +bfd_tx_timer_hook(timer *t) { struct bfd_session *s = t->data; @@ -388,7 +388,7 @@ bfd_tx_timer_hook(timer2 *t) } static void -bfd_hold_timer_hook(timer2 *t) +bfd_hold_timer_hook(timer *t) { bfd_session_timeout(t->data); } @@ -432,13 +432,13 @@ bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface * s->passive = ifa->cf->passive; s->tx_csn = random_u32(); - s->tx_timer = tm2_new_init(p->tpool, bfd_tx_timer_hook, s, 0, 0); - s->hold_timer = tm2_new_init(p->tpool, bfd_hold_timer_hook, s, 0, 0); + s->tx_timer = tm_new_init(p->tpool, bfd_tx_timer_hook, s, 0, 0); + s->hold_timer = tm_new_init(p->tpool, bfd_hold_timer_hook, s, 0, 0); bfd_session_update_tx_interval(s); bfd_session_control_tx_timer(s, 1); init_list(&s->request_list); - s->last_state_change = now; + s->last_state_change = current_time(); TRACE(D_EVENTS, "Session to %I added", s->addr); @@ -879,9 +879,6 @@ bfd_notify_hook(sock *sk, uint len UNUSED) diag = s->loc_diag; bfd_unlock_sessions(p); - /* FIXME: convert to btime and move to bfd_session_update_state() */ - s->last_state_change = now; - s->notify_running = 1; WALK_LIST_DELSAFE(n, nn, s->request_list) bfd_request_notify(SKIP_BACK(struct bfd_request, n, n), state, diag); @@ -954,7 +951,7 @@ bfd_init_all(void) static struct proto * bfd_init(struct proto_config *c) { - struct proto *p = proto_new(c, sizeof(struct bfd_proto)); + struct proto *p = proto_new(c); p->neigh_notify = bfd_neigh_notify; @@ -983,8 +980,10 @@ bfd_start(struct proto *P) add_tail(&bfd_proto_list, &p->bfd_node); birdloop_enter(p->loop); - p->rx_1 = bfd_open_rx_sk(p, 0); - p->rx_m = bfd_open_rx_sk(p, 1); + p->rx4_1 = bfd_open_rx_sk(p, 0, SK_IPV4); + p->rx4_m = bfd_open_rx_sk(p, 1, SK_IPV4); + p->rx6_1 = bfd_open_rx_sk(p, 0, SK_IPV6); + p->rx6_m = bfd_open_rx_sk(p, 1, SK_IPV6); birdloop_leave(p->loop); bfd_take_requests(p); @@ -1078,7 +1077,7 @@ bfd_show_sessions(struct proto *P) byte tbuf[TM_DATETIME_BUFFER_SIZE]; struct bfd_proto *p = (struct bfd_proto *) P; uint state, diag UNUSED; - u32 tx_int, timeout; + btime tx_int, timeout; const char *ifname; if 
(p->p.proto_state != PS_UP) @@ -1099,15 +1098,14 @@ bfd_show_sessions(struct proto *P) state = s->loc_state; diag = s->loc_diag; ifname = (s->ifa && s->ifa->iface) ? s->ifa->iface->name : "---"; - tx_int = s->last_tx ? (MAX(s->des_min_tx_int, s->rem_min_rx_int) TO_MS) : 0; - timeout = (MAX(s->req_min_rx_int, s->rem_min_tx_int) TO_MS) * s->rem_detect_mult; + tx_int = s->last_tx ? MAX(s->des_min_tx_int, s->rem_min_rx_int) : 0; + timeout = (btime) MAX(s->req_min_rx_int, s->rem_min_tx_int) * s->rem_detect_mult; state = (state < 4) ? state : 0; - tm_format_datetime(tbuf, &config->tf_proto, s->last_state_change); + tm_format_time(tbuf, &config->tf_proto, s->last_state_change); - cli_msg(-1020, "%-25I %-10s %-10s %-10s %3u.%03u %3u.%03u", - s->addr, ifname, bfd_state_names[state], tbuf, - tx_int / 1000, tx_int % 1000, timeout / 1000, timeout % 1000); + cli_msg(-1020, "%-25I %-10s %-10s %-10s %7t %7t", + s->addr, ifname, bfd_state_names[state], tbuf, tx_int, timeout); } HASH_WALK_END; @@ -1118,6 +1116,7 @@ bfd_show_sessions(struct proto *P) struct protocol proto_bfd = { .name = "BFD", .template = "bfd%d", + .proto_size = sizeof(struct bfd_proto), .config_size = sizeof(struct bfd_config), .init = bfd_init, .start = bfd_start, diff --git a/proto/bfd/bfd.h b/proto/bfd/bfd.h index 46e09879..bc4fe969 100644 --- a/proto/bfd/bfd.h +++ b/proto/bfd/bfd.h @@ -87,8 +87,10 @@ struct bfd_proto sock *notify_ws; list notify_list; - sock *rx_1; - sock *rx_m; + sock *rx4_1; + sock *rx6_1; + sock *rx4_m; + sock *rx6_m; list iface_list; }; @@ -138,11 +140,11 @@ struct bfd_session btime last_tx; /* Time of last sent periodic control packet */ btime last_rx; /* Time of last received valid control packet */ - timer2 *tx_timer; /* Periodic control packet timer */ - timer2 *hold_timer; /* Timer for session down detection time */ + timer *tx_timer; /* Periodic control packet timer */ + timer *hold_timer; /* Timer for session down detection time */ list request_list; /* List of client requests (struct bfd_request) */ - bird_clock_t last_state_change; /* Time of last state change */ + btime last_state_change; /* Time of last state change */ u8 notify_running; /* 1 if notify hooks are running */ u8 rx_csn_known; /* Received crypto sequence number is known */ @@ -201,7 +203,7 @@ void bfd_show_sessions(struct proto *P); /* packets.c */ void bfd_send_ctl(struct bfd_proto *p, struct bfd_session *s, int final); -sock * bfd_open_rx_sk(struct bfd_proto *p, int multihop); +sock * bfd_open_rx_sk(struct bfd_proto *p, int multihop, int inet_version); sock * bfd_open_tx_sk(struct bfd_proto *p, ip_addr local, struct iface *ifa); diff --git a/proto/bfd/io.c b/proto/bfd/io.c index 8f4f5007..b01cbfce 100644 --- a/proto/bfd/io.c +++ b/proto/bfd/io.c @@ -18,10 +18,10 @@ #include "proto/bfd/io.h" #include "lib/buffer.h" -#include "lib/heap.h" #include "lib/lists.h" #include "lib/resource.h" #include "lib/event.h" +#include "lib/timer.h" #include "lib/socket.h" @@ -31,16 +31,12 @@ struct birdloop pthread_t thread; pthread_mutex_t mutex; - btime last_time; - btime real_time; - u8 use_monotonic_clock; - u8 stop_called; u8 poll_active; u8 wakeup_masked; int wakeup_fds[2]; - BUFFER(timer2 *) timers; + struct timeloop time; list event_list; list sock_list; uint sock_num; @@ -57,6 +53,7 @@ struct birdloop */ static pthread_key_t current_loop_key; +extern pthread_key_t current_time_key; static inline struct birdloop * birdloop_current(void) @@ -68,6 +65,7 @@ static inline void birdloop_set_current(struct birdloop *loop) { 
pthread_setspecific(current_loop_key, loop); + pthread_setspecific(current_time_key, loop ? &loop->time : &main_timeloop); } static inline void @@ -78,98 +76,6 @@ birdloop_init_current(void) /* - * Time clock - */ - -static void times_update_alt(struct birdloop *loop); - -static void -times_init(struct birdloop *loop) -{ - struct timespec ts; - int rv; - - rv = clock_gettime(CLOCK_MONOTONIC, &ts); - if (rv < 0) - { - log(L_WARN "Monotonic clock is missing"); - - loop->use_monotonic_clock = 0; - loop->last_time = 0; - loop->real_time = 0; - times_update_alt(loop); - return; - } - - if ((ts.tv_sec < 0) || (((s64) ts.tv_sec) > ((s64) 1 << 40))) - log(L_WARN "Monotonic clock is crazy"); - - loop->use_monotonic_clock = 1; - loop->last_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000); - loop->real_time = 0; -} - -static void -times_update_pri(struct birdloop *loop) -{ - struct timespec ts; - int rv; - - rv = clock_gettime(CLOCK_MONOTONIC, &ts); - if (rv < 0) - die("clock_gettime: %m"); - - btime new_time = ((s64) ts.tv_sec S) + (ts.tv_nsec / 1000); - - if (new_time < loop->last_time) - log(L_ERR "Monotonic clock is broken"); - - loop->last_time = new_time; - loop->real_time = 0; -} - -static void -times_update_alt(struct birdloop *loop) -{ - struct timeval tv; - int rv; - - rv = gettimeofday(&tv, NULL); - if (rv < 0) - die("gettimeofday: %m"); - - btime new_time = ((s64) tv.tv_sec S) + tv.tv_usec; - btime delta = new_time - loop->real_time; - - if ((delta < 0) || (delta > (60 S))) - { - if (loop->real_time) - log(L_WARN "Time jump, delta %d us", (int) delta); - - delta = 100 MS; - } - - loop->last_time += delta; - loop->real_time = new_time; -} - -static void -times_update(struct birdloop *loop) -{ - if (loop->use_monotonic_clock) - times_update_pri(loop); - else - times_update_alt(loop); -} - -btime -current_time(void) -{ - return birdloop_current()->last_time; -} - - -/* * Wakeup code for birdloop */ @@ -238,7 +144,7 @@ wakeup_drain(struct birdloop *loop) } static inline void -wakeup_do_kick(struct birdloop *loop) +wakeup_do_kick(struct birdloop *loop) { pipe_kick(loop->wakeup_fds[1]); } @@ -252,6 +158,16 @@ wakeup_kick(struct birdloop *loop) loop->wakeup_masked = 2; } +/* For notifications from outside */ +void +wakeup_kick_current(void) +{ + struct birdloop *loop = birdloop_current(); + + if (loop && loop->poll_active) + wakeup_kick(loop); +} + /* * Events @@ -272,7 +188,7 @@ events_init(struct birdloop *loop) static void events_fire(struct birdloop *loop) { - times_update(loop); + times_update(&loop->time); ev_run_list(&loop->event_list); } @@ -292,154 +208,6 @@ ev2_schedule(event *e) /* - * Timers - */ - -#define TIMER_LESS(a,b) ((a)->expires < (b)->expires) -#define TIMER_SWAP(heap,a,b,t) (t = heap[a], heap[a] = heap[b], heap[b] = t, \ - heap[a]->index = (a), heap[b]->index = (b)) - -static inline uint timers_count(struct birdloop *loop) -{ return loop->timers.used - 1; } - -static inline timer2 *timers_first(struct birdloop *loop) -{ return (loop->timers.used > 1) ? 
loop->timers.data[1] : NULL; } - - -static void -tm2_free(resource *r) -{ - timer2 *t = (timer2 *) r; - - tm2_stop(t); -} - -static void -tm2_dump(resource *r) -{ - timer2 *t = (timer2 *) r; - - debug("(code %p, data %p, ", t->hook, t->data); - if (t->randomize) - debug("rand %d, ", t->randomize); - if (t->recurrent) - debug("recur %d, ", t->recurrent); - if (t->expires) - debug("expires in %d ms)\n", (t->expires - current_time()) TO_MS); - else - debug("inactive)\n"); -} - - -static struct resclass tm2_class = { - "Timer", - sizeof(timer2), - tm2_free, - tm2_dump, - NULL, - NULL -}; - -timer2 * -tm2_new(pool *p) -{ - timer2 *t = ralloc(p, &tm2_class); - t->index = -1; - return t; -} - -void -tm2_set(timer2 *t, btime when) -{ - struct birdloop *loop = birdloop_current(); - uint tc = timers_count(loop); - - if (!t->expires) - { - t->index = ++tc; - t->expires = when; - BUFFER_PUSH(loop->timers) = t; - HEAP_INSERT(loop->timers.data, tc, timer2 *, TIMER_LESS, TIMER_SWAP); - } - else if (t->expires < when) - { - t->expires = when; - HEAP_INCREASE(loop->timers.data, tc, timer2 *, TIMER_LESS, TIMER_SWAP, t->index); - } - else if (t->expires > when) - { - t->expires = when; - HEAP_DECREASE(loop->timers.data, tc, timer2 *, TIMER_LESS, TIMER_SWAP, t->index); - } - - if (loop->poll_active && (t->index == 1)) - wakeup_kick(loop); -} - -void -tm2_start(timer2 *t, btime after) -{ - tm2_set(t, current_time() + MAX(after, 0)); -} - -void -tm2_stop(timer2 *t) -{ - if (!t->expires) - return; - - struct birdloop *loop = birdloop_current(); - uint tc = timers_count(loop); - - HEAP_DELETE(loop->timers.data, tc, timer2 *, TIMER_LESS, TIMER_SWAP, t->index); - BUFFER_POP(loop->timers); - - t->index = -1; - t->expires = 0; -} - -static void -timers_init(struct birdloop *loop) -{ - BUFFER_INIT(loop->timers, loop->pool, 4); - BUFFER_PUSH(loop->timers) = NULL; -} - -static void -timers_fire(struct birdloop *loop) -{ - btime base_time; - timer2 *t; - - times_update(loop); - base_time = loop->last_time; - - while (t = timers_first(loop)) - { - if (t->expires > base_time) - return; - - if (t->recurrent) - { - btime when = t->expires + t->recurrent; - - if (when <= loop->last_time) - when = loop->last_time + t->recurrent; - - if (t->randomize) - when += random() % (t->randomize + 1); - - tm2_set(t, when); - } - else - tm2_stop(t); - - t->hook(t); - } -} - - -/* * Sockets */ @@ -586,7 +354,7 @@ sockets_fire(struct birdloop *loop) sock **psk = loop->poll_sk.data; int poll_num = loop->poll_fd.used - 1; - times_update(loop); + times_update(&loop->time); /* Last fd is internal wakeup fd */ if (pfd[poll_num].revents & POLLIN) @@ -634,11 +402,10 @@ birdloop_new(void) loop->pool = p; pthread_mutex_init(&loop->mutex, NULL); - times_init(loop); wakeup_init(loop); events_init(loop); - timers_init(loop); + timers_init(&loop->time, p); sockets_init(loop); return loop; @@ -710,7 +477,7 @@ static void * birdloop_main(void *arg) { struct birdloop *loop = arg; - timer2 *t; + timer *t; int rv, timeout; birdloop_set_current(loop); @@ -719,13 +486,13 @@ birdloop_main(void *arg) while (1) { events_fire(loop); - timers_fire(loop); + timers_fire(&loop->time); - times_update(loop); + times_update(&loop->time); if (events_waiting(loop)) timeout = 0; - else if (t = timers_first(loop)) - timeout = (tm2_remains(t) TO_MS) + 1; + else if (t = timers_first(&loop->time)) + timeout = (tm_remains(t) TO_MS) + 1; else timeout = -1; @@ -756,7 +523,7 @@ birdloop_main(void *arg) if (rv) sockets_fire(loop); - timers_fire(loop); + timers_fire(&loop->time); } 
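The io.c changes above retire the BFD-private timer2 code in favour of the shared microsecond timers used throughout this diff (tm_new_init(), tm_start(), tm_set()). A minimal standalone sketch of the relative-versus-absolute scheduling convention follows, with btime assumed to be a 64-bit microsecond count and the postfix S/MS multipliers mimicking the style seen in the diff; it is not the lib/timer implementation.

```c
/* Standalone sketch, not lib/timer: btime is assumed to be microseconds and
 * the postfix S/MS multipliers mimic the style used in this diff. */
#include <stdio.h>
#include <stdint.h>

typedef int64_t btime;                /* microseconds */
#define MS *1000LL
#define S  *1000000LL

struct tmr { btime expires; };        /* 0 = inactive */

static btime fake_now = 42 S;         /* stand-in for current_time() */

/* Absolute scheduling, as in tm_set(ifa->timer, next_event). */
static void sketch_tm_set(struct tmr *t, btime when) { t->expires = when; }

/* Relative scheduling, as in tm_start(ifa->timer, 100 MS). */
static void sketch_tm_start(struct tmr *t, btime after)
{ sketch_tm_set(t, fake_now + ((after > 0) ? after : 0)); }

int main(void)
{
  struct tmr hello = { 0 };

  sketch_tm_start(&hello, 100 MS);        /* fires 100 ms from "now" */
  printf("expires at %lld us\n", (long long) hello.expires);

  sketch_tm_set(&hello, fake_now + 4 S);  /* absolute, e.g. ifa->next_hello */
  printf("expires at %lld us\n", (long long) hello.expires);
  return 0;
}
```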
loop->stop_called = 0; diff --git a/proto/bfd/io.h b/proto/bfd/io.h index 641ee054..ec706e9a 100644 --- a/proto/bfd/io.h +++ b/proto/bfd/io.h @@ -11,80 +11,15 @@ #include "lib/lists.h" #include "lib/resource.h" #include "lib/event.h" +#include "lib/timer.h" #include "lib/socket.h" -// #include "lib/timer.h" -typedef struct timer2 -{ - resource r; - void (*hook)(struct timer2 *); - void *data; - - btime expires; /* 0=inactive */ - uint randomize; /* Amount of randomization */ - uint recurrent; /* Timer recurrence */ - - int index; -} timer2; - - -btime current_time(void); - void ev2_schedule(event *e); - -timer2 *tm2_new(pool *p); -void tm2_set(timer2 *t, btime when); -void tm2_start(timer2 *t, btime after); -void tm2_stop(timer2 *t); - -static inline int -tm2_active(timer2 *t) -{ - return t->expires != 0; -} - -static inline btime -tm2_remains(timer2 *t) -{ - btime now = current_time(); - return (t->expires > now) ? (t->expires - now) : 0; -} - -static inline timer2 * -tm2_new_init(pool *p, void (*hook)(struct timer2 *), void *data, uint rec, uint rand) -{ - timer2 *t = tm2_new(p); - t->hook = hook; - t->data = data; - t->recurrent = rec; - t->randomize = rand; - return t; -} - -static inline void -tm2_set_max(timer2 *t, btime when) -{ - if (when > t->expires) - tm2_set(t, when); -} - -/* -static inline void -tm2_start_max(timer2 *t, btime after) -{ - btime rem = tm2_remains(t); - tm2_start(t, MAX_(rem, after)); -} -*/ - - void sk_start(sock *s); void sk_stop(sock *s); - - struct birdloop *birdloop_new(void); void birdloop_start(struct birdloop *loop); void birdloop_stop(struct birdloop *loop); diff --git a/proto/bfd/packets.c b/proto/bfd/packets.c index 129db72f..b76efda6 100644 --- a/proto/bfd/packets.c +++ b/proto/bfd/packets.c @@ -248,7 +248,7 @@ bfd_check_authentication(struct bfd_proto *p, struct bfd_session *s, struct bfd_ /* BFD CSNs are in 32-bit circular number space */ u32 csn = ntohl(auth->csn); if (s->rx_csn_known && - (((csn - s->rx_csn) > (3 * s->detect_mult)) || + (((csn - s->rx_csn) > (3 * (uint) s->detect_mult)) || (meticulous && (csn == s->rx_csn)))) { /* We want to report both new and old CSN */ @@ -405,10 +405,11 @@ bfd_err_hook(sock *sk, int err) } sock * -bfd_open_rx_sk(struct bfd_proto *p, int multihop) +bfd_open_rx_sk(struct bfd_proto *p, int multihop, int af) { sock *sk = sk_new(p->tpool); sk->type = SK_UDP; + sk->subtype = af; sk->sport = !multihop ? BFD_CONTROL_PORT : BFD_MULTI_CTL_PORT; sk->data = p; @@ -421,10 +422,6 @@ bfd_open_rx_sk(struct bfd_proto *p, int multihop) sk->priority = sk_priority_control; sk->flags = SKF_THREAD | SKF_LADDR_RX | (!multihop ? SKF_TTL_RX : 0); -#ifdef IPV6 - sk->flags |= SKF_V6ONLY; -#endif - if (sk_open(sk) < 0) goto err; @@ -456,10 +453,6 @@ bfd_open_tx_sk(struct bfd_proto *p, ip_addr local, struct iface *ifa) sk->ttl = ifa ? 255 : -1; sk->flags = SKF_THREAD | SKF_BIND | SKF_HIGH_PORT; -#ifdef IPV6 - sk->flags |= SKF_V6ONLY; -#endif - if (sk_open(sk) < 0) goto err; diff --git a/proto/bgp/Makefile b/proto/bgp/Makefile index a634cf0d..00aaef5e 100644 --- a/proto/bgp/Makefile +++ b/proto/bgp/Makefile @@ -1,5 +1,6 @@ -source=bgp.c attrs.c packets.c -root-rel=../../ -dir-name=proto/bgp +src := attrs.c bgp.c packets.c +obj := $(src-o-files) +$(all-daemon) +$(cf-local) -include ../../Rules +tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index b9e2490d..882ba44e 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -2,6 +2,8 @@ * BIRD -- BGP Attributes * * (c) 2000 Martin Mares <mj@ucw.cz> + * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2008--2016 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -39,890 +41,1249 @@ * specifies that such updates should be ignored, but that is generally * a bad idea. * - * Error checking of optional transitive attributes is done according to - * draft-ietf-idr-optional-transitive-03, but errors are handled always - * as withdraws. + * BGP attribute table has several hooks: * - * Unexpected AS_CONFED_* segments in AS_PATH are logged and removed, - * but unknown segments cause a session drop with Malformed AS_PATH - * error (see validate_path()). The behavior in such case is not - * explicitly specified by RFC 4271. RFC 5065 specifies that - * inconsistent AS_CONFED_* segments should cause a session drop, but - * implementations that pass invalid AS_CONFED_* segments are - * widespread. + * export - Hook that validates and normalizes attribute during export phase. + * Receives eattr, may modify it (e.g., sort community lists for canonical + * representation), UNSET() it (e.g., skip empty lists), or WITHDRAW() it if + * necessary. May assume that eattr has value valid w.r.t. its type, but may be + * invalid w.r.t. BGP constraints. Optional. * - * Error handling of AS4_* attributes is done as specified by RFC 6793. There - * are several possible inconsistencies between AGGREGATOR and AS4_AGGREGATOR - * that are not handled by that RFC, these are logged and ignored (see - * bgp_reconstruct_4b_attrs()). + * encode - Hook that converts internal representation to external one during + * packet writing. Receives eattr and puts it in the buffer (including attribute + * header). Returns number of bytes, or -1 if not enough space. May assume that + * eattr has value valid w.r.t. its type and validated by export hook. Mandatory + * for all known attributes that exist internally after export phase (i.e., all + * except pseudoattributes MP_(UN)REACH_NLRI). + * + * decode - Hook that converts external representation to internal one during + * packet parsing. Receives attribute data in buffer, validates it and adds + * attribute to ea_list. If data are invalid, steps DISCARD(), WITHDRAW() or + * bgp_parse_error() may be used to escape. Mandatory for all known attributes. + * + * format - Optional hook that converts eattr to textual representation. 
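 *
 * For illustration, one entry of the attribute table ties these hooks to the
 * attribute's type and flags; this mirrors the BA_ORIGIN descriptor defined
 * later in this file:
 *
 *   [BA_ORIGIN] = {
 *     .name = "origin",
 *     .type = EAF_TYPE_INT,
 *     .flags = BAF_TRANSITIVE,
 *     .export = bgp_export_origin,
 *     .encode = bgp_encode_u8,
 *     .decode = bgp_decode_origin,
 *     .format = bgp_format_origin,
 *   },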
*/ +// XXXX review pool usage : c->c.proto->pool -static byte bgp_mandatory_attrs[] = { BA_ORIGIN, BA_AS_PATH -#ifndef IPV6 -,BA_NEXT_HOP -#endif -}; -struct attr_desc { - char *name; - int expected_length; - int expected_flags; - int type; - int allow_in_ebgp; - int (*validate)(struct bgp_proto *p, byte *attr, int len); - void (*format)(eattr *ea, byte *buf, int buflen); +struct bgp_attr_desc { + const char *name; + uint type; + uint flags; + void (*export)(struct bgp_export_state *s, eattr *a); + int (*encode)(struct bgp_write_state *s, eattr *a, byte *buf, uint size); + void (*decode)(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to); + void (*format)(eattr *ea, byte *buf, uint size); }; -#define IGNORE -1 -#define WITHDRAW -2 +static const struct bgp_attr_desc bgp_attr_table[]; + +static inline int bgp_attr_known(uint code); + +eattr * +bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintptr_t val) +{ + ASSERT(bgp_attr_known(code)); + + ea_list *a = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr)); + eattr *e = &a->attrs[0]; + + a->flags = EALF_SORTED; + a->count = 1; + a->next = *attrs; + *attrs = a; + + e->id = EA_CODE(EAP_BGP, code); + e->type = bgp_attr_table[code].type; + e->flags = flags; + + if (e->type & EAF_EMBEDDED) + e->u.data = (u32) val; + else + e->u.ptr = (struct adata *) val; + + return e; +} + + + +#define REPORT(msg, args...) \ + ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); }) + +#define DISCARD(msg, args...) \ + ({ REPORT(msg, ## args); return; }) + +#define WITHDRAW(msg, args...) \ + ({ REPORT(msg, ## args); s->err_withdraw = 1; return; }) + +#define UNSET(a) \ + ({ a->type = EAF_TYPE_UNDEF; return; }) + +#define NEW_BGP "Discarding %s attribute received from AS4-aware neighbor" +#define BAD_EBGP "Discarding %s attribute received from EBGP neighbor" +#define BAD_LENGTH "Malformed %s attribute - invalid length (%u)" +#define BAD_VALUE "Malformed %s attribute - invalid value (%u)" +#define NO_MANDATORY "Missing mandatory %s attribute" + + +static inline int +bgp_put_attr_hdr3(byte *buf, uint code, uint flags, uint len) +{ + *buf++ = flags; + *buf++ = code; + *buf++ = len; + return 3; +} + +static inline int +bgp_put_attr_hdr4(byte *buf, uint code, uint flags, uint len) +{ + *buf++ = flags | BAF_EXT_LEN; + *buf++ = code; + put_u16(buf, len); + return 4; +} + +static inline int +bgp_put_attr_hdr(byte *buf, uint code, uint flags, uint len) +{ + if (len < 256) + return bgp_put_attr_hdr3(buf, code, flags, len); + else + return bgp_put_attr_hdr4(buf, code, flags, len); +} static int -bgp_check_origin(struct bgp_proto *p UNUSED, byte *a, int len UNUSED) +bgp_encode_u8(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) { - if (*a > 2) - return 6; - return 0; + if (size < (3+1)) + return -1; + + bgp_put_attr_hdr3(buf, EA_ID(a->id), a->flags, 1); + buf[3] = a->u.data; + + return 3+1; } -static void -bgp_format_origin(eattr *a, byte *buf, int buflen UNUSED) +static int +bgp_encode_u32(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) { - static char *bgp_origin_names[] = { "IGP", "EGP", "Incomplete" }; + if (size < (3+4)) + return -1; + + bgp_put_attr_hdr3(buf, EA_ID(a->id), a->flags, 4); + put_u32(buf+3, a->u.data); - bsprintf(buf, bgp_origin_names[a->u.data]); + return 3+4; } static int -path_segment_contains(byte *p, int bs, u32 asn) +bgp_encode_u32s(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) { - int i; - int len = p[1]; - p += 2; + uint len = 
a->u.ptr->length; - for(i=0; i<len; i++) - { - u32 asn2 = (bs == 4) ? get_u32(p) : get_u16(p); - if (asn2 == asn) - return 1; - p += bs; - } + if (size < (4+len)) + return -1; - return 0; + uint hdr = bgp_put_attr_hdr(buf, EA_ID(a->id), a->flags, len); + put_u32s(buf + hdr, (u32 *) a->u.ptr->data, len / 4); + + return hdr + len; } -/* Validates path attribute, removes AS_CONFED_* segments, and also returns path length */ static int -validate_path(struct bgp_proto *p, int as_path, int bs, byte *idata, uint *ilength) +bgp_put_attr(byte *buf, uint size, uint code, uint flags, byte *data, uint len) { - int res = 0; - u8 *a, *dst; - int len, plen; + if (size < (4+len)) + return -1; - dst = a = idata; - len = *ilength; + uint hdr = bgp_put_attr_hdr(buf, code, flags, len); + memcpy(buf + hdr, data, len); - while (len) - { - if (len < 2) - return -1; - - plen = 2 + bs * a[1]; - if (len < plen) - return -1; - - if (a[1] == 0) - { - log(L_WARN "%s: %s_PATH attribute contains empty segment, skipping it", - p->p.name, as_path ? "AS" : "AS4"); - goto skip; - } - - switch (a[0]) - { - case AS_PATH_SET: - res++; - break; - - case AS_PATH_SEQUENCE: - res += a[1]; - break; - - case AS_PATH_CONFED_SEQUENCE: - case AS_PATH_CONFED_SET: - if (as_path && path_segment_contains(a, bs, p->remote_as)) - { - log(L_WARN "%s: AS_CONFED_* segment with peer ASN found, misconfigured confederation?", p->p.name); - return -1; - } - - log(L_WARN "%s: %s_PATH attribute contains AS_CONFED_* segment, skipping segment", - p->p.name, as_path ? "AS" : "AS4"); - goto skip; - - default: - return -1; - } - - if (dst != a) - memmove(dst, a, plen); - dst += plen; - - skip: - len -= plen; - a += plen; - } + return hdr + len; +} - *ilength = dst - idata; - return res; +static int +bgp_encode_raw(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) +{ + return bgp_put_attr(buf, size, EA_ID(a->id), a->flags, a->u.ptr->data, a->u.ptr->length); } -static inline int -validate_as_path(struct bgp_proto *p, byte *a, int *len) + +/* + * Attribute hooks + */ + +static void +bgp_export_origin(struct bgp_export_state *s, eattr *a) { - return validate_path(p, 1, p->as4_session ? 4 : 2, a, len); + if (a->u.data > 2) + WITHDRAW(BAD_VALUE, "ORIGIN", a->u.data); } -static inline int -validate_as4_path(struct bgp_proto *p, struct adata *path) +static void +bgp_decode_origin(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - return validate_path(p, 0, 4, path->data, &path->length); + if (len != 1) + WITHDRAW(BAD_LENGTH, "ORIGIN", len); + + if (data[0] > 2) + WITHDRAW(BAD_VALUE, "ORIGIN", data[0]); + + bgp_set_attr_u32(to, s->pool, BA_ORIGIN, flags, data[0]); } +static void +bgp_format_origin(eattr *a, byte *buf, uint size UNUSED) +{ + static const char *bgp_origin_names[] = { "IGP", "EGP", "Incomplete" }; + + bsprintf(buf, (a->u.data <= 2) ? 
bgp_origin_names[a->u.data] : "?"); +} + + static int -bgp_check_next_hop(struct bgp_proto *p UNUSED, byte *a UNUSED6, int len UNUSED6) +bgp_encode_as_path(struct bgp_write_state *s, eattr *a, byte *buf, uint size) { -#ifdef IPV6 - return IGNORE; -#else - ip_addr addr; + byte *data = a->u.ptr->data; + uint len = a->u.ptr->length; + + if (!s->as4_session) + { + /* Prepare 16-bit AS_PATH (from 32-bit one) in a temporary buffer */ + byte *src = data; + data = alloca(len); + len = as_path_32to16(data, src, len); + } + + return bgp_put_attr(buf, size, BA_AS_PATH, a->flags, data, len); +} - memcpy(&addr, a, len); - ipa_ntoh(addr); - if (ipa_classify(addr) & IADDR_HOST) +static void +bgp_decode_as_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) +{ + struct bgp_proto *p = s->proto; + int as_length = s->as4_session ? 4 : 2; + int as_confed = p->cf->confederation && p->is_interior; + char err[128]; + + if (!as_path_valid(data, len, as_length, as_confed, err, sizeof(err))) + WITHDRAW("Malformed AS_PATH attribute - %s", err); + + /* In some circumstances check for initial AS_CONFED_SEQUENCE; RFC 5065 5.0 */ + if (p->is_interior && !p->is_internal && + ((len < 2) || (data[0] != AS_PATH_CONFED_SEQUENCE))) + WITHDRAW("Malformed AS_PATH attribute - %s", "missing initial AS_CONFED_SEQUENCE"); + + if (!s->as4_session) + { + /* Prepare 32-bit AS_PATH (from 16-bit one) in a temporary buffer */ + byte *src = data; + data = alloca(2*len); + len = as_path_16to32(data, src, len); + } + + bgp_set_attr_data(to, s->pool, BA_AS_PATH, flags, data, len); +} + + +static int +bgp_encode_next_hop(struct bgp_write_state *s, eattr *a, byte *buf, uint size) +{ + /* + * The NEXT_HOP attribute is used only in traditional (IPv4) BGP. In MP-BGP, + * the next hop is encoded as a part of the MP_REACH_NLRI attribute, so we + * store it and encode it later by AFI-specific hooks. 
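 *
 * For the traditional IPv4 case the attribute is 7 bytes on the wire; assuming
 * the usual flags octet 0x40 (well-known transitive) and a next hop of
 * 192.0.2.1, the encoded bytes would be:
 *
 *   0x40                  attribute flags (transitive)
 *   0x03                  attribute code (NEXT_HOP)
 *   0x04                  attribute length
 *   0xC0 0x00 0x02 0x01   next hop address 192.0.2.1
 *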
+ */ + + if (s->channel->afi == BGP_AF_IPV4) + { + ASSERT(a->u.ptr->length == sizeof(ip_addr)); + + if (size < (3+4)) + return -1; + + bgp_put_attr_hdr3(buf, BA_NEXT_HOP, a->flags, 4); + put_ip4(buf+3, ipa_to_ip4( *(ip_addr *) a->u.ptr->data )); + + return 3+4; + } + else + { + s->mp_next_hop = a; return 0; + } +} + +static void +bgp_decode_next_hop(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data, uint len, ea_list **to UNUSED) +{ + if (len != 4) + WITHDRAW(BAD_LENGTH, "NEXT_HOP", len); + + /* Semantic checks are done later */ + s->ip_next_hop_len = len; + s->ip_next_hop_data = data; +} + +/* TODO: This function should use AF-specific hook */ +static void +bgp_format_next_hop(eattr *a, byte *buf, uint size UNUSED) +{ + ip_addr *nh = (void *) a->u.ptr->data; + uint len = a->u.ptr->length; + + ASSERT((len == 16) || (len == 32)); + + /* in IPv6, we may have two addresses in NEXT HOP */ + if ((len == 16) || ipa_zero(nh[1])) + bsprintf(buf, "%I", nh[0]); else - return 8; -#endif + bsprintf(buf, "%I %I", nh[0], nh[1]); } + static void -bgp_format_next_hop(eattr *a, byte *buf, int buflen UNUSED) +bgp_decode_med(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - ip_addr *ipp = (ip_addr *) a->u.ptr->data; -#ifdef IPV6 - /* in IPv6, we might have two addresses in NEXT HOP */ - if ((a->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(ipp[1])) - { - bsprintf(buf, "%I %I", ipp[0], ipp[1]); - return; - } -#endif + if (len != 4) + WITHDRAW(BAD_LENGTH, "MULTI_EXIT_DISC", len); - bsprintf(buf, "%I", ipp[0]); + u32 val = get_u32(data); + bgp_set_attr_u32(to, s->pool, BA_MULTI_EXIT_DISC, flags, val); } -static int -bgp_check_aggregator(struct bgp_proto *p, byte *a UNUSED, int len) + +static void +bgp_export_local_pref(struct bgp_export_state *s, eattr *a) { - int exp_len = p->as4_session ? 8 : 6; - - return (len == exp_len) ? 0 : WITHDRAW; + if (!s->proto->is_interior && !s->proto->cf->allow_local_pref) + UNSET(a); } static void -bgp_format_aggregator(eattr *a, byte *buf, int buflen UNUSED) +bgp_decode_local_pref(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - struct adata *ad = a->u.ptr; - byte *data = ad->data; - u32 as; + if (!s->proto->is_interior && !s->proto->cf->allow_local_pref) + DISCARD(BAD_EBGP, "LOCAL_PREF"); - as = get_u32(data); - data += 4; + if (len != 4) + WITHDRAW(BAD_LENGTH, "LOCAL_PREF", len); - bsprintf(buf, "%d.%d.%d.%d AS%u", data[0], data[1], data[2], data[3], as); + u32 val = get_u32(data); + bgp_set_attr_u32(to, s->pool, BA_LOCAL_PREF, flags, val); } -static int -bgp_check_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len) + +static void +bgp_decode_atomic_aggr(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to) { - return ((len % 4) == 0) ? 0 : WITHDRAW; + if (len != 0) + DISCARD(BAD_LENGTH, "ATOMIC_AGGR", len); + + bgp_set_attr_data(to, s->pool, BA_ATOMIC_AGGR, flags, NULL, 0); } static int -bgp_check_cluster_list(struct bgp_proto *p UNUSED, byte *a UNUSED, int len) +bgp_encode_aggregator(struct bgp_write_state *s, eattr *a, byte *buf, uint size) { - return ((len % 4) == 0) ? 
0 : 5; + byte *data = a->u.ptr->data; + uint len = a->u.ptr->length; + + if (!s->as4_session) + { + /* Prepare 16-bit AGGREGATOR (from 32-bit one) in a temporary buffer */ + byte *src = data; + data = alloca(6); + len = aggregator_32to16(data, src); + } + + return bgp_put_attr(buf, size, BA_AGGREGATOR, a->flags, data, len); } static void -bgp_format_cluster_list(eattr *a, byte *buf, int buflen) +bgp_decode_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - /* Truncates cluster lists larger than buflen, probably not a problem */ - int_set_format(a->u.ptr, 0, -1, buf, buflen); + if (len != (s->as4_session ? 8 : 6)) + DISCARD(BAD_LENGTH, "AGGREGATOR", len); + + if (!s->as4_session) + { + /* Prepare 32-bit AGGREGATOR (from 16-bit one) in a temporary buffer */ + byte *src = data; + data = alloca(8); + len = aggregator_16to32(data, src); + } + + bgp_set_attr_data(to, s->pool, BA_AGGREGATOR, flags, data, len); } -static int -bgp_check_reach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED) +static void +bgp_format_aggregator(eattr *a, byte *buf, uint size UNUSED) { -#ifdef IPV6 - p->mp_reach_start = a; - p->mp_reach_len = len; -#endif - return IGNORE; + byte *data = a->u.ptr->data; + + bsprintf(buf, "%I4 AS%u", get_ip4(data+4), get_u32(data+0)); } -static int -bgp_check_unreach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED) + +static void +bgp_export_community(struct bgp_export_state *s, eattr *a) { -#ifdef IPV6 - p->mp_unreach_start = a; - p->mp_unreach_len = len; -#endif - return IGNORE; + if (a->u.ptr->length == 0) + UNSET(a); + + a->u.ptr = int_set_sort(s->pool, a->u.ptr); } -static int -bgp_check_ext_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len) +static void +bgp_decode_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - return ((len % 8) == 0) ? 0 : WITHDRAW; + if (!len || (len % 4)) + WITHDRAW(BAD_LENGTH, "COMMUNITY", len); + + struct adata *ad = lp_alloc_adata(s->pool, len); + get_u32s(data, (u32 *) ad->data, len / 4); + bgp_set_attr_ptr(to, s->pool, BA_COMMUNITY, flags, ad); } -static int -bgp_check_large_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len) -{ - return ((len % 12) == 0) ? 
0 : WITHDRAW; -} - - -static struct attr_desc bgp_attr_table[] = { - { NULL, -1, 0, 0, 0, /* Undefined */ - NULL, NULL }, - { "origin", 1, BAF_TRANSITIVE, EAF_TYPE_INT, 1, /* BA_ORIGIN */ - bgp_check_origin, bgp_format_origin }, - { "as_path", -1, BAF_TRANSITIVE, EAF_TYPE_AS_PATH, 1, /* BA_AS_PATH */ - NULL, NULL }, /* is checked by validate_as_path() as a special case */ - { "next_hop", 4, BAF_TRANSITIVE, EAF_TYPE_IP_ADDRESS, 1, /* BA_NEXT_HOP */ - bgp_check_next_hop, bgp_format_next_hop }, - { "med", 4, BAF_OPTIONAL, EAF_TYPE_INT, 1, /* BA_MULTI_EXIT_DISC */ - NULL, NULL }, - { "local_pref", 4, BAF_TRANSITIVE, EAF_TYPE_INT, 1, /* BA_LOCAL_PREF */ - NULL, NULL }, - { "atomic_aggr", 0, BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_ATOMIC_AGGR */ - NULL, NULL }, - { "aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_AGGREGATOR */ - bgp_check_aggregator, bgp_format_aggregator }, - { "community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_INT_SET, 1, /* BA_COMMUNITY */ - bgp_check_community, NULL }, - { "originator_id", 4, BAF_OPTIONAL, EAF_TYPE_ROUTER_ID, 0, /* BA_ORIGINATOR_ID */ - NULL, NULL }, - { "cluster_list", -1, BAF_OPTIONAL, EAF_TYPE_INT_SET, 0, /* BA_CLUSTER_LIST */ - bgp_check_cluster_list, bgp_format_cluster_list }, - { .name = NULL }, /* BA_DPA */ - { .name = NULL }, /* BA_ADVERTISER */ - { .name = NULL }, /* BA_RCID_PATH */ - { "mp_reach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1, /* BA_MP_REACH_NLRI */ - bgp_check_reach_nlri, NULL }, - { "mp_unreach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1, /* BA_MP_UNREACH_NLRI */ - bgp_check_unreach_nlri, NULL }, - { "ext_community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_EC_SET, 1, /* BA_EXT_COMMUNITY */ - bgp_check_ext_community, NULL }, - { "as4_path", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_AS4_PATH */ - NULL, NULL }, - { "as4_aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_AS4_PATH */ - NULL, NULL }, - [BA_LARGE_COMMUNITY] = - { "large_community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_LC_SET, 1, - bgp_check_large_community, NULL } -}; -/* BA_AS4_PATH is type EAF_TYPE_OPAQUE and not type EAF_TYPE_AS_PATH. - * It does not matter as this attribute does not appear on routes in the routing table. 
- */ +static void +bgp_export_originator_id(struct bgp_export_state *s, eattr *a) +{ + if (!s->proto->is_internal) + UNSET(a); +} + +static void +bgp_decode_originator_id(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) +{ + if (!s->proto->is_internal) + DISCARD(BAD_EBGP, "ORIGINATOR_ID"); -#define ATTR_KNOWN(code) ((code) < ARRAY_SIZE(bgp_attr_table) && bgp_attr_table[code].name) + if (len != 4) + WITHDRAW(BAD_LENGTH, "ORIGINATOR_ID", len); -static inline struct adata * -bgp_alloc_adata(struct linpool *pool, unsigned len) + u32 val = get_u32(data); + bgp_set_attr_u32(to, s->pool, BA_ORIGINATOR_ID, flags, val); +} + + +static void +bgp_export_cluster_list(struct bgp_export_state *s UNUSED, eattr *a) { - struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len); - ad->length = len; - return ad; + if (!s->proto->is_internal) + UNSET(a); + + if (a->u.ptr->length == 0) + UNSET(a); } static void -bgp_set_attr(eattr *e, unsigned attr, uintptr_t val) +bgp_decode_cluster_list(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - ASSERT(ATTR_KNOWN(attr)); - e->id = EA_CODE(EAP_BGP, attr); - e->type = bgp_attr_table[attr].type; - e->flags = bgp_attr_table[attr].expected_flags; - if (e->type & EAF_EMBEDDED) - e->u.data = val; - else - e->u.ptr = (struct adata *) val; + if (!s->proto->is_internal) + DISCARD(BAD_EBGP, "CLUSTER_LIST"); + + if (!len || (len % 4)) + WITHDRAW(BAD_LENGTH, "CLUSTER_LIST", len); + + struct adata *ad = lp_alloc_adata(s->pool, len); + get_u32s(data, (u32 *) ad->data, len / 4); + bgp_set_attr_ptr(to, s->pool, BA_CLUSTER_LIST, flags, ad); } -static byte * -bgp_set_attr_wa(eattr *e, struct linpool *pool, unsigned attr, unsigned len) +static void +bgp_format_cluster_list(eattr *a, byte *buf, uint size) { - struct adata *ad = bgp_alloc_adata(pool, len); - bgp_set_attr(e, attr, (uintptr_t) ad); - return ad->data; + /* Truncates cluster lists larger than buflen, probably not a problem */ + int_set_format(a->u.ptr, 0, -1, buf, size); } -void -bgp_attach_attr(ea_list **to, struct linpool *pool, unsigned attr, uintptr_t val) + +static inline u32 +get_af3(byte *buf) { - ea_list *a = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr)); - a->next = *to; - *to = a; - a->flags = EALF_SORTED; - a->count = 1; - bgp_set_attr(a->attrs, attr, val); + return (get_u16(buf) << 16) | buf[2]; } -byte * -bgp_attach_attr_wa(ea_list **to, struct linpool *pool, unsigned attr, unsigned len) +static void +bgp_decode_mp_reach_nlri(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data, uint len, ea_list **to UNUSED) { - struct adata *ad = bgp_alloc_adata(pool, len); - bgp_attach_attr(to, pool, attr, (uintptr_t) ad); - return ad->data; + /* + * 2 B MP_REACH_NLRI data - Address Family Identifier + * 1 B MP_REACH_NLRI data - Subsequent Address Family Identifier + * 1 B MP_REACH_NLRI data - Length of Next Hop Network Address + * var MP_REACH_NLRI data - Network Address of Next Hop + * 1 B MP_REACH_NLRI data - Reserved (zero) + * var MP_REACH_NLRI data - Network Layer Reachability Information + */ + + if ((len < 5) || (len < (5 + (uint) data[3]))) + bgp_parse_error(s, 9); + + s->mp_reach_af = get_af3(data); + s->mp_next_hop_len = data[3]; + s->mp_next_hop_data = data + 4; + s->mp_reach_len = len - 5 - s->mp_next_hop_len; + s->mp_reach_nlri = data + 5 + s->mp_next_hop_len; } -static int -bgp_encode_attr_hdr(byte *dst, uint flags, unsigned code, int len) + +static void +bgp_decode_mp_unreach_nlri(struct 
bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data, uint len, ea_list **to UNUSED) { - int wlen; + /* + * 2 B MP_UNREACH_NLRI data - Address Family Identifier + * 1 B MP_UNREACH_NLRI data - Subsequent Address Family Identifier + * var MP_UNREACH_NLRI data - Network Layer Reachability Information + */ - DBG("\tAttribute %02x (%d bytes, flags %02x)\n", code, len, flags); + if (len < 3) + bgp_parse_error(s, 9); + + s->mp_unreach_af = get_af3(data); + s->mp_unreach_len = len - 3; + s->mp_unreach_nlri = data + 3; +} - if (len < 256) - { - *dst++ = flags; - *dst++ = code; - *dst++ = len; - wlen = 3; - } - else - { - *dst++ = flags | BAF_EXT_LEN; - *dst++ = code; - put_u16(dst, len); - wlen = 4; - } - return wlen; +static void +bgp_export_ext_community(struct bgp_export_state *s, eattr *a) +{ + if (a->u.ptr->length == 0) + UNSET(a); + + a->u.ptr = ec_set_sort(s->pool, a->u.ptr); } static void -aggregator_convert_to_old(struct adata *aggr, byte *dst, int *new_used) +bgp_decode_ext_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - byte *src = aggr->data; - *new_used = 0; + if (!len || (len % 8)) + WITHDRAW(BAD_LENGTH, "EXT_COMMUNITY", len); - u32 as = get_u32(src); - if (as > 0xFFFF) - { - as = AS_TRANS; - *new_used = 1; - } - put_u16(dst, as); + struct adata *ad = lp_alloc_adata(s->pool, len); + get_u32s(data, (u32 *) ad->data, len / 4); + bgp_set_attr_ptr(to, s->pool, BA_EXT_COMMUNITY, flags, ad); +} - /* Copy IPv4 address */ - memcpy(dst + 2, src + 4, 4); + +static void +bgp_decode_as4_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) +{ + if (s->as4_session) + DISCARD(NEW_BGP, "AS4_AGGREGATOR"); + + if (len != 8) + DISCARD(BAD_LENGTH, "AS4_AGGREGATOR", len); + + bgp_set_attr_data(to, s->pool, BA_AS4_AGGREGATOR, flags, data, len); } static void -aggregator_convert_to_new(struct adata *aggr, byte *dst) +bgp_decode_as4_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - byte *src = aggr->data; + char err[128]; - u32 as = get_u16(src); - put_u32(dst, as); + if (s->as4_session) + DISCARD(NEW_BGP, "AS4_PATH"); - /* Copy IPv4 address */ - memcpy(dst + 4, src + 2, 4); + if (len < 6) + DISCARD(BAD_LENGTH, "AS4_PATH", len); + + if (!as_path_valid(data, len, 4, 1, err, sizeof(err))) + DISCARD("Malformed AS4_PATH attribute - %s", err); + + struct adata *a = lp_alloc_adata(s->pool, len); + memcpy(a->data, data, len); + + /* AS_CONFED* segments are invalid in AS4_PATH; RFC 6793 6 */ + if (as_path_contains_confed(a)) + { + REPORT("Discarding AS_CONFED* segment from AS4_PATH attribute"); + a = as_path_strip_confed(s->pool, a); + } + + bgp_set_attr_ptr(to, s->pool, BA_AS4_PATH, flags, a); +} + +static void +bgp_export_large_community(struct bgp_export_state *s, eattr *a) +{ + if (a->u.ptr->length == 0) + UNSET(a); + + a->u.ptr = lc_set_sort(s->pool, a->u.ptr); +} + +static void +bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) +{ + if (!len || (len % 12)) + WITHDRAW(BAD_LENGTH, "LARGE_COMMUNITY", len); + + struct adata *ad = lp_alloc_adata(s->pool, len); + get_u32s(data, (u32 *) ad->data, len / 4); + bgp_set_attr_ptr(to, s->pool, BA_LARGE_COMMUNITY, flags, ad); +} + +static void +bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a) +{ + net_addr *n = s->route->net->n.addr; + u32 *labels = (u32 *) a->u.ptr->data; + uint lnum = a->u.ptr->length / 4; + + /* 
Perhaps we should just ignore it? */ + if (!s->mpls) + WITHDRAW("Unexpected MPLS stack"); + + /* Empty MPLS stack is not allowed */ + if (!lnum) + WITHDRAW("Malformed MPLS stack - empty"); + + /* This is ugly, but we must ensure that labels fit into NLRI field */ + if ((24*lnum + (net_is_vpn(n) ? 64 : 0) + net_pxlen(n)) > 255) + WITHDRAW("Malformed MPLS stack - too many labels (%u)", lnum); + + for (uint i = 0; i < lnum; i++) + { + if (labels[i] > 0xfffff) + WITHDRAW("Malformed MPLS stack - invalid label (%u)", labels[i]); + + /* TODO: Check for special-purpose label values? */ + } } static int -bgp_get_attr_len(eattr *a) +bgp_encode_mpls_label_stack(struct bgp_write_state *s, eattr *a, byte *buf UNUSED, uint size UNUSED) { - int len; - if (ATTR_KNOWN(EA_ID(a->id))) + /* + * MPLS labels are encoded as a part of the NLRI in MP_REACH_NLRI attribute, + * so we store MPLS_LABEL_STACK and encode it later by AFI-specific hooks. + */ + + s->mpls_labels = a->u.ptr; + return 0; +} + +static void +bgp_decode_mpls_label_stack(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data UNUSED, uint len UNUSED, ea_list **to UNUSED) +{ + DISCARD("Discarding received attribute #0"); +} + +static void +bgp_format_mpls_label_stack(eattr *a, byte *buf, uint size) +{ + u32 *labels = (u32 *) a->u.ptr->data; + uint lnum = a->u.ptr->length / 4; + char *pos = buf; + + for (uint i = 0; i < lnum; i++) + { + if (size < 20) { - int code = EA_ID(a->id); - struct attr_desc *desc = &bgp_attr_table[code]; - len = desc->expected_length; - if (len < 0) - { - ASSERT(!(a->type & EAF_EMBEDDED)); - len = a->u.ptr->length; - } + bsprintf(pos, "..."); + return; } + + uint l = bsprintf(pos, "%d/", labels[i]); + ADVANCE(pos, size, l); + } + + /* Clear last slash or terminate empty string */ + pos[lnum ? 
-1 : 0] = 0; +} + +static inline void +bgp_decode_unknown(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to) +{ + bgp_set_attr_data(to, s->pool, code, flags, data, len); +} + + +/* + * Attribute table + */ + +static const struct bgp_attr_desc bgp_attr_table[] = { + [BA_ORIGIN] = { + .name = "origin", + .type = EAF_TYPE_INT, + .flags = BAF_TRANSITIVE, + .export = bgp_export_origin, + .encode = bgp_encode_u8, + .decode = bgp_decode_origin, + .format = bgp_format_origin, + }, + [BA_AS_PATH] = { + .name = "as_path", + .type = EAF_TYPE_AS_PATH, + .flags = BAF_TRANSITIVE, + .encode = bgp_encode_as_path, + .decode = bgp_decode_as_path, + }, + [BA_NEXT_HOP] = { + .name = "next_hop", + .type = EAF_TYPE_IP_ADDRESS, + .flags = BAF_TRANSITIVE, + .encode = bgp_encode_next_hop, + .decode = bgp_decode_next_hop, + .format = bgp_format_next_hop, + }, + [BA_MULTI_EXIT_DISC] = { + .name = "med", + .type = EAF_TYPE_INT, + .flags = BAF_OPTIONAL, + .encode = bgp_encode_u32, + .decode = bgp_decode_med, + }, + [BA_LOCAL_PREF] = { + .name = "local_pref", + .type = EAF_TYPE_INT, + .flags = BAF_TRANSITIVE, + .export = bgp_export_local_pref, + .encode = bgp_encode_u32, + .decode = bgp_decode_local_pref, + }, + [BA_ATOMIC_AGGR] = { + .name = "atomic_aggr", + .type = EAF_TYPE_OPAQUE, + .flags = BAF_TRANSITIVE, + .encode = bgp_encode_raw, + .decode = bgp_decode_atomic_aggr, + }, + [BA_AGGREGATOR] = { + .name = "aggregator", + .type = EAF_TYPE_OPAQUE, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_aggregator, + .decode = bgp_decode_aggregator, + .format = bgp_format_aggregator, + }, + [BA_COMMUNITY] = { + .name = "community", + .type = EAF_TYPE_INT_SET, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .export = bgp_export_community, + .encode = bgp_encode_u32s, + .decode = bgp_decode_community, + }, + [BA_ORIGINATOR_ID] = { + .name = "originator_id", + .type = EAF_TYPE_ROUTER_ID, + .flags = BAF_OPTIONAL, + .export = bgp_export_originator_id, + .encode = bgp_encode_u32, + .decode = bgp_decode_originator_id, + }, + [BA_CLUSTER_LIST] = { + .name = "cluster_list", + .type = EAF_TYPE_INT_SET, + .flags = BAF_OPTIONAL, + .export = bgp_export_cluster_list, + .encode = bgp_encode_u32s, + .decode = bgp_decode_cluster_list, + .format = bgp_format_cluster_list, + }, + [BA_MP_REACH_NLRI] = { + .name = "mp_reach_nlri", + .type = EAF_TYPE_OPAQUE, + .flags = BAF_OPTIONAL, + .decode = bgp_decode_mp_reach_nlri, + }, + [BA_MP_UNREACH_NLRI] = { + .name = "mp_unreach_nlri", + .type = EAF_TYPE_OPAQUE, + .flags = BAF_OPTIONAL, + .decode = bgp_decode_mp_unreach_nlri, + }, + [BA_EXT_COMMUNITY] = { + .name = "ext_community", + .type = EAF_TYPE_EC_SET, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .export = bgp_export_ext_community, + .encode = bgp_encode_u32s, + .decode = bgp_decode_ext_community, + }, + [BA_AS4_PATH] = { + .name = "as4_path", + .type = EAF_TYPE_AS_PATH, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_raw, + .decode = bgp_decode_as4_path, + }, + [BA_AS4_AGGREGATOR] = { + .name = "as4_aggregator", + .type = EAF_TYPE_OPAQUE, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_raw, + .decode = bgp_decode_as4_aggregator, + .format = bgp_format_aggregator, + }, + [BA_LARGE_COMMUNITY] = { + .name = "large_community", + .type = EAF_TYPE_LC_SET, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .export = bgp_export_large_community, + .encode = bgp_encode_u32s, + .decode = bgp_decode_large_community, + }, + [BA_MPLS_LABEL_STACK] = { + .name = "mpls_label_stack", + .type 
= EAF_TYPE_INT_SET, + .export = bgp_export_mpls_label_stack, + .encode = bgp_encode_mpls_label_stack, + .decode = bgp_decode_mpls_label_stack, + .format = bgp_format_mpls_label_stack, + }, +}; + +static inline int +bgp_attr_known(uint code) +{ + return (code < ARRAY_SIZE(bgp_attr_table)) && bgp_attr_table[code].name; +} + + +/* + * Attribute export + */ + +static inline void +bgp_export_attr(struct bgp_export_state *s, eattr *a, ea_list *to) +{ + if (EA_PROTO(a->id) != EAP_BGP) + return; + + uint code = EA_ID(a->id); + + if (bgp_attr_known(code)) + { + const struct bgp_attr_desc *desc = &bgp_attr_table[code]; + + /* The flags might have been zero if the attr was added by filters */ + a->flags = (a->flags & BAF_PARTIAL) | desc->flags; + + /* Set partial bit if new opt-trans attribute is attached to non-local route */ + if ((s->src != NULL) && (a->type & EAF_ORIGINATED) && + (a->flags & BAF_OPTIONAL) && (a->flags & BAF_TRANSITIVE)) + a->flags |= BAF_PARTIAL; + + /* Call specific hook */ + CALL(desc->export, s, a); + + /* Attribute might become undefined in hook */ + if ((a->type & EAF_TYPE_MASK) == EAF_TYPE_UNDEF) + return; + } else - { - ASSERT((a->type & EAF_TYPE_MASK) == EAF_TYPE_OPAQUE); - len = a->u.ptr->length; - } - - return len; + { + /* Don't re-export unknown non-transitive attributes */ + if (!(a->flags & BAF_TRANSITIVE)) + return; + + a->flags |= BAF_PARTIAL; + } + + /* Append updated attribute */ + to->attrs[to->count++] = *a; } -#define ADVANCE(w, r, l) do { r -= l; w += l; } while (0) +/** + * bgp_export_attrs - export BGP attributes + * @s: BGP export state + * @attrs: a list of extended attributes + * + * The bgp_export_attrs() function takes a list of attributes and merges it to + * one newly allocated and sorted segment. Attributes are validated and + * normalized by type-specific export hooks and attribute flags are updated. + * Some attributes may be eliminated (e.g. unknown non-tranitive attributes, or + * empty community sets). + * + * Result: one sorted attribute list segment, or NULL if attributes are unsuitable. + */ +static inline ea_list * +bgp_export_attrs(struct bgp_export_state *s, ea_list *attrs) +{ + /* Merge the attribute list */ + ea_list *new = lp_alloc(s->pool, ea_scan(attrs)); + ea_merge(attrs, new); + ea_sort(new); + + uint i, count; + count = new->count; + new->count = 0; + + /* Export each attribute */ + for (i = 0; i < count; i++) + bgp_export_attr(s, &new->attrs[i], new); + + if (s->err_withdraw) + return NULL; + + return new; +} + + +/* + * Attribute encoding + */ + +static inline int +bgp_encode_attr(struct bgp_write_state *s, eattr *a, byte *buf, uint size) +{ + ASSERT(EA_PROTO(a->id) == EAP_BGP); + + uint code = EA_ID(a->id); + + if (bgp_attr_known(code)) + return bgp_attr_table[code].encode(s, a, buf, size); + else + return bgp_encode_raw(s, a, buf, size); +} /** * bgp_encode_attrs - encode BGP attributes - * @p: BGP instance - * @w: buffer + * @s: BGP write state * @attrs: a list of extended attributes - * @remains: remaining space in the buffer + * @buf: buffer + * @end: buffer end * * The bgp_encode_attrs() function takes a list of extended attributes * and converts it to its BGP representation (a part of an Update message). * * Result: Length of the attribute block generated or -1 if not enough space. 
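 *
 * Each attribute is framed by bgp_put_attr_hdr() (via the per-attribute encode
 * hooks) with a 3-byte header (flags, code, 1-byte length), or a 4-byte header
 * with BAF_EXT_LEN set once the data no longer fits into one length byte. As a
 * sketch, a 300-byte AS_PATH would start:
 *
 *   0x50        attribute flags (transitive | extended length)
 *   0x02        attribute code (AS_PATH)
 *   0x01 0x2C   2-byte length (300)
 *   ...         300 bytes of path segments
 *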
*/ -uint -bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains) +int +bgp_encode_attrs(struct bgp_write_state *s, ea_list *attrs, byte *buf, byte *end) { - uint i, code, type, flags; - byte *start = w; - int len, rv; + byte *pos = buf; + int i, len; - for(i=0; i<attrs->count; i++) - { - eattr *a = &attrs->attrs[i]; - ASSERT(EA_PROTO(a->id) == EAP_BGP); - code = EA_ID(a->id); - -#ifdef IPV6 - /* When talking multiprotocol BGP, the NEXT_HOP attributes are used only temporarily. */ - if (code == BA_NEXT_HOP) - continue; -#endif - - /* When AS4-aware BGP speaker is talking to non-AS4-aware BGP speaker, - * we have to convert our 4B AS_PATH to 2B AS_PATH and send our AS_PATH - * as optional AS4_PATH attribute. - */ - if ((code == BA_AS_PATH) && (! p->as4_session)) - { - len = a->u.ptr->length; - - if (remains < (len + 4)) - goto err_no_buffer; - - /* Using temporary buffer because don't know a length of created attr - * and therefore a length of a header. Perhaps i should better always - * use BAF_EXT_LEN. */ - - byte buf[len]; - int new_used; - int nl = as_path_convert_to_old(a->u.ptr, buf, &new_used); - - DBG("BGP: Encoding old AS_PATH\n"); - rv = bgp_encode_attr_hdr(w, BAF_TRANSITIVE, BA_AS_PATH, nl); - ADVANCE(w, remains, rv); - memcpy(w, buf, nl); - ADVANCE(w, remains, nl); - - if (! new_used) - continue; - - if (remains < (len + 4)) - goto err_no_buffer; - - /* We should discard AS_CONFED_SEQUENCE or AS_CONFED_SET path segments - * here but we don't support confederations and such paths we already - * discarded in bgp_check_as_path(). - */ - - DBG("BGP: Encoding AS4_PATH\n"); - rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_PATH, len); - ADVANCE(w, remains, rv); - memcpy(w, a->u.ptr->data, len); - ADVANCE(w, remains, len); - - continue; - } - - /* The same issue with AGGREGATOR attribute */ - if ((code == BA_AGGREGATOR) && (! p->as4_session)) - { - int new_used; - - len = 6; - if (remains < (len + 3)) - goto err_no_buffer; - - rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AGGREGATOR, len); - ADVANCE(w, remains, rv); - aggregator_convert_to_old(a->u.ptr, w, &new_used); - ADVANCE(w, remains, len); - - if (! new_used) - continue; - - len = 8; - if (remains < (len + 3)) - goto err_no_buffer; - - rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_AGGREGATOR, len); - ADVANCE(w, remains, rv); - memcpy(w, a->u.ptr->data, len); - ADVANCE(w, remains, len); - - continue; - } - - /* Standard path continues here ... 
*/ - - type = a->type & EAF_TYPE_MASK; - flags = a->flags & (BAF_OPTIONAL | BAF_TRANSITIVE | BAF_PARTIAL); - len = bgp_get_attr_len(a); - - /* Skip empty sets */ - if (((type == EAF_TYPE_INT_SET) || (type == EAF_TYPE_EC_SET) || (type == EAF_TYPE_LC_SET)) && (len == 0)) - continue; - - if (remains < len + 4) - goto err_no_buffer; - - rv = bgp_encode_attr_hdr(w, flags, code, len); - ADVANCE(w, remains, rv); - - switch (type) - { - case EAF_TYPE_INT: - case EAF_TYPE_ROUTER_ID: - if (len == 4) - put_u32(w, a->u.data); - else - *w = a->u.data; - break; - case EAF_TYPE_IP_ADDRESS: - { - ip_addr ip = *(ip_addr *)a->u.ptr->data; - ipa_hton(ip); - memcpy(w, &ip, len); - break; - } - case EAF_TYPE_INT_SET: - case EAF_TYPE_LC_SET: - case EAF_TYPE_EC_SET: - { - u32 *z = int_set_get_data(a->u.ptr); - int i; - for(i=0; i<len; i+=4) - put_u32(w+i, *z++); - break; - } - case EAF_TYPE_OPAQUE: - case EAF_TYPE_AS_PATH: - memcpy(w, a->u.ptr->data, len); - break; - default: - bug("bgp_encode_attrs: unknown attribute type %02x", a->type); - } - ADVANCE(w, remains, len); - } - return w - start; + for (i = 0; i < attrs->count; i++) + { + len = bgp_encode_attr(s, &attrs->attrs[i], pos, end - pos); - err_no_buffer: - return -1; + if (len < 0) + return -1; + + pos += len; + } + + return pos - buf; } + /* -static void -bgp_init_prefix(struct fib_node *N) + * Attribute decoding + */ + +static void bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool); + +static inline int +bgp_as_path_loopy(struct bgp_proto *p, ea_list *attrs, u32 asn) { - struct bgp_prefix *p = (struct bgp_prefix *) N; - p->bucket_node.next = NULL; + eattr *e = bgp_find_attr(attrs, BA_AS_PATH); + int num = p->cf->allow_local_as + 1; + return (e && (num > 0) && as_path_contains(e->u.ptr, asn, num)); } -*/ -static int -bgp_compare_u32(const u32 *x, const u32 *y) +static inline int +bgp_originator_id_loopy(struct bgp_proto *p, ea_list *attrs) { - return (*x < *y) ? -1 : (*x > *y) ? 1 : 0; + eattr *e = bgp_find_attr(attrs, BA_ORIGINATOR_ID); + return (e && (e->u.data == p->local_id)); } -static inline void -bgp_normalize_int_set(u32 *dest, u32 *src, unsigned cnt) +static inline int +bgp_cluster_list_loopy(struct bgp_proto *p, ea_list *attrs) { - memcpy(dest, src, sizeof(u32) * cnt); - qsort(dest, cnt, sizeof(u32), (int(*)(const void *, const void *)) bgp_compare_u32); + eattr *e = bgp_find_attr(attrs, BA_CLUSTER_LIST); + return (e && int_set_contains(e->u.ptr, p->rr_cluster_id)); } -static int -bgp_compare_ec(const u32 *xp, const u32 *yp) +static inline void +bgp_decode_attr(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to) { - u64 x = ec_get(xp, 0); - u64 y = ec_get(yp, 0); - return (x < y) ? -1 : (x > y) ? 
1 : 0; + /* Handle duplicate attributes; RFC 7606 3 (g) */ + if (BIT32_TEST(s->attrs_seen, code)) + { + if ((code == BA_MP_REACH_NLRI) || (code == BA_MP_UNREACH_NLRI)) + bgp_parse_error(s, 1); + else + DISCARD("Discarding duplicate attribute (code %u)", code); + } + BIT32_SET(s->attrs_seen, code); + + if (bgp_attr_known(code)) + { + const struct bgp_attr_desc *desc = &bgp_attr_table[code]; + + /* Handle conflicting flags; RFC 7606 3 (c) */ + if ((flags ^ desc->flags) & (BAF_OPTIONAL | BAF_TRANSITIVE)) + WITHDRAW("Malformed %s attribute - conflicting flags (%02x)", desc->name, flags); + + desc->decode(s, code, flags, data, len, to); + } + else /* Unknown attribute */ + { + if (!(flags & BAF_OPTIONAL)) + WITHDRAW("Unknown attribute (code %u) - conflicting flags (%02x)", code, flags); + + bgp_decode_unknown(s, code, flags, data, len, to); + } } -static inline void -bgp_normalize_ec_set(struct adata *ad, u32 *src, int internal) +/** + * bgp_decode_attrs - check and decode BGP attributes + * @s: BGP parse state + * @data: start of attribute block + * @len: length of attribute block + * + * This function takes a BGP attribute block (a part of an Update message), checks + * its consistency and converts it to a list of BIRD route attributes represented + * by an (uncached) &rta. + */ +ea_list * +bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len) { - u32 *dst = int_set_get_data(ad); + struct bgp_proto *p = s->proto; + ea_list *attrs = NULL; + uint code, flags, alen; + byte *pos = data; + + /* Parse the attributes */ + while (len) + { + alen = 0; - /* Remove non-transitive communities (EC_TBIT active) on external sessions */ - if (! internal) + /* Read attribute type */ + if (len < 2) + goto framing_error; + flags = pos[0]; + code = pos[1]; + ADVANCE(pos, len, 2); + + /* Read attribute length */ + if (flags & BAF_EXT_LEN) { - int len = int_set_get_size(ad); - u32 *t = dst; - int i; - - for (i=0; i < len; i += 2) - { - if (src[i] & EC_TBIT) - continue; - - *t++ = src[i]; - *t++ = src[i+1]; - } - - ad->length = (t - dst) * 4; + if (len < 2) + goto framing_error; + alen = get_u16(pos); + ADVANCE(pos, len, 2); + } + else + { + if (len < 1) + goto framing_error; + alen = *pos; + ADVANCE(pos, len, 1); } - else - memcpy(dst, src, ad->length); - qsort(dst, ad->length / 8, 8, (int(*)(const void *, const void *)) bgp_compare_ec); -} + if (alen > len) + goto framing_error; -static int -bgp_compare_lc(const u32 *x, const u32 *y) -{ - if (x[0] != y[0]) - return (x[0] > y[0]) ? 1 : -1; - if (x[1] != y[1]) - return (x[1] > y[1]) ? 1 : -1; - if (x[2] != y[2]) - return (x[2] > y[2]) ? 
1 : -1; - return 0; + DBG("Attr %02x %02x %u\n", code, flags, alen); + + bgp_decode_attr(s, code, flags, pos, alen, &attrs); + ADVANCE(pos, len, alen); + } + + if (s->err_withdraw) + goto withdraw; + + /* If there is no reachability NLRI, we are finished */ + if (!s->ip_reach_len && !s->mp_reach_len) + return NULL; + + + /* Handle missing mandatory attributes; RFC 7606 3 (d) */ + if (!BIT32_TEST(s->attrs_seen, BA_ORIGIN)) + { REPORT(NO_MANDATORY, "ORIGIN"); goto withdraw; } + + if (!BIT32_TEST(s->attrs_seen, BA_AS_PATH)) + { REPORT(NO_MANDATORY, "AS_PATH"); goto withdraw; } + + /* When receiving attributes from non-AS4-aware BGP speaker, we have to + reconstruct AS_PATH and AGGREGATOR attributes; RFC 6793 4.2.3 */ + if (!p->as4_session) + bgp_process_as4_attrs(&attrs, s->pool); + + /* Reject routes with our ASN in AS_PATH attribute */ + if (bgp_as_path_loopy(p, attrs, p->local_as)) + goto withdraw; + + /* Reject routes with our Confederation ID in AS_PATH attribute; RFC 5065 4.0 */ + if ((p->public_as != p->local_as) && bgp_as_path_loopy(p, attrs, p->public_as)) + goto withdraw; + + /* Reject routes with our Router ID in ORIGINATOR_ID attribute; RFC 4456 8 */ + if (p->is_internal && bgp_originator_id_loopy(p, attrs)) + goto withdraw; + + /* Reject routes with our Cluster ID in CLUSTER_LIST attribute; RFC 4456 8 */ + if (p->rr_client && bgp_cluster_list_loopy(p, attrs)) + goto withdraw; + + /* If there is no local preference, define one */ + if (!BIT32_TEST(s->attrs_seen, BA_LOCAL_PREF)) + bgp_set_attr_u32(&attrs, s->pool, BA_LOCAL_PREF, 0, p->cf->default_local_pref); + + return attrs; + + +framing_error: + /* RFC 7606 4 - handle attribute framing errors */ + REPORT("Malformed attribute list - framing error (%u/%u) at %d", + alen, len, (int) (pos - s->attrs)); + +withdraw: + /* RFC 7606 5.2 - handle missing NLRI during errors */ + if (!s->ip_reach_len && !s->mp_reach_len) + bgp_parse_error(s, 1); + + s->err_withdraw = 1; + return NULL; } -static inline void -bgp_normalize_lc_set(u32 *dest, u32 *src, unsigned cnt) + +/* + * Route bucket hash table + */ + +#define RBH_KEY(b) b->eattrs, b->hash +#define RBH_NEXT(b) b->next +#define RBH_EQ(a1,h1,a2,h2) h1 == h2 && ea_same(a1, a2) +#define RBH_FN(a,h) h + +#define RBH_REHASH bgp_rbh_rehash +#define RBH_PARAMS /8, *2, 2, 2, 8, 20 + + +HASH_DEFINE_REHASH_FN(RBH, struct bgp_bucket) + +void +bgp_init_bucket_table(struct bgp_channel *c) { - memcpy(dest, src, LCOMM_LENGTH * cnt); - qsort(dest, cnt, LCOMM_LENGTH, (int(*)(const void *, const void *)) bgp_compare_lc); -} + HASH_INIT(c->bucket_hash, c->pool, 8); -static void -bgp_rehash_buckets(struct bgp_proto *p) -{ - struct bgp_bucket **old = p->bucket_hash; - struct bgp_bucket **new; - unsigned oldn = p->hash_size; - unsigned i, e, mask; - struct bgp_bucket *b; - - p->hash_size = p->hash_limit; - DBG("BGP: Rehashing bucket table from %d to %d\n", oldn, p->hash_size); - p->hash_limit *= 4; - if (p->hash_limit >= 65536) - p->hash_limit = ~0; - new = p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *)); - mask = p->hash_size - 1; - for (i=0; i<oldn; i++) - while (b = old[i]) - { - old[i] = b->hash_next; - e = b->hash & mask; - b->hash_next = new[e]; - if (b->hash_next) - b->hash_next->hash_prev = b; - b->hash_prev = NULL; - new[e] = b; - } - mb_free(old); + init_list(&c->bucket_queue); + c->withdraw_bucket = NULL; } static struct bgp_bucket * -bgp_new_bucket(struct bgp_proto *p, ea_list *new, unsigned hash) +bgp_get_bucket(struct bgp_channel *c, ea_list *new) { - struct 
bgp_bucket *b; - unsigned ea_size = sizeof(ea_list) + new->count * sizeof(eattr); - unsigned ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN); - unsigned size = sizeof(struct bgp_bucket) + ea_size_aligned; - unsigned i; + /* Hash and lookup */ + u32 hash = ea_hash(new); + struct bgp_bucket *b = HASH_FIND(c->bucket_hash, RBH, new, hash); + + if (b) + return b; + + uint ea_size = sizeof(ea_list) + new->count * sizeof(eattr); + uint ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN); + uint size = sizeof(struct bgp_bucket) + ea_size_aligned; + uint i; byte *dest; - unsigned index = hash & (p->hash_size - 1); /* Gather total size of non-inline attributes */ - for (i=0; i<new->count; i++) - { - eattr *a = &new->attrs[i]; - if (!(a->type & EAF_EMBEDDED)) - size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN); - } + for (i = 0; i < new->count; i++) + { + eattr *a = &new->attrs[i]; - /* Create the bucket and hash it */ - b = mb_alloc(p->p.pool, size); - b->hash_next = p->bucket_hash[index]; - if (b->hash_next) - b->hash_next->hash_prev = b; - p->bucket_hash[index] = b; - b->hash_prev = NULL; - b->hash = hash; - add_tail(&p->bucket_queue, &b->send_node); + if (!(a->type & EAF_EMBEDDED)) + size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN); + } + + /* Create the bucket */ + b = mb_alloc(c->pool, size); init_list(&b->prefixes); + b->hash = hash; + + /* Copy list of extended attributes */ memcpy(b->eattrs, new, ea_size); - dest = ((byte *)b->eattrs) + ea_size_aligned; + dest = ((byte *) b->eattrs) + ea_size_aligned; /* Copy values of non-inline attributes */ - for (i=0; i<new->count; i++) + for (i = 0; i < new->count; i++) + { + eattr *a = &b->eattrs->attrs[i]; + + if (!(a->type & EAF_EMBEDDED)) { - eattr *a = &b->eattrs->attrs[i]; - if (!(a->type & EAF_EMBEDDED)) - { - struct adata *oa = a->u.ptr; - struct adata *na = (struct adata *) dest; - memcpy(na, oa, sizeof(struct adata) + oa->length); - a->u.ptr = na; - dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN); - } + struct adata *oa = a->u.ptr; + struct adata *na = (struct adata *) dest; + memcpy(na, oa, sizeof(struct adata) + oa->length); + a->u.ptr = na; + dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN); } + } - /* If needed, rehash */ - p->hash_count++; - if (p->hash_count > p->hash_limit) - bgp_rehash_buckets(p); + /* Insert the bucket to send queue and bucket hash */ + add_tail(&c->bucket_queue, &b->send_node); + HASH_INSERT2(c->bucket_hash, RBH, c->pool, b); return b; } static struct bgp_bucket * -bgp_get_bucket(struct bgp_proto *p, net *n, ea_list *attrs, int originate) +bgp_get_withdraw_bucket(struct bgp_channel *c) { - ea_list *new; - unsigned i, cnt, hash, code; - eattr *a, *d; - u32 seen = 0; - struct bgp_bucket *b; - - /* Merge the attribute list */ - new = alloca(ea_scan(attrs)); - ea_merge(attrs, new); - ea_sort(new); + if (!c->withdraw_bucket) + { + c->withdraw_bucket = mb_allocz(c->pool, sizeof(struct bgp_bucket)); + init_list(&c->withdraw_bucket->prefixes); + } - /* Normalize attributes */ - d = new->attrs; - cnt = new->count; - new->count = 0; - for(i=0; i<cnt; i++) - { - a = &new->attrs[i]; - if (EA_PROTO(a->id) != EAP_BGP) - continue; - code = EA_ID(a->id); - if (ATTR_KNOWN(code)) - { - if (!p->is_internal) - { - if (!bgp_attr_table[code].allow_in_ebgp) - continue; - if ((code == BA_LOCAL_PREF) && !p->cf->allow_local_pref) - continue; - } - /* The flags might have been zero if the attr was added by filters */ - a->flags = 
(a->flags & BAF_PARTIAL) | bgp_attr_table[code].expected_flags; - if (code < 32) - seen |= 1 << code; - } - else - { - /* Don't re-export unknown non-transitive attributes */ - if (!(a->flags & BAF_TRANSITIVE)) - continue; - } - *d = *a; - if ((d->type & EAF_ORIGINATED) && !originate && (d->flags & BAF_TRANSITIVE) && (d->flags & BAF_OPTIONAL)) - d->flags |= BAF_PARTIAL; - switch (d->type & EAF_TYPE_MASK) - { - case EAF_TYPE_INT_SET: - { - struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length); - z->length = d->u.ptr->length; - bgp_normalize_int_set((u32 *) z->data, (u32 *) d->u.ptr->data, z->length / 4); - d->u.ptr = z; - break; - } - case EAF_TYPE_EC_SET: - { - struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length); - z->length = d->u.ptr->length; - bgp_normalize_ec_set(z, (u32 *) d->u.ptr->data, p->is_internal); - d->u.ptr = z; - break; - } - case EAF_TYPE_LC_SET: - { - struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length); - z->length = d->u.ptr->length; - bgp_normalize_lc_set((u32 *) z->data, (u32 *) d->u.ptr->data, z->length / LCOMM_LENGTH); - d->u.ptr = z; - break; - } - default: ; - } - d++; - new->count++; - } + return c->withdraw_bucket; +} - /* Hash */ - hash = ea_hash(new); - for(b=p->bucket_hash[hash & (p->hash_size - 1)]; b; b=b->hash_next) - if (b->hash == hash && ea_same(b->eattrs, new)) - { - DBG("Found bucket.\n"); - return b; - } - - /* Ensure that there are all mandatory attributes */ - for(i=0; i<ARRAY_SIZE(bgp_mandatory_attrs); i++) - if (!(seen & (1 << bgp_mandatory_attrs[i]))) - { - log(L_ERR "%s: Mandatory attribute %s missing in route %I/%d", p->p.name, bgp_attr_table[bgp_mandatory_attrs[i]].name, n->n.prefix, n->n.pxlen); - return NULL; - } - - /* Check if next hop is valid */ - a = ea_find(new, EA_CODE(EAP_BGP, BA_NEXT_HOP)); - if (!a || ipa_equal(p->cf->remote_ip, *(ip_addr *)a->u.ptr->data)) - { - log(L_ERR "%s: Invalid NEXT_HOP attribute in route %I/%d", p->p.name, n->n.prefix, n->n.pxlen); - return NULL; - } +void +bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b) +{ + rem_node(&b->send_node); + HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b); + mb_free(b); +} - /* Create new bucket */ - DBG("Creating bucket.\n"); - return bgp_new_bucket(p, new, hash); +void +bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b) +{ + rem_node(&b->send_node); + add_tail(&c->bucket_queue, &b->send_node); } void -bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck) +bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) { - if (buck->hash_next) - buck->hash_next->hash_prev = buck->hash_prev; - if (buck->hash_prev) - buck->hash_prev->hash_next = buck->hash_next; - else - p->bucket_hash[buck->hash & (p->hash_size-1)] = buck->hash_next; - mb_free(buck); + struct bgp_proto *p = (void *) c->c.proto; + struct bgp_bucket *wb = bgp_get_withdraw_bucket(c); + + log(L_ERR "%s: Attribute list too long", p->p.name); + while (!EMPTY_LIST(b->prefixes)) + { + struct bgp_prefix *px = HEAD(b->prefixes); + + log(L_ERR "%s: - withdrawing %N", p->p.name, &px->net); + rem_node(&px->buck_node); + add_tail(&wb->prefixes, &px->buck_node); + } } -/* Prefix hash table */ +/* + * Prefix hash table + */ -#define PXH_KEY(n1) n1->n.prefix, n1->n.pxlen, n1->path_id -#define PXH_NEXT(n) n->next -#define PXH_EQ(p1,l1,i1,p2,l2,i2) ipa_equal(p1, p2) && l1 == l2 && i1 == i2 -#define PXH_FN(p,l,i) ipa_hash32(p) ^ u32_hash((l << 16) ^ i) +#define PXH_KEY(px) px->net, px->path_id, px->hash +#define PXH_NEXT(px) px->next +#define 
PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && i1 == i2 && net_equal(n1, n2) +#define PXH_FN(n,i,h) h #define PXH_REHASH bgp_pxh_rehash #define PXH_PARAMS /8, *2, 2, 2, 8, 20 @@ -931,308 +1292,282 @@ bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck) HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix) void -bgp_init_prefix_table(struct bgp_proto *p, u32 order) +bgp_init_prefix_table(struct bgp_channel *c) { - HASH_INIT(p->prefix_hash, p->p.pool, order); + HASH_INIT(c->prefix_hash, c->pool, 8); - p->prefix_slab = sl_new(p->p.pool, sizeof(struct bgp_prefix)); + uint alen = net_addr_length[c->c.net_type]; + c->prefix_slab = alen ? sl_new(c->pool, sizeof(struct bgp_prefix) + alen) : NULL; } void -bgp_free_prefix_table(struct bgp_proto *p) +bgp_free_prefix_table(struct bgp_channel *c) { - HASH_FREE(p->prefix_hash); + HASH_FREE(c->prefix_hash); - rfree(p->prefix_slab); - p->prefix_slab = NULL; + rfree(c->prefix_slab); + c->prefix_slab = NULL; } static struct bgp_prefix * -bgp_get_prefix(struct bgp_proto *p, ip_addr prefix, int pxlen, u32 path_id) +bgp_get_prefix(struct bgp_channel *c, net_addr *net, u32 path_id) { - struct bgp_prefix *bp = HASH_FIND(p->prefix_hash, PXH, prefix, pxlen, path_id); + u32 hash = net_hash(net) ^ u32_hash(path_id); + struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id, hash); + + if (px) + { + rem_node(&px->buck_node); + return px; + } - if (bp) - return bp; + if (c->prefix_slab) + px = sl_alloc(c->prefix_slab); + else + px = mb_alloc(c->pool, sizeof(struct bgp_prefix) + net->length); - bp = sl_alloc(p->prefix_slab); - bp->n.prefix = prefix; - bp->n.pxlen = pxlen; - bp->path_id = path_id; - bp->bucket_node.next = NULL; + px->buck_node.next = NULL; + px->buck_node.prev = NULL; + px->hash = hash; + px->path_id = path_id; + net_copy(px->net, net); - HASH_INSERT2(p->prefix_hash, PXH, p->p.pool, bp); + HASH_INSERT2(c->prefix_hash, PXH, c->pool, px); - return bp; + return px; } void -bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp) +bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px) { - HASH_REMOVE2(p->prefix_hash, PXH, p->p.pool, bp); - sl_free(p->prefix_slab, bp); + rem_node(&px->buck_node); + HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px); + + if (c->prefix_slab) + sl_free(c->prefix_slab, px); + else + mb_free(px); } -void -bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs) +/* + * BGP protocol glue + */ + +int +bgp_import_control(struct proto *P, rte **new, ea_list **attrs UNUSED, struct linpool *pool UNUSED) { + rte *e = *new; + struct proto *SRC = e->attrs->src->proto; struct bgp_proto *p = (struct bgp_proto *) P; - struct bgp_bucket *buck; - struct bgp_prefix *px; - rte *key; - u32 path_id; + struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (struct bgp_proto *) SRC : NULL; - DBG("BGP: Got route %I/%d %s\n", n->n.prefix, n->n.pxlen, new ? "up" : "down"); + /* Reject our routes */ + if (src == p) + return -1; - if (new) - { - key = new; - buck = bgp_get_bucket(p, n, attrs, new->attrs->source != RTS_BGP); - if (!buck) /* Inconsistent attribute list */ - return; - } - else - { - key = old; - if (!(buck = p->withdraw_bucket)) - { - buck = p->withdraw_bucket = mb_alloc(P->pool, sizeof(struct bgp_bucket)); - init_list(&buck->prefixes); - } - } - path_id = p->add_path_tx ? 
key->attrs->src->global_id : 0; - px = bgp_get_prefix(p, n->n.prefix, n->n.pxlen, path_id); - if (px->bucket_node.next) - { - DBG("\tRemoving old entry.\n"); - rem_node(&px->bucket_node); - } - add_tail(&buck->prefixes, &px->bucket_node); - bgp_schedule_packet(p->conn, PKT_UPDATE); -} + /* Accept non-BGP routes */ + if (src == NULL) + return 0; -static int -bgp_create_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool) -{ - ea_list *ea = lp_alloc(pool, sizeof(ea_list) + 4*sizeof(eattr)); - rta *rta = e->attrs; - byte *z; + // XXXX: Check next hop AF - ea->next = *attrs; - *attrs = ea; - ea->flags = EALF_SORTED; - ea->count = 4; + /* IBGP route reflection, RFC 4456 */ + if (p->is_internal && src->is_internal && (p->local_as == src->local_as)) + { + /* Rejected unless configured as route reflector */ + if (!p->rr_client && !src->rr_client) + return -1; + + /* Generally, this should be handled when path is received, but we check it + also here as rr_cluster_id may be undefined or different in src. */ + if (p->rr_cluster_id && bgp_cluster_list_loopy(p, e->attrs->eattrs)) + return -1; + } - bgp_set_attr(ea->attrs, BA_ORIGIN, - ((rta->source == RTS_OSPF_EXT1) || (rta->source == RTS_OSPF_EXT2)) ? ORIGIN_INCOMPLETE : ORIGIN_IGP); + /* Handle well-known communities, RFC 1997 */ + struct eattr *c; + if (p->cf->interpret_communities && + (c = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_COMMUNITY)))) + { + struct adata *d = c->u.ptr; - if (p->is_internal) - bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 0); - else - { - z = bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 6); - z[0] = AS_PATH_SEQUENCE; - z[1] = 1; /* 1 AS */ - put_u32(z+2, p->local_as); - } + /* Do not export anywhere */ + if (int_set_contains(d, BGP_COMM_NO_ADVERTISE)) + return -1; - /* iBGP -> use gw, eBGP multi-hop -> use source_addr, - eBGP single-hop -> use gw if on the same iface */ - z = bgp_set_attr_wa(ea->attrs+2, pool, BA_NEXT_HOP, NEXT_HOP_LENGTH); - if (p->cf->next_hop_self || - rta->dest != RTD_ROUTER || - ipa_equal(rta->gw, IPA_NONE) || - ipa_is_link_local(rta->gw) || - (!p->is_internal && !p->cf->next_hop_keep && - (!p->neigh || (rta->iface != p->neigh->iface)))) - set_next_hop(z, p->source_addr); - else - set_next_hop(z, rta->gw); + /* Do not export outside of AS (or member-AS) */ + if (!p->is_internal && int_set_contains(d, BGP_COMM_NO_EXPORT_SUBCONFED)) + return -1; - bgp_set_attr(ea->attrs+3, BA_LOCAL_PREF, p->cf->default_local_pref); + /* Do not export outside of AS (or confederation) */ + if (!p->is_interior && int_set_contains(d, BGP_COMM_NO_EXPORT)) + return -1; + } - return 0; /* Leave decision to the filters */ + return 0; } -static inline int -bgp_as_path_loopy(struct bgp_proto *p, rta *a) -{ - int num = p->cf->allow_local_as + 1; - eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - return (e && (num > 0) && as_path_contains(e->u.ptr, p->local_as, num)); -} +static adata null_adata; /* adata of length 0 */ -static inline int -bgp_originator_id_loopy(struct bgp_proto *p, rta *a) -{ - eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID)); - return (e && (e->u.data == p->local_id)); -} - -static inline int -bgp_cluster_list_loopy(struct bgp_proto *p, rta *a) +static ea_list * +bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *attrs0, struct linpool *pool) { - eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST)); - return (e && p->rr_client && int_set_contains(e->u.ptr, p->rr_cluster_id)); -} + struct proto *SRC = 
e->attrs->src->proto; + struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (void *) SRC : NULL; + struct bgp_export_state s = { .proto = p, .channel = c, .pool = pool, .src = src, .route = e, .mpls = c->desc->mpls }; + ea_list *attrs = attrs0; + eattr *a; + adata *ad; + /* ORIGIN attribute - mandatory, attach if missing */ + if (! bgp_find_attr(attrs0, BA_ORIGIN)) + bgp_set_attr_u32(&attrs, pool, BA_ORIGIN, 0, src ? ORIGIN_INCOMPLETE : ORIGIN_IGP); -static inline void -bgp_path_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 as) -{ - eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - bgp_attach_attr(attrs, pool, BA_AS_PATH, (uintptr_t) as_path_prepend(pool, a->u.ptr, as)); -} + /* AS_PATH attribute - mandatory */ + a = bgp_find_attr(attrs0, BA_AS_PATH); + ad = a ? a->u.ptr : &null_adata; -static inline void -bgp_cluster_list_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 cid) -{ - eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST)); - bgp_attach_attr(attrs, pool, BA_CLUSTER_LIST, (uintptr_t) int_set_prepend(pool, a ? a->u.ptr : NULL, cid)); -} + /* AS_PATH attribute - strip AS_CONFED* segments outside confederation */ + if ((!p->cf->confederation || !p->is_interior) && as_path_contains_confed(ad)) + ad = as_path_strip_confed(pool, ad); -static int -bgp_update_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool, int rr) -{ - eattr *a; + /* AS_PATH attribute - keep or prepend ASN */ + if (p->is_internal || + (p->rs_client && src && src->rs_client)) + { + /* IBGP or route server -> just ensure there is one */ + if (!a) + bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, &null_adata); + } + else if (p->is_interior) + { + /* Confederation -> prepend ASN as AS_CONFED_SEQUENCE */ + ad = as_path_prepend2(pool, ad, AS_PATH_CONFED_SEQUENCE, p->public_as); + bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, ad); + } + else /* Regular EBGP (no RS, no confederation) */ + { + /* Regular EBGP -> prepend ASN as regular sequence */ + ad = as_path_prepend2(pool, ad, AS_PATH_SEQUENCE, p->public_as); + bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, ad); + + /* MULTI_EXIT_DESC attribute - accept only if set in export filter */ + a = bgp_find_attr(attrs0, BA_MULTI_EXIT_DISC); + if (a && !(a->type & EAF_FRESH)) + bgp_unset_attr(&attrs, pool, BA_MULTI_EXIT_DISC); + } - if (!p->is_internal && !p->rs_client) - { - bgp_path_prepend(e, attrs, pool, p->local_as); - - /* The MULTI_EXIT_DISC attribute received from a neighboring AS MUST NOT be - * propagated to other neighboring ASes. - * Perhaps it would be better to undefine it. - */ - a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); - if (a) - bgp_attach_attr(attrs, pool, BA_MULTI_EXIT_DISC, 0); - } + /* NEXT_HOP attribute - delegated to AF-specific hook */ + a = bgp_find_attr(attrs0, BA_NEXT_HOP); + bgp_update_next_hop(&s, a, &attrs); - /* iBGP -> keep next_hop, eBGP multi-hop -> use source_addr, - * eBGP single-hop -> keep next_hop if on the same iface. - * If the next_hop is zero (i.e. link-local), keep only if on the same iface. - * - * Note that same-iface-check uses iface from route, which is based on gw. 
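The AS_PATH step above prepends the speaker's public ASN, either as a regular AS_SEQUENCE or, inside a confederation, as an AS_CONFED_SEQUENCE. A minimal standalone sketch of the wire-level prepend on a 4-octet-ASN path follows; the helper names are illustrative and are not BIRD's as_path_prepend()/as_path_prepend2():

/* Prepend one 32-bit ASN to a wire-format AS_PATH made of
 * (type, count, count*4B ASNs) segments. If the first segment already has
 * the wanted type and is not full, the ASN is inserted into it; otherwise
 * a fresh one-element segment is added in front. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define AS_PATH_SEQUENCE 2

static uint32_t get_u32_be(const uint8_t *p)
{ return ((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16) | ((uint32_t) p[2] << 8) | p[3]; }

static void put_u32_be(uint8_t *p, uint32_t x)
{ p[0] = x >> 24; p[1] = x >> 16; p[2] = x >> 8; p[3] = x; }

/* Returns the new length; dst must have room for src_len + 6 bytes. */
static size_t path_prepend(uint8_t *dst, const uint8_t *src, size_t src_len,
                           uint8_t seg_type, uint32_t asn)
{
  if (src_len >= 2 && src[0] == seg_type && src[1] < 255)
  {
    /* Grow the existing first segment by one ASN */
    dst[0] = seg_type;
    dst[1] = src[1] + 1;
    put_u32_be(dst + 2, asn);
    memcpy(dst + 6, src + 2, src_len - 2);
    return src_len + 4;
  }

  /* Otherwise create a new one-element segment in front */
  dst[0] = seg_type;
  dst[1] = 1;
  put_u32_be(dst + 2, asn);
  memcpy(dst + 6, src, src_len);
  return src_len + 6;
}

int main(void)
{
  /* AS_PATH "65001 65002" as a single AS_SEQUENCE segment */
  uint8_t path[64] = { AS_PATH_SEQUENCE, 2 };
  put_u32_be(path + 2, 65001);
  put_u32_be(path + 6, 65002);

  uint8_t out[64];
  size_t len = path_prepend(out, path, 10, AS_PATH_SEQUENCE, 64512);

  printf("segment count: %u, first ASN: %u, length: %zu\n",
         (unsigned) out[1], (unsigned) get_u32_be(out + 2), len);  /* 3, 64512, 14 */
  return 0;
}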
- */ - a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP)); - if (a && !p->cf->next_hop_self && - (p->cf->next_hop_keep || - (p->is_internal && ipa_nonzero(*((ip_addr *) a->u.ptr->data))) || - (p->neigh && (e->attrs->iface == p->neigh->iface)))) - { - /* Leave the original next hop attribute, will check later where does it point */ - } - else - { - /* Need to create new one */ - byte *b = bgp_attach_attr_wa(attrs, pool, BA_NEXT_HOP, NEXT_HOP_LENGTH); - set_next_hop(b, p->source_addr); - } + /* LOCAL_PREF attribute - required for IBGP, attach if missing */ + if (p->is_interior && ! bgp_find_attr(attrs0, BA_LOCAL_PREF)) + bgp_set_attr_u32(&attrs, pool, BA_LOCAL_PREF, 0, p->cf->default_local_pref); - if (rr) - { - /* Handling route reflection, RFC 4456 */ - struct bgp_proto *src = (struct bgp_proto *) e->attrs->src->proto; + /* IBGP route reflection, RFC 4456 */ + if (src && src->is_internal && p->is_internal && (src->local_as == p->local_as)) + { + /* ORIGINATOR_ID attribute - attach if not already set */ + if (! bgp_find_attr(attrs0, BA_ORIGINATOR_ID)) + bgp_set_attr_u32(&attrs, pool, BA_ORIGINATOR_ID, 0, src->remote_id); - a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID)); - if (!a) - bgp_attach_attr(attrs, pool, BA_ORIGINATOR_ID, src->remote_id); + /* CLUSTER_LIST attribute - prepend cluster ID */ + a = bgp_find_attr(attrs0, BA_CLUSTER_LIST); + ad = a ? a->u.ptr : NULL; - /* We attach proper cluster ID according to whether the route is entering or leaving the cluster */ - bgp_cluster_list_prepend(e, attrs, pool, src->rr_client ? src->rr_cluster_id : p->rr_cluster_id); + /* Prepend src cluster ID */ + if (src->rr_cluster_id) + ad = int_set_prepend(pool, ad, src->rr_cluster_id); - /* Two RR clients with different cluster ID, hmmm */ - if (src->rr_client && p->rr_client && (src->rr_cluster_id != p->rr_cluster_id)) - bgp_cluster_list_prepend(e, attrs, pool, p->rr_cluster_id); - } + /* Prepend dst cluster ID if src and dst clusters are different */ + if (p->rr_cluster_id && (src->rr_cluster_id != p->rr_cluster_id)) + ad = int_set_prepend(pool, ad, p->rr_cluster_id); - return 0; /* Leave decision to the filters */ -} + /* Should be at least one prepended cluster ID */ + bgp_set_attr_ptr(&attrs, pool, BA_CLUSTER_LIST, 0, ad); + } -static int -bgp_community_filter(struct bgp_proto *p, rte *e) -{ - eattr *a; - struct adata *d; + /* AS4_* transition attributes, RFC 6793 4.2.2 */ + if (! p->as4_session) + { + a = bgp_find_attr(attrs, BA_AS_PATH); + if (a && as_path_contains_as4(a->u.ptr)) + { + bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, as_path_to_old(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, pool, BA_AS4_PATH, 0, as_path_strip_confed(pool, a->u.ptr)); + } - /* Check if we aren't forbidden to export the route by communities */ - a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_COMMUNITY)); - if (a) + a = bgp_find_attr(attrs, BA_AGGREGATOR); + if (a && aggregator_contains_as4(a->u.ptr)) { - d = a->u.ptr; - if (int_set_contains(d, BGP_COMM_NO_ADVERTISE)) - { - DBG("\tNO_ADVERTISE\n"); - return 1; - } - if (!p->is_internal && - (int_set_contains(d, BGP_COMM_NO_EXPORT) || - int_set_contains(d, BGP_COMM_NO_EXPORT_SUBCONFED))) - { - DBG("\tNO_EXPORT\n"); - return 1; - } + bgp_set_attr_ptr(&attrs, pool, BA_AGGREGATOR, 0, aggregator_to_old(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, pool, BA_AS4_AGGREGATOR, 0, a->u.ptr); } + } - return 0; + /* + * Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above + * conditions. 
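The AS4_* block above handles peers that did not negotiate 4-octet AS support: the true path is carried in AS4_PATH, while the AS_PATH actually sent on the wire may only contain 2-octet values, with AS_TRANS standing in for anything larger (RFC 6793 4.2.2). A small sketch of that downgrade, assuming plain ASN arrays rather than BIRD's adata segments:

/* Replace every ASN above 65535 with AS_TRANS (23456) for an old speaker. */
#include <stdint.h>
#include <stdio.h>

#define AS_TRANS 23456u

static uint16_t asn_to_old(uint32_t asn)
{
  return (asn > 0xFFFF) ? AS_TRANS : (uint16_t) asn;
}

int main(void)
{
  uint32_t path[] = { 4200000001u, 65010u, 196615u };  /* mixed 4B and 2B ASNs */
  size_t n = sizeof(path) / sizeof(path[0]);

  printf("AS_PATH sent to old peer:");
  for (size_t i = 0; i < n; i++)
    printf(" %u", (unsigned) asn_to_old(path[i]));     /* 23456 65010 23456 */
  printf("\n");
  return 0;
}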
Presence and validity of quasi-mandatory NEXT_HOP attribute + * should be checked in AF-specific hooks. + */ + + /* Apply per-attribute export hooks for validatation and normalization */ + return bgp_export_attrs(&s, attrs); } -int -bgp_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool *pool) +void +bgp_rt_notify(struct proto *P, struct channel *C, net *n, rte *new, rte *old, ea_list *attrs) { - rte *e = *new; - struct bgp_proto *p = (struct bgp_proto *) P; - struct bgp_proto *new_bgp = (e->attrs->src->proto->proto == &proto_bgp) ? - (struct bgp_proto *) e->attrs->src->proto : NULL; + struct bgp_proto *p = (void *) P; + struct bgp_channel *c = (void *) C; + struct bgp_bucket *buck; + struct bgp_prefix *px; + u32 path; - if (p == new_bgp) /* Poison reverse updates */ - return -1; - if (new_bgp) - { - /* We should check here for cluster list loop, because the receiving BGP instance - might have different cluster ID */ - if (bgp_cluster_list_loopy(p, e->attrs)) - return -1; - - if (p->cf->interpret_communities && bgp_community_filter(p, e)) - return -1; - - if (p->local_as == new_bgp->local_as && p->is_internal && new_bgp->is_internal) - { - /* Redistribution of internal routes with IBGP */ - if (p->rr_client || new_bgp->rr_client) - /* Route reflection, RFC 4456 */ - return bgp_update_attrs(p, e, attrs, pool, 1); - else - return -1; - } - else - return bgp_update_attrs(p, e, attrs, pool, 0); - } + if (new) + { + attrs = bgp_update_attrs(p, c, new, attrs, bgp_linpool2); + + /* If attributes are invalid, we fail back to withdraw */ + buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c); + path = new->attrs->src->global_id; + + lp_flush(bgp_linpool2); + } else - return bgp_create_attrs(p, e, attrs, pool); + { + buck = bgp_get_withdraw_bucket(c); + path = old->attrs->src->global_id; + } + + px = bgp_get_prefix(c, n->n.addr, c->add_path_tx ? path : 0); + add_tail(&buck->prefixes, &px->buck_node); + + bgp_schedule_packet(p->conn, c, PKT_UPDATE); } + static inline u32 bgp_get_neighbor(rte *r) { eattr *e = ea_find(r->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); u32 as; - if (e && as_path_get_first(e->u.ptr, &as)) + if (e && as_path_get_first_regular(e->u.ptr, &as)) return as; - else - return ((struct bgp_proto *) r->attrs->src->proto)->remote_as; + + /* If AS_PATH is not defined, we treat rte as locally originated */ + struct bgp_proto *p = (void *) r->attrs->src->proto; + return p->cf->confederation ?: p->local_as; } static inline int rte_resolvable(rte *rt) { - int rd = rt->attrs->dest; - return (rd == RTD_ROUTER) || (rd == RTD_DEVICE) || (rd == RTD_MULTIPATH); + return rt->attrs->dest == RTD_UNICAST; } int @@ -1271,16 +1606,16 @@ bgp_rte_better(rte *new, rte *old) /* RFC 4271 9.1.2.2. a) Use AS path lengths */ if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths) - { - x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; - o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; - if (n < o) - return 1; - if (n > o) - return 0; - } + { + x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); + y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); + n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; + o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; + if (n < o) + return 1; + if (n > o) + return 0; + } /* RFC 4271 9.1.2.2. 
b) Use origins */ x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN)); @@ -1305,21 +1640,21 @@ bgp_rte_better(rte *new, rte *old) */ if (new_bgp->cf->med_metric || old_bgp->cf->med_metric || (bgp_get_neighbor(new) == bgp_get_neighbor(old))) - { - x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); - y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); - n = x ? x->u.data : new_bgp->cf->default_med; - o = y ? y->u.data : old_bgp->cf->default_med; - if (n < o) - return 1; - if (n > o) - return 0; - } + { + x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); + y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); + n = x ? x->u.data : new_bgp->cf->default_med; + o = y ? y->u.data : old_bgp->cf->default_med; + if (n < o) + return 1; + if (n > o) + return 0; + } /* RFC 4271 9.1.2.2. d) Prefer external peers */ - if (new_bgp->is_internal > old_bgp->is_internal) + if (new_bgp->is_interior > old_bgp->is_interior) return 0; - if (new_bgp->is_internal < old_bgp->is_internal) + if (new_bgp->is_interior < old_bgp->is_interior) return 1; /* RFC 4271 9.1.2.2. e) Compare IGP metrics */ @@ -1331,7 +1666,7 @@ bgp_rte_better(rte *new, rte *old) return 0; /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */ - /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighor ID */ + /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighbor ID */ x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID)); y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID)); n = x ? x->u.data : new_bgp->remote_id; @@ -1390,18 +1725,18 @@ bgp_rte_mergable(rte *pri, rte *sec) /* RFC 4271 9.1.2.2. a) Use AS path lengths */ if (pri_bgp->cf->compare_path_lengths || sec_bgp->cf->compare_path_lengths) - { - x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - p = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; - s = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; + { + x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); + y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); + p = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; + s = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; - if (p != s) - return 0; + if (p != s) + return 0; -// if (DELTA(p, s) > pri_bgp->cf->relax_multipath) -// return 0; - } +// if (DELTA(p, s) > pri_bgp->cf->relax_multipath) +// return 0; + } /* RFC 4271 9.1.2.2. b) Use origins */ x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN)); @@ -1414,17 +1749,17 @@ bgp_rte_mergable(rte *pri, rte *sec) /* RFC 4271 9.1.2.2. c) Compare MED's */ if (pri_bgp->cf->med_metric || sec_bgp->cf->med_metric || (bgp_get_neighbor(pri) == bgp_get_neighbor(sec))) - { - x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); - y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); - p = x ? x->u.data : pri_bgp->cf->default_med; - s = y ? y->u.data : sec_bgp->cf->default_med; - if (p != s) - return 0; - } + { + x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); + y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); + p = x ? x->u.data : pri_bgp->cf->default_med; + s = y ? y->u.data : sec_bgp->cf->default_med; + if (p != s) + return 0; + } /* RFC 4271 9.1.2.2. d) Prefer external peers */ - if (pri_bgp->is_internal != sec_bgp->is_internal) + if (pri_bgp->is_interior != sec_bgp->is_interior) return 0; /* RFC 4271 9.1.2.2. 
e) Compare IGP metrics */ @@ -1439,7 +1774,6 @@ bgp_rte_mergable(rte *pri, rte *sec) } - static inline int same_group(rte *r, u32 lpref, u32 lasn) { @@ -1484,7 +1818,7 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) * that this fn is not called for them. * * The idea is simple, the implementation is more problematic, - * mostly because of optimizations in rte_recalculate() that + * mostly because of optimizations in rte_recalculate() that * avoids full recalculation in most cases. * * We can assume that at least one of new, old is non-NULL and both @@ -1496,14 +1830,14 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) /* If new and old are from different groups, we just process that as two independent events */ if (new && old && !same_group(old, lpref, lasn)) - { - int i1, i2; - i1 = bgp_rte_recalculate(table, net, NULL, old, old_best); - i2 = bgp_rte_recalculate(table, net, new, NULL, old_best); - return i1 || i2; - } + { + int i1, i2; + i1 = bgp_rte_recalculate(table, net, NULL, old, old_best); + i2 = bgp_rte_recalculate(table, net, new, NULL, old_best); + return i1 || i2; + } - /* + /* * We could find the best-in-group and then make some shortcuts like * in rte_recalculate, but as we would have to walk through all * net->routes just to find it, it is probably not worth. So we @@ -1515,35 +1849,35 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) new->u.bgp.suppressed = 1; if (old) + { + old_is_group_best = !old->u.bgp.suppressed; + old->u.bgp.suppressed = 1; + int new_is_better = new && bgp_rte_better(new, old); + + /* The first case - replace not best with worse (or remove not best) */ + if (!old_is_group_best && !new_is_better) + return 0; + + /* The second case - replace the best with better */ + if (old_is_group_best && new_is_better) { - old_is_group_best = !old->u.bgp.suppressed; - old->u.bgp.suppressed = 1; - int new_is_better = new && bgp_rte_better(new, old); - - /* The first case - replace not best with worse (or remove not best) */ - if (!old_is_group_best && !new_is_better) - return 0; - - /* The second case - replace the best with better */ - if (old_is_group_best && new_is_better) - { - /* new is best-in-group, the see discussion below - this is - a special variant of NBG && OBG. From OBG we can deduce - that same_group(old_best) iff (old == old_best) */ - new->u.bgp.suppressed = 0; - return (old == old_best); - } + /* new is best-in-group, the see discussion below - this is + a special variant of NBG && OBG. 
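The grouping logic discussed here keys routes on (LOCAL_PREF, neighbor AS) and hides all but the best route of each group from the main best-path selection. A toy illustration of the idea, with "better within a group" reduced to "lower MED" and hypothetical structures standing in for BIRD's rte and bgp_rte_better():

#include <stdio.h>
#include <stdint.h>

struct toy_route {
  uint32_t lpref, neigh_as, med;
  int suppressed;
};

static int same_group(const struct toy_route *a, const struct toy_route *b)
{ return a->lpref == b->lpref && a->neigh_as == b->neigh_as; }

static void apply_deterministic_med(struct toy_route *r, size_t n)
{
  for (size_t i = 0; i < n; i++)
  {
    r[i].suppressed = 0;
    for (size_t j = 0; j < n; j++)
      if (j != i && same_group(&r[i], &r[j]) &&
          (r[j].med < r[i].med || (r[j].med == r[i].med && j < i)))
        r[i].suppressed = 1;   /* some other route in the group is better */
  }
}

int main(void)
{
  struct toy_route routes[] = {
    { 100, 65010, 50, 0 },     /* group A */
    { 100, 65010, 10, 0 },     /* group A, best (lowest MED) */
    { 100, 65020, 70, 0 },     /* group B, alone, therefore best */
  };
  size_t n = sizeof(routes) / sizeof(routes[0]);

  apply_deterministic_med(routes, n);
  for (size_t i = 0; i < n; i++)
    printf("route %zu: %s\n", i, routes[i].suppressed ? "suppressed" : "group best");
  return 0;
}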
From OBG we can deduce + that same_group(old_best) iff (old == old_best) */ + new->u.bgp.suppressed = 0; + return (old == old_best); } + } /* The default case - find a new best-in-group route */ r = new; /* new may not be in the list */ for (s=net->routes; rte_is_valid(s); s=s->next) if (use_deterministic_med(s) && same_group(s, lpref, lasn)) - { - s->u.bgp.suppressed = 1; - if (!r || bgp_rte_better(s, r)) - r = s; - } + { + s->u.bgp.suppressed = 1; + if (!r || bgp_rte_better(s, r)) + r = s; + } /* Simple case - the last route in group disappears */ if (!r) @@ -1582,397 +1916,77 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) return old_is_group_best; } -static struct adata * -bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool) -{ - struct adata *newa = lp_alloc(pool, sizeof(struct adata) + 8); - newa->length = 8; - aggregator_convert_to_new(old, newa->data); - return newa; -} - -/* Take last req_as ASNs from path old2 (in 2B format), convert to 4B format - * and append path old4 (in 4B format). +/* + * Reconstruct AS_PATH and AGGREGATOR according to RFC 6793 4.2.3 */ -static struct adata * -bgp_merge_as_paths(struct adata *old2, struct adata *old4, int req_as, struct linpool *pool) -{ - byte buf[old2->length * 2]; - - int ol = as_path_convert_to_new(old2, buf, req_as); - int nl = ol + (old4 ? old4->length : 0); - - struct adata *newa = lp_alloc(pool, sizeof(struct adata) + nl); - newa->length = nl; - memcpy(newa->data, buf, ol); - if (old4) memcpy(newa->data + ol, old4->data, old4->length); - - return newa; -} - -static int -as4_aggregator_valid(struct adata *aggr) -{ - return aggr->length == 8; -} - - -/* Reconstruct 4B AS_PATH and AGGREGATOR according to RFC 4893 4.2.3 */ -static void -bgp_reconstruct_4b_atts(struct bgp_proto *p, rta *a, struct linpool *pool) -{ - eattr *p2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - eattr *p4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_PATH)); - eattr *a2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AGGREGATOR)); - eattr *a4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR)); - int a4_removed = 0; - - if (a4 && !as4_aggregator_valid(a4->u.ptr)) - { - log(L_WARN "%s: AS4_AGGREGATOR attribute is invalid, skipping attribute", p->p.name); - a4 = NULL; - a4_removed = 1; - } - - if (a2) - { - u32 a2_as = get_u16(a2->u.ptr->data); - - if (a4) - { - if (a2_as != AS_TRANS) - { - /* Routes were aggregated by old router and therefore AS4_PATH - * and AS4_AGGREGATOR is invalid - * - * Convert AS_PATH and AGGREGATOR to 4B format and finish. - */ - - a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool); - p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool); - - return; - } - else - { - /* Common case, use AS4_AGGREGATOR attribute */ - a2->u.ptr = a4->u.ptr; - } - } - else - { - /* Common case, use old AGGREGATOR attribute */ - a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool); - - if ((a2_as == AS_TRANS) && !a4_removed) - log(L_WARN "%s: AGGREGATOR attribute contain AS_TRANS, but AS4_AGGREGATOR is missing", p->p.name); - } - } - else - if (a4) - log(L_WARN "%s: AS4_AGGREGATOR attribute received, but AGGREGATOR attribute is missing", p->p.name); - - int p2_len = as_path_getlen_int(p2->u.ptr, 2); - int p4_len = p4 ? 
validate_as4_path(p, p4->u.ptr) : -1; - - if (p4 && (p4_len < 0)) - log(L_WARN "%s: AS4_PATH attribute is malformed, skipping attribute", p->p.name); - - if ((p4_len <= 0) || (p2_len < p4_len)) - p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool); - else - p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, p4->u.ptr, p2_len - p4_len, pool); -} - static void -bgp_remove_as4_attrs(struct bgp_proto *p, rta *a) +bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool) { - unsigned id1 = EA_CODE(EAP_BGP, BA_AS4_PATH); - unsigned id2 = EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR); - ea_list **el = &(a->eattrs); + eattr *p2 = bgp_find_attr(*attrs, BA_AS_PATH); + eattr *p4 = bgp_find_attr(*attrs, BA_AS4_PATH); + eattr *a2 = bgp_find_attr(*attrs, BA_AGGREGATOR); + eattr *a4 = bgp_find_attr(*attrs, BA_AS4_AGGREGATOR); - /* We know that ea_lists constructed in bgp_decode attrs have one attribute per ea_list struct */ - while (*el != NULL) - { - unsigned fid = (*el)->attrs[0].id; - - if ((fid == id1) || (fid == id2)) - { - *el = (*el)->next; - if (p->as4_session) - log(L_WARN "%s: Unexpected AS4_* attributes received", p->p.name); - } - else - el = &((*el)->next); - } -} + /* First, unset AS4_* attributes */ + if (p4) bgp_unset_attr(attrs, pool, BA_AS4_PATH); + if (a4) bgp_unset_attr(attrs, pool, BA_AS4_AGGREGATOR); -/** - * bgp_decode_attrs - check and decode BGP attributes - * @conn: connection - * @attr: start of attribute block - * @len: length of attribute block - * @pool: linear pool to make all the allocations in - * @mandatory: 1 iff presence of mandatory attributes has to be checked - * - * This function takes a BGP attribute block (a part of an Update message), checks - * its consistency and converts it to a list of BIRD route attributes represented - * by a &rta. 
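The replacement function bgp_process_as4_attrs() introduced here reconstructs the 4-octet path as described in RFC 6793 4.2.3: keep the leading len(AS_PATH) - len(AS4_PATH) ASNs of the received 2-octet AS_PATH and append the whole AS4_PATH after them. A standalone sketch of the merge on plain ASN arrays, not BIRD's as_path_cut()/as_path_merge():

#include <stdio.h>
#include <stdint.h>

#define AS_TRANS 23456u

/* Writes the merged path into out (sized at least p2_len) and returns its length.
 * If AS_PATH is shorter than AS4_PATH, the AS4_PATH is ignored and the
 * 2-octet path is kept as-is. */
static size_t merge_as4_path(const uint32_t *p2, size_t p2_len,
                             const uint32_t *p4, size_t p4_len,
                             uint32_t *out)
{
  if (p2_len < p4_len)
  {
    for (size_t i = 0; i < p2_len; i++) out[i] = p2[i];
    return p2_len;
  }

  size_t keep = p2_len - p4_len;
  for (size_t i = 0; i < keep; i++) out[i] = p2[i];
  for (size_t i = 0; i < p4_len; i++) out[keep + i] = p4[i];
  return p2_len;
}

int main(void)
{
  /* Old speaker sent AS_PATH "65001 23456 23456" plus AS4_PATH "4200000001 4200000002" */
  uint32_t p2[] = { 65001, AS_TRANS, AS_TRANS };
  uint32_t p4[] = { 4200000001u, 4200000002u };
  uint32_t out[8];

  size_t len = merge_as4_path(p2, 3, p4, 2, out);
  for (size_t i = 0; i < len; i++)
    printf("%u ", (unsigned) out[i]);      /* 65001 4200000001 4200000002 */
  printf("\n");
  return 0;
}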
- */ -struct rta * -bgp_decode_attrs(struct bgp_conn *conn, byte *attr, uint len, struct linpool *pool, int mandatory) -{ - struct bgp_proto *bgp = conn->bgp; - rta *a = lp_alloc(pool, sizeof(struct rta)); - uint flags, code, l, i, type; - int errcode; - byte *z, *attr_start; - byte seen[256/8]; - ea_list *ea; - struct adata *ad; - int withdraw = 0; - - bzero(a, sizeof(rta)); - a->source = RTS_BGP; - a->scope = SCOPE_UNIVERSE; - a->cast = RTC_UNICAST; - /* a->dest = RTD_ROUTER; -- set in bgp_set_next_hop() */ - a->from = bgp->cf->remote_ip; - - /* Parse the attributes */ - bzero(seen, sizeof(seen)); - DBG("BGP: Parsing attributes\n"); - while (len) - { - if (len < 2) - goto malformed; - attr_start = attr; - flags = *attr++; - code = *attr++; - len -= 2; - if (flags & BAF_EXT_LEN) - { - if (len < 2) - goto malformed; - l = get_u16(attr); - attr += 2; - len -= 2; - } - else - { - if (len < 1) - goto malformed; - l = *attr++; - len--; - } - if (l > len) - goto malformed; - len -= l; - z = attr; - attr += l; - DBG("Attr %02x %02x %d\n", code, flags, l); - if (seen[code/8] & (1 << (code%8))) - goto malformed; - if (ATTR_KNOWN(code)) - { - struct attr_desc *desc = &bgp_attr_table[code]; - if (desc->expected_length >= 0 && desc->expected_length != (int) l) - { errcode = 5; goto err; } - if ((desc->expected_flags ^ flags) & (BAF_OPTIONAL | BAF_TRANSITIVE)) - { errcode = 4; goto err; } - if (!bgp->is_internal) - { - if (!desc->allow_in_ebgp) - continue; - if ((code == BA_LOCAL_PREF) && !bgp->cf->allow_local_pref) - continue; - } - if (desc->validate) - { - errcode = desc->validate(bgp, z, l); - if (errcode > 0) - goto err; - if (errcode == IGNORE) - continue; - if (errcode <= WITHDRAW) - { - log(L_WARN "%s: Attribute %s is malformed, withdrawing update", - bgp->p.name, desc->name); - withdraw = 1; - } - } - else if (code == BA_AS_PATH) - { - /* Special case as it might also trim the attribute */ - if (validate_as_path(bgp, z, &l) < 0) - { errcode = 11; goto err; } - } - type = desc->type; - } - else /* Unknown attribute */ - { - if (!(flags & BAF_OPTIONAL)) - { errcode = 2; goto err; } - type = EAF_TYPE_OPAQUE; - } - - // Only OPTIONAL and TRANSITIVE attributes may have non-zero PARTIAL flag - // if (!((flags & BAF_OPTIONAL) && (flags & BAF_TRANSITIVE)) && (flags & BAF_PARTIAL)) - // { errcode = 4; goto err; } - - seen[code/8] |= (1 << (code%8)); - ea = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = a->eattrs; - a->eattrs = ea; - ea->flags = 0; - ea->count = 1; - ea->attrs[0].id = EA_CODE(EAP_BGP, code); - ea->attrs[0].flags = flags; - ea->attrs[0].type = type; - if (type & EAF_EMBEDDED) - ad = NULL; - else - { - ad = lp_alloc(pool, sizeof(struct adata) + l); - ea->attrs[0].u.ptr = ad; - ad->length = l; - memcpy(ad->data, z, l); - } - switch (type) - { - case EAF_TYPE_ROUTER_ID: - case EAF_TYPE_INT: - if (l == 1) - ea->attrs[0].u.data = *z; - else - ea->attrs[0].u.data = get_u32(z); - break; - case EAF_TYPE_IP_ADDRESS: - ipa_ntoh(*(ip_addr *)ad->data); - break; - case EAF_TYPE_INT_SET: - case EAF_TYPE_LC_SET: - case EAF_TYPE_EC_SET: - { - u32 *z = (u32 *) ad->data; - for(i=0; i<ad->length/4; i++) - z[i] = ntohl(z[i]); - break; - } - } - } - - if (withdraw) - goto withdraw; - -#ifdef IPV6 - /* If we received MP_REACH_NLRI we should check mandatory attributes */ - if (bgp->mp_reach_len != 0) - mandatory = 1; -#endif - - /* If there is no (reachability) NLRI, we should exit now */ - if (! 
mandatory) - return a; - - /* Check if all mandatory attributes are present */ - for(i=0; i < ARRAY_SIZE(bgp_mandatory_attrs); i++) - { - code = bgp_mandatory_attrs[i]; - if (!(seen[code/8] & (1 << (code%8)))) - { - bgp_error(conn, 3, 3, &bgp_mandatory_attrs[i], 1); - return NULL; - } - } - - /* When receiving attributes from non-AS4-aware BGP speaker, - * we have to reconstruct 4B AS_PATH and AGGREGATOR attributes - */ - if (! bgp->as4_session) - bgp_reconstruct_4b_atts(bgp, a, pool); - - bgp_remove_as4_attrs(bgp, a); - - /* If the AS path attribute contains our AS, reject the routes */ - if (bgp_as_path_loopy(bgp, a)) - goto withdraw; - - /* Two checks for IBGP loops caused by route reflection, RFC 4456 */ - if (bgp_originator_id_loopy(bgp, a) || - bgp_cluster_list_loopy(bgp, a)) - goto withdraw; + /* Handle AGGREGATOR attribute */ + if (a2 && a4) + { + u32 a2_asn = get_u32(a2->u.ptr->data); - /* If there's no local preference, define one */ - if (!(seen[0] & (1 << BA_LOCAL_PREF))) - bgp_attach_attr(&a->eattrs, pool, BA_LOCAL_PREF, bgp->cf->default_local_pref); + /* If routes were aggregated by an old router, then AS4_PATH and + AS4_AGGREGATOR are invalid. In that case we give up. */ + if (a2_asn != AS_TRANS) + return; - return a; + /* Use AS4_AGGREGATOR instead of AGGREGATOR */ + a2->u.ptr = a4->u.ptr; + } -withdraw: - return NULL; + /* Handle AS_PATH attribute */ + if (p2 && p4) + { + /* Both as_path_getlen() and as_path_cut() take AS_CONFED* as zero length */ + int p2_len = as_path_getlen(p2->u.ptr); + int p4_len = as_path_getlen(p4->u.ptr); -malformed: - bgp_error(conn, 3, 1, NULL, 0); - return NULL; + /* AS_PATH is too short, give up */ + if (p2_len < p4_len) + return; -err: - bgp_error(conn, 3, errcode, attr_start, z+l-attr_start); - return NULL; + /* Merge AS_PATH and AS4_PATH */ + as_path_cut(p2->u.ptr, p2_len - p4_len); + p2->u.ptr = as_path_merge(pool, p2->u.ptr, p4->u.ptr); + } } int bgp_get_attr(eattr *a, byte *buf, int buflen) { uint i = EA_ID(a->id); - struct attr_desc *d; + const struct bgp_attr_desc *d; int len; - if (ATTR_KNOWN(i)) + if (bgp_attr_known(i)) + { + d = &bgp_attr_table[i]; + len = bsprintf(buf, "%s", d->name); + buf += len; + if (d->format) { - d = &bgp_attr_table[i]; - len = bsprintf(buf, "%s", d->name); - buf += len; - if (d->format) - { - *buf++ = ':'; - *buf++ = ' '; - d->format(a, buf, buflen - len - 2); - return GA_FULL; - } - return GA_NAME; + *buf++ = ':'; + *buf++ = ' '; + d->format(a, buf, buflen - len - 2); + return GA_FULL; } - bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? " [t]" : ""); - return GA_NAME; -} - -void -bgp_init_bucket_table(struct bgp_proto *p) -{ - p->hash_size = 256; - p->hash_limit = p->hash_size * 4; - p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *)); - init_list(&p->bucket_queue); - p->withdraw_bucket = NULL; - // fib_init(&p->prefix_fib, p->p.pool, sizeof(struct bgp_prefix), 0, bgp_init_prefix); -} - -void -bgp_free_bucket_table(struct bgp_proto *p) -{ - mb_free(p->bucket_hash); - p->bucket_hash = NULL; - - struct bgp_bucket *b; - WALK_LIST_FIRST(b, p->bucket_queue) - { - rem_node(&b->send_node); - mb_free(b); + return GA_NAME; } - mb_free(p->withdraw_bucket); - p->withdraw_bucket = NULL; + bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? 
" [t]" : ""); + return GA_NAME; } void @@ -1988,14 +2002,14 @@ bgp_get_route_info(rte *e, byte *buf, ea_list *attrs) buf += bsprintf(buf, "-"); if (e->attrs->hostentry) - { - if (!rte_resolvable(e)) - buf += bsprintf(buf, "/-"); - else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN) - buf += bsprintf(buf, "/?"); - else - buf += bsprintf(buf, "/%d", e->attrs->igp_metric); - } + { + if (!rte_resolvable(e)) + buf += bsprintf(buf, "/-"); + else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN) + buf += bsprintf(buf, "/?"); + else + buf += bsprintf(buf, "/%d", e->attrs->igp_metric); + } buf += bsprintf(buf, ") ["); if (p && as_path_get_last(p->u.ptr, &origas)) diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index f706e76e..f4791215 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -2,6 +2,8 @@ * BIRD -- The Border Gateway Protocol * * (c) 2000 Martin Mares <mj@ucw.cz> + * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2008--2016 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -9,48 +11,52 @@ /** * DOC: Border Gateway Protocol * - * The BGP protocol is implemented in three parts: |bgp.c| which takes care of the - * connection and most of the interface with BIRD core, |packets.c| handling + * The BGP protocol is implemented in three parts: |bgp.c| which takes care of + * the connection and most of the interface with BIRD core, |packets.c| handling * both incoming and outgoing BGP packets and |attrs.c| containing functions for * manipulation with BGP attribute lists. * - * As opposed to the other existing routing daemons, BIRD has a sophisticated core - * architecture which is able to keep all the information needed by BGP in the - * primary routing table, therefore no complex data structures like a central - * BGP table are needed. This increases memory footprint of a BGP router with - * many connections, but not too much and, which is more important, it makes - * BGP much easier to implement. + * As opposed to the other existing routing daemons, BIRD has a sophisticated + * core architecture which is able to keep all the information needed by BGP in + * the primary routing table, therefore no complex data structures like a + * central BGP table are needed. This increases memory footprint of a BGP router + * with many connections, but not too much and, which is more important, it + * makes BGP much easier to implement. * - * Each instance of BGP (corresponding to a single BGP peer) is described by a &bgp_proto - * structure to which are attached individual connections represented by &bgp_connection - * (usually, there exists only one connection, but during BGP session setup, there - * can be more of them). The connections are handled according to the BGP state machine - * defined in the RFC with all the timers and all the parameters configurable. + * Each instance of BGP (corresponding to a single BGP peer) is described by a + * &bgp_proto structure to which are attached individual connections represented + * by &bgp_connection (usually, there exists only one connection, but during BGP + * session setup, there can be more of them). The connections are handled + * according to the BGP state machine defined in the RFC with all the timers and + * all the parameters configurable. * - * In incoming direction, we listen on the connection's socket and each time we receive - * some input, we pass it to bgp_rx(). 
It decodes packet headers and the markers and - * passes complete packets to bgp_rx_packet() which distributes the packet according - * to its type. + * In incoming direction, we listen on the connection's socket and each time we + * receive some input, we pass it to bgp_rx(). It decodes packet headers and the + * markers and passes complete packets to bgp_rx_packet() which distributes the + * packet according to its type. * - * In outgoing direction, we gather all the routing updates and sort them to buckets - * (&bgp_bucket) according to their attributes (we keep a hash table for fast comparison - * of &rta's and a &fib which helps us to find if we already have another route for - * the same destination queued for sending, so that we can replace it with the new one - * immediately instead of sending both updates). There also exists a special bucket holding - * all the route withdrawals which cannot be queued anywhere else as they don't have any - * attributes. If we have any packet to send (due to either new routes or the connection - * tracking code wanting to send a Open, Keepalive or Notification message), we call - * bgp_schedule_packet() which sets the corresponding bit in a @packet_to_send - * bit field in &bgp_conn and as soon as the transmit socket buffer becomes empty, - * we call bgp_fire_tx(). It inspects state of all the packet type bits and calls - * the corresponding bgp_create_xx() functions, eventually rescheduling the same packet - * type if we have more data of the same type to send. + * In outgoing direction, we gather all the routing updates and sort them to + * buckets (&bgp_bucket) according to their attributes (we keep a hash table for + * fast comparison of &rta's and a &fib which helps us to find if we already + * have another route for the same destination queued for sending, so that we + * can replace it with the new one immediately instead of sending both + * updates). There also exists a special bucket holding all the route + * withdrawals which cannot be queued anywhere else as they don't have any + * attributes. If we have any packet to send (due to either new routes or the + * connection tracking code wanting to send a Open, Keepalive or Notification + * message), we call bgp_schedule_packet() which sets the corresponding bit in a + * @packet_to_send bit field in &bgp_conn and as soon as the transmit socket + * buffer becomes empty, we call bgp_fire_tx(). It inspects state of all the + * packet type bits and calls the corresponding bgp_create_xx() functions, + * eventually rescheduling the same packet type if we have more data of the same + * type to send. * - * The processing of attributes consists of two functions: bgp_decode_attrs() for checking - * of the attribute blocks and translating them to the language of BIRD's extended attributes - * and bgp_encode_attrs() which does the converse. Both functions are built around a - * @bgp_attr_table array describing all important characteristics of all known attributes. - * Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams. + * The processing of attributes consists of two functions: bgp_decode_attrs() + * for checking of the attribute blocks and translating them to the language of + * BIRD's extended attributes and bgp_encode_attrs() which does the + * converse. Both functions are built around a @bgp_attr_table array describing + * all important characteristics of all known attributes. 
Unknown transitive + * attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams. * * BGP protocol implements graceful restart in both restarting (local restart) * and receiving (neighbor restart) roles. The first is handled mostly by the @@ -61,10 +67,44 @@ * point of view and therefore maintaining received routes. Routing table * refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing * stale routes after reestablishment of BGP session during graceful restart. - */ + * + * Supported standards: + * <itemize> + * <item> <rfc id="4271"> - Border Gateway Protocol 4 (BGP) + * <item> <rfc id="1997"> - BGP Communities Attribute + * <item> <rfc id="2385"> - Protection of BGP Sessions via TCP MD5 Signature + * <item> <rfc id="2545"> - Use of BGP Multiprotocol Extensions for IPv6 + * <item> <rfc id="2918"> - Route Refresh Capability + * <item> <rfc id="3107"> - Carrying Label Information in BGP + * <item> <rfc id="4360"> - BGP Extended Communities Attribute + * <item> <rfc id="4364"> - BGP/MPLS IPv4 Virtual Private Networks + * <item> <rfc id="4456"> - BGP Route Reflection + * <item> <rfc id="4486"> - Subcodes for BGP Cease Notification Message + * <item> <rfc id="4659"> - BGP/MPLS IPv6 Virtual Private Networks + * <item> <rfc id="4724"> - Graceful Restart Mechanism for BGP + * <item> <rfc id="4760"> - Multiprotocol extensions for BGP + * <item> <rfc id="4798"> - Connecting IPv6 Islands over IPv4 MPLS + * <item> <rfc id="5065"> - AS confederations for BGP + * <item> <rfc id="5082"> - Generalized TTL Security Mechanism + * <item> <rfc id="5492"> - Capabilities Advertisement with BGP + * <item> <rfc id="5549"> - Advertising IPv4 NLRI with an IPv6 Next Hop + * <item> <rfc id="5575"> - Dissemination of Flow Specification Rules + * <item> <rfc id="5668"> - 4-Octet AS Specific BGP Extended Community + * <item> <rfc id="6286"> - AS-Wide Unique BGP Identifier + * <item> <rfc id="6608"> - Subcodes for BGP Finite State Machine Error + * <item> <rfc id="6793"> - BGP Support for 4-Octet AS Numbers + * <item> <rfc id="7313"> - Enhanced Route Refresh Capability for BGP + * <item> <rfc id="7606"> - Revised Error Handling for BGP UPDATE Messages + * <item> <rfc id="7911"> - Advertisement of Multiple Paths in BGP + * <item> <rfc id="7947"> - Internet Exchange BGP Route Server + * <item> <rfc id="8092"> - BGP Large Communities Attribute + * </itemize> +*/ #undef LOCAL_DEBUG +#include <stdlib.h> + #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" @@ -80,70 +120,150 @@ struct linpool *bgp_linpool; /* Global temporary pool */ -static sock *bgp_listen_sk; /* Global listening socket */ -static int bgp_counter; /* Number of protocol instances using the listening socket */ +struct linpool *bgp_linpool2; /* Global temporary pool for bgp_rt_notify() */ +static list bgp_sockets; /* Global list of listening sockets */ + -static void bgp_close(struct bgp_proto *p, int apply_md5); static void bgp_connect(struct bgp_proto *p); static void bgp_active(struct bgp_proto *p); -static sock *bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags); static void bgp_update_bfd(struct bgp_proto *p, int use_bfd); +static int bgp_incoming_connection(sock *sk, uint dummy UNUSED); +static void bgp_listen_sock_err(sock *sk UNUSED, int err); /** * bgp_open - open a BGP instance * @p: BGP instance * - * This function allocates and configures shared BGP resources. - * Should be called as the last step during initialization - * (when lock is acquired and neighbor is ready). 
- * When error, state changed to PS_DOWN, -1 is returned and caller - * should return immediately. + * This function allocates and configures shared BGP resources, mainly listening + * sockets. Should be called as the last step during initialization (when lock + * is acquired and neighbor is ready). When error, caller should change state to + * PS_DOWN and return immediately. */ static int bgp_open(struct bgp_proto *p) { - struct config *cfg = p->cf->c.global; - int errcode; + struct bgp_socket *bs = NULL; + struct iface *ifa = p->cf->strict_bind ? p->cf->iface : NULL; + ip_addr addr = p->cf->strict_bind ? p->cf->local_ip : + (ipa_is_ip4(p->cf->remote_ip) ? IPA_NONE4 : IPA_NONE6); + uint port = p->cf->local_port; - if (!bgp_listen_sk) - bgp_listen_sk = bgp_setup_listen_sk(cfg->listen_bgp_addr, cfg->listen_bgp_port, cfg->listen_bgp_flags); + /* FIXME: Add some global init? */ + if (!bgp_linpool) + init_list(&bgp_sockets); + + /* We assume that cf->iface is defined iff cf->local_ip is link-local */ - if (!bgp_listen_sk) + WALK_LIST(bs, bgp_sockets) + if (ipa_equal(bs->sk->saddr, addr) && (bs->sk->iface == ifa) && (bs->sk->sport == port)) { - errcode = BEM_NO_SOCKET; - goto err; + bs->uc++; + p->sock = bs; + return 0; } - if (!bgp_linpool) - bgp_linpool = lp_new(&root_pool, 4080); + sock *sk = sk_new(proto_pool); + sk->type = SK_TCP_PASSIVE; + sk->ttl = 255; + sk->saddr = addr; + sk->sport = port; + sk->flags = 0; + sk->tos = IP_PREC_INTERNET_CONTROL; + sk->rbsize = BGP_RX_BUFFER_SIZE; + sk->tbsize = BGP_TX_BUFFER_SIZE; + sk->rx_hook = bgp_incoming_connection; + sk->err_hook = bgp_listen_sock_err; + + if (sk_open(sk) < 0) + goto err; - bgp_counter++; + bs = mb_allocz(proto_pool, sizeof(struct bgp_socket)); + bs->sk = sk; + bs->uc = 1; + p->sock = bs; - if (p->cf->password) - if (sk_set_md5_auth(bgp_listen_sk, p->cf->source_addr, p->cf->remote_ip, - p->cf->iface, p->cf->password, p->cf->setkey) < 0) - { - sk_log_error(bgp_listen_sk, p->p.name); - bgp_close(p, 0); - errcode = BEM_INVALID_MD5; - goto err; - } + add_tail(&bgp_sockets, &bs->n); + + if (!bgp_linpool) + { + bgp_linpool = lp_new_default(proto_pool); + bgp_linpool2 = lp_new_default(proto_pool); + } return 0; err: - p->p.disabled = 1; - bgp_store_error(p, NULL, BE_MISC, errcode); - proto_notify_state(&p->p, PS_DOWN); + sk_log_error(sk, p->p.name); + log(L_ERR "%s: Cannot open listening socket", p->p.name); + rfree(sk); return -1; } +/** + * bgp_close - close a BGP instance + * @p: BGP instance + * + * This function frees and deconfigures shared BGP resources. + */ +static void +bgp_close(struct bgp_proto *p) +{ + struct bgp_socket *bs = p->sock; + + ASSERT(bs && bs->uc); + + if (--bs->uc) + return; + + rfree(bs->sk); + rem_node(&bs->n); + mb_free(bs); + + if (!EMPTY_LIST(bgp_sockets)) + return; + + rfree(bgp_linpool); + bgp_linpool = NULL; + + rfree(bgp_linpool2); + bgp_linpool2 = NULL; +} + +static inline int +bgp_setup_auth(struct bgp_proto *p, int enable) +{ + if (p->cf->password) + { + int rv = sk_set_md5_auth(p->sock->sk, + p->cf->local_ip, p->cf->remote_ip, p->cf->iface, + enable ? p->cf->password : NULL, p->cf->setkey); + + if (rv < 0) + sk_log_error(p->sock->sk, p->p.name); + + return rv; + } + else + return 0; +} + +static inline struct bgp_channel * +bgp_find_channel(struct bgp_proto *p, u32 afi) +{ + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + if (c->afi == afi) + return c; + + return NULL; +} + static void bgp_startup(struct bgp_proto *p) { BGP_TRACE(D_EVENTS, "Started"); - p->start_state = p->cf->capabilities ? 
BSS_CONNECT : BSS_CONNECT_NOCAP; + p->start_state = BSS_CONNECT; if (!p->cf->passive) bgp_active(p); @@ -159,70 +279,57 @@ bgp_startup_timeout(timer *t) static void bgp_initiate(struct bgp_proto *p) { - int rv = bgp_open(p); - if (rv < 0) - return; + int err_val; + + if (bgp_open(p) < 0) + { err_val = BEM_NO_SOCKET; goto err1; } + + if (bgp_setup_auth(p, 1) < 0) + { err_val = BEM_INVALID_MD5; goto err2; } if (p->cf->bfd) bgp_update_bfd(p, p->cf->bfd); if (p->startup_delay) - { - p->start_state = BSS_DELAY; - BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay); - bgp_start_timer(p->startup_timer, p->startup_delay); - } + { + p->start_state = BSS_DELAY; + BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay); + bgp_start_timer(p->startup_timer, p->startup_delay); + } else bgp_startup(p); -} -/** - * bgp_close - close a BGP instance - * @p: BGP instance - * @apply_md5: 0 to disable unsetting MD5 auth - * - * This function frees and deconfigures shared BGP resources. - * @apply_md5 is set to 0 when bgp_close is called as a cleanup - * from failed bgp_open(). - */ -static void -bgp_close(struct bgp_proto *p, int apply_md5) -{ - ASSERT(bgp_counter); - bgp_counter--; + return; - if (p->cf->password && apply_md5) - if (sk_set_md5_auth(bgp_listen_sk, p->cf->source_addr, p->cf->remote_ip, - p->cf->iface, NULL, p->cf->setkey) < 0) - sk_log_error(bgp_listen_sk, p->p.name); +err2: + bgp_close(p); +err1: + p->p.disabled = 1; + bgp_store_error(p, NULL, BE_MISC, err_val); + proto_notify_state(&p->p, PS_DOWN); - if (!bgp_counter) - { - rfree(bgp_listen_sk); - bgp_listen_sk = NULL; - rfree(bgp_linpool); - bgp_linpool = NULL; - } + return; } /** * bgp_start_timer - start a BGP timer * @t: timer - * @value: time to fire (0 to disable the timer) + * @value: time (in seconds) to fire (0 to disable the timer) * - * This functions calls tm_start() on @t with time @value and the - * amount of randomization suggested by the BGP standard. Please use - * it for all BGP timers. + * This functions calls tm_start() on @t with time @value and the amount of + * randomization suggested by the BGP standard. Please use it for all BGP + * timers. */ void -bgp_start_timer(timer *t, int value) +bgp_start_timer(timer *t, uint value) { if (value) - { - /* The randomization procedure is specified in RFC 1771: 9.2.3.3 */ - t->randomize = value / 4; - tm_start(t, value - t->randomize); - } + { + /* The randomization procedure is specified in RFC 4271 section 10 */ + btime time = value S; + btime randomize = random() % ((time / 4) + 1); + tm_start(t, time - randomize); + } else tm_stop(t); } @@ -231,8 +338,8 @@ bgp_start_timer(timer *t, int value) * bgp_close_conn - close a BGP connection * @conn: connection to close * - * This function takes a connection described by the &bgp_conn structure, - * closes its socket and frees all resources associated with it. + * This function takes a connection described by the &bgp_conn structure, closes + * its socket and frees all resources associated with it. 
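bgp_start_timer() above now derives its jitter directly from RFC 4271 section 10: the configured value is shortened by a random amount of up to one quarter, so peers do not fire their timers in lockstep. The same computation in a standalone form, using plain seconds and rand() instead of BIRD's btime and random():

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

static unsigned jittered(unsigned value)
{
  if (!value)
    return 0;                              /* 0 means "timer disabled" */

  unsigned randomize = (unsigned) rand() % (value / 4 + 1);
  return value - randomize;                /* result lies in [3/4 * value, value] */
}

int main(void)
{
  srand((unsigned) time(NULL));
  for (int i = 0; i < 5; i++)
    printf("connect retry fires in %u s (configured 120 s)\n", jittered(120));
  return 0;
}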
*/ void bgp_close_conn(struct bgp_conn *conn) @@ -241,16 +348,22 @@ bgp_close_conn(struct bgp_conn *conn) DBG("BGP: Closing connection\n"); conn->packets_to_send = 0; - rfree(conn->connect_retry_timer); - conn->connect_retry_timer = NULL; + conn->channels_to_send = 0; + rfree(conn->connect_timer); + conn->connect_timer = NULL; rfree(conn->keepalive_timer); conn->keepalive_timer = NULL; rfree(conn->hold_timer); conn->hold_timer = NULL; - rfree(conn->sk); - conn->sk = NULL; rfree(conn->tx_ev); conn->tx_ev = NULL; + rfree(conn->sk); + conn->sk = NULL; + + mb_free(conn->local_caps); + conn->local_caps = NULL; + mb_free(conn->remote_caps); + conn->remote_caps = NULL; } @@ -258,9 +371,9 @@ bgp_close_conn(struct bgp_conn *conn) * bgp_update_startup_delay - update a startup delay * @p: BGP instance * - * This function updates a startup delay that is used to postpone next BGP connect. - * It also handles disable_after_error and might stop BGP instance when error - * happened and disable_after_error is on. + * This function updates a startup delay that is used to postpone next BGP + * connect. It also handles disable_after_error and might stop BGP instance + * when error happened and disable_after_error is on. * * It should be called when BGP protocol error happened. */ @@ -271,17 +384,17 @@ bgp_update_startup_delay(struct bgp_proto *p) DBG("BGP: Updating startup delay\n"); - if (p->last_proto_error && ((now - p->last_proto_error) >= (int) cf->error_amnesia_time)) + if (p->last_proto_error && ((current_time() - p->last_proto_error) >= cf->error_amnesia_time S)) p->startup_delay = 0; - p->last_proto_error = now; + p->last_proto_error = current_time(); if (cf->disable_after_error) - { - p->startup_delay = 0; - p->p.disabled = 1; - return; - } + { + p->startup_delay = 0; + p->p.disabled = 1; + return; + } if (!p->startup_delay) p->startup_delay = cf->error_delay_time_min; @@ -290,32 +403,38 @@ bgp_update_startup_delay(struct bgp_proto *p) } static void -bgp_graceful_close_conn(struct bgp_conn *conn, unsigned subcode) +bgp_graceful_close_conn(struct bgp_conn *conn, uint subcode) { switch (conn->state) - { - case BS_IDLE: - case BS_CLOSE: - return; - case BS_CONNECT: - case BS_ACTIVE: - bgp_conn_enter_idle_state(conn); - return; - case BS_OPENSENT: - case BS_OPENCONFIRM: - case BS_ESTABLISHED: - bgp_error(conn, 6, subcode, NULL, 0); - return; - default: - bug("bgp_graceful_close_conn: Unknown state %d", conn->state); - } + { + case BS_IDLE: + case BS_CLOSE: + return; + + case BS_CONNECT: + case BS_ACTIVE: + bgp_conn_enter_idle_state(conn); + return; + + case BS_OPENSENT: + case BS_OPENCONFIRM: + case BS_ESTABLISHED: + bgp_error(conn, 6, subcode, NULL, 0); + return; + + default: + bug("bgp_graceful_close_conn: Unknown state %d", conn->state); + } } static void bgp_down(struct bgp_proto *p) { if (p->start_state > BSS_PREPARE) - bgp_close(p, 1); + { + bgp_setup_auth(p, 0); + bgp_close(p); + } BGP_TRACE(D_EVENTS, "Down"); proto_notify_state(&p->p, PS_DOWN); @@ -327,20 +446,20 @@ bgp_decision(void *vp) struct bgp_proto *p = vp; DBG("BGP: Decision start\n"); - if ((p->p.proto_state == PS_START) - && (p->outgoing_conn.state == BS_IDLE) - && (p->incoming_conn.state != BS_OPENCONFIRM) - && (!p->cf->passive)) + if ((p->p.proto_state == PS_START) && + (p->outgoing_conn.state == BS_IDLE) && + (p->incoming_conn.state != BS_OPENCONFIRM) && + !p->cf->passive) bgp_active(p); - if ((p->p.proto_state == PS_STOP) - && (p->outgoing_conn.state == BS_IDLE) - && (p->incoming_conn.state == BS_IDLE)) + if ((p->p.proto_state == 
PS_STOP) && + (p->outgoing_conn.state == BS_IDLE) && + (p->incoming_conn.state == BS_IDLE)) bgp_down(p); } void -bgp_stop(struct bgp_proto *p, unsigned subcode) +bgp_stop(struct bgp_proto *p, uint subcode) { proto_notify_state(&p->p, PS_STOP); bgp_graceful_close_conn(&p->outgoing_conn, subcode); @@ -349,7 +468,7 @@ bgp_stop(struct bgp_proto *p, unsigned subcode) } static inline void -bgp_conn_set_state(struct bgp_conn *conn, unsigned new_state) +bgp_conn_set_state(struct bgp_conn *conn, uint new_state) { if (conn->bgp->p.mrtdump & MD_STATES) mrt_dump_bgp_state_change(conn, conn->state, new_state); @@ -364,13 +483,17 @@ bgp_conn_enter_openconfirm_state(struct bgp_conn *conn) bgp_conn_set_state(conn, BS_OPENCONFIRM); } +static const struct bgp_af_caps dummy_af_caps = { }; + void bgp_conn_enter_established_state(struct bgp_conn *conn) { struct bgp_proto *p = conn->bgp; + struct bgp_caps *local = conn->local_caps; + struct bgp_caps *peer = conn->remote_caps; + struct bgp_channel *c; BGP_TRACE(D_EVENTS, "BGP session established"); - DBG("BGP: UP!!!\n"); /* For multi-hop BGP sessions */ if (ipa_zero(p->source_addr)) @@ -381,30 +504,92 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) p->conn = conn; p->last_error_class = 0; p->last_error_code = 0; - p->feed_state = BFS_NONE; - p->load_state = BFS_NONE; - bgp_init_bucket_table(p); - bgp_init_prefix_table(p, 8); - int peer_gr_ready = conn->peer_gr_aware && !(conn->peer_gr_flags & BGP_GRF_RESTART); + p->as4_session = conn->as4_session; - if (p->p.gr_recovery && !peer_gr_ready) - proto_graceful_restart_unlock(&p->p); + p->route_refresh = peer->route_refresh; + p->enhanced_refresh = local->enhanced_refresh && peer->enhanced_refresh; - if (p->p.gr_recovery && (p->cf->gr_mode == BGP_GR_ABLE) && peer_gr_ready) - p->p.gr_wait = 1; + /* Whether we may handle possible GR of peer (it has some AF GR-able) */ + p->gr_ready = 0; /* Updated later */ - if (p->gr_active) + /* Whether peer is ready to handle our GR recovery */ + int peer_gr_ready = peer->gr_aware && !(peer->gr_flags & BGP_GRF_RESTART); + + if (p->gr_active_num) tm_stop(p->gr_timer); - if (p->gr_active && (!conn->peer_gr_able || !(conn->peer_gr_aflags & BGP_GRF_FORWARDING))) - bgp_graceful_restart_done(p); + /* Number of active channels */ + int num = 0; + + WALK_LIST(c, p->p.channels) + { + const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi); + const struct bgp_af_caps *rem = bgp_find_af_caps(peer, c->afi); + + /* Ignore AFIs that were not announced in multiprotocol capability */ + if (!loc || !loc->ready) + loc = &dummy_af_caps; + + if (!rem || !rem->ready) + rem = &dummy_af_caps; + + int active = loc->ready && rem->ready; + c->c.disabled = !active; + c->c.reloadable = p->route_refresh; + + c->index = active ? num++ : 0; - /* GR capability implies that neighbor will send End-of-RIB */ - if (conn->peer_gr_aware) - p->load_state = BFS_LOADING; + c->feed_state = BFS_NONE; + c->load_state = BFS_NONE; - /* proto_notify_state() will likely call bgp_feed_begin(), setting p->feed_state */ + /* Channels where peer may do GR */ + c->gr_ready = active && local->gr_aware && rem->gr_able; + p->gr_ready = p->gr_ready || c->gr_ready; + + /* Channels not able to recover gracefully */ + if (p->p.gr_recovery && (!active || !peer_gr_ready)) + channel_graceful_restart_unlock(&c->c); + + /* Channels waiting for local convergence */ + if (p->p.gr_recovery && loc->gr_able && peer_gr_ready) + c->c.gr_wait = 1; + + /* Channels where peer is not able to recover gracefully */ + if (c->gr_active && ! 
(c->gr_ready && (rem->gr_af_flags & BGP_GRF_FORWARDING))) + bgp_graceful_restart_done(c); + + /* GR capability implies that neighbor will send End-of-RIB */ + if (peer->gr_aware) + c->load_state = BFS_LOADING; + + c->ext_next_hop = c->cf->ext_next_hop && (bgp_channel_is_ipv6(c) || rem->ext_next_hop); + c->add_path_rx = (loc->add_path & BGP_ADD_PATH_RX) && (rem->add_path & BGP_ADD_PATH_TX); + c->add_path_tx = (loc->add_path & BGP_ADD_PATH_TX) && (rem->add_path & BGP_ADD_PATH_RX); + + /* Update RA mode */ + if (c->add_path_tx) + c->c.ra_mode = RA_ANY; + else if (c->cf->secondary) + c->c.ra_mode = RA_ACCEPTED; + else + c->c.ra_mode = RA_OPTIMAL; + } + + p->afi_map = mb_alloc(p->p.pool, num * sizeof(u32)); + p->channel_map = mb_alloc(p->p.pool, num * sizeof(void *)); + p->channel_count = num; + + WALK_LIST(c, p->p.channels) + { + if (c->c.disabled) + continue; + + p->afi_map[c->index] = c->afi; + p->channel_map[c->index] = c; + } + + /* proto_notify_state() will likely call bgp_feed_begin(), setting c->feed_state */ bgp_conn_set_state(conn, BS_ESTABLISHED); proto_notify_state(&p->p, PS_UP); @@ -416,8 +601,9 @@ bgp_conn_leave_established_state(struct bgp_proto *p) BGP_TRACE(D_EVENTS, "BGP session closed"); p->conn = NULL; - bgp_free_prefix_table(p); - bgp_free_bucket_table(p); + // XXXX free these tables to avoid memory leak during graceful restart + // bgp_free_prefix_table(p); + // bgp_free_bucket_table(p); if (p->p.proto_state == PS_UP) bgp_stop(p, 0); @@ -471,34 +657,57 @@ bgp_handle_graceful_restart(struct bgp_proto *p) ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready); BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s", - p->gr_active ? " - already pending" : ""); - proto_notify_state(&p->p, PS_START); + p->gr_active_num ? " - already pending" : ""); - if (p->gr_active) - rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); + p->gr_active_num = 0; + + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + { + if (c->gr_ready) + { + if (c->gr_active) + rt_refresh_end(c->c.table, &c->c); + + c->gr_active = 1; + p->gr_active_num++; + rt_refresh_begin(c->c.table, &c->c); + } + else + { + /* Just flush the routes */ + rt_refresh_begin(c->c.table, &c->c); + rt_refresh_end(c->c.table, &c->c); + } + } - p->gr_active = 1; - bgp_start_timer(p->gr_timer, p->conn->peer_gr_time); - rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook); + proto_notify_state(&p->p, PS_START); + bgp_start_timer(p->gr_timer, p->conn->local_caps->gr_time); } /** * bgp_graceful_restart_done - finish active BGP graceful restart - * @p: BGP instance + * @c: BGP channel * * This function is called when the active BGP graceful restart of the neighbor - * should be finished - either successfully (the neighbor sends all paths and - * reports end-of-RIB on the new session) or unsuccessfully (the neighbor does - * not support BGP graceful restart on the new session). The function ends - * routing table refresh cycle and stops BGP restart timer. + * should be finished for channel @c - either successfully (the neighbor sends + * all paths and reports end-of-RIB for given AFI/SAFI on the new session) or + * unsuccessfully (the neighbor does not support BGP graceful restart on the new + * session). The function ends the routing table refresh cycle. 
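Graceful restart is now tracked per channel: gr_active_num counts the channels still waiting for End-of-RIB, and the restart is reported as finished only when that counter returns to zero. A toy version of that bookkeeping, with hypothetical structures in place of bgp_proto and bgp_channel:

#include <stdio.h>

struct toy_channel { const char *name; int gr_ready; int gr_active; };
struct toy_peer    { struct toy_channel *ch; int nch; int gr_active_num; };

static void gr_start(struct toy_peer *p)
{
  p->gr_active_num = 0;
  for (int i = 0; i < p->nch; i++)
    if (p->ch[i].gr_ready)
    {
      p->ch[i].gr_active = 1;
      p->gr_active_num++;          /* this channel's routes are kept as stale */
    }
}

static void gr_done(struct toy_peer *p, struct toy_channel *c)
{
  if (!c->gr_active)
    return;
  c->gr_active = 0;
  if (--p->gr_active_num == 0)
    printf("graceful restart done for all channels\n");
}

int main(void)
{
  struct toy_channel ch[] = { { "ipv4", 1, 0 }, { "ipv6", 1, 0 }, { "vpn4", 0, 0 } };
  struct toy_peer p = { ch, 3, 0 };

  gr_start(&p);
  printf("channels pending: %d\n", p.gr_active_num);   /* 2 */
  gr_done(&p, &ch[0]);                                  /* ipv4 End-of-RIB */
  gr_done(&p, &ch[1]);                                  /* ipv6 End-of-RIB, restart done */
  return 0;
}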
*/ void -bgp_graceful_restart_done(struct bgp_proto *p) +bgp_graceful_restart_done(struct bgp_channel *c) { - BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); - p->gr_active = 0; - tm_stop(p->gr_timer); - rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); + struct bgp_proto *p = (void *) c->c.proto; + + ASSERT(c->gr_active); + c->gr_active = 0; + p->gr_active_num--; + + if (!p->gr_active_num) + BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); + + rt_refresh_end(c->c.table, &c->c); } /** @@ -522,7 +731,7 @@ bgp_graceful_restart_timeout(timer *t) /** * bgp_refresh_begin - start incoming enhanced route refresh sequence - * @p: BGP instance + * @c: BGP channel * * This function is called when an incoming enhanced route refresh sequence is * started by the neighbor, demarcated by the BoRR packet. The function updates @@ -531,18 +740,20 @@ bgp_graceful_restart_timeout(timer *t) * ensure that these two sequences do not overlap. */ void -bgp_refresh_begin(struct bgp_proto *p) +bgp_refresh_begin(struct bgp_channel *c) { - if (p->load_state == BFS_LOADING) - { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } + struct bgp_proto *p = (void *) c->c.proto; + + if (c->load_state == BFS_LOADING) + { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } - p->load_state = BFS_REFRESHING; - rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook); + c->load_state = BFS_REFRESHING; + rt_refresh_begin(c->c.table, &c->c); } /** * bgp_refresh_end - finish incoming enhanced route refresh sequence - * @p: BGP instance + * @c: BGP channel * * This function is called when an incoming enhanced route refresh sequence is * finished by the neighbor, demarcated by the EoRR packet. The function updates @@ -550,39 +761,26 @@ bgp_refresh_begin(struct bgp_proto *p) * during the sequence are removed by the nest. */ void -bgp_refresh_end(struct bgp_proto *p) +bgp_refresh_end(struct bgp_channel *c) { - if (p->load_state != BFS_REFRESHING) - { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } + struct bgp_proto *p = (void *) c->c.proto; + + if (c->load_state != BFS_REFRESHING) + { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } - p->load_state = BFS_NONE; - rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); + c->load_state = BFS_NONE; + rt_refresh_end(c->c.table, &c->c); } static void bgp_send_open(struct bgp_conn *conn) { - conn->start_state = conn->bgp->start_state; - - // Default values, possibly changed by receiving capabilities. 
- conn->advertised_as = 0; - conn->peer_refresh_support = 0; - conn->peer_as4_support = 0; - conn->peer_add_path = 0; - conn->peer_enhanced_refresh_support = 0; - conn->peer_gr_aware = 0; - conn->peer_gr_able = 0; - conn->peer_gr_time = 0; - conn->peer_gr_flags = 0; - conn->peer_gr_aflags = 0; - conn->peer_ext_messages_support = 0; - DBG("BGP: Sending open\n"); conn->sk->rx_hook = bgp_rx; conn->sk->tx_hook = bgp_tx; - tm_stop(conn->connect_retry_timer); - bgp_schedule_packet(conn, PKT_OPEN); + tm_stop(conn->connect_timer); + bgp_schedule_packet(conn, NULL, PKT_OPEN); bgp_conn_set_state(conn, BS_OPENSENT); bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time); } @@ -605,10 +803,10 @@ bgp_connect_timeout(timer *t) DBG("BGP: connect_timeout\n"); if (p->p.proto_state == PS_START) - { - bgp_close_conn(conn); - bgp_connect(p); - } + { + bgp_close_conn(conn); + bgp_connect(p); + } else bgp_conn_enter_idle_state(conn); } @@ -672,7 +870,7 @@ bgp_keepalive_timeout(timer *t) struct bgp_conn *conn = t->data; DBG("BGP: Keepalive timer\n"); - bgp_schedule_packet(conn, PKT_KEEPALIVE); + bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE); /* Kick TX a bit faster */ if (ev_active(conn->tx_ev)) @@ -682,21 +880,18 @@ bgp_keepalive_timeout(timer *t) static void bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn) { - timer *t; - conn->sk = NULL; conn->bgp = p; + conn->packets_to_send = 0; + conn->channels_to_send = 0; + conn->last_channel = 0; + conn->last_channel_count = 0; + + conn->connect_timer = tm_new_init(p->p.pool, bgp_connect_timeout, conn, 0, 0); + conn->hold_timer = tm_new_init(p->p.pool, bgp_hold_timeout, conn, 0, 0); + conn->keepalive_timer = tm_new_init(p->p.pool, bgp_keepalive_timeout, conn, 0, 0); - t = conn->connect_retry_timer = tm_new(p->p.pool); - t->hook = bgp_connect_timeout; - t->data = conn; - t = conn->hold_timer = tm_new(p->p.pool); - t->hook = bgp_hold_timeout; - t->data = conn; - t = conn->keepalive_timer = tm_new(p->p.pool); - t->hook = bgp_keepalive_timeout; - t->data = conn; conn->tx_ev = ev_new(p->p.pool); conn->tx_ev->hook = bgp_kick_tx; conn->tx_ev->data = conn; @@ -720,7 +915,7 @@ bgp_active(struct bgp_proto *p) BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay); bgp_setup_conn(p, conn); bgp_conn_set_state(conn, BS_ACTIVE); - bgp_start_timer(conn->connect_retry_timer, delay); + bgp_start_timer(conn->connect_timer, delay); } /** @@ -734,12 +929,11 @@ bgp_active(struct bgp_proto *p) static void bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing connection */ { - sock *s; struct bgp_conn *conn = &p->outgoing_conn; int hops = p->cf->multihop ? 
: 1; DBG("BGP: Connecting\n"); - s = sk_new(p->p.pool); + sock *s = sk_new(p->p.pool); s->type = SK_TCP_ACTIVE; s->saddr = p->source_addr; s->daddr = p->cf->remote_ip; @@ -766,10 +960,10 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c goto err; DBG("BGP: Waiting for connect success\n"); - bgp_start_timer(conn->connect_retry_timer, p->cf->connect_retry_time); + bgp_start_timer(conn->connect_timer, p->cf->connect_retry_time); return; - err: +err: sk_log_error(s, p->p.name); bgp_sock_err(s, 0); return; @@ -783,16 +977,15 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c static struct bgp_proto * bgp_find_proto(sock *sk) { - struct proto_config *pc; + struct bgp_proto *p; - WALK_LIST(pc, config->protos) - if ((pc->protocol == &proto_bgp) && pc->proto) - { - struct bgp_proto *p = (struct bgp_proto *) pc->proto; - if (ipa_equal(p->cf->remote_ip, sk->daddr) && - (!p->cf->iface || (p->cf->iface == sk->iface))) - return p; - } + WALK_LIST(p, proto_list) + if ((p->p.proto == &proto_bgp) && + ipa_equal(p->cf->remote_ip, sk->daddr) && + (!p->cf->iface || (p->cf->iface == sk->iface)) && + (ipa_zero(p->cf->local_ip) || ipa_equal(p->cf->local_ip, sk->saddr)) && + (p->cf->local_port == sk->sport)) + return p; return NULL; } @@ -818,12 +1011,12 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport); p = bgp_find_proto(sk); if (!p) - { - log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)", - sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport); - rfree(sk); - return 0; - } + { + log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)", + sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport); + rfree(sk); + return 0; + } /* * BIRD should keep multiple incoming connections in OpenSent state (for @@ -836,26 +1029,26 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk); if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready) - { - bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART); - bgp_handle_graceful_restart(p); - bgp_conn_enter_idle_state(p->conn); - acc = 1; - - /* There might be separate incoming connection in OpenSent state */ - if (p->incoming_conn.state > BS_ACTIVE) - bgp_close_conn(&p->incoming_conn); - } + { + bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART); + bgp_handle_graceful_restart(p); + bgp_conn_enter_idle_state(p->conn); + acc = 1; + + /* There might be separate incoming connection in OpenSent state */ + if (p->incoming_conn.state > BS_ACTIVE) + bgp_close_conn(&p->incoming_conn); + } BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s", sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport, acc ? "accepted" : "rejected"); if (!acc) - { - rfree(sk); - return 0; - } + { + rfree(sk); + return 0; + } hops = p->cf->multihop ? 
: 1; @@ -867,11 +1060,11 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) goto err; if (p->cf->enable_extended_messages) - { - sk->rbsize = BGP_RX_BUFFER_EXT_SIZE; - sk->tbsize = BGP_TX_BUFFER_EXT_SIZE; - sk_reallocate(sk); - } + { + sk->rbsize = BGP_RX_BUFFER_EXT_SIZE; + sk->tbsize = BGP_TX_BUFFER_EXT_SIZE; + sk_reallocate(sk); + } bgp_setup_conn(p, &p->incoming_conn); bgp_setup_sk(&p->incoming_conn, sk); @@ -894,34 +1087,6 @@ bgp_listen_sock_err(sock *sk UNUSED, int err) log(L_ERR "BGP: Error on listening socket: %M", err); } -static sock * -bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags) -{ - sock *s = sk_new(&root_pool); - DBG("BGP: Creating listening socket\n"); - s->type = SK_TCP_PASSIVE; - s->ttl = 255; - s->saddr = addr; - s->sport = port ? port : BGP_PORT; - s->flags = flags ? 0 : SKF_V6ONLY; - s->tos = IP_PREC_INTERNET_CONTROL; - s->rbsize = BGP_RX_BUFFER_SIZE; - s->tbsize = BGP_TX_BUFFER_SIZE; - s->rx_hook = bgp_incoming_connection; - s->err_hook = bgp_listen_sock_err; - - if (sk_open(s) < 0) - goto err; - - return s; - - err: - sk_log_error(s, "BGP"); - log(L_ERR "BGP: Cannot open listening socket"); - rfree(s); - return NULL; -} - static void bgp_start_neighbor(struct bgp_proto *p) { @@ -930,23 +1095,10 @@ bgp_start_neighbor(struct bgp_proto *p) if (ipa_zero(p->source_addr)) p->source_addr = p->neigh->ifa->ip; -#ifdef IPV6 - { - struct ifa *a; - p->local_link = IPA_NONE; - WALK_LIST(a, p->neigh->iface->addrs) - if (a->scope == SCOPE_LINK) - { - p->local_link = a->ip; - break; - } - - if (! ipa_nonzero(p->local_link)) - log(L_WARN "%s: Missing link local address on interface %s", p->p.name, p->neigh->iface->name); - - DBG("BGP: Selected link-level address %I\n", p->local_link); - } -#endif + if (ipa_is_link_local(p->source_addr)) + p->link_addr = p->source_addr; + else if (p->neigh->iface->llv6) + p->link_addr = p->neigh->iface->llv6->ip; bgp_initiate(p); } @@ -966,34 +1118,34 @@ bgp_neigh_notify(neighbor *n) int prepare = (ps == PS_START) && (p->start_state == BSS_PREPARE); if (n->scope <= 0) + { + if (!prepare) { - if (!prepare) - { - BGP_TRACE(D_EVENTS, "Neighbor lost"); - bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST); - /* Perhaps also run bgp_update_startup_delay(p)? */ - bgp_stop(p, 0); - } + BGP_TRACE(D_EVENTS, "Neighbor lost"); + bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST); + /* Perhaps also run bgp_update_startup_delay(p)? 
*/ + bgp_stop(p, 0); } + } else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP)) + { + if (!prepare) { - if (!prepare) - { - BGP_TRACE(D_EVENTS, "Link down"); - bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN); - if (ps == PS_UP) - bgp_update_startup_delay(p); - bgp_stop(p, 0); - } + BGP_TRACE(D_EVENTS, "Link down"); + bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN); + if (ps == PS_UP) + bgp_update_startup_delay(p); + bgp_stop(p, 0); } + } else + { + if (prepare) { - if (prepare) - { - BGP_TRACE(D_EVENTS, "Neighbor ready"); - bgp_start_neighbor(p); - } + BGP_TRACE(D_EVENTS, "Neighbor ready"); + bgp_start_neighbor(p); } + } } static void @@ -1003,13 +1155,13 @@ bgp_bfd_notify(struct bfd_request *req) int ps = p->p.proto_state; if (req->down && ((ps == PS_START) || (ps == PS_UP))) - { - BGP_TRACE(D_EVENTS, "BFD session down"); - bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN); - if (ps == PS_UP) - bgp_update_startup_delay(p); - bgp_stop(p, 0); - } + { + BGP_TRACE(D_EVENTS, "BFD session down"); + bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN); + if (ps == PS_UP) + bgp_update_startup_delay(p); + bgp_stop(p, 0); + } } static void @@ -1021,71 +1173,72 @@ bgp_update_bfd(struct bgp_proto *p, int use_bfd) bgp_bfd_notify, p); if (!use_bfd && p->bfd_req) - { - rfree(p->bfd_req); - p->bfd_req = NULL; - } + { + rfree(p->bfd_req); + p->bfd_req = NULL; + } } -static int -bgp_reload_routes(struct proto *P) +static void +bgp_reload_routes(struct channel *C) { - struct bgp_proto *p = (struct bgp_proto *) P; - if (!p->conn || !p->conn->peer_refresh_support) - return 0; + struct bgp_proto *p = (void *) C->proto; + struct bgp_channel *c = (void *) C; - bgp_schedule_packet(p->conn, PKT_ROUTE_REFRESH); - return 1; + ASSERT(p->conn && p->route_refresh); + + bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); } static void -bgp_feed_begin(struct proto *P, int initial) +bgp_feed_begin(struct channel *C, int initial) { - struct bgp_proto *p = (struct bgp_proto *) P; + struct bgp_proto *p = (void *) C->proto; + struct bgp_channel *c = (void *) C; /* This should not happen */ if (!p->conn) return; if (initial && p->cf->gr_mode) - p->feed_state = BFS_LOADING; + c->feed_state = BFS_LOADING; /* It is refeed and both sides support enhanced route refresh */ - if (!initial && p->cf->enable_refresh && - p->conn->peer_enhanced_refresh_support) - { - /* BoRR must not be sent before End-of-RIB */ - if (p->feed_state == BFS_LOADING || p->feed_state == BFS_LOADED) - return; + if (!initial && p->enhanced_refresh) + { + /* BoRR must not be sent before End-of-RIB */ + if (c->feed_state == BFS_LOADING || c->feed_state == BFS_LOADED) + return; - p->feed_state = BFS_REFRESHING; - bgp_schedule_packet(p->conn, PKT_BEGIN_REFRESH); - } + c->feed_state = BFS_REFRESHING; + bgp_schedule_packet(p->conn, c, PKT_BEGIN_REFRESH); + } } static void -bgp_feed_end(struct proto *P) +bgp_feed_end(struct channel *C) { - struct bgp_proto *p = (struct bgp_proto *) P; + struct bgp_proto *p = (void *) C->proto; + struct bgp_channel *c = (void *) C; /* This should not happen */ if (!p->conn) return; /* Non-demarcated feed ended, nothing to do */ - if (p->feed_state == BFS_NONE) + if (c->feed_state == BFS_NONE) return; /* Schedule End-of-RIB packet */ - if (p->feed_state == BFS_LOADING) - p->feed_state = BFS_LOADED; + if (c->feed_state == BFS_LOADING) + c->feed_state = BFS_LOADED; /* Schedule EoRR packet */ - if (p->feed_state == BFS_REFRESHING) - p->feed_state = BFS_REFRESHED; + if (c->feed_state == BFS_REFRESHING) + c->feed_state = BFS_REFRESHED; 
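/*
 * Summary of the per-channel TX feed states (BFS_*) as used here:
 * bgp_feed_begin() above enters BFS_LOADING for the initial feed when
 * graceful restart mode is configured, or sends BoRR and enters
 * BFS_REFRESHING for a refeed when enhanced route refresh is negotiated
 * (never before End-of-RIB). Advancing to BFS_LOADED / BFS_REFRESHED here
 * marks that the scheduled End-of-RIB or EoRR can be emitted by the TX code
 * once the remaining updates are sent; the PKT_UPDATE kick below then
 * triggers the TX hook.
 */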
/* Kick TX hook */ - bgp_schedule_packet(p->conn, PKT_UPDATE); + bgp_schedule_packet(p->conn, c, PKT_UPDATE); } @@ -1096,30 +1249,30 @@ bgp_start_locked(struct object_lock *lock) struct bgp_config *cf = p->cf; if (p->p.proto_state != PS_START) - { - DBG("BGP: Got lock in different state %d\n", p->p.proto_state); - return; - } + { + DBG("BGP: Got lock in different state %d\n", p->p.proto_state); + return; + } DBG("BGP: Got lock\n"); if (cf->multihop) - { - /* Multi-hop sessions do not use neighbor entries */ - bgp_initiate(p); - return; - } + { + /* Multi-hop sessions do not use neighbor entries */ + bgp_initiate(p); + return; + } neighbor *n = neigh_find2(&p->p, &cf->remote_ip, cf->iface, NEF_STICKY); if (!n) - { - log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface); - /* As we do not start yet, we can just disable protocol */ - p->p.disabled = 1; - bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP); - proto_notify_state(&p->p, PS_DOWN); - return; - } + { + log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface); + /* As we do not start yet, we can just disable protocol */ + p->p.disabled = 1; + bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP); + proto_notify_state(&p->p, PS_DOWN); + return; + } p->neigh = n; @@ -1144,36 +1297,34 @@ bgp_start(struct proto *P) p->neigh = NULL; p->bfd_req = NULL; p->gr_ready = 0; - p->gr_active = 0; - - rt_lock_table(p->igp_table); + p->gr_active_num = 0; p->event = ev_new(p->p.pool); p->event->hook = bgp_decision; p->event->data = p; - p->startup_timer = tm_new(p->p.pool); - p->startup_timer->hook = bgp_startup_timeout; - p->startup_timer->data = p; - - p->gr_timer = tm_new(p->p.pool); - p->gr_timer->hook = bgp_graceful_restart_timeout; - p->gr_timer->data = p; + p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0); + p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0); p->local_id = proto_get_router_id(P->cf); if (p->rr_client) p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id; p->remote_id = 0; - p->source_addr = p->cf->source_addr; + p->source_addr = p->cf->local_ip; + p->link_addr = IPA_NONE; + /* XXXX */ if (p->p.gr_recovery && p->cf->gr_mode) - proto_graceful_restart_lock(P); + { + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + channel_graceful_restart_lock(&c->c); + } /* - * Before attempting to create the connection, we need to lock the - * port, so that are sure we're the only instance attempting to talk - * with that neighbor. + * Before attempting to create the connection, we need to lock the port, + * so that we are the only instance attempting to talk with that neighbor. 
*/ lock = p->lock = olock_new(P->pool); @@ -1194,78 +1345,64 @@ static int bgp_shutdown(struct proto *P) { struct bgp_proto *p = (struct bgp_proto *) P; - unsigned subcode = 0; + uint subcode = 0; BGP_TRACE(D_EVENTS, "Shutdown requested"); switch (P->down_code) - { - case PDC_CF_REMOVE: - case PDC_CF_DISABLE: - subcode = 3; // Errcode 6, 3 - peer de-configured - break; - - case PDC_CF_RESTART: - subcode = 6; // Errcode 6, 6 - other configuration change - break; - - case PDC_CMD_DISABLE: - case PDC_CMD_SHUTDOWN: - subcode = 2; // Errcode 6, 2 - administrative shutdown - break; - - case PDC_CMD_RESTART: - subcode = 4; // Errcode 6, 4 - administrative reset - break; - - case PDC_RX_LIMIT_HIT: - case PDC_IN_LIMIT_HIT: - subcode = 1; // Errcode 6, 1 - max number of prefixes reached - /* log message for compatibility */ - log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name); - goto limit; - - case PDC_OUT_LIMIT_HIT: - subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown - - limit: - bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED); - if (proto_restart) - bgp_update_startup_delay(p); - else - p->startup_delay = 0; - goto done; - } + { + case PDC_CF_REMOVE: + case PDC_CF_DISABLE: + subcode = 3; // Errcode 6, 3 - peer de-configured + break; + + case PDC_CF_RESTART: + subcode = 6; // Errcode 6, 6 - other configuration change + break; + + case PDC_CMD_DISABLE: + case PDC_CMD_SHUTDOWN: + subcode = 2; // Errcode 6, 2 - administrative shutdown + break; + + case PDC_CMD_RESTART: + subcode = 4; // Errcode 6, 4 - administrative reset + break; + + case PDC_RX_LIMIT_HIT: + case PDC_IN_LIMIT_HIT: + subcode = 1; // Errcode 6, 1 - max number of prefixes reached + /* log message for compatibility */ + log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name); + goto limit; + + case PDC_OUT_LIMIT_HIT: + subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown + + limit: + bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED); + if (proto_restart) + bgp_update_startup_delay(p); + else + p->startup_delay = 0; + goto done; + } bgp_store_error(p, NULL, BE_MAN_DOWN, 0); p->startup_delay = 0; - done: +done: bgp_stop(p, subcode); return p->p.proto_state; } -static void -bgp_cleanup(struct proto *P) -{ - struct bgp_proto *p = (struct bgp_proto *) P; - rt_unlock_table(p->igp_table); -} - -static rtable * -get_igp_table(struct bgp_config *cf) -{ - return cf->igp_table ? cf->igp_table->table : cf->c.table->table; -} - static struct proto * -bgp_init(struct proto_config *C) +bgp_init(struct proto_config *CF) { - struct proto *P = proto_new(C, sizeof(struct bgp_proto)); - struct bgp_config *c = (struct bgp_config *) C; + struct proto *P = proto_new(CF); struct bgp_proto *p = (struct bgp_proto *) P; + struct bgp_config *cf = (struct bgp_config *) CF; - P->accept_ra_types = c->secondary ? RA_ACCEPTED : RA_OPTIMAL; P->rt_notify = bgp_rt_notify; P->import_control = bgp_import_control; P->neigh_notify = bgp_neigh_notify; @@ -1274,102 +1411,276 @@ bgp_init(struct proto_config *C) P->feed_end = bgp_feed_end; P->rte_better = bgp_rte_better; P->rte_mergable = bgp_rte_mergable; - P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL; - - p->cf = c; - p->local_as = c->local_as; - p->remote_as = c->remote_as; - p->is_internal = (c->local_as == c->remote_as); - p->rs_client = c->rs_client; - p->rr_client = c->rr_client; - p->igp_table = get_igp_table(c); + P->rte_recalculate = cf->deterministic_med ? 
bgp_rte_recalculate : NULL; + + p->cf = cf; + p->local_as = cf->local_as; + p->remote_as = cf->remote_as; + p->public_as = cf->local_as; + p->is_internal = (cf->local_as == cf->remote_as); + p->is_interior = p->is_internal || cf->confederation_member; + p->rs_client = cf->rs_client; + p->rr_client = cf->rr_client; + + /* Confederation ID is used for truly external peers */ + if (cf->confederation && !p->is_interior) + p->public_as = cf->confederation; + + /* Add all channels */ + struct bgp_channel_config *cc; + WALK_LIST(cc, CF->channels) + proto_add_channel(P, &cc->c); return P; } +static void +bgp_channel_init(struct channel *C, struct channel_config *CF) +{ + struct bgp_channel *c = (void *) C; + struct bgp_channel_config *cf = (void *) CF; + + c->cf = cf; + c->afi = cf->afi; + c->desc = cf->desc; + + if (cf->igp_table_ip4) + c->igp_table_ip4 = cf->igp_table_ip4->table; + + if (cf->igp_table_ip6) + c->igp_table_ip6 = cf->igp_table_ip6->table; +} + +static int +bgp_channel_start(struct channel *C) +{ + struct bgp_proto *p = (void *) C->proto; + struct bgp_channel *c = (void *) C; + ip_addr src = p->source_addr; + + if (c->igp_table_ip4) + rt_lock_table(c->igp_table_ip4); + + if (c->igp_table_ip6) + rt_lock_table(c->igp_table_ip6); + + c->pool = p->p.pool; // XXXX + bgp_init_bucket_table(c); + bgp_init_prefix_table(c); + + c->next_hop_addr = c->cf->next_hop_addr; + c->link_addr = IPA_NONE; + c->packets_to_send = 0; + + /* Try to use source address as next hop address */ + if (ipa_zero(c->next_hop_addr)) + { + if (bgp_channel_is_ipv4(c) && (ipa_is_ip4(src) || c->ext_next_hop)) + c->next_hop_addr = src; + + if (bgp_channel_is_ipv6(c) && (ipa_is_ip6(src) || c->ext_next_hop)) + c->next_hop_addr = src; + } + + /* Exit if no feasible next hop address is found */ + if (ipa_zero(c->next_hop_addr)) + { + log(L_WARN "%s: Missing next hop address", p->p.name); + return 0; + } + + /* Set link-local address for IPv6 single-hop BGP */ + if (ipa_is_ip6(c->next_hop_addr) && p->neigh) + { + c->link_addr = p->link_addr; + + if (ipa_zero(c->link_addr)) + log(L_WARN "%s: Missing link-local address", p->p.name); + } + + /* Link local address is already in c->link_addr */ + if (ipa_is_link_local(c->next_hop_addr)) + c->next_hop_addr = IPA_NONE; + + return 0; /* XXXX: Currently undefined */ +} + +static void +bgp_channel_shutdown(struct channel *C) +{ + struct bgp_channel *c = (void *) C; + + /* XXXX: cleanup bucket and prefix tables */ + + c->next_hop_addr = IPA_NONE; + c->link_addr = IPA_NONE; +} + +static void +bgp_channel_cleanup(struct channel *C) +{ + struct bgp_channel *c = (void *) C; + + if (c->igp_table_ip4) + rt_unlock_table(c->igp_table_ip4); + + if (c->igp_table_ip6) + rt_unlock_table(c->igp_table_ip6); +} + +static inline struct bgp_channel_config * +bgp_find_channel_config(struct bgp_config *cf, u32 afi) +{ + struct bgp_channel_config *cc; + + WALK_LIST(cc, cf->c.channels) + if (cc->afi == afi) + return cc; + + return NULL; +} + +struct rtable_config * +bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 type) +{ + struct bgp_channel_config *cc2; + struct rtable_config *tab; + + /* First, try table connected by the channel */ + if (cc->c.table->addr_type == type) + return cc->c.table; + + /* Find paired channel with the same SAFI but the other AFI */ + u32 afi2 = cc->afi ^ 0x30000; + cc2 = bgp_find_channel_config(cf, afi2); + + /* Second, try IGP table configured in the paired channel */ + if (cc2 && (tab = (type == NET_IP4) ? 
cc2->igp_table_ip4 : cc2->igp_table_ip6)) + return tab; + + /* Third, try table connected by the paired channel */ + if (cc2 && (cc2->c.table->addr_type == type)) + return cc2->c.table; + + /* Last, try default table of given type */ + if (tab = cf->c.global->def_tables[type]) + return tab; + + cf_error("Undefined IGP table"); +} + void -bgp_check_config(struct bgp_config *c) +bgp_postconfig(struct proto_config *CF) { - int internal = (c->local_as == c->remote_as); + struct bgp_config *cf = (void *) CF; + int internal = (cf->local_as == cf->remote_as); /* Do not check templates at all */ - if (c->c.class == SYM_TEMPLATE) + if (cf->c.class == SYM_TEMPLATE) return; /* EBGP direct by default, IBGP multihop by default */ - if (c->multihop < 0) - c->multihop = internal ? 64 : 0; - - /* Different default for gw_mode */ - if (!c->gw_mode) - c->gw_mode = c->multihop ? GW_RECURSIVE : GW_DIRECT; + if (cf->multihop < 0) + cf->multihop = internal ? 64 : 0; - /* Different default based on rs_client */ - if (!c->missing_lladdr) - c->missing_lladdr = c->rs_client ? MLL_IGNORE : MLL_SELF; - /* Disable after error incompatible with restart limit action */ - if (c->c.in_limit && (c->c.in_limit->action == PLA_RESTART) && c->disable_after_error) - c->c.in_limit->action = PLA_DISABLE; - - - if (!c->local_as) + if (!cf->local_as) cf_error("Local AS number must be set"); - if (ipa_zero(c->remote_ip)) + if (ipa_zero(cf->remote_ip)) cf_error("Neighbor must be configured"); - if (!c->remote_as) + if (!cf->remote_as) cf_error("Remote AS number must be set"); - if (ipa_is_link_local(c->remote_ip) && !c->iface) + if (ipa_is_link_local(cf->remote_ip) && !cf->iface) cf_error("Link-local neighbor address requires specified interface"); - if (!(c->capabilities && c->enable_as4) && (c->remote_as > 0xFFFF)) + if (!(cf->capabilities && cf->enable_as4) && (cf->remote_as > 0xFFFF)) cf_error("Neighbor AS number out of range (AS4 not available)"); - if (!internal && c->rr_client) + if (!internal && cf->rr_client) cf_error("Only internal neighbor can be RR client"); - if (internal && c->rs_client) + if (internal && cf->rs_client) cf_error("Only external neighbor can be RS client"); - if (c->multihop && (c->gw_mode == GW_DIRECT)) - cf_error("Multihop BGP cannot use direct gateway mode"); + if (!cf->confederation && cf->confederation_member) + cf_error("Confederation ID must be set for member sessions"); - if (c->multihop && (ipa_is_link_local(c->remote_ip) || - ipa_is_link_local(c->source_addr))) + if (cf->multihop && (ipa_is_link_local(cf->local_ip) || + ipa_is_link_local(cf->remote_ip))) cf_error("Multihop BGP cannot be used with link-local addresses"); - if (c->multihop && c->iface) + if (cf->multihop && cf->iface) cf_error("Multihop BGP cannot be bound to interface"); - if (c->multihop && c->check_link) + if (cf->multihop && cf->check_link) cf_error("Multihop BGP cannot depend on link state"); - if (c->multihop && c->bfd && ipa_zero(c->source_addr)) - cf_error("Multihop BGP with BFD requires specified source address"); + if (cf->multihop && cf->bfd && ipa_zero(cf->local_ip)) + cf_error("Multihop BGP with BFD requires specified local address"); - if ((c->gw_mode == GW_RECURSIVE) && c->c.table->sorted) - cf_error("BGP in recursive mode prohibits sorted table"); - if (c->deterministic_med && c->c.table->sorted) - cf_error("BGP with deterministic MED prohibits sorted table"); + struct bgp_channel_config *cc; + WALK_LIST(cc, CF->channels) + { + /* Disable after error incompatible with restart limit action */ + if 
((cc->c.in_limit.action == PLA_RESTART) && cf->disable_after_error) + cc->c.in_limit.action = PLA_DISABLE; - if (c->secondary && !c->c.table->sorted) - cf_error("BGP with secondary option requires sorted table"); + /* Different default based on rs_client */ + if (!cc->missing_lladdr) + cc->missing_lladdr = cf->rs_client ? MLL_IGNORE : MLL_SELF; + + /* Different default for gw_mode */ + if (!cc->gw_mode) + cc->gw_mode = cf->multihop ? GW_RECURSIVE : GW_DIRECT; + + /* Default based on proto config */ + if (cc->gr_able == 0xff) + cc->gr_able = (cf->gr_mode == BGP_GR_ABLE); + + /* Default values of IGP tables */ + if ((cc->gw_mode == GW_RECURSIVE) && !cc->desc->no_igp) + { + if (!cc->igp_table_ip4 && (bgp_cc_is_ipv4(cc) || cc->ext_next_hop)) + cc->igp_table_ip4 = bgp_default_igp_table(cf, cc, NET_IP4); + + if (!cc->igp_table_ip6 && (bgp_cc_is_ipv6(cc) || cc->ext_next_hop)) + cc->igp_table_ip6 = bgp_default_igp_table(cf, cc, NET_IP6); + + if (cc->igp_table_ip4 && bgp_cc_is_ipv6(cc) && !cc->ext_next_hop) + cf_error("Mismatched IGP table type"); + + if (cc->igp_table_ip6 && bgp_cc_is_ipv4(cc) && !cc->ext_next_hop) + cf_error("Mismatched IGP table type"); + } + + if (cf->multihop && (cc->gw_mode == GW_DIRECT)) + cf_error("Multihop BGP cannot use direct gateway mode"); + + if ((cc->gw_mode == GW_RECURSIVE) && cc->c.table->sorted) + cf_error("BGP in recursive mode prohibits sorted table"); + + if (cf->deterministic_med && cc->c.table->sorted) + cf_error("BGP with deterministic MED prohibits sorted table"); + + if (cc->secondary && !cc->c.table->sorted) + cf_error("BGP with secondary option requires sorted table"); + } } static int -bgp_reconfigure(struct proto *P, struct proto_config *C) +bgp_reconfigure(struct proto *P, struct proto_config *CF) { - struct bgp_config *new = (struct bgp_config *) C; - struct bgp_proto *p = (struct bgp_proto *) P; + struct bgp_proto *p = (void *) P; + struct bgp_config *new = (void *) CF; struct bgp_config *old = p->cf; - if (proto_get_router_id(C) != p->local_id) + if (proto_get_router_id(CF) != p->local_id) return 0; int same = !memcmp(((byte *) old) + sizeof(struct proto_config), @@ -1377,8 +1688,26 @@ bgp_reconfigure(struct proto *P, struct proto_config *C) // password item is last and must be checked separately OFFSETOF(struct bgp_config, password) - sizeof(struct proto_config)) && ((!old->password && !new->password) - || (old->password && new->password && !strcmp(old->password, new->password))) - && (get_igp_table(old) == get_igp_table(new)); + || (old->password && new->password && !strcmp(old->password, new->password))); + + /* FIXME: Move channel reconfiguration to generic protocol code ? */ + struct channel *C, *C2; + struct bgp_channel_config *cc; + + WALK_LIST(C, p->p.channels) + C->stale = 1; + + WALK_LIST(cc, new->c.channels) + { + C = (struct channel *) bgp_find_channel(p, cc->afi); + same = proto_configure_channel(P, &C, &cc->c) && same; + C->stale = 0; + } + + WALK_LIST_DELSAFE(C, C2, p->p.channels) + if (C->stale) + same = proto_configure_channel(P, &C, NULL) && same; + if (same && (p->start_state > BSS_PREPARE)) bgp_update_bfd(p, new->bfd); @@ -1390,11 +1719,34 @@ bgp_reconfigure(struct proto *P, struct proto_config *C) return same; } +#define IGP_TABLE(cf, sym) ((cf)->igp_table_##sym ? 
(cf)->igp_table_##sym ->table : NULL ) + +static int +bgp_channel_reconfigure(struct channel *C, struct channel_config *CC) +{ + struct bgp_channel *c = (void *) C; + struct bgp_channel_config *new = (void *) CC; + struct bgp_channel_config *old = c->cf; + + if (memcmp(((byte *) old) + sizeof(struct channel_config), + ((byte *) new) + sizeof(struct channel_config), + /* Remaining items must be checked separately */ + OFFSETOF(struct bgp_channel_config, rest) - sizeof(struct channel_config))) + return 0; + + /* Check change in IGP tables */ + if ((IGP_TABLE(old, ip4) != IGP_TABLE(new, ip4)) || + (IGP_TABLE(old, ip6) != IGP_TABLE(new, ip6))) + return 0; + + c->cf = new; + return 1; +} + static void -bgp_copy_config(struct proto_config *dest, struct proto_config *src) +bgp_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUSED) { /* Just a shallow copy */ - proto_copy_rest(dest, src, sizeof(struct bgp_config)); } @@ -1411,14 +1763,14 @@ bgp_copy_config(struct proto_config *dest, struct proto_config *src) * closes the connection. */ void -bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int len) +bgp_error(struct bgp_conn *c, uint code, uint subcode, byte *data, int len) { struct bgp_proto *p = c->bgp; if (c->state == BS_CLOSE) return; - bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, (len > 0) ? len : -len); + bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, ABS(len)); bgp_store_error(p, c, BE_BGP_TX, (code << 16) | subcode); bgp_conn_enter_close_state(c); @@ -1426,13 +1778,13 @@ bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int l c->notify_subcode = subcode; c->notify_data = data; c->notify_size = (len > 0) ? len : 0; - bgp_schedule_packet(c, PKT_NOTIFICATION); + bgp_schedule_packet(c, NULL, PKT_NOTIFICATION); if (code != 6) - { - bgp_update_startup_delay(p); - bgp_stop(p, 0); - } + { + bgp_update_startup_delay(p); + bgp_stop(p, 0); + } } /** @@ -1471,19 +1823,19 @@ static const char * bgp_last_errmsg(struct bgp_proto *p) { switch (p->last_error_class) - { - case BE_MISC: - return bgp_misc_errors[p->last_error_code]; - case BE_SOCKET: - return (p->last_error_code == 0) ? "Connection closed" : strerror(p->last_error_code); - case BE_BGP_RX: - case BE_BGP_TX: - return bgp_error_dsc(p->last_error_code >> 16, p->last_error_code & 0xFF); - case BE_AUTO_DOWN: - return bgp_auto_errors[p->last_error_code]; - default: - return ""; - } + { + case BE_MISC: + return bgp_misc_errors[p->last_error_code]; + case BE_SOCKET: + return (p->last_error_code == 0) ? "Connection closed" : strerror(p->last_error_code); + case BE_BGP_RX: + case BE_BGP_TX: + return bgp_error_dsc(p->last_error_code >> 16, p->last_error_code & 0xFF); + case BE_AUTO_DOWN: + return bgp_auto_errors[p->last_error_code]; + default: + return ""; + } } static const char * @@ -1514,48 +1866,165 @@ bgp_get_status(struct proto *P, byte *buf) } static void +bgp_show_afis(int code, char *s, u32 *afis, uint count) +{ + buffer b; + LOG_BUFFER_INIT(b); + + buffer_puts(&b, s); + + for (u32 *af = afis; af < (afis + count); af++) + { + const struct bgp_af_desc *desc = bgp_get_af_desc(*af); + if (desc) + buffer_print(&b, " %s", desc->name); + else + buffer_print(&b, " <%u/%u>", BGP_AFI(*af), BGP_SAFI(*af)); + } + + if (b.pos == b.end) + strcpy(b.end - 32, " ... 
<too long>"); + + cli_msg(code, b.start); +} + +static void +bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) +{ + struct bgp_af_caps *ac; + uint any_mp_bgp = 0; + uint any_gr_able = 0; + uint any_add_path = 0; + uint any_ext_next_hop = 0; + u32 *afl1 = alloca(caps->af_count * sizeof(u32)); + u32 *afl2 = alloca(caps->af_count * sizeof(u32)); + uint afn1, afn2; + + WALK_AF_CAPS(caps, ac) + { + any_mp_bgp |= ac->ready; + any_gr_able |= ac->gr_able; + any_add_path |= ac->add_path; + any_ext_next_hop |= ac->ext_next_hop; + } + + if (any_mp_bgp) + { + cli_msg(-1006, " Multiprotocol"); + + afn1 = 0; + WALK_AF_CAPS(caps, ac) + if (ac->ready) + afl1[afn1++] = ac->afi; + + bgp_show_afis(-1006, " AF announced:", afl1, afn1); + } + + if (caps->route_refresh) + cli_msg(-1006, " Route refresh"); + + if (any_ext_next_hop) + { + cli_msg(-1006, " Extended next hop"); + + afn1 = 0; + WALK_AF_CAPS(caps, ac) + if (ac->ext_next_hop) + afl1[afn1++] = ac->afi; + + bgp_show_afis(-1006, " IPv6 nexthop:", afl1, afn1); + } + + if (caps->ext_messages) + cli_msg(-1006, " Extended message"); + + if (caps->gr_aware) + cli_msg(-1006, " Graceful restart"); + + if (any_gr_able) + { + /* Continues from gr_aware */ + cli_msg(-1006, " Restart time: %u", caps->gr_time); + if (caps->gr_flags & BGP_GRF_RESTART) + cli_msg(-1006, " Restart recovery"); + + afn1 = afn2 = 0; + WALK_AF_CAPS(caps, ac) + { + if (ac->gr_able) + afl1[afn1++] = ac->afi; + + if (ac->gr_af_flags & BGP_GRF_FORWARDING) + afl2[afn2++] = ac->afi; + } + + bgp_show_afis(-1006, " AF supported:", afl1, afn1); + bgp_show_afis(-1006, " AF preserved:", afl2, afn2); + } + + if (caps->as4_support) + cli_msg(-1006, " 4-octet AS numbers"); + + if (any_add_path) + { + cli_msg(-1006, " ADD-PATH"); + + afn1 = afn2 = 0; + WALK_AF_CAPS(caps, ac) + { + if (ac->add_path & BGP_ADD_PATH_RX) + afl1[afn1++] = ac->afi; + + if (ac->add_path & BGP_ADD_PATH_TX) + afl2[afn2++] = ac->afi; + } + + bgp_show_afis(-1006, " RX:", afl1, afn1); + bgp_show_afis(-1006, " TX:", afl2, afn2); + } + + if (caps->enhanced_refresh) + cli_msg(-1006, " Enhanced refresh"); +} + +static void bgp_show_proto_info(struct proto *P) { struct bgp_proto *p = (struct bgp_proto *) P; - struct bgp_conn *c = p->conn; - - proto_show_basic_info(P); cli_msg(-1006, " BGP state: %s", bgp_state_dsc(p)); cli_msg(-1006, " Neighbor address: %I%J", p->cf->remote_ip, p->cf->iface); cli_msg(-1006, " Neighbor AS: %u", p->remote_as); - if (p->gr_active) + if (p->gr_active_num) cli_msg(-1006, " Neighbor graceful restart active"); if (P->proto_state == PS_START) - { - struct bgp_conn *oc = &p->outgoing_conn; + { + struct bgp_conn *oc = &p->outgoing_conn; - if ((p->start_state < BSS_CONNECT) && - (p->startup_timer->expires)) - cli_msg(-1006, " Error wait: %d/%d", - p->startup_timer->expires - now, p->startup_delay); + if ((p->start_state < BSS_CONNECT) && + (tm_active(p->startup_timer))) + cli_msg(-1006, " Error wait: %t/%u", + tm_remains(p->startup_timer), p->startup_delay); - if ((oc->state == BS_ACTIVE) && - (oc->connect_retry_timer->expires)) - cli_msg(-1006, " Connect delay: %d/%d", - oc->connect_retry_timer->expires - now, p->cf->connect_delay_time); + if ((oc->state == BS_ACTIVE) && + (tm_active(oc->connect_timer))) + cli_msg(-1006, " Connect delay: %t/%u", + tm_remains(oc->connect_timer), p->cf->connect_delay_time); - if (p->gr_active && p->gr_timer->expires) - cli_msg(-1006, " Restart timer: %d/-", p->gr_timer->expires - now); - } + if (p->gr_active_num && tm_active(p->gr_timer)) + cli_msg(-1006, " 
Restart timer: %t/-", + tm_remains(p->gr_timer)); + } else if (P->proto_state == PS_UP) - { - cli_msg(-1006, " Neighbor ID: %R", p->remote_id); - cli_msg(-1006, " Neighbor caps: %s%s%s%s%s%s%s", - c->peer_refresh_support ? " refresh" : "", - c->peer_enhanced_refresh_support ? " enhanced-refresh" : "", - c->peer_gr_able ? " restart-able" : (c->peer_gr_aware ? " restart-aware" : ""), - c->peer_as4_support ? " AS4" : "", - (c->peer_add_path & ADD_PATH_RX) ? " add-path-rx" : "", - (c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : "", - c->peer_ext_messages_support ? " ext-messages" : ""); + { + cli_msg(-1006, " Neighbor ID: %R", p->remote_id); + cli_msg(-1006, " Local capabilities"); + bgp_show_capabilities(p, p->conn->local_caps); + cli_msg(-1006, " Neighbor capabilities"); + bgp_show_capabilities(p, p->conn->remote_caps); +/* XXXX cli_msg(-1006, " Session: %s%s%s%s%s%s%s%s", p->is_internal ? "internal" : "external", p->cf->multihop ? " multihop" : "", @@ -1565,35 +2034,60 @@ bgp_show_proto_info(struct proto *P) p->add_path_rx ? " add-path-rx" : "", p->add_path_tx ? " add-path-tx" : "", p->ext_messages ? " ext-messages" : ""); - cli_msg(-1006, " Source address: %I", p->source_addr); - if (P->cf->in_limit) - cli_msg(-1006, " Route limit: %d/%d", - p->p.stats.imp_routes + p->p.stats.filt_routes, P->cf->in_limit->limit); - cli_msg(-1006, " Hold timer: %d/%d", - tm_remains(c->hold_timer), c->hold_time); - cli_msg(-1006, " Keepalive timer: %d/%d", - tm_remains(c->keepalive_timer), c->keepalive_time); - } +*/ + cli_msg(-1006, " Source address: %I", p->source_addr); + cli_msg(-1006, " Hold timer: %t/%u", + tm_remains(p->conn->hold_timer), p->conn->hold_time); + cli_msg(-1006, " Keepalive timer: %t/%u", + tm_remains(p->conn->keepalive_timer), p->conn->keepalive_time); + } if ((p->last_error_class != BE_NONE) && (p->last_error_class != BE_MAN_DOWN)) + { + const char *err1 = bgp_err_classes[p->last_error_class]; + const char *err2 = bgp_last_errmsg(p); + cli_msg(-1006, " Last error: %s%s", err1, err2); + } + + { + /* XXXX ?? 
*/ + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) { - const char *err1 = bgp_err_classes[p->last_error_class]; - const char *err2 = bgp_last_errmsg(p); - cli_msg(-1006, " Last error: %s%s", err1, err2); + channel_show_info(&c->c); + + if (c->igp_table_ip4) + cli_msg(-1006, " IGP IPv4 table: %s", c->igp_table_ip4->name); + + if (c->igp_table_ip6) + cli_msg(-1006, " IGP IPv6 table: %s", c->igp_table_ip6->name); } + } } +struct channel_class channel_bgp = { + .channel_size = sizeof(struct bgp_channel), + .config_size = sizeof(struct bgp_channel_config), + .init = bgp_channel_init, + .start = bgp_channel_start, + .shutdown = bgp_channel_shutdown, + .cleanup = bgp_channel_cleanup, + .reconfigure = bgp_channel_reconfigure, +}; + struct protocol proto_bgp = { .name = "BGP", .template = "bgp%d", .attr_class = EAP_BGP, .preference = DEF_PREF_BGP, + .channel_mask = NB_IP | NB_VPN | NB_FLOW, + .proto_size = sizeof(struct bgp_proto), .config_size = sizeof(struct bgp_config), + .postconfig = bgp_postconfig, .init = bgp_init, .start = bgp_start, .shutdown = bgp_shutdown, - .cleanup = bgp_cleanup, .reconfigure = bgp_reconfigure, .copy_config = bgp_copy_config, .get_status = bgp_get_status, diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index e47a0eb1..3d940c22 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -2,6 +2,8 @@ * BIRD -- The Border Gateway Protocol * * (c) 2000 Martin Mares <mj@ucw.cz> + * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2008--2016 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -10,26 +12,80 @@ #define _BIRD_BGP_H_ #include <stdint.h> +#include <setjmp.h> +#include "nest/bird.h" #include "nest/route.h" #include "nest/bfd.h" +//#include "lib/lists.h" #include "lib/hash.h" +#include "lib/socket.h" struct linpool; struct eattr; + +/* Address families */ + +#define BGP_AFI_IPV4 1 +#define BGP_AFI_IPV6 2 + +#define BGP_SAFI_UNICAST 1 +#define BGP_SAFI_MULTICAST 2 +#define BGP_SAFI_MPLS 4 +#define BGP_SAFI_MPLS_VPN 128 +#define BGP_SAFI_VPN_MULTICAST 129 +#define BGP_SAFI_FLOW 133 + +/* Internal AF codes */ + +#define BGP_AF(A, B) (((u32)(A) << 16) | (u32)(B)) +#define BGP_AFI(A) ((u32)(A) >> 16) +#define BGP_SAFI(A) ((u32)(A) & 0xFFFF) + +#define BGP_AF_IPV4 BGP_AF( BGP_AFI_IPV4, BGP_SAFI_UNICAST ) +#define BGP_AF_IPV6 BGP_AF( BGP_AFI_IPV6, BGP_SAFI_UNICAST ) +#define BGP_AF_IPV4_MC BGP_AF( BGP_AFI_IPV4, BGP_SAFI_MULTICAST ) +#define BGP_AF_IPV6_MC BGP_AF( BGP_AFI_IPV6, BGP_SAFI_MULTICAST ) +#define BGP_AF_IPV4_MPLS BGP_AF( BGP_AFI_IPV4, BGP_SAFI_MPLS ) +#define BGP_AF_IPV6_MPLS BGP_AF( BGP_AFI_IPV6, BGP_SAFI_MPLS ) +#define BGP_AF_VPN4_MPLS BGP_AF( BGP_AFI_IPV4, BGP_SAFI_MPLS_VPN ) +#define BGP_AF_VPN6_MPLS BGP_AF( BGP_AFI_IPV6, BGP_SAFI_MPLS_VPN ) +#define BGP_AF_VPN4_MC BGP_AF( BGP_AFI_IPV4, BGP_SAFI_VPN_MULTICAST ) +#define BGP_AF_VPN6_MC BGP_AF( BGP_AFI_IPV6, BGP_SAFI_VPN_MULTICAST ) +#define BGP_AF_FLOW4 BGP_AF( BGP_AFI_IPV4, BGP_SAFI_FLOW ) +#define BGP_AF_FLOW6 BGP_AF( BGP_AFI_IPV6, BGP_SAFI_FLOW ) + + +struct bgp_write_state; +struct bgp_parse_state; +struct bgp_export_state; +struct bgp_bucket; + +struct bgp_af_desc { + u32 afi; + u32 net; + u8 mpls; + u8 no_igp; + const char *name; + uint (*encode_nlri)(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size); + void (*decode_nlri)(struct bgp_parse_state *s, byte *pos, uint len, rta *a); + void (*update_next_hop)(struct bgp_export_state *s, eattr *nh, ea_list **to); + uint (*encode_next_hop)(struct bgp_write_state *s, eattr *nh, byte 
*buf, uint size); + void (*decode_next_hop)(struct bgp_parse_state *s, byte *pos, uint len, rta *a); +}; + + struct bgp_config { struct proto_config c; u32 local_as, remote_as; + ip_addr local_ip; /* Source address to use */ ip_addr remote_ip; - ip_addr source_addr; /* Source address to use */ struct iface *iface; /* Interface for link-local addresses */ + u16 local_port; /* Local listening port */ u16 remote_port; /* Neighbor destination port */ int multihop; /* Number of hops if multihop */ - int ttl_security; /* Enable TTL security [RFC5082] */ - int next_hop_self; /* Always set next hop to local IP address */ - int next_hop_keep; /* Do not touch next hop attribute */ - int missing_lladdr; /* What we will do when we don' know link-local addr, see MLL_* */ - int gw_mode; /* How we compute route gateway from next_hop attr, see GW_* */ + int strict_bind; /* Bind listening socket to local address */ + int ttl_security; /* Enable TTL security [RFC 5082] */ int compare_path_lengths; /* Use path lengths when selecting best route */ int med_metric; /* Compare MULTI_EXIT_DISC even between routes from differen ASes */ int igp_metric; /* Use IGP metrics when selecting best route */ @@ -37,22 +93,22 @@ struct bgp_config { int deterministic_med; /* Use more complicated algo to have strict RFC 4271 MED comparison */ u32 default_local_pref; /* Default value for LOCAL_PREF attribute */ u32 default_med; /* Default value for MULTI_EXIT_DISC attribute */ - int capabilities; /* Enable capability handshake [RFC3392] */ - int enable_refresh; /* Enable local support for route refresh [RFC2918] */ - int enable_as4; /* Enable local support for 4B AS numbers [RFC4893] */ + int capabilities; /* Enable capability handshake [RFC 5492] */ + int enable_refresh; /* Enable local support for route refresh [RFC 2918] */ + int enable_as4; /* Enable local support for 4B AS numbers [RFC 6793] */ int enable_extended_messages; /* Enable local support for extended messages [draft] */ u32 rr_cluster_id; /* Route reflector cluster ID, if different from local ID */ int rr_client; /* Whether neighbor is RR client of me */ int rs_client; /* Whether neighbor is RS client of me */ - int advertise_ipv4; /* Whether we should add IPv4 capability advertisement to OPEN message */ + u32 confederation; /* Confederation ID, or zero if confeds not active */ + int confederation_member; /* Whether neighbor AS is member of our confederation */ int passive; /* Do not initiate outgoing connection */ int interpret_communities; /* Hardwired handling of well-known communities */ - int secondary; /* Accept also non-best routes (i.e. 
RA_ACCEPTED) */ - int add_path; /* Use ADD-PATH extension [RFC7911] */ int allow_local_as; /* Allow that number of local ASNs in incoming AS_PATHs */ int allow_local_pref; /* Allow LOCAL_PREF in EBGP sessions */ int gr_mode; /* Graceful restart mode (BGP_GR_*) */ int setkey; /* Set MD5 password to system SA/SP database */ + /* Times below are in seconds */ unsigned gr_time; /* Graceful restart timeout */ unsigned connect_delay_time; /* Minimum delay between connect attempts */ unsigned connect_retry_time; /* Timeout for connect attempts */ @@ -64,11 +120,31 @@ struct bgp_config { unsigned disable_after_error; /* Disable the protocol when error is detected */ char *password; /* Password used for MD5 authentication */ - struct rtable_config *igp_table; /* Table used for recursive next hop lookups */ int check_link; /* Use iface link state for liveness detection */ int bfd; /* Use BFD for liveness detection */ }; +struct bgp_channel_config { + struct channel_config c; + + u32 afi; + const struct bgp_af_desc *desc; + + ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */ + u8 next_hop_self; /* Always set next hop to local IP address */ + u8 next_hop_keep; /* Do not touch next hop attribute */ + u8 missing_lladdr; /* What we will do when we don' know link-local addr, see MLL_* */ + u8 gw_mode; /* How we compute route gateway from next_hop attr, see GW_* */ + u8 secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */ + u8 gr_able; /* Allow full graceful restart for the channel */ + u8 ext_next_hop; /* Allow both IPv4 and IPv6 next hops */ + u8 add_path; /* Use ADD-PATH extension [RFC 7911] */ + + uint rest[0]; /* Remaining items are reconfigured separately */ + struct rtable_config *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ + struct rtable_config *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ +}; + #define MLL_SELF 1 #define MLL_DROP 2 #define MLL_IGNORE 3 @@ -76,112 +152,241 @@ struct bgp_config { #define GW_DIRECT 1 #define GW_RECURSIVE 2 -#define ADD_PATH_RX 1 -#define ADD_PATH_TX 2 -#define ADD_PATH_FULL 3 +#define BGP_ADD_PATH_RX 1 +#define BGP_ADD_PATH_TX 2 +#define BGP_ADD_PATH_FULL 3 -#define BGP_GR_ABLE 1 -#define BGP_GR_AWARE 2 +#define BGP_GR_ABLE 1 +#define BGP_GR_AWARE 2 -/* For peer_gr_flags */ +/* For GR capability common flags */ #define BGP_GRF_RESTART 0x80 -/* For peer_gr_aflags */ +/* For GR capability per-AF flags */ #define BGP_GRF_FORWARDING 0x80 +struct bgp_af_caps { + u32 afi; + u8 ready; /* Multiprotocol capability, RFC 4760 */ + u8 gr_able; /* Graceful restart support, RFC 4724 */ + u8 gr_af_flags; /* Graceful restart per-AF flags */ + u8 ext_next_hop; /* Extended IPv6 next hop, RFC 5549 */ + u8 add_path; /* Multiple paths support, RFC 7911 */ +}; + +struct bgp_caps { + u32 as4_number; /* Announced ASN */ + + u8 as4_support; /* Four-octet AS capability, RFC 6793 */ + u8 ext_messages; /* Extended message length, RFC draft */ + u8 route_refresh; /* Route refresh capability, RFC 2918 */ + u8 enhanced_refresh; /* Enhanced route refresh, RFC 7313 */ + + u8 gr_aware; /* Graceful restart capability, RFC 4724 */ + u8 gr_flags; /* Graceful restart flags */ + u16 gr_time; /* Graceful restart time in seconds */ + + u16 af_count; /* Number of af_data items */ + + struct bgp_af_caps af_data[0]; /* Per-AF capability data */ +}; + +#define WALK_AF_CAPS(caps,ac) \ + for (ac = caps->af_data; ac < &caps->af_data[caps->af_count]; ac++) + + +struct bgp_socket { + node n; /* Node in global bgp_sockets */ + sock *sk; /* Real 
listening socket */ + u32 uc; /* Use count */ +}; + struct bgp_conn { struct bgp_proto *bgp; struct birdsock *sk; - uint state; /* State of connection state machine */ - struct timer *connect_retry_timer; - struct timer *hold_timer; - struct timer *keepalive_timer; - struct event *tx_ev; - int packets_to_send; /* Bitmap of packet types to be sent */ + u8 state; /* State of connection state machine */ + u8 as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */ + u8 ext_messages; /* Session uses extended message length */ + + struct bgp_caps *local_caps; + struct bgp_caps *remote_caps; + timer *connect_timer; + timer *hold_timer; + timer *keepalive_timer; + event *tx_ev; + u32 packets_to_send; /* Bitmap of packet types to be sent */ + u32 channels_to_send; /* Bitmap of channels with packets to be sent */ + u8 last_channel; /* Channel used last time for TX */ + u8 last_channel_count; /* Number of times the last channel was used in succession */ int notify_code, notify_subcode, notify_size; byte *notify_data; - u32 advertised_as; /* Temporary value for AS number received */ - int start_state; /* protocol start_state snapshot when connection established */ - u8 peer_refresh_support; /* Peer supports route refresh [RFC2918] */ - u8 peer_as4_support; /* Peer supports 4B AS numbers [RFC4893] */ - u8 peer_add_path; /* Peer supports ADD-PATH [RFC7911] */ - u8 peer_enhanced_refresh_support; /* Peer supports enhanced refresh [RFC7313] */ - u8 peer_gr_aware; - u8 peer_gr_able; - u16 peer_gr_time; - u8 peer_gr_flags; - u8 peer_gr_aflags; - u8 peer_ext_messages_support; /* Peer supports extended message length [draft] */ - unsigned hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */ + + uint hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */ }; struct bgp_proto { struct proto p; struct bgp_config *cf; /* Shortcut to BGP configuration */ u32 local_as, remote_as; - int start_state; /* Substates that partitions BS_START */ - u8 is_internal; /* Internal BGP connection (local_as == remote_as) */ - u8 as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */ - u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */ - u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */ - u8 ext_messages; /* Session allows to use extended messages (both sides support it) */ + u32 public_as; /* Externally visible ASN (local_as or confederation id) */ u32 local_id; /* BGP identifier of this router */ u32 remote_id; /* BGP identifier of the neighbor */ u32 rr_cluster_id; /* Route reflector cluster ID */ - int rr_client; /* Whether neighbor is RR client of me */ - int rs_client; /* Whether neighbor is RS client of me */ + int start_state; /* Substates that partitions BS_START */ + u8 is_internal; /* Internal BGP session (local_as == remote_as) */ + u8 is_interior; /* Internal or intra-confederation BGP session */ + u8 as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */ + u8 rr_client; /* Whether neighbor is RR client of me */ + u8 rs_client; /* Whether neighbor is RS client of me */ + u8 route_refresh; /* Route refresh allowed to send [RFC 2918] */ + u8 enhanced_refresh; /* Enhanced refresh is negotiated [RFC 7313] */ u8 gr_ready; /* Neighbor could do graceful restart */ - u8 gr_active; /* Neighbor is doing graceful restart */ - u8 feed_state; /* Feed state (TX) for EoR, RR packets, see BFS_* */ - u8 load_state; /* Load state (RX) for EoR, RR packets, 
see BFS_* */ + u8 gr_active_num; /* Neighbor is doing GR, number of active channels */ + u8 channel_count; /* Number of active channels */ + u32 *afi_map; /* Map channel index -> AFI */ + struct bgp_channel **channel_map; /* Map channel index -> channel */ struct bgp_conn *conn; /* Connection we have established */ struct bgp_conn outgoing_conn; /* Outgoing connection we're working with */ struct bgp_conn incoming_conn; /* Incoming connection we have neither accepted nor rejected yet */ struct object_lock *lock; /* Lock for neighbor connection */ struct neighbor *neigh; /* Neighbor entry corresponding to remote ip, NULL if multihop */ + struct bgp_socket *sock; /* Shared listening socket */ struct bfd_request *bfd_req; /* BFD request, if BFD is used */ ip_addr source_addr; /* Local address used as an advertised next hop */ - rtable *igp_table; /* Table used for recursive next hop lookups */ - struct event *event; /* Event for respawning and shutting process */ - struct timer *startup_timer; /* Timer used to delay protocol startup due to previous errors (startup_delay) */ - struct timer *gr_timer; /* Timer waiting for reestablishment after graceful restart */ - struct bgp_bucket **bucket_hash; /* Hash table of attribute buckets */ - uint hash_size, hash_count, hash_limit; - HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ - slab *prefix_slab; /* Slab holding prefix nodes */ - list bucket_queue; /* Queue of buckets to send */ - struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ - unsigned startup_delay; /* Time to delay protocol startup by due to errors */ - bird_clock_t last_proto_error; /* Time of last error that leads to protocol stop */ + ip_addr link_addr; /* Link-local version of source_addr */ + event *event; /* Event for respawning and shutting process */ + timer *startup_timer; /* Timer used to delay protocol startup due to previous errors (startup_delay) */ + timer *gr_timer; /* Timer waiting for reestablishment after graceful restart */ + uint startup_delay; /* Delay (in seconds) of protocol startup due to previous errors */ + btime last_proto_error; /* Time of last error that leads to protocol stop */ u8 last_error_class; /* Error class of last error */ u32 last_error_code; /* Error code of last error. 
BGP protocol errors are encoded as (bgp_err_code << 16 | bgp_err_subcode) */ -#ifdef IPV6 - byte *mp_reach_start, *mp_unreach_start; /* Multiprotocol BGP attribute notes */ - unsigned mp_reach_len, mp_unreach_len; - ip_addr local_link; /* Link-level version of source_addr */ -#endif +}; + +struct bgp_channel { + struct channel c; + + /* Rest are BGP specific data */ + struct bgp_channel_config *cf; + pool *pool; /* XXXX */ + + u32 afi; + u32 index; + const struct bgp_af_desc *desc; + + HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ + struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ + list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ + + HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ + slab *prefix_slab; /* Slab holding prefix nodes */ + + rtable *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ + rtable *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ + ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */ + ip_addr link_addr; /* Link-local version of next_hop_addr */ + + u32 packets_to_send; /* Bitmap of packet types to be sent */ + + u8 gr_ready; /* Neighbor could do GR on this AF */ + u8 gr_active; /* Neighbor is doing GR and keeping fwd state */ + + u8 ext_next_hop; /* Session allows both IPv4 and IPv6 next hops */ + + u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */ + u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */ + + u8 feed_state; /* Feed state (TX) for EoR, RR packets, see BFS_* */ + u8 load_state; /* Load state (RX) for EoR, RR packets, see BFS_* */ }; struct bgp_prefix { - struct { - ip_addr prefix; - int pxlen; - } n; + node buck_node; /* Node in per-bucket list */ + struct bgp_prefix *next; /* Node in prefix hash table */ + u32 hash; u32 path_id; - struct bgp_prefix *next; - node bucket_node; /* Node in per-bucket list */ + net_addr net[0]; }; struct bgp_bucket { node send_node; /* Node in send queue */ - struct bgp_bucket *hash_next, *hash_prev; /* Node in bucket hash table */ - unsigned hash; /* Hash over extended attributes */ - list prefixes; /* Prefixes in this buckets */ + struct bgp_bucket *next; /* Node in bucket hash table */ + list prefixes; /* Prefixes in this bucket (struct bgp_prefix) */ + u32 hash; /* Hash over extended attributes */ ea_list eattrs[0]; /* Per-bucket extended attributes */ }; +struct bgp_export_state { + struct bgp_proto *proto; + struct bgp_channel *channel; + struct linpool *pool; + + struct bgp_proto *src; + rte *route; + int mpls; + + u32 attrs_seen[1]; + uint err_withdraw; +}; + +struct bgp_write_state { + struct bgp_proto *proto; + struct bgp_channel *channel; + struct linpool *pool; + + int as4_session; + int add_path; + int mpls; + + eattr *mp_next_hop; + adata *mpls_labels; +}; + +struct bgp_parse_state { + struct bgp_proto *proto; + struct bgp_channel *channel; + struct linpool *pool; + + int as4_session; + int add_path; + int mpls; + + u32 attrs_seen[256/32]; + + u32 mp_reach_af; + u32 mp_unreach_af; + + uint attr_len; + uint ip_reach_len; + uint ip_unreach_len; + uint ip_next_hop_len; + uint mp_reach_len; + uint mp_unreach_len; + uint mp_next_hop_len; + + byte *attrs; + byte *ip_reach_nlri; + byte *ip_unreach_nlri; + byte *ip_next_hop_data; + byte *mp_reach_nlri; + byte *mp_unreach_nlri; + byte *mp_next_hop_data; + + uint err_withdraw; + uint err_subcode; + jmp_buf err_jmpbuf; + + struct hostentry *hostentry; + adata *mpls_labels; + + /* Cached state for bgp_rte_update() */ + u32 
last_id; + struct rte_src *last_src; + rta *cached_rta; +}; + #define BGP_PORT 179 #define BGP_VERSION 4 #define BGP_HEADER_LENGTH 19 @@ -192,13 +397,33 @@ struct bgp_bucket { #define BGP_RX_BUFFER_EXT_SIZE 65535 #define BGP_TX_BUFFER_EXT_SIZE 65535 -static inline uint bgp_max_packet_length(struct bgp_proto *p) -{ return p->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; } +static inline int bgp_channel_is_ipv4(struct bgp_channel *c) +{ return BGP_AFI(c->afi) == BGP_AFI_IPV4; } + +static inline int bgp_channel_is_ipv6(struct bgp_channel *c) +{ return BGP_AFI(c->afi) == BGP_AFI_IPV6; } + +static inline int bgp_cc_is_ipv4(struct bgp_channel_config *c) +{ return BGP_AFI(c->afi) == BGP_AFI_IPV4; } + +static inline int bgp_cc_is_ipv6(struct bgp_channel_config *c) +{ return BGP_AFI(c->afi) == BGP_AFI_IPV6; } + +static inline uint bgp_max_packet_length(struct bgp_conn *conn) +{ return conn->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; } + +static inline void +bgp_parse_error(struct bgp_parse_state *s, uint subcode) +{ + s->err_subcode = subcode; + longjmp(s->err_jmpbuf, 1); +} extern struct linpool *bgp_linpool; +extern struct linpool *bgp_linpool2; -void bgp_start_timer(struct timer *t, int value); +void bgp_start_timer(timer *t, uint value); void bgp_check_config(struct bgp_config *c); void bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int len); void bgp_close_conn(struct bgp_conn *c); @@ -208,9 +433,9 @@ void bgp_conn_enter_established_state(struct bgp_conn *conn); void bgp_conn_enter_close_state(struct bgp_conn *conn); void bgp_conn_enter_idle_state(struct bgp_conn *conn); void bgp_handle_graceful_restart(struct bgp_proto *p); -void bgp_graceful_restart_done(struct bgp_proto *p); -void bgp_refresh_begin(struct bgp_proto *p); -void bgp_refresh_end(struct bgp_proto *p); +void bgp_graceful_restart_done(struct bgp_channel *c); +void bgp_refresh_begin(struct bgp_channel *c); +void bgp_refresh_end(struct bgp_channel *c); void bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code); void bgp_stop(struct bgp_proto *p, unsigned subcode); @@ -233,48 +458,71 @@ struct rte_source *bgp_get_source(struct bgp_proto *p, u32 path_id); /* attrs.c */ -/* Hack: although BA_NEXT_HOP attribute has type EAF_TYPE_IP_ADDRESS, in IPv6 - * we store two addesses in it - a global address and a link local address. 
- */ -#ifdef IPV6 -#define NEXT_HOP_LENGTH (2*sizeof(ip_addr)) -static inline void set_next_hop(byte *b, ip_addr addr) { ((ip_addr *) b)[0] = addr; ((ip_addr *) b)[1] = IPA_NONE; } -#else -#define NEXT_HOP_LENGTH sizeof(ip_addr) -static inline void set_next_hop(byte *b, ip_addr addr) { ((ip_addr *) b)[0] = addr; } -#endif +static inline eattr * +bgp_find_attr(ea_list *attrs, uint code) +{ + return ea_find(attrs, EA_CODE(EAP_BGP, code)); +} + +eattr * +bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintptr_t val); + +static inline void +bgp_set_attr_u32(ea_list **to, struct linpool *pool, uint code, uint flags, u32 val) +{ bgp_set_attr(to, pool, code, flags, (uintptr_t) val); } + +static inline void +bgp_set_attr_ptr(ea_list **to, struct linpool *pool, uint code, uint flags, struct adata *val) +{ bgp_set_attr(to, pool, code, flags, (uintptr_t) val); } + +static inline void +bgp_set_attr_data(ea_list **to, struct linpool *pool, uint code, uint flags, void *data, uint len) +{ + struct adata *a = lp_alloc_adata(pool, len); + memcpy(a->data, data, len); + bgp_set_attr(to, pool, code, flags, (uintptr_t) a); +} + +static inline void +bgp_unset_attr(ea_list **to, struct linpool *pool, uint code) +{ eattr *e = bgp_set_attr(to, pool, code, 0, 0); e->type = EAF_TYPE_UNDEF; } + + +int bgp_encode_attrs(struct bgp_write_state *s, ea_list *attrs, byte *buf, byte *end); +ea_list * bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len); + +void bgp_init_bucket_table(struct bgp_channel *c); +void bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b); +void bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b); +void bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b); + +void bgp_init_prefix_table(struct bgp_channel *c); +void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *bp); -void bgp_attach_attr(struct ea_list **to, struct linpool *pool, unsigned attr, uintptr_t val); -byte *bgp_attach_attr_wa(struct ea_list **to, struct linpool *pool, unsigned attr, unsigned len); -struct rta *bgp_decode_attrs(struct bgp_conn *conn, byte *a, uint len, struct linpool *pool, int mandatory); -int bgp_get_attr(struct eattr *e, byte *buf, int buflen); int bgp_rte_better(struct rte *, struct rte *); int bgp_rte_mergable(rte *pri, rte *sec); int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best); -void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs); +void bgp_rt_notify(struct proto *P, struct channel *C, net *n, rte *new, rte *old, ea_list *attrs); int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *); -void bgp_init_bucket_table(struct bgp_proto *); -void bgp_free_bucket_table(struct bgp_proto *p); -void bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck); -void bgp_init_prefix_table(struct bgp_proto *p, u32 order); -void bgp_free_prefix_table(struct bgp_proto *p); -void bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp); -uint bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains); +int bgp_get_attr(struct eattr *e, byte *buf, int buflen); void bgp_get_route_info(struct rte *, byte *buf, struct ea_list *attrs); -inline static void bgp_attach_attr_ip(struct ea_list **to, struct linpool *pool, unsigned attr, ip_addr a) -{ *(ip_addr *) bgp_attach_attr_wa(to, pool, attr, sizeof(ip_addr)) = a; } /* packets.c */ void mrt_dump_bgp_state_change(struct bgp_conn *conn, unsigned old, unsigned new); 
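/*
 * Illustrative sketch (editorial, not part of the patch): how the
 * bgp_set_attr_u32()/bgp_set_attr_data() and bgp_find_attr() helpers
 * declared above are meant to fit together. The function name and the
 * concrete values are invented for the example; flags are passed as 0
 * here, while real callers supply the appropriate BAF_ flag values.
 * NEXT_HOP data is stored as one or two raw ip_addr values, as done
 * elsewhere in this patch.
 */
static void
example_build_attrs(ea_list **attrs, struct linpool *pool, ip_addr next_hop)
{
  /* Mandatory well-known attributes */
  bgp_set_attr_u32(attrs, pool, BA_ORIGIN, 0, ORIGIN_IGP);
  bgp_set_attr_u32(attrs, pool, BA_LOCAL_PREF, 0, 100);

  /* NEXT_HOP carries raw address data (16 or 32 bytes) */
  bgp_set_attr_data(attrs, pool, BA_NEXT_HOP, 0, &next_hop, sizeof(next_hop));

  /* Attributes are later looked up by code, not by name */
  eattr *nh = bgp_find_attr(*attrs, BA_NEXT_HOP);
  if (!nh)
    return; /* not reached; shown only to demonstrate the lookup */
}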
-void bgp_schedule_packet(struct bgp_conn *conn, int type); +const struct bgp_af_desc *bgp_get_af_desc(u32 afi); +const struct bgp_af_caps *bgp_find_af_caps(struct bgp_caps *caps, u32 afi); +void bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type); void bgp_kick_tx(void *vconn); void bgp_tx(struct birdsock *sk); int bgp_rx(struct birdsock *sk, uint size); const char * bgp_error_dsc(unsigned code, unsigned subcode); void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len); +void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to); + + /* Packet types */ #define PKT_OPEN 0x01 @@ -292,26 +540,25 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi #define BAF_PARTIAL 0x20 #define BAF_EXT_LEN 0x10 -#define BA_ORIGIN 0x01 /* [RFC1771] */ /* WM */ +#define BA_ORIGIN 0x01 /* RFC 4271 */ /* WM */ #define BA_AS_PATH 0x02 /* WM */ #define BA_NEXT_HOP 0x03 /* WM */ #define BA_MULTI_EXIT_DISC 0x04 /* ON */ #define BA_LOCAL_PREF 0x05 /* WD */ #define BA_ATOMIC_AGGR 0x06 /* WD */ #define BA_AGGREGATOR 0x07 /* OT */ -#define BA_COMMUNITY 0x08 /* [RFC1997] */ /* OT */ -#define BA_ORIGINATOR_ID 0x09 /* [RFC1966] */ /* ON */ -#define BA_CLUSTER_LIST 0x0a /* ON */ -/* We don't support these: */ -#define BA_DPA 0x0b /* ??? */ -#define BA_ADVERTISER 0x0c /* [RFC1863] */ -#define BA_RCID_PATH 0x0d -#define BA_MP_REACH_NLRI 0x0e /* [RFC2283] */ -#define BA_MP_UNREACH_NLRI 0x0f -#define BA_EXT_COMMUNITY 0x10 /* [RFC4360] */ -#define BA_AS4_PATH 0x11 /* [RFC4893] */ -#define BA_AS4_AGGREGATOR 0x12 -#define BA_LARGE_COMMUNITY 0x20 /* [RFC8092] */ +#define BA_COMMUNITY 0x08 /* RFC 1997 */ /* OT */ +#define BA_ORIGINATOR_ID 0x09 /* RFC 4456 */ /* ON */ +#define BA_CLUSTER_LIST 0x0a /* RFC 4456 */ /* ON */ +#define BA_MP_REACH_NLRI 0x0e /* RFC 4760 */ +#define BA_MP_UNREACH_NLRI 0x0f /* RFC 4760 */ +#define BA_EXT_COMMUNITY 0x10 /* RFC 4360 */ +#define BA_AS4_PATH 0x11 /* RFC 6793 */ +#define BA_AS4_AGGREGATOR 0x12 /* RFC 6793 */ +#define BA_LARGE_COMMUNITY 0x20 /* RFC 8092 */ + +/* Bird's private internal BGP attributes */ +#define BA_MPLS_LABEL_STACK 0xfe /* MPLS label stack transfer attribute */ /* BGP connection states */ @@ -331,14 +578,12 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi * * When BGP protocol is started by core, it goes to BSS_PREPARE. When BGP * protocol done what is neccessary to start itself (like acquiring the lock), - * it goes to BSS_CONNECT. When some connection attempt failed because of - * option or capability error, it goes to BSS_CONNECT_NOCAP. + * it goes to BSS_CONNECT. */ #define BSS_PREPARE 0 /* Used before ordinary BGP started, i. e. waiting for lock */ #define BSS_DELAY 1 /* Startup delay due to previous errors */ #define BSS_CONNECT 2 /* Ordinary BGP connecting */ -#define BSS_CONNECT_NOCAP 3 /* Legacy BGP connecting (without capabilities) */ /* BGP feed states (TX) @@ -347,7 +592,7 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi * * RFC 7313 specifies that a route refresh should be demarcated by BoRR and EoRR packets. * - * These states (stored in p->feed_state) are used to keep track of these + * These states (stored in c->feed_state) are used to keep track of these * requirements. When such feed is started, BFS_LOADING / BFS_REFRESHING is * set. When it ended, BFS_LOADED / BFS_REFRESHED is set to schedule End-of-RIB * or EoRR packet. 
When the packet is sent, the state returned to BFS_NONE. @@ -403,15 +648,5 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi #define ORIGIN_EGP 1 #define ORIGIN_INCOMPLETE 2 -/* Address families */ - -#define BGP_AF_IPV4 1 -#define BGP_AF_IPV6 2 - -#ifdef IPV6 -#define BGP_AF BGP_AF_IPV6 -#else -#define BGP_AF BGP_AF_IPV4 -#endif #endif diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 55c602f1..941ae5b6 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -13,28 +13,32 @@ CF_HDR CF_DEFINES #define BGP_CFG ((struct bgp_config *) this_proto) +#define BGP_CC ((struct bgp_channel_config *) this_channel) CF_DECLS -CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, - KEEPALIVE, MULTIHOP, STARTUP, VIA, NEXT, HOP, SELF, DEFAULT, - PATH, METRIC, ERROR, START, DELAY, FORGET, WAIT, ENABLE, - DISABLE, AFTER, BGP_PATH, BGP_LOCAL_PREF, BGP_MED, BGP_ORIGIN, - BGP_NEXT_HOP, BGP_ATOMIC_AGGR, BGP_AGGREGATOR, BGP_COMMUNITY, - BGP_EXT_COMMUNITY, SOURCE, ADDRESS, PASSWORD, RR, RS, CLIENT, - CLUSTER, ID, AS4, ADVERTISE, IPV4, CAPABILITIES, LIMIT, PASSIVE, - PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH, - INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP, - TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC, - SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX, GRACEFUL, RESTART, AWARE, - CHECK, LINK, PORT, EXTENDED, MESSAGES, SETKEY, BGP_LARGE_COMMUNITY) +CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE, + MULTIHOP, STARTUP, VIA, NEXT, HOP, SELF, DEFAULT, PATH, METRIC, ERROR, + START, DELAY, FORGET, WAIT, ENABLE, DISABLE, AFTER, BGP_PATH, + BGP_LOCAL_PREF, BGP_MED, BGP_ORIGIN, BGP_NEXT_HOP, BGP_ATOMIC_AGGR, + BGP_AGGREGATOR, BGP_COMMUNITY, BGP_EXT_COMMUNITY, BGP_LARGE_COMMUNITY, + SOURCE, ADDRESS, PASSWORD, RR, RS, CLIENT, CLUSTER, ID, AS4, ADVERTISE, + IPV4, CAPABILITIES, LIMIT, PASSIVE, PREFER, OLDER, MISSING, LLADDR, + DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, + BGP_CLUSTER_LIST, IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, + SECURITY, DETERMINISTIC, SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX, + GRACEFUL, RESTART, AWARE, CHECK, LINK, PORT, EXTENDED, MESSAGES, SETKEY, + STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6) + +%type <i32> bgp_afi CF_GRAMMAR -CF_ADDTO(proto, bgp_proto '}' { bgp_check_config(BGP_CFG); } ) +CF_ADDTO(proto, bgp_proto '}' ) bgp_proto_start: proto_start BGP { this_proto = proto_config_new(&proto_bgp, $1); + BGP_CFG->local_port = BGP_PORT; BGP_CFG->remote_port = BGP_PORT; BGP_CFG->multihop = -1; /* undefined */ BGP_CFG->hold_time = 240; @@ -49,26 +53,35 @@ bgp_proto_start: proto_start BGP { BGP_CFG->enable_refresh = 1; BGP_CFG->enable_as4 = 1; BGP_CFG->capabilities = 2; - BGP_CFG->advertise_ipv4 = 1; BGP_CFG->interpret_communities = 1; BGP_CFG->default_local_pref = 100; BGP_CFG->gr_mode = BGP_GR_AWARE; BGP_CFG->gr_time = 120; BGP_CFG->setkey = 1; - } + } + ; + +bgp_loc_opts: + /* empty */ + | bgp_loc_opts PORT expr { BGP_CFG->local_port = $3; if (($3<1) || ($3>65535)) cf_error("Invalid port number"); } + | bgp_loc_opts AS expr { BGP_CFG->local_as = $3; } ; bgp_nbr_opts: /* empty */ - | bgp_nbr_opts PORT expr { BGP_CFG->remote_port = $3; if (($3<1) || ($3>65535)) cf_error("Invalid port number"); } + | bgp_nbr_opts PORT expr { BGP_CFG->remote_port = $3; if (($3<1) || ($3>65535)) cf_error("Invalid port number"); } | bgp_nbr_opts AS expr { BGP_CFG->remote_as = $3; } ; bgp_proto: bgp_proto_start proto_name '{' | 
bgp_proto proto_item ';' - | bgp_proto LOCAL AS expr ';' { BGP_CFG->local_as = $4; } - | bgp_proto LOCAL ipa AS expr ';' { BGP_CFG->source_addr = $3; BGP_CFG->local_as = $5; } + | bgp_proto bgp_proto_channel ';' + | bgp_proto LOCAL bgp_loc_opts ';' + | bgp_proto LOCAL ipa ipa_scope bgp_loc_opts ';' { + BGP_CFG->local_ip = $3; + if ($4) BGP_CFG->iface = $4; + } | bgp_proto NEIGHBOR bgp_nbr_opts ';' | bgp_proto NEIGHBOR ipa ipa_scope bgp_nbr_opts ';' { if (ipa_nonzero(BGP_CFG->remote_ip)) @@ -78,20 +91,16 @@ bgp_proto: } | bgp_proto INTERFACE TEXT ';' { BGP_CFG->iface = if_get_by_name($3); } | bgp_proto RR CLUSTER ID idval ';' { BGP_CFG->rr_cluster_id = $5; } - | bgp_proto RR CLIENT ';' { BGP_CFG->rr_client = 1; } - | bgp_proto RS CLIENT ';' { BGP_CFG->rs_client = 1; } + | bgp_proto RR CLIENT bool ';' { BGP_CFG->rr_client = $4; } + | bgp_proto RS CLIENT bool ';' { BGP_CFG->rs_client = $4; } + | bgp_proto CONFEDERATION expr ';' { BGP_CFG->confederation = $3; } + | bgp_proto CONFEDERATION MEMBER bool ';' { BGP_CFG->confederation_member = $4; } | bgp_proto HOLD TIME expr ';' { BGP_CFG->hold_time = $4; } | bgp_proto STARTUP HOLD TIME expr ';' { BGP_CFG->initial_hold_time = $5; } | bgp_proto DIRECT ';' { BGP_CFG->multihop = 0; } | bgp_proto MULTIHOP ';' { BGP_CFG->multihop = 64; } | bgp_proto MULTIHOP expr ';' { BGP_CFG->multihop = $3; if (($3<1) || ($3>255)) cf_error("Multihop must be in range 1-255"); } - | bgp_proto NEXT HOP SELF ';' { BGP_CFG->next_hop_self = 1; BGP_CFG->next_hop_keep = 0; } - | bgp_proto NEXT HOP KEEP ';' { BGP_CFG->next_hop_keep = 1; BGP_CFG->next_hop_self = 0; } - | bgp_proto MISSING LLADDR SELF ';' { BGP_CFG->missing_lladdr = MLL_SELF; } - | bgp_proto MISSING LLADDR DROP ';' { BGP_CFG->missing_lladdr = MLL_DROP; } - | bgp_proto MISSING LLADDR IGNORE ';' { BGP_CFG->missing_lladdr = MLL_IGNORE; } - | bgp_proto GATEWAY DIRECT ';' { BGP_CFG->gw_mode = GW_DIRECT; } - | bgp_proto GATEWAY RECURSIVE ';' { BGP_CFG->gw_mode = GW_RECURSIVE; } + | bgp_proto STRICT BIND bool ';' { BGP_CFG->strict_bind = $4; } | bgp_proto PATH METRIC bool ';' { BGP_CFG->compare_path_lengths = $4; } | bgp_proto MED METRIC bool ';' { BGP_CFG->med_metric = $4; } | bgp_proto IGP METRIC bool ';' { BGP_CFG->igp_metric = $4; } @@ -99,7 +108,7 @@ bgp_proto: | bgp_proto DETERMINISTIC MED bool ';' { BGP_CFG->deterministic_med = $4; } | bgp_proto DEFAULT BGP_MED expr ';' { BGP_CFG->default_med = $4; } | bgp_proto DEFAULT BGP_LOCAL_PREF expr ';' { BGP_CFG->default_local_pref = $4; } - | bgp_proto SOURCE ADDRESS ipa ';' { BGP_CFG->source_addr = $4; } + | bgp_proto SOURCE ADDRESS ipa ';' { BGP_CFG->local_ip = $4; } | bgp_proto START DELAY TIME expr ';' { BGP_CFG->connect_delay_time = $5; log(L_WARN "%s: Start delay time option is deprecated, use connect delay time", this_proto->name); } | bgp_proto CONNECT DELAY TIME expr ';' { BGP_CFG->connect_delay_time = $5; } | bgp_proto CONNECT RETRY TIME expr ';' { BGP_CFG->connect_retry_time = $5; } @@ -111,33 +120,101 @@ bgp_proto: | bgp_proto ENABLE AS4 bool ';' { BGP_CFG->enable_as4 = $4; } | bgp_proto ENABLE EXTENDED MESSAGES bool ';' { BGP_CFG->enable_extended_messages = $5; } | bgp_proto CAPABILITIES bool ';' { BGP_CFG->capabilities = $3; } - | bgp_proto ADVERTISE IPV4 bool ';' { BGP_CFG->advertise_ipv4 = $4; } | bgp_proto PASSWORD text ';' { BGP_CFG->password = $3; } | bgp_proto SETKEY bool ';' { BGP_CFG->setkey = $3; } - | bgp_proto ROUTE LIMIT expr ';' { - this_proto->in_limit = cfg_allocz(sizeof(struct proto_limit)); - this_proto->in_limit->limit = $4; - 
this_proto->in_limit->action = PLA_RESTART; - log(L_WARN "%s: Route limit option is deprecated, use import limit", this_proto->name); - } | bgp_proto PASSIVE bool ';' { BGP_CFG->passive = $3; } | bgp_proto INTERPRET COMMUNITIES bool ';' { BGP_CFG->interpret_communities = $4; } - | bgp_proto SECONDARY bool ';' { BGP_CFG->secondary = $3; } - | bgp_proto ADD PATHS RX ';' { BGP_CFG->add_path = ADD_PATH_RX; } - | bgp_proto ADD PATHS TX ';' { BGP_CFG->add_path = ADD_PATH_TX; } - | bgp_proto ADD PATHS bool ';' { BGP_CFG->add_path = $4 ? ADD_PATH_FULL : 0; } - | bgp_proto ALLOW BGP_LOCAL_PREF bool ';' { BGP_CFG->allow_local_pref = $4; } | bgp_proto ALLOW LOCAL AS ';' { BGP_CFG->allow_local_as = -1; } | bgp_proto ALLOW LOCAL AS expr ';' { BGP_CFG->allow_local_as = $5; } + | bgp_proto ALLOW BGP_LOCAL_PREF bool ';' { BGP_CFG->allow_local_pref = $4; } | bgp_proto GRACEFUL RESTART bool ';' { BGP_CFG->gr_mode = $4; } | bgp_proto GRACEFUL RESTART AWARE ';' { BGP_CFG->gr_mode = BGP_GR_AWARE; } | bgp_proto GRACEFUL RESTART TIME expr ';' { BGP_CFG->gr_time = $5; } - | bgp_proto IGP TABLE rtable ';' { BGP_CFG->igp_table = $4; } | bgp_proto TTL SECURITY bool ';' { BGP_CFG->ttl_security = $4; } | bgp_proto CHECK LINK bool ';' { BGP_CFG->check_link = $4; } | bgp_proto BFD bool ';' { BGP_CFG->bfd = $3; cf_check_bfd($3); } ; +bgp_afi: + IPV4 { $$ = BGP_AF_IPV4; } + | IPV6 { $$ = BGP_AF_IPV6; } + | IPV4 MULTICAST { $$ = BGP_AF_IPV4_MC; } + | IPV6 MULTICAST { $$ = BGP_AF_IPV6_MC; } + | IPV4 MPLS { $$ = BGP_AF_IPV4_MPLS; } + | IPV6 MPLS { $$ = BGP_AF_IPV6_MPLS; } + | VPN4 MPLS { $$ = BGP_AF_VPN4_MPLS; } + | VPN6 MPLS { $$ = BGP_AF_VPN6_MPLS; } + | VPN4 MULTICAST { $$ = BGP_AF_VPN4_MC; } + | VPN6 MULTICAST { $$ = BGP_AF_VPN6_MC; } + | FLOW4 { $$ = BGP_AF_FLOW4; } + | FLOW6 { $$ = BGP_AF_FLOW6; } + ; + +bgp_channel_start: bgp_afi +{ + const struct bgp_af_desc *desc = bgp_get_af_desc($1); + + if (!desc) + cf_error("Unknown AFI/SAFI"); + + this_channel = channel_config_new(&channel_bgp, desc->net, this_proto); + BGP_CC->c.name = desc->name; + BGP_CC->c.ra_mode = RA_UNDEF; + BGP_CC->afi = $1; + BGP_CC->desc = desc; + BGP_CC->gr_able = 0xff; /* undefined */ +}; + +bgp_channel_item: + channel_item + | NEXT HOP ADDRESS ipa { BGP_CC->next_hop_addr = $4; } + | NEXT HOP SELF { BGP_CC->next_hop_self = 1; BGP_CC->next_hop_keep = 0; } + | NEXT HOP KEEP { BGP_CC->next_hop_keep = 1; BGP_CC->next_hop_self = 0; } + | MISSING LLADDR SELF { BGP_CC->missing_lladdr = MLL_SELF; } + | MISSING LLADDR DROP { BGP_CC->missing_lladdr = MLL_DROP; } + | MISSING LLADDR IGNORE { BGP_CC->missing_lladdr = MLL_IGNORE; } + | GATEWAY DIRECT { BGP_CC->gw_mode = GW_DIRECT; } + | GATEWAY RECURSIVE { BGP_CC->gw_mode = GW_RECURSIVE; } + | SECONDARY bool { BGP_CC->secondary = $2; } + | GRACEFUL RESTART bool { BGP_CC->gr_able = $3; } + | EXTENDED NEXT HOP bool { BGP_CC->ext_next_hop = $4; } + | ADD PATHS RX { BGP_CC->add_path = BGP_ADD_PATH_RX; } + | ADD PATHS TX { BGP_CC->add_path = BGP_ADD_PATH_TX; } + | ADD PATHS bool { BGP_CC->add_path = $3 ? 
BGP_ADD_PATH_FULL : 0; } + | IGP TABLE rtable { + if (BGP_CC->desc->no_igp) + cf_error("IGP table not allowed here"); + + if ($3->addr_type == NET_IP4) + BGP_CC->igp_table_ip4 = $3; + else if ($3->addr_type == NET_IP6) + BGP_CC->igp_table_ip6 = $3; + else + cf_error("Mismatched IGP table type"); + } + ; + +bgp_channel_opts: + /* empty */ + | bgp_channel_opts bgp_channel_item ';' + ; + +bgp_channel_opt_list: + /* empty */ + | '{' bgp_channel_opts '}' + ; + +bgp_channel_end: +{ + if (!this_channel->table) + cf_error("Routing table not specified"); + + this_channel = NULL; +}; + +bgp_proto_channel: bgp_channel_start bgp_channel_opt_list bgp_channel_end; + + CF_ADDTO(dynamic_attr, BGP_ORIGIN { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_ENUM_BGP_ORIGIN, EA_CODE(EAP_BGP, BA_ORIGIN)); }) CF_ADDTO(dynamic_attr, BGP_PATH diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index ab87bdcc..0e974746 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -2,12 +2,16 @@ * BIRD -- BGP Packet Processing * * (c) 2000 Martin Mares <mj@ucw.cz> + * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2008--2016 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ #undef LOCAL_DEBUG +#include <stdlib.h> + #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" @@ -16,6 +20,7 @@ #include "nest/mrtdump.h" #include "conf/conf.h" #include "lib/unaligned.h" +#include "lib/flowspec.h" #include "lib/socket.h" #include "nest/cli.h" @@ -27,6 +32,13 @@ #define BGP_RR_BEGIN 1 #define BGP_RR_END 2 +#define BGP_NLRI_MAX (4 + 1 + 32) + +#define BGP_MPLS_BOS 1 /* Bottom-of-stack bit */ +#define BGP_MPLS_MAX 10 /* Max number of labels that 24*n <= 255 */ +#define BGP_MPLS_NULL 3 /* Implicit NULL label */ +#define BGP_MPLS_MAGIC 0x800000 /* Magic withdraw label value, RFC 3107 3 */ + static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS; static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS; @@ -38,6 +50,46 @@ static byte fsm_err_subcode[BS_MAX] = { [BS_ESTABLISHED] = 3 }; + +static struct bgp_channel * +bgp_get_channel(struct bgp_proto *p, u32 afi) +{ + uint i; + + for (i = 0; i < p->channel_count; i++) + if (p->afi_map[i] == afi) + return p->channel_map[i]; + + return NULL; +} + +static inline void +put_af3(byte *buf, u32 id) +{ + put_u16(buf, id >> 16); + buf[2] = id & 0xff; +} + +static inline void +put_af4(byte *buf, u32 id) +{ + put_u16(buf, id >> 16); + buf[2] = 0; + buf[3] = id & 0xff; +} + +static inline u32 +get_af3(byte *buf) +{ + return (get_u16(buf) << 16) | buf[2]; +} + +static inline u32 +get_af4(byte *buf) +{ + return (get_u16(buf) << 16) | buf[3]; +} + /* * MRT Dump format is not semantically specified. * We will use these values in appropriate fields: @@ -58,31 +110,41 @@ static byte * mrt_put_bgp4_hdr(byte *buf, struct bgp_conn *conn, int as4) { struct bgp_proto *p = conn->bgp; + uint v4 = ipa_is_ip4(p->cf->remote_ip); if (as4) - { - put_u32(buf+0, p->remote_as); - put_u32(buf+4, p->local_as); - buf+=8; - } + { + put_u32(buf+0, p->remote_as); + put_u32(buf+4, p->public_as); + buf+=8; + } else - { - put_u16(buf+0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS); - put_u16(buf+2, (p->local_as <= 0xFFFF) ? p->local_as : AS_TRANS); - buf+=4; - } + { + put_u16(buf+0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS); + put_u16(buf+2, (p->public_as <= 0xFFFF) ? p->public_as : AS_TRANS); + buf+=4; + } put_u16(buf+0, (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0); - put_u16(buf+2, BGP_AF); + put_u16(buf+2, v4 ? 
BGP_AFI_IPV4 : BGP_AFI_IPV6); buf+=4; - buf = put_ipa(buf, conn->sk ? conn->sk->daddr : IPA_NONE); - buf = put_ipa(buf, conn->sk ? conn->sk->saddr : IPA_NONE); + + if (v4) + { + buf = put_ip4(buf, conn->sk ? ipa_to_ip4(conn->sk->daddr) : IP4_NONE); + buf = put_ip4(buf, conn->sk ? ipa_to_ip4(conn->sk->saddr) : IP4_NONE); + } + else + { + buf = put_ip6(buf, conn->sk ? ipa_to_ip6(conn->sk->daddr) : IP6_NONE); + buf = put_ip6(buf, conn->sk ? ipa_to_ip6(conn->sk->saddr) : IP6_NONE); + } return buf; } static void -mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len) +mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, uint len) { byte *buf = alloca(128+len); /* 128 is enough for MRT headers */ byte *bp = buf + MRTDUMP_HDR_LENGTH; @@ -96,14 +158,14 @@ mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len) } static inline u16 -convert_state(unsigned state) +convert_state(uint state) { /* Convert state from our BS_* values to values used in MRTDump */ return (state == BS_CLOSE) ? 1 : state + 1; } void -mrt_dump_bgp_state_change(struct bgp_conn *conn, unsigned old, unsigned new) +mrt_dump_bgp_state_change(struct bgp_conn *conn, uint old, uint new) { byte buf[128]; byte *bp = buf + MRTDUMP_HDR_LENGTH; @@ -127,1303 +189,2426 @@ bgp_create_notification(struct bgp_conn *conn, byte *buf) return buf + 2 + conn->notify_size; } -#ifdef IPV6 -static byte * -bgp_put_cap_ipv6(struct bgp_proto *p UNUSED, byte *buf) -{ - *buf++ = 1; /* Capability 1: Multiprotocol extensions */ - *buf++ = 4; /* Capability data length */ - *buf++ = 0; /* We support AF IPv6 */ - *buf++ = BGP_AF_IPV6; - *buf++ = 0; /* RFU */ - *buf++ = 1; /* and SAFI 1 */ - return buf; -} -#else +/* Capability negotiation as per RFC 5492 */ -static byte * -bgp_put_cap_ipv4(struct bgp_proto *p UNUSED, byte *buf) -{ - *buf++ = 1; /* Capability 1: Multiprotocol extensions */ - *buf++ = 4; /* Capability data length */ - *buf++ = 0; /* We support AF IPv4 */ - *buf++ = BGP_AF_IPV4; - *buf++ = 0; /* RFU */ - *buf++ = 1; /* and SAFI 1 */ - return buf; +const struct bgp_af_caps * +bgp_find_af_caps(struct bgp_caps *caps, u32 afi) +{ + struct bgp_af_caps *ac; + + WALK_AF_CAPS(caps, ac) + if (ac->afi == afi) + return ac; + + return NULL; } -#endif -static byte * -bgp_put_cap_rr(struct bgp_proto *p UNUSED, byte *buf) +static struct bgp_af_caps * +bgp_get_af_caps(struct bgp_caps *caps, u32 afi) { - *buf++ = 2; /* Capability 2: Support for route refresh */ - *buf++ = 0; /* Capability data length */ - return buf; + struct bgp_af_caps *ac; + + WALK_AF_CAPS(caps, ac) + if (ac->afi == afi) + return ac; + + ac = &caps->af_data[caps->af_count++]; + memset(ac, 0, sizeof(struct bgp_af_caps)); + ac->afi = afi; + + return ac; } -static byte * -bgp_put_cap_ext_msg(struct bgp_proto *p UNUSED, byte *buf) +static int +bgp_af_caps_cmp(const void *X, const void *Y) { - *buf++ = 6; /* Capability 6: Support for extended messages */ - *buf++ = 0; /* Capability data length */ - return buf; + const struct bgp_af_caps *x = X, *y = Y; + return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 
1 : 0; } + static byte * -bgp_put_cap_gr1(struct bgp_proto *p, byte *buf) +bgp_write_capabilities(struct bgp_conn *conn, byte *buf) { - *buf++ = 64; /* Capability 64: Support for graceful restart */ - *buf++ = 6; /* Capability data length */ + struct bgp_proto *p = conn->bgp; + struct bgp_channel *c; + struct bgp_caps *caps; + struct bgp_af_caps *ac; + uint any_ext_next_hop = 0; + uint any_add_path = 0; + byte *data; - put_u16(buf, p->cf->gr_time); - if (p->p.gr_recovery) - buf[0] |= BGP_GRF_RESTART; - buf += 2; + /* Prepare bgp_caps structure */ + + int n = list_length(&p->p.channels); + caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + n * sizeof(struct bgp_af_caps)); + conn->local_caps = caps; + + caps->as4_support = p->cf->enable_as4; + caps->ext_messages = p->cf->enable_extended_messages; + caps->route_refresh = p->cf->enable_refresh; + caps->enhanced_refresh = p->cf->enable_refresh; + + if (caps->as4_support) + caps->as4_number = p->public_as; + + if (p->cf->gr_mode) + { + caps->gr_aware = 1; + caps->gr_time = p->cf->gr_time; + caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0; + } + + /* Allocate and fill per-AF fields */ + WALK_LIST(c, p->p.channels) + { + ac = &caps->af_data[caps->af_count++]; + ac->afi = c->afi; + ac->ready = 1; + + ac->ext_next_hop = bgp_channel_is_ipv4(c) && c->cf->ext_next_hop; + any_ext_next_hop |= ac->ext_next_hop; + + ac->add_path = c->cf->add_path; + any_add_path |= ac->add_path; + + if (c->cf->gr_able) + { + ac->gr_able = 1; + + if (p->p.gr_recovery) + ac->gr_af_flags |= BGP_GRF_FORWARDING; + } + } + + /* Sort capability fields by AFI/SAFI */ + qsort(caps->af_data, caps->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp); - *buf++ = 0; /* Appropriate AF */ - *buf++ = BGP_AF; - *buf++ = 1; /* and SAFI 1 */ - *buf++ = p->p.gr_recovery ? BGP_GRF_FORWARDING : 0; + + /* Create capability list in buffer */ + + /* + * Note that max length is ~ 20+14*af_count. With max 12 channels that is + * 188. Option limit is 253 and buffer size is 4096, so we cannot overflow + * unless we add new capabilities or more AFs. 
+ */ + + WALK_AF_CAPS(caps, ac) + if (ac->ready) + { + *buf++ = 1; /* Capability 1: Multiprotocol extensions */ + *buf++ = 4; /* Capability data length */ + put_af4(buf, ac->afi); + buf += 4; + } + + if (caps->route_refresh) + { + *buf++ = 2; /* Capability 2: Support for route refresh */ + *buf++ = 0; /* Capability data length */ + } + + if (any_ext_next_hop) + { + *buf++ = 5; /* Capability 5: Support for extended next hop */ + *buf++ = 0; /* Capability data length, will be fixed later */ + data = buf; + + WALK_AF_CAPS(caps, ac) + if (ac->ext_next_hop) + { + put_af4(buf, ac->afi); + put_u16(buf+4, BGP_AFI_IPV6); + buf += 6; + } + + data[-1] = buf - data; + } + + if (caps->ext_messages) + { + *buf++ = 6; /* Capability 6: Support for extended messages */ + *buf++ = 0; /* Capability data length */ + } + + if (caps->gr_aware) + { + *buf++ = 64; /* Capability 64: Support for graceful restart */ + *buf++ = 0; /* Capability data length, will be fixed later */ + data = buf; + + put_u16(buf, caps->gr_time); + buf[0] |= caps->gr_flags; + buf += 2; + + WALK_AF_CAPS(caps, ac) + if (ac->gr_able) + { + put_af3(buf, ac->afi); + buf[3] = ac->gr_af_flags; + buf += 4; + } + + data[-1] = buf - data; + } + + if (caps->as4_support) + { + *buf++ = 65; /* Capability 65: Support for 4-octet AS number */ + *buf++ = 4; /* Capability data length */ + put_u32(buf, p->public_as); + buf += 4; + } + + if (any_add_path) + { + *buf++ = 69; /* Capability 69: Support for ADD-PATH */ + *buf++ = 0; /* Capability data length, will be fixed later */ + data = buf; + + WALK_AF_CAPS(caps, ac) + if (ac->add_path) + { + put_af3(buf, ac->afi); + buf[3] = ac->add_path; + buf += 4; + } + + data[-1] = buf - data; + } + + if (caps->enhanced_refresh) + { + *buf++ = 70; /* Capability 70: Support for enhanced route refresh */ + *buf++ = 0; /* Capability data length */ + } return buf; } -static byte * -bgp_put_cap_gr2(struct bgp_proto *p UNUSED, byte *buf) +static void +bgp_read_capabilities(struct bgp_conn *conn, struct bgp_caps *caps, byte *pos, int len) { - *buf++ = 64; /* Capability 64: Support for graceful restart */ - *buf++ = 2; /* Capability data length */ - put_u16(buf, 0); - return buf + 2; -} + struct bgp_proto *p = conn->bgp; + struct bgp_af_caps *ac; + int i, cl; + u32 af; -static byte * -bgp_put_cap_as4(struct bgp_proto *p, byte *buf) -{ - *buf++ = 65; /* Capability 65: Support for 4-octet AS number */ - *buf++ = 4; /* Capability data length */ - put_u32(buf, p->local_as); - return buf + 4; -} + while (len > 0) + { + if (len < 2 || len < (2 + pos[1])) + goto err; -static byte * -bgp_put_cap_add_path(struct bgp_proto *p, byte *buf) -{ - *buf++ = 69; /* Capability 69: Support for ADD-PATH */ - *buf++ = 4; /* Capability data length */ + /* Capability length */ + cl = pos[1]; - *buf++ = 0; /* Appropriate AF */ - *buf++ = BGP_AF; - *buf++ = 1; /* SAFI 1 */ + /* Capability type */ + switch (pos[0]) + { + case 1: /* Multiprotocol capability, RFC 4760 */ + if (cl != 4) + goto err; - *buf++ = p->cf->add_path; + af = get_af4(pos+2); + ac = bgp_get_af_caps(caps, af); + ac->ready = 1; + break; - return buf; + case 2: /* Route refresh capability, RFC 2918 */ + if (cl != 0) + goto err; + + caps->route_refresh = 1; + break; + + case 5: /* Extended next hop encoding capability, RFC 5549 */ + if (cl % 6) + goto err; + + for (i = 0; i < cl; i += 6) + { + /* Specified only for IPv4 prefixes with IPv6 next hops */ + if ((get_u16(pos+2+i+0) != BGP_AFI_IPV4) || + (get_u16(pos+2+i+4) != BGP_AFI_IPV6)) + continue; + + af = get_af4(pos+2+i); + ac = 
bgp_get_af_caps(caps, af); + ac->ext_next_hop = 1; + } + break; + + case 6: /* Extended message length capability, RFC draft */ + if (cl != 0) + goto err; + + caps->ext_messages = 1; + break; + + case 64: /* Graceful restart capability, RFC 4724 */ + if (cl % 4 != 2) + goto err; + + /* Only the last instance is valid */ + WALK_AF_CAPS(caps, ac) + { + ac->gr_able = 0; + ac->gr_af_flags = 0; + } + + caps->gr_aware = 1; + caps->gr_flags = pos[2] & 0xf0; + caps->gr_time = get_u16(pos + 2) & 0x0fff; + + for (i = 2; i < cl; i += 4) + { + af = get_af3(pos+2+i); + ac = bgp_get_af_caps(caps, af); + ac->gr_able = 1; + ac->gr_af_flags = pos[2+i+3]; + } + break; + + case 65: /* AS4 capability, RFC 6793 */ + if (cl != 4) + goto err; + + caps->as4_support = 1; + caps->as4_number = get_u32(pos + 2); + break; + + case 69: /* ADD-PATH capability, RFC 7911 */ + if (cl % 4) + goto err; + + for (i = 0; i < cl; i += 4) + { + byte val = pos[2+i+3]; + if (!val || (val > BGP_ADD_PATH_FULL)) + { + log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring", + p->p.name, val); + break; + } + } + + for (i = 0; i < cl; i += 4) + { + af = get_af3(pos+2+i); + ac = bgp_get_af_caps(caps, af); + ac->add_path = pos[2+i+3]; + } + break; + + case 70: /* Enhanced route refresh capability, RFC 7313 */ + if (cl != 0) + goto err; + + caps->enhanced_refresh = 1; + break; + + /* We can safely ignore all other capabilities */ + } + + ADVANCE(pos, len, 2 + cl); + } + return; + +err: + bgp_error(conn, 2, 0, NULL, 0); + return; } -static byte * -bgp_put_cap_err(struct bgp_proto *p UNUSED, byte *buf) +static int +bgp_read_options(struct bgp_conn *conn, byte *pos, int len) { - *buf++ = 70; /* Capability 70: Support for enhanced route refresh */ - *buf++ = 0; /* Capability data length */ - return buf; -} + struct bgp_proto *p = conn->bgp; + struct bgp_caps *caps; + int ol; + + /* Max number of announced AFIs is limited by max option length (255) */ + caps = alloca(sizeof(struct bgp_caps) + 64 * sizeof(struct bgp_af_caps)); + memset(caps, 0, sizeof(struct bgp_caps)); + + while (len > 0) + { + if ((len < 2) || (len < (2 + pos[1]))) + { bgp_error(conn, 2, 0, NULL, 0); return -1; } + ol = pos[1]; + if (pos[0] == 2) + { + /* BGP capabilities, RFC 5492 */ + if (p->cf->capabilities) + bgp_read_capabilities(conn, caps, pos + 2, ol); + } + else + { + /* Unknown option */ + bgp_error(conn, 2, 4, pos, ol); /* FIXME: ol or ol+2 ? */ + return -1; + } + + ADVANCE(pos, len, 2 + ol); + } + + uint n = sizeof(struct bgp_caps) + caps->af_count * sizeof(struct bgp_af_caps); + conn->remote_caps = mb_allocz(p->p.pool, n); + memcpy(conn->remote_caps, caps, n); + + return 0; +} static byte * bgp_create_open(struct bgp_conn *conn, byte *buf) { struct bgp_proto *p = conn->bgp; - byte *cap; - int cap_len; BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)", - BGP_VERSION, p->local_as, p->cf->hold_time, p->local_id); + BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id); + buf[0] = BGP_VERSION; - put_u16(buf+1, (p->local_as < 0xFFFF) ? p->local_as : AS_TRANS); + put_u16(buf+1, (p->public_as < 0xFFFF) ? 
p->public_as : AS_TRANS); put_u16(buf+3, p->cf->hold_time); put_u32(buf+5, p->local_id); - if (conn->start_state == BSS_CONNECT_NOCAP) - { - BGP_TRACE(D_PACKETS, "Skipping capabilities"); - buf[9] = 0; - return buf + 10; - } + if (p->cf->capabilities) + { + /* Prepare local_caps and write capabilities to buffer */ + byte *end = bgp_write_capabilities(conn, buf+12); + uint len = end - (buf+12); + + buf[9] = len + 2; /* Optional parameters length */ + buf[10] = 2; /* Option 2: Capability list */ + buf[11] = len; /* Option data length */ + + return end; + } + else + { + /* Prepare empty local_caps */ + conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps)); + + buf[9] = 0; /* No optional parameters */ + return buf + 10; + } + + return buf; +} + +static void +bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) +{ + struct bgp_proto *p = conn->bgp; + struct bgp_conn *other; + u32 asn, hold, id; - /* Skipped 3 B for length field and Capabilities parameter header */ - cap = buf + 12; + /* Check state */ + if (conn->state != BS_OPENSENT) + { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } + + /* Check message contents */ + if (len < 29 || len != 29 + (uint) pkt[28]) + { bgp_error(conn, 1, 2, pkt+16, 2); return; } -#ifndef IPV6 - if (p->cf->advertise_ipv4) - cap = bgp_put_cap_ipv4(p, cap); -#endif + if (pkt[19] != BGP_VERSION) + { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; } -#ifdef IPV6 - cap = bgp_put_cap_ipv6(p, cap); -#endif + asn = get_u16(pkt+20); + hold = get_u16(pkt+22); + id = get_u32(pkt+24); + BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id); - if (p->cf->enable_refresh) - cap = bgp_put_cap_rr(p, cap); + if (bgp_read_options(conn, pkt+29, pkt[28]) < 0) + return; - if (p->cf->gr_mode == BGP_GR_ABLE) - cap = bgp_put_cap_gr1(p, cap); - else if (p->cf->gr_mode == BGP_GR_AWARE) - cap = bgp_put_cap_gr2(p, cap); + if (hold > 0 && hold < 3) + { bgp_error(conn, 2, 6, pkt+22, 2); return; } - if (p->cf->enable_as4) - cap = bgp_put_cap_as4(p, cap); + /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */ + if (!id || (p->is_internal && id == p->local_id)) + { bgp_error(conn, 2, 3, pkt+24, -4); return; } - if (p->cf->add_path) - cap = bgp_put_cap_add_path(p, cap); + struct bgp_caps *caps = conn->remote_caps; - if (p->cf->enable_refresh) - cap = bgp_put_cap_err(p, cap); + if (caps->as4_support) + { + u32 as4 = caps->as4_number; - if (p->cf->enable_extended_messages) - cap = bgp_put_cap_ext_msg(p, cap); + if ((as4 != asn) && (asn != AS_TRANS)) + log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name); - cap_len = cap - buf - 12; - if (cap_len > 0) - { - buf[9] = cap_len + 2; /* Optional params len */ - buf[10] = 2; /* Option: Capability list */ - buf[11] = cap_len; /* Option length */ - return cap; - } + if (as4 != p->remote_as) + { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; } + } else + { + if (asn != p->remote_as) + { bgp_error(conn, 2, 2, pkt+20, 2); return; } + } + + /* Check the other connection */ + other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; + switch (other->state) + { + case BS_CONNECT: + case BS_ACTIVE: + /* Stop outgoing connection attempts */ + bgp_conn_enter_idle_state(other); + break; + + case BS_IDLE: + case BS_OPENSENT: + case BS_CLOSE: + break; + + case BS_OPENCONFIRM: + /* + * Description of collision detection rules in RFC 4271 is confusing and + * contradictory, but it is essentially: + * + * 1. 
Router with higher ID is dominant + * 2. If both have the same ID, router with higher ASN is dominant [RFC6286] + * 3. When both connections are in OpenConfirm state, one initiated by + * the dominant router is kept. + * + * The first line in the expression below evaluates whether the neighbor + * is dominant, the second line whether the new connection was initiated + * by the neighbor. If both are true (or both are false), we keep the new + * connection, otherwise we keep the old one. + */ + if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as))) + == (conn == &p->incoming_conn)) { - buf[9] = 0; /* No optional parameters */ - return buf + 10; + /* Should close the other connection */ + BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection"); + bgp_error(other, 6, 7, NULL, 0); + break; } + /* Fall thru */ + case BS_ESTABLISHED: + /* Should close this connection */ + BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection"); + bgp_error(conn, 6, 7, NULL, 0); + return; + + default: + bug("bgp_rx_open: Unknown state"); + } + + /* Update our local variables */ + conn->hold_time = MIN(hold, p->cf->hold_time); + conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3; + conn->as4_session = conn->local_caps->as4_support && caps->as4_support; + conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages; + p->remote_id = id; + + DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", + conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session); + + bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE); + bgp_start_timer(conn->hold_timer, conn->hold_time); + bgp_conn_enter_openconfirm_state(conn); } -static uint -bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, uint remains) + +/* + * Next hop handling + */ + +#define REPORT(msg, args...) \ + ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); }) + +#define DISCARD(msg, args...) \ + ({ REPORT(msg, ## args); return; }) + +#define WITHDRAW(msg, args...) 
\ + ({ REPORT(msg, ## args); s->err_withdraw = 1; return; }) + +#define BAD_AFI "Unexpected AF <%u/%u> in UPDATE" +#define BAD_NEXT_HOP "Invalid NEXT_HOP attribute" +#define NO_NEXT_HOP "Missing NEXT_HOP attribute" +#define NO_LABEL_STACK "Missing MPLS stack" + + +static void +bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll) { - byte *start = w; - ip_addr a; - int bytes; + struct bgp_proto *p = s->proto; + struct bgp_channel *c = s->channel; - while (!EMPTY_LIST(buck->prefixes) && (remains >= (5+sizeof(ip_addr)))) - { - struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes)); - DBG("\tDequeued route %I/%d\n", px->n.prefix, px->n.pxlen); + if (c->cf->gw_mode == GW_DIRECT) + { + neighbor *nbr = NULL; - if (p->add_path_tx) - { - put_u32(w, px->path_id); - w += 4; - remains -= 4; - } + /* GW_DIRECT -> single_hop -> p->neigh != NULL */ + if (ipa_nonzero(gw)) + nbr = neigh_find2(&p->p, &gw, NULL, 0); + else if (ipa_nonzero(ll)) + nbr = neigh_find2(&p->p, &ll, p->neigh->iface, 0); - *w++ = px->n.pxlen; - bytes = (px->n.pxlen + 7) / 8; - a = px->n.prefix; - ipa_hton(a); - memcpy(w, &a, bytes); - w += bytes; - remains -= bytes + 1; - rem_node(&px->bucket_node); - bgp_free_prefix(p, px); - // fib_delete(&p->prefix_fib, px); - } - return w - start; + if (!nbr || (nbr->scope == SCOPE_HOST)) + WITHDRAW(BAD_NEXT_HOP); + + a->dest = RTD_UNICAST; + a->nh.gw = nbr->addr; + a->nh.iface = nbr->iface; + } + else /* GW_RECURSIVE */ + { + if (ipa_zero(gw)) + WITHDRAW(BAD_NEXT_HOP); + + rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6; + s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table); + + if (!s->mpls) + rta_apply_hostentry(a, s->hostentry, NULL); + + /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */ + } } static void -bgp_flush_prefixes(struct bgp_proto *p, struct bgp_bucket *buck) +bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum) { - while (!EMPTY_LIST(buck->prefixes)) - { - struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes)); - log(L_ERR "%s: - route %I/%d skipped", p->p.name, px->n.prefix, px->n.pxlen); - rem_node(&px->bucket_node); - bgp_free_prefix(p, px); - // fib_delete(&p->prefix_fib, px); - } + if (lnum > MPLS_MAX_LABEL_STACK) + { + REPORT("Too many MPLS labels ($u)", lnum); + + a->dest = RTD_UNREACHABLE; + a->hostentry = NULL; + a->nh = (struct nexthop) { }; + return; + } + + /* Handle implicit NULL as empty MPLS stack */ + if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL)) + lnum = 0; + + if (s->channel->cf->gw_mode == GW_DIRECT) + { + a->nh.labels = lnum; + memcpy(a->nh.label, labels, 4*lnum); + } + else /* GW_RECURSIVE */ + { + mpls_label_stack ms; + + ms.len = lnum; + memcpy(ms.stack, labels, 4*lnum); + rta_apply_hostentry(a, s->hostentry, &ms); + } } -#ifndef IPV6 /* IPv4 version */ -static byte * -bgp_create_update(struct bgp_conn *conn, byte *buf) +static inline int +bgp_use_next_hop(struct bgp_export_state *s, eattr *a) { - struct bgp_proto *p = conn->bgp; - struct bgp_bucket *buck; - int remains = bgp_max_packet_length(p) - BGP_HEADER_LENGTH - 4; - byte *w; - int wd_size = 0; - int r_size = 0; - int a_size = 0; - - w = buf+2; - if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) - { - DBG("Withdrawn routes:\n"); - wd_size = bgp_encode_prefixes(p, w, buck, remains); - w += wd_size; - remains -= wd_size; - } - put_u16(buf, wd_size); + struct bgp_proto *p = s->proto; + ip_addr *nh = (void *) a->u.ptr->data; - 
if (!wd_size) - { - while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next) - { - if (EMPTY_LIST(buck->prefixes)) - { - DBG("Deleting empty bucket %p\n", buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - } - - DBG("Processing bucket %p\n", buck); - a_size = bgp_encode_attrs(p, w+2, buck->eattrs, remains - 1024); - - if (a_size < 0) - { - log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name); - bgp_flush_prefixes(p, buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - } - - put_u16(w, a_size); - w += a_size + 2; - r_size = bgp_encode_prefixes(p, w, buck, remains - a_size); - w += r_size; - break; - } - } - if (!a_size) /* Attributes not already encoded */ + if (s->channel->cf->next_hop_self) + return 0; + + if (s->channel->cf->next_hop_keep) + return 1; + + /* Keep it when explicitly set in export filter */ + if (a->type & EAF_FRESH) + return 1; + + /* Keep it when exported to internal peers */ + if (p->is_interior && ipa_nonzero(*nh)) + return 1; + + /* Keep it when forwarded between single-hop BGPs on the same iface */ + struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL; + return p->neigh && (p->neigh->iface == ifa); +} + +static inline int +bgp_use_gateway(struct bgp_export_state *s) +{ + struct bgp_proto *p = s->proto; + rta *ra = s->route->attrs; + + if (s->channel->cf->next_hop_self) + return 0; + + /* We need one valid global gateway */ + if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw)) + return 0; + + /* Use it when exported to internal peers */ + if (p->is_interior) + return 1; + + /* Use it when forwarded to single-hop BGP peer on on the same iface */ + return p->neigh && (p->neigh->iface == ra->nh.iface); +} + +static void +bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to) +{ + if (!a || !bgp_use_next_hop(s, a)) + { + if (bgp_use_gateway(s)) { - put_u16(w, 0); - w += 2; + rta *ra = s->route->attrs; + ip_addr nh[1] = { ra->nh.gw }; + bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16); + + if (s->mpls) + { + u32 implicit_null = BGP_MPLS_NULL; + u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null; + uint lnum = ra->nh.labels ? ra->nh.labels : 1; + bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4); + } } - if (wd_size || r_size) + else { - BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE"); - return w; + ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr }; + bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 
32 : 16); + + /* TODO: Use local MPLS assigned label */ + if (s->mpls) + bgp_unset_attr(to, s->pool, BA_MPLS_LABEL_STACK); } + } + + /* Check if next hop is valid */ + a = bgp_find_attr(*to, BA_NEXT_HOP); + if (!a) + WITHDRAW(NO_NEXT_HOP); + + ip_addr *nh = (void *) a->u.ptr->data; + ip_addr peer = s->proto->cf->remote_ip; + uint len = a->u.ptr->length; + + /* Forbid zero next hop */ + if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1]))) + WITHDRAW(BAD_NEXT_HOP); + + /* Forbid next hop equal to neighbor IP */ + if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1]))) + WITHDRAW(BAD_NEXT_HOP); + + /* Forbid next hop with non-matching AF */ + if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) && + !s->channel->ext_next_hop) + WITHDRAW(BAD_NEXT_HOP); + + /* Just check if MPLS stack */ + if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK)) + WITHDRAW(NO_LABEL_STACK); +} + +static uint +bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED) +{ + /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */ + ip_addr *nh = (void *) a->u.ptr->data; + uint len = a->u.ptr->length; + + ASSERT((len == 16) || (len == 32)); + + /* + * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This + * is specified in RFC 5549 for IPv4 and in RFC 4798 for IPv6. The difference + * is that IPv4 address is directly encoded with IPv4 NLRI, but as IPv4-mapped + * IPv6 address with IPv6 NLRI. + */ + + if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0])) + { + put_ip4(buf, ipa_to_ip4(nh[0])); + return 4; + } + + put_ip6(buf, ipa_to_ip6(nh[0])); + + if (len == 32) + put_ip6(buf+16, ipa_to_ip6(nh[1])); + + return len; +} + +static void +bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a) +{ + struct bgp_channel *c = s->channel; + struct adata *ad = lp_alloc_adata(s->pool, 32); + ip_addr *nh = (void *) ad->data; + + if (len == 4) + { + nh[0] = ipa_from_ip4(get_ip4(data)); + nh[1] = IPA_NONE; + } + else if (len == 16) + { + nh[0] = ipa_from_ip6(get_ip6(data)); + nh[1] = IPA_NONE; + + if (ipa_is_link_local(nh[0])) + { nh[1] = nh[0]; nh[0] = IPA_NONE; } + } + else if (len == 32) + { + nh[0] = ipa_from_ip6(get_ip6(data)); + nh[1] = ipa_from_ip6(get_ip6(data+16)); + + if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1])) + nh[1] = IPA_NONE; + } else - return NULL; + bgp_parse_error(s, 9); + + if (ipa_zero(nh[1])) + ad->length = 16; + + if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop) + WITHDRAW(BAD_NEXT_HOP); + + // XXXX validate next hop + + bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad); + bgp_apply_next_hop(s, a, nh[0], nh[1]); } -static byte * -bgp_create_end_mark(struct bgp_conn *conn, byte *buf) +static uint +bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending END-OF-RIB"); + ip_addr *nh = (void *) a->u.ptr->data; + uint len = a->u.ptr->length; - put_u32(buf, 0); - return buf+4; + ASSERT((len == 16) || (len == 32)); + + /* + * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This + * is specified in RFC 5549 for VPNv4 and in RFC 4659 for VPNv6. The difference + * is that IPv4 address is directly encoded with VPNv4 NLRI, but as IPv4-mapped + * IPv6 address with VPNv6 NLRI. 
+ */ + + if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0])) + { + put_u64(buf, 0); /* VPN RD is 0 */ + put_ip4(buf+8, ipa_to_ip4(nh[0])); + return 12; + } + + put_u64(buf, 0); /* VPN RD is 0 */ + put_ip6(buf+8, ipa_to_ip6(nh[0])); + + if (len == 16) + return 24; + + put_u64(buf+24, 0); /* VPN RD is 0 */ + put_ip6(buf+32, ipa_to_ip6(nh[1])); + + return 48; } -#else /* IPv6 version */ +static void +bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a) +{ + struct bgp_channel *c = s->channel; + struct adata *ad = lp_alloc_adata(s->pool, 32); + ip_addr *nh = (void *) ad->data; -static inline int -same_iface(struct bgp_proto *p, ip_addr *ip) + if (len == 12) + { + nh[0] = ipa_from_ip4(get_ip4(data+8)); + nh[1] = IPA_NONE; + } + else if (len == 24) + { + nh[0] = ipa_from_ip6(get_ip6(data+8)); + nh[1] = IPA_NONE; + + if (ipa_is_link_local(nh[0])) + { nh[1] = nh[0]; nh[0] = IPA_NONE; } + } + else if (len == 48) + { + nh[0] = ipa_from_ip6(get_ip6(data+8)); + nh[1] = ipa_from_ip6(get_ip6(data+32)); + + if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1])) + nh[1] = IPA_NONE; + } + else + bgp_parse_error(s, 9); + + if (ipa_zero(nh[1])) + ad->length = 16; + + /* XXXX which error */ + if ((get_u64(data) != 0) || ((len == 48) && (get_u64(data+24) != 0))) + bgp_parse_error(s, 9); + + if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop) + WITHDRAW(BAD_NEXT_HOP); + + // XXXX validate next hop + + bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad); + bgp_apply_next_hop(s, a, nh[0], nh[1]); +} + + + +static uint +bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte *buf UNUSED, uint size UNUSED) { - neighbor *n = neigh_find(&p->p, ip, 0); - return n && p->neigh && n->iface == p->neigh->iface; + return 0; } -static byte * -bgp_create_update(struct bgp_conn *conn, byte *buf) +static void +bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED) { - struct bgp_proto *p = conn->bgp; - struct bgp_bucket *buck; - int size, second, rem_stored; - int remains = bgp_max_packet_length(p) - BGP_HEADER_LENGTH - 4; - byte *w, *w_stored, *tmp, *tstart; - ip_addr *ipp, ip, ip_ll; - ea_list *ea; - eattr *nh; + /* + * Although we expect no next hop and RFC 7606 7.11 states that attribute + * MP_REACH_NLRI with unexpected next hop length is considered malformed, + * FlowSpec RFC 5575 4 states that next hop shall be ignored on receipt. 
+ */ + + return; +} - put_u16(buf, 0); - w = buf+4; +static void +bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to) +{ + /* NEXT_HOP shall not pass */ + if (a) + bgp_unset_attr(to, s->pool, BA_NEXT_HOP); +} + + +/* + * UPDATE + */ + +static void +bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0) +{ + if (path_id != s->last_id) + { + s->last_src = rt_get_source(&s->proto->p, path_id); + s->last_id = path_id; - if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) + rta_free(s->cached_rta); + s->cached_rta = NULL; + } + + if (!a0) + { + /* Route withdraw */ + rte_update2(&s->channel->c, n, NULL, s->last_src); + return; + } + + /* Prepare cached route attributes */ + if (s->cached_rta == NULL) + { + a0->src = s->last_src; + + /* Workaround for rta_lookup() breaking eattrs */ + ea_list *ea = a0->eattrs; + s->cached_rta = rta_lookup(a0); + a0->eattrs = ea; + } + + rta *a = rta_clone(s->cached_rta); + rte *e = rte_get_temp(a); + + e->pflags = 0; + e->u.bgp.suppressed = 0; + rte_update2(&s->channel->c, n, e, s->last_src); +} + +static void +bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, adata *mpls, byte **pos, uint *size, byte *pxlen) +{ + u32 dummy = 0; + u32 *labels = mpls ? (u32 *) mpls->data : &dummy; + uint lnum = mpls ? (mpls->length / 4) : 1; + + for (uint i = 0; i < lnum; i++) + { + put_u24(*pos, labels[i] << 4); + ADVANCE(*pos, *size, 3); + } + + /* Add bottom-of-stack flag */ + (*pos)[-1] |= BGP_MPLS_BOS; + + *pxlen += 24 * lnum; +} + +static void +bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a) +{ + u32 labels[BGP_MPLS_MAX], label; + uint lnum = 0; + + do { + if (*pxlen < 24) + bgp_parse_error(s, 1); + + label = get_u24(*pos); + labels[lnum++] = label >> 4; + ADVANCE(*pos, *len, 3); + *pxlen -= 24; + + /* Withdraw: Magic label stack value 0x800000 according to RFC 3107, section 3, last paragraph */ + if (!a && !s->err_withdraw && (lnum == 1) && (label == BGP_MPLS_MAGIC)) + break; + } + while (!(label & BGP_MPLS_BOS)); + + if (!a) + return; + + /* Attach MPLS attribute unless we already have one */ + if (!s->mpls_labels) + { + s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX); + bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels); + } + + /* Overwrite data in the attribute */ + s->mpls_labels->length = 4*lnum; + memcpy(s->mpls_labels->data, labels, 4*lnum); + + /* Update next hop entry in rta */ + bgp_apply_mpls_labels(s, a, labels, lnum); + + /* Attributes were changed, invalidate cached entry */ + rta_free(s->cached_rta); + s->cached_rta = NULL; + + return; +} + +static uint +bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) +{ + byte *pos = buf; + + while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_ip4 *net = (void *) px->net; + + /* Encode path ID */ + if (s->add_path) { - DBG("Withdrawn routes:\n"); - tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_UNREACH_NLRI, remains-8); - *tmp++ = 0; - *tmp++ = BGP_AF_IPV6; - *tmp++ = 1; - ea->attrs[0].u.ptr->length = 3 + bgp_encode_prefixes(p, tmp, buck, remains-11); - size = bgp_encode_attrs(p, w, ea, remains); - ASSERT(size >= 0); - w += size; - remains -= size; + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); } - else + + /* Encode prefix length */ + *pos = net->pxlen; + ADVANCE(pos, size, 1); + + /* Encode MPLS labels */ + if (s->mpls) + 
bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1); + + /* Encode prefix body */ + ip4_addr a = ip4_hton(net->prefix); + uint b = (net->pxlen + 7) / 8; + memcpy(pos, &a, b); + ADVANCE(pos, size, b); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; +} + +static void +bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +{ + while (len) + { + net_addr_ip4 net; + u32 path_id = 0; + + /* Decode path ID */ + if (s->add_path) { - while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next) - { - if (EMPTY_LIST(buck->prefixes)) - { - DBG("Deleting empty bucket %p\n", buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - } - - DBG("Processing bucket %p\n", buck); - rem_stored = remains; - w_stored = w; - - size = bgp_encode_attrs(p, w, buck->eattrs, remains - 1024); - if (size < 0) - { - log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name); - bgp_flush_prefixes(p, buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - } - w += size; - remains -= size; - - /* We have two addresses here in NEXT_HOP eattr. Really. - Unless NEXT_HOP was modified by filter */ - nh = ea_find(buck->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP)); - ASSERT(nh); - second = (nh->u.ptr->length == NEXT_HOP_LENGTH); - ipp = (ip_addr *) nh->u.ptr->data; - ip = ipp[0]; - ip_ll = IPA_NONE; - - if (ipa_equal(ip, p->source_addr)) - ip_ll = p->local_link; - else - { - /* If we send a route with 'third party' next hop destinated - * in the same interface, we should also send a link local - * next hop address. We use the received one (stored in the - * other part of BA_NEXT_HOP eattr). If we didn't received - * it (for example it is a static route), we can't use - * 'third party' next hop and we have to use local IP address - * as next hop. Sending original next hop address without - * link local address seems to be a natural way to solve that - * problem, but it is contrary to RFC 2545 and Quagga does not - * accept such routes. - * - * There are two cases, either we have global IP, or - * IPA_NONE if the neighbor is link-local. For IPA_NONE, - * we suppose it is on the same iface, see bgp_update_attrs(). 
- */ - - if (ipa_zero(ip) || same_iface(p, &ip)) - { - if (second && ipa_nonzero(ipp[1])) - ip_ll = ipp[1]; - else - { - switch (p->cf->missing_lladdr) - { - case MLL_SELF: - ip = p->source_addr; - ip_ll = p->local_link; - break; - case MLL_DROP: - log(L_ERR "%s: Missing link-local next hop address, skipping corresponding routes", p->p.name); - w = w_stored; - remains = rem_stored; - bgp_flush_prefixes(p, buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - case MLL_IGNORE: - break; - } - } - } - } - - tstart = tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_REACH_NLRI, remains-8); - *tmp++ = 0; - *tmp++ = BGP_AF_IPV6; - *tmp++ = 1; - - if (ipa_is_link_local(ip)) - ip = IPA_NONE; - - if (ipa_nonzero(ip_ll)) - { - *tmp++ = 32; - ipa_hton(ip); - memcpy(tmp, &ip, 16); - ipa_hton(ip_ll); - memcpy(tmp+16, &ip_ll, 16); - tmp += 32; - } - else - { - *tmp++ = 16; - ipa_hton(ip); - memcpy(tmp, &ip, 16); - tmp += 16; - } - - *tmp++ = 0; /* No SNPA information */ - tmp += bgp_encode_prefixes(p, tmp, buck, remains - (8+3+32+1)); - ea->attrs[0].u.ptr->length = tmp - tstart; - size = bgp_encode_attrs(p, w, ea, remains); - ASSERT(size >= 0); - w += size; - break; - } + if (len < 5) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); } - size = w - (buf+4); - put_u16(buf+2, size); - lp_flush(bgp_linpool); - if (size) + /* Decode prefix length */ + uint l = *pos; + ADVANCE(pos, len, 1); + + if (len < ((l + 7) / 8)) + bgp_parse_error(s, 1); + + /* Decode MPLS labels */ + if (s->mpls) + bgp_decode_mpls_labels(s, &pos, &len, &l, a); + + if (l > IP4_MAX_PREFIX_LENGTH) + bgp_parse_error(s, 10); + + /* Decode prefix body */ + ip4_addr addr = IP4_NONE; + uint b = (l + 7) / 8; + memcpy(&addr, pos, b); + ADVANCE(pos, len, b); + + net = NET_ADDR_IP4(ip4_ntoh(addr), l); + net_normalize_ip4(&net); + + // XXXX validate prefix + + bgp_rte_update(s, (net_addr *) &net, path_id, a); + } +} + + +static uint +bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) +{ + byte *pos = buf; + + while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_ip6 *net = (void *) px->net; + + /* Encode path ID */ + if (s->add_path) { - BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE"); - return w; + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); } - else - return NULL; + + /* Encode prefix length */ + *pos = net->pxlen; + ADVANCE(pos, size, 1); + + /* Encode MPLS labels */ + if (s->mpls) + bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1); + + /* Encode prefix body */ + ip6_addr a = ip6_hton(net->prefix); + uint b = (net->pxlen + 7) / 8; + memcpy(pos, &a, b); + ADVANCE(pos, size, b); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; } -static byte * -bgp_create_end_mark(struct bgp_conn *conn, byte *buf) +static void +bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending END-OF-RIB"); + while (len) + { + net_addr_ip6 net; + u32 path_id = 0; - put_u16(buf+0, 0); - put_u16(buf+2, 6); /* length 4-9 */ - buf += 4; + /* Decode path ID */ + if (s->add_path) + { + if (len < 5) + bgp_parse_error(s, 1); - /* Empty MP_UNREACH_NLRI atribute */ - *buf++ = BAF_OPTIONAL; - *buf++ = BA_MP_UNREACH_NLRI; - *buf++ = 3; /* Length 7-9 */ - *buf++ = 0; /* AFI */ - *buf++ = BGP_AF_IPV6; - *buf++ = 1; /* SAFI */ - return buf; -} + path_id = get_u32(pos); + ADVANCE(pos, 
len, 4); + } -#endif + /* Decode prefix length */ + uint l = *pos; + ADVANCE(pos, len, 1); -static inline byte * -bgp_create_route_refresh(struct bgp_conn *conn, byte *buf) -{ - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH"); + if (len < ((l + 7) / 8)) + bgp_parse_error(s, 1); - /* Original original route refresh request, RFC 2918 */ - *buf++ = 0; - *buf++ = BGP_AF; - *buf++ = BGP_RR_REQUEST; - *buf++ = 1; /* SAFI */ - return buf; + /* Decode MPLS labels */ + if (s->mpls) + bgp_decode_mpls_labels(s, &pos, &len, &l, a); + + if (l > IP6_MAX_PREFIX_LENGTH) + bgp_parse_error(s, 10); + + /* Decode prefix body */ + ip6_addr addr = IP6_NONE; + uint b = (l + 7) / 8; + memcpy(&addr, pos, b); + ADVANCE(pos, len, b); + + net = NET_ADDR_IP6(ip6_ntoh(addr), l); + net_normalize_ip6(&net); + + // XXXX validate prefix + + bgp_rte_update(s, (net_addr *) &net, path_id, a); + } } -static inline byte * -bgp_create_begin_refresh(struct bgp_conn *conn, byte *buf) +static uint +bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR"); + byte *pos = buf; - /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */ - *buf++ = 0; - *buf++ = BGP_AF; - *buf++ = BGP_RR_BEGIN; - *buf++ = 1; /* SAFI */ - return buf; + while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_vpn4 *net = (void *) px->net; + + /* Encode path ID */ + if (s->add_path) + { + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); + } + + /* Encode prefix length */ + *pos = 64 + net->pxlen; + ADVANCE(pos, size, 1); + + /* Encode MPLS labels */ + if (s->mpls) + bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1); + + /* Encode route distinguisher */ + put_u64(pos, net->rd); + ADVANCE(pos, size, 8); + + /* Encode prefix body */ + ip4_addr a = ip4_hton(net->prefix); + uint b = (net->pxlen + 7) / 8; + memcpy(pos, &a, b); + ADVANCE(pos, size, b); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; } -static inline byte * -bgp_create_end_refresh(struct bgp_conn *conn, byte *buf) +static void +bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending END-OF-RR"); + while (len) + { + net_addr_vpn4 net; + u32 path_id = 0; - /* Demarcation of ending of route refresh (EoRR), RFC 7313 */ - *buf++ = 0; - *buf++ = BGP_AF; - *buf++ = BGP_RR_END; - *buf++ = 1; /* SAFI */ - return buf; + /* Decode path ID */ + if (s->add_path) + { + if (len < 5) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); + } + + /* Decode prefix length */ + uint l = *pos; + ADVANCE(pos, len, 1); + + if (len < ((l + 7) / 8)) + bgp_parse_error(s, 1); + + /* Decode MPLS labels */ + if (s->mpls) + bgp_decode_mpls_labels(s, &pos, &len, &l, a); + + /* Decode route distinguisher */ + if (l < 64) + bgp_parse_error(s, 1); + + u64 rd = get_u64(pos); + ADVANCE(pos, len, 8); + l -= 64; + + if (l > IP4_MAX_PREFIX_LENGTH) + bgp_parse_error(s, 10); + + /* Decode prefix body */ + ip4_addr addr = IP4_NONE; + uint b = (l + 7) / 8; + memcpy(&addr, pos, b); + ADVANCE(pos, len, b); + + net = NET_ADDR_VPN4(ip4_ntoh(addr), l, rd); + net_normalize_vpn4(&net); + + // XXXX validate prefix + + bgp_rte_update(s, (net_addr *) &net, path_id, a); + } } +static uint +bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint 
size) +{ + byte *pos = buf; + + while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_vpn6 *net = (void *) px->net; + + /* Encode path ID */ + if (s->add_path) + { + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); + } + + /* Encode prefix length */ + *pos = 64 + net->pxlen; + ADVANCE(pos, size, 1); + + /* Encode MPLS labels */ + bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1); + + /* Encode route distinguisher */ + put_u64(pos, net->rd); + ADVANCE(pos, size, 8); + + /* Encode prefix body */ + ip6_addr a = ip6_hton(net->prefix); + uint b = (net->pxlen + 7) / 8; + memcpy(pos, &a, b); + ADVANCE(pos, size, b); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; +} + static void -bgp_create_header(byte *buf, uint len, uint type) +bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) { - memset(buf, 0xff, 16); /* Marker */ - put_u16(buf+16, len); - buf[18] = type; + while (len) + { + net_addr_vpn6 net; + u32 path_id = 0; + + /* Decode path ID */ + if (s->add_path) + { + if (len < 5) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); + } + + /* Decode prefix length */ + uint l = *pos; + ADVANCE(pos, len, 1); + + if (len < ((l + 7) / 8)) + bgp_parse_error(s, 1); + + /* Decode MPLS labels */ + if (s->mpls) + bgp_decode_mpls_labels(s, &pos, &len, &l, a); + + /* Decode route distinguisher */ + if (l < 64) + bgp_parse_error(s, 1); + + u64 rd = get_u64(pos); + ADVANCE(pos, len, 8); + l -= 64; + + if (l > IP6_MAX_PREFIX_LENGTH) + bgp_parse_error(s, 10); + + /* Decode prefix body */ + ip6_addr addr = IP6_NONE; + uint b = (l + 7) / 8; + memcpy(&addr, pos, b); + ADVANCE(pos, len, b); + + net = NET_ADDR_VPN6(ip6_ntoh(addr), l, rd); + net_normalize_vpn6(&net); + + // XXXX validate prefix + + bgp_rte_update(s, (net_addr *) &net, path_id, a); + } } -/** - * bgp_fire_tx - transmit packets - * @conn: connection - * - * Whenever the transmit buffers of the underlying TCP connection - * are free and we have any packets queued for sending, the socket functions - * call bgp_fire_tx() which takes care of selecting the highest priority packet - * queued (Notification > Keepalive > Open > Update), assembling its header - * and body and sending it to the connection. 
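/*
 * A worked example of the VPN NLRI length accounting in
 * bgp_encode_nlri_vpn4()/bgp_decode_nlri_vpn4() above (the vpn6 variants are
 * analogous): the one-byte length field counts bits of label stack plus
 * route distinguisher plus IP prefix, so the encoder writes 64 + pxlen and
 * each label adds 24, while the decoder strips 24 per label and 64 for the
 * RD before checking the remaining IP prefix length. Plain C, purely
 * illustrative.
 */
#include <stdio.h>

int main(void)
{
  unsigned pxlen = 24;     /* IPv4 prefix length */
  unsigned labels = 1;     /* one MPLS label in the stack */

  unsigned wire_len = 24 * labels + 64 + pxlen;        /* value of the length byte */
  unsigned body = 3 * labels + 8 + (pxlen + 7) / 8;    /* bytes following it */

  printf("length field %u bits, body %u bytes\n", wire_len, body);   /* 112, 14 */
  return 0;
}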
- */ -static int -bgp_fire_tx(struct bgp_conn *conn) + +static uint +bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) { - struct bgp_proto *p = conn->bgp; - uint s = conn->packets_to_send; - sock *sk = conn->sk; - byte *buf, *pkt, *end; - int type; + byte *pos = buf; + + while (!EMPTY_LIST(buck->prefixes) && (size >= 4)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_flow4 *net = (void *) px->net; + uint flen = net->length - sizeof(net_addr_flow4); - if (!sk) + /* Encode path ID */ + if (s->add_path) { - conn->packets_to_send = 0; - return 0; + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); } - buf = sk->tbuf; - pkt = buf + BGP_HEADER_LENGTH; - if (s & (1 << PKT_SCHEDULE_CLOSE)) + if (flen > size) + break; + + /* Copy whole flow data including length */ + memcpy(pos, net->data, flen); + ADVANCE(pos, size, flen); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; +} + +static void +bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +{ + while (len) + { + u32 path_id = 0; + + /* Decode path ID */ + if (s->add_path) { - /* We can finally close connection and enter idle state */ - bgp_conn_enter_idle_state(conn); - return 0; + if (len < 4) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); } - if (s & (1 << PKT_NOTIFICATION)) + + if (len < 2) + bgp_parse_error(s, 1); + + /* Decode flow length */ + uint hlen = flow_hdr_length(pos); + uint dlen = flow_read_length(pos); + uint flen = hlen + dlen; + byte *data = pos + hlen; + + if (len < flen) + bgp_parse_error(s, 1); + + /* Validate flow data */ + enum flow_validated_state r = flow4_validate(data, dlen); + if (r != FLOW_ST_VALID) { - s = 1 << PKT_SCHEDULE_CLOSE; - type = PKT_NOTIFICATION; - end = bgp_create_notification(conn, pkt); + log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r)); + bgp_parse_error(s, 1); } - else if (s & (1 << PKT_KEEPALIVE)) + + if (data[0] != FLOW_TYPE_DST_PREFIX) { - s &= ~(1 << PKT_KEEPALIVE); - type = PKT_KEEPALIVE; - end = pkt; /* Keepalives carry no data */ - BGP_TRACE(D_PACKETS, "Sending KEEPALIVE"); - bgp_start_timer(conn->keepalive_timer, conn->keepalive_time); + log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name); + bgp_parse_error(s, 1); } - else if (s & (1 << PKT_OPEN)) + + /* Decode dst prefix */ + ip4_addr px = IP4_NONE; + uint pxlen = data[1]; + + // FIXME: Use some generic function + memcpy(&px, data, BYTES(pxlen)); + px = ip4_and(px, ip4_mkmask(pxlen)); + + /* Prepare the flow */ + net_addr *n = alloca(sizeof(struct net_addr_flow4) + flen); + net_fill_flow4(n, px, pxlen, pos, flen); + ADVANCE(pos, len, flen); + + bgp_rte_update(s, n, path_id, a); + } +} + + +static uint +bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) +{ + byte *pos = buf; + + while (!EMPTY_LIST(buck->prefixes) && (size >= 4)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_flow6 *net = (void *) px->net; + uint flen = net->length - sizeof(net_addr_flow6); + + /* Encode path ID */ + if (s->add_path) { - s &= ~(1 << PKT_OPEN); - type = PKT_OPEN; - end = bgp_create_open(conn, pkt); + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); } - else if (s & (1 << PKT_ROUTE_REFRESH)) + + if (flen > size) + break; + + /* Copy whole flow data including length */ + memcpy(pos, net->data, flen); + ADVANCE(pos, size, flen); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; +} + +static 
void +bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +{ + while (len) + { + u32 path_id = 0; + + /* Decode path ID */ + if (s->add_path) { - s &= ~(1 << PKT_ROUTE_REFRESH); - type = PKT_ROUTE_REFRESH; - end = bgp_create_route_refresh(conn, pkt); + if (len < 4) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); } - else if (s & (1 << PKT_BEGIN_REFRESH)) + + if (len < 2) + bgp_parse_error(s, 1); + + /* Decode flow length */ + uint hlen = flow_hdr_length(pos); + uint dlen = flow_read_length(pos); + uint flen = hlen + dlen; + byte *data = pos + hlen; + + if (len < flen) + bgp_parse_error(s, 1); + + /* Validate flow data */ + enum flow_validated_state r = flow6_validate(data, dlen); + if (r != FLOW_ST_VALID) { - s &= ~(1 << PKT_BEGIN_REFRESH); - type = PKT_ROUTE_REFRESH; /* BoRR is a subtype of RR */ - end = bgp_create_begin_refresh(conn, pkt); + log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r)); + bgp_parse_error(s, 1); } - else if (s & (1 << PKT_UPDATE)) + + if (data[0] != FLOW_TYPE_DST_PREFIX) { - type = PKT_UPDATE; - end = bgp_create_update(conn, pkt); + log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name); + bgp_parse_error(s, 1); + } - if (!end) - { - /* No update to send, perhaps we need to send End-of-RIB or EoRR */ + /* Decode dst prefix */ + ip6_addr px = IP6_NONE; + uint pxlen = data[1]; - conn->packets_to_send = 0; + // FIXME: Use some generic function + memcpy(&px, data, BYTES(pxlen)); + px = ip6_and(px, ip6_mkmask(pxlen)); - if (p->feed_state == BFS_LOADED) - { - type = PKT_UPDATE; - end = bgp_create_end_mark(conn, pkt); - } + /* Prepare the flow */ + net_addr *n = alloca(sizeof(struct net_addr_flow6) + flen); + net_fill_flow6(n, px, pxlen, pos, flen); + ADVANCE(pos, len, flen); - else if (p->feed_state == BFS_REFRESHED) - { - type = PKT_ROUTE_REFRESH; - end = bgp_create_end_refresh(conn, pkt); - } + bgp_rte_update(s, n, path_id, a); + } +} - else /* Really nothing to send */ - return 0; - p->feed_state = BFS_NONE; - } - } - else - return 0; +static const struct bgp_af_desc bgp_af_table[] = { + { + .afi = BGP_AF_IPV4, + .net = NET_IP4, + .name = "ipv4", + .encode_nlri = bgp_encode_nlri_ip4, + .decode_nlri = bgp_decode_nlri_ip4, + .encode_next_hop = bgp_encode_next_hop_ip, + .decode_next_hop = bgp_decode_next_hop_ip, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_IPV4_MC, + .net = NET_IP4, + .name = "ipv4-mc", + .encode_nlri = bgp_encode_nlri_ip4, + .decode_nlri = bgp_decode_nlri_ip4, + .encode_next_hop = bgp_encode_next_hop_ip, + .decode_next_hop = bgp_decode_next_hop_ip, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_IPV4_MPLS, + .net = NET_IP4, + .mpls = 1, + .name = "ipv4-mpls", + .encode_nlri = bgp_encode_nlri_ip4, + .decode_nlri = bgp_decode_nlri_ip4, + .encode_next_hop = bgp_encode_next_hop_ip, + .decode_next_hop = bgp_decode_next_hop_ip, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_IPV6, + .net = NET_IP6, + .name = "ipv6", + .encode_nlri = bgp_encode_nlri_ip6, + .decode_nlri = bgp_decode_nlri_ip6, + .encode_next_hop = bgp_encode_next_hop_ip, + .decode_next_hop = bgp_decode_next_hop_ip, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_IPV6_MC, + .net = NET_IP6, + .name = "ipv6-mc", + .encode_nlri = bgp_encode_nlri_ip6, + .decode_nlri = bgp_decode_nlri_ip6, + .encode_next_hop = bgp_encode_next_hop_ip, + .decode_next_hop = bgp_decode_next_hop_ip, + .update_next_hop = 
bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_IPV6_MPLS, + .net = NET_IP6, + .mpls = 1, + .name = "ipv6-mpls", + .encode_nlri = bgp_encode_nlri_ip6, + .decode_nlri = bgp_decode_nlri_ip6, + .encode_next_hop = bgp_encode_next_hop_ip, + .decode_next_hop = bgp_decode_next_hop_ip, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_VPN4_MPLS, + .net = NET_VPN4, + .mpls = 1, + .name = "vpn4-mpls", + .encode_nlri = bgp_encode_nlri_vpn4, + .decode_nlri = bgp_decode_nlri_vpn4, + .encode_next_hop = bgp_encode_next_hop_vpn, + .decode_next_hop = bgp_decode_next_hop_vpn, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_VPN6_MPLS, + .net = NET_VPN6, + .mpls = 1, + .name = "vpn6-mpls", + .encode_nlri = bgp_encode_nlri_vpn6, + .decode_nlri = bgp_decode_nlri_vpn6, + .encode_next_hop = bgp_encode_next_hop_vpn, + .decode_next_hop = bgp_decode_next_hop_vpn, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_VPN4_MC, + .net = NET_VPN4, + .name = "vpn4-mc", + .encode_nlri = bgp_encode_nlri_vpn4, + .decode_nlri = bgp_decode_nlri_vpn4, + .encode_next_hop = bgp_encode_next_hop_vpn, + .decode_next_hop = bgp_decode_next_hop_vpn, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_VPN6_MC, + .net = NET_VPN6, + .name = "vpn6-mc", + .encode_nlri = bgp_encode_nlri_vpn6, + .decode_nlri = bgp_decode_nlri_vpn6, + .encode_next_hop = bgp_encode_next_hop_vpn, + .decode_next_hop = bgp_decode_next_hop_vpn, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_FLOW4, + .net = NET_FLOW4, + .no_igp = 1, + .name = "flow4", + .encode_nlri = bgp_encode_nlri_flow4, + .decode_nlri = bgp_decode_nlri_flow4, + .encode_next_hop = bgp_encode_next_hop_none, + .decode_next_hop = bgp_decode_next_hop_none, + .update_next_hop = bgp_update_next_hop_none, + }, + { + .afi = BGP_AF_FLOW6, + .net = NET_FLOW6, + .no_igp = 1, + .name = "flow6", + .encode_nlri = bgp_encode_nlri_flow6, + .decode_nlri = bgp_decode_nlri_flow6, + .encode_next_hop = bgp_encode_next_hop_none, + .decode_next_hop = bgp_decode_next_hop_none, + .update_next_hop = bgp_update_next_hop_none, + }, +}; - conn->packets_to_send = s; - bgp_create_header(buf, end - buf, type); - return sk_send(sk, end - buf); +const struct bgp_af_desc * +bgp_get_af_desc(u32 afi) +{ + uint i; + for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++) + if (bgp_af_table[i].afi == afi) + return &bgp_af_table[i]; + + return NULL; } -/** - * bgp_schedule_packet - schedule a packet for transmission - * @conn: connection - * @type: packet type - * - * Schedule a packet of type @type to be sent as soon as possible. 
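/*
 * A cut-down, standalone sketch of the bgp_af_table[]/bgp_get_af_desc()
 * pattern above: one static descriptor per address family, found by a linear
 * scan and then used only through its hooks (encode_nlri, decode_nlri, ...).
 * The codes, names and the dummy hook below are placeholders, not the real
 * BGP_AF_* values or BIRD callbacks.
 */
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

struct af_desc {
  uint32_t afi;
  const char *name;
  unsigned (*encode_nlri)(const void *net, uint8_t *buf, unsigned size);
};

static unsigned encode_dummy(const void *net, uint8_t *buf, unsigned size)
{ (void) net; (void) buf; (void) size; return 0; }

static const struct af_desc af_table[] = {
  { .afi = 1, .name = "ipv4", .encode_nlri = encode_dummy },
  { .afi = 2, .name = "ipv6", .encode_nlri = encode_dummy },
};

static const struct af_desc *get_af_desc(uint32_t afi)
{
  for (size_t i = 0; i < sizeof(af_table) / sizeof(af_table[0]); i++)
    if (af_table[i].afi == afi)
      return &af_table[i];
  return NULL;        /* unknown AFI/SAFI -> no channel, NLRI discarded */
}

int main(void)
{
  const struct af_desc *d = get_af_desc(2);
  printf("%s\n", d ? d->name : "unsupported");
  return 0;
}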
- */ -void -bgp_schedule_packet(struct bgp_conn *conn, int type) +static inline uint +bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - DBG("BGP: Scheduling packet type %d\n", type); - conn->packets_to_send |= 1 << type; - if (conn->sk && conn->sk->tpos == conn->sk->tbuf && !ev_active(conn->tx_ev)) - ev_schedule(conn->tx_ev); + return s->channel->desc->encode_nlri(s, buck, buf, end - buf); } -void -bgp_kick_tx(void *vconn) +static inline uint +bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf) { - struct bgp_conn *conn = vconn; - - DBG("BGP: kicking TX\n"); - while (bgp_fire_tx(conn) > 0) - ; + return s->channel->desc->encode_next_hop(s, nh, buf, 255); } void -bgp_tx(sock *sk) +bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to) { - struct bgp_conn *conn = sk->data; - - DBG("BGP: TX hook\n"); - while (bgp_fire_tx(conn) > 0) - ; + s->channel->desc->update_next_hop(s, a, to); } -/* Capatibility negotiation as per RFC 2842 */ +#define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024) -void -bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) +static byte * +bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - // struct bgp_proto *p = conn->bgp; - int i, cl; + /* + * 2 B Withdrawn Routes Length (zero) + * --- IPv4 Withdrawn Routes NLRI (unused) + * 2 B Total Path Attribute Length + * var Path Attributes + * var IPv4 Network Layer Reachability Information + */ + + int lr, la; + + la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH); + if (la < 0) + { + /* Attribute list too long */ + bgp_withdraw_bucket(s->channel, buck); + return NULL; + } - while (len > 0) - { - if (len < 2 || len < 2 + opt[1]) - goto err; + put_u16(buf+0, 0); + put_u16(buf+2, la); - cl = opt[1]; + lr = bgp_encode_nlri(s, buck, buf+4+la, end); - switch (opt[0]) - { - case 2: /* Route refresh capability, RFC 2918 */ - if (cl != 0) - goto err; - conn->peer_refresh_support = 1; - break; + return buf+4+la+lr; +} - case 6: /* Extended message length capability, draft */ - if (cl != 0) - goto err; - conn->peer_ext_messages_support = 1; - break; +static byte * +bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) +{ + /* + * 2 B IPv4 Withdrawn Routes Length (zero) + * --- IPv4 Withdrawn Routes NLRI (unused) + * 2 B Total Path Attribute Length + * 1 B MP_REACH_NLRI hdr - Attribute Flags + * 1 B MP_REACH_NLRI hdr - Attribute Type Code + * 2 B MP_REACH_NLRI hdr - Length of Attribute Data + * 2 B MP_REACH_NLRI data - Address Family Identifier + * 1 B MP_REACH_NLRI data - Subsequent Address Family Identifier + * 1 B MP_REACH_NLRI data - Length of Next Hop Network Address + * var MP_REACH_NLRI data - Network Address of Next Hop + * 1 B MP_REACH_NLRI data - Reserved (zero) + * var MP_REACH_NLRI data - Network Layer Reachability Information + * var Rest of Path Attributes + * --- IPv4 Network Layer Reachability Information (unused) + */ + + int lh, lr, la; /* Lengths of next hop, NLRI and attributes */ + + /* Begin of MP_REACH_NLRI atribute */ + buf[4] = BAF_OPTIONAL | BAF_EXT_LEN; + buf[5] = BA_MP_REACH_NLRI; + put_u16(buf+6, 0); /* Will be fixed later */ + put_af3(buf+8, s->channel->afi); + byte *pos = buf+11; + + /* Encode attributes to temporary buffer */ + byte *abuf = alloca(MAX_ATTRS_LENGTH); + la = bgp_encode_attrs(s, buck->eattrs, abuf, abuf + MAX_ATTRS_LENGTH); + if (la < 0) + { + /* Attribute list too long */ + 
bgp_withdraw_bucket(s->channel, buck); + return NULL; + } - case 64: /* Graceful restart capability, RFC 4724 */ - if (cl % 4 != 2) - goto err; - conn->peer_gr_aware = 1; - conn->peer_gr_able = 0; - conn->peer_gr_time = get_u16(opt + 2) & 0x0fff; - conn->peer_gr_flags = opt[2] & 0xf0; - conn->peer_gr_aflags = 0; - for (i = 2; i < cl; i += 4) - if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */ - { - conn->peer_gr_able = 1; - conn->peer_gr_aflags = opt[2+i+3]; - } - break; + /* Encode the next hop */ + lh = bgp_encode_next_hop(s, s->mp_next_hop, pos+1); + *pos = lh; + pos += 1+lh; - case 65: /* AS4 capability, RFC 4893 */ - if (cl != 4) - goto err; - conn->peer_as4_support = 1; - if (conn->bgp->cf->enable_as4) - conn->advertised_as = get_u32(opt + 2); - break; + /* Reserved field */ + *pos++ = 0; - case 69: /* ADD-PATH capability, RFC 7911 */ - if (cl % 4) - goto err; - for (i = 0; i < cl; i += 4) - if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */ - conn->peer_add_path = opt[2+i+3]; - if (conn->peer_add_path > ADD_PATH_FULL) - goto err; - break; + /* Encode the NLRI */ + lr = bgp_encode_nlri(s, buck, pos, end - la); + pos += lr; - case 70: /* Enhanced route refresh capability, RFC 7313 */ - if (cl != 0) - goto err; - conn->peer_enhanced_refresh_support = 1; - break; + /* End of MP_REACH_NLRI atribute, update data length */ + put_u16(buf+6, pos-buf-8); - /* We can safely ignore all other capabilities */ - } - len -= 2 + cl; - opt += 2 + cl; - } - return; + /* Copy remaining attributes */ + memcpy(pos, abuf, la); + pos += la; - err: - bgp_error(conn, 2, 0, NULL, 0); - return; + /* Initial UPDATE fields */ + put_u16(buf+0, 0); + put_u16(buf+2, pos-buf-4); + + return pos; } -static int -bgp_parse_options(struct bgp_conn *conn, byte *opt, int len) +#undef MAX_ATTRS_LENGTH + +static byte * +bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - struct bgp_proto *p = conn->bgp; - int ol; + /* + * 2 B Withdrawn Routes Length + * var IPv4 Withdrawn Routes NLRI + * 2 B Total Path Attribute Length (zero) + * --- Path Attributes (unused) + * --- IPv4 Network Layer Reachability Information (unused) + */ - while (len > 0) - { - if (len < 2 || len < 2 + opt[1]) - { bgp_error(conn, 2, 0, NULL, 0); return 0; } -#ifdef LOCAL_DEBUG - { - int i; - DBG("\tOption %02x:", opt[0]); - for(i=0; i<opt[1]; i++) - DBG(" %02x", opt[2+i]); - DBG("\n"); - } -#endif + uint len = bgp_encode_nlri(s, buck, buf+2, end); - ol = opt[1]; - switch (opt[0]) - { - case 2: - if (conn->start_state == BSS_CONNECT_NOCAP) - BGP_TRACE(D_PACKETS, "Ignoring received capabilities"); - else - bgp_parse_capabilities(conn, opt + 2, ol); - break; + put_u16(buf+0, len); + put_u16(buf+2+len, 0); - default: - /* - * BGP specs don't tell us to send which option - * we didn't recognize, but it's common practice - * to do so. Also, capability negotiation with - * Cisco routers doesn't work without that. 
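/*
 * A standalone sketch of the fixed bytes laid out by bgp_create_mp_reach()
 * above for an IPv6 unicast UPDATE with a single 16-byte next hop. The flag
 * bits (0x80 optional, 0x10 extended length) and the MP_REACH_NLRI type
 * code 14 are the standard wire values (RFC 4760); offsets follow the layout
 * comment above. The real code uses BAF_OPTIONAL/BAF_EXT_LEN,
 * BA_MP_REACH_NLRI and put_af3() instead of the literals shown here.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint8_t buf[64] = { 0 };     /* buf[0..3]: withdrawn len (0) + path attr len */

  buf[4] = 0x80 | 0x10;        /* attribute flags: optional + extended length */
  buf[5] = 14;                 /* MP_REACH_NLRI type code */
  /* buf[6..7]: attribute data length, fixed once the NLRI size is known */
  buf[8] = 0; buf[9] = 2;      /* AFI 2 = IPv6 */
  buf[10] = 1;                 /* SAFI 1 = unicast */
  buf[11] = 16;                /* next hop length: one global IPv6 address */
  /* buf[12..27]: next hop, buf[28]: reserved zero, NLRI follows */

  printf("type %u, NLRI at offset %d\n", buf[5], 12 + 16 + 1);   /* 14, 29 */
  return 0;
}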
- */ - bgp_error(conn, 2, 4, opt, ol); - return 0; - } - len -= 2 + ol; - opt += 2 + ol; - } - return 0; + return buf+4+len; } -static void -bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) +static byte * +bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - struct bgp_conn *other; - struct bgp_proto *p = conn->bgp; - unsigned hold; - u16 base_as; - u32 id; + /* + * 2 B Withdrawn Routes Length (zero) + * --- IPv4 Withdrawn Routes NLRI (unused) + * 2 B Total Path Attribute Length + * 1 B MP_UNREACH_NLRI hdr - Attribute Flags + * 1 B MP_UNREACH_NLRI hdr - Attribute Type Code + * 2 B MP_UNREACH_NLRI hdr - Length of Attribute Data + * 2 B MP_UNREACH_NLRI data - Address Family Identifier + * 1 B MP_UNREACH_NLRI data - Subsequent Address Family Identifier + * var MP_UNREACH_NLRI data - Network Layer Reachability Information + * --- IPv4 Network Layer Reachability Information (unused) + */ + + uint len = bgp_encode_nlri(s, buck, buf+11, end); - /* Check state */ - if (conn->state != BS_OPENSENT) - { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } + put_u16(buf+0, 0); + put_u16(buf+2, 7+len); - /* Check message contents */ - if (len < 29 || len != 29U + pkt[28]) - { bgp_error(conn, 1, 2, pkt+16, 2); return; } - if (pkt[19] != BGP_VERSION) - { bgp_error(conn, 2, 1, pkt+19, 1); return; } /* RFC 1771 says 16 bits, draft-09 tells to use 8 */ - conn->advertised_as = base_as = get_u16(pkt+20); - hold = get_u16(pkt+22); - id = get_u32(pkt+24); - BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%08x)", conn->advertised_as, hold, id); + /* Begin of MP_UNREACH_NLRI atribute */ + buf[4] = BAF_OPTIONAL | BAF_EXT_LEN; + buf[5] = BA_MP_UNREACH_NLRI; + put_u16(buf+6, 3+len); + put_af3(buf+8, s->channel->afi); - if (bgp_parse_options(conn, pkt+29, pkt[28])) - return; + return buf+11+len; +} - if (hold > 0 && hold < 3) - { bgp_error(conn, 2, 6, pkt+22, 2); return; } +static byte * +bgp_create_update(struct bgp_channel *c, byte *buf) +{ + struct bgp_proto *p = (void *) c->c.proto; + struct bgp_bucket *buck; + byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH); + byte *res = NULL; + +again: ; + + /* Initialize write state */ + struct bgp_write_state s = { + .proto = p, + .channel = c, + .pool = bgp_linpool, + .as4_session = p->as4_session, + .add_path = c->add_path_tx, + .mpls = c->desc->mpls, + }; + + /* Try unreachable bucket */ + if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) + { + res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ? + bgp_create_ip_unreach(&s, buck, buf, end): + bgp_create_mp_unreach(&s, buck, buf, end); - /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */ - if (!id || (p->is_internal && id == p->local_id)) - { bgp_error(conn, 2, 3, pkt+24, -4); return; } + goto done; + } - if ((conn->advertised_as != base_as) && (base_as != AS_TRANS)) - log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name); + /* Try reachable buckets */ + if (!EMPTY_LIST(c->bucket_queue)) + { + buck = HEAD(c->bucket_queue); - if (conn->advertised_as != p->remote_as) + /* Cleanup empty buckets */ + if (EMPTY_LIST(buck->prefixes)) { - if (conn->peer_as4_support) - { - u32 val = htonl(conn->advertised_as); - bgp_error(conn, 2, 2, (byte *) &val, 4); - } - else - bgp_error(conn, 2, 2, pkt+20, 2); - - return; + bgp_free_bucket(c, buck); + goto again; } - /* Check the other connection */ - other = (conn == &p->outgoing_conn) ? 
&p->incoming_conn : &p->outgoing_conn; - switch (other->state) - { - case BS_CONNECT: - case BS_ACTIVE: - /* Stop outgoing connection attempts */ - bgp_conn_enter_idle_state(other); - break; + res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ? + bgp_create_ip_reach(&s, buck, buf, end): + bgp_create_mp_reach(&s, buck, buf, end); - case BS_IDLE: - case BS_OPENSENT: - case BS_CLOSE: - break; + if (EMPTY_LIST(buck->prefixes)) + bgp_free_bucket(c, buck); + else + bgp_defer_bucket(c, buck); - case BS_OPENCONFIRM: - /* - * Description of collision detection rules in RFC 4271 is confusing and - * contradictory, but it is essentially: - * - * 1. Router with higher ID is dominant - * 2. If both have the same ID, router with higher ASN is dominant [RFC6286] - * 3. When both connections are in OpenConfirm state, one initiated by - * the dominant router is kept. - * - * The first line in the expression below evaluates whether the neighbor - * is dominant, the second line whether the new connection was initiated - * by the neighbor. If both are true (or both are false), we keep the new - * connection, otherwise we keep the old one. - */ - if (((p->local_id < id) || ((p->local_id == id) && (p->local_as < p->remote_as))) - == (conn == &p->incoming_conn)) - { - /* Should close the other connection */ - BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection"); - bgp_error(other, 6, 7, NULL, 0); - break; - } - /* Fall thru */ - case BS_ESTABLISHED: - /* Should close this connection */ - BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection"); - bgp_error(conn, 6, 7, NULL, 0); - return; - default: - bug("bgp_rx_open: Unknown state"); - } + if (!res) + goto again; - /* Update our local variables */ - conn->hold_time = MIN(hold, p->cf->hold_time); - conn->keepalive_time = p->cf->keepalive_time ? 
: conn->hold_time / 3; - p->remote_id = id; - p->as4_session = p->cf->enable_as4 && conn->peer_as4_support; - p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX); - p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX); - p->gr_ready = p->cf->gr_mode && conn->peer_gr_able; - p->ext_messages = p->cf->enable_extended_messages && conn->peer_ext_messages_support; - - /* Update RA mode */ - if (p->add_path_tx) - p->p.accept_ra_types = RA_ANY; - else if (p->cf->secondary) - p->p.accept_ra_types = RA_ACCEPTED; - else - p->p.accept_ra_types = RA_OPTIMAL; + goto done; + } - DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, p->as4_session); + /* No more prefixes to send */ + return NULL; - bgp_schedule_packet(conn, PKT_KEEPALIVE); - bgp_start_timer(conn->hold_timer, conn->hold_time); - bgp_conn_enter_openconfirm_state(conn); +done: + BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE"); + lp_flush(s.pool); + + return res; } +static byte * +bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf) +{ + /* Empty update packet */ + put_u32(buf, 0); -static inline void -bgp_rx_end_mark(struct bgp_proto *p) + return buf+4; +} + +static byte * +bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf) { - BGP_TRACE(D_PACKETS, "Got END-OF-RIB"); + put_u16(buf+0, 0); + put_u16(buf+2, 6); /* length 4--9 */ - if (p->load_state == BFS_LOADING) - p->load_state = BFS_NONE; + /* Empty MP_UNREACH_NLRI atribute */ + buf[4] = BAF_OPTIONAL; + buf[5] = BA_MP_UNREACH_NLRI; + buf[6] = 3; /* Length 7--9 */ + put_af3(buf+7, c->afi); - if (p->p.gr_recovery) - proto_graceful_restart_unlock(&p->p); - - if (p->gr_active) - bgp_graceful_restart_done(p); -} - - -#define DECODE_PREFIX(pp, ll) do { \ - if (p->add_path_rx) \ - { \ - if (ll < 5) { err=1; goto done; } \ - path_id = get_u32(pp); \ - pp += 4; \ - ll -= 4; \ - } \ - int b = *pp++; \ - int q; \ - ll--; \ - if (b > BITS_PER_IP_ADDRESS) { err=10; goto done; } \ - q = (b+7) / 8; \ - if (ll < q) { err=1; goto done; } \ - memcpy(&prefix, pp, q); \ - pp += q; \ - ll -= q; \ - ipa_ntoh(prefix); \ - prefix = ipa_and(prefix, ipa_mkmask(b)); \ - pxlen = b; \ -} while (0) + return buf+10; +} +static byte * +bgp_create_end_mark(struct bgp_channel *c, byte *buf) +{ + struct bgp_proto *p = (void *) c->c.proto; + + BGP_TRACE(D_PACKETS, "Sending END-OF-RIB"); + + return (c->afi == BGP_AF_IPV4) ? 
+ bgp_create_ip_end_mark(c, buf): + bgp_create_mp_end_mark(c, buf); +} static inline void -bgp_rte_update(struct bgp_proto *p, ip_addr prefix, int pxlen, - u32 path_id, u32 *last_id, struct rte_src **src, - rta *a0, rta **a) +bgp_rx_end_mark(struct bgp_parse_state *s, u32 afi) { - if (path_id != *last_id) - { - *src = rt_get_source(&p->p, path_id); - *last_id = path_id; + struct bgp_proto *p = s->proto; + struct bgp_channel *c = bgp_get_channel(p, afi); - if (*a) - { - rta_free(*a); - *a = NULL; - } - } + BGP_TRACE(D_PACKETS, "Got END-OF-RIB"); - /* Prepare cached route attributes */ - if (!*a) - { - a0->src = *src; + if (!c) + DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi)); - /* Workaround for rta_lookup() breaking eattrs */ - ea_list *ea = a0->eattrs; - *a = rta_lookup(a0); - a0->eattrs = ea; - } + if (c->load_state == BFS_LOADING) + c->load_state = BFS_NONE; - net *n = net_get(p->p.table, prefix, pxlen); - rte *e = rte_get_temp(rta_clone(*a)); - e->net = n; - e->pflags = 0; - e->u.bgp.suppressed = 0; - rte_update2(p->p.main_ahook, n, e, *src); + if (p->p.gr_recovery) + channel_graceful_restart_unlock(&c->c); + + if (c->gr_active) + bgp_graceful_restart_done(c); } static inline void -bgp_rte_withdraw(struct bgp_proto *p, ip_addr prefix, int pxlen, - u32 path_id, u32 *last_id, struct rte_src **src) +bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len) { - if (path_id != *last_id) - { - *src = rt_find_source(&p->p, path_id); - *last_id = path_id; - } + struct bgp_channel *c = bgp_get_channel(s->proto, afi); + rta *a = NULL; - net *n = net_find(p->p.table, prefix, pxlen); - rte_update2( p->p.main_ahook, n, NULL, *src); -} + if (!c) + DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi)); -static inline int -bgp_set_next_hop(struct bgp_proto *p, rta *a) -{ - struct eattr *nh = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP)); - ip_addr *nexthop = (ip_addr *) nh->u.ptr->data; + s->channel = c; + s->add_path = c->add_path_rx; + s->mpls = c->desc->mpls; -#ifdef IPV6 - int second = (nh->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(nexthop[1]); + s->last_id = 0; + s->last_src = s->proto->p.main_source; - /* First address should not be link-local, but may be zero in direct mode */ - if (ipa_is_link_local(*nexthop)) - *nexthop = IPA_NONE; -#else - int second = 0; -#endif - - if (p->cf->gw_mode == GW_DIRECT) - { - neighbor *ng = NULL; + /* + * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not + * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for + * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by + * decode_next_hop hooks) by restoring a->eattrs afterwards. 
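/*
 * A minimal sketch of how an UPDATE body splits into the sections parsed by
 * bgp_rx_update() below: 2 B withdrawn-routes length, that many bytes of
 * withdrawn NLRI, 2 B path attribute length, the attributes, and whatever
 * remains as reachable NLRI. An all-zero 4-byte body (a 23-byte message) is
 * exactly the IPv4 End-of-RIB marker recognized there. get_u16_std() is a
 * stand-in for BIRD's get_u16().
 */
#include <stdint.h>
#include <stdio.h>

static unsigned get_u16_std(const uint8_t *p) { return (p[0] << 8) | p[1]; }

int main(void)
{
  uint8_t body[4] = { 0, 0, 0, 0 };
  unsigned len = sizeof(body);

  unsigned withdrawn_len = get_u16_std(body);
  unsigned attr_len = get_u16_std(body + 2 + withdrawn_len);
  unsigned nlri_len = len - 4 - withdrawn_len - attr_len;

  printf("withdrawn=%u attrs=%u nlri=%u\n", withdrawn_len, attr_len, nlri_len);
  return 0;
}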
+ */ - if (ipa_nonzero(*nexthop)) - ng = neigh_find(&p->p, nexthop, 0); - else if (second) /* GW_DIRECT -> single_hop -> p->neigh != NULL */ - ng = neigh_find2(&p->p, nexthop + 1, p->neigh->iface, 0); + if (ea) + { + a = allocz(RTA_MAX_SIZE); - /* Fallback */ - if (!ng) - ng = p->neigh; + a->source = RTS_BGP; + a->scope = SCOPE_UNIVERSE; + a->from = s->proto->cf->remote_ip; + a->eattrs = ea; - if (ng->scope == SCOPE_HOST) - return 0; + c->desc->decode_next_hop(s, nh, nh_len, a); - a->dest = RTD_ROUTER; - a->gw = ng->addr; - a->iface = ng->iface; - a->hostentry = NULL; - a->igp_metric = 0; - } - else /* GW_RECURSIVE */ - { - if (ipa_zero(*nexthop)) - return 0; + /* Handle withdraw during next hop decoding */ + if (s->err_withdraw) + a = NULL; + } - rta_set_recursive_next_hop(p->p.table, a, p->igp_table, nexthop, nexthop + second); - } + c->desc->decode_nlri(s, nlri, len, a); - return 1; + rta_free(s->cached_rta); + s->cached_rta = NULL; } -#ifndef IPV6 /* IPv4 version */ - static void -bgp_do_rx_update(struct bgp_conn *conn, - byte *withdrawn, int withdrawn_len, - byte *nlri, int nlri_len, - byte *attrs, int attr_len) +bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) { struct bgp_proto *p = conn->bgp; - struct rte_src *src = p->p.main_source; - rta *a0, *a = NULL; - ip_addr prefix; - int pxlen, err = 0; - u32 path_id = 0; - u32 last_id = 0; + ea_list *ea = NULL; - /* Check for End-of-RIB marker */ - if (!withdrawn_len && !attr_len && !nlri_len) - { - bgp_rx_end_mark(p); - return; - } + BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE"); - /* Withdraw routes */ - while (withdrawn_len) - { - DECODE_PREFIX(withdrawn, withdrawn_len); - DBG("Withdraw %I/%d\n", prefix, pxlen); + /* Workaround for some BGP implementations that skip initial KEEPALIVE */ + if (conn->state == BS_OPENCONFIRM) + bgp_conn_enter_established_state(conn); - bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); - } + if (conn->state != BS_ESTABLISHED) + { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - if (!attr_len && !nlri_len) /* shortcut */ - return; + bgp_start_timer(conn->hold_timer, conn->hold_time); - a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, nlri_len); + /* Initialize parse state */ + struct bgp_parse_state s = { + .proto = p, + .pool = bgp_linpool, + .as4_session = p->as4_session, + }; - if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */ - return; + /* Parse error handler */ + if (setjmp(s.err_jmpbuf)) + { + bgp_error(conn, 3, s.err_subcode, NULL, 0); + goto done; + } - if (a0 && nlri_len && !bgp_set_next_hop(p, a0)) - a0 = NULL; + /* Check minimal length */ + if (len < 23) + { bgp_error(conn, 1, 2, pkt+16, 2); return; } - last_id = 0; - src = p->p.main_source; + /* Skip fixed header */ + uint pos = 19; - while (nlri_len) - { - DECODE_PREFIX(nlri, nlri_len); - DBG("Add %I/%d\n", prefix, pxlen); + /* + * UPDATE message format + * + * 2 B IPv4 Withdrawn Routes Length + * var IPv4 Withdrawn Routes NLRI + * 2 B Total Path Attribute Length + * var Path Attributes + * var IPv4 Reachable Routes NLRI + */ - if (a0) - bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a); - else /* Forced withdraw as a result of soft error */ - bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); - } + s.ip_unreach_len = get_u16(pkt + pos); + s.ip_unreach_nlri = pkt + pos + 2; + pos += 2 + s.ip_unreach_len; - done: - if (a) - rta_free(a); + if (pos + 2 > len) + bgp_parse_error(&s, 1); - if (err) - bgp_error(conn, 3, err, NULL, 0); + s.attr_len = 
get_u16(pkt + pos); + s.attrs = pkt + pos + 2; + pos += 2 + s.attr_len; + + if (pos > len) + bgp_parse_error(&s, 1); + s.ip_reach_len = len - pos; + s.ip_reach_nlri = pkt + pos; + + + if (s.attr_len) + ea = bgp_decode_attrs(&s, s.attrs, s.attr_len); + + /* Check for End-of-RIB marker */ + if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len) + { bgp_rx_end_mark(&s, BGP_AF_IPV4); goto done; } + + /* Check for MP End-of-RIB marker */ + if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len && + !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af) + { bgp_rx_end_mark(&s, s.mp_unreach_af); goto done; } + + if (s.ip_unreach_len) + bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0); + + if (s.mp_unreach_len) + bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0); + + if (s.ip_reach_len) + bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len, + ea, s.ip_next_hop_data, s.ip_next_hop_len); + + if (s.mp_reach_len) + bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len, + ea, s.mp_next_hop_data, s.mp_next_hop_len); + +done: + rta_free(s.cached_rta); + lp_flush(s.pool); return; } -#else /* IPv6 version */ -#define DO_NLRI(name) \ - x = p->name##_start; \ - len = len0 = p->name##_len; \ - if (len) \ - { \ - if (len < 3) { err=9; goto done; } \ - af = get_u16(x); \ - x += 3; \ - len -= 3; \ - DBG("\tNLRI AF=%d sub=%d len=%d\n", af, x[-1], len);\ - } \ - else \ - af = 0; \ - if (af == BGP_AF_IPV6) +/* + * ROUTE-REFRESH + */ -static void -bgp_attach_next_hop(rta *a0, byte *x) +static inline byte * +bgp_create_route_refresh(struct bgp_channel *c, byte *buf) { - ip_addr *nh = (ip_addr *) bgp_attach_attr_wa(&a0->eattrs, bgp_linpool, BA_NEXT_HOP, NEXT_HOP_LENGTH); - memcpy(nh, x+1, 16); - ipa_ntoh(nh[0]); + struct bgp_proto *p = (void *) c->c.proto; - /* We store received link local address in the other part of BA_NEXT_HOP eattr. 
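/*
 * A sketch of the 4-byte ROUTE-REFRESH body built by
 * bgp_create_route_refresh() just below: 2 B AFI, 1 B subtype (the field
 * RFC 2918 left reserved and RFC 7313 reuses for request/BoRR/EoRR),
 * 1 B SAFI. The subtype values 0/1/2 are the RFC 7313 codes; the real code
 * uses put_af4() and the BGP_RR_* constants rather than literals.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint8_t body[4];

  body[0] = 0; body[1] = 2;    /* AFI 2 = IPv6 */
  body[2] = 0;                 /* 0 = request, 1 = begin-of-RR, 2 = end-of-RR */
  body[3] = 1;                 /* SAFI 1 = unicast */

  printf("%02x %02x %02x %02x\n", body[0], body[1], body[2], body[3]);
  return 0;
}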
*/ - if (*x == 32) - { - memcpy(nh+1, x+17, 16); - ipa_ntoh(nh[1]); - } - else - nh[1] = IPA_NONE; + BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH"); + + /* Original route refresh request, RFC 2918 */ + put_af4(buf, c->afi); + buf[2] = BGP_RR_REQUEST; + + return buf+4; +} + +static inline byte * +bgp_create_begin_refresh(struct bgp_channel *c, byte *buf) +{ + struct bgp_proto *p = (void *) c->c.proto; + + BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR"); + + /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */ + put_af4(buf, c->afi); + buf[2] = BGP_RR_BEGIN; + + return buf+4; } +static inline byte * +bgp_create_end_refresh(struct bgp_channel *c, byte *buf) +{ + struct bgp_proto *p = (void *) c->c.proto; + + BGP_TRACE(D_PACKETS, "Sending END-OF-RR"); + + /* Demarcation of ending of route refresh (EoRR), RFC 7313 */ + put_af4(buf, c->afi); + buf[2] = BGP_RR_END; + + return buf+4; +} static void -bgp_do_rx_update(struct bgp_conn *conn, - byte *withdrawn UNUSED, int withdrawn_len, - byte *nlri UNUSED, int nlri_len, - byte *attrs, int attr_len) +bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len) { struct bgp_proto *p = conn->bgp; - struct rte_src *src = p->p.main_source; - byte *x; - int len, len0; - unsigned af; - rta *a0, *a = NULL; - ip_addr prefix; - int pxlen, err = 0; - u32 path_id = 0; - u32 last_id = 0; - - p->mp_reach_len = 0; - p->mp_unreach_len = 0; - a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, 0); - - if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */ - return; - /* Check for End-of-RIB marker */ - if ((attr_len < 8) && !withdrawn_len && !nlri_len && !p->mp_reach_len && - (p->mp_unreach_len == 3) && (get_u16(p->mp_unreach_start) == BGP_AF_IPV6)) - { - bgp_rx_end_mark(p); - return; - } + if (conn->state != BS_ESTABLISHED) + { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - DO_NLRI(mp_unreach) - { - while (len) - { - DECODE_PREFIX(x, len); - DBG("Withdraw %I/%d\n", prefix, pxlen); - bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); - } - } + if (!conn->local_caps->route_refresh) + { bgp_error(conn, 1, 3, pkt+18, 1); return; } - DO_NLRI(mp_reach) - { - /* Create fake NEXT_HOP attribute */ - if (len < 1 || (*x != 16 && *x != 32) || len < *x + 2) - { err = 9; goto done; } + if (len < (BGP_HEADER_LENGTH + 4)) + { bgp_error(conn, 1, 2, pkt+16, 2); return; } - if (a0) - bgp_attach_next_hop(a0, x); + if (len > (BGP_HEADER_LENGTH + 4)) + { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; } - /* Also ignore one reserved byte */ - len -= *x + 2; - x += *x + 2; + struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19)); + if (!c) + { + log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring", + p->p.name, pkt[21], get_u16(pkt+19), pkt[22]); + return; + } - if (a0 && ! bgp_set_next_hop(p, a0)) - a0 = NULL; + /* RFC 7313 redefined reserved field as RR message subtype */ + uint subtype = p->enhanced_refresh ? 
pkt[21] : BGP_RR_REQUEST; - last_id = 0; - src = p->p.main_source; + switch (subtype) + { + case BGP_RR_REQUEST: + BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH"); + channel_request_feeding(&c->c); + break; - while (len) - { - DECODE_PREFIX(x, len); - DBG("Add %I/%d\n", prefix, pxlen); + case BGP_RR_BEGIN: + BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR"); + bgp_refresh_begin(c); + break; - if (a0) - bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a); - else /* Forced withdraw as a result of soft error */ - bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); - } - } + case BGP_RR_END: + BGP_TRACE(D_PACKETS, "Got END-OF-RR"); + bgp_refresh_end(c); + break; - done: - if (a) - rta_free(a); + default: + log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring", + p->p.name, subtype); + break; + } +} + +static inline struct bgp_channel * +bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn) +{ + uint i = conn->last_channel; - if (err) /* Use subcode 9, not err */ - bgp_error(conn, 3, 9, NULL, 0); + /* Try the last channel, but at most several times */ + if ((conn->channels_to_send & (1 << i)) && + (conn->last_channel_count < 16)) + goto found; - return; + /* Find channel with non-zero channels_to_send */ + do + { + i++; + if (i >= p->channel_count) + i = 0; + } + while (! (conn->channels_to_send & (1 << i))); + + /* Use that channel */ + conn->last_channel = i; + conn->last_channel_count = 0; + +found: + conn->last_channel_count++; + return p->channel_map[i]; } -#endif +static inline int +bgp_send(struct bgp_conn *conn, uint type, uint len) +{ + sock *sk = conn->sk; + byte *buf = sk->tbuf; -static void -bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) + memset(buf, 0xff, 16); /* Marker */ + put_u16(buf+16, len); + buf[18] = type; + + return sk_send(sk, len); +} + +/** + * bgp_fire_tx - transmit packets + * @conn: connection + * + * Whenever the transmit buffers of the underlying TCP connection + * are free and we have any packets queued for sending, the socket functions + * call bgp_fire_tx() which takes care of selecting the highest priority packet + * queued (Notification > Keepalive > Open > Update), assembling its header + * and body and sending it to the connection. 
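/*
 * A standalone sketch of the 19-byte message header filled in by bgp_send()
 * above: a 16-byte all-ones marker, a 2-byte total length (header included)
 * and a 1-byte type. A KEEPALIVE is just this header -- length 19, type 4
 * per the RFC 4271 type codes.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
  uint8_t hdr[19];

  memset(hdr, 0xff, 16);          /* marker */
  hdr[16] = 0; hdr[17] = 19;      /* total message length */
  hdr[18] = 4;                    /* type 4 = KEEPALIVE */

  printf("len=%u type=%u\n", (hdr[16] << 8) | hdr[17], hdr[18]);
  return 0;
}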
+ */ +static int +bgp_fire_tx(struct bgp_conn *conn) { struct bgp_proto *p = conn->bgp; - byte *withdrawn, *attrs, *nlri; - uint withdrawn_len, attr_len, nlri_len; + struct bgp_channel *c; + byte *buf, *pkt, *end; + uint s; - BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE"); + if (!conn->sk) + return 0; - /* Workaround for some BGP implementations that skip initial KEEPALIVE */ - if (conn->state == BS_OPENCONFIRM) - bgp_conn_enter_established_state(conn); + buf = conn->sk->tbuf; + pkt = buf + BGP_HEADER_LENGTH; + s = conn->packets_to_send; - if (conn->state != BS_ESTABLISHED) - { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - bgp_start_timer(conn->hold_timer, conn->hold_time); + if (s & (1 << PKT_SCHEDULE_CLOSE)) + { + /* We can finally close connection and enter idle state */ + bgp_conn_enter_idle_state(conn); + return 0; + } + if (s & (1 << PKT_NOTIFICATION)) + { + conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE; + end = bgp_create_notification(conn, pkt); + return bgp_send(conn, PKT_NOTIFICATION, end - buf); + } + else if (s & (1 << PKT_KEEPALIVE)) + { + conn->packets_to_send &= ~(1 << PKT_KEEPALIVE); + BGP_TRACE(D_PACKETS, "Sending KEEPALIVE"); + bgp_start_timer(conn->keepalive_timer, conn->keepalive_time); + return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH); + } + else if (s & (1 << PKT_OPEN)) + { + conn->packets_to_send &= ~(1 << PKT_OPEN); + end = bgp_create_open(conn, pkt); + return bgp_send(conn, PKT_OPEN, end - buf); + } + else while (conn->channels_to_send) + { + c = bgp_get_channel_to_send(p, conn); + s = c->packets_to_send; - /* Find parts of the packet and check sizes */ - if (len < 23) + if (s & (1 << PKT_ROUTE_REFRESH)) { - bgp_error(conn, 1, 2, pkt+16, 2); - return; + c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH); + end = bgp_create_route_refresh(c, pkt); + return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf); } - withdrawn = pkt + 21; - withdrawn_len = get_u16(pkt + 19); - if (withdrawn_len + 23 > len) - goto malformed; - attrs = withdrawn + withdrawn_len + 2; - attr_len = get_u16(attrs - 2); - if (withdrawn_len + attr_len + 23 > len) - goto malformed; - nlri = attrs + attr_len; - nlri_len = len - withdrawn_len - attr_len - 23; - if (!attr_len && nlri_len) - goto malformed; - DBG("Sizes: withdrawn=%d, attrs=%d, NLRI=%d\n", withdrawn_len, attr_len, nlri_len); - - lp_flush(bgp_linpool); - - bgp_do_rx_update(conn, withdrawn, withdrawn_len, nlri, nlri_len, attrs, attr_len); - return; + else if (s & (1 << PKT_BEGIN_REFRESH)) + { + /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */ + c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH); + end = bgp_create_begin_refresh(c, pkt); + return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf); + } + else if (s & (1 << PKT_UPDATE)) + { + end = bgp_create_update(c, pkt); + if (end) + return bgp_send(conn, PKT_UPDATE, end - buf); + + /* No update to send, perhaps we need to send End-of-RIB or EoRR */ + c->packets_to_send = 0; + conn->channels_to_send &= ~(1 << c->index); + + if (c->feed_state == BFS_LOADED) + { + c->feed_state = BFS_NONE; + end = bgp_create_end_mark(c, pkt); + return bgp_send(conn, PKT_UPDATE, end - buf); + } + + else if (c->feed_state == BFS_REFRESHED) + { + c->feed_state = BFS_NONE; + end = bgp_create_end_refresh(c, pkt); + return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf); + } + } + else if (s) + bug("Channel packets_to_send: %x", s); + + c->packets_to_send = 0; + conn->channels_to_send &= ~(1 << c->index); + } + + return 0; +} + +/** + * bgp_schedule_packet - schedule 
a packet for transmission + * @conn: connection + * @c: channel + * @type: packet type + * + * Schedule a packet of type @type to be sent as soon as possible. + */ +void +bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) +{ + ASSERT(conn->sk); + + DBG("BGP: Scheduling packet type %d\n", type); + + if (c) + { + if (! conn->channels_to_send) + { + conn->last_channel = c->index; + conn->last_channel_count = 0; + } + + c->packets_to_send |= 1 << type; + conn->channels_to_send |= 1 << c->index; + } + else + conn->packets_to_send |= 1 << type; + + if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev)) + ev_schedule(conn->tx_ev); +} + +void +bgp_kick_tx(void *vconn) +{ + struct bgp_conn *conn = vconn; -malformed: - bgp_error(conn, 3, 1, NULL, 0); + DBG("BGP: kicking TX\n"); + while (bgp_fire_tx(conn) > 0) + ; +} + +void +bgp_tx(sock *sk) +{ + struct bgp_conn *conn = sk->data; + + DBG("BGP: TX hook\n"); + while (bgp_fire_tx(conn) > 0) + ; } + static struct { byte major, minor; byte *msg; @@ -1480,26 +2665,25 @@ static struct { * which might be static string or given temporary buffer. */ const char * -bgp_error_dsc(unsigned code, unsigned subcode) +bgp_error_dsc(uint code, uint subcode) { static char buff[32]; - unsigned i; + uint i; + for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++) if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode) - { - return bgp_msg_table[i].msg; - } + return bgp_msg_table[i].msg; - bsprintf(buff, "Unknown error %d.%d", code, subcode); + bsprintf(buff, "Unknown error %u.%u", code, subcode); return buff; } void -bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len) +bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len) { const byte *name; byte *t, argbuf[36]; - unsigned i; + uint i; /* Don't report Cease messages generated by myself */ if (code == 6 && class == BE_BGP_TX) @@ -1515,7 +2699,7 @@ bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4))) { /* Bad peer AS - we would like to print the AS */ - t += bsprintf(t, "%d", (len == 2) ? get_u16(data) : get_u32(data)); + t += bsprintf(t, "%u", (len == 2) ? get_u16(data) : get_u32(data)); goto done; } if (len > 16) @@ -1532,47 +2716,25 @@ static void bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len) { struct bgp_proto *p = conn->bgp; + if (len < 21) - { - bgp_error(conn, 1, 2, pkt+16, 2); - return; - } + { bgp_error(conn, 1, 2, pkt+16, 2); return; } - unsigned code = pkt[19]; - unsigned subcode = pkt[20]; + uint code = pkt[19]; + uint subcode = pkt[20]; int err = (code != 6); bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21); bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode); -#ifndef IPV6 - if ((code == 2) && ((subcode == 4) || (subcode == 7)) - /* Error related to capability: - * 4 - Peer does not support capabilities at all. - * 7 - Peer request some capability. Strange unless it is IPv6 only peer. 
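/*
 * A small sketch of how bgp_rx_notification() above reads a NOTIFICATION:
 * the error code and subcode directly follow the 19-byte header (pkt[19],
 * pkt[20]), and only non-Cease codes (code != 6) take the error/startup
 * delay path. Code 6, subcode 2 shown here is Administrative Shutdown
 * (RFC 4486).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
  uint8_t pkt[21] = { 0 };
  pkt[19] = 6;                     /* error code 6 = Cease */
  pkt[20] = 2;                     /* subcode 2 = Administrative Shutdown */

  unsigned code = pkt[19], subcode = pkt[20];
  int err = (code != 6);           /* Cease is logged but not treated as an error */

  printf("NOTIFICATION %u.%u, err=%d\n", code, subcode, err);
  return 0;
}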
- */ - && (p->cf->capabilities == 2) - /* Capabilities are not explicitly enabled or disabled, therefore heuristic is used */ - && (conn->start_state == BSS_CONNECT) - /* Failed connection attempt have used capabilities */ - && (p->cf->remote_as <= 0xFFFF)) - /* Not possible with disabled capabilities */ - { - /* We try connect without capabilities */ - log(L_WARN "%s: Capability related error received, retry with capabilities disabled", p->p.name); - p->start_state = BSS_CONNECT_NOCAP; - err = 0; - } -#endif - bgp_conn_enter_close_state(conn); - bgp_schedule_packet(conn, PKT_SCHEDULE_CLOSE); + bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE); - if (err) - { - bgp_update_startup_delay(p); - bgp_stop(p, 0); - } + if (err) + { + bgp_update_startup_delay(p); + bgp_stop(p, 0); + } } static void @@ -1582,64 +2744,12 @@ bgp_rx_keepalive(struct bgp_conn *conn) BGP_TRACE(D_PACKETS, "Got KEEPALIVE"); bgp_start_timer(conn->hold_timer, conn->hold_time); - switch (conn->state) - { - case BS_OPENCONFIRM: - bgp_conn_enter_established_state(conn); - break; - case BS_ESTABLISHED: - break; - default: - bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); - } -} -static void -bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len) -{ - struct bgp_proto *p = conn->bgp; + if (conn->state == BS_OPENCONFIRM) + { bgp_conn_enter_established_state(conn); return; } if (conn->state != BS_ESTABLISHED) - { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - - if (!p->cf->enable_refresh) - { bgp_error(conn, 1, 3, pkt+18, 1); return; } - - if (len < (BGP_HEADER_LENGTH + 4)) - { bgp_error(conn, 1, 2, pkt+16, 2); return; } - - if (len > (BGP_HEADER_LENGTH + 4)) - { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; } - - /* FIXME - we ignore AFI/SAFI values, as we support - just one value and even an error code for an invalid - request is not defined */ - - /* RFC 7313 redefined reserved field as RR message subtype */ - uint subtype = conn->peer_enhanced_refresh_support ? pkt[21] : BGP_RR_REQUEST; - - switch (subtype) - { - case BGP_RR_REQUEST: - BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH"); - proto_request_feeding(&p->p); - break; - - case BGP_RR_BEGIN: - BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR"); - bgp_refresh_begin(p); - break; - - case BGP_RR_END: - BGP_TRACE(D_PACKETS, "Got END-OF-RR"); - bgp_refresh_end(p); - break; - - default: - log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring", - p->p.name, subtype); - break; - } + bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); } @@ -1653,7 +2763,7 @@ bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len) * packet handler according to the packet type. 
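/*
 * A standalone sketch of the RFC 4271 header checks that guard the receive
 * path (cf. the length test in bgp_rx() below): a 16-byte all-ones marker
 * and a total length between 19 and the negotiated maximum -- 4096 by
 * default, larger only when the extended-message capability was agreed.
 * bgp_header_ok() is illustrative, not a BIRD function.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int bgp_header_ok(const uint8_t *pkt, unsigned max_len)
{
  for (int i = 0; i < 16; i++)
    if (pkt[i] != 0xff)
      return 0;                               /* connection not synchronized */

  unsigned len = (pkt[16] << 8) | pkt[17];
  return (len >= 19) && (len <= max_len);     /* otherwise: bad message length */
}

int main(void)
{
  uint8_t hdr[19];
  memset(hdr, 0xff, 16);
  hdr[16] = 0; hdr[17] = 19; hdr[18] = 4;     /* a KEEPALIVE */

  printf("%d\n", bgp_header_ok(hdr, 4096));   /* prints 1 */
  return 0;
}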
*/ static void -bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len) +bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len) { byte type = pkt[18]; @@ -1663,14 +2773,14 @@ bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len) mrt_dump_bgp_packet(conn, pkt, len); switch (type) - { - case PKT_OPEN: return bgp_rx_open(conn, pkt, len); - case PKT_UPDATE: return bgp_rx_update(conn, pkt, len); - case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len); - case PKT_KEEPALIVE: return bgp_rx_keepalive(conn); - case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len); - default: bgp_error(conn, 1, 3, pkt+18, 1); - } + { + case PKT_OPEN: return bgp_rx_open(conn, pkt, len); + case PKT_UPDATE: return bgp_rx_update(conn, pkt, len); + case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len); + case PKT_KEEPALIVE: return bgp_rx_keepalive(conn); + case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len); + default: bgp_error(conn, 1, 3, pkt+18, 1); + } } /** @@ -1687,10 +2797,9 @@ int bgp_rx(sock *sk, uint size) { struct bgp_conn *conn = sk->data; - struct bgp_proto *p = conn->bgp; byte *pkt_start = sk->rbuf; byte *end = pkt_start + size; - unsigned i, len; + uint i, len; DBG("BGP: RX hook: Got %d bytes\n", size); while (end >= pkt_start + BGP_HEADER_LENGTH) @@ -1704,7 +2813,7 @@ bgp_rx(sock *sk, uint size) break; } len = get_u16(pkt_start+16); - if (len < BGP_HEADER_LENGTH || len > bgp_max_packet_length(p)) + if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn))) { bgp_error(conn, 1, 2, pkt_start+16, 2); break; diff --git a/proto/ospf/Makefile b/proto/ospf/Makefile index f90222cf..39e74f71 100644 --- a/proto/ospf/Makefile +++ b/proto/ospf/Makefile @@ -1,5 +1,6 @@ -source=ospf.c topology.c packet.c hello.c neighbor.c iface.c dbdes.c lsreq.c lsupd.c lsack.c lsalib.c rt.c -root-rel=../../ -dir-name=proto/ospf +src := dbdes.c hello.c iface.c lsack.c lsalib.c lsreq.c lsupd.c neighbor.c ospf.c packet.c rt.c topology.c +obj := $(src-o-files) +$(all-daemon) +$(cf-local) -include ../../Rules +tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file diff --git a/proto/ospf/config.Y b/proto/ospf/config.Y index 7b35b191..98ddf5d0 100644 --- a/proto/ospf/config.Y +++ b/proto/ospf/config.Y @@ -78,14 +78,66 @@ static void ospf_proto_finish(void) { struct ospf_config *cf = OSPF_CFG; + struct ospf_area_config *ac; + struct ospf_iface_patt *ic; + + /* Define default channel */ + if (EMPTY_LIST(this_proto->channels)) + { + this_proto->net_type = ospf_cfg_is_v2() ? NET_IP4 : NET_IP6; + channel_config_new(NULL, this_proto->net_type, this_proto); + } + + /* Propagate global instance ID to interfaces */ + if (cf->instance_id_set) + { + WALK_LIST(ac, cf->area_list) + WALK_LIST(ic, ac->patt_list) + if (!ic->instance_id_set) + { ic->instance_id = cf->instance_id; ic->instance_id_set = 1; } + + WALK_LIST(ic, cf->vlink_list) + if (!ic->instance_id_set) + { ic->instance_id = cf->instance_id; ic->instance_id_set = 1; } + } + + if (ospf_cfg_is_v3()) + { + uint ipv4 = (this_proto->net_type == NET_IP4); + uint base = (ipv4 ? 64 : 0) + (cf->af_mc ? 32 : 0); + + /* RFC 5838 - OSPFv3-AF */ + if (cf->af_ext) + { + /* RFC 5838 2.1 - instance IDs based on AFs */ + WALK_LIST(ac, cf->area_list) + WALK_LIST(ic, ac->patt_list) + { + if (!ic->instance_id_set) + ic->instance_id = base; + else if (ic->instance_id >= 128) + log(L_WARN "Instance ID %d from unassigned/private range", ic->instance_id); + else if ((ic->instance_id < base) || (ic->instance_id >= (base + 32))) + cf_error("Instance ID %d invalid for given channel type", ic->instance_id); + } + + /* RFC 5838 2.8 - vlinks limited to IPv6 unicast */ + if ((ipv4 || cf->af_mc) && !EMPTY_LIST(cf->vlink_list)) + cf_error("Vlinks not supported in AFs other than IPv6 unicast"); + } + else + { + if (ipv4 || cf->af_mc) + cf_error("Different channel type"); + } + } if (EMPTY_LIST(cf->area_list)) - cf_error( "No configured areas in OSPF"); + cf_error("No configured areas in OSPF"); int areano = 0; int backbone = 0; int nssa = 0; - struct ospf_area_config *ac; WALK_LIST(ac, cf->area_list) { areano++; @@ -98,7 +150,7 @@ ospf_proto_finish(void) cf->abr = areano > 1; /* Route export or NSSA translation (RFC 3101 3.1) */ - cf->asbr = (this_proto->out_filter != FILTER_REJECT) || (nssa && cf->abr); + cf->asbr = (proto_cf_main_channel(this_proto)->out_filter != FILTER_REJECT) || (nssa && cf->abr); if (cf->abr && !backbone) { @@ -122,7 +174,7 @@ static inline void ospf_check_defcost(int cost) { if ((cost <= 0) || (cost >= LSINFINITY)) - cf_error("Default cost must be in range 1-%d", LSINFINITY-1); + cf_error("Default cost must be in range 1-%u", LSINFINITY-1); } static inline void @@ -135,8 +187,8 @@ ospf_check_auth(void) CF_DECLS -CF_KEYWORDS(OSPF, AREA, OSPF_METRIC1, OSPF_METRIC2, OSPF_TAG, OSPF_ROUTER_ID) -CF_KEYWORDS(NEIGHBORS, RFC1583COMPAT, STUB, TICK, COST, COST2, RETRANSMIT) +CF_KEYWORDS(OSPF, V2, V3, OSPF_METRIC1, OSPF_METRIC2, OSPF_TAG, OSPF_ROUTER_ID) +CF_KEYWORDS(AREA, NEIGHBORS, RFC1583COMPAT, STUB, TICK, COST, COST2, RETRANSMIT) CF_KEYWORDS(HELLO, TRANSMIT, PRIORITY, DEAD, TYPE, BROADCAST, BCAST, DEFAULT) CF_KEYWORDS(NONBROADCAST, NBMA, POINTOPOINT, PTP, POINTOMULTIPOINT, PTMP) CF_KEYWORDS(NONE, SIMPLE, AUTHENTICATION, STRICT, CRYPTOGRAPHIC, TTL, SECURITY) @@ -144,39 +196,67 @@ CF_KEYWORDS(ELIGIBLE, POLL, NETWORKS, HIDDEN, VIRTUAL, CHECK, LINK, ONLY, BFD) CF_KEYWORDS(RX, BUFFER, LARGE, NORMAL, STUBNET, HIDDEN, SUMMARY, TAG, EXTERNAL) CF_KEYWORDS(WAIT, DELAY, LSADB, ECMP, LIMIT, WEIGHT, NSSA, TRANSLATOR, STABILITY) CF_KEYWORDS(GLOBAL, LSID, ROUTER, SELF, INSTANCE, REAL, NETMASK, TX, PRIORITY, 
LENGTH) -CF_KEYWORDS(SECONDARY, MERGE, LSA, SUPPRESSION) +CF_KEYWORDS(SECONDARY, MERGE, LSA, SUPPRESSION, MULTICAST, RFC5838) -%type <t> opttext %type <ld> lsadb_args -%type <i> nbma_eligible +%type <i> ospf_variant ospf_af_mc nbma_eligible +%type <cc> ospf_channel_start ospf_channel CF_GRAMMAR CF_ADDTO(proto, ospf_proto '}' { ospf_proto_finish(); } ) -ospf_proto_start: proto_start OSPF { - this_proto = proto_config_new(&proto_ospf, $1); - init_list(&OSPF_CFG->area_list); - init_list(&OSPF_CFG->vlink_list); - OSPF_CFG->tick = OSPF_DEFAULT_TICK; - OSPF_CFG->ospf2 = OSPF_IS_V2; - } +ospf_variant: + OSPF { $$ = 1; } + | OSPF V2 { $$ = 1; } + | OSPF V3 { $$ = 0; } ; +ospf_proto_start: proto_start ospf_variant +{ + this_proto = proto_config_new(&proto_ospf, $1); + this_proto->net_type = $2 ? NET_IP4 : 0; + + init_list(&OSPF_CFG->area_list); + init_list(&OSPF_CFG->vlink_list); + OSPF_CFG->tick = OSPF_DEFAULT_TICK; + OSPF_CFG->ospf2 = $2; + OSPF_CFG->af_ext = !$2; +}; + ospf_proto: ospf_proto_start proto_name '{' | ospf_proto ospf_proto_item ';' ; +ospf_af_mc: + { $$ = 0; } + | MULTICAST { $$ = 1; } + ; + +/* We redefine proto_channel to add multicast flag */ +ospf_channel_start: net_type ospf_af_mc +{ + $$ = this_channel = channel_config_new(NULL, $1, this_proto); + + /* Save the multicast flag */ + if (this_channel == proto_cf_main_channel(this_proto)) + OSPF_CFG->af_mc = $2; +}; + +ospf_channel: ospf_channel_start channel_opt_list channel_end; + ospf_proto_item: proto_item + | ospf_channel { this_proto->net_type = $1->net_type; } | RFC1583COMPAT bool { OSPF_CFG->rfc1583 = $2; } + | RFC5838 bool { OSPF_CFG->af_ext = $2; if (!ospf_cfg_is_v3()) cf_error("RFC5838 option requires OSPFv3"); } | STUB ROUTER bool { OSPF_CFG->stub_router = $3; } | ECMP bool { OSPF_CFG->ecmp = $2 ? OSPF_DEFAULT_ECMP_LIMIT : 0; } - | ECMP bool LIMIT expr { OSPF_CFG->ecmp = $2 ? $4 : 0; if ($4 < 0) cf_error("ECMP limit cannot be negative"); } + | ECMP bool LIMIT expr { OSPF_CFG->ecmp = $2 ? 
$4 : 0; } | MERGE EXTERNAL bool { OSPF_CFG->merge_external = $3; } - | TICK expr { OSPF_CFG->tick = $2; if($2<=0) cf_error("Tick must be greater than zero"); } - | INSTANCE ID expr { OSPF_CFG->instance_id = $3; if (($3<0) || ($3>255)) cf_error("Instance ID must be in range 0-255"); } + | TICK expr { OSPF_CFG->tick = $2; if($2 <= 0) cf_error("Tick must be greater than zero"); } + | INSTANCE ID expr { OSPF_CFG->instance_id = $3; OSPF_CFG->instance_id_set = 1; if ($3 > 255) cf_error("Instance ID must be in range 0-255"); } | ospf_area ; @@ -226,10 +306,10 @@ ospf_stubnet: ; ospf_stubnet_start: - prefix { + net_ip { this_stubnet = cfg_allocz(sizeof(struct ospf_stubnet_config)); add_tail(&this_area->stubnet_list, NODE this_stubnet); - this_stubnet->px = $1; + this_stubnet->prefix = $1; this_stubnet->cost = COST_D; } ; @@ -281,7 +361,6 @@ ospf_vlink_start: VIRTUAL LINK idval OSPF_PATT->inftransdelay = INFTRANSDELAY_D; OSPF_PATT->deadc = DEADC_D; OSPF_PATT->type = OSPF_IT_VLINK; - OSPF_PATT->instance_id = OSPF_CFG->instance_id; init_list(&OSPF_PATT->nbma_list); reset_passwords(); } @@ -306,7 +385,7 @@ ospf_iface_item: | REAL BROADCAST bool { OSPF_PATT->real_bcast = $3; if (!ospf_cfg_is_v2()) cf_error("Real broadcast option requires OSPFv2"); } | PTP NETMASK bool { OSPF_PATT->ptp_netmask = $3; if (!ospf_cfg_is_v2()) cf_error("PtP netmask option requires OSPFv2"); } | TRANSMIT DELAY expr { OSPF_PATT->inftransdelay = $3 ; if (($3<=0) || ($3>65535)) cf_error("Transmit delay must be in range 1-65535"); } - | PRIORITY expr { OSPF_PATT->priority = $2 ; if (($2<0) || ($2>255)) cf_error("Priority must be in range 0-255"); } + | PRIORITY expr { OSPF_PATT->priority = $2 ; if ($2>255) cf_error("Priority must be in range 0-255"); } | STRICT NONBROADCAST bool { OSPF_PATT->strictnbma = $3 ; } | STUB bool { OSPF_PATT->stub = $2 ; } | CHECK LINK bool { OSPF_PATT->check_link = $3; } @@ -325,7 +404,6 @@ ospf_iface_item: | TTL SECURITY bool { OSPF_PATT->ttl_security = $3; } | TTL SECURITY TX ONLY { OSPF_PATT->ttl_security = 2; } | BFD bool { OSPF_PATT->bfd = $2; cf_check_bfd($2); } - | SECONDARY bool { OSPF_PATT->bsd_secondary = $2; } | password_list { ospf_check_auth(); } ; @@ -336,12 +414,11 @@ pref_list: pref_item: pref_base pref_opt ';' ; -pref_base: prefix +pref_base: net_ip { this_pref = cfg_allocz(sizeof(struct area_net_config)); add_tail(this_nets, NODE this_pref); - this_pref->px.addr = $1.addr; - this_pref->px.len = $1.len; + this_pref->prefix = $1; } ; @@ -383,7 +460,6 @@ ospf_iface_start: OSPF_PATT->priority = PRIORITY_D; OSPF_PATT->deadc = DEADC_D; OSPF_PATT->type = OSPF_IT_UNDEF; - OSPF_PATT->instance_id = OSPF_CFG->instance_id; init_list(&OSPF_PATT->nbma_list); OSPF_PATT->ptp_netmask = 2; /* not specified */ OSPF_PATT->tx_tos = IP_PREC_INTERNET_CONTROL; @@ -394,7 +470,7 @@ ospf_iface_start: ospf_instance_id: /* empty */ - | INSTANCE expr { OSPF_PATT->instance_id = $2; if (($2<0) || ($2>255)) cf_error("Instance ID must be in range 0-255"); } + | INSTANCE expr { OSPF_PATT->instance_id = $2; OSPF_PATT->instance_id_set = 1; if ($2 > 255) cf_error("Instance ID must be in range 0-255"); } ; ospf_iface_patt_list: @@ -415,11 +491,6 @@ ospf_iface: ospf_iface_start ospf_iface_patt_list ospf_iface_opt_list { ospf_iface_finish(); } ; -opttext: - TEXT - | /* empty */ { $$ = NULL; } - ; - CF_ADDTO(dynamic_attr, OSPF_METRIC1 { $$ = f_new_dynamic_attr(EAF_TYPE_INT | EAF_TEMP, T_INT, EA_OSPF_METRIC1); }) CF_ADDTO(dynamic_attr, OSPF_METRIC2 { $$ = f_new_dynamic_attr(EAF_TYPE_INT | EAF_TEMP, T_INT, EA_OSPF_METRIC2); }) 
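/*
 * Illustrative sketch (not part of the patch) of the RFC 5838 instance ID
 * windows enforced by ospf_proto_finish() above: each OSPFv3 address family
 * owns a block of 32 IDs (0 for IPv6 unicast, 32 for IPv6 multicast, 64 for
 * IPv4 unicast, 96 for IPv4 multicast), while IDs 128-255 come from the
 * unassigned/private range and only trigger a warning. Helper names below
 * are illustrative only.
 */
static inline unsigned ospf3_af_iid_base(int ipv4, int multicast)
{
  return (ipv4 ? 64 : 0) + (multicast ? 32 : 0);
}

static inline int ospf3_af_iid_acceptable(unsigned iid, int ipv4, int multicast)
{
  unsigned base = ospf3_af_iid_base(ipv4, multicast);

  if (iid >= 128)
    return 1;	/* unassigned/private range, warned about but accepted */

  return (iid >= base) && (iid < base + 32);
}

/*
 * Example: instance ID 70 is acceptable for an IPv4 unicast channel
 * (window 64-95) but would be rejected for IPv6 unicast (window 0-31).
 */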
CF_ADDTO(dynamic_attr, OSPF_TAG { $$ = f_new_dynamic_attr(EAF_TYPE_INT | EAF_TEMP, T_INT, EA_OSPF_TAG); }) diff --git a/proto/ospf/dbdes.c b/proto/ospf/dbdes.c index d6904343..f211935f 100644 --- a/proto/ospf/dbdes.c +++ b/proto/ospf/dbdes.c @@ -39,7 +39,7 @@ struct ospf_dbdes3_packet static inline uint -ospf_dbdes_hdrlen(struct ospf_proto *p UNUSED4 UNUSED6) +ospf_dbdes_hdrlen(struct ospf_proto *p) { return ospf_is_v2(p) ? sizeof(struct ospf_dbdes2_packet) : sizeof(struct ospf_dbdes3_packet); @@ -356,7 +356,7 @@ ospf_receive_dbdes(struct ospf_packet *pkt, struct ospf_iface *ifa, LOG_PKT_WARN("MTU mismatch with nbr %R on %s (remote %d, local %d)", n->rid, ifa->ifname, rcv_iface_mtu, ifa->iface->mtu); - if ((rcv_imms == DBDES_IMMS) && + if (((rcv_imms & DBDES_IMMS) == DBDES_IMMS) && (n->rid > p->router_id) && (plen == ospf_dbdes_hdrlen(p))) { @@ -428,7 +428,7 @@ ospf_receive_dbdes(struct ospf_packet *pkt, struct ospf_iface *ifa, } ospf_send_dbdes(p, n); - tm_start(n->dbdes_timer, n->ifa->rxmtint); + tm_start(n->dbdes_timer, n->ifa->rxmtint S); } else { diff --git a/proto/ospf/hello.c b/proto/ospf/hello.c index e00487dc..e706ea0f 100644 --- a/proto/ospf/hello.c +++ b/proto/ospf/hello.c @@ -32,10 +32,7 @@ struct ospf_hello3_packet struct ospf_packet hdr; u32 iface_id; - u8 priority; - u8 options3; - u8 options2; - u8 options; + u32 options; u16 helloint; u16 deadint; u32 dr; @@ -74,7 +71,7 @@ ospf_send_hello(struct ospf_iface *ifa, int kind, struct ospf_neighbor *dirn) ((ifa->type == OSPF_IT_PTP) && !ifa->ptp_netmask)) ps->netmask = 0; else - ps->netmask = htonl(u32_mkmask(ifa->addr->pxlen)); + ps->netmask = htonl(u32_mkmask(ifa->addr->prefix.pxlen)); ps->helloint = ntohs(ifa->helloint); ps->options = ifa->oa->options; @@ -91,10 +88,7 @@ ospf_send_hello(struct ospf_iface *ifa, int kind, struct ospf_neighbor *dirn) struct ospf_hello3_packet *ps = (void *) pkt; ps->iface_id = htonl(ifa->iface_id); - ps->priority = ifa->priority; - ps->options3 = ifa->oa->options >> 16; - ps->options2 = ifa->oa->options >> 8; - ps->options = ifa->oa->options; + ps->options = ntohl(ifa->oa->options | (ifa->priority << 24)); ps->helloint = ntohs(ifa->helloint); ps->deadint = htons(ifa->deadint); ps->dr = htonl(ifa->drid); @@ -190,7 +184,8 @@ ospf_receive_hello(struct ospf_packet *pkt, struct ospf_iface *ifa, struct ospf_proto *p = ifa->oa->po; const char *err_dsc = NULL; u32 rcv_iface_id, rcv_helloint, rcv_deadint, rcv_dr, rcv_bdr; - u8 rcv_options, rcv_priority; + uint rcv_options, rcv_priority; + uint loc_options = ifa->oa->options; u32 *neighbors; u32 neigh_count; uint plen, i, err_val = 0; @@ -198,7 +193,7 @@ ospf_receive_hello(struct ospf_packet *pkt, struct ospf_iface *ifa, /* RFC 2328 10.5 */ /* - * We may not yet havethe associate neighbor, so we use Router ID from the + * We may not yet have the associate neighbor, so we use Router ID from the * packet instead of one from the neighbor structure for log messages. 
*/ u32 rcv_rid = ntohl(pkt->routerid); @@ -227,7 +222,7 @@ ospf_receive_hello(struct ospf_packet *pkt, struct ospf_iface *ifa, if ((ifa->type != OSPF_IT_VLINK) && (ifa->type != OSPF_IT_PTP) && - ((uint) pxlen != ifa->addr->pxlen)) + ((uint) pxlen != ifa->addr->prefix.pxlen)) DROP("prefix length mismatch", pxlen); neighbors = ps->neighbors; @@ -245,8 +240,8 @@ ospf_receive_hello(struct ospf_packet *pkt, struct ospf_iface *ifa, rcv_deadint = ntohs(ps->deadint); rcv_dr = ntohl(ps->dr); rcv_bdr = ntohl(ps->bdr); - rcv_options = ps->options; - rcv_priority = ps->priority; + rcv_options = ntohl(ps->options) & 0x00FFFFFF; + rcv_priority = ntohl(ps->options) >> 24; neighbors = ps->neighbors; neigh_count = (plen - sizeof(struct ospf_hello3_packet)) / sizeof(u32); @@ -259,9 +254,13 @@ ospf_receive_hello(struct ospf_packet *pkt, struct ospf_iface *ifa, DROP("dead interval mismatch", rcv_deadint); /* Check whether bits E, N match */ - if ((rcv_options ^ ifa->oa->options) & (OPT_E | OPT_N)) + if ((rcv_options ^ loc_options) & (OPT_E | OPT_N)) DROP("area type mismatch", rcv_options); + /* RFC 5838 2.4 - AF-bit check unless on IPv6 unicast */ + if ((loc_options & OPT_AF) && !(loc_options & OPT_V6) && !(rcv_options & OPT_AF)) + DROP("AF-bit mismatch", rcv_options); + /* Check consistency of existing neighbor entry */ if (n) { diff --git a/proto/ospf/iface.c b/proto/ospf/iface.c index 280fa4c1..29d21a07 100644 --- a/proto/ospf/iface.c +++ b/proto/ospf/iface.c @@ -55,7 +55,9 @@ ifa_tx_length(struct ospf_iface *ifa) static inline uint ifa_tx_hdrlen(struct ospf_iface *ifa) { - uint hlen = SIZE_OF_IP_HEADER; + struct ospf_proto *p = ifa->oa->po; + + uint hlen = ospf_is_v2(p) ? IP4_HEADER_LENGTH : IP6_HEADER_LENGTH; /* Relevant just for OSPFv2 */ if (ifa->autype == OSPF_AUTH_CRYPT) @@ -115,6 +117,7 @@ ospf_sk_open(struct ospf_iface *ifa) sock *sk = sk_new(ifa->pool); sk->type = SK_IP; + sk->subtype = ospf_is_v2(p) ? SK_IPV4 : SK_IPV6; sk->dport = OSPF_PROTO; sk->saddr = ifa->addr->ip; sk->iface = ifa->iface; @@ -199,6 +202,7 @@ ospf_open_vlink_sk(struct ospf_proto *p) { sock *sk = sk_new(p->p.pool); sk->type = SK_IP; + sk->subtype = ospf_is_v2(p) ? SK_IPV4 : SK_IPV6; sk->dport = OSPF_PROTO; /* FIXME: configurable tos/priority ? */ @@ -244,8 +248,8 @@ ospf_iface_down(struct ospf_iface *ifa) OSPF_TRACE(D_EVENTS, "Removing interface %s (peer %I) from area %R", ifa->ifname, ifa->addr->opposite, ifa->oa->areaid); else - OSPF_TRACE(D_EVENTS, "Removing interface %s (%I/%d) from area %R", - ifa->ifname, ifa->addr->prefix, ifa->addr->pxlen, ifa->oa->areaid); + OSPF_TRACE(D_EVENTS, "Removing interface %s (%N) from area %R", + ifa->ifname, &ifa->addr->prefix, ifa->oa->areaid); /* First of all kill all the related vlinks */ WALK_LIST(iff, p->iface_list) @@ -392,15 +396,15 @@ ospf_iface_sm(struct ospf_iface *ifa, int event) { ospf_iface_chstate(ifa, OSPF_IS_WAITING); if (ifa->wait_timer) - tm_start(ifa->wait_timer, ifa->waitint); + tm_start(ifa->wait_timer, ifa->waitint S); } } if (ifa->hello_timer) - tm_start(ifa->hello_timer, ifa->helloint); + tm_start(ifa->hello_timer, ifa->helloint S); if (ifa->poll_timer) - tm_start(ifa->poll_timer, ifa->pollint); + tm_start(ifa->poll_timer, ifa->pollint S); ospf_send_hello(ifa, OHS_HELLO, NULL); } @@ -490,13 +494,13 @@ ospf_iface_add(struct object_lock *lock) if (! 
ifa->stub) { - ifa->hello_timer = tm_new_set(ifa->pool, hello_timer_hook, ifa, 0, ifa->helloint); + ifa->hello_timer = tm_new_init(ifa->pool, hello_timer_hook, ifa, ifa->helloint S, 0); if (ifa->type == OSPF_IT_NBMA) - ifa->poll_timer = tm_new_set(ifa->pool, poll_timer_hook, ifa, 0, ifa->pollint); + ifa->poll_timer = tm_new_init(ifa->pool, poll_timer_hook, ifa, ifa->pollint S, 0); if ((ifa->type == OSPF_IT_BCAST) || (ifa->type == OSPF_IT_NBMA)) - ifa->wait_timer = tm_new_set(ifa->pool, wait_timer_hook, ifa, 0, 0); + ifa->wait_timer = tm_new_init(ifa->pool, wait_timer_hook, ifa, 0, 0); ifa->flood_queue_size = ifa_flood_queue_size(ifa); ifa->flood_queue = mb_allocz(ifa->pool, ifa->flood_queue_size * sizeof(void *)); @@ -530,15 +534,6 @@ ospf_iface_stubby(struct ospf_iface_patt *ip, struct ifa *addr) if (addr->iface->flags & IF_LOOPBACK) return 1; - /* - * For compatibility reasons on BSD systems, we force OSPF - * interfaces with non-primary IP prefixes to be stub. - */ -#if defined(OSPFv2) && !defined(CONFIG_MC_PROPER_SRC) - if (!ip->bsd_secondary && !(addr->flags & IA_PRIMARY)) - return 1; -#endif - return ip->stub; } @@ -557,8 +552,8 @@ ospf_iface_new(struct ospf_area *oa, struct ifa *addr, struct ospf_iface_patt *i OSPF_TRACE(D_EVENTS, "Adding interface %s (peer %I) to area %R", iface->name, addr->opposite, oa->areaid); else - OSPF_TRACE(D_EVENTS, "Adding interface %s (%I/%d) to area %R", - iface->name, addr->prefix, addr->pxlen, oa->areaid); + OSPF_TRACE(D_EVENTS, "Adding interface %s (%N) to area %R", + iface->name, &addr->prefix, oa->areaid); pool = rp_new(p->p.pool, "OSPF Interface"); ifa = mb_allocz(pool, sizeof(struct ospf_iface)); @@ -596,6 +591,7 @@ ospf_iface_new(struct ospf_area *oa, struct ifa *addr, struct ospf_iface_patt *i if (ip->ptp_netmask < 2) ifa->ptp_netmask = ip->ptp_netmask; + ifa->drip = ifa->bdrip = ospf_is_v2(p) ? IPA_NONE4 : IPA_NONE6; ifa->type = ospf_iface_classify(ip->type, addr); @@ -635,7 +631,7 @@ ospf_iface_new(struct ospf_area *oa, struct ifa *addr, struct ospf_iface_patt *i should be used). Because OSPFv3 iface is not subnet-specific, there is no need for ipa_in_net() check */ - if (ospf_is_v2(p) && !ipa_in_net(nb->ip, addr->prefix, addr->pxlen)) + if (ospf_is_v2(p) && !ipa_in_netX(nb->ip, &addr->prefix)) continue; if (ospf_is_v3(p) && !ipa_is_link_local(nb->ip)) @@ -648,7 +644,7 @@ ospf_iface_new(struct ospf_area *oa, struct ifa *addr, struct ospf_iface_patt *i add_tail(&oa->po->iface_list, NODE ifa); struct object_lock *lock = olock_new(pool); - lock->addr = ospf_is_v2(p) ? ifa->addr->prefix : IPA_NONE; + lock->addr = ospf_is_v2(p) ? 
ipa_from_ip4(net4_prefix(&ifa->addr->prefix)) : IPA_NONE; lock->type = OBJLOCK_IP; lock->port = OSPF_PROTO; lock->inst = ifa->instance_id; @@ -707,7 +703,7 @@ ospf_iface_new_vlink(struct ospf_proto *p, struct ospf_iface_patt *ip) add_tail(&p->iface_list, NODE ifa); - ifa->hello_timer = tm_new_set(ifa->pool, hello_timer_hook, ifa, 0, ifa->helloint); + ifa->hello_timer = tm_new_init(ifa->pool, hello_timer_hook, ifa, ifa->helloint S, 0); ifa->flood_queue_size = ifa_flood_queue_size(ifa); ifa->flood_queue = mb_allocz(ifa->pool, ifa->flood_queue_size * sizeof(void *)); @@ -719,10 +715,10 @@ ospf_iface_change_timer(timer *tm, uint val) if (!tm) return; - tm->recurrent = val; + tm->recurrent = val S; - if (tm->expires) - tm_start(tm, val); + if (tm_active(tm)) + tm_start(tm, val S); } static inline void @@ -805,8 +801,8 @@ ospf_iface_reconfigure(struct ospf_iface *ifa, struct ospf_iface_patt *new) ifname, ifa->waitint, new->waitint); ifa->waitint = new->waitint; - if (ifa->wait_timer && ifa->wait_timer->expires) - tm_start(ifa->wait_timer, ifa->waitint); + if (ifa->wait_timer && tm_active(ifa->wait_timer)) + tm_start(ifa->wait_timer, ifa->waitint S); } /* DEAD TIMER */ @@ -898,7 +894,7 @@ ospf_iface_reconfigure(struct ospf_iface *ifa, struct ospf_iface_patt *new) WALK_LIST(nb, new->nbma_list) { /* See related note in ospf_iface_new() */ - if (ospf_is_v2(p) && !ipa_in_net(nb->ip, ifa->addr->prefix, ifa->addr->pxlen)) + if (ospf_is_v2(p) && !ipa_in_netX(nb->ip, &ifa->addr->prefix)) continue; if (ospf_is_v3(p) && !ipa_is_link_local(nb->ip)) @@ -1085,6 +1081,9 @@ ospf_ifa_notify2(struct proto *P, uint flags, struct ifa *a) { struct ospf_proto *p = (struct ospf_proto *) P; + if (a->prefix.type != NET_IP4) + return; + if (a->flags & IA_SECONDARY) return; @@ -1124,6 +1123,9 @@ ospf_ifa_notify3(struct proto *P, uint flags, struct ifa *a) other addresses are used for link-LSA. 
*/ if (a->scope == SCOPE_LINK) { + if (a->prefix.type != NET_IP6) + return; + if (flags & IF_CHANGE_UP) { struct ospf_mip_walk s = { .iface = a->iface }; @@ -1141,6 +1143,9 @@ ospf_ifa_notify3(struct proto *P, uint flags, struct ifa *a) } else { + if (a->prefix.type != ospf_get_af(p)) + return; + struct ospf_iface *ifa; WALK_LIST(ifa, p->iface_list) if (ifa->iface == a->iface) @@ -1166,6 +1171,9 @@ ospf_reconfigure_ifaces2(struct ospf_proto *p) WALK_LIST(a, iface->addrs) { + if (a->prefix.type != NET_IP4) + continue; + if (a->flags & IA_SECONDARY) continue; @@ -1184,8 +1192,8 @@ ospf_reconfigure_ifaces2(struct ospf_proto *p) continue; /* Hard restart */ - log(L_INFO "%s: Restarting interface %s (%I/%d) in area %R", - p->p.name, ifa->ifname, a->prefix, a->pxlen, s.oa->areaid); + log(L_INFO "%s: Restarting interface %s (%N) in area %R", + p->p.name, ifa->ifname, &a->prefix, s.oa->areaid); ospf_iface_shutdown(ifa); ospf_iface_remove(ifa); } @@ -1209,6 +1217,9 @@ ospf_reconfigure_ifaces3(struct ospf_proto *p) WALK_LIST(a, iface->addrs) { + if (a->prefix.type != NET_IP6) + continue; + if (a->flags & IA_SECONDARY) continue; @@ -1340,7 +1351,7 @@ ospf_iface_info(struct ospf_iface *ifa) else if (ifa->addr->flags & IA_PEER) cli_msg(-1015, "Interface %s (peer %I)", ifa->ifname, ifa->addr->opposite); else - cli_msg(-1015, "Interface %s (%I/%d)", ifa->ifname, ifa->addr->prefix, ifa->addr->pxlen); + cli_msg(-1015, "Interface %s (%N)", ifa->ifname, &ifa->addr->prefix); cli_msg(-1015, "\tType: %s%s", ospf_it[ifa->type], more); cli_msg(-1015, "\tArea: %R (%u)", ifa->oa->areaid, ifa->oa->areaid); diff --git a/proto/ospf/lsalib.c b/proto/ospf/lsalib.c index cb7b186a..fbfd8d29 100644 --- a/proto/ospf/lsalib.c +++ b/proto/ospf/lsalib.c @@ -280,21 +280,19 @@ lsa_walk_rt(struct ospf_lsa_rt_walk *rt) void -lsa_parse_sum_net(struct top_hash_entry *en, int ospf2, ip_addr *ip, int *pxlen, u8 *pxopts, u32 *metric) +lsa_parse_sum_net(struct top_hash_entry *en, int ospf2, int af, net_addr *net, u8 *pxopts, u32 *metric) { if (ospf2) { struct ospf_lsa_sum2 *ls = en->lsa_body; - *ip = ipa_from_u32(en->lsa.id & ls->netmask); - *pxlen = u32_masklen(ls->netmask); + net_fill_ip4(net, ip4_from_u32(en->lsa.id & ls->netmask), u32_masklen(ls->netmask)); *pxopts = 0; *metric = ls->metric & LSA_METRIC_MASK; } else { struct ospf_lsa_sum3_net *ls = en->lsa_body; - u16 rest; - lsa_get_ipv6_prefix(ls->prefix, ip, pxlen, pxopts, &rest); + ospf3_get_prefix(ls->prefix, af, net, pxopts, NULL); *metric = ls->metric & LSA_METRIC_MASK; } } @@ -319,13 +317,14 @@ lsa_parse_sum_rt(struct top_hash_entry *en, int ospf2, u32 *drid, u32 *metric, u } void -lsa_parse_ext(struct top_hash_entry *en, int ospf2, struct ospf_lsa_ext_local *rt) +lsa_parse_ext(struct top_hash_entry *en, int ospf2, int af, struct ospf_lsa_ext_local *rt) { if (ospf2) { struct ospf_lsa_ext2 *ext = en->lsa_body; - rt->ip = ipa_from_u32(en->lsa.id & ext->netmask); - rt->pxlen = u32_masklen(ext->netmask); + net_fill_ip4(&rt->net, + ip4_from_u32(en->lsa.id & ext->netmask), + u32_masklen(ext->netmask)); rt->pxopts = 0; rt->metric = ext->metric & LSA_METRIC_MASK; rt->ebit = ext->metric & LSA_EXT2_EBIT; @@ -339,14 +338,13 @@ lsa_parse_ext(struct top_hash_entry *en, int ospf2, struct ospf_lsa_ext_local *r else { struct ospf_lsa_ext3 *ext = en->lsa_body; - u16 rest; - u32 *buf = lsa_get_ipv6_prefix(ext->rest, &rt->ip, &rt->pxlen, &rt->pxopts, &rest); + u32 *buf = ospf3_get_prefix(ext->rest, af, &rt->net, &rt->pxopts, NULL); rt->metric = ext->metric & LSA_METRIC_MASK; rt->ebit = 
ext->metric & LSA_EXT3_EBIT; rt->fbit = ext->metric & LSA_EXT3_FBIT; if (rt->fbit) - buf = lsa_get_ipv6_addr(buf, &rt->fwaddr); + buf = ospf3_get_addr(buf, af, &rt->fwaddr); else rt->fwaddr = IPA_NONE; @@ -452,7 +450,7 @@ lsa_validate_sum3_net(struct ospf_lsa_header *lsa, struct ospf_lsa_sum3_net *bod return 0; u8 pxl = pxlen(body->prefix); - if (pxl > MAX_PREFIX_LENGTH) + if (pxl > IP6_MAX_PREFIX_LENGTH) return 0; if (lsa->length != (HDRLEN + sizeof(struct ospf_lsa_sum3_net) + @@ -491,11 +489,11 @@ lsa_validate_ext3(struct ospf_lsa_header *lsa, struct ospf_lsa_ext3 *body) return 0; u8 pxl = pxlen(body->rest); - if (pxl > MAX_PREFIX_LENGTH) + if (pxl > IP6_MAX_PREFIX_LENGTH) return 0; int len = IPV6_PREFIX_SPACE(pxl); - if (body->metric & LSA_EXT3_FBIT) // forwardinf address + if (body->metric & LSA_EXT3_FBIT) // forwarding address len += 16; if (body->metric & LSA_EXT3_TBIT) // route tag len += 4; @@ -520,7 +518,7 @@ lsa_validate_pxlist(struct ospf_lsa_header *lsa, u32 pxcount, uint offset, u8 *p return 0; u8 pxl = pxlen((u32 *) (pbuf + offset)); - if (pxl > MAX_PREFIX_LENGTH) + if (pxl > IP6_MAX_PREFIX_LENGTH) return 0; offset += IPV6_PREFIX_SPACE(pxl); @@ -554,8 +552,8 @@ lsa_validate_prefix(struct ospf_lsa_header *lsa, struct ospf_lsa_prefix *body) /** * lsa_validate - check whether given LSA is valid * @lsa: LSA header - * @lsa_type: one of %LSA_T_xxx - * @ospf2: %true means OSPF version 2, %false means OSPF version 3 + * @lsa_type: internal LSA type (%LSA_T_xxx) + * @ospf2: %true for OSPFv2, %false for OSPFv3 * @body: pointer to LSA body * * Checks internal structure of given LSA body (minimal length, diff --git a/proto/ospf/lsalib.h b/proto/ospf/lsalib.h index 638b3525..fca7faec 100644 --- a/proto/ospf/lsalib.h +++ b/proto/ospf/lsalib.h @@ -41,7 +41,7 @@ void lsa_get_type_domain_(u32 itype, struct ospf_iface *ifa, u32 *otype, u32 *do static inline void lsa_get_type_domain(struct ospf_lsa_header *lsa, struct ospf_iface *ifa, u32 *otype, u32 *domain) { lsa_get_type_domain_(lsa->type_raw, ifa, otype, domain); } -static inline u32 lsa_get_etype(struct ospf_lsa_header *h, struct ospf_proto *p UNUSED4 UNUSED6) +static inline u32 lsa_get_etype(struct ospf_lsa_header *h, struct ospf_proto *p) { return ospf_is_v2(p) ? 
(h->type_raw & LSA_T_V2_MASK) : h->type_raw; } @@ -55,9 +55,12 @@ u16 lsa_verify_checksum(const void *lsa_n, int lsa_len); int lsa_comp(struct ospf_lsa_header *l1, struct ospf_lsa_header *l2); void lsa_walk_rt_init(struct ospf_proto *po, struct top_hash_entry *act, struct ospf_lsa_rt_walk *rt); int lsa_walk_rt(struct ospf_lsa_rt_walk *rt); -void lsa_parse_sum_net(struct top_hash_entry *en, int ospf2, ip_addr *ip, int *pxlen, u8 *pxopts, u32 *metric); +void lsa_parse_sum_net(struct top_hash_entry *en, int ospf2, int af, net_addr *net, u8 *pxopts, u32 *metric); void lsa_parse_sum_rt(struct top_hash_entry *en, int ospf2, u32 *drid, u32 *metric, u32 *options); -void lsa_parse_ext(struct top_hash_entry *en, int ospf2, struct ospf_lsa_ext_local *rt); +void lsa_parse_ext(struct top_hash_entry *en, int ospf2, int af, struct ospf_lsa_ext_local *rt); int lsa_validate(struct ospf_lsa_header *lsa, u32 lsa_type, int ospf2, void *body); +static inline btime lsa_inst_age(struct top_hash_entry *en) +{ return current_time() - en->inst_time; } + #endif /* _BIRD_OSPF_LSALIB_H_ */ diff --git a/proto/ospf/lsupd.c b/proto/ospf/lsupd.c index 157d9628..a98c9098 100644 --- a/proto/ospf/lsupd.c +++ b/proto/ospf/lsupd.c @@ -137,7 +137,7 @@ ospf_lsa_lsrt_up(struct top_hash_entry *en, struct ospf_neighbor *n) ret->lsa_body = LSA_BODY_DUMMY; if (!tm_active(n->lsrt_timer)) - tm_start(n->lsrt_timer, n->ifa->rxmtint); + tm_start(n->lsrt_timer, n->ifa->rxmtint S); } void @@ -572,7 +572,7 @@ ospf_receive_lsupd(struct ospf_packet *pkt, struct ospf_iface *ifa, { /* 13. (5a) - enforce minimum time between updates for received LSAs */ /* We also use this to ratelimit reactions to received self-originated LSAs */ - if (en && ((now - en->inst_time) < MINLSARRIVAL)) + if (en && (lsa_inst_age(en) < MINLSARRIVAL)) { OSPF_TRACE(D_EVENTS, "Skipping LSA received in less that MinLSArrival"); continue; @@ -700,7 +700,7 @@ ospf_receive_lsupd(struct ospf_packet *pkt, struct ospf_iface *ifa, if (!EMPTY_SLIST(n->lsrql) && (n->lsrqi == SHEAD(n->lsrql))) { ospf_send_lsreq(p, n); - tm_start(n->lsrq_timer, n->ifa->rxmtint); + tm_start(n->lsrq_timer, n->ifa->rxmtint S); } return; diff --git a/proto/ospf/neighbor.c b/proto/ospf/neighbor.c index 9fe3c028..f2d3505e 100644 --- a/proto/ospf/neighbor.c +++ b/proto/ospf/neighbor.c @@ -94,11 +94,11 @@ ospf_neighbor_new(struct ospf_iface *ifa) init_list(&n->ackl[ACKL_DIRECT]); init_list(&n->ackl[ACKL_DELAY]); - n->inactim = tm_new_set(pool, inactivity_timer_hook, n, 0, 0); - n->dbdes_timer = tm_new_set(pool, dbdes_timer_hook, n, 0, ifa->rxmtint); - n->lsrq_timer = tm_new_set(pool, lsrq_timer_hook, n, 0, ifa->rxmtint); - n->lsrt_timer = tm_new_set(pool, lsrt_timer_hook, n, 0, ifa->rxmtint); - n->ackd_timer = tm_new_set(pool, ackd_timer_hook, n, 0, ifa->rxmtint / 2); + n->inactim = tm_new_init(pool, inactivity_timer_hook, n, 0, 0); + n->dbdes_timer = tm_new_init(pool, dbdes_timer_hook, n, ifa->rxmtint S, 0); + n->lsrq_timer = tm_new_init(pool, lsrq_timer_hook, n, ifa->rxmtint S, 0); + n->lsrt_timer = tm_new_init(pool, lsrt_timer_hook, n, ifa->rxmtint S, 0); + n->ackd_timer = tm_new_init(pool, ackd_timer_hook, n, ifa->rxmtint S / 2, 0); return (n); } @@ -186,7 +186,7 @@ ospf_neigh_chstate(struct ospf_neighbor *n, u8 state) n->myimms = DBDES_IMMS; tm_start(n->dbdes_timer, 0); - tm_start(n->ackd_timer, ifa->rxmtint / 2); + tm_start(n->ackd_timer, ifa->rxmtint S / 2); } if (state > NEIGHBOR_EXSTART) @@ -231,7 +231,7 @@ ospf_neigh_sm(struct ospf_neighbor *n, int event) ospf_neigh_chstate(n, NEIGHBOR_INIT); /* 
Restart inactivity timer */ - tm_start(n->inactim, n->ifa->deadint); + tm_start(n->inactim, n->ifa->deadint S); break; case INM_2WAYREC: @@ -359,7 +359,7 @@ can_do_adj(struct ospf_neighbor *n) } -static inline u32 neigh_get_id(struct ospf_proto *p UNUSED4 UNUSED6, struct ospf_neighbor *n) +static inline u32 neigh_get_id(struct ospf_proto *p, struct ospf_neighbor *n) { return ospf_is_v2(p) ? ipa_to_u32(n->ip) : n->rid; } static struct ospf_neighbor * @@ -507,13 +507,14 @@ ospf_dr_election(struct ospf_iface *ifa) u32 old_drid = ifa->drid; u32 old_bdrid = ifa->bdrid; + ip_addr none = ospf_is_v2(p) ? IPA_NONE4 : IPA_NONE6; ifa->drid = ndr ? ndr->rid : 0; - ifa->drip = ndr ? ndr->ip : IPA_NONE; + ifa->drip = ndr ? ndr->ip : none; ifa->dr_iface_id = ndr ? ndr->iface_id : 0; ifa->bdrid = nbdr ? nbdr->rid : 0; - ifa->bdrip = nbdr ? nbdr->ip : IPA_NONE; + ifa->bdrip = nbdr ? nbdr->ip : none; DBG("DR=%R, BDR=%R\n", ifa->drid, ifa->bdrid); @@ -650,20 +651,6 @@ ospf_sh_neigh_info(struct ospf_neighbor *n) { struct ospf_iface *ifa = n->ifa; char *pos = "PtP "; - char etime[6]; - int exp, sec, min; - - exp = n->inactim->expires - now; - sec = exp % 60; - min = exp / 60; - if (min > 59) - { - bsprintf(etime, "-Inf-"); - } - else - { - bsprintf(etime, "%02u:%02u", min, sec); - } if ((ifa->type == OSPF_IT_BCAST) || (ifa->type == OSPF_IT_NBMA)) { @@ -675,6 +662,7 @@ ospf_sh_neigh_info(struct ospf_neighbor *n) pos = "Other"; } - cli_msg(-1013, "%-1R\t%3u\t%s/%s\t%-5s\t%-10s %-1I", n->rid, n->priority, - ospf_ns_names[n->state], pos, etime, ifa->ifname, n->ip); + cli_msg(-1013, "%-1R\t%3u\t%s/%s\t%7t\t%-10s %-1I", + n->rid, n->priority, ospf_ns_names[n->state], pos, + tm_remains(n->inactim), ifa->ifname, n->ip); } diff --git a/proto/ospf/ospf.c b/proto/ospf/ospf.c index d5d5d354..3ebebdaa 100644 --- a/proto/ospf/ospf.c +++ b/proto/ospf/ospf.c @@ -92,8 +92,10 @@ * - RFC 2328 - main OSPFv2 standard * - RFC 5340 - main OSPFv3 standard * - RFC 3101 - OSPFv2 NSSA areas - * - RFC 6549 - OSPFv2 multi-instance extensions - * - RFC 6987 - OSPF stub router advertisement + * - RFC 5709 - OSPFv2 HMAC-SHA Cryptographic Authentication + * - RFC 5838 - OSPFv3 Support of Address Families + * - RFC 6549 - OSPFv2 Multi-Instance Extensions + * - RFC 6987 - OSPF Stub Router Advertisement */ #include <stdlib.h> @@ -102,18 +104,11 @@ static int ospf_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool *pool); static struct ea_list *ospf_make_tmp_attrs(struct rte *rt, struct linpool *pool); static void ospf_store_tmp_attrs(struct rte *rt, struct ea_list *attrs); -static int ospf_reload_routes(struct proto *P); +static void ospf_reload_routes(struct channel *C); static int ospf_rte_better(struct rte *new, struct rte *old); static int ospf_rte_same(struct rte *new, struct rte *old); static void ospf_disp(timer *timer); -static void -ospf_area_initfib(struct fib_node *fn) -{ - struct area_net *an = (struct area_net *) fn; - an->hidden = 0; - an->active = 0; -} static void add_area_nets(struct ospf_area *oa, struct ospf_area_config *ac) @@ -122,23 +117,35 @@ add_area_nets(struct ospf_area *oa, struct ospf_area_config *ac) struct area_net_config *anc; struct area_net *an; - fib_init(&oa->net_fib, p->p.pool, sizeof(struct area_net), 0, ospf_area_initfib); - fib_init(&oa->enet_fib, p->p.pool, sizeof(struct area_net), 0, ospf_area_initfib); + fib_init(&oa->net_fib, p->p.pool, ospf_get_af(p), + sizeof(struct area_net), OFFSETOF(struct area_net, fn), 0, NULL); + fib_init(&oa->enet_fib, p->p.pool, ospf_get_af(p), + 
sizeof(struct area_net), OFFSETOF(struct area_net, fn), 0, NULL); WALK_LIST(anc, ac->net_list) { - an = (struct area_net *) fib_get(&oa->net_fib, &anc->px.addr, anc->px.len); + an = fib_get(&oa->net_fib, &anc->prefix); an->hidden = anc->hidden; } WALK_LIST(anc, ac->enet_list) { - an = (struct area_net *) fib_get(&oa->enet_fib, &anc->px.addr, anc->px.len); + an = fib_get(&oa->enet_fib, &anc->prefix); an->hidden = anc->hidden; an->tag = anc->tag; } } +static inline uint +ospf_opts(struct ospf_proto *p) +{ + if (ospf_is_v2(p)) + return 0; + + return ((ospf_is_ip6(p) && !p->af_mc) ? OPT_V6 : 0) | + (!p->stub_router ? OPT_R : 0) | (p->af_ext ? OPT_AF : 0); +} + static void ospf_area_add(struct ospf_proto *p, struct ospf_area_config *ac) { @@ -154,16 +161,13 @@ ospf_area_add(struct ospf_proto *p, struct ospf_area_config *ac) oa->areaid = ac->areaid; oa->rt = NULL; oa->po = p; - fib_init(&oa->rtr, p->p.pool, sizeof(ort), 0, ospf_rt_initort); + fib_init(&oa->rtr, p->p.pool, NET_IP4, sizeof(ort), OFFSETOF(ort, fn), 0, NULL); add_area_nets(oa, ac); if (oa->areaid == 0) p->backbone = oa; - if (ospf_is_v2(p)) - oa->options = ac->type; - else - oa->options = ac->type | OPT_V6 | (p->stub_router ? 0 : OPT_R); + oa->options = ac->type | ospf_opts(p); ospf_notify_rt_lsa(oa); } @@ -229,21 +233,25 @@ ospf_start(struct proto *P) p->router_id = proto_get_router_id(P->cf); p->ospf2 = c->ospf2; + p->af_ext = c->af_ext; + p->af_mc = c->af_mc; p->rfc1583 = c->rfc1583; p->stub_router = c->stub_router; p->merge_external = c->merge_external; p->asbr = c->asbr; p->ecmp = c->ecmp; p->tick = c->tick; - p->disp_timer = tm_new_set(P->pool, ospf_disp, p, 0, p->tick); - tm_start(p->disp_timer, 1); + p->disp_timer = tm_new_init(P->pool, ospf_disp, p, p->tick S, 0); + tm_start(p->disp_timer, 100 MS); p->lsab_size = 256; p->lsab_used = 0; p->lsab = mb_alloc(P->pool, p->lsab_size); - p->nhpool = lp_new(P->pool, 12*sizeof(struct mpnh)); + p->nhpool = lp_new(P->pool, 12*sizeof(struct nexthop)); init_list(&(p->iface_list)); init_list(&(p->area_list)); - fib_init(&p->rtf, P->pool, sizeof(ort), 0, ospf_rt_initort); + fib_init(&p->rtf, P->pool, ospf_get_af(p), sizeof(ort), OFFSETOF(ort, fn), 0, NULL); + if (ospf_is_v3(p)) + idm_init(&p->idm, P->pool, 16); p->areano = 0; p->gr = ospf_top_new(p, P->pool); s_init_list(&(p->lsal)); @@ -299,15 +307,16 @@ ospf_dump(struct proto *P) } static struct proto * -ospf_init(struct proto_config *c) +ospf_init(struct proto_config *CF) { - struct ospf_config *oc = (struct ospf_config *) c; - struct proto *P = proto_new(c, sizeof(struct ospf_proto)); + struct ospf_config *cf = (struct ospf_config *) CF; + struct proto *P = proto_new(CF); + + P->main_channel = proto_add_channel(P, proto_cf_main_channel(CF)); - P->accept_ra_types = RA_OPTIMAL; P->rt_notify = ospf_rt_notify; P->if_notify = ospf_if_notify; - P->ifa_notify = oc->ospf2 ? ospf_ifa_notify2 : ospf_ifa_notify3; + P->ifa_notify = cf->ospf2 ? 
ospf_ifa_notify2 : ospf_ifa_notify3; P->import_control = ospf_import_control; P->reload_routes = ospf_reload_routes; P->make_tmp_attrs = ospf_make_tmp_attrs; @@ -391,17 +400,16 @@ ospf_schedule_rtcalc(struct ospf_proto *p) p->calcrt = 1; } -static int -ospf_reload_routes(struct proto *P) +static void +ospf_reload_routes(struct channel *C) { - struct ospf_proto *p = (struct ospf_proto *) P; + struct ospf_proto *p = (struct ospf_proto *) C->proto; - if (p->calcrt != 2) - OSPF_TRACE(D_EVENTS, "Scheduling routing table calculation with route reload"); + if (p->calcrt == 2) + return; + OSPF_TRACE(D_EVENTS, "Scheduling routing table calculation with route reload"); p->calcrt = 2; - - return 1; } @@ -506,9 +514,9 @@ ospf_shutdown(struct proto *P) ospf_iface_shutdown(ifa); /* Cleanup locked rta entries */ - FIB_WALK(&p->rtf, nftmp) + FIB_WALK(&p->rtf, ort, nf) { - rta_free(((ort *) nftmp)->old_rta); + rta_free(nf->old_rta); } FIB_WALK_END; @@ -603,11 +611,7 @@ ospf_area_reconfigure(struct ospf_area *oa, struct ospf_area_config *nac) struct ospf_iface *ifa; oa->ac = nac; - - if (ospf_is_v2(p)) - oa->options = nac->type; - else - oa->options = nac->type | OPT_V6 | (p->stub_router ? 0 : OPT_R); + oa->options = nac->type | ospf_opts(p); if (nac->type != oac->type) { @@ -639,17 +643,20 @@ ospf_area_reconfigure(struct ospf_area *oa, struct ospf_area_config *nac) * nonbroadcast network, cost of interface, etc. */ static int -ospf_reconfigure(struct proto *P, struct proto_config *c) +ospf_reconfigure(struct proto *P, struct proto_config *CF) { struct ospf_proto *p = (struct ospf_proto *) P; struct ospf_config *old = (struct ospf_config *) (P->cf); - struct ospf_config *new = (struct ospf_config *) c; + struct ospf_config *new = (struct ospf_config *) CF; struct ospf_area_config *nac; struct ospf_area *oa, *oax; struct ospf_iface *ifa, *ifx; struct ospf_iface_patt *ip; - if (proto_get_router_id(c) != p->router_id) + if (proto_get_router_id(CF) != p->router_id) + return 0; + + if (p->ospf2 != new->ospf2) return 0; if (p->rfc1583 != new->rfc1583) @@ -658,13 +665,19 @@ ospf_reconfigure(struct proto *P, struct proto_config *c) if (old->abr != new->abr) return 0; + if ((p->af_ext != new->af_ext) || (p->af_mc != new->af_mc)) + return 0; + + if (!proto_configure_channel(P, &P->main_channel, proto_cf_main_channel(CF))) + return 0; + p->stub_router = new->stub_router; p->merge_external = new->merge_external; p->asbr = new->asbr; p->ecmp = new->ecmp; p->tick = new->tick; - p->disp_timer->recurrent = p->tick; - tm_start(p->disp_timer, 1); + p->disp_timer->recurrent = p->tick S; + tm_start(p->disp_timer, 100 MS); /* Mark all areas and ifaces */ WALK_LIST(oa, p->area_list) @@ -746,7 +759,6 @@ ospf_sh(struct proto *P) struct ospf_iface *ifa; struct ospf_neighbor *n; int ifano, nno, adjno, firstfib; - struct area_net *anet; if (p->p.proto_state != PS_UP) { @@ -795,29 +807,27 @@ ospf_sh(struct proto *P) cli_msg(-1014, "\t\tNumber of adjacent neighbors:\t%u", adjno); firstfib = 1; - FIB_WALK(&oa->net_fib, nftmp) + FIB_WALK(&oa->net_fib, struct area_net, anet) { - anet = (struct area_net *) nftmp; if(firstfib) { cli_msg(-1014, "\t\tArea networks:"); firstfib = 0; } - cli_msg(-1014, "\t\t\t%1I/%u\t%s\t%s", anet->fn.prefix, anet->fn.pxlen, + cli_msg(-1014, "\t\t\t%1N\t%s\t%s", anet->fn.addr, anet->hidden ? "Hidden" : "Advertise", anet->active ? 
"Active" : ""); } FIB_WALK_END; firstfib = 1; - FIB_WALK(&oa->enet_fib, nftmp) + FIB_WALK(&oa->enet_fib, struct area_net, anet) { - anet = (struct area_net *) nftmp; if(firstfib) { cli_msg(-1014, "\t\tArea external networks:"); firstfib = 0; } - cli_msg(-1014, "\t\t\t%1I/%u\t%s\t%s", anet->fn.prefix, anet->fn.pxlen, + cli_msg(-1014, "\t\t\t%1N\t%s\t%s", anet->fn.addr, anet->hidden ? "Hidden" : "Advertise", anet->active ? "Active" : ""); } FIB_WALK_END; @@ -1072,15 +1082,14 @@ show_lsa_network(struct top_hash_entry *he, int ospf2) } static inline void -show_lsa_sum_net(struct top_hash_entry *he, int ospf2) +show_lsa_sum_net(struct top_hash_entry *he, int ospf2, int af) { - ip_addr ip; - int pxlen; + net_addr net; u8 pxopts; u32 metric; - lsa_parse_sum_net(he, ospf2, &ip, &pxlen, &pxopts, &metric); - cli_msg(-1016, "\t\txnetwork %I/%d metric %u", ip, pxlen, metric); + lsa_parse_sum_net(he, ospf2, af, &net, &pxopts, &metric); + cli_msg(-1016, "\t\txnetwork %N metric %u", &net, metric); } static inline void @@ -1096,16 +1105,16 @@ show_lsa_sum_rt(struct top_hash_entry *he, int ospf2) static inline void -show_lsa_external(struct top_hash_entry *he, int ospf2) +show_lsa_external(struct top_hash_entry *he, int ospf2, int af) { struct ospf_lsa_ext_local rt; - char str_via[STD_ADDRESS_P_LENGTH + 8] = ""; + char str_via[IPA_MAX_TEXT_LENGTH + 8] = ""; char str_tag[16] = ""; if (he->lsa_type == LSA_T_EXT) he->domain = 0; /* Unmark the LSA */ - lsa_parse_ext(he, ospf2, &rt); + lsa_parse_ext(he, ospf2, af, &rt); if (rt.fbit) bsprintf(str_via, " via %I", rt.fwaddr); @@ -1113,19 +1122,15 @@ show_lsa_external(struct top_hash_entry *he, int ospf2) if (rt.tag) bsprintf(str_tag, " tag %08x", rt.tag); - cli_msg(-1016, "\t\t%s %I/%d metric%s %u%s%s", + cli_msg(-1016, "\t\t%s %N metric%s %u%s%s", (he->lsa_type == LSA_T_NSSA) ? "nssa-ext" : "external", - rt.ip, rt.pxlen, rt.ebit ? "2" : "", rt.metric, str_via, str_tag); + &rt.net, rt.ebit ? 
"2" : "", rt.metric, str_via, str_tag); } static inline void -show_lsa_prefix(struct top_hash_entry *he, struct top_hash_entry *cnode) +show_lsa_prefix(struct top_hash_entry *he, struct top_hash_entry *cnode, int af) { struct ospf_lsa_prefix *px = he->lsa_body; - ip_addr pxa; - int pxlen; - u8 pxopts; - u16 metric; u32 *buf; int i; @@ -1141,14 +1146,18 @@ show_lsa_prefix(struct top_hash_entry *he, struct top_hash_entry *cnode) buf = px->rest; for (i = 0; i < px->pxcount; i++) - { - buf = lsa_get_ipv6_prefix(buf, &pxa, &pxlen, &pxopts, &metric); + { + net_addr net; + u8 pxopts; + u16 metric; - if (px->ref_type == LSA_T_RT) - cli_msg(-1016, "\t\tstubnet %I/%d metric %u", pxa, pxlen, metric); - else - cli_msg(-1016, "\t\taddress %I/%d", pxa, pxlen); - } + buf = ospf3_get_prefix(buf, af, &net, &pxopts, &metric); + + if (px->ref_type == LSA_T_RT) + cli_msg(-1016, "\t\tstubnet %N metric %u", &net, metric); + else + cli_msg(-1016, "\t\taddress %N", &net); + } } void @@ -1156,6 +1165,7 @@ ospf_sh_state(struct proto *P, int verbose, int reachable) { struct ospf_proto *p = (struct ospf_proto *) P; int ospf2 = ospf_is_v2(p); + int af = ospf_get_af(p); uint i, ix, j1, jx; u32 last_area = 0xFFFFFFFF; @@ -1276,7 +1286,7 @@ ospf_sh_state(struct proto *P, int verbose, int reachable) case LSA_T_SUM_NET: if (cnode->lsa_type == LSA_T_RT) - show_lsa_sum_net(he, ospf2); + show_lsa_sum_net(he, ospf2, af); break; case LSA_T_SUM_RT: @@ -1286,11 +1296,11 @@ ospf_sh_state(struct proto *P, int verbose, int reachable) case LSA_T_EXT: case LSA_T_NSSA: - show_lsa_external(he, ospf2); + show_lsa_external(he, ospf2, af); break; case LSA_T_PREFIX: - show_lsa_prefix(he, cnode); + show_lsa_prefix(he, cnode, af); break; } @@ -1304,7 +1314,7 @@ ospf_sh_state(struct proto *P, int verbose, int reachable) ix++; while ((ix < jx) && (hex[ix]->lsa.rt == cnode->lsa.rt)) - show_lsa_external(hex[ix++], ospf2); + show_lsa_external(hex[ix++], ospf2, af); cnode = NULL; } @@ -1338,7 +1348,7 @@ ospf_sh_state(struct proto *P, int verbose, int reachable) last_rt = he->lsa.rt; } - show_lsa_external(he, ospf2); + show_lsa_external(he, ospf2, af); } } @@ -1468,6 +1478,8 @@ struct protocol proto_ospf = { .template = "ospf%d", .attr_class = EAP_OSPF, .preference = DEF_PREF_OSPF, + .channel_mask = NB_IP, + .proto_size = sizeof(struct ospf_proto), .config_size = sizeof(struct ospf_config), .init = ospf_init, .dump = ospf_dump, diff --git a/proto/ospf/ospf.h b/proto/ospf/ospf.h index 81c610d5..54eeb74c 100644 --- a/proto/ospf/ospf.h +++ b/proto/ospf/ospf.h @@ -14,7 +14,7 @@ #include "nest/bird.h" #include "lib/checksum.h" -#include "lib/ip.h" +#include "lib/idm.h" #include "lib/lists.h" #include "lib/slists.h" #include "lib/socket.h" @@ -37,14 +37,6 @@ #endif -#ifdef IPV6 -#define OSPF_IS_V2 0 -#else -#define OSPF_IS_V2 1 -#endif - -// FIXME: MAX_PREFIX_LENGTH - #define OSPF_TRACE(flags, msg, args...) \ do { if ((p->p.debug & flags) || OSPF_FORCE_DEBUG) \ log(L_TRACE "%s: " msg, p->p.name , ## args ); } while(0) @@ -66,16 +58,16 @@ log_rl(&p->log_lsa_tbf, L_REMOTE "%s: " msg, p->p.name, args) #define LOG_LSA2(msg, args...) \ - do { if (! p->log_lsa_tbf.mark) \ + do { if (! 
p->log_lsa_tbf.drop) \ log(L_REMOTE "%s: " msg, p->p.name, args); } while(0) #define OSPF_PROTO 89 -#define LSREFRESHTIME 1800 /* 30 minutes */ -#define MINLSINTERVAL 5 -#define MINLSARRIVAL 1 -#define LSINFINITY 0xffffff +#define LSREFRESHTIME 1800 /* 30 minutes */ +#define MINLSINTERVAL (5 S_) +#define MINLSARRIVAL (1 S_) +#define LSINFINITY 0xffffff #define OSPF_DEFAULT_TICK 1 #define OSPF_DEFAULT_STUB_COST 1000 @@ -87,16 +79,18 @@ #define OSPF_VLINK_ID_OFFSET 0x80000000 - struct ospf_config { struct proto_config c; uint tick; u8 ospf2; + u8 af_ext; + u8 af_mc; u8 rfc1583; u8 stub_router; u8 merge_external; u8 instance_id; + u8 instance_id_set; u8 abr; u8 asbr; int ecmp; @@ -125,24 +119,24 @@ struct ospf_area_config struct area_net_config { node n; - struct prefix px; + net_addr prefix; u32 tag; u8 hidden; }; struct area_net { - struct fib_node fn; u32 metric; /* With possible LSA_EXT3_EBIT for NSSA area nets */ u32 tag; u8 hidden; u8 active; + struct fib_node fn; }; struct ospf_stubnet_config { node n; - struct prefix px; + net_addr prefix; u32 cost; u8 hidden; u8 summary; @@ -177,9 +171,9 @@ struct ospf_iface_patt int tx_priority; u16 tx_length; u16 rx_buffer; - #define OSPF_RXBUF_MINSIZE 256 /* Minimal allowed size */ u8 instance_id; + u8 instance_id_set; u8 autype; /* OSPF_AUTH_*, not really used in OSPFv3 */ u8 strictnbma; u8 check_link; @@ -189,7 +183,6 @@ struct ospf_iface_patt u8 ptp_netmask; /* bool + 2 for unspecified */ u8 ttl_security; /* bool + 2 for TX only */ u8 bfd; - u8 bsd_secondary; list *passwords; }; @@ -220,12 +213,15 @@ struct ospf_proto int areano; /* Number of area I belong to */ int padj; /* Number of neighbors in Exchange or Loading state */ struct fib rtf; /* Routing table */ - byte ospf2; /* OSPF v2 or v3 */ - byte rfc1583; /* RFC1583 compatibility */ - byte stub_router; /* Do not forward transit traffic */ - byte merge_external; /* Should i merge external routes? */ - byte asbr; /* May i originate any ext/NSSA lsa? */ - byte ecmp; /* Maximal number of nexthops in ECMP route, or 0 */ + struct idm idm; /* OSPFv3 LSA ID map */ + u8 ospf2; /* OSPF v2 or v3 */ + u8 af_ext; /* OSPFv3-AF extension */ + u8 af_mc; /* OSPFv3-AF multicast */ + u8 rfc1583; /* RFC1583 compatibility */ + u8 stub_router; /* Do not forward transit traffic */ + u8 merge_external; /* Should i merge external routes? */ + u8 asbr; /* May i originate any ext/NSSA lsa? */ + u8 ecmp; /* Maximal number of nexthops in ECMP route, or 0 */ struct ospf_area *backbone; /* If exists */ event *flood_event; /* Event for flooding LS updates */ void *lsab; /* LSA buffer used when originating router LSAs */ @@ -273,10 +269,10 @@ struct ospf_iface sock *sk; /* IP socket */ list neigh_list; /* List of neighbors (struct ospf_neighbor) */ u32 cost; /* Cost of iface */ - u32 waitint; /* number of sec before changing state from wait */ - u32 rxmtint; /* number of seconds between LSA retransmissions */ - u32 pollint; /* Poll interval */ - u32 deadint; /* after "deadint" missing hellos is router dead */ + u32 waitint; /* Number of seconds before changing state from wait */ + u32 rxmtint; /* Number of seconds between LSA retransmissions */ + u32 pollint; /* Poll interval in seconds */ + u32 deadint; /* After deadint seconds without hellos is router dead */ u32 iface_id; /* Interface ID (iface->index or new value for vlinks) */ u32 vid; /* ID of peer of virtual link */ ip_addr vip; /* IP of peer of virtual link */ @@ -287,8 +283,8 @@ struct ospf_iface interface. 
LSAs contained in the update */ u16 helloint; /* number of seconds between hello sending */ list *passwords; - u32 csn; /* Last used crypt seq number */ - bird_clock_t csn_use; /* Last time when packet with that CSN was sent */ + u32 csn; /* Last used crypt seq number */ + btime csn_use; /* Last time when packet with that CSN was sent */ ip_addr all_routers; /* Multicast (or broadcast) address for all routers */ ip_addr des_routers; /* Multicast (or NULL) address for designated routers */ ip_addr drip; /* Designated router IP */ @@ -458,14 +454,15 @@ struct ospf_neighbor /* Generic option flags */ -#define OPT_V6 0x01 /* OSPFv3, LSA relevant for IPv6 routing calculation */ -#define OPT_E 0x02 /* Related to AS-external LSAs */ -#define OPT_MC 0x04 /* Related to MOSPF, not used and obsolete */ -#define OPT_N 0x08 /* Related to NSSA */ -#define OPT_P 0x08 /* OSPFv2, flags P and N share position, see NSSA RFC */ -#define OPT_EA 0x10 /* OSPFv2, external attributes, not used and obsolete */ -#define OPT_R 0x10 /* OSPFv3, originator is active router */ -#define OPT_DC 0x20 /* Related to demand circuits, not used */ +#define OPT_V6 0x0001 /* OSPFv3, LSA relevant for IPv6 routing calculation */ +#define OPT_E 0x0002 /* Related to AS-external LSAs */ +#define OPT_MC 0x0004 /* Related to MOSPF, not used and obsolete */ +#define OPT_N 0x0008 /* Related to NSSA */ +#define OPT_P 0x0008 /* OSPFv2, flags P and N share position, see NSSA RFC */ +#define OPT_EA 0x0010 /* OSPFv2, external attributes, not used and obsolete */ +#define OPT_R 0x0010 /* OSPFv3, originator is active router */ +#define OPT_DC 0x0020 /* Related to demand circuits, not used */ +#define OPT_AF 0x0100 /* OSPFv3 Address Families (RFC 5838) */ /* Router-LSA VEB flags are are stored together with links (OSPFv2) or options (OSPFv3) */ #define OPT_RT_B (0x01 << 24) @@ -682,8 +679,8 @@ struct ospf_lsa_ext3 struct ospf_lsa_ext_local { - ip_addr ip, fwaddr; - int pxlen; + net_addr net; + ip_addr fwaddr; u32 metric, ebit, fbit, tag, propagate; u8 pxopts; }; @@ -721,73 +718,102 @@ lsa_net_count(struct ospf_lsa_header *lsa) /* In ospf_area->rtr we store paths to routers, but we use RID (and not IP address) as index, so we need to encapsulate RID to IP address */ -#define ipa_from_rid(x) ipa_from_u32(x) -#define ipa_to_rid(x) ipa_to_u32(x) +#define net_from_rid(x) NET_ADDR_IP4(ip4_from_u32(x), IP4_MAX_PREFIX_LENGTH) +#define rid_from_net(x) ip4_to_u32(((net_addr_ip4 *) x)->prefix) #define IPV6_PREFIX_SPACE(x) ((((x) + 63) / 32) * 4) #define IPV6_PREFIX_WORDS(x) (((x) + 63) / 32) -/* FIXME: these four functions should be significantly redesigned w.r.t. integration, - also should be named as ospf3_* instead of *_ipv6_* */ + +static inline int +ospf_valid_prefix(net_addr *n) +{ + /* + * In OSPFv2, prefix is stored as netmask; ip4_masklen() returns 255 for + * invalid one. But OSPFv3-AF may receive IPv4 net with 32 < pxlen < 128. + */ + uint max = (n->type == NET_IP4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH; + return n->pxlen <= max; +} + +/* + * In OSPFv3-AF (RFC 5835), IPv4 address is encoded by just placing it in the + * first 32 bits of IPv6 address and setting remaining bits to zero. Likewise + * for IPv4 prefix, where remaining bits do not matter. 
We use following + * functions to convert between IPv4 and IPv4-in-IPv6 representations: + */ + +static inline ip4_addr ospf3_6to4(ip6_addr a) +{ return _MI4(_I0(a)); } + +static inline ip6_addr ospf3_4to6(ip4_addr a) +{ return _MI6(_I(a), 0, 0, 0); } + static inline u32 * -lsa_get_ipv6_prefix(u32 *buf, ip_addr *addr, int *pxlen, u8 *pxopts, u16 *rest) +ospf3_get_prefix(u32 *buf, int af, net_addr *n, u8 *pxopts, u16 *rest) { - u8 pxl = (*buf >> 24); - *pxopts = (*buf >> 16); - *rest = *buf; - *pxlen = pxl; + ip6_addr px = IP6_NONE; + uint pxlen = (*buf >> 24); + *pxopts = (*buf >> 16) & 0xff; + if (rest) *rest = *buf & 0xffff; buf++; - *addr = IPA_NONE; - -#ifdef IPV6 - if (pxl > 0) - _I0(*addr) = *buf++; - if (pxl > 32) - _I1(*addr) = *buf++; - if (pxl > 64) - _I2(*addr) = *buf++; - if (pxl > 96) - _I3(*addr) = *buf++; + if (pxlen > 0) + _I0(px) = *buf++; + if (pxlen > 32) + _I1(px) = *buf++; + if (pxlen > 64) + _I2(px) = *buf++; + if (pxlen > 96) + _I3(px) = *buf++; /* Clean up remaining bits */ - if (pxl < 128) - addr->addr[pxl / 32] &= u32_mkmask(pxl % 32); -#endif + if (pxlen < 128) + px.addr[pxlen / 32] &= u32_mkmask(pxlen % 32); + + if (af == NET_IP4) + net_fill_ip4(n, ospf3_6to4(px), pxlen); + else + net_fill_ip6(n, px, pxlen); return buf; } static inline u32 * -lsa_get_ipv6_addr(u32 *buf, ip_addr *addr) +ospf3_put_prefix(u32 *buf, net_addr *n, u8 pxopts, u16 rest) { - *addr = *(ip_addr *) buf; - return buf + 4; -} + ip6_addr px = (n->type == NET_IP4) ? ospf3_4to6(net4_prefix(n)) : net6_prefix(n); + uint pxlen = n->pxlen; -static inline u32 * -put_ipv6_prefix(u32 *buf, ip_addr addr UNUSED4, u8 pxlen UNUSED4, u8 pxopts UNUSED4, u16 lh UNUSED4) -{ -#ifdef IPV6 - *buf++ = ((pxlen << 24) | (pxopts << 16) | lh); + *buf++ = ((pxlen << 24) | (pxopts << 16) | rest); if (pxlen > 0) - *buf++ = _I0(addr); + *buf++ = _I0(px); if (pxlen > 32) - *buf++ = _I1(addr); + *buf++ = _I1(px); if (pxlen > 64) - *buf++ = _I2(addr); + *buf++ = _I2(px); if (pxlen > 96) - *buf++ = _I3(addr); -#endif + *buf++ = _I3(px); + return buf; } static inline u32 * -put_ipv6_addr(u32 *buf, ip_addr addr) +ospf3_get_addr(u32 *buf, int af, ip_addr *addr) { - *(ip_addr *) buf = addr; + ip6_addr a; + memcpy(&a, buf, 16); + *addr = (af == NET_IP4) ? ipa_from_ip4(ospf3_6to4(a)) : ipa_from_ip6(a); + return buf + 4; +} + +static inline u32 * +ospf3_put_addr(u32 *buf, ip_addr addr) +{ + ip6_addr a = ipa_is_ip4(addr) ? ospf3_4to6(ipa_to_ip4(addr)) : ipa_to_ip6(addr); + memcpy(buf, &a, 16); return buf + 4; } @@ -831,19 +857,24 @@ static inline void ospf_notify_net_lsa(struct ospf_iface *ifa) static inline void ospf_notify_link_lsa(struct ospf_iface *ifa) { ifa->update_link_lsa = 1; } - -#define ospf_is_v2(X) OSPF_IS_V2 -#define ospf_is_v3(X) (!OSPF_IS_V2) -/* static inline int ospf_is_v2(struct ospf_proto *p) { return p->ospf2; } static inline int ospf_is_v3(struct ospf_proto *p) { return ! p->ospf2; } -*/ -static inline int ospf_get_version(struct ospf_proto *p UNUSED4 UNUSED6) + +static inline int ospf_get_version(struct ospf_proto *p) { return ospf_is_v2(p) ? 
2 : 3; } +static inline int ospf_is_ip4(struct ospf_proto *p) +{ return p->p.net_type == NET_IP4; } + +static inline int ospf_is_ip6(struct ospf_proto *p) +{ return p->p.net_type == NET_IP6; } + +static inline int ospf_get_af(struct ospf_proto *p) +{ return p->p.net_type; } + struct ospf_area *ospf_find_area(struct ospf_proto *p, u32 aid); static inline struct ospf_area *ospf_main_area(struct ospf_proto *p) @@ -925,7 +956,7 @@ static inline void ospf_send_to_des(struct ospf_iface *ifa) #define SKIP(DSC) do { err_dsc = DSC; goto skip; } while(0) #endif -static inline uint ospf_pkt_hdrlen(struct ospf_proto *p UNUSED4 UNUSED6) +static inline uint ospf_pkt_hdrlen(struct ospf_proto *p) { return ospf_is_v2(p) ? (sizeof(struct ospf_packet) + sizeof(union ospf_auth)) : sizeof(struct ospf_packet); } static inline void * ospf_tx_buffer(struct ospf_iface *ifa) diff --git a/proto/ospf/packet.c b/proto/ospf/packet.c index 6b6a97a4..38d7a75f 100644 --- a/proto/ospf/packet.c +++ b/proto/ospf/packet.c @@ -77,16 +77,16 @@ ospf_pkt_finalize(struct ospf_iface *ifa, struct ospf_packet *pkt, uint *plen) reboot when system does not have independent RTC? */ if (!ifa->csn) { - ifa->csn = (u32) now; - ifa->csn_use = now; + ifa->csn = (u32) (current_real_time() TO_S); + ifa->csn_use = current_time(); } /* We must have sufficient delay between sending a packet and increasing CSN to prevent reordering of packets (in a network) with different CSNs */ - if ((now - ifa->csn_use) > 1) + if ((current_time() - ifa->csn_use) > 1 S) ifa->csn++; - ifa->csn_use = now; + ifa->csn_use = current_time(); uint auth_len = mac_type_length(pass->alg); byte *auth_tail = ((byte *) pkt + *plen); @@ -203,7 +203,7 @@ drop: /** * ospf_rx_hook * @sk: socket we received the packet. - * @len: size of the packet + * @len: length of the packet * * This is the entry point for messages from neighbors. 
Many checks (like * authentication, checksums, size) are done before the packet is passed to @@ -231,7 +231,7 @@ ospf_rx_hook(sock *sk, uint len) return 1; int src_local, dst_local, dst_mcast; - src_local = ipa_in_net(sk->faddr, ifa->addr->prefix, ifa->addr->pxlen); + src_local = ipa_in_netX(sk->faddr, &ifa->addr->prefix); dst_local = ipa_equal(sk->laddr, ifa->addr->ip); dst_mcast = ipa_equal(sk->laddr, ifa->all_routers) || ipa_equal(sk->laddr, ifa->des_routers); @@ -270,9 +270,6 @@ ospf_rx_hook(sock *sk, uint len) if (pkt == NULL) DROP("bad IP header", len); - if (ifa->check_ttl && (sk->rcv_ttl < 255)) - DROP("wrong TTL", sk->rcv_ttl); - if (len < sizeof(struct ospf_packet)) DROP("too short", len); @@ -379,6 +376,10 @@ found: if (ipa_equal(sk->laddr, ifa->des_routers) && (ifa->sk_dr == 0)) return 1; + /* TTL check must be done after instance dispatch */ + if (ifa->check_ttl && (sk->rcv_ttl < 255)) + DROP("wrong TTL", sk->rcv_ttl); + if (rid == p->router_id) DROP1("my own router ID"); @@ -491,8 +492,8 @@ ospf_send_to_agt(struct ospf_iface *ifa, u8 state) void ospf_send_to_bdr(struct ospf_iface *ifa) { - if (ipa_nonzero(ifa->drip)) + if (ipa_nonzero2(ifa->drip)) ospf_send_to(ifa, ifa->drip); - if (ipa_nonzero(ifa->bdrip)) + if (ipa_nonzero2(ifa->bdrip)) ospf_send_to(ifa, ifa->bdrip); } diff --git a/proto/ospf/rt.c b/proto/ospf/rt.c index 368e3d05..c0fe218a 100644 --- a/proto/ospf/rt.c +++ b/proto/ospf/rt.c @@ -10,9 +10,7 @@ #include "ospf.h" -static void add_cand(list * l, struct top_hash_entry *en, - struct top_hash_entry *par, u32 dist, - struct ospf_area *oa, int i); +static void add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry *par, u32 dist, int i, uint lif, uint nif); static void rt_sync(struct ospf_proto *p); @@ -21,17 +19,8 @@ static inline void reset_ri(ort *ort) bzero(&ort->n, sizeof(orta)); } -void -ospf_rt_initort(struct fib_node *fn) -{ - ort *ri = (ort *) fn; - reset_ri(ri); - ri->old_rta = NULL; - ri->fn.flags = 0; -} - static inline int -nh_is_vlink(struct mpnh *nhs) +nh_is_vlink(struct nexthop *nhs) { return !nhs->iface; } @@ -42,20 +31,19 @@ unresolved_vlink(ort *ort) return ort->n.nhs && nh_is_vlink(ort->n.nhs); } -static inline struct mpnh * +static inline struct nexthop * new_nexthop(struct ospf_proto *p, ip_addr gw, struct iface *iface, byte weight) { - struct mpnh *nh = lp_alloc(p->nhpool, sizeof(struct mpnh)); + struct nexthop *nh = lp_allocz(p->nhpool, sizeof(struct nexthop)); nh->gw = gw; nh->iface = iface; - nh->next = NULL; nh->weight = weight; return nh; } /* Returns true if there are device nexthops in n */ static inline int -has_device_nexthops(const struct mpnh *n) +has_device_nexthops(const struct nexthop *n) { for (; n; n = n->next) if (ipa_zero(n->gw)) @@ -65,13 +53,13 @@ has_device_nexthops(const struct mpnh *n) } /* Replace device nexthops with nexthops to gw */ -static struct mpnh * -fix_device_nexthops(struct ospf_proto *p, const struct mpnh *n, ip_addr gw) +static struct nexthop * +fix_device_nexthops(struct ospf_proto *p, const struct nexthop *n, ip_addr gw) { - struct mpnh *root1 = NULL; - struct mpnh *root2 = NULL; - struct mpnh **nn1 = &root1; - struct mpnh **nn2 = &root2; + struct nexthop *root1 = NULL; + struct nexthop *root2 = NULL; + struct nexthop **nn1 = &root1; + struct nexthop **nn2 = &root2; if (!p->ecmp) return new_nexthop(p, gw, n->iface, n->weight); @@ -82,7 +70,7 @@ fix_device_nexthops(struct ospf_proto *p, const struct mpnh *n, ip_addr gw) for (; n; n = n->next) { - struct mpnh *nn = new_nexthop(p, 
ipa_zero(n->gw) ? gw : n->gw, n->iface, n->weight); + struct nexthop *nn = new_nexthop(p, ipa_zero(n->gw) ? gw : n->gw, n->iface, n->weight); if (ipa_zero(n->gw)) { @@ -96,7 +84,7 @@ fix_device_nexthops(struct ospf_proto *p, const struct mpnh *n, ip_addr gw) } } - return mpnh_merge(root1, root2, 1, 1, p->ecmp, p->nhpool); + return nexthop_merge(root1, root2, 1, 1, p->ecmp, p->nhpool); } @@ -292,7 +280,7 @@ ort_merge(struct ospf_proto *p, ort *o, const orta *new) if (old->nhs != new->nhs) { - old->nhs = mpnh_merge(old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse, + old->nhs = nexthop_merge(old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse, p->ecmp, p->nhpool); old->nhs_reuse = 1; } @@ -308,7 +296,7 @@ ort_merge_ext(struct ospf_proto *p, ort *o, const orta *new) if (old->nhs != new->nhs) { - old->nhs = mpnh_merge(old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse, + old->nhs = nexthop_merge(old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse, p->ecmp, p->nhpool); old->nhs_reuse = 1; } @@ -334,9 +322,9 @@ ort_merge_ext(struct ospf_proto *p, ort *o, const orta *new) static inline void -ri_install_net(struct ospf_proto *p, ip_addr prefix, int pxlen, const orta *new) +ri_install_net(struct ospf_proto *p, net_addr *net, const orta *new) { - ort *old = (ort *) fib_get(&p->rtf, &prefix, pxlen); + ort *old = fib_get(&p->rtf, net); int cmp = orta_compare(p, new, &old->n); if (cmp > 0) @@ -348,8 +336,8 @@ ri_install_net(struct ospf_proto *p, ip_addr prefix, int pxlen, const orta *new) static inline void ri_install_rt(struct ospf_area *oa, u32 rid, const orta *new) { - ip_addr addr = ipa_from_rid(rid); - ort *old = (ort *) fib_get(&oa->rtr, &addr, MAX_PREFIX_LENGTH); + net_addr_ip4 nrid = net_from_rid(rid); + ort *old = fib_get(&oa->rtr, (net_addr *) &nrid); int cmp = orta_compare(oa->po, new, &old->n); if (cmp > 0) @@ -359,17 +347,19 @@ ri_install_rt(struct ospf_area *oa, u32 rid, const orta *new) } static inline void -ri_install_asbr(struct ospf_proto *p, ip_addr *addr, const orta *new) +ri_install_asbr(struct ospf_proto *p, u32 rid, const orta *new) { - ort *old = (ort *) fib_get(&p->backbone->rtr, addr, MAX_PREFIX_LENGTH); + net_addr_ip4 nrid = net_from_rid(rid); + ort *old = fib_get(&p->backbone->rtr, (net_addr *) &nrid); + if (orta_compare_asbr(p, new, &old->n) > 0) ort_replace(old, new); } static inline void -ri_install_ext(struct ospf_proto *p, ip_addr prefix, int pxlen, const orta *new) +ri_install_ext(struct ospf_proto *p, net_addr *net, const orta *new) { - ort *old = (ort *) fib_get(&p->rtf, &prefix, pxlen); + ort *old = fib_get(&p->rtf, net); int cmp = orta_compare_ext(p, new, &old->n); if (cmp > 0) @@ -404,7 +394,7 @@ px_pos_to_ifa(struct ospf_area *oa, int pos) static void -add_network(struct ospf_area *oa, ip_addr px, int pxlen, int metric, struct top_hash_entry *en, int pos) +add_network(struct ospf_area *oa, net_addr *net, int metric, struct top_hash_entry *en, int pos) { struct ospf_proto *p = oa->po; @@ -419,7 +409,7 @@ add_network(struct ospf_area *oa, ip_addr px, int pxlen, int metric, struct top_ .nhs = en->nhs }; - if (pxlen < 0 || pxlen > MAX_PREFIX_LENGTH) + if (!ospf_valid_prefix(net)) { log(L_WARN "%s: Invalid prefix in LSA (Type: %04x, Id: %R, Rt: %R)", p->p.name, en->lsa_type, en->lsa.id, en->lsa.rt); @@ -440,7 +430,7 @@ add_network(struct ospf_area *oa, ip_addr px, int pxlen, int metric, struct top_ nf.nhs = ifa ? 
new_nexthop(p, IPA_NONE, ifa->iface, ifa->ecmp_weight) : NULL; } - ri_install_net(p, px, pxlen, &nf); + ri_install_net(p, net, &nf); } @@ -451,8 +441,7 @@ spfa_process_rt(struct ospf_proto *p, struct ospf_area *oa, struct top_hash_entr struct ospf_lsa_rt *rt = act->lsa_body; struct ospf_lsa_rt_walk rtl; struct top_hash_entry *tmp; - ip_addr prefix; - int pxlen, i; + int i; if (rt->options & OPT_RT_V) oa->trcap = 1; @@ -502,9 +491,10 @@ spfa_process_rt(struct ospf_proto *p, struct ospf_area *oa, struct top_hash_entr * the same result by handing them here because add_network() * will keep the best (not the first) found route. */ - prefix = ipa_from_u32(rtl.id & rtl.data); - pxlen = u32_masklen(rtl.data); - add_network(oa, prefix, pxlen, act->dist + rtl.metric, act, i); + net_addr_ip4 net = + NET_ADDR_IP4(ip4_from_u32(rtl.id & rtl.data), u32_masklen(rtl.data)); + + add_network(oa, (net_addr *) &net, act->dist + rtl.metric, act, i); break; case LSART_NET: @@ -517,7 +507,7 @@ spfa_process_rt(struct ospf_proto *p, struct ospf_area *oa, struct top_hash_entr break; } - add_cand(&oa->cand, tmp, act, act->dist + rtl.metric, oa, i); + add_cand(oa, tmp, act, act->dist + rtl.metric, i, rtl.lif, rtl.nif); } } @@ -526,21 +516,21 @@ spfa_process_net(struct ospf_proto *p, struct ospf_area *oa, struct top_hash_ent { struct ospf_lsa_net *ln = act->lsa_body; struct top_hash_entry *tmp; - ip_addr prefix; - int pxlen, i, cnt; + int i, cnt; if (ospf_is_v2(p)) { - prefix = ipa_from_u32(act->lsa.id & ln->optx); - pxlen = u32_masklen(ln->optx); - add_network(oa, prefix, pxlen, act->dist, act, -1); + net_addr_ip4 net = + NET_ADDR_IP4(ip4_from_u32(act->lsa.id & ln->optx), u32_masklen(ln->optx)); + + add_network(oa, (net_addr *) &net, act->dist, act, -1); } cnt = lsa_net_count(&act->lsa); for (i = 0; i < cnt; i++) { tmp = ospf_hash_find_rt(p->gr, oa->areaid, ln->routers[i]); - add_cand(&oa->cand, tmp, act, act->dist, oa, -1); + add_cand(oa, tmp, act, act->dist, -1, 0, 0); } } @@ -549,10 +539,6 @@ spfa_process_prefixes(struct ospf_proto *p, struct ospf_area *oa) { struct top_hash_entry *en, *src; struct ospf_lsa_prefix *px; - ip_addr pxa; - int pxlen; - u8 pxopts; - u16 metric; u32 *buf; int i; @@ -587,18 +573,22 @@ spfa_process_prefixes(struct ospf_proto *p, struct ospf_area *oa) buf = px->rest; for (i = 0; i < px->pxcount; i++) - { - buf = lsa_get_ipv6_prefix(buf, &pxa, &pxlen, &pxopts, &metric); + { + net_addr net; + u8 pxopts; + u16 metric; - if (pxopts & OPT_PX_NU) - continue; + buf = ospf3_get_prefix(buf, ospf_get_af(p), &net, &pxopts, &metric); - /* Store the first global address to use it later as a vlink endpoint */ - if ((pxopts & OPT_PX_LA) && ipa_zero(src->lb)) - src->lb = pxa; + if (pxopts & OPT_PX_NU) + continue; - add_network(oa, pxa, pxlen, src->dist + metric, src, i); - } + /* Store the first global address to use it later as a vlink endpoint */ + if ((pxopts & OPT_PX_LA) && (net.type == NET_IP6) && ipa_zero(src->lb)) + src->lb = ipa_from_ip6(net6_prefix(&net)); + + add_network(oa, &net, src->dist + metric, src, i); + } } } @@ -659,7 +649,8 @@ ospf_rt_spfa(struct ospf_area *oa) } static int -link_back(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry *par) +link_back(struct ospf_area *oa, struct top_hash_entry *en, + struct top_hash_entry *par, uint lif, uint nif) { struct ospf_proto *p = oa->po; struct ospf_lsa_rt_walk rtl; @@ -697,6 +688,10 @@ link_back(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry tmp = ospf_hash_find_net(p->gr, oa->areaid, rtl.id, 
rtl.nif); if (tmp == par) { + /* + * Note that there may be multiple matching Rt-fields if router 'en' + * have multiple interfaces to net 'par'. Perhaps we should do ECMP. + */ if (ospf_is_v2(p)) en->lb = ipa_from_u32(rtl.data); else @@ -708,7 +703,13 @@ link_back(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry case LSART_VLNK: case LSART_PTP: - /* Not necessary the same link, see RFC 2328 [23] */ + /* + * For OSPFv2, not necessary the same link, see RFC 2328 [23]. + * For OSPFv3, we verify that by comparing nif and lif fields. + */ + if (ospf_is_v3(p) && ((rtl.lif != nif) || (rtl.nif != lif))) + break; + tmp = ospf_hash_find_rt(p->gr, oa->areaid, rtl.id); if (tmp == par) return 1; @@ -741,13 +742,12 @@ ospf_rt_sum(struct ospf_area *oa) { struct ospf_proto *p = oa->po; struct top_hash_entry *en; - ip_addr ip, abrip; + net_addr net; u32 dst_rid, metric, options; ort *abr; - int pxlen = -1, type = -1; + int type; u8 pxopts; - OSPF_TRACE(D_EVENTS, "Starting routing table calculation for inter-area (area %R)", oa->areaid); WALK_SLIST(en, p->lsal) @@ -770,18 +770,18 @@ ospf_rt_sum(struct ospf_area *oa) if (en->lsa_type == LSA_T_SUM_NET) { - lsa_parse_sum_net(en, ospf_is_v2(p), &ip, &pxlen, &pxopts, &metric); - - if (pxopts & OPT_PX_NU) - continue; + lsa_parse_sum_net(en, ospf_is_v2(p), ospf_get_af(p), &net, &pxopts, &metric); - if (pxlen < 0 || pxlen > MAX_PREFIX_LENGTH) + if (!ospf_valid_prefix(&net)) { log(L_WARN "%s: Invalid prefix in LSA (Type: %04x, Id: %R, Rt: %R)", p->p.name, en->lsa_type, en->lsa.id, en->lsa.rt); continue; } + if (pxopts & OPT_PX_NU) + continue; + options = 0; type = ORT_NET; } @@ -802,8 +802,8 @@ ospf_rt_sum(struct ospf_area *oa) continue; /* 16.2. (4) */ - abrip = ipa_from_rid(en->lsa.rt); - abr = (ort *) fib_find(&oa->rtr, &abrip, MAX_PREFIX_LENGTH); + net_addr_ip4 nrid = net_from_rid(en->lsa.rt); + abr = fib_find(&oa->rtr, (net_addr *) &nrid); if (!abr || !abr->n.type) continue; @@ -827,7 +827,7 @@ ospf_rt_sum(struct ospf_area *oa) }; if (type == ORT_NET) - ri_install_net(p, ip, pxlen, &nf); + ri_install_net(p, &net, &nf); else ri_install_rt(oa, dst_rid, &nf); } @@ -841,11 +841,7 @@ ospf_rt_sum_tr(struct ospf_area *oa) struct ospf_area *bb = p->backbone; struct top_hash_entry *en; ort *re, *abr; - ip_addr ip, abrip; - u32 dst_rid, metric, options; - int pxlen; - u8 pxopts; - + u32 metric; if (!bb) return; @@ -868,26 +864,31 @@ ospf_rt_sum_tr(struct ospf_area *oa) if (en->lsa_type == LSA_T_SUM_NET) { - lsa_parse_sum_net(en, ospf_is_v2(p), &ip, &pxlen, &pxopts, &metric); + net_addr net; + u8 pxopts; - if (pxopts & OPT_PX_NU) - continue; + lsa_parse_sum_net(en, ospf_is_v2(p), ospf_get_af(p), &net, &pxopts, &metric); - if (pxlen < 0 || pxlen > MAX_PREFIX_LENGTH) + if (!ospf_valid_prefix(&net)) { log(L_WARN "%s: Invalid prefix in LSA (Type: %04x, Id: %R, Rt: %R)", p->p.name, en->lsa_type, en->lsa.id, en->lsa.rt); continue; } - re = fib_find(&p->rtf, &ip, pxlen); + if (pxopts & OPT_PX_NU) + continue; + + re = fib_find(&p->rtf, &net); } else // en->lsa_type == LSA_T_SUM_RT { + u32 dst_rid, options; + lsa_parse_sum_rt(en, ospf_is_v2(p), &dst_rid, &metric, &options); - ip = ipa_from_rid(dst_rid); - re = fib_find(&bb->rtr, &ip, MAX_PREFIX_LENGTH); + net_addr_ip4 nrid = net_from_rid(dst_rid); + re = fib_find(&bb->rtr, (net_addr *) &nrid); } /* 16.3 (1b) */ @@ -905,8 +906,8 @@ ospf_rt_sum_tr(struct ospf_area *oa) continue; /* 16.3. 
(4) */ - abrip = ipa_from_rid(en->lsa.rt); - abr = fib_find(&oa->rtr, &abrip, MAX_PREFIX_LENGTH); + net_addr_ip4 nrid = net_from_rid(en->lsa.rt); + abr = fib_find(&oa->rtr, (net_addr *) &nrid); if (!abr || !abr->n.type) continue; @@ -997,7 +998,7 @@ decide_sum_lsa(struct ospf_area *oa, ort *nf, int dest) return 1; struct area_net *anet = (struct area_net *) - fib_route(&nf->n.oa->net_fib, nf->fn.prefix, nf->fn.pxlen); + fib_route(&nf->n.oa->net_fib, nf->fn.addr); /* Condensed area network found */ if (anet) @@ -1016,13 +1017,13 @@ check_sum_net_lsa(struct ospf_proto *p, ort *nf) if (nf->area_net) { /* It is a default route for stub areas, handled entirely in ospf_rt_abr() */ - if (nf->fn.pxlen == 0) + if (nf->fn.addr->pxlen == 0) return; /* Find that area network */ WALK_LIST(anet_oa, p->area_list) { - anet = (struct area_net *) fib_find(&anet_oa->net_fib, &nf->fn.prefix, nf->fn.pxlen); + anet = fib_find(&anet_oa->net_fib, nf->fn.addr); if (anet) break; } @@ -1041,14 +1042,16 @@ check_sum_net_lsa(struct ospf_proto *p, ort *nf) static inline void check_sum_rt_lsa(struct ospf_proto *p, ort *nf) { + u32 rid = rid_from_net(nf->fn.addr); + struct ospf_area *oa; WALK_LIST(oa, p->area_list) if (decide_sum_lsa(oa, nf, ORT_ROUTER)) - ospf_originate_sum_rt_lsa(p, oa, nf, nf->n.metric1, nf->n.options); + ospf_originate_sum_rt_lsa(p, oa, rid, nf->n.metric1, nf->n.options); } static inline int -decide_nssa_lsa(struct ospf_proto *p UNUSED4 UNUSED6, ort *nf, struct ospf_lsa_ext_local *rt) +decide_nssa_lsa(struct ospf_proto *p, ort *nf, struct ospf_lsa_ext_local *rt) { struct ospf_area *oa = nf->n.oa; struct top_hash_entry *en = nf->n.en; @@ -1057,14 +1060,14 @@ decide_nssa_lsa(struct ospf_proto *p UNUSED4 UNUSED6, ort *nf, struct ospf_lsa_e return 0; /* Condensed area network found */ - if (fib_route(&oa->enet_fib, nf->fn.prefix, nf->fn.pxlen)) + if (fib_route(&oa->enet_fib, nf->fn.addr)) return 0; if (!en || (en->lsa_type != LSA_T_NSSA)) return 0; /* We do not store needed data in struct orta, we have to parse the LSA */ - lsa_parse_ext(en, ospf_is_v2(p), rt); + lsa_parse_ext(en, ospf_is_v2(p), ospf_get_af(p), rt); if (rt->pxopts & OPT_PX_NU) return 0; @@ -1092,7 +1095,7 @@ check_nssa_lsa(struct ospf_proto *p, ort *nf) /* Find that area network */ WALK_LIST(oa, p->area_list) { - anet = (struct area_net *) fib_find(&oa->enet_fib, &nf->fn.prefix, nf->fn.pxlen); + anet = fib_find(&oa->enet_fib, nf->fn.addr); if (anet) break; } @@ -1162,24 +1165,20 @@ static void ospf_rt_abr1(struct ospf_proto *p) { struct area_net *anet; - ort *nf, *default_nf; + ort *default_nf; + net_addr default_net; /* RFC 2328 G.3 - incomplete resolution of virtual next hops - routers */ - FIB_WALK(&p->backbone->rtr, nftmp) + FIB_WALK(&p->backbone->rtr, ort, nf) { - nf = (ort *) nftmp; - if (nf->n.type && unresolved_vlink(nf)) reset_ri(nf); } FIB_WALK_END; - FIB_WALK(&p->rtf, nftmp) + FIB_WALK(&p->rtf, ort, nf) { - nf = (ort *) nftmp; - - /* RFC 2328 G.3 - incomplete resolution of virtual next hops - networks */ if (nf->n.type && unresolved_vlink(nf)) reset_ri(nf); @@ -1188,7 +1187,7 @@ ospf_rt_abr1(struct ospf_proto *p) /* Compute condensed area networks */ if (nf->n.type == RTS_OSPF) { - anet = (struct area_net *) fib_route(&nf->n.oa->net_fib, nf->fn.prefix, nf->fn.pxlen); + anet = (struct area_net *) fib_route(&nf->n.oa->net_fib, nf->fn.addr); if (anet) { if (!anet->active) @@ -1196,7 +1195,7 @@ ospf_rt_abr1(struct ospf_proto *p) anet->active = 1; /* Get a RT entry and mark it to know that it is an area network */ - ort *nfi = (ort 
*) fib_get(&p->rtf, &anet->fn.prefix, anet->fn.pxlen); + ort *nfi = fib_get(&p->rtf, anet->fn.addr); nfi->area_net = 1; /* 16.2. (3) */ @@ -1211,8 +1210,13 @@ ospf_rt_abr1(struct ospf_proto *p) } FIB_WALK_END; - ip_addr addr = IPA_NONE; - default_nf = (ort *) fib_get(&p->rtf, &addr, 0); + + if (ospf_is_v2(p)) + net_fill_ip4(&default_net, IP4_NONE, 0); + else + net_fill_ip6(&default_net, IP6_NONE, 0); + + default_nf = fib_get(&p->rtf, &default_net); default_nf->area_net = 1; struct ospf_area *oa; @@ -1239,11 +1243,10 @@ ospf_rt_abr1(struct ospf_proto *p) /* RFC 2328 16.4. (3) - precompute preferred ASBR entries */ if (oa_is_ext(oa)) { - FIB_WALK(&oa->rtr, nftmp) + FIB_WALK(&oa->rtr, ort, nf) { - nf = (ort *) nftmp; if (nf->n.options & ORTA_ASBR) - ri_install_asbr(p, &nf->fn.prefix, &nf->n); + ri_install_asbr(p, rid_from_net(nf->fn.addr), &nf->n); } FIB_WALK_END; } @@ -1251,9 +1254,9 @@ ospf_rt_abr1(struct ospf_proto *p) /* Originate or flush ASBR summary LSAs */ - FIB_WALK(&p->backbone->rtr, nftmp) + FIB_WALK(&p->backbone->rtr, ort, nf) { - check_sum_rt_lsa(p, (ort *) nftmp); + check_sum_rt_lsa(p, nf); } FIB_WALK_END; @@ -1280,8 +1283,6 @@ ospf_rt_abr2(struct ospf_proto *p) { struct ospf_area *oa; struct top_hash_entry *en; - ort *nf, *nf2; - /* RFC 3103 3.1 - type-7 translator election */ struct ospf_area *bb = p->backbone; @@ -1293,13 +1294,12 @@ ospf_rt_abr2(struct ospf_proto *p) if (oa->ac->translator) goto decided; - FIB_WALK(&oa->rtr, nftmp) + FIB_WALK(&oa->rtr, ort, nf) { - nf = (ort *) nftmp; if (!nf->n.type || !(nf->n.options & ORTA_ABR)) continue; - nf2 = fib_find(&bb->rtr, &nf->fn.prefix, MAX_PREFIX_LENGTH); + ort *nf2 = fib_find(&bb->rtr, nf->fn.addr); if (!nf2 || !nf2->n.type || !(nf2->n.options & ORTA_ABR)) continue; @@ -1329,23 +1329,21 @@ ospf_rt_abr2(struct ospf_proto *p) if (!translate && (oa->translate == TRANS_ON)) { if (oa->translator_timer == NULL) - oa->translator_timer = tm_new_set(p->p.pool, translator_timer_hook, oa, 0, 0); + oa->translator_timer = tm_new_init(p->p.pool, translator_timer_hook, oa, 0, 0); /* Schedule the end of translation */ - tm_start(oa->translator_timer, oa->ac->transint); + tm_start(oa->translator_timer, oa->ac->transint S); oa->translate = TRANS_WAIT; } } /* Compute condensed external networks */ - FIB_WALK(&p->rtf, nftmp) + FIB_WALK(&p->rtf, ort, nf) { - nf = (ort *) nftmp; if (rt_is_nssa(nf) && (nf->n.options & ORTA_PROP)) { - struct area_net *anet = (struct area_net *) - fib_route(&nf->n.oa->enet_fib, nf->fn.prefix, nf->fn.pxlen); + struct area_net *anet = fib_route(&nf->n.oa->enet_fib, nf->fn.addr); if (anet) { @@ -1354,7 +1352,7 @@ ospf_rt_abr2(struct ospf_proto *p) anet->active = 1; /* Get a RT entry and mark it to know that it is an area network */ - nf2 = (ort *) fib_get(&p->rtf, &anet->fn.prefix, anet->fn.pxlen); + ort *nf2 = fib_get(&p->rtf, anet->fn.addr); nf2->area_net = 1; } @@ -1369,10 +1367,8 @@ ospf_rt_abr2(struct ospf_proto *p) FIB_WALK_END; - FIB_WALK(&p->rtf, nftmp) + FIB_WALK(&p->rtf, ort, nf) { - nf = (ort *) nftmp; - check_sum_net_lsa(p, nf); check_nssa_lsa(p, nf); } @@ -1382,22 +1378,57 @@ ospf_rt_abr2(struct ospf_proto *p) /* Like fib_route(), but ignores dummy rt entries */ static void * -ospf_fib_route(struct fib *f, ip_addr a, int len) +ospf_fib_route_ip4(struct fib *f, ip4_addr a, int len) +{ + net_addr_ip4 net = NET_ADDR_IP4(a, len); + ort *nf; + +loop: + nf = fib_find(f, (net_addr *) &net); + if (nf && nf->n.type) + return nf; + + if (net.pxlen > 0) + { + net.pxlen--; + ip4_clrbit(&net.prefix, net.pxlen); + goto 
loop; + } + + return NULL; +} + +static void * +ospf_fib_route_ip6(struct fib *f, ip6_addr a, int len) { - ip_addr a0; + net_addr_ip6 net = NET_ADDR_IP6(a, len); ort *nf; - while (len >= 0) +loop: + nf = fib_find(f, (net_addr *) &net); + if (nf && nf->n.type) + return nf; + + if (net.pxlen > 0) { - a0 = ipa_and(a, ipa_mkmask(len)); - nf = fib_find(f, &a0, len); - if (nf && nf->n.type) - return nf; - len--; + net.pxlen--; + ip6_clrbit(&net.prefix, net.pxlen); + goto loop; } + return NULL; } +static void * +ospf_fib_route(struct fib *f, ip_addr a) +{ + if (f->addr_type == NET_IP4) + return ospf_fib_route_ip4(f, ipa_to_ip4(a), IP4_MAX_PREFIX_LENGTH); + else + return ospf_fib_route_ip6(f, ipa_to_ip6(a), IP6_MAX_PREFIX_LENGTH); +} + + /* RFC 2328 16.4. calculating external routes */ static void ospf_ext_spf(struct ospf_proto *p) @@ -1405,7 +1436,6 @@ ospf_ext_spf(struct ospf_proto *p) struct top_hash_entry *en; struct ospf_lsa_ext_local rt; ort *nf1, *nf2; - ip_addr rtid; u32 br_metric; struct ospf_area *atmp; @@ -1429,21 +1459,20 @@ ospf_ext_spf(struct ospf_proto *p) DBG("%s: Working on LSA. ID: %R, RT: %R, Type: %u\n", p->p.name, en->lsa.id, en->lsa.rt, en->lsa_type); - lsa_parse_ext(en, ospf_is_v2(p), &rt); - - if (rt.metric == LSINFINITY) - continue; + lsa_parse_ext(en, ospf_is_v2(p), ospf_get_af(p), &rt); - if (rt.pxopts & OPT_PX_NU) - continue; - - if (rt.pxlen < 0 || rt.pxlen > MAX_PREFIX_LENGTH) + if (!ospf_valid_prefix(&rt.net)) { log(L_WARN "%s: Invalid prefix in LSA (Type: %04x, Id: %R, Rt: %R)", p->p.name, en->lsa_type, en->lsa.id, en->lsa.rt); continue; } + if (rt.metric == LSINFINITY) + continue; + + if (rt.pxopts & OPT_PX_NU) + continue; /* 16.4. (3) */ /* If there are more areas, we already precomputed preferred ASBR @@ -1457,8 +1486,8 @@ ospf_ext_spf(struct ospf_proto *p) if (!atmp) continue; /* Should not happen */ - rtid = ipa_from_rid(en->lsa.rt); - nf1 = fib_find(&atmp->rtr, &rtid, MAX_PREFIX_LENGTH); + net_addr_ip4 nrid = net_from_rid(en->lsa.rt); + nf1 = fib_find(&atmp->rtr, (net_addr *) &nrid); if (!nf1 || !nf1->n.type) continue; /* No AS boundary router found */ @@ -1468,7 +1497,7 @@ ospf_ext_spf(struct ospf_proto *p) /* 16.4. 
(3) NSSA - special rule for default routes */ /* ABR should use default only if P-bit is set and summaries are active */ - if ((en->lsa_type == LSA_T_NSSA) && ipa_zero(rt.ip) && (rt.pxlen == 0) && + if ((en->lsa_type == LSA_T_NSSA) && (rt.net.pxlen == 0) && (p->areano > 1) && !(rt.propagate && atmp->ac->summary)) continue; @@ -1480,7 +1509,7 @@ ospf_ext_spf(struct ospf_proto *p) } else { - nf2 = ospf_fib_route(&p->rtf, rt.fwaddr, MAX_PREFIX_LENGTH); + nf2 = ospf_fib_route(&p->rtf, rt.fwaddr); if (!nf2) continue; @@ -1542,7 +1571,7 @@ ospf_ext_spf(struct ospf_proto *p) nfa.oa = atmp; /* undefined in RFC 2328 */ nfa.en = en; /* store LSA for later (NSSA processing) */ - ri_install_ext(p, rt.ip, rt.pxlen, &nfa); + ri_install_ext(p, &rt.net, &nfa); } } @@ -1552,13 +1581,10 @@ ospf_rt_reset(struct ospf_proto *p) { struct ospf_area *oa; struct top_hash_entry *en; - struct area_net *anet; - ort *ri; /* Reset old routing table */ - FIB_WALK(&p->rtf, nftmp) + FIB_WALK(&p->rtf, ort, ri) { - ri = (ort *) nftmp; ri->area_net = 0; ri->keep = 0; reset_ri(ri); @@ -1580,9 +1606,8 @@ ospf_rt_reset(struct ospf_proto *p) WALK_LIST(oa, p->area_list) { /* Reset ASBR routing tables */ - FIB_WALK(&oa->rtr, nftmp) + FIB_WALK(&oa->rtr, ort, ri) { - ri = (ort *) nftmp; reset_ri(ri); } FIB_WALK_END; @@ -1590,17 +1615,15 @@ ospf_rt_reset(struct ospf_proto *p) /* Reset condensed area networks */ if (p->areano > 1) { - FIB_WALK(&oa->net_fib, nftmp) + FIB_WALK(&oa->net_fib, struct area_net, anet) { - anet = (struct area_net *) nftmp; anet->active = 0; anet->metric = 0; } FIB_WALK_END; - FIB_WALK(&oa->enet_fib, nftmp) + FIB_WALK(&oa->enet_fib, struct area_net, anet) { - anet = (struct area_net *) nftmp; anet->active = 0; anet->metric = 0; } @@ -1659,19 +1682,33 @@ ospf_rt_spf(struct ospf_proto *p) static inline int -inherit_nexthops(struct mpnh *pn) +inherit_nexthops(struct nexthop *pn) { /* Proper nexthops (with defined GW) or dummy vlink nexthops (without iface) */ return pn && (ipa_nonzero(pn->gw) || !pn->iface); } -static struct mpnh * +static inline ip_addr +link_lsa_lladdr(struct ospf_proto *p, struct top_hash_entry *en) +{ + struct ospf_lsa_link *link_lsa = en->lsa_body; + ip6_addr ll = link_lsa->lladdr; + + if (ip6_zero(ll)) + return IPA_NONE; + + return ospf_is_ip4(p) ? ipa_from_ip4(ospf3_6to4(ll)) : ipa_from_ip6(ll); +} + +static struct nexthop * calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, - struct top_hash_entry *par, int pos) + struct top_hash_entry *par, int pos, uint lif, uint nif) { struct ospf_proto *p = oa->po; - struct mpnh *pn = par->nhs; - struct ospf_iface *ifa; + struct nexthop *pn = par->nhs; + struct top_hash_entry *link = NULL; + struct ospf_iface *ifa = NULL; + ip_addr nh = IPA_NONE; u32 rid = en->lsa.rt; /* 16.1.1. 
The next hop calculation */ @@ -1696,6 +1733,9 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, if (!ifa) return NULL; + if (ospf_is_v3(p) && (ifa->iface_id != lif)) + log(L_WARN "%s: Inconsistent interface ID %u/%u", p->p.name, ifa->iface_id, lif); + return new_nexthop(p, IPA_NONE, ifa->iface, ifa->ecmp_weight); } @@ -1706,14 +1746,44 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, if (!ifa) return NULL; + if (ospf_is_v3(p) && (ifa->iface_id != lif)) + log(L_WARN "%s: Inconsistent interface ID %u/%u", p->p.name, ifa->iface_id, lif); + if (ifa->type == OSPF_IT_VLINK) return new_nexthop(p, IPA_NONE, NULL, 0); - struct ospf_neighbor *m = find_neigh(ifa, rid); - if (!m || (m->state != NEIGHBOR_FULL)) - return NULL; + /* FIXME: On physical PtP links we may skip next-hop altogether */ + + if (ospf_is_v2(p) || ospf_is_ip6(p)) + { + /* + * In this case, next-hop is a source address from neighbor's packets. + * That is necessary for OSPFv2 and practical for OSPFv3 (as it works even + * if neighbor uses LinkLSASuppression), but does not work with OSPFv3-AF + * on IPv4 topology, where src is IPv6 but next-hop should be IPv4. + */ + struct ospf_neighbor *m = find_neigh(ifa, rid); + if (!m || (m->state != NEIGHBOR_FULL)) + return NULL; + + nh = m->ip; + } + else + { + /* + * Next-hop is taken from lladdr field of Link-LSA, based on Neighbor + * Iface ID (nif) field in our Router-LSA, which is just nbr->iface_id. + */ + link = ospf_hash_find(p->gr, ifa->iface_id, nif, rid, LSA_T_LINK); + if (!link) + return NULL; + + nh = link_lsa_lladdr(p, link); + if (ipa_zero(nh)) + return NULL; + } - return new_nexthop(p, m->ip, ifa->iface, ifa->ecmp_weight); + return new_nexthop(p, nh, ifa->iface, ifa->ecmp_weight); } /* The third case - bcast or nbma neighbor */ @@ -1740,18 +1810,15 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, * Next-hop is taken from lladdr field of Link-LSA, en->lb_id * is computed in link_back(). 
*/ - struct top_hash_entry *lhe; - lhe = ospf_hash_find(p->gr, pn->iface->index, en->lb_id, rid, LSA_T_LINK); - - if (!lhe) + link = ospf_hash_find(p->gr, pn->iface->index, en->lb_id, rid, LSA_T_LINK); + if (!link) return NULL; - struct ospf_lsa_link *llsa = lhe->lsa_body; - - if (ip6_zero(llsa->lladdr)) + nh = link_lsa_lladdr(p, link); + if (ipa_zero(nh)) return NULL; - return new_nexthop(p, ipa_from_ip6(llsa->lladdr), pn->iface, pn->weight); + return new_nexthop(p, nh, pn->iface, pn->weight); } } @@ -1764,8 +1831,8 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, /* Add LSA into list of candidates in Dijkstra's algorithm */ static void -add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, - u32 dist, struct ospf_area *oa, int pos) +add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry *par, + u32 dist, int pos, uint lif, uint nif) { struct ospf_proto *p = oa->po; node *prev, *n; @@ -1778,9 +1845,9 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, if (en->lsa.age == LSA_MAXAGE) return; - if (ospf_is_v3(p) && (en->lsa_type == LSA_T_RT)) + if (ospf_is_v3(p) && (oa->options & OPT_V6) && (en->lsa_type == LSA_T_RT)) { - /* In OSPFv3, check V6 flag */ + /* In OSPFv3 IPv6 unicast, check V6 flag */ struct ospf_lsa_rt *rt = en->lsa_body; if (!(rt->options & OPT_V6)) return; @@ -1795,10 +1862,10 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, return; /* We should check whether there is a reverse link from en to par, */ - if (!link_back(oa, en, par)) + if (!link_back(oa, en, par, lif, nif)) return; - struct mpnh *nhs = calc_next_hop(oa, en, par, pos); + struct nexthop *nhs = calc_next_hop(oa, en, par, pos, lif, nif); if (!nhs) { log(L_WARN "%s: Cannot find next hop for LSA (Type: %04x, Id: %R, Rt: %R)", @@ -1836,7 +1903,7 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, /* Merge old and new */ int new_reuse = (par->nhs != nhs); - en->nhs = mpnh_merge(en->nhs, nhs, en->nhs_reuse, new_reuse, p->ecmp, p->nhpool); + en->nhs = nexthop_merge(en->nhs, nhs, en->nhs_reuse, new_reuse, p->ecmp, p->nhpool); en->nhs_reuse = 1; return; } @@ -1855,20 +1922,20 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, prev = NULL; - if (EMPTY_LIST(*l)) + if (EMPTY_LIST(oa->cand)) { - add_head(l, &en->cn); + add_head(&oa->cand, &en->cn); } else { - WALK_LIST(n, *l) + WALK_LIST(n, oa->cand) { act = SKIP_BACK(struct top_hash_entry, cn, n); if ((act->dist > dist) || ((act->dist == dist) && (act->lsa_type == LSA_T_RT))) { if (prev == NULL) - add_head(l, &en->cn); + add_head(&oa->cand, &en->cn); else insert_node(&en->cn, prev); added = 1; @@ -1879,7 +1946,7 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par, if (!added) { - add_tail(l, &en->cn); + add_tail(&oa->cand, &en->cn); } } } @@ -1892,8 +1959,7 @@ ort_changed(ort *nf, rta *nr) (nf->n.metric1 != nf->old_metric1) || (nf->n.metric2 != nf->old_metric2) || (nf->n.tag != nf->old_tag) || (nf->n.rid != nf->old_rid) || (nr->source != or->source) || (nr->dest != or->dest) || - (nr->iface != or->iface) || !ipa_equal(nr->gw, or->gw) || - !mpnh_same(nr->nexthops, or->nexthops); + !nexthop_same(&(nr->nh), &(or->nh)); } static void @@ -1902,25 +1968,22 @@ rt_sync(struct ospf_proto *p) struct top_hash_entry *en; struct fib_iterator fit; struct fib *fib = &p->rtf; - ort *nf; struct ospf_area *oa; /* This is used for forced reload of routes */ int reload = (p->calcrt == 2); - 
OSPF_TRACE(D_EVENTS, "Starting routing table synchronisation"); + OSPF_TRACE(D_EVENTS, "Starting routing table synchronization"); DBG("Now syncing my rt table with nest's\n"); FIB_ITERATE_INIT(&fit, fib); again1: - FIB_ITERATE_START(fib, &fit, nftmp) + FIB_ITERATE_START(fib, &fit, ort, nf) { - nf = (ort *) nftmp; - /* Sanity check of next-hop addresses, failure should not happen */ if (nf->n.type) { - struct mpnh *nh; + struct nexthop *nh; for (nh = nf->n.nhs; nh; nh = nh->next) if (ipa_nonzero(nh->gw)) { @@ -1943,29 +2006,12 @@ again1: .src = p->p.main_source, .source = nf->n.type, .scope = SCOPE_UNIVERSE, - .cast = RTC_UNICAST + .dest = RTD_UNICAST, + .nh = *(nf->n.nhs), }; - if (nf->n.nhs->next) - { - a0.dest = RTD_MULTIPATH; - a0.nexthops = nf->n.nhs; - } - else if (ipa_nonzero(nf->n.nhs->gw)) - { - a0.dest = RTD_ROUTER; - a0.iface = nf->n.nhs->iface; - a0.gw = nf->n.nhs->gw; - } - else - { - a0.dest = RTD_DEVICE; - a0.iface = nf->n.nhs->iface; - } - if (reload || ort_changed(nf, &a0)) { - net *ne = net_get(p->p.table, nf->fn.prefix, nf->fn.pxlen); rta *a = rta_lookup(&a0); rte *e = rte_get_temp(a); @@ -1976,12 +2022,10 @@ again1: e->u.ospf.tag = nf->old_tag = nf->n.tag; e->u.ospf.router_id = nf->old_rid = nf->n.rid; e->pflags = 0; - e->net = ne; - e->pref = p->p.preference; - DBG("Mod rte type %d - %I/%d via %I on iface %s, met %d\n", - a0.source, nf->fn.prefix, nf->fn.pxlen, a0.gw, a0.iface ? a0.iface->name : "(none)", nf->n.metric1); - rte_update(&p->p, ne, e); + DBG("Mod rte type %d - %N via %I on iface %s, met %d\n", + a0.source, nf->fn.addr, a0.gw, a0.iface ? a0.iface->name : "(none)", nf->n.metric1); + rte_update(&p->p, nf->fn.addr, e); } } else if (nf->old_rta) @@ -1990,19 +2034,21 @@ again1: rta_free(nf->old_rta); nf->old_rta = NULL; - net *ne = net_get(p->p.table, nf->fn.prefix, nf->fn.pxlen); - rte_update(&p->p, ne, NULL); + rte_update(&p->p, nf->fn.addr, NULL); } /* Remove unused rt entry, some special entries are persistent */ if (!nf->n.type && !nf->external_rte && !nf->area_net && !nf->keep) { - FIB_ITERATE_PUT(&fit, nftmp); - fib_delete(fib, nftmp); + if (nf->lsa_id) + idm_free(&p->idm, nf->lsa_id); + + FIB_ITERATE_PUT(&fit); + fib_delete(fib, nf); goto again1; } } - FIB_ITERATE_END(nftmp); + FIB_ITERATE_END; WALK_LIST(oa, p->area_list) @@ -2010,18 +2056,16 @@ again1: /* Cleanup ASBR hash tables */ FIB_ITERATE_INIT(&fit, &oa->rtr); again2: - FIB_ITERATE_START(&oa->rtr, &fit, nftmp) + FIB_ITERATE_START(&oa->rtr, &fit, ort, nf) { - nf = (ort *) nftmp; - if (!nf->n.type) { - FIB_ITERATE_PUT(&fit, nftmp); - fib_delete(&oa->rtr, nftmp); + FIB_ITERATE_PUT(&fit); + fib_delete(&oa->rtr, nf); goto again2; } } - FIB_ITERATE_END(nftmp); + FIB_ITERATE_END; } /* Cleanup stale LSAs */ diff --git a/proto/ospf/rt.h b/proto/ospf/rt.h index 73b28375..589d2bc5 100644 --- a/proto/ospf/rt.h +++ b/proto/ospf/rt.h @@ -53,7 +53,7 @@ typedef struct orta struct ospf_area *oa; struct ospf_area *voa; /* Used when route is replaced in ospf_rt_sum_tr(), NULL otherwise */ - struct mpnh *nhs; /* Next hops computed during SPF */ + struct nexthop *nhs; /* Next hops computed during SPF */ struct top_hash_entry *en; /* LSA responsible for this orta */ } orta; @@ -78,13 +78,15 @@ typedef struct ort * route was not in the last update, in that case other old_* values are not * valid. 
*/ - struct fib_node fn; orta n; u32 old_metric1, old_metric2, old_tag, old_rid; rta *old_rta; + u32 lsa_id; u8 external_rte; u8 area_net; u8 keep; + + struct fib_node fn; } ort; diff --git a/proto/ospf/topology.c b/proto/ospf/topology.c index 341eff87..717c8280 100644 --- a/proto/ospf/topology.c +++ b/proto/ospf/topology.c @@ -70,7 +70,7 @@ ospf_install_lsa(struct ospf_proto *p, struct ospf_lsa_header *lsa, u32 type, u3 en->lsa_body = body; en->lsa = *lsa; en->init_age = en->lsa.age; - en->inst_time = now; + en->inst_time = current_time(); /* * We do not set en->mode. It is either default LSA_M_BASIC, or in a special @@ -128,7 +128,7 @@ ospf_advance_lsa(struct ospf_proto *p, struct top_hash_entry *en, struct ospf_ls en->lsa.sn = lsa->sn + 1; en->lsa.age = 0; en->init_age = 0; - en->inst_time = now; + en->inst_time = current_time(); lsa_generate_checksum(&en->lsa, en->lsa_body); OSPF_TRACE(D_EVENTS, "Advancing LSA: Type: %04x, Id: %R, Rt: %R, Seq: %08x", @@ -160,7 +160,7 @@ ospf_advance_lsa(struct ospf_proto *p, struct top_hash_entry *en, struct ospf_ls en->lsa = *lsa; en->lsa.age = LSA_MAXAGE; en->init_age = lsa->age; - en->inst_time = now; + en->inst_time = current_time(); OSPF_TRACE(D_EVENTS, "Resetting LSA: Type: %04x, Id: %R, Rt: %R, Seq: %08x", en->lsa_type, en->lsa.id, en->lsa.rt, en->lsa.sn); @@ -196,7 +196,7 @@ static int ospf_do_originate_lsa(struct ospf_proto *p, struct top_hash_entry *en, void *lsa_body, u16 lsa_blen, u16 lsa_opts) { /* Enforce MinLSInterval */ - if ((en->init_age == 0) && en->inst_time && ((en->inst_time + MINLSINTERVAL) > now)) + if (!en->init_age && en->inst_time && (lsa_inst_age(en) < MINLSINTERVAL)) return 0; /* Handle wrapping sequence number */ @@ -237,7 +237,7 @@ ospf_do_originate_lsa(struct ospf_proto *p, struct top_hash_entry *en, void *lsa en->lsa.sn++; en->lsa.age = 0; en->init_age = 0; - en->inst_time = now; + en->inst_time = current_time(); lsa_generate_checksum(&en->lsa, en->lsa_body); OSPF_TRACE(D_EVENTS, "Originating LSA: Type: %04x, Id: %R, Rt: %R, Seq: %08x", @@ -283,8 +283,8 @@ ospf_originate_lsa(struct ospf_proto *p, struct ospf_new_lsa *lsa) if (en->nf != lsa->nf) { - log(L_ERR "%s: LSA ID collision for %I/%d", - p->p.name, lsa->nf->fn.prefix, lsa->nf->fn.pxlen); + log(L_ERR "%s: LSA ID collision for %N", + p->p.name, lsa->nf->fn.addr); en = NULL; goto drop; @@ -381,7 +381,7 @@ ospf_refresh_lsa(struct ospf_proto *p, struct top_hash_entry *en) en->lsa.sn++; en->lsa.age = 0; en->init_age = 0; - en->inst_time = now; + en->inst_time = current_time(); lsa_generate_checksum(&en->lsa, en->lsa_body); ospf_flood_lsa(p, en, NULL); } @@ -476,14 +476,15 @@ void ospf_update_lsadb(struct ospf_proto *p) { struct top_hash_entry *en, *nxt; - bird_clock_t real_age; + btime now_ = current_time(); + int real_age; WALK_SLIST_DELSAFE(en, nxt, p->lsal) { if (en->next_lsa_body) ospf_originate_next_lsa(p, en); - real_age = en->init_age + (now - en->inst_time); + real_age = en->init_age + (now_ - en->inst_time) TO_S; if (en->lsa.age == LSA_MAXAGE) { @@ -514,14 +515,14 @@ ospf_update_lsadb(struct ospf_proto *p) } -static inline u32 -ort_to_lsaid(struct ospf_proto *p UNUSED4 UNUSED6, ort *nf) +static u32 +ort_to_lsaid(struct ospf_proto *p, ort *nf) { /* * In OSPFv2, We have to map IP prefixes to u32 in such manner that resulting * u32 interpreted as IP address is a member of given prefix. Therefore, /32 - * prefix have to be mapped on itself. All received prefixes have to be - * mapped on different u32s. + * prefix has to be mapped on itself. 
All received prefixes have to be mapped + * on different u32s. * * We have an assumption that if there is nontrivial (non-/32) network prefix, * then there is not /32 prefix for the first and the last IP address of the @@ -542,17 +543,21 @@ ort_to_lsaid(struct ospf_proto *p UNUSED4 UNUSED6, ort *nf) * network appeared, we choose a different way. * * In OSPFv3, it is simpler. There is not a requirement for membership of the - * result in the input network, so we just use a hash-based unique ID of a - * routing table entry for a route that originated given LSA. For ext-LSA, it - * is an imported route in the nest's routing table (p->table). For summary-LSA, - * it is a 'source' route in the protocol internal routing table (p->rtf). + * result in the input network, so we just allocate a unique ID from ID map + * and store it in nf->lsa_id for further reference. */ if (ospf_is_v3(p)) - return nf->fn.uid; + { + if (!nf->lsa_id) + nf->lsa_id = idm_alloc(&p->idm); - u32 id = ipa_to_u32(nf->fn.prefix); - int pxlen = nf->fn.pxlen; + return nf->lsa_id; + } + + net_addr_ip4 *net = (void *) nf->fn.addr; + u32 id = ip4_to_u32(net->prefix); + int pxlen = net->pxlen; if ((pxlen == 0) || (pxlen == 32)) return id; @@ -628,12 +633,12 @@ configured_stubnet(struct ospf_area *oa, struct ifa *a) { if (sn->summary) { - if (ipa_in_net(a->prefix, sn->px.addr, sn->px.len) && (a->pxlen >= sn->px.len)) + if (net_in_netX(&a->prefix, &sn->prefix)) return 1; } else { - if (ipa_equal(a->prefix, sn->px.addr) && (a->pxlen == sn->px.len)) + if (net_equal(&a->prefix, &sn->prefix)) return 1; } } @@ -781,7 +786,8 @@ prepare_rt2_lsa_body(struct ospf_proto *p, struct ospf_area *oa) (ifa->type == OSPF_IT_PTMP)) add_rt2_lsa_link(p, LSART_STUB, ipa_to_u32(ifa->addr->ip), 0xffffffff, 0); else - add_rt2_lsa_link(p, LSART_STUB, ipa_to_u32(ifa->addr->prefix), u32_mkmask(ifa->addr->pxlen), ifa->cost); + add_rt2_lsa_link(p, LSART_STUB, ip4_to_u32(net4_prefix(&ifa->addr->prefix)), + u32_mkmask(net4_pxlen(&ifa->addr->prefix)), ifa->cost); i++; ifa->rt_pos_end = i; @@ -790,7 +796,8 @@ prepare_rt2_lsa_body(struct ospf_proto *p, struct ospf_area *oa) struct ospf_stubnet_config *sn; WALK_LIST(sn, oa->ac->stubnet_list) if (!sn->hidden) - add_rt2_lsa_link(p, LSART_STUB, ipa_to_u32(sn->px.addr), u32_mkmask(sn->px.len), sn->cost), i++; + add_rt2_lsa_link(p, LSART_STUB, ip4_to_u32(net4_prefix(&sn->prefix)), + u32_mkmask(net4_pxlen(&sn->prefix)), sn->cost), i++; struct ospf_lsa_rt *rt = p->lsab; /* Store number of links in lower half of options */ @@ -907,7 +914,7 @@ prepare_net2_lsa_body(struct ospf_proto *p, struct ospf_iface *ifa) ASSERT(p->lsab_used == 0); net = lsab_alloc(p, sizeof(struct ospf_lsa_net) + 4 * nodes); - net->optx = u32_mkmask(ifa->addr->pxlen); + net->optx = u32_mkmask(ifa->addr->prefix.pxlen); net->routers[0] = p->router_id; WALK_LIST(n, ifa->neigh_list) @@ -999,9 +1006,10 @@ prepare_sum3_net_lsa_body(struct ospf_proto *p, ort *nf, u32 metric) { struct ospf_lsa_sum3_net *sum; - sum = lsab_allocz(p, sizeof(struct ospf_lsa_sum3_net) + IPV6_PREFIX_SPACE(nf->fn.pxlen)); + sum = lsab_allocz(p, sizeof(struct ospf_lsa_sum3_net) + + IPV6_PREFIX_SPACE(nf->fn.addr->pxlen)); sum->metric = metric; - put_ipv6_prefix(sum->prefix, nf->fn.prefix, nf->fn.pxlen, 0, 0); + ospf3_put_prefix(sum->prefix, nf->fn.addr, 0, 0); } static inline void @@ -1028,7 +1036,7 @@ ospf_originate_sum_net_lsa(struct ospf_proto *p, struct ospf_area *oa, ort *nf, }; if (ospf_is_v2(p)) - prepare_sum2_lsa_body(p, nf->fn.pxlen, metric); + prepare_sum2_lsa_body(p, 
nf->fn.addr->pxlen, metric); else prepare_sum3_net_lsa_body(p, nf, metric); @@ -1036,20 +1044,20 @@ ospf_originate_sum_net_lsa(struct ospf_proto *p, struct ospf_area *oa, ort *nf, } void -ospf_originate_sum_rt_lsa(struct ospf_proto *p, struct ospf_area *oa, ort *nf, int metric, u32 options) +ospf_originate_sum_rt_lsa(struct ospf_proto *p, struct ospf_area *oa, u32 drid, int metric, u32 options) { struct ospf_new_lsa lsa = { .type = LSA_T_SUM_RT, .mode = LSA_M_RTCALC, .dom = oa->areaid, - .id = ipa_to_rid(nf->fn.prefix), /* Router ID of ASBR, irrelevant for OSPFv3 */ + .id = drid, /* Router ID of ASBR, irrelevant for OSPFv3 */ .opts = oa->options }; if (ospf_is_v2(p)) prepare_sum2_lsa_body(p, 0, metric); else - prepare_sum3_rt_lsa_body(p, lsa.id, metric, options & LSA_OPTIONS_MASK); + prepare_sum3_rt_lsa_body(p, drid, metric, options & LSA_OPTIONS_MASK); ospf_originate_lsa(p, &lsa); } @@ -1082,7 +1090,7 @@ prepare_ext3_lsa_body(struct ospf_proto *p, ort *nf, { struct ospf_lsa_ext3 *ext; int bsize = sizeof(struct ospf_lsa_ext3) - + IPV6_PREFIX_SPACE(nf->fn.pxlen) + + IPV6_PREFIX_SPACE(nf->fn.addr->pxlen) + (ipa_nonzero(fwaddr) ? 16 : 0) + (tag ? 4 : 0); @@ -1090,7 +1098,7 @@ prepare_ext3_lsa_body(struct ospf_proto *p, ort *nf, ext->metric = metric & LSA_METRIC_MASK; u32 *buf = ext->rest; - buf = put_ipv6_prefix(buf, nf->fn.prefix, nf->fn.pxlen, pbit ? OPT_PX_P : 0, 0); + buf = ospf3_put_prefix(buf, nf->fn.addr, pbit ? OPT_PX_P : 0, 0); if (ebit) ext->metric |= LSA_EXT3_EBIT; @@ -1098,7 +1106,7 @@ prepare_ext3_lsa_body(struct ospf_proto *p, ort *nf, if (ipa_nonzero(fwaddr)) { ext->metric |= LSA_EXT3_FBIT; - buf = put_ipv6_addr(buf, fwaddr); + buf = ospf3_put_addr(buf, fwaddr); } if (tag) @@ -1140,7 +1148,7 @@ ospf_originate_ext_lsa(struct ospf_proto *p, struct ospf_area *oa, ort *nf, u8 m }; if (ospf_is_v2(p)) - prepare_ext2_lsa_body(p, nf->fn.pxlen, metric, ebit, fwaddr, tag); + prepare_ext2_lsa_body(p, nf->fn.addr->pxlen, metric, ebit, fwaddr, tag); else prepare_ext3_lsa_body(p, nf, metric, ebit, fwaddr, tag, oa && pbit); @@ -1177,7 +1185,7 @@ use_gw_for_fwaddr(struct ospf_proto *p, ip_addr gw, struct iface *iface) WALK_LIST(ifa, p->iface_list) if ((ifa->iface == iface) && - (!ospf_is_v2(p) || ipa_in_net(gw, ifa->addr->prefix, ifa->addr->pxlen))) + (!ospf_is_v2(p) || ipa_in_netX(gw, &ifa->addr->prefix))) return 1; return 0; @@ -1215,7 +1223,8 @@ find_surrogate_fwaddr(struct ospf_proto *p, struct ospf_area *oa) { WALK_LIST(a, ifa->iface->addrs) { - if ((a->flags & IA_SECONDARY) || + if ((a->prefix.type != ospf_get_af(p)) || + (a->flags & IA_SECONDARY) || (a->flags & IA_PEER) || (a->scope <= SCOPE_LINK)) continue; @@ -1234,7 +1243,7 @@ find_surrogate_fwaddr(struct ospf_proto *p, struct ospf_area *oa) } void -ospf_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *ea) +ospf_rt_notify(struct proto *P, struct channel *ch UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *ea) { struct ospf_proto *p = (struct ospf_proto *) P; struct ospf_area *oa = NULL; /* non-NULL for NSSA-LSA */ @@ -1253,7 +1262,7 @@ ospf_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old U if (!new) { - nf = (ort *) fib_find(&p->rtf, &n->n.prefix, n->n.pxlen); + nf = fib_find(&p->rtf, n->n.addr); if (!nf || !nf->external_rte) return; @@ -1280,8 +1289,8 @@ ospf_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old U ip_addr fwd = IPA_NONE; - if ((a->dest == RTD_ROUTER) && use_gw_for_fwaddr(p, a->gw, a->iface)) - fwd = a->gw; + if ((a->dest == 
RTD_UNICAST) && use_gw_for_fwaddr(p, a->nh.gw, a->nh.iface)) + fwd = a->nh.gw; /* NSSA-LSA with P-bit set must have non-zero forwarding address */ if (oa && ipa_zero(fwd)) @@ -1290,13 +1299,13 @@ ospf_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old U if (ipa_zero(fwd)) { - log(L_ERR "%s: Cannot find forwarding address for NSSA-LSA %I/%d", - p->p.name, n->n.prefix, n->n.pxlen); + log(L_ERR "%s: Cannot find forwarding address for NSSA-LSA %N", + p->p.name, n->n.addr); return; } } - nf = (ort *) fib_get(&p->rtf, &n->n.prefix, n->n.pxlen); + nf = fib_get(&p->rtf, n->n.addr); ospf_originate_ext_lsa(p, oa, nf, LSA_M_EXPORT, metric, ebit, fwd, tag, 1); nf->external_rte = 1; } @@ -1308,38 +1317,47 @@ ospf_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old U */ static inline void -lsab_put_prefix(struct ospf_proto *p, ip_addr prefix, u32 pxlen, u32 cost) +lsab_put_prefix(struct ospf_proto *p, net_addr *n, u32 cost) { - void *buf = lsab_alloc(p, IPV6_PREFIX_SPACE(pxlen)); - u8 flags = (pxlen < MAX_PREFIX_LENGTH) ? 0 : OPT_PX_LA; - put_ipv6_prefix(buf, prefix, pxlen, flags, cost); + void *buf = lsab_alloc(p, IPV6_PREFIX_SPACE(net_pxlen(n))); + uint max = (n->type == NET_IP4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH; + u8 flags = (net_pxlen(n) < max) ? 0 : OPT_PX_LA; + ospf3_put_prefix(buf, n, flags, cost); } static void prepare_link_lsa_body(struct ospf_proto *p, struct ospf_iface *ifa) { - struct ospf_lsa_link *ll; + ip_addr nh = ospf_is_ip4(p) ? IPA_NONE : ifa->addr->ip; int i = 0; + /* Preallocating space for header */ ASSERT(p->lsab_used == 0); - ll = lsab_allocz(p, sizeof(struct ospf_lsa_link)); - ll->options = ifa->oa->options | (ifa->priority << 24); - ll->lladdr = ipa_to_ip6(ifa->addr->ip); - ll = NULL; /* buffer might be reallocated later */ + lsab_allocz(p, sizeof(struct ospf_lsa_link)); struct ifa *a; WALK_LIST(a, ifa->iface->addrs) { - if ((a->flags & IA_SECONDARY) || - (a->scope < SCOPE_SITE)) + if ((a->prefix.type != ospf_get_af(p)) || + (a->flags & IA_SECONDARY) || + (a->scope <= SCOPE_LINK)) continue; - lsab_put_prefix(p, a->prefix, a->pxlen, 0); + if (ospf_is_ip4(p) && ipa_zero(nh)) + nh = a->ip; + + lsab_put_prefix(p, &a->prefix, 0); i++; } - ll = p->lsab; + /* Filling the preallocated header */ + struct ospf_lsa_link *ll = p->lsab; + ll->options = ifa->oa->options | (ifa->priority << 24); + ll->lladdr = ospf_is_ip4(p) ? 
ospf3_4to6(ipa_to_ip4(nh)) : ipa_to_ip6(nh); ll->pxcount = i; + + if (ipa_zero(nh)) + log(L_ERR "%s: Cannot find next hop address for %s", p->p.name, ifa->ifname); } static void @@ -1401,12 +1419,13 @@ prepare_prefix_rt_lsa_body(struct ospf_proto *p, struct ospf_area *oa) struct ifa *a; WALK_LIST(a, ifa->iface->addrs) { - if ((a->flags & IA_SECONDARY) || + if ((a->prefix.type != ospf_get_af(p)) || + (a->flags & IA_SECONDARY) || (a->flags & IA_PEER) || (a->scope <= SCOPE_LINK)) continue; - if (((a->pxlen < MAX_PREFIX_LENGTH) && net_lsa) || + if (((a->prefix.pxlen < IP6_MAX_PREFIX_LENGTH) && net_lsa) || configured_stubnet(oa, a)) continue; @@ -1414,11 +1433,12 @@ prepare_prefix_rt_lsa_body(struct ospf_proto *p, struct ospf_area *oa) (ifa->state == OSPF_IS_LOOP) || (ifa->type == OSPF_IT_PTMP)) { - lsab_put_prefix(p, a->ip, MAX_PREFIX_LENGTH, 0); + net_addr_ip6 net = NET_ADDR_IP6(a->ip, IP6_MAX_PREFIX_LENGTH); + lsab_put_prefix(p, (net_addr *) &net, 0); host_addr = 1; } else - lsab_put_prefix(p, a->prefix, a->pxlen, ifa->cost); + lsab_put_prefix(p, &a->prefix, ifa->cost); i++; } @@ -1429,15 +1449,15 @@ prepare_prefix_rt_lsa_body(struct ospf_proto *p, struct ospf_area *oa) WALK_LIST(sn, oa->ac->stubnet_list) if (!sn->hidden) { - lsab_put_prefix(p, sn->px.addr, sn->px.len, sn->cost); - if (sn->px.len == MAX_PREFIX_LENGTH) + lsab_put_prefix(p, &sn->prefix, sn->cost); + if (sn->prefix.pxlen == IP6_MAX_PREFIX_LENGTH) host_addr = 1; i++; } /* If there are some configured vlinks, find some global address (even from another area), which will be used as a vlink endpoint. */ - if (!EMPTY_LIST(cf->vlink_list) && !host_addr) + if (!EMPTY_LIST(cf->vlink_list) && !host_addr && ospf_is_ip6(p)) { WALK_LIST(ifa, p->iface_list) { @@ -1447,11 +1467,14 @@ prepare_prefix_rt_lsa_body(struct ospf_proto *p, struct ospf_area *oa) struct ifa *a; WALK_LIST(a, ifa->iface->addrs) { - if ((a->flags & IA_SECONDARY) || (a->scope <= SCOPE_LINK)) + if ((a->prefix.type != NET_IP6) || + (a->flags & IA_SECONDARY) || + (a->scope <= SCOPE_LINK)) continue; /* Found some IP */ - lsab_put_prefix(p, a->ip, MAX_PREFIX_LENGTH, 0); + net_addr_ip6 net = NET_ADDR_IP6(a->ip, IP6_MAX_PREFIX_LENGTH); + lsab_put_prefix(p, (net_addr *) &net, 0); i++; goto done; } @@ -1557,7 +1580,7 @@ add_link_lsa(struct ospf_proto *p, struct ospf_lsa_link *ll, int offset, int *px continue; /* Skip link-local prefixes */ - if ((pxlen >= 10) && ((pxb[1] & 0xffc00000) == 0xfe800000)) + if (ospf_is_ip6(p) && (pxlen >= 10) && ((pxb[1] & 0xffc00000) == 0xfe800000)) continue; add_prefix(p, pxb, offset, pxc); @@ -1614,7 +1637,7 @@ ospf_originate_prefix_net_lsa(struct ospf_proto *p, struct ospf_iface *ifa) } static inline int breaks_minlsinterval(struct top_hash_entry *en) -{ return en && (en->lsa.age < LSA_MAXAGE) && ((en->inst_time + MINLSINTERVAL) > now); } +{ return en && (en->lsa.age < LSA_MAXAGE) && (lsa_inst_age(en) < MINLSINTERVAL); } void ospf_update_topology(struct ospf_proto *p) @@ -1748,7 +1771,7 @@ ospf_top_hash(struct top_graph *f, u32 domain, u32 lsaid, u32 rtrid, u32 type) * and request lists of OSPF neighbors. 
*/ struct top_graph * -ospf_top_new(struct ospf_proto *p UNUSED4 UNUSED6, pool *pool) +ospf_top_new(struct ospf_proto *p, pool *pool) { struct top_graph *f; diff --git a/proto/ospf/topology.h b/proto/ospf/topology.h index 5652ced0..ac87334b 100644 --- a/proto/ospf/topology.h +++ b/proto/ospf/topology.h @@ -26,9 +26,9 @@ struct top_hash_entry void *next_lsa_body; /* For postponed LSA origination */ u16 next_lsa_blen; /* For postponed LSA origination */ u16 next_lsa_opts; /* For postponed LSA origination */ - bird_clock_t inst_time; /* Time of installation into DB */ + btime inst_time; /* Time of installation into DB */ struct ort *nf; /* Reference fibnode for sum and ext LSAs, NULL for otherwise */ - struct mpnh *nhs; /* Computed nexthops - valid only in ospf_rt_spf() */ + struct nexthop *nhs; /* Computed nexthops - valid only in ospf_rt_spf() */ ip_addr lb; /* In OSPFv2, link back address. In OSPFv3, any global address in the area useful for vlinks */ u32 lb_id; /* Interface ID of link back iface (for bcast or NBMA networks) */ u32 dist; /* Distance from the root */ @@ -185,10 +185,10 @@ static inline void ospf_flush2_lsa(struct ospf_proto *p, struct top_hash_entry * { if (*en) { ospf_flush_lsa(p, *en); *en = NULL; } } void ospf_originate_sum_net_lsa(struct ospf_proto *p, struct ospf_area *oa, ort *nf, int metric); -void ospf_originate_sum_rt_lsa(struct ospf_proto *p, struct ospf_area *oa, ort *nf, int metric, u32 options); +void ospf_originate_sum_rt_lsa(struct ospf_proto *p, struct ospf_area *oa, u32 drid, int metric, u32 options); void ospf_originate_ext_lsa(struct ospf_proto *p, struct ospf_area *oa, ort *nf, u8 mode, u32 metric, u32 ebit, ip_addr fwaddr, u32 tag, int pbit); -void ospf_rt_notify(struct proto *P, rtable *tbl, net *n, rte *new, rte *old, ea_list *attrs); +void ospf_rt_notify(struct proto *P, struct channel *ch, net *n, rte *new, rte *old, ea_list *attrs); void ospf_update_topology(struct ospf_proto *p); struct top_hash_entry *ospf_hash_find(struct top_graph *, u32 domain, u32 lsa, u32 rtr, u32 type); diff --git a/proto/pipe/Makefile b/proto/pipe/Makefile index 77de5b88..5093da98 100644 --- a/proto/pipe/Makefile +++ b/proto/pipe/Makefile @@ -1,6 +1,6 @@ -source=pipe.c -root-rel=../../ -dir-name=proto/pipe - -include ../../Rules +src := pipe.c +obj := $(src-o-files) +$(all-daemon) +$(cf-local) +tests_objs := $(tests_objs) $(src-o-files)
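The ospf/packet.c and ospf/topology.c hunks above replace the old one-second bird_clock_t timestamps (csn_use, inst_time) with the microsecond-based btime clock, writing interval checks as "> 1 S" and converting elapsed time back to seconds with "TO_S". A minimal standalone sketch of that arithmetic, assuming btime counts microseconds and using illustrative demo_* names rather than BIRD's real macros:

/* Sketch only: btime is assumed to be a 64-bit count of microseconds. */
typedef long long demo_btime;

#define DEMO_S ((demo_btime) 1000000)      /* one second in btime units */
#define DEMO_TO_S(t) ((t) / DEMO_S)        /* btime interval -> whole seconds */

/* Mirrors the guard in ospf_pkt_finalize(): bump the cryptographic sequence
   number (CSN) only if at least one second passed since it was last used. */
static int demo_csn_may_bump(demo_btime now, demo_btime csn_use)
{
  return (now - csn_use) > 1 * DEMO_S;
}

/* Mirrors the age computation in ospf_update_lsadb(): the effective LSA age
   is the age at installation plus the seconds elapsed since installation. */
static int demo_lsa_real_age(int init_age, demo_btime inst_time, demo_btime now)
{
  return init_age + (int) DEMO_TO_S(now - inst_time);
}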
\ No newline at end of file diff --git a/proto/pipe/config.Y b/proto/pipe/config.Y index 8daf2e7c..f51ee575 100644 --- a/proto/pipe/config.Y +++ b/proto/pipe/config.Y @@ -16,28 +16,25 @@ CF_DEFINES CF_DECLS -CF_KEYWORDS(PIPE, PEER, TABLE, MODE, OPAQUE, TRANSPARENT) +CF_KEYWORDS(PIPE, PEER, TABLE) CF_GRAMMAR -CF_ADDTO(proto, pipe_proto '}') +CF_ADDTO(proto, pipe_proto '}' { this_channel = NULL; } ) -pipe_proto_start: proto_start PIPE { - this_proto = proto_config_new(&proto_pipe, $1); - PIPE_CFG->mode = PIPE_TRANSPARENT; - } - ; +pipe_proto_start: proto_start PIPE +{ + this_proto = proto_config_new(&proto_pipe, $1); + this_channel = channel_config_new(NULL, 0, this_proto); + this_channel->in_filter = FILTER_ACCEPT; + this_channel->out_filter = FILTER_ACCEPT; +}; pipe_proto: pipe_proto_start proto_name '{' | pipe_proto proto_item ';' - | pipe_proto PEER TABLE SYM ';' { - if ($4->class != SYM_TABLE) - cf_error("Routing table name expected"); - PIPE_CFG->peer = $4->def; - } - | pipe_proto MODE OPAQUE ';' { PIPE_CFG->mode = PIPE_OPAQUE; } - | pipe_proto MODE TRANSPARENT ';' { PIPE_CFG->mode = PIPE_TRANSPARENT; } + | pipe_proto channel_item ';' + | pipe_proto PEER TABLE rtable ';' { PIPE_CFG->peer = $4; } ; CF_CODE diff --git a/proto/pipe/pipe.c b/proto/pipe/pipe.c index 6ef80322..310f3c01 100644 --- a/proto/pipe/pipe.c +++ b/proto/pipe/pipe.c @@ -44,54 +44,42 @@ #include "pipe.h" static void -pipe_rt_notify(struct proto *P, rtable *src_table, net *n, rte *new, rte *old, ea_list *attrs) +pipe_rt_notify(struct proto *P, struct channel *src_ch, net *n, rte *new, rte *old, ea_list *attrs) { - struct pipe_proto *p = (struct pipe_proto *) P; - struct announce_hook *ah = (src_table == P->table) ? p->peer_ahook : P->main_ahook; - rtable *dst_table = ah->table; + struct pipe_proto *p = (void *) P; + struct channel *dst = (src_ch == p->pri) ? p->sec : p->pri; struct rte_src *src; - net *nn; rte *e; - rta a; + rta *a; if (!new && !old) return; - if (dst_table->pipe_busy) + if (dst->table->pipe_busy) { - log(L_ERR "Pipe loop detected when sending %I/%d to table %s", - n->n.prefix, n->n.pxlen, dst_table->name); + log(L_ERR "Pipe loop detected when sending %N to table %s", + n->n.addr, dst->table->name); return; } - nn = net_get(dst_table, n->n.prefix, n->n.pxlen); if (new) { - memcpy(&a, new->attrs, sizeof(rta)); - - if (p->mode == PIPE_OPAQUE) - { - a.src = P->main_source; - a.source = RTS_PIPE; - } - - a.aflags = 0; - a.eattrs = attrs; - a.hostentry = NULL; - e = rte_get_temp(&a); - e->net = nn; + a = alloca(rta_size(new->attrs)); + memcpy(a, new->attrs, rta_size(new->attrs)); + + a->aflags = 0; + a->eattrs = attrs; + a->hostentry = NULL; + e = rte_get_temp(a); e->pflags = 0; - if (p->mode == PIPE_TRANSPARENT) - { - /* Copy protocol specific embedded attributes. */ - memcpy(&(e->u), &(new->u), sizeof(e->u)); - e->pref = new->pref; - e->pflags = new->pflags; - } + /* Copy protocol specific embedded attributes. 
*/ + memcpy(&(e->u), &(new->u), sizeof(e->u)); + e->pref = new->pref; + e->pflags = new->pflags; - src = a.src; + src = a->src; } else { @@ -99,9 +87,9 @@ pipe_rt_notify(struct proto *P, rtable *src_table, net *n, rte *new, rte *old, e src = old->attrs->src; } - src_table->pipe_busy = 1; - rte_update2(ah, nn, e, src); - src_table->pipe_busy = 0; + src_ch->table->pipe_busy = 1; + rte_update2(dst, n->n.addr, e, src); + src_ch->table->pipe_busy = 0; } static int @@ -111,171 +99,117 @@ pipe_import_control(struct proto *P, rte **ee, ea_list **ea UNUSED, struct linpo if (pp == P) return -1; /* Avoid local loops automatically */ + return 0; } -static int -pipe_reload_routes(struct proto *P) +static void +pipe_reload_routes(struct channel *C) { - struct pipe_proto *p = (struct pipe_proto *) P; - - /* - * Because the pipe protocol feeds routes from both routing tables - * together, both directions are reloaded during refeed and 'reload - * out' command works like 'reload' command. For symmetry, we also - * request refeed when 'reload in' command is used. - */ - proto_request_feeding(P); + struct pipe_proto *p = (void *) C->proto; - proto_reset_limit(P->main_ahook->in_limit); - proto_reset_limit(p->peer_ahook->in_limit); - - return 1; + /* Route reload on one channel is just refeed on the other */ + channel_request_feeding((C == p->pri) ? p->sec : p->pri); } -static struct proto * -pipe_init(struct proto_config *C) -{ - struct pipe_config *c = (struct pipe_config *) C; - struct proto *P = proto_new(C, sizeof(struct pipe_proto)); - struct pipe_proto *p = (struct pipe_proto *) P; - p->mode = c->mode; - p->peer_table = c->peer->table; - P->accept_ra_types = (p->mode == PIPE_OPAQUE) ? RA_OPTIMAL : RA_ANY; - P->rt_notify = pipe_rt_notify; - P->import_control = pipe_import_control; - P->reload_routes = pipe_reload_routes; - - return P; -} - -static int -pipe_start(struct proto *P) +static void +pipe_postconfig(struct proto_config *CF) { - struct pipe_config *cf = (struct pipe_config *) P->cf; - struct pipe_proto *p = (struct pipe_proto *) P; + struct pipe_config *cf = (void *) CF; + struct channel_config *cc = proto_cf_main_channel(CF); - /* Lock both tables, unlock is handled in pipe_cleanup() */ - rt_lock_table(P->table); - rt_lock_table(p->peer_table); + if (!cc->table) + cf_error("Primary routing table not specified"); - /* Going directly to PS_UP - prepare for feeding, - connect the protocol to both routing tables */ + if (!cf->peer) + cf_error("Secondary routing table not specified"); - P->main_ahook = proto_add_announce_hook(P, P->table, &P->stats); - P->main_ahook->out_filter = cf->c.out_filter; - P->main_ahook->in_limit = cf->c.in_limit; - proto_reset_limit(P->main_ahook->in_limit); + if (cc->table == cf->peer) + cf_error("Primary table and peer table must be different"); - p->peer_ahook = proto_add_announce_hook(P, p->peer_table, &p->peer_stats); - p->peer_ahook->out_filter = cf->c.in_filter; - p->peer_ahook->in_limit = cf->c.out_limit; - proto_reset_limit(p->peer_ahook->in_limit); + if (cc->table->addr_type != cf->peer->addr_type) + cf_error("Primary table and peer table must have the same type"); - if (p->mode == PIPE_OPAQUE) - { - P->main_source = rt_get_source(P, 0); - rt_lock_source(P->main_source); - } + if (cc->rx_limit.action) + cf_error("Pipe protocol does not support receive limits"); - return PS_UP; + if (cc->in_keep_filtered) + cf_error("Pipe protocol prohibits keeping filtered routes"); } -static void -pipe_cleanup(struct proto *P) +static int +pipe_configure_channels(struct 
pipe_proto *p, struct pipe_config *cf) { - struct pipe_proto *p = (struct pipe_proto *) P; - - bzero(&P->stats, sizeof(struct proto_stats)); - bzero(&p->peer_stats, sizeof(struct proto_stats)); - - P->main_ahook = NULL; - p->peer_ahook = NULL; - - if (p->mode == PIPE_OPAQUE) - rt_unlock_source(P->main_source); - P->main_source = NULL; - - rt_unlock_table(P->table); - rt_unlock_table(p->peer_table); + struct channel_config *cc = proto_cf_main_channel(&cf->c); + + struct channel_config pri_cf = { + .name = "pri", + .channel = cc->channel, + .table = cc->table, + .out_filter = cc->out_filter, + .in_limit = cc->in_limit, + .ra_mode = RA_ANY + }; + + struct channel_config sec_cf = { + .name = "sec", + .channel = cc->channel, + .table = cf->peer, + .out_filter = cc->in_filter, + .in_limit = cc->out_limit, + .ra_mode = RA_ANY + }; + + return + proto_configure_channel(&p->p, &p->pri, &pri_cf) && + proto_configure_channel(&p->p, &p->sec, &sec_cf); } -static void -pipe_postconfig(struct proto_config *C) +static struct proto * +pipe_init(struct proto_config *CF) { - struct pipe_config *c = (struct pipe_config *) C; + struct proto *P = proto_new(CF); + struct pipe_proto *p = (void *) P; + struct pipe_config *cf = (void *) CF; - if (!c->peer) - cf_error("Name of peer routing table not specified"); - if (c->peer == C->table) - cf_error("Primary table and peer table must be different"); + P->rt_notify = pipe_rt_notify; + P->import_control = pipe_import_control; + P->reload_routes = pipe_reload_routes; - if (C->in_keep_filtered) - cf_error("Pipe protocol prohibits keeping filtered routes"); - if (C->rx_limit) - cf_error("Pipe protocol does not support receive limits"); -} + pipe_configure_channels(p, cf); -extern int proto_reconfig_type; + return P; +} static int -pipe_reconfigure(struct proto *P, struct proto_config *new) +pipe_reconfigure(struct proto *P, struct proto_config *CF) { - struct pipe_proto *p = (struct pipe_proto *)P; - struct proto_config *old = P->cf; - struct pipe_config *oc = (struct pipe_config *) old; - struct pipe_config *nc = (struct pipe_config *) new; - - if ((oc->peer->table != nc->peer->table) || (oc->mode != nc->mode)) - return 0; - - /* Update output filters in ahooks */ - if (P->main_ahook) - { - P->main_ahook->out_filter = new->out_filter; - P->main_ahook->in_limit = new->in_limit; - proto_verify_limits(P->main_ahook); - } - - if (p->peer_ahook) - { - p->peer_ahook->out_filter = new->in_filter; - p->peer_ahook->in_limit = new->out_limit; - proto_verify_limits(p->peer_ahook); - } - - if ((P->proto_state != PS_UP) || (proto_reconfig_type == RECONFIG_SOFT)) - return 1; - - if ((new->preference != old->preference) - || ! filter_same(new->in_filter, old->in_filter) - || ! filter_same(new->out_filter, old->out_filter)) - proto_request_feeding(P); + struct pipe_proto *p = (void *) P; + struct pipe_config *cf = (void *) CF; - return 1; + return pipe_configure_channels(p, cf); } static void -pipe_copy_config(struct proto_config *dest, struct proto_config *src) +pipe_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUSED) { /* Just a shallow copy, not many items here */ - proto_copy_rest(dest, src, sizeof(struct pipe_config)); } static void pipe_get_status(struct proto *P, byte *buf) { - struct pipe_proto *p = (struct pipe_proto *) P; + struct pipe_proto *p = (void *) P; - bsprintf(buf, "%c> %s", (p->mode == PIPE_OPAQUE) ? 
'-' : '=', p->peer_table->name); + bsprintf(buf, "%s <=> %s", p->pri->table->name, p->sec->table->name); } static void pipe_show_stats(struct pipe_proto *p) { - struct proto_stats *s1 = &p->p.stats; - struct proto_stats *s2 = &p->peer_stats; + struct proto_stats *s1 = &p->pri->stats; + struct proto_stats *s2 = &p->sec->stats; /* * Pipe stats (as anything related to pipes) are a bit tricky. There @@ -318,17 +252,16 @@ pipe_show_stats(struct pipe_proto *p) static void pipe_show_proto_info(struct proto *P) { - struct pipe_proto *p = (struct pipe_proto *) P; - struct pipe_config *cf = (struct pipe_config *) P->cf; + struct pipe_proto *p = (void *) P; - // cli_msg(-1006, " Table: %s", P->table->name); - // cli_msg(-1006, " Peer table: %s", p->peer_table->name); - cli_msg(-1006, " Preference: %d", P->preference); - cli_msg(-1006, " Input filter: %s", filter_name(cf->c.in_filter)); - cli_msg(-1006, " Output filter: %s", filter_name(cf->c.out_filter)); + cli_msg(-1006, " Channel %s", "main"); + cli_msg(-1006, " Table: %s", p->pri->table->name); + cli_msg(-1006, " Peer table: %s", p->sec->table->name); + cli_msg(-1006, " Import filter: %s", filter_name(p->sec->out_filter)); + cli_msg(-1006, " Export filter: %s", filter_name(p->pri->out_filter)); - proto_show_limit(cf->c.in_limit, "Import limit:"); - proto_show_limit(cf->c.out_limit, "Export limit:"); + channel_show_limit(&p->pri->in_limit, "Import limit:"); + channel_show_limit(&p->sec->in_limit, "Export limit:"); if (P->proto_state != PS_DOWN) pipe_show_stats(p); @@ -338,13 +271,10 @@ pipe_show_proto_info(struct proto *P) struct protocol proto_pipe = { .name = "Pipe", .template = "pipe%d", - .multitable = 1, - .preference = DEF_PREF_PIPE, + .proto_size = sizeof(struct pipe_proto), .config_size = sizeof(struct pipe_config), .postconfig = pipe_postconfig, .init = pipe_init, - .start = pipe_start, - .cleanup = pipe_cleanup, .reconfigure = pipe_reconfigure, .copy_config = pipe_copy_config, .get_status = pipe_get_status, diff --git a/proto/pipe/pipe.h b/proto/pipe/pipe.h index 50b31698..038c6666 100644 --- a/proto/pipe/pipe.h +++ b/proto/pipe/pipe.h @@ -9,27 +9,15 @@ #ifndef _BIRD_PIPE_H_ #define _BIRD_PIPE_H_ -#define PIPE_OPAQUE 0 -#define PIPE_TRANSPARENT 1 - struct pipe_config { struct proto_config c; struct rtable_config *peer; /* Table we're connected to */ - int mode; /* PIPE_OPAQUE or PIPE_TRANSPARENT */ }; struct pipe_proto { struct proto p; - struct rtable *peer_table; - struct announce_hook *peer_ahook; /* Announce hook for direction peer->primary */ - struct proto_stats peer_stats; /* Statistics for the direction peer->primary */ - int mode; /* PIPE_OPAQUE or PIPE_TRANSPARENT */ + struct channel *pri; + struct channel *sec; }; - -extern struct protocol proto_pipe; - -static inline int proto_is_pipe(struct proto *p) -{ return p->proto == &proto_pipe; } - #endif diff --git a/proto/radv/Makefile b/proto/radv/Makefile index efc4d4af..05317eff 100644 --- a/proto/radv/Makefile +++ b/proto/radv/Makefile @@ -1,5 +1,6 @@ -source=radv.c packets.c -root-rel=../../ -dir-name=proto/radv +src := packets.c radv.c +obj := $(src-o-files) +$(all-daemon) +$(cf-local) -include ../../Rules +tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file diff --git a/proto/radv/config.Y b/proto/radv/config.Y index 0ff84aeb..0e43c237 100644 --- a/proto/radv/config.Y +++ b/proto/radv/config.Y @@ -41,6 +41,7 @@ CF_ADDTO(proto, radv_proto) radv_proto_start: proto_start RADV { this_proto = proto_config_new(&proto_radv, $1); + init_list(&RADV_CFG->patt_list); init_list(&RADV_CFG->pref_list); init_list(&RADV_CFG->rdnss_list); @@ -49,15 +50,12 @@ radv_proto_start: proto_start RADV radv_proto_item: proto_item + | proto_channel | INTERFACE radv_iface | PREFIX radv_prefix { add_tail(&RADV_CFG->pref_list, NODE this_radv_prefix); } | RDNSS { init_list(&radv_dns_list); } radv_rdnss { add_tail_list(&RADV_CFG->rdnss_list, &radv_dns_list); } | DNSSL { init_list(&radv_dns_list); } radv_dnssl { add_tail_list(&RADV_CFG->dnssl_list, &radv_dns_list); } - | TRIGGER prefix { - RADV_CFG->trigger_prefix = $2.addr; - RADV_CFG->trigger_pxlen = $2.len; - RADV_CFG->trigger_valid = 1; - } + | TRIGGER net_ip6 { RADV_CFG->trigger = $2; } ; radv_proto_opts: @@ -94,15 +92,15 @@ radv_iface_item: | MIN DELAY expr { RADV_IFACE->min_delay = $3; if ($3 <= 0) cf_error("Min delay must be positive"); } | MANAGED bool { RADV_IFACE->managed = $2; } | OTHER CONFIG bool { RADV_IFACE->other_config = $3; } - | LINK MTU expr { RADV_IFACE->link_mtu = $3; if ($3 < 0) cf_error("Link MTU must be 0 or positive"); } - | REACHABLE TIME expr { RADV_IFACE->reachable_time = $3; if (($3 < 0) || ($3 > 3600000)) cf_error("Reachable time must be in range 0-3600000"); } - | RETRANS TIMER expr { RADV_IFACE->retrans_timer = $3; if ($3 < 0) cf_error("Retrans timer must be 0 or positive"); } - | LINGER TIME expr { RADV_IFACE->linger_time = $3; if (($3 < 0) || ($3 > 3600)) cf_error("Linger time must be in range 0-3600"); } - | CURRENT HOP LIMIT expr { RADV_IFACE->current_hop_limit = $4; if (($4 < 0) || ($4 > 255)) cf_error("Current hop limit must be in range 0-255"); } + | LINK MTU expr { RADV_IFACE->link_mtu = $3; } + | REACHABLE TIME expr { RADV_IFACE->reachable_time = $3; if ($3 > 3600000) cf_error("Reachable time must be in range 0-3600000"); } + | RETRANS TIMER expr { RADV_IFACE->retrans_timer = $3; } + | LINGER TIME expr { RADV_IFACE->linger_time = $3; if ($3 > 3600) cf_error("Linger time must be in range 0-3600"); } + | CURRENT HOP LIMIT expr { RADV_IFACE->current_hop_limit = $4; if ($4 > 255) cf_error("Current hop limit must be in range 0-255"); } | DEFAULT LIFETIME expr radv_sensitive { RADV_IFACE->default_lifetime = $3; - if (($3 < 0) || ($3 > 9000)) cf_error("Default lifetime must be in range 0-9000"); - if ($4 != -1) RADV_IFACE->default_lifetime_sensitive = $4; + if ($3 > 9000) cf_error("Default lifetime must be in range 0-9000"); + if ($4 != (uint) -1) RADV_IFACE->default_lifetime_sensitive = $4; } | DEFAULT PREFERENCE radv_preference { RADV_IFACE->default_preference = $3; } | PREFIX radv_prefix { add_tail(&RADV_IFACE->pref_list, NODE this_radv_prefix); } @@ -129,7 +127,7 @@ radv_iface_finish: if ((ic->min_ra_int > 3) && (ic->min_ra_int > (ic->max_ra_int * 3 / 4))) - cf_error("Min RA interval must be at most 3/4 * Max RA interval %d %d", ic->min_ra_int, ic->max_ra_int); + cf_error("Min RA interval must be at most 3/4 * Max RA interval"); if ((ic->default_lifetime > 0) && (ic->default_lifetime < ic->max_ra_int)) cf_error("Default lifetime must be either 0 or at least Max RA interval"); @@ -150,11 +148,10 @@ radv_iface: radv_iface_start iface_patt_list_nopx radv_iface_opt_list radv_iface_finish; -radv_prefix_start: prefix +radv_prefix_start: net_ip6 { this_radv_prefix = 
cfg_allocz(sizeof(struct radv_prefix_config)); - RADV_PREFIX->prefix = $1.addr; - RADV_PREFIX->pxlen = $1.len; + RADV_PREFIX->prefix = *(net_addr_ip6 *) &($1); RADV_PREFIX->onlink = 1; RADV_PREFIX->autonomous = 1; @@ -168,13 +165,11 @@ radv_prefix_item: | AUTONOMOUS bool { RADV_PREFIX->autonomous = $2; } | VALID LIFETIME expr radv_sensitive { RADV_PREFIX->valid_lifetime = $3; - if ($3 < 0) cf_error("Valid lifetime must be 0 or positive"); - if ($4 != -1) RADV_PREFIX->valid_lifetime_sensitive = $4; + if ($4 != (uint) -1) RADV_PREFIX->valid_lifetime_sensitive = $4; } | PREFERRED LIFETIME expr radv_sensitive { RADV_PREFIX->preferred_lifetime = $3; - if ($3 < 0) cf_error("Preferred lifetime must be 0 or positive"); - if ($4 != -1) RADV_PREFIX->preferred_lifetime_sensitive = $4; + if ($4 != (uint) -1) RADV_PREFIX->preferred_lifetime_sensitive = $4; } ; diff --git a/proto/radv/packets.c b/proto/radv/packets.c index 19d71f97..e07296e1 100644 --- a/proto/radv/packets.c +++ b/proto/radv/packets.c @@ -38,7 +38,7 @@ struct radv_opt_prefix u32 valid_lifetime; u32 preferred_lifetime; u32 reserved; - ip_addr prefix; + ip6_addr prefix; }; #define OPT_PX_ONLINK 0x80 @@ -58,7 +58,7 @@ struct radv_opt_rdnss u8 length; u16 reserved; u32 lifetime; - ip_addr servers[]; + ip6_addr servers[]; }; struct radv_opt_dnssl @@ -79,7 +79,7 @@ radv_prepare_rdnss(struct radv_iface *ifa, list *rdnss_list, char **buf, char *b { struct radv_rdnss_config *rcf_base = rcf; struct radv_opt_rdnss *op = (void *) *buf; - int max_i = (bufend - *buf - sizeof(struct radv_opt_rdnss)) / sizeof(ip_addr); + int max_i = (bufend - *buf - sizeof(struct radv_opt_rdnss)) / sizeof(ip6_addr); int i = 0; if (max_i < 1) @@ -100,8 +100,7 @@ radv_prepare_rdnss(struct radv_iface *ifa, list *rdnss_list, char **buf, char *b if (i >= max_i) goto too_much; - op->servers[i] = rcf->server; - ipa_hton(op->servers[i]); + op->servers[i] = ip6_hton(rcf->server); i++; rcf = NODE_NEXT(rcf); @@ -206,10 +205,10 @@ radv_prepare_dnssl(struct radv_iface *ifa, list *dnssl_list, char **buf, char *b } static int -radv_prepare_prefix(struct radv_iface *ifa, struct radv_prefix *prefix, +radv_prepare_prefix(struct radv_iface *ifa, struct radv_prefix *px, char **buf, char *bufend) { - struct radv_prefix_config *pc = prefix->cf; + struct radv_prefix_config *pc = px->cf; if (*buf + sizeof(struct radv_opt_prefix) > bufend) { @@ -221,7 +220,7 @@ radv_prepare_prefix(struct radv_iface *ifa, struct radv_prefix *prefix, struct radv_opt_prefix *op = (void *) *buf; op->type = OPT_PREFIX; op->length = 4; - op->pxlen = prefix->len; + op->pxlen = px->prefix.pxlen; op->flags = (pc->onlink ? OPT_PX_ONLINK : 0) | (pc->autonomous ? OPT_PX_AUTONOMOUS : 0); op->valid_lifetime = (ifa->ra->active || !pc->valid_lifetime_sensitive) ? @@ -229,8 +228,7 @@ radv_prepare_prefix(struct radv_iface *ifa, struct radv_prefix *prefix, op->preferred_lifetime = (ifa->ra->active || !pc->preferred_lifetime_sensitive) ? 
htonl(pc->preferred_lifetime) : 0; op->reserved = 0; - op->prefix = prefix->prefix; - ipa_hton(op->prefix); + op->prefix = ip6_hton(px->prefix.prefix); *buf += sizeof(*op); return 0; @@ -334,7 +332,7 @@ radv_rx_hook(sock *sk, uint size) if (sk->lifindex != sk->iface->index) return 1; - if (ipa_equal(sk->faddr, ifa->addr->ip)) + if (ipa_equal(sk->faddr, sk->saddr)) return 1; if (size < 8) @@ -386,6 +384,7 @@ radv_sk_open(struct radv_iface *ifa) { sock *sk = sk_new(ifa->pool); sk->type = SK_IP; + sk->subtype = SK_IPV6; sk->dport = ICMPV6_PROTO; sk->saddr = ifa->addr->ip; diff --git a/proto/radv/radv.c b/proto/radv/radv.c index 227c8ef6..c96d7724 100644 --- a/proto/radv/radv.c +++ b/proto/radv/radv.c @@ -58,23 +58,24 @@ radv_timer(timer *tm) * This sets the timer, but we replace it just at the end of this function * (replacing a timer is fine). */ - if (ifa->prefix_expires && (ifa->prefix_expires <= now)) + if (ifa->prefix_expires && (ifa->prefix_expires <= current_time())) radv_iface_notify(ifa, RA_EV_GC); radv_send_ra(ifa, 0); /* Update timer */ - ifa->last = now; - unsigned after = ifa->cf->min_ra_int; - after += random() % (ifa->cf->max_ra_int - ifa->cf->min_ra_int + 1); + ifa->last = current_time(); + btime t = ifa->cf->min_ra_int S; + btime r = (ifa->cf->max_ra_int - ifa->cf->min_ra_int) S; + t += random() % (r + 1); if (ifa->initial) + { + t = MIN(t, MAX_INITIAL_RTR_ADVERT_INTERVAL); ifa->initial--; + } - if (ifa->initial) - after = MIN(after, MAX_INITIAL_RTR_ADVERT_INTERVAL); - - tm_start(ifa->timer, after); + tm_start(ifa->timer, t); } static struct radv_prefix_config default_prefix = { @@ -89,21 +90,18 @@ static struct radv_prefix_config dead_prefix = { /* Find a corresponding config for the given prefix */ static struct radv_prefix_config * -radv_prefix_match(struct radv_iface *ifa, struct ifa *a) +radv_prefix_match(struct radv_iface *ifa, net_addr_ip6 *px) { struct radv_proto *p = ifa->ra; struct radv_config *cf = (struct radv_config *) (p->p.cf); struct radv_prefix_config *pc; - if (a->scope <= SCOPE_LINK) - return NULL; - WALK_LIST(pc, ifa->cf->pref_list) - if ((a->pxlen >= pc->pxlen) && ipa_in_net(a->prefix, pc->prefix, pc->pxlen)) + if (net_in_net_ip6(px, &pc->prefix)) return pc; WALK_LIST(pc, cf->pref_list) - if ((a->pxlen >= pc->pxlen) && ipa_in_net(a->prefix, pc->prefix, pc->pxlen)) + if (net_in_net_ip6(px, &pc->prefix)) return pc; return &default_prefix; @@ -128,7 +126,12 @@ radv_prepare_prefixes(struct radv_iface *ifa) struct ifa *addr; WALK_LIST(addr, ifa->iface->addrs) { - struct radv_prefix_config *pc = radv_prefix_match(ifa, addr); + if ((addr->prefix.type != NET_IP6) || + (addr->scope <= SCOPE_LINK)) + continue; + + net_addr_ip6 *prefix = (void *) &addr->prefix; + struct radv_prefix_config *pc = radv_prefix_match(ifa, prefix); if (!pc || pc->skip) continue; @@ -136,7 +139,7 @@ radv_prepare_prefixes(struct radv_iface *ifa) /* Do we have it already? 
*/ struct radv_prefix *existing = NULL; WALK_LIST(pfx, ifa->prefixes) - if ((pfx->len == addr->pxlen) && ipa_equal(pfx->prefix, addr->prefix)) + if (net_equal_ip6(&pfx->prefix, prefix)) { existing = pfx; break; @@ -144,12 +147,11 @@ radv_prepare_prefixes(struct radv_iface *ifa) if (!existing) { - RADV_TRACE(D_EVENTS, "Adding new prefix %I/%d on %s", - addr->prefix, addr->pxlen, ifa->iface->name); + RADV_TRACE(D_EVENTS, "Adding new prefix %N on %s", + prefix, ifa->iface->name); existing = mb_allocz(ifa->pool, sizeof *existing); - existing->prefix = addr->prefix; - existing->len = addr->pxlen; + net_copy_ip6(&existing->prefix, prefix); add_tail(&ifa->prefixes, NODE existing); } @@ -167,15 +169,16 @@ radv_prepare_prefixes(struct radv_iface *ifa) * dropped just yet). If something is dead and rots there for long enough, * clean it up. */ - bird_clock_t expires = now + cf->linger_time; - bird_clock_t expires_min = 0; + btime now_ = current_time(); + btime expires = now_ + cf->linger_time S; + btime expires_min = 0; struct radv_prefix *next; WALK_LIST_DELSAFE(pfx, next, ifa->prefixes) { if (pfx->alive && !pfx->mark) { - RADV_TRACE(D_EVENTS, "Marking prefix %I/$d on %s as dead", - pfx->prefix, pfx->len, ifa->iface->name); + RADV_TRACE(D_EVENTS, "Marking prefix %N on %s as dead", + pfx->prefix, ifa->iface->name); pfx->alive = 0; pfx->expires = expires; @@ -184,10 +187,10 @@ radv_prepare_prefixes(struct radv_iface *ifa) if (!pfx->alive) { - if (pfx->expires <= now) + if (pfx->expires <= now_) { - RADV_TRACE(D_EVENTS, "Removing prefix %I/%d on %s", - pfx->prefix, pfx->len, ifa->iface->name); + RADV_TRACE(D_EVENTS, "Removing prefix %N on %s", + pfx->prefix, ifa->iface->name); rem_node(NODE pfx); mb_free(pfx); @@ -232,13 +235,8 @@ radv_iface_notify(struct radv_iface *ifa, int event) radv_prepare_prefixes(ifa); /* Update timer */ - unsigned delta = now - ifa->last; - unsigned after = 0; - - if (delta < ifa->cf->min_delay) - after = ifa->cf->min_delay - delta; - - tm_start(ifa->timer, after); + btime t = ifa->last + ifa->cf->min_delay S - current_time(); + tm_start(ifa->timer, t); } static void @@ -278,17 +276,6 @@ radv_iface_add(struct object_lock *lock) radv_iface_notify(ifa, RA_EV_INIT); } -static inline struct ifa * -find_lladdr(struct iface *iface) -{ - struct ifa *a; - WALK_LIST(a, iface->addrs) - if (a->scope == SCOPE_LINK) - return a; - - return NULL; -} - static void radv_iface_new(struct radv_proto *p, struct iface *iface, struct radv_iface_config *cf) { @@ -302,23 +289,12 @@ radv_iface_new(struct radv_proto *p, struct iface *iface, struct radv_iface_conf ifa->ra = p; ifa->cf = cf; ifa->iface = iface; + ifa->addr = iface->llv6; init_list(&ifa->prefixes); add_tail(&p->iface_list, NODE ifa); - ifa->addr = find_lladdr(iface); - if (!ifa->addr) - { - log(L_ERR "%s: Missing link-local address on interface %s", p->p.name, iface->name); - return; - } - - timer *tm = tm_new(pool); - tm->hook = radv_timer; - tm->data = ifa; - tm->randomize = 0; - tm->recurrent = 0; - ifa->timer = tm; + ifa->timer = tm_new_init(pool, radv_timer, ifa, 0, 0); struct object_lock *lock = olock_new(pool); lock->addr = IPA_NONE; @@ -354,8 +330,15 @@ radv_if_notify(struct proto *P, unsigned flags, struct iface *iface) if (flags & IF_CHANGE_UP) { - struct radv_iface_config *ic = (struct radv_iface_config *) - iface_patt_find(&cf->patt_list, iface, NULL); + struct radv_iface_config *ic = (void *) iface_patt_find(&cf->patt_list, iface, NULL); + + /* Ignore non-multicast ifaces */ + if (!(iface->flags & IF_MULTICAST)) + return; + + 
/* Ignore ifaces without link-local address */ + if (!iface->llv6) + return; if (ic) radv_iface_new(p, iface, ic); @@ -394,11 +377,16 @@ radv_ifa_notify(struct proto *P, unsigned flags UNUSED, struct ifa *a) radv_iface_notify(ifa, RA_EV_CHANGE); } -static inline int radv_net_match_trigger(struct radv_config *cf, net *n) +static inline int +radv_trigger_valid(struct radv_config *cf) +{ + return cf->trigger.type != 0; +} + +static inline int +radv_net_match_trigger(struct radv_config *cf, net *n) { - return cf->trigger_valid && - (n->n.pxlen == cf->trigger_pxlen) && - ipa_equal(n->n.prefix, cf->trigger_prefix); + return radv_trigger_valid(cf) && net_equal(n->n.addr, &cf->trigger); } int @@ -414,7 +402,7 @@ radv_import_control(struct proto *P, rte **new, ea_list **attrs UNUSED, struct l } static void -radv_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs UNUSED) +radv_rt_notify(struct proto *P, struct channel *ch UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs UNUSED) { struct radv_proto *p = (struct radv_proto *) P; struct radv_config *cf = (struct radv_config *) (P->cf); @@ -441,19 +429,30 @@ radv_check_active(struct radv_proto *p) { struct radv_config *cf = (struct radv_config *) (p->p.cf); - if (! cf->trigger_valid) + if (!radv_trigger_valid(cf)) return 1; - return rt_examine(p->p.table, cf->trigger_prefix, cf->trigger_pxlen, - &(p->p), p->p.cf->out_filter); + struct channel *c = p->p.main_channel; + return rt_examine(c->table, &cf->trigger, &p->p, c->out_filter); +} + +static void +radv_postconfig(struct proto_config *CF) +{ + // struct radv_config *cf = (void *) CF; + + /* Define default channel */ + if (EMPTY_LIST(CF->channels)) + channel_config_new(NULL, NET_IP6, CF); } static struct proto * -radv_init(struct proto_config *c) +radv_init(struct proto_config *CF) { - struct proto *P = proto_new(c, sizeof(struct radv_proto)); + struct proto *P = proto_new(CF); + + P->main_channel = proto_add_channel(P, proto_cf_main_channel(CF)); - P->accept_ra_types = RA_OPTIMAL; P->import_control = radv_import_control; P->rt_notify = radv_rt_notify; P->if_notify = radv_if_notify; @@ -469,7 +468,7 @@ radv_start(struct proto *P) struct radv_config *cf = (struct radv_config *) (P->cf); init_list(&(p->iface_list)); - p->active = !cf->trigger_valid; + p->active = !radv_trigger_valid(cf); return PS_UP; } @@ -494,11 +493,11 @@ radv_shutdown(struct proto *P) } static int -radv_reconfigure(struct proto *P, struct proto_config *c) +radv_reconfigure(struct proto *P, struct proto_config *CF) { struct radv_proto *p = (struct radv_proto *) P; // struct radv_config *old = (struct radv_config *) (p->cf); - struct radv_config *new = (struct radv_config *) c; + struct radv_config *new = (struct radv_config *) CF; /* * The question is why there is a reconfigure function for RAdv if @@ -508,12 +507,26 @@ radv_reconfigure(struct proto *P, struct proto_config *c) * causing nodes to temporary remove their default routes. 
*/ - P->cf = c; /* radv_check_active() requires proper P->cf */ + if (!proto_configure_channel(P, &P->main_channel, proto_cf_main_channel(CF))) + return 0; + + P->cf = CF; /* radv_check_active() requires proper P->cf */ p->active = radv_check_active(p); struct iface *iface; WALK_LIST(iface, iface_list) { + if (!(iface->flags & IF_UP)) + continue; + + /* Ignore non-multicast ifaces */ + if (!(iface->flags & IF_MULTICAST)) + continue; + + /* Ignore ifaces without link-local address */ + if (!iface->llv6) + continue; + struct radv_iface *ifa = radv_iface_find(p, iface); struct radv_iface_config *ic = (struct radv_iface_config *) iface_patt_find(&new->patt_list, iface, NULL); @@ -565,7 +578,10 @@ radv_get_status(struct proto *P, byte *buf) struct protocol proto_radv = { .name = "RAdv", .template = "radv%d", + .channel_mask = NB_IP6, + .proto_size = sizeof(struct radv_proto), .config_size = sizeof(struct radv_config), + .postconfig = radv_postconfig, .init = radv_init, .start = radv_start, .shutdown = radv_shutdown, diff --git a/proto/radv/radv.h b/proto/radv/radv.h index 60b9980f..4672e3b2 100644 --- a/proto/radv/radv.h +++ b/proto/radv/radv.h @@ -30,7 +30,7 @@ #define ICMPV6_RA 134 #define MAX_INITIAL_RTR_ADVERTISEMENTS 3 -#define MAX_INITIAL_RTR_ADVERT_INTERVAL 16 +#define MAX_INITIAL_RTR_ADVERT_INTERVAL (16 S_) #define DEFAULT_MAX_RA_INT 600 #define DEFAULT_MIN_DELAY 3 @@ -51,9 +51,7 @@ struct radv_config list rdnss_list; /* Global list of RDNSS configs (struct radv_rdnss_config) */ list dnssl_list; /* Global list of DNSSL configs (struct radv_dnssl_config) */ - ip_addr trigger_prefix; /* Prefix of a trigger route, if defined */ - u8 trigger_pxlen; /* Pxlen of a trigger route, if defined */ - u8 trigger_valid; /* Whether a trigger route is defined */ + net_addr trigger; /* Prefix of a trigger route, if defined */ }; struct radv_iface_config @@ -87,8 +85,7 @@ struct radv_iface_config struct radv_prefix_config { node n; - ip_addr prefix; - uint pxlen; + net_addr_ip6 prefix; u8 skip; /* Do not include this prefix to RA */ u8 onlink; /* Standard options from RFC 4861 */ @@ -104,7 +101,7 @@ struct radv_rdnss_config node n; u32 lifetime; /* Valid if lifetime_mult is 0 */ u16 lifetime_mult; /* Lifetime specified as multiple of max_ra_int */ - ip_addr server; /* IP address of recursive DNS server */ + ip6_addr server; /* IP address of recursive DNS server */ }; struct radv_dnssl_config @@ -128,12 +125,12 @@ struct radv_proto struct radv_prefix /* One prefix we advertise */ { node n; - ip_addr prefix; - u8 len; + net_addr_ip6 prefix; + u8 alive; /* Is the prefix alive? If not, we advertise it with 0 lifetime, so clients stop using it */ u8 mark; /* A temporary mark for processing */ - bird_clock_t expires; /* The time when we drop this prefix from + btime expires; /* The time when we drop this prefix from advertising. It is valid only if !alive. 
*/ struct radv_prefix_config *cf; /* The config tied to this prefix */ }; @@ -147,13 +144,13 @@ struct radv_iface struct ifa *addr; /* Link-local address of iface */ struct pool *pool; /* A pool for interface-specific things */ list prefixes; /* The prefixes we advertise (struct radv_prefix) */ - bird_clock_t prefix_expires; /* When the soonest prefix expires (0 = none dead) */ + btime prefix_expires; /* When the soonest prefix expires (0 = none dead) */ timer *timer; struct object_lock *lock; sock *sk; - bird_clock_t last; /* Time of last sending of RA */ + btime last; /* Time of last sending of RA */ u16 plen; /* Length of prepared RA in tbuf, or 0 if not valid */ byte initial; /* How many RAs are still to be sent as initial */ }; diff --git a/proto/rip/Makefile b/proto/rip/Makefile index d2d3c987..7feabcd8 100644 --- a/proto/rip/Makefile +++ b/proto/rip/Makefile @@ -1,5 +1,6 @@ -source=rip.c packets.c -root-rel=../../ -dir-name=proto/rip +src := packets.c rip.c +obj := $(src-o-files) +$(all-daemon) +$(cf-local) -include ../../Rules +tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file diff --git a/proto/rip/config.Y b/proto/rip/config.Y index 4ec45c7a..e3bc4ae3 100644 --- a/proto/rip/config.Y +++ b/proto/rip/config.Y @@ -32,34 +32,40 @@ rip_check_auth(void) CF_DECLS -CF_KEYWORDS(RIP, ECMP, LIMIT, WEIGHT, INFINITY, METRIC, UPDATE, TIMEOUT, +CF_KEYWORDS(RIP, NG, ECMP, LIMIT, WEIGHT, INFINITY, METRIC, UPDATE, TIMEOUT, GARBAGE, PORT, ADDRESS, MODE, BROADCAST, MULTICAST, PASSIVE, VERSION, SPLIT, HORIZON, POISON, REVERSE, CHECK, ZERO, TIME, BFD, AUTHENTICATION, NONE, PLAINTEXT, CRYPTOGRAPHIC, MD5, TTL, SECURITY, RX, TX, BUFFER, LENGTH, PRIORITY, ONLY, LINK, RIP_METRIC, RIP_TAG) -%type <i> rip_auth +%type <i> rip_variant rip_auth CF_GRAMMAR CF_ADDTO(proto, rip_proto) -rip_proto_start: proto_start RIP +rip_variant: + RIP { $$ = 1; } + | RIP NG { $$ = 0; } + ; + +rip_proto_start: proto_start rip_variant { this_proto = proto_config_new(&proto_rip, $1); - init_list(&RIP_CFG->patt_list); + this_proto->net_type = $2 ? NET_IP4 : NET_IP6; - RIP_CFG->rip2 = RIP_IS_V2; + init_list(&RIP_CFG->patt_list); + RIP_CFG->rip2 = $2; RIP_CFG->infinity = RIP_DEFAULT_INFINITY; - - RIP_CFG->min_timeout_time = 60; - RIP_CFG->max_garbage_time = 60; + RIP_CFG->min_timeout_time = 60 S_; + RIP_CFG->max_garbage_time = 60 S_; }; rip_proto_item: proto_item + | proto_channel | ECMP bool { RIP_CFG->ecmp = $2 ? RIP_DEFAULT_ECMP_LIMIT : 0; } - | ECMP bool LIMIT expr { RIP_CFG->ecmp = $2 ? $4 : 0; if ($4 < 0) cf_error("ECMP limit cannot be negative"); } + | ECMP bool LIMIT expr { RIP_CFG->ecmp = $2 ? $4 : 0; } | INFINITY expr { RIP_CFG->infinity = $2; } | INTERFACE rip_iface ; @@ -131,7 +137,7 @@ rip_iface_item: | MODE MULTICAST { RIP_IFACE->mode = RIP_IM_MULTICAST; } | MODE BROADCAST { RIP_IFACE->mode = RIP_IM_BROADCAST; if (rip_cfg_is_ng()) cf_error("Broadcast not supported in RIPng"); } | PASSIVE bool { RIP_IFACE->passive = $2; } - | ADDRESS ipa { RIP_IFACE->address = $2; } + | ADDRESS ipa { RIP_IFACE->address = $2; if (ipa_is_ip4($2) != rip_cfg_is_v2()) cf_error("IP address version mismatch"); } | PORT expr { RIP_IFACE->port = $2; if (($2<1) || ($2>65535)) cf_error("Invalid port number"); } | VERSION expr { RIP_IFACE->version = $2; if (rip_cfg_is_ng()) cf_error("Version not supported in RIPng"); @@ -141,9 +147,9 @@ rip_iface_item: | SPLIT HORIZON bool { RIP_IFACE->split_horizon = $3; } | POISON REVERSE bool { RIP_IFACE->poison_reverse = $3; } | CHECK ZERO bool { RIP_IFACE->check_zero = $3; } - | UPDATE TIME expr { RIP_IFACE->update_time = $3; if ($3<=0) cf_error("Update time must be positive"); } - | TIMEOUT TIME expr { RIP_IFACE->timeout_time = $3; if ($3<=0) cf_error("Timeout time must be positive"); } - | GARBAGE TIME expr { RIP_IFACE->garbage_time = $3; if ($3<=0) cf_error("Garbage time must be positive"); } + | UPDATE TIME expr { RIP_IFACE->update_time = $3 S_; if ($3<=0) cf_error("Update time must be positive"); } + | TIMEOUT TIME expr { RIP_IFACE->timeout_time = $3 S_; if ($3<=0) cf_error("Timeout time must be positive"); } + | GARBAGE TIME expr { RIP_IFACE->garbage_time = $3 S_; if ($3<=0) cf_error("Garbage time must be positive"); } | ECMP WEIGHT expr { RIP_IFACE->ecmp_weight = $3 - 1; if (($3<1) || ($3>256)) cf_error("ECMP weight must be in range 1-256"); } | RX BUFFER expr { RIP_IFACE->rx_buffer = $3; if (($3<256) || ($3>65535)) cf_error("RX length must be in range 256-65535"); } | TX LENGTH expr { RIP_IFACE->tx_length = $3; if (($3<256) || ($3>65535)) cf_error("TX length must be in range 256-65535"); } diff --git a/proto/rip/packets.c b/proto/rip/packets.c index 
468927e6..4925ca36 100644 --- a/proto/rip/packets.c +++ b/proto/rip/packets.c @@ -9,6 +9,8 @@ * Can be freely distributed and used under the terms of the GNU GPL. */ +#undef LOCAL_DEBUG + #include "rip.h" #include "lib/mac.h" @@ -76,8 +78,7 @@ struct rip_auth_tail /* Internal representation of RTE block data */ struct rip_block { - ip_addr prefix; - int pxlen; + net_addr net; u32 metric; u16 tag; u16 no_af; @@ -106,30 +107,30 @@ static inline uint rip_pkt_hdrlen(struct rip_iface *ifa) { return sizeof(struct rip_packet) + (ifa->cf->auth_type ? RIP_BLOCK_LENGTH : 0); } static inline void -rip_put_block(struct rip_proto *p UNUSED4 UNUSED6, byte *pos, struct rip_block *rte) +rip_put_block(struct rip_proto *p, byte *pos, struct rip_block *rte) { if (rip_is_v2(p)) { struct rip_block_v2 *block = (void *) pos; block->family = rte->no_af ? 0 : htons(RIP_AF_IPV4); block->tag = htons(rte->tag); - block->network = ip4_hton(ipa_to_ip4(rte->prefix)); - block->netmask = ip4_hton(ip4_mkmask(rte->pxlen)); + block->network = ip4_hton(net4_prefix(&rte->net)); + block->netmask = ip4_hton(ip4_mkmask(net4_pxlen(&rte->net))); block->next_hop = ip4_hton(ipa_to_ip4(rte->next_hop)); block->metric = htonl(rte->metric); } else /* RIPng */ { struct rip_block_ng *block = (void *) pos; - block->prefix = ip6_hton(ipa_to_ip6(rte->prefix)); + block->prefix = ip6_hton(net6_prefix(&rte->net)); block->tag = htons(rte->tag); - block->pxlen = rte->pxlen; + block->pxlen = net6_pxlen(&rte->net); block->metric = rte->metric; } } static inline void -rip_put_next_hop(struct rip_proto *p UNUSED, byte *pos, struct rip_block *rte UNUSED4) +rip_put_next_hop(struct rip_proto *p UNUSED, byte *pos, struct rip_block *rte) { struct rip_block_ng *block = (void *) pos; block->prefix = ip6_hton(ipa_to_ip6(rte->next_hop)); @@ -139,7 +140,7 @@ rip_put_next_hop(struct rip_proto *p UNUSED, byte *pos, struct rip_block *rte UN } static inline int -rip_get_block(struct rip_proto *p UNUSED4 UNUSED6, byte *pos, struct rip_block *rte) +rip_get_block(struct rip_proto *p, byte *pos, struct rip_block *rte) { if (rip_is_v2(p)) { @@ -149,8 +150,8 @@ rip_get_block(struct rip_proto *p UNUSED4 UNUSED6, byte *pos, struct rip_block * if (block->family != (rte->no_af ? 0 : htons(RIP_AF_IPV4))) return 0; - rte->prefix = ipa_from_ip4(ip4_ntoh(block->network)); - rte->pxlen = ip4_masklen(ip4_ntoh(block->netmask)); + uint pxlen = ip4_masklen(ip4_ntoh(block->netmask)); + net_fill_ip4(&rte->net, ip4_ntoh(block->network), pxlen); rte->metric = ntohl(block->metric); rte->tag = ntohs(block->tag); rte->next_hop = ipa_from_ip4(ip4_ntoh(block->next_hop)); @@ -169,8 +170,8 @@ rip_get_block(struct rip_proto *p UNUSED4 UNUSED6, byte *pos, struct rip_block * return 0; } - rte->prefix = ipa_from_ip6(ip6_ntoh(block->prefix)); - rte->pxlen = block->pxlen; + uint pxlen = (block->pxlen <= IP6_MAX_PREFIX_LENGTH) ? block->pxlen : 255; + net_fill_ip6(&rte->net, ip6_ntoh(block->prefix), pxlen); rte->metric = block->metric; rte->tag = ntohs(block->tag); /* rte->next_hop is deliberately kept unmodified */; @@ -188,7 +189,10 @@ rip_update_csn(struct rip_proto *p UNUSED, struct rip_iface *ifa) * have the same CSN. We are using real time, but enforcing monotonicity. */ if (ifa->cf->auth_type == RIP_AUTH_CRYPTO) - ifa->csn = (ifa->csn < (u32) now_real) ? (u32) now_real : ifa->csn + 1; + { + u32 now_real = (u32) (current_real_time() TO_S); + ifa->csn = (ifa->csn < now_real) ? 
now_real : ifa->csn + 1; + } } static void @@ -406,8 +410,9 @@ rip_receive_request(struct rip_proto *p, struct rip_iface *ifa, struct rip_packe if (!rip_get_block(p, pos, &b)) return; - /* Special case - zero prefix, infinity metric */ - if (ipa_nonzero(b.prefix) || b.pxlen || (b.metric != p->infinity)) + /* Special case - infinity metric, for RIPng also zero prefix */ + if ((b.metric != p->infinity) || + (rip_is_ng(p) && !net_zero_ip6((net_addr_ip6 *) &b.net))) return; /* We do nothing if TX is already active */ @@ -432,6 +437,7 @@ rip_send_response(struct rip_proto *p, struct rip_iface *ifa) byte *max = rip_tx_buffer(ifa) + ifa->tx_plen - (rip_is_v2(p) ? RIP_BLOCK_LENGTH : 2*RIP_BLOCK_LENGTH); ip_addr last_next_hop = IPA_NONE; + btime now_ = current_time(); int send = 0; struct rip_packet *pkt = (void *) pos; @@ -440,17 +446,15 @@ rip_send_response(struct rip_proto *p, struct rip_iface *ifa) pkt->unused = 0; pos += rip_pkt_hdrlen(ifa); - FIB_ITERATE_START(&p->rtable, &ifa->tx_fit, z) + FIB_ITERATE_START(&p->rtable, &ifa->tx_fit, struct rip_entry, en) { - struct rip_entry *en = (struct rip_entry *) z; - /* Dummy entries */ if (!en->valid) goto next_entry; /* Stale entries that should be removed */ if ((en->valid == RIP_ENTRY_STALE) && - ((en->changed + ifa->cf->garbage_time) <= now)) + ((en->changed + ifa->cf->garbage_time) <= now_)) goto next_entry; /* Triggered updates */ @@ -460,28 +464,28 @@ rip_send_response(struct rip_proto *p, struct rip_iface *ifa) /* Not enough space for current entry */ if (pos > max) { - FIB_ITERATE_PUT(&ifa->tx_fit, z); + FIB_ITERATE_PUT(&ifa->tx_fit); goto break_loop; } struct rip_block rte = { - .prefix = en->n.prefix, - .pxlen = en->n.pxlen, .metric = en->metric, .tag = en->tag }; + net_copy(&rte.net, en->n.addr); + if (en->iface == ifa->iface) rte.next_hop = en->next_hop; if (rip_is_v2(p) && (ifa->cf->version == RIP_V1)) { /* Skipping subnets (i.e. not hosts, classful networks or default route) */ - if (ip4_masklen(ip4_class_mask(ipa_to_ip4(en->n.prefix))) != en->n.pxlen) + if (ip4_masklen(ip4_class_mask(net4_prefix(&rte.net))) != rte.net.pxlen) goto next_entry; rte.tag = 0; - rte.pxlen = 0; + rte.net.pxlen = 0; rte.next_hop = IPA_NONE; } @@ -497,7 +501,7 @@ rip_send_response(struct rip_proto *p, struct rip_iface *ifa) goto next_entry; } - // TRACE(D_PACKETS, " %I/%d -> %I metric %d", rte.prefix, rte.pxlen, rte.next_hop, rte.metric); + // TRACE(D_PACKETS, " %N -> %I metric %d", &rte.net, rte.next_hop, rte.metric); /* RIPng next hop entry */ if (rip_is_ng(p) && !ipa_equal(rte.next_hop, last_next_hop)) @@ -513,7 +517,7 @@ rip_send_response(struct rip_proto *p, struct rip_iface *ifa) next_entry: ; } - FIB_ITERATE_END(z); + FIB_ITERATE_END; ifa->tx_active = 0; /* Do not send empty packet */ @@ -540,9 +544,9 @@ break_loop: * activating the new one. 
*/ void -rip_send_table(struct rip_proto *p, struct rip_iface *ifa, ip_addr addr, bird_clock_t changed) +rip_send_table(struct rip_proto *p, struct rip_iface *ifa, ip_addr addr, btime changed) { - DBG("RIP: Opening TX session to %I on %s\n", dst, ifa->iface->name); + DBG("RIP: Opening TX session to %I on %s\n", addr, ifa->iface->name); rip_reset_tx_session(p, ifa); @@ -591,6 +595,7 @@ rip_receive_response(struct rip_proto *p, struct rip_iface *ifa, struct rip_pack byte *pos = (byte *) pkt + sizeof(struct rip_packet); byte *end = (byte *) pkt + plen; + btime now_ = current_time(); for (; pos < end; pos += RIP_BLOCK_LENGTH) { @@ -598,23 +603,25 @@ rip_receive_response(struct rip_proto *p, struct rip_iface *ifa, struct rip_pack if (!rip_get_block(p, pos, &rte)) continue; - int c = ipa_classify_net(rte.prefix); - if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK)) - SKIP("invalid prefix"); - if (rip_is_v2(p) && (pkt->version == RIP_V1)) { - if (ifa->cf->check_zero && (rte.tag || rte.pxlen || ipa_nonzero(rte.next_hop))) + if (ifa->cf->check_zero && (rte.tag || rte.net.pxlen || ipa_nonzero(rte.next_hop))) SKIP("RIPv1 reserved field is nonzero"); rte.tag = 0; - rte.pxlen = ip4_masklen(ip4_class_mask(ipa_to_ip4(rte.prefix))); + rte.net.pxlen = ip4_masklen(ip4_class_mask(net4_prefix(&rte.net))); rte.next_hop = IPA_NONE; } - if ((rte.pxlen < 0) || (rte.pxlen > MAX_PREFIX_LENGTH)) + if (rte.net.pxlen == 255) SKIP("invalid prefix length"); + net_normalize(&rte.net); + + int c = net_classify(&rte.net); + if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK)) + SKIP("invalid prefix"); + if (rte.metric > p->infinity) SKIP("invalid metric"); @@ -625,7 +632,7 @@ rip_receive_response(struct rip_proto *p, struct rip_iface *ifa, struct rip_pack rte.next_hop = IPA_NONE; } - // TRACE(D_PACKETS, " %I/%d -> %I metric %d", rte.prefix, rte.pxlen, rte.next_hop, rte.metric); + // TRACE(D_PACKETS, " %N -> %I metric %d", &rte.net.n, rte.next_hop, rte.metric); rte.metric += ifa->cf->metric; @@ -636,19 +643,19 @@ rip_receive_response(struct rip_proto *p, struct rip_iface *ifa, struct rip_pack .next_hop = ipa_nonzero(rte.next_hop) ? rte.next_hop : from->nbr->addr, .metric = rte.metric, .tag = rte.tag, - .expires = now + ifa->cf->timeout_time + .expires = now_ + ifa->cf->timeout_time }; - rip_update_rte(p, &rte.prefix, rte.pxlen, &new); + rip_update_rte(p, &rte.net, &new); } else - rip_withdraw_rte(p, &rte.prefix, rte.pxlen, from); + rip_withdraw_rte(p, &rte.net, from); continue; skip: - LOG_RTE("Ignoring route %I/%d received from %I - %s", - rte.prefix, rte.pxlen, from->nbr->addr, err_dsc); + LOG_RTE("Ignoring route %N received from %I - %s", + &rte.net, from->nbr->addr, err_dsc); } } @@ -667,8 +674,7 @@ rip_rx_hook(sock *sk, uint len) sk->iface->name, sk->faddr, sk->laddr); /* Silently ignore my own packets */ - /* FIXME: Better local address check */ - if (ipa_equal(ifa->iface->addr->ip, sk->faddr)) + if (ipa_equal(sk->faddr, sk->saddr)) return 1; if (rip_is_ng(p) && !ipa_is_link_local(sk->faddr)) @@ -704,7 +710,7 @@ rip_rx_hook(sock *sk, uint len) if ((plen - sizeof(struct rip_packet)) % RIP_BLOCK_LENGTH) DROP("invalid length", plen); - n->last_seen = now; + n->last_seen = current_time(); rip_update_bfd(p, n); switch (pkt->command) @@ -736,17 +742,11 @@ rip_open_socket(struct rip_iface *ifa) sock *sk = sk_new(p->p.pool); sk->type = SK_UDP; + sk->subtype = rip_is_v2(p) ? 
SK_IPV4 : SK_IPV6; sk->sport = ifa->cf->port; sk->dport = ifa->cf->port; sk->iface = ifa->iface; - - /* - * For RIPv2, we explicitly choose a primary address, mainly to ensure that - * RIP and BFD uses the same one. For RIPng, we left it to kernel, which - * should choose some link-local address based on the same scope rule. - */ - if (rip_is_v2(p)) - sk->saddr = ifa->iface->addr->ip; + sk->saddr = rip_is_v2(p) ? ifa->iface->addr4->ip : ifa->iface->llv6->ip; sk->rx_hook = rip_rx_hook; sk->tx_hook = rip_tx_hook; diff --git a/proto/rip/rip.c b/proto/rip/rip.c index 7b380097..a3eeaf17 100644 --- a/proto/rip/rip.c +++ b/proto/rip/rip.c @@ -92,15 +92,6 @@ static void rip_trigger_update(struct rip_proto *p); * RIP routes */ -static void -rip_init_entry(struct fib_node *fn) -{ - // struct rip_entry *en = (void) *fn; - - const uint offset = OFFSETOF(struct rip_entry, routes); - memset((byte *)fn + offset, 0, sizeof(struct rip_entry) - offset); -} - static struct rip_rte * rip_add_rte(struct rip_proto *p, struct rip_rte **rp, struct rip_rte *src) { @@ -152,27 +143,20 @@ rip_announce_rte(struct rip_proto *p, struct rip_entry *en) if (rt) { /* Update */ - net *n = net_get(p->p.table, en->n.prefix, en->n.pxlen); - rta a0 = { .src = p->p.main_source, .source = RTS_RIP, .scope = SCOPE_UNIVERSE, - .cast = RTC_UNICAST + .dest = RTD_UNICAST, }; u8 rt_metric = rt->metric; u16 rt_tag = rt->tag; - struct rip_rte *rt2 = rt->next; - /* Find second valid rte */ - while (rt2 && !rip_valid_rte(rt2)) - rt2 = rt2->next; - - if (p->ecmp && rt2) + if (p->ecmp) { /* ECMP route */ - struct mpnh *nhs = NULL; + struct nexthop *nhs = NULL; int num = 0; for (rt = en->routes; rt && (num < p->ecmp); rt = rt->next) @@ -180,54 +164,51 @@ rip_announce_rte(struct rip_proto *p, struct rip_entry *en) if (!rip_valid_rte(rt)) continue; - struct mpnh *nh = alloca(sizeof(struct mpnh)); + struct nexthop *nh = allocz(sizeof(struct nexthop)); + nh->gw = rt->next_hop; nh->iface = rt->from->nbr->iface; nh->weight = rt->from->ifa->cf->ecmp_weight; - mpnh_insert(&nhs, nh); + + nexthop_insert(&nhs, nh); num++; if (rt->tag != rt_tag) rt_tag = 0; } - a0.dest = RTD_MULTIPATH; - a0.nexthops = nhs; + a0.nh = *nhs; } else { /* Unipath route */ - a0.dest = RTD_ROUTER; - a0.gw = rt->next_hop; - a0.iface = rt->from->nbr->iface; a0.from = rt->from->nbr->addr; + a0.nh.gw = rt->next_hop; + a0.nh.iface = rt->from->nbr->iface; } rta *a = rta_lookup(&a0); rte *e = rte_get_temp(a); - e->u.rip.from = a0.iface; + e->u.rip.from = a0.nh.iface; e->u.rip.metric = rt_metric; e->u.rip.tag = rt_tag; - e->net = n; e->pflags = 0; - rte_update(&p->p, n, e); + rte_update(&p->p, en->n.addr, e); } else { /* Withdraw */ - net *n = net_find(p->p.table, en->n.prefix, en->n.pxlen); - rte_update(&p->p, n, NULL); + rte_update(&p->p, en->n.addr, NULL); } } /** * rip_update_rte - enter a route update to RIP routing table * @p: RIP instance - * @prefix: network prefix - * @pxlen: network prefix length + * @addr: network address * @new: a &rip_rte representing the new route * * The function is called by the RIP packet processing code whenever it receives @@ -237,9 +218,9 @@ rip_announce_rte(struct rip_proto *p, struct rip_entry *en) * rip_withdraw_rte() should be called instead of rip_update_rte(). 
*/ void -rip_update_rte(struct rip_proto *p, ip_addr *prefix, int pxlen, struct rip_rte *new) +rip_update_rte(struct rip_proto *p, net_addr *n, struct rip_rte *new) { - struct rip_entry *en = fib_get(&p->rtable, prefix, pxlen); + struct rip_entry *en = fib_get(&p->rtable, n); struct rip_rte *rt, **rp; int changed = 0; @@ -279,8 +260,7 @@ rip_update_rte(struct rip_proto *p, ip_addr *prefix, int pxlen, struct rip_rte * /** * rip_withdraw_rte - enter a route withdraw to RIP routing table * @p: RIP instance - * @prefix: network prefix - * @pxlen: network prefix length + * @addr: network address * @from: a &rip_neighbor propagating the withdraw * * The function is called by the RIP packet processing code whenever it receives @@ -288,9 +268,9 @@ rip_update_rte(struct rip_proto *p, ip_addr *prefix, int pxlen, struct rip_rte * * removed. Eventually, the change is also propagated by rip_announce_rte(). */ void -rip_withdraw_rte(struct rip_proto *p, ip_addr *prefix, int pxlen, struct rip_neighbor *from) +rip_withdraw_rte(struct rip_proto *p, net_addr *n, struct rip_neighbor *from) { - struct rip_entry *en = fib_find(&p->rtable, prefix, pxlen); + struct rip_entry *en = fib_find(&p->rtable, n); struct rip_rte *rt, **rp; if (!en) @@ -317,7 +297,7 @@ rip_withdraw_rte(struct rip_proto *p, ip_addr *prefix, int pxlen, struct rip_nei * it into our data structures. */ static void -rip_rt_notify(struct proto *P, struct rtable *table UNUSED, struct network *net, struct rte *new, +rip_rt_notify(struct proto *P, struct channel *ch UNUSED, struct network *net, struct rte *new, struct rte *old UNUSED, struct ea_list *attrs) { struct rip_proto *p = (struct rip_proto *) P; @@ -332,15 +312,15 @@ rip_rt_notify(struct proto *P, struct rtable *table UNUSED, struct network *net, if (rt_metric > p->infinity) { - log(L_WARN "%s: Invalid rip_metric value %u for route %I/%d", - p->p.name, rt_metric, net->n.prefix, net->n.pxlen); + log(L_WARN "%s: Invalid rip_metric value %u for route %N", + p->p.name, rt_metric, net->n.addr); rt_metric = p->infinity; } if (rt_tag > 0xffff) { - log(L_WARN "%s: Invalid rip_tag value %u for route %I/%d", - p->p.name, rt_tag, net->n.prefix, net->n.pxlen); + log(L_WARN "%s: Invalid rip_tag value %u for route %N", + p->p.name, rt_tag, net->n.addr); rt_metric = p->infinity; rt_tag = 0; } @@ -352,7 +332,7 @@ rip_rt_notify(struct proto *P, struct rtable *table UNUSED, struct network *net, * collection. */ - en = fib_get(&p->rtable, &net->n.prefix, net->n.pxlen); + en = fib_get(&p->rtable, net->n.addr); old_metric = en->valid ? en->metric : -1; @@ -360,13 +340,13 @@ rip_rt_notify(struct proto *P, struct rtable *table UNUSED, struct network *net, en->metric = rt_metric; en->tag = rt_tag; en->from = (new->attrs->src->proto == P) ? 
new->u.rip.from : NULL; - en->iface = new->attrs->iface; - en->next_hop = new->attrs->gw; + en->iface = new->attrs->nh.iface; + en->next_hop = new->attrs->nh.gw; } else { /* Withdraw */ - en = fib_find(&p->rtable, &net->n.prefix, net->n.pxlen); + en = fib_find(&p->rtable, net->n.addr); if (!en || en->valid != RIP_ENTRY_VALID) return; @@ -384,7 +364,7 @@ rip_rt_notify(struct proto *P, struct rtable *table UNUSED, struct network *net, /* Activate triggered updates */ if (en->metric != old_metric) { - en->changed = now; + en->changed = current_time(); rip_trigger_update(p); } } @@ -526,10 +506,10 @@ rip_iface_start(struct rip_iface *ifa) TRACE(D_EVENTS, "Starting interface %s", ifa->iface->name); - ifa->next_regular = now + (random() % ifa->cf->update_time) + 1; - ifa->next_triggered = now; /* Available immediately */ - ifa->want_triggered = 1; /* All routes in triggered update */ - tm_start(ifa->timer, 1); /* Or 100 ms */ + ifa->next_regular = current_time() + (random() % ifa->cf->update_time) + 100 MS; + ifa->next_triggered = current_time(); /* Available immediately */ + ifa->want_triggered = 1; /* All routes in triggered update */ + tm_start(ifa->timer, 100 MS); ifa->up = 1; if (!ifa->cf->passive) @@ -650,13 +630,19 @@ rip_add_iface(struct rip_proto *p, struct iface *iface, struct rip_iface_config else if (ic->mode == RIP_IM_MULTICAST) ifa->addr = rip_is_v2(p) ? IP4_RIP_ROUTERS : IP6_RIP_ROUTERS; else /* Broadcast */ - ifa->addr = iface->addr->brd; + ifa->addr = iface->addr4->brd; + /* + * The above is just a workaround for BSD as it can't send broadcasts + * to 255.255.255.255. BSD systems need the network broadcast address instead. + * + * TODO: move this to sysdep code + */ init_list(&ifa->neigh_list); add_tail(&p->iface_list, NODE ifa); - ifa->timer = tm_new_set(p->p.pool, rip_iface_timer, ifa, 0, 0); + ifa->timer = tm_new_init(p->p.pool, rip_iface_timer, ifa, 0, 0); struct object_lock *lock = olock_new(p->p.pool); lock->type = OBJLOCK_UDP; @@ -704,8 +690,8 @@ rip_reconfigure_iface(struct rip_proto *p, struct rip_iface *ifa, struct rip_ifa rip_iface_update_buffers(ifa); - if (ifa->next_regular > (now + new->update_time)) - ifa->next_regular = now + (random() % new->update_time) + 1; + if (ifa->next_regular > (current_time() + new->update_time)) + ifa->next_regular = current_time() + (random() % new->update_time) + 100 MS; if (new->check_link != old->check_link) rip_iface_update_state(ifa); @@ -726,7 +712,11 @@ rip_reconfigure_ifaces(struct rip_proto *p, struct rip_config *cf) WALK_LIST(iface, iface_list) { - if (! (iface->flags & IF_UP)) + if (!(iface->flags & IF_UP)) + continue; + + /* Ignore ifaces without appropriate address */ + if (rip_is_v2(p) ? !iface->addr4 : !iface->llv6) continue; struct rip_iface *ifa = rip_find_iface(p, iface); @@ -764,6 +754,10 @@ rip_if_notify(struct proto *P, unsigned flags, struct iface *iface) { struct rip_iface_config *ic = (void *) iface_patt_find(&cf->patt_list, iface, NULL); + /* Ignore ifaces without appropriate address */ + if (rip_is_v2(p) ? 
!iface->addr4 : !iface->llv6) + return; + if (ic) rip_add_iface(p, iface, ic); @@ -822,24 +816,24 @@ rip_timer(timer *t) struct rip_iface *ifa; struct rip_neighbor *n, *nn; struct fib_iterator fit; - bird_clock_t next = now + MIN(cf->min_timeout_time, cf->max_garbage_time); - bird_clock_t expires = 0; + btime now_ = current_time(); + btime next = now_ + MIN(cf->min_timeout_time, cf->max_garbage_time); + btime expires = 0; TRACE(D_EVENTS, "Main timer fired"); FIB_ITERATE_INIT(&fit, &p->rtable); loop: - FIB_ITERATE_START(&p->rtable, &fit, node) + FIB_ITERATE_START(&p->rtable, &fit, struct rip_entry, en) { - struct rip_entry *en = (struct rip_entry *) node; struct rip_rte *rt, **rp; int changed = 0; /* Checking received routes for timeout and for dead neighbors */ for (rp = &en->routes; rt = *rp; /* rp = &rt->next */) { - if (!rip_valid_rte(rt) || (rt->expires <= now)) + if (!rip_valid_rte(rt) || (rt->expires <= now_)) { rip_remove_rte(p, rp); changed = 1; @@ -859,7 +853,7 @@ rip_timer(timer *t) * rip_rt_notify() -> p->rtable change, invalidating hidden variables. */ - FIB_ITERATE_PUT_NEXT(&fit, &p->rtable, node); + FIB_ITERATE_PUT_NEXT(&fit, &p->rtable); rip_announce_rte(p, en); goto loop; } @@ -869,9 +863,9 @@ rip_timer(timer *t) { expires = en->changed + cf->max_garbage_time; - if (expires <= now) + if (expires <= now_) { - // TRACE(D_EVENTS, "entry is too old: %I/%d", en->n.prefix, en->n.pxlen); + // TRACE(D_EVENTS, "entry is too old: %N", en->n.addr); en->valid = 0; } else @@ -881,12 +875,12 @@ rip_timer(timer *t) /* Remove empty nodes */ if (!en->valid && !en->routes) { - FIB_ITERATE_PUT(&fit, node); - fib_delete(&p->rtable, node); + FIB_ITERATE_PUT(&fit); + fib_delete(&p->rtable, en); goto loop; } } - FIB_ITERATE_END(node); + FIB_ITERATE_END; p->rt_reload = 0; @@ -897,20 +891,20 @@ rip_timer(timer *t) { expires = n->last_seen + n->ifa->cf->timeout_time; - if (expires <= now) + if (expires <= now_) rip_remove_neighbor(p, n); else next = MIN(next, expires); } - tm_start(p->timer, MAX(next - now, 1)); + tm_start(p->timer, MAX(next - now_, 100 MS)); } static inline void rip_kick_timer(struct rip_proto *p) { - if (p->timer->expires > (now + 1)) - tm_start(p->timer, 1); /* Or 100 ms */ + if (p->timer->expires > (current_time() + 100 MS)) + tm_start(p->timer, 100 MS); } /** @@ -928,7 +922,8 @@ rip_iface_timer(timer *t) { struct rip_iface *ifa = t->data; struct rip_proto *p = ifa->rip; - bird_clock_t period = ifa->cf->update_time; + btime now_ = current_time(); + btime period = ifa->cf->update_time; if (ifa->cf->passive) return; @@ -937,40 +932,40 @@ rip_iface_timer(timer *t) if (ifa->tx_active) { - if (now < (ifa->next_regular + period)) - { tm_start(ifa->timer, 1); return; } + if (now_ < (ifa->next_regular + period)) + { tm_start(ifa->timer, 100 MS); return; } /* We are too late, reset is done by rip_send_table() */ log(L_WARN "%s: Too slow update on %s, resetting", p->p.name, ifa->iface->name); } - if (now >= ifa->next_regular) + if (now_ >= ifa->next_regular) { /* Send regular update, set timer for next period (or following one if necessay) */ TRACE(D_EVENTS, "Sending regular updates for %s", ifa->iface->name); rip_send_table(p, ifa, ifa->addr, 0); - ifa->next_regular += period * (1 + ((now - ifa->next_regular) / period)); + ifa->next_regular += period * (1 + ((now_ - ifa->next_regular) / period)); ifa->want_triggered = 0; p->triggered = 0; } - else if (ifa->want_triggered && (now >= ifa->next_triggered)) + else if (ifa->want_triggered && (now_ >= ifa->next_triggered)) { /* Send triggered 
update, enforce interval between triggered updates */ TRACE(D_EVENTS, "Sending triggered updates for %s", ifa->iface->name); rip_send_table(p, ifa, ifa->addr, ifa->want_triggered); - ifa->next_triggered = now + MIN(5, period / 2 + 1); + ifa->next_triggered = now_ + MIN(5 S, period / 2); ifa->want_triggered = 0; p->triggered = 0; } - tm_start(ifa->timer, ifa->want_triggered ? 1 : (ifa->next_regular - now)); + tm_start(ifa->timer, ifa->want_triggered ? (1 S) : (ifa->next_regular - now_)); } static inline void rip_iface_kick_timer(struct rip_iface *ifa) { - if (ifa->timer->expires > (now + 1)) - tm_start(ifa->timer, 1); /* Or 100 ms */ + if (ifa->timer->expires > (current_time() + 100 MS)) + tm_start(ifa->timer, 100 MS); } static void @@ -991,7 +986,7 @@ rip_trigger_update(struct rip_proto *p) continue; TRACE(D_EVENTS, "Scheduling triggered updates for %s", ifa->iface->name); - ifa->want_triggered = now; + ifa->want_triggered = current_time(); rip_iface_kick_timer(ifa); } @@ -1035,19 +1030,17 @@ rip_import_control(struct proto *P UNUSED, struct rte **rt, struct ea_list **att return 0; } -static int -rip_reload_routes(struct proto *P) +static void +rip_reload_routes(struct channel *C) { - struct rip_proto *p = (struct rip_proto *) P; + struct rip_proto *p = (struct rip_proto *) C->proto; if (p->rt_reload) - return 1; + return; TRACE(D_EVENTS, "Scheduling route reload"); p->rt_reload = 1; rip_kick_timer(p); - - return 1; } static struct ea_list * @@ -1078,12 +1071,23 @@ rip_rte_same(struct rte *new, struct rte *old) } +static void +rip_postconfig(struct proto_config *CF) +{ + // struct rip_config *cf = (void *) CF; + + /* Define default channel */ + if (EMPTY_LIST(CF->channels)) + channel_config_new(NULL, CF->net_type, CF); +} + static struct proto * -rip_init(struct proto_config *cfg) +rip_init(struct proto_config *CF) { - struct proto *P = proto_new(cfg, sizeof(struct rip_proto)); + struct proto *P = proto_new(CF); + + P->main_channel = proto_add_channel(P, proto_cf_main_channel(CF)); - P->accept_ra_types = RA_OPTIMAL; P->if_notify = rip_if_notify; P->rt_notify = rip_rt_notify; P->neigh_notify = rip_neigh_notify; @@ -1104,10 +1108,12 @@ rip_start(struct proto *P) struct rip_config *cf = (void *) (P->cf); init_list(&p->iface_list); - fib_init(&p->rtable, P->pool, sizeof(struct rip_entry), 0, rip_init_entry); + fib_init(&p->rtable, P->pool, cf->rip2 ? 
NET_IP4 : NET_IP6, + sizeof(struct rip_entry), OFFSETOF(struct rip_entry, n), 0, NULL); p->rte_slab = sl_new(P->pool, sizeof(struct rip_rte)); - p->timer = tm_new_set(P->pool, rip_timer, p, 0, 0); + p->timer = tm_new_init(P->pool, rip_timer, p, 0, 0); + p->rip2 = cf->rip2; p->ecmp = cf->ecmp; p->infinity = cf->infinity; p->triggered = 0; @@ -1121,18 +1127,24 @@ rip_start(struct proto *P) } static int -rip_reconfigure(struct proto *P, struct proto_config *c) +rip_reconfigure(struct proto *P, struct proto_config *CF) { struct rip_proto *p = (void *) P; - struct rip_config *new = (void *) c; + struct rip_config *new = (void *) CF; // struct rip_config *old = (void *) (P->cf); + if (new->rip2 != p->rip2) + return 0; + if (new->infinity != p->infinity) return 0; + if (!proto_configure_channel(P, &P->main_channel, proto_cf_main_channel(CF))) + return 0; + TRACE(D_EVENTS, "Reconfiguring"); - p->p.cf = c; + p->p.cf = CF; p->ecmp = new->ecmp; rip_reconfigure_ifaces(p, new); @@ -1184,7 +1196,7 @@ rip_show_interfaces(struct proto *P, char *iff) } cli_msg(-1021, "%s:", p->p.name); - cli_msg(-1021, "%-10s %-6s %6s %6s %6s", + cli_msg(-1021, "%-10s %-6s %6s %6s %7s", "Interface", "State", "Metric", "Nbrs", "Timer"); WALK_LIST(ifa, p->iface_list) @@ -1197,8 +1209,9 @@ rip_show_interfaces(struct proto *P, char *iff) if (n->last_seen) nbrs++; - int timer = MAX(ifa->next_regular - now, 0); - cli_msg(-1021, "%-10s %-6s %6u %6u %6u", + btime now_ = current_time(); + btime timer = (ifa->next_regular > now_) ? (ifa->next_regular - now_) : 0; + cli_msg(-1021, "%-10s %-6s %6u %6u %7t", ifa->iface->name, (ifa->up ? "Up" : "Down"), ifa->cf->metric, nbrs, timer); } @@ -1220,7 +1233,7 @@ rip_show_neighbors(struct proto *P, char *iff) } cli_msg(-1022, "%s:", p->p.name); - cli_msg(-1022, "%-25s %-10s %6s %6s %6s", + cli_msg(-1022, "%-25s %-10s %6s %6s %7s", "IP address", "Interface", "Metric", "Routes", "Seen"); WALK_LIST(ifa, p->iface_list) @@ -1233,8 +1246,8 @@ rip_show_neighbors(struct proto *P, char *iff) if (!n->last_seen) continue; - int timer = now - n->last_seen; - cli_msg(-1022, "%-25I %-10s %6u %6u %6u", + btime timer = current_time() - n->last_seen; + cli_msg(-1022, "%-25I %-10s %6u %6u %7t", n->nbr->addr, ifa->iface->name, ifa->cf->metric, n->uc, timer); } } @@ -1250,12 +1263,11 @@ rip_dump(struct proto *P) int i; i = 0; - FIB_WALK(&p->rtable, e) + FIB_WALK(&p->rtable, struct rip_entry, en) { - struct rip_entry *en = (struct rip_entry *) e; - debug("RIP: entry #%d: %I/%d via %I dev %s valid %d metric %d age %d s\n", - i++, en->n.prefix, en->n.pxlen, en->next_hop, en->iface->name, - en->valid, en->metric, now - en->changed); + debug("RIP: entry #%d: %N via %I dev %s valid %d metric %d age %t\n", + i++, en->n.addr, en->next_hop, en->iface->name, + en->valid, en->metric, current_time() - en->changed); } FIB_WALK_END; @@ -1274,7 +1286,10 @@ struct protocol proto_rip = { .template = "rip%d", .attr_class = EAP_RIP, .preference = DEF_PREF_RIP, + .channel_mask = NB_IP, + .proto_size = sizeof(struct rip_proto), .config_size = sizeof(struct rip_config), + .postconfig = rip_postconfig, .init = rip_init, .dump = rip_dump, .start = rip_start, diff --git a/proto/rip/rip.h b/proto/rip/rip.h index b24d9536..55696333 100644 --- a/proto/rip/rip.h +++ b/proto/rip/rip.h @@ -27,12 +27,6 @@ #include "lib/timer.h" -#ifdef IPV6 -#define RIP_IS_V2 0 -#else -#define RIP_IS_V2 1 -#endif - #define RIP_V1 1 #define RIP_V2 2 @@ -44,9 +38,9 @@ #define RIP_DEFAULT_ECMP_LIMIT 16 #define RIP_DEFAULT_INFINITY 16 -#define 
RIP_DEFAULT_UPDATE_TIME 30 -#define RIP_DEFAULT_TIMEOUT_TIME 180 -#define RIP_DEFAULT_GARBAGE_TIME 120 +#define RIP_DEFAULT_UPDATE_TIME (30 S_) +#define RIP_DEFAULT_TIMEOUT_TIME (180 S_) +#define RIP_DEFAULT_GARBAGE_TIME (120 S_) struct rip_config @@ -58,8 +52,8 @@ struct rip_config u8 ecmp; /* Maximum number of nexthops in ECMP route, or 0 */ u8 infinity; /* Maximum metric value, representing infinity */ - u32 min_timeout_time; /* Minimum of interface timeout_time */ - u32 max_garbage_time; /* Maximum of interface garbage_time */ + btime min_timeout_time; /* Minimum of interface timeout_time */ + btime max_garbage_time; /* Maximum of interface garbage_time */ }; struct rip_iface_config @@ -84,9 +78,9 @@ struct rip_iface_config u16 tx_length; /* TX packet length limit (including headers), 0 for MTU */ int tx_tos; int tx_priority; - u32 update_time; /* Periodic update interval */ - u32 timeout_time; /* Route expiration timeout */ - u32 garbage_time; /* Unreachable entry GC timeout */ + btime update_time; /* Periodic update interval */ + btime timeout_time; /* Route expiration timeout */ + btime garbage_time; /* Unreachable entry GC timeout */ list *passwords; /* Passwords for authentication */ }; @@ -98,6 +92,7 @@ struct rip_proto slab *rte_slab; /* Slab for internal routes (struct rip_rte) */ timer *timer; /* Main protocol timer */ + u8 rip2; /* RIPv2 (IPv4) or RIPng (IPv6) */ u8 ecmp; /* Maximum number of nexthops in ECMP route, or 0 */ u8 infinity; /* Maximum metric value, representing infinity */ u8 triggered; /* Logical AND of interface want_triggered values */ @@ -125,14 +120,14 @@ struct rip_iface list neigh_list; /* List of iface neighbors (struct rip_neighbor) */ /* Update scheduling */ - bird_clock_t next_regular; /* Next time when regular update should be called */ - bird_clock_t next_triggered; /* Next time when triggerd update may be called */ - bird_clock_t want_triggered; /* Nonzero if triggered update is scheduled */ + btime next_regular; /* Next time when regular update should be called */ + btime next_triggered; /* Next time when triggerd update may be called */ + btime want_triggered; /* Nonzero if triggered update is scheduled */ /* Active update */ int tx_active; /* Update session is active */ ip_addr tx_addr; /* Update session destination address */ - bird_clock_t tx_changed; /* Minimal changed time for triggered update */ + btime tx_changed; /* Minimal changed time for triggered update */ struct fib_iterator tx_fit; /* FIB iterator in RIP routing table (p.rtable) */ }; @@ -142,14 +137,13 @@ struct rip_neighbor struct rip_iface *ifa; /* Associated interface, may be NULL if stale */ struct neighbor *nbr; /* Associaded core neighbor, may be NULL if stale */ struct bfd_request *bfd_req; /* BFD request, if BFD is used */ - bird_clock_t last_seen; /* Time of last received and accepted message */ + btime last_seen; /* Time of last received and accepted message */ u32 uc; /* Use count, number of routes linking the neighbor */ u32 csn; /* Last received crypto sequence number */ }; struct rip_entry { - struct fib_node n; struct rip_rte *routes; /* List of incoming routes */ u8 valid; /* Entry validity state (RIP_ENTRY_*) */ @@ -159,7 +153,9 @@ struct rip_entry struct iface *iface; /* Outgoing route iface (for next hop) */ ip_addr next_hop; /* Outgoing route next hop */ - bird_clock_t changed; /* Last time when the outgoing route metric changed */ + btime changed; /* Last time when the outgoing route metric changed */ + + struct fib_node n; }; struct rip_rte @@ -171,7 +167,7 @@ 
struct rip_rte u16 metric; /* Route metric (after increase) */ u16 tag; /* Route tag */ - bird_clock_t expires; /* Time of route expiration */ + btime expires; /* Time of route expiration */ }; @@ -189,16 +185,11 @@ struct rip_rte #define EA_RIP_METRIC EA_CODE(EAP_RIP, 0) #define EA_RIP_TAG EA_CODE(EAP_RIP, 1) -#define rip_is_v2(X) RIP_IS_V2 -#define rip_is_ng(X) (!RIP_IS_V2) - -/* static inline int rip_is_v2(struct rip_proto *p) { return p->rip2; } static inline int rip_is_ng(struct rip_proto *p) { return ! p->rip2; } -*/ static inline void rip_reset_tx_session(struct rip_proto *p, struct rip_iface *ifa) @@ -211,8 +202,8 @@ rip_reset_tx_session(struct rip_proto *p, struct rip_iface *ifa) } /* rip.c */ -void rip_update_rte(struct rip_proto *p, ip_addr *prefix, int pxlen, struct rip_rte *new); -void rip_withdraw_rte(struct rip_proto *p, ip_addr *prefix, int pxlen, struct rip_neighbor *from); +void rip_update_rte(struct rip_proto *p, net_addr *n, struct rip_rte *new); +void rip_withdraw_rte(struct rip_proto *p, net_addr *n, struct rip_neighbor *from); struct rip_neighbor * rip_get_neighbor(struct rip_proto *p, ip_addr *a, struct rip_iface *ifa); void rip_update_bfd(struct rip_proto *p, struct rip_neighbor *n); void rip_show_interfaces(struct proto *P, char *iff); @@ -220,7 +211,7 @@ void rip_show_neighbors(struct proto *P, char *iff); /* packets.c */ void rip_send_request(struct rip_proto *p, struct rip_iface *ifa); -void rip_send_table(struct rip_proto *p, struct rip_iface *ifa, ip_addr addr, bird_clock_t changed); +void rip_send_table(struct rip_proto *p, struct rip_iface *ifa, ip_addr addr, btime changed); int rip_open_socket(struct rip_iface *ifa); diff --git a/proto/rpki/Doc b/proto/rpki/Doc new file mode 100644 index 00000000..d1d1bf55 --- /dev/null +++ b/proto/rpki/Doc @@ -0,0 +1,5 @@ +S rpki.c +S packets.c +S transport.c +S tcp_transport.c +S ssh_transport.c diff --git a/proto/rpki/Makefile b/proto/rpki/Makefile new file mode 100644 index 00000000..eb09b7df --- /dev/null +++ b/proto/rpki/Makefile @@ -0,0 +1,6 @@ +src := rpki.c packets.c tcp_transport.c ssh_transport.c transport.c +obj := $(src-o-files) +$(all-daemon) +$(cf-local) + +tests_objs := $(tests_objs) $(src-o-files)
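The grammar added in proto/rpki/config.Y below defines the configuration of the new RPKI protocol. As a sketch of what it accepts (the cache hostname and the interval values are made-up examples, and the r4/r6 ROA tables are assumed to be declared elsewhere in the configuration):

  protocol rpki {
    roa4 { table r4; };
    roa6 { table r6; };
    remote "rtr.example.org" port 323;
    transport tcp;
    refresh 3600;
    retry keep 300;
    expire 7200;
  }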
\ No newline at end of file diff --git a/proto/rpki/config.Y b/proto/rpki/config.Y new file mode 100644 index 00000000..39fdfd01 --- /dev/null +++ b/proto/rpki/config.Y @@ -0,0 +1,144 @@ +/* + * BIRD -- The Resource Public Key Infrastructure (RPKI) to Router Protocol + * + * (c) 2015 CZ.NIC + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +CF_HDR + +#include "proto/rpki/rpki.h" + +CF_DEFINES + +#define RPKI_CFG ((struct rpki_config *) this_proto) +#define RPKI_TR_SSH_CFG ((struct rpki_tr_ssh_config *) RPKI_CFG->tr_config.spec) + +static void +rpki_check_unused_hostname(void) +{ + if (RPKI_CFG->hostname != NULL) + cf_error("Only one cache server per protocol allowed"); +} + +static void +rpki_check_unused_transport(void) +{ + if (RPKI_CFG->tr_config.spec != NULL) + cf_error("At the most one transport per protocol allowed"); +} + +CF_DECLS + +CF_KEYWORDS(RPKI, REMOTE, BIRD, PRIVATE, PUBLIC, KEY, TCP, SSH, TRANSPORT, USER, + RETRY, REFRESH, EXPIRE, KEEP) + +%type <i> rpki_keep_interval + +CF_GRAMMAR + +CF_ADDTO(proto, rpki_proto) + +rpki_proto_start: proto_start RPKI { + this_proto = proto_config_new(&proto_rpki, $1); + RPKI_CFG->retry_interval = RPKI_RETRY_INTERVAL; + RPKI_CFG->refresh_interval = RPKI_REFRESH_INTERVAL; + RPKI_CFG->expire_interval = RPKI_EXPIRE_INTERVAL; +}; + +rpki_proto: rpki_proto_start proto_name '{' rpki_proto_opts '}' { rpki_check_config(RPKI_CFG); }; + +rpki_proto_opts: + /* empty */ + | rpki_proto_opts rpki_proto_item ';' + ; + +rpki_proto_item: + proto_item + | proto_channel + | REMOTE rpki_cache_addr + | REMOTE rpki_cache_addr rpki_proto_item_port + | rpki_proto_item_port + | TRANSPORT rpki_transport + | REFRESH rpki_keep_interval expr { + if (rpki_check_refresh_interval($3)) + cf_error(rpki_check_refresh_interval($3)); + RPKI_CFG->refresh_interval = $3; + RPKI_CFG->keep_refresh_interval = $2; + } + | RETRY rpki_keep_interval expr { + if (rpki_check_retry_interval($3)) + cf_error(rpki_check_retry_interval($3)); + RPKI_CFG->retry_interval = $3; + RPKI_CFG->keep_retry_interval = $2; + } + | EXPIRE rpki_keep_interval expr { + if (rpki_check_expire_interval($3)) + cf_error(rpki_check_expire_interval($3)); + RPKI_CFG->expire_interval = $3; + RPKI_CFG->keep_expire_interval = $2; + } + ; + +rpki_keep_interval: + /* empty */ { $$ = 0; } + | KEEP { $$ = 1; } + ; + +rpki_proto_item_port: PORT expr { check_u16($2); RPKI_CFG->port = $2; }; + +rpki_cache_addr: + text { + rpki_check_unused_hostname(); + RPKI_CFG->hostname = $1; + } + | ipa { + rpki_check_unused_hostname(); + RPKI_CFG->ip = $1; + /* Ensure hostname is filled */ + char *hostname = cfg_allocz(sizeof(INET6_ADDRSTRLEN + 1)); + bsnprintf(hostname, INET6_ADDRSTRLEN+1, "%I", RPKI_CFG->ip); + RPKI_CFG->hostname = hostname; + } + ; + +rpki_transport: + TCP rpki_transport_tcp_init + | SSH rpki_transport_ssh_init '{' rpki_transport_ssh_opts '}' rpki_transport_ssh_check + ; + +rpki_transport_tcp_init: +{ + rpki_check_unused_transport(); + RPKI_CFG->tr_config.spec = cfg_allocz(sizeof(struct rpki_tr_tcp_config)); + RPKI_CFG->tr_config.type = RPKI_TR_TCP; +}; + +rpki_transport_ssh_init: +{ + rpki_check_unused_transport(); + RPKI_CFG->tr_config.spec = cfg_allocz(sizeof(struct rpki_tr_ssh_config)); + RPKI_CFG->tr_config.type = RPKI_TR_SSH; +}; + +rpki_transport_ssh_opts: + /* empty */ + | rpki_transport_ssh_opts rpki_transport_ssh_item ';' + ; + +rpki_transport_ssh_item: + BIRD PRIVATE KEY text { RPKI_TR_SSH_CFG->bird_private_key = $4; } + | REMOTE PUBLIC KEY text { 
RPKI_TR_SSH_CFG->cache_public_key = $4; } + | USER text { RPKI_TR_SSH_CFG->user = $2; } + ; + +rpki_transport_ssh_check: +{ + if (RPKI_TR_SSH_CFG->user == NULL) + cf_error("User must be set"); +}; + +CF_CODE + +CF_END diff --git a/proto/rpki/packets.c b/proto/rpki/packets.c new file mode 100644 index 00000000..59a5efaf --- /dev/null +++ b/proto/rpki/packets.c @@ -0,0 +1,1073 @@ +/* + * BIRD -- The Resource Public Key Infrastructure (RPKI) to Router Protocol + * + * (c) 2015 CZ.NIC + * (c) 2015 Pavel Tvrdik <pawel.tvrdik@gmail.com> + * + * This file was a part of RTRlib: http://rpki.realmv6.org/ + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include <stdlib.h> +#include <string.h> +#include <stdio.h> + +#undef LOCAL_DEBUG + +#include "rpki.h" +#include "transport.h" +#include "packets.h" + +#define RPKI_ADD_FLAG 0b00000001 + +enum rpki_transmit_type { + RPKI_RECV = 0, + RPKI_SEND = 1, +}; + +enum pdu_error_type { + CORRUPT_DATA = 0, + INTERNAL_ERROR = 1, + NO_DATA_AVAIL = 2, + INVALID_REQUEST = 3, + UNSUPPORTED_PROTOCOL_VER = 4, + UNSUPPORTED_PDU_TYPE = 5, + WITHDRAWAL_OF_UNKNOWN_RECORD = 6, + DUPLICATE_ANNOUNCEMENT = 7, + PDU_TOO_BIG = 32 +}; + +static const char *str_pdu_error_type[] = { + [CORRUPT_DATA] = "Corrupt-Data", + [INTERNAL_ERROR] = "Internal-Error", + [NO_DATA_AVAIL] = "No-Data-Available", + [INVALID_REQUEST] = "Invalid-Request", + [UNSUPPORTED_PROTOCOL_VER] = "Unsupported-Protocol-Version", + [UNSUPPORTED_PDU_TYPE] = "Unsupported-PDU-Type", + [WITHDRAWAL_OF_UNKNOWN_RECORD]= "Withdrawal-Of-Unknown-Record", + [DUPLICATE_ANNOUNCEMENT] = "Duplicate-Announcement", + [PDU_TOO_BIG] = "PDU-Too-Big", +}; + +enum pdu_type { + SERIAL_NOTIFY = 0, + SERIAL_QUERY = 1, + RESET_QUERY = 2, + CACHE_RESPONSE = 3, + IPV4_PREFIX = 4, + RESERVED = 5, + IPV6_PREFIX = 6, + END_OF_DATA = 7, + CACHE_RESET = 8, + ROUTER_KEY = 9, + ERROR = 10, + PDU_TYPE_MAX +}; + +static const char *str_pdu_type_[] = { + [SERIAL_NOTIFY] = "Serial Notify", + [SERIAL_QUERY] = "Serial Query", + [RESET_QUERY] = "Reset Query", + [CACHE_RESPONSE] = "Cache Response", + [IPV4_PREFIX] = "IPv4 Prefix", + [RESERVED] = "Reserved", + [IPV6_PREFIX] = "IPv6 Prefix", + [END_OF_DATA] = "End of Data", + [CACHE_RESET] = "Cache Reset", + [ROUTER_KEY] = "Router Key", + [ERROR] = "Error" +}; + +static const char *str_pdu_type(uint type) { + if (type < PDU_TYPE_MAX) + return str_pdu_type_[type]; + else + return "Undefined packet type"; +} + +/* + * 0 8 16 24 31 + * .-------------------------------------------. 
+ * | Protocol | PDU | | + * | Version | Type | reserved = zero | + * | 0 or 1 | 0 - 10 | | + * +-------------------------------------------+ + * | | + * | Length >= 8 | + * | | + * `-------------------------------------------' */ +struct pdu_header { + u8 ver; + u8 type; + u16 reserved; + u32 len; +} PACKED; + +struct pdu_cache_response { + u8 ver; + u8 type; + u16 session_id; + u32 len; +} PACKED; + +struct pdu_serial_notify { + u8 ver; + u8 type; + u16 session_id; + u32 len; + u32 serial_num; +} PACKED; + +struct pdu_serial_query { + u8 ver; + u8 type; + u16 session_id; + u32 len; + u32 serial_num; +} PACKED; + +struct pdu_ipv4 { + u8 ver; + u8 type; + u16 reserved; + u32 len; + u8 flags; + u8 prefix_len; + u8 max_prefix_len; + u8 zero; + ip4_addr prefix; + u32 asn; +} PACKED; + +struct pdu_ipv6 { + u8 ver; + u8 type; + u16 reserved; + u32 len; + u8 flags; + u8 prefix_len; + u8 max_prefix_len; + u8 zero; + ip6_addr prefix; + u32 asn; +} PACKED; + +/* + * 0 8 16 24 31 + * .-------------------------------------------. + * | Protocol | PDU | | + * | Version | Type | Error Code | + * | 1 | 10 | | + * +-------------------------------------------+ + * | | + * | Length | + * | | + * +-------------------------------------------+ + * | | + * | Length of Encapsulated PDU | + * | | + * +-------------------------------------------+ + * | | + * ~ Copy of Erroneous PDU ~ + * | | + * +-------------------------------------------+ + * | | + * | Length of Error Text | + * | | + * +-------------------------------------------+ + * | | + * | Arbitrary Text | + * | of | + * ~ Error Diagnostic Message ~ + * | | + * `-------------------------------------------' */ +struct pdu_error { + u8 ver; + u8 type; + u16 error_code; + u32 len; + u32 len_enc_pdu; /* Length of Encapsulated PDU */ + byte rest[]; /* Copy of Erroneous PDU + * Length of Error Text + * Error Diagnostic Message */ +} PACKED; + +struct pdu_reset_query { + u8 ver; + u8 type; + u16 flags; + u32 len; +} PACKED; + +struct pdu_end_of_data_v0 { + u8 ver; + u8 type; + u16 session_id; + u32 len; + u32 serial_num; +} PACKED; + +struct pdu_end_of_data_v1 { + u8 ver; + u8 type; + u16 session_id; + u32 len; + u32 serial_num; + u32 refresh_interval; + u32 retry_interval; + u32 expire_interval; +} PACKED; + +static const size_t min_pdu_size[] = { + [SERIAL_NOTIFY] = sizeof(struct pdu_serial_notify), + [SERIAL_QUERY] = sizeof(struct pdu_serial_query), + [RESET_QUERY] = sizeof(struct pdu_reset_query), + [CACHE_RESPONSE] = sizeof(struct pdu_cache_response), + [IPV4_PREFIX] = sizeof(struct pdu_ipv4), + [RESERVED] = sizeof(struct pdu_header), + [IPV6_PREFIX] = sizeof(struct pdu_ipv6), + [END_OF_DATA] = sizeof(struct pdu_end_of_data_v0), + [CACHE_RESET] = sizeof(struct pdu_cache_response), + [ROUTER_KEY] = sizeof(struct pdu_header), /* FIXME */ + [ERROR] = 16, +}; + +static int rpki_send_error_pdu(struct rpki_cache *cache, const enum pdu_error_type error_code, const u32 err_pdu_len, const struct pdu_header *erroneous_pdu, const char *fmt, ...); + +static void +rpki_pdu_to_network_byte_order(struct pdu_header *pdu) +{ + pdu->reserved = htons(pdu->reserved); + pdu->len = htonl(pdu->len); + + switch (pdu->type) + { + case SERIAL_QUERY: + { + /* Note that a session_id is converted using converting header->reserved */ + struct pdu_serial_query *sq_pdu = (void *) pdu; + sq_pdu->serial_num = htonl(sq_pdu->serial_num); + break; + } + + case ERROR: + { + struct pdu_error *err = (void *) pdu; + u32 *err_text_len = (u32 *)(err->rest + err->len_enc_pdu); + *err_text_len = 
htonl(*err_text_len); + err->len_enc_pdu = htonl(err->len_enc_pdu); + break; + } + + case RESET_QUERY: + break; + + default: + bug("PDU type %s should not be sent by us", str_pdu_type(pdu->type)); + } +} + +static void +rpki_pdu_to_host_byte_order(struct pdu_header *pdu) +{ + /* The Router Key PDU has two one-byte fields instead of one two-bytes field. */ + if (pdu->type != ROUTER_KEY) + pdu->reserved = ntohs(pdu->reserved); + + pdu->len = ntohl(pdu->len); + + switch (pdu->type) + { + case SERIAL_NOTIFY: + { + /* Note that a session_id is converted using converting header->reserved */ + struct pdu_serial_notify *sn_pdu = (void *) pdu; + sn_pdu->serial_num = ntohl(sn_pdu->serial_num); + break; + } + + case END_OF_DATA: + { + /* Note that a session_id is converted using converting header->reserved */ + struct pdu_end_of_data_v0 *eod0 = (void *) pdu; + eod0->serial_num = ntohl(eod0->serial_num); /* Same either for version 1 */ + + if (pdu->ver == RPKI_VERSION_1) + { + struct pdu_end_of_data_v1 *eod1 = (void *) pdu; + eod1->expire_interval = ntohl(eod1->expire_interval); + eod1->refresh_interval = ntohl(eod1->refresh_interval); + eod1->retry_interval = ntohl(eod1->retry_interval); + } + break; + } + + case IPV4_PREFIX: + { + struct pdu_ipv4 *ipv4 = (void *) pdu; + ipv4->prefix = ip4_ntoh(ipv4->prefix); + ipv4->asn = ntohl(ipv4->asn); + break; + } + + case IPV6_PREFIX: + { + struct pdu_ipv6 *ipv6 = (void *) pdu; + ipv6->prefix = ip6_ntoh(ipv6->prefix); + ipv6->asn = ntohl(ipv6->asn); + break; + } + + case ERROR: + { + /* Note that a error_code is converted using converting header->reserved */ + struct pdu_error *err = (void *) pdu; + err->len_enc_pdu = ntohl(err->len_enc_pdu); + u32 *err_text_len = (u32 *)(err->rest + err->len_enc_pdu); + *err_text_len = htonl(*err_text_len); + break; + } + + case ROUTER_KEY: + /* Router Key PDU is not supported yet */ + + case SERIAL_QUERY: + case RESET_QUERY: + /* Serial/Reset Query are sent only in direction router to cache. + * We don't care here. */ + + case CACHE_RESPONSE: + case CACHE_RESET: + /* Converted with pdu->reserved */ + break; + } +} + +/** + * rpki_convert_pdu_back_to_network_byte_order - convert host-byte order PDU back to network-byte order + * @out: allocated memory for writing a converted PDU of size @in->len + * @in: host-byte order PDU + * + * Assumed: |A == ntoh(ntoh(A))| + */ +static struct pdu_header * +rpki_pdu_back_to_network_byte_order(struct pdu_header *out, const struct pdu_header *in) +{ + memcpy(out, in, in->len); + rpki_pdu_to_host_byte_order(out); + return out; +} + +static void +rpki_log_packet(struct rpki_cache *cache, const struct pdu_header *pdu, const enum rpki_transmit_type action) +{ + if (!(cache->p->p.debug & D_PACKETS)) + return; + + const char *str_type = str_pdu_type(pdu->type); + char detail[256]; + +#define SAVE(fn) \ + do { \ + if (fn < 0) \ + { \ + bsnprintf(detail + sizeof(detail) - 16, 16, "... 
<too long>)"); \ + goto detail_finished; \ + } \ + } while(0) \ + + switch (pdu->type) + { + case SERIAL_NOTIFY: + case SERIAL_QUERY: + SAVE(bsnprintf(detail, sizeof(detail), "(session id: %u, serial number: %u)", pdu->reserved, ((struct pdu_serial_notify *) pdu)->serial_num)); + break; + + case END_OF_DATA: + { + const struct pdu_end_of_data_v1 *eod = (void *) pdu; + if (eod->ver == RPKI_VERSION_1) + SAVE(bsnprintf(detail, sizeof(detail), "(session id: %u, serial number: %u, refresh: %us, retry: %us, expire: %us)", eod->session_id, eod->serial_num, eod->refresh_interval, eod->retry_interval, eod->expire_interval)); + else + SAVE(bsnprintf(detail, sizeof(detail), "(session id: %u, serial number: %u)", eod->session_id, eod->serial_num)); + break; + } + + case CACHE_RESPONSE: + SAVE(bsnprintf(detail, sizeof(detail), "(session id: %u)", pdu->reserved)); + break; + + case IPV4_PREFIX: + { + const struct pdu_ipv4 *ipv4 = (void *) pdu; + SAVE(bsnprintf(detail, sizeof(detail), "(%I4/%u-%u AS%u)", ipv4->prefix, ipv4->prefix_len, ipv4->max_prefix_len, ipv4->asn)); + break; + } + + case IPV6_PREFIX: + { + const struct pdu_ipv6 *ipv6 = (void *) pdu; + SAVE(bsnprintf(detail, sizeof(detail), "(%I6/%u-%u AS%u)", ipv6->prefix, ipv6->prefix_len, ipv6->max_prefix_len, ipv6->asn)); + break; + } + + case ROUTER_KEY: + /* We don't support saving Router Key PDUs yet */ + SAVE(bsnprintf(detail, sizeof(detail), "(ignored)")); + break; + + case ERROR: + { + const struct pdu_error *err = (void *) pdu; + SAVE(bsnprintf(detail, sizeof(detail), "(%s", str_pdu_error_type[err->error_code])); + + /* Optional description of error */ + const u32 len_err_txt = *((u32 *) (err->rest + err->len_enc_pdu)); + if (len_err_txt > 0) + { + size_t expected_len = err->len_enc_pdu + len_err_txt + 16; + if (expected_len == err->len) + { + char txt[len_err_txt + 1]; + char *pdu_txt = (char *) err->rest + err->len_enc_pdu + 4; + bsnprintf(txt, sizeof(txt), "%s", pdu_txt); /* it's ensured that txt is ended with a null byte */ + SAVE(bsnprintf(detail + strlen(detail), sizeof(detail) - strlen(detail), ": '%s'", txt)); + } + else + { + SAVE(bsnprintf(detail + strlen(detail), sizeof(detail) - strlen(detail), ", malformed size")); + } + } + + /* Optional encapsulated erroneous packet */ + if (err->len_enc_pdu) + { + SAVE(bsnprintf(detail + strlen(detail), sizeof(detail) - strlen(detail), ", %s packet:", str_pdu_type(((struct pdu_header *) err->rest)->type))); + if (err->rest + err->len_enc_pdu <= (byte *)err + err->len) + { + for (const byte *c = err->rest; c != err->rest + err->len_enc_pdu; c++) + SAVE(bsnprintf(detail + strlen(detail), sizeof(detail) - strlen(detail), " %02X", *c)); + } + } + + SAVE(bsnprintf(detail + strlen(detail), sizeof(detail) - strlen(detail), ")")); + break; + } + + default: + *detail = '\0'; + } +#undef SAVE + + detail_finished: + + if (action == RPKI_RECV) + { + CACHE_TRACE(D_PACKETS, cache, "Received %s packet %s", str_type, detail); + } + else + { + CACHE_TRACE(D_PACKETS, cache, "Sending %s packet %s", str_type, detail); + } + +#if defined(LOCAL_DEBUG) || defined(GLOBAL_DEBUG) + int seq = 0; + for(const byte *c = pdu; c != pdu + pdu->len; c++) + { + if ((seq % 4) == 0) + DBG("%2d: ", seq); + + DBG(" 0x%02X %-3u", *c, *c); + + if ((++seq % 4) == 0) + DBG("\n"); + } + if ((seq % 4) != 0) + DBG("\n"); +#endif +} + +static int +rpki_send_pdu(struct rpki_cache *cache, const void *pdu, const uint len) +{ + struct rpki_proto *p = cache->p; + sock *sk = cache->tr_sock->sk; + + rpki_log_packet(cache, pdu, RPKI_SEND); + + 
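  /* The TX buffer is expected to hold at most one PDU at a time: anything
   * still waiting in sk->tbuf is only reported and then overwritten, and a
   * PDU that does not fit into the buffer is refused below. */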
if (sk->tbuf != sk->tpos) + { + RPKI_WARN(p, "Old packet overwritten in TX buffer"); + } + + if (len > sk->tbsize) + { + RPKI_WARN(p, "%u bytes is too much for send", len); + ASSERT(0); + return RPKI_ERROR; + } + + memcpy(sk->tbuf, pdu, len); + rpki_pdu_to_network_byte_order((void *) sk->tbuf); + + if (!sk_send(sk, len)) + { + DBG("Cannot send just the whole data. It will be sent using a call of tx_hook()"); + } + + return RPKI_SUCCESS; +} + +/** + * rpki_check_receive_packet - make a basic validation of received RPKI PDU header + * @cache: cache connection instance + * @pdu: RPKI PDU in network byte order + * + * This function checks protocol version, PDU type, version and size. If all is all right then + * function returns |RPKI_SUCCESS| otherwise sends Error PDU and returns + * |RPKI_ERROR|. + */ +static int +rpki_check_receive_packet(struct rpki_cache *cache, const struct pdu_header *pdu) +{ + u32 pdu_len = ntohl(pdu->len); + + /* + * Minimal and maximal allowed PDU size is treated in rpki_rx_hook() function. + * @header.len corresponds to number of bytes of @pdu and + * it is in range from RPKI_PDU_HEADER_LEN to RPKI_PDU_MAX_LEN bytes. + */ + + /* Do not handle error PDUs here, leave this task to rpki_handle_error_pdu() */ + if (pdu->ver != cache->version && pdu->type != ERROR) + { + /* If this is the first PDU we have received */ + if (cache->request_session_id) + { + if (pdu->type == SERIAL_NOTIFY) + { + /* + * The router MUST ignore any Serial Notify PDUs it might receive from + * the cache during this initial start-up period, regardless of the + * Protocol Version field in the Serial Notify PDU. + * (https://tools.ietf.org/html/draft-ietf-sidr-rpki-rtr-rfc6810-bis-07#section-7) + */ + } + else if (!cache->last_update && + (pdu->ver <= RPKI_MAX_VERSION) && + (pdu->ver < cache->version)) + { + CACHE_TRACE(D_EVENTS, cache, "Downgrade session to %s from %u to %u version", rpki_get_cache_ident(cache), cache->version, pdu->ver); + cache->version = pdu->ver; + } + else + { + /* If this is not the first PDU we have received, something is wrong with + * the server implementation -> Error */ + rpki_send_error_pdu(cache, UNSUPPORTED_PROTOCOL_VER, pdu_len, pdu, "PDU with unsupported Protocol version received"); + return RPKI_ERROR; + } + } + } + + if ((pdu->type >= PDU_TYPE_MAX) || (pdu->ver == RPKI_VERSION_0 && pdu->type == ROUTER_KEY)) + { + rpki_send_error_pdu(cache, UNSUPPORTED_PDU_TYPE, pdu_len, pdu, "Unsupported PDU type %u received", pdu->type); + return RPKI_ERROR; + } + + if (pdu_len < min_pdu_size[pdu->type]) + { + rpki_send_error_pdu(cache, CORRUPT_DATA, pdu_len, pdu, "Received %s packet with %d bytes, but expected at least %d bytes", str_pdu_type(pdu->type), pdu_len, min_pdu_size[pdu->type]); + return RPKI_ERROR; + } + + return RPKI_SUCCESS; +} + +static int +rpki_handle_error_pdu(struct rpki_cache *cache, const struct pdu_error *pdu) +{ + switch (pdu->error_code) + { + case CORRUPT_DATA: + case INTERNAL_ERROR: + case INVALID_REQUEST: + case UNSUPPORTED_PDU_TYPE: + rpki_cache_change_state(cache, RPKI_CS_ERROR_FATAL); + break; + + case NO_DATA_AVAIL: + rpki_cache_change_state(cache, RPKI_CS_ERROR_NO_DATA_AVAIL); + break; + + case UNSUPPORTED_PROTOCOL_VER: + CACHE_TRACE(D_PACKETS, cache, "Client uses unsupported protocol version"); + if (pdu->ver <= RPKI_MAX_VERSION && + pdu->ver < cache->version) + { + CACHE_TRACE(D_EVENTS, cache, "Downgrading from protocol version %d to version %d", cache->version, pdu->ver); + cache->version = pdu->ver; + rpki_cache_change_state(cache, 
RPKI_CS_FAST_RECONNECT); + } + else + { + CACHE_TRACE(D_PACKETS, cache, "Got UNSUPPORTED_PROTOCOL_VER error PDU with invalid values, " \ + "current version: %d, PDU version: %d", cache->version, pdu->ver); + rpki_cache_change_state(cache, RPKI_CS_ERROR_FATAL); + } + break; + + default: + CACHE_TRACE(D_PACKETS, cache, "Error unknown, server sent unsupported error code %u", pdu->error_code); + rpki_cache_change_state(cache, RPKI_CS_ERROR_FATAL); + break; + } + + return RPKI_SUCCESS; +} + +static void +rpki_handle_serial_notify_pdu(struct rpki_cache *cache, const struct pdu_serial_notify *pdu) +{ + /* The router MUST ignore any Serial Notify PDUs it might receive from + * the cache during this initial start-up period, regardless of the + * Protocol Version field in the Serial Notify PDU. + * (https://tools.ietf.org/html/draft-ietf-sidr-rpki-rtr-rfc6810-bis-07#section-7) + */ + if (cache->request_session_id) + { + CACHE_TRACE(D_PACKETS, cache, "Ignore a Serial Notify packet during initial start-up period"); + return; + } + + /* XXX Serial number should be compared using method RFC 1982 (3.2) */ + if (cache->serial_num != pdu->serial_num) + rpki_cache_change_state(cache, RPKI_CS_SYNC_START); +} + +static int +rpki_handle_cache_response_pdu(struct rpki_cache *cache, const struct pdu_cache_response *pdu) +{ + if (cache->request_session_id) + { + if (cache->last_update) + { + /* + * This isn't the first sync and we already received records. This point + * is after Reset Query and before importing new records from cache + * server. We need to load new ones and kick out missing ones. So start + * a refresh cycle. + */ + if (cache->p->roa4_channel) + rt_refresh_begin(cache->p->roa4_channel->table, cache->p->roa4_channel); + if (cache->p->roa6_channel) + rt_refresh_begin(cache->p->roa6_channel->table, cache->p->roa6_channel); + + cache->p->refresh_channels = 1; + } + cache->session_id = pdu->session_id; + cache->request_session_id = 0; + } + else + { + if (cache->session_id != pdu->session_id) + { + byte tmp[pdu->len]; + const struct pdu_header *hton_pdu = rpki_pdu_back_to_network_byte_order((void *) tmp, (const void *) pdu); + rpki_send_error_pdu(cache, CORRUPT_DATA, pdu->len, hton_pdu, "Wrong session_id %u in Cache Response PDU", pdu->session_id); + rpki_cache_change_state(cache, RPKI_CS_ERROR_FATAL); + return RPKI_ERROR; + } + } + + rpki_cache_change_state(cache, RPKI_CS_SYNC_RUNNING); + return RPKI_SUCCESS; +} + +/** + * rpki_prefix_pdu_2_net_addr - convert IPv4/IPv6 Prefix PDU into net_addr_union + * @pdu: host byte order IPv4/IPv6 Prefix PDU + * @n: allocated net_addr_union for save ROA + * + * This function reads ROA data from IPv4/IPv6 Prefix PDU and + * write them into net_addr_roa4 or net_addr_roa6 data structure. 
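 *
 * For example (illustrative values only), an IPv4 Prefix PDU announcing
 * 192.0.2.0/24 with maximum length 24 and origin AS 64511 is converted to:
 *
 *   n->roa4.type = NET_ROA4;
 *   n->roa4.length = sizeof(net_addr_roa4);
 *   n->roa4.prefix = ip4_build(192, 0, 2, 0);
 *   n->roa4.pxlen = 24;
 *   n->roa4.max_pxlen = 24;
 *   n->roa4.asn = 64511;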
+ */ +static net_addr_union * +rpki_prefix_pdu_2_net_addr(const struct pdu_header *pdu, net_addr_union *n) +{ + /* + * Note that sizeof(net_addr_roa6) > sizeof(net_addr) + * and thence we must use net_addr_union and not only net_addr + */ + + if (pdu->type == IPV4_PREFIX) + { + const struct pdu_ipv4 *ipv4 = (void *) pdu; + n->roa4.type = NET_ROA4; + n->roa4.length = sizeof(net_addr_roa4); + n->roa4.prefix = ipv4->prefix; + n->roa4.asn = ipv4->asn; + n->roa4.pxlen = ipv4->prefix_len; + n->roa4.max_pxlen = ipv4->max_prefix_len; + } + else + { + const struct pdu_ipv6 *ipv6 = (void *) pdu; + n->roa6.type = NET_ROA6; + n->roa6.length = sizeof(net_addr_roa6); + n->roa6.prefix = ipv6->prefix; + n->roa6.asn = ipv6->asn; + n->roa6.pxlen = ipv6->prefix_len; + n->roa6.max_pxlen = ipv6->max_prefix_len; + } + + return n; +} + +static int +rpki_handle_prefix_pdu(struct rpki_cache *cache, const struct pdu_header *pdu) +{ + const enum pdu_type type = pdu->type; + ASSERT(type == IPV4_PREFIX || type == IPV6_PREFIX); + + net_addr_union addr = {}; + rpki_prefix_pdu_2_net_addr(pdu, &addr); + + struct channel *channel = NULL; + + if (type == IPV4_PREFIX) + channel = cache->p->roa4_channel; + if (type == IPV6_PREFIX) + channel = cache->p->roa6_channel; + + if (!channel) + { + CACHE_TRACE(D_ROUTES, cache, "Skip %N, missing %s channel", &addr, (type == IPV4_PREFIX ? "roa4" : "roa6"), addr); + return RPKI_ERROR; + } + + cache->last_rx_prefix = current_time(); + + /* A place for 'flags' is same for both data structures pdu_ipv4 or pdu_ipv6 */ + struct pdu_ipv4 *pfx = (void *) pdu; + if (pfx->flags & RPKI_ADD_FLAG) + rpki_table_add_roa(cache, channel, &addr); + else + rpki_table_remove_roa(cache, channel, &addr); + + return RPKI_SUCCESS; +} + +static uint +rpki_check_interval(struct rpki_cache *cache, const char *(check_fn)(uint), uint interval) +{ + if (check_fn(interval)) + { + RPKI_WARN(cache->p, "%s, received %u seconds", check_fn(interval), interval); + return 0; + } + return 1; +} + +static void +rpki_handle_end_of_data_pdu(struct rpki_cache *cache, const struct pdu_end_of_data_v1 *pdu) +{ + const struct rpki_config *cf = (void *) cache->p->p.cf; + + if (pdu->session_id != cache->session_id) + { + byte tmp[pdu->len]; + const struct pdu_header *hton_pdu = rpki_pdu_back_to_network_byte_order((void *) tmp, (const void *) pdu); + rpki_send_error_pdu(cache, CORRUPT_DATA, pdu->len, hton_pdu, "Received Session ID %u, but expected %u", pdu->session_id, cache->session_id); + rpki_cache_change_state(cache, RPKI_CS_ERROR_FATAL); + return; + } + + if (pdu->ver == RPKI_VERSION_1) + { + if (!cf->keep_refresh_interval && rpki_check_interval(cache, rpki_check_refresh_interval, pdu->refresh_interval)) + cache->refresh_interval = pdu->refresh_interval; + + if (!cf->keep_retry_interval && rpki_check_interval(cache, rpki_check_retry_interval, pdu->retry_interval)) + cache->retry_interval = pdu->retry_interval; + + if (!cf->keep_expire_interval && rpki_check_interval(cache, rpki_check_expire_interval, pdu->expire_interval)) + cache->expire_interval = pdu->expire_interval; + + CACHE_TRACE(D_EVENTS, cache, "New interval values: " + "refresh: %s%us, " + "retry: %s%us, " + "expire: %s%us", + (cf->keep_refresh_interval ? "keeps " : ""), cache->refresh_interval, + (cf->keep_retry_interval ? "keeps " : ""), cache->retry_interval, + (cf->keep_expire_interval ? 
"keeps " : ""), cache->expire_interval); + } + + if (cache->p->refresh_channels) + { + cache->p->refresh_channels = 0; + if (cache->p->roa4_channel) + rt_refresh_end(cache->p->roa4_channel->table, cache->p->roa4_channel); + if (cache->p->roa6_channel) + rt_refresh_end(cache->p->roa6_channel->table, cache->p->roa6_channel); + } + + cache->last_update = current_time(); + cache->serial_num = pdu->serial_num; + rpki_cache_change_state(cache, RPKI_CS_ESTABLISHED); +} + +/** + * rpki_rx_packet - process a received RPKI PDU + * @cache: RPKI connection instance + * @pdu: a RPKI PDU in network byte order + */ +static void +rpki_rx_packet(struct rpki_cache *cache, struct pdu_header *pdu) +{ + struct rpki_proto *p = cache->p; + + if (rpki_check_receive_packet(cache, pdu) == RPKI_ERROR) + { + rpki_cache_change_state(cache, RPKI_CS_ERROR_FATAL); + return; + } + + rpki_pdu_to_host_byte_order(pdu); + rpki_log_packet(cache, pdu, RPKI_RECV); + + switch (pdu->type) + { + case RESET_QUERY: + case SERIAL_QUERY: + RPKI_WARN(p, "Received a %s packet that is destined for cache server", str_pdu_type(pdu->type)); + break; + + case SERIAL_NOTIFY: + /* This is a signal to synchronize with the cache server just now */ + rpki_handle_serial_notify_pdu(cache, (void *) pdu); + break; + + case CACHE_RESPONSE: + rpki_handle_cache_response_pdu(cache, (void *) pdu); + break; + + case IPV4_PREFIX: + case IPV6_PREFIX: + rpki_handle_prefix_pdu(cache, pdu); + break; + + case END_OF_DATA: + rpki_handle_end_of_data_pdu(cache, (void *) pdu); + break; + + case CACHE_RESET: + /* Cache cannot provide an incremental update. */ + rpki_cache_change_state(cache, RPKI_CS_NO_INCR_UPDATE_AVAIL); + break; + + case ERROR: + rpki_handle_error_pdu(cache, (void *) pdu); + break; + + case ROUTER_KEY: + /* TODO: Implement Router Key PDU handling */ + break; + + default: + CACHE_TRACE(D_PACKETS, cache, "Received unsupported type (%u)", pdu->type); + }; +} + +int +rpki_rx_hook(struct birdsock *sk, uint size) +{ + struct rpki_cache *cache = sk->data; + struct rpki_proto *p = cache->p; + + byte *pkt_start = sk->rbuf; + byte *end = pkt_start + size; + + DBG("rx hook got %u bytes \n", size); + + while (end >= pkt_start + RPKI_PDU_HEADER_LEN) + { + struct pdu_header *pdu = (void *) pkt_start; + u32 pdu_size = ntohl(pdu->len); + + if (pdu_size < RPKI_PDU_HEADER_LEN || pdu_size > RPKI_PDU_MAX_LEN) + { + RPKI_WARN(p, "Received invalid packet length %u, purge the whole receiving buffer", pdu_size); + return 1; /* Purge recv buffer */ + } + + if (end < pkt_start + pdu_size) + break; + + rpki_rx_packet(cache, pdu); + + /* It is possible that bird socket was freed/closed */ + if (p->p.proto_state == PS_DOWN || sk != cache->tr_sock->sk) + return 0; + + pkt_start += pdu_size; + } + + if (pkt_start != sk->rbuf) + { + CACHE_DBG(cache, "Move %u bytes of a memory at the start of buffer", end - pkt_start); + memmove(sk->rbuf, pkt_start, end - pkt_start); + sk->rpos = sk->rbuf + (end - pkt_start); + } + + return 0; /* Not purge sk->rbuf */ +} + +void +rpki_err_hook(struct birdsock *sk, int error_num) +{ + struct rpki_cache *cache = sk->data; + + if (error_num) + { + /* sk->err may contains a SSH error description */ + if (sk->err) + CACHE_TRACE(D_EVENTS, cache, "Lost connection: %s", sk->err); + else + CACHE_TRACE(D_EVENTS, cache, "Lost connection: %M", error_num); + } + else + { + CACHE_TRACE(D_EVENTS, cache, "The other side closed a connection"); + } + + + rpki_cache_change_state(cache, RPKI_CS_ERROR_TRANSPORT); +} + +static int +rpki_fire_tx(struct rpki_cache *cache) 
+{ + sock *sk = cache->tr_sock->sk; + + uint bytes_to_send = sk->tpos - sk->tbuf; + DBG("Sending %u bytes", bytes_to_send); + return sk_send(sk, bytes_to_send); +} + +void +rpki_tx_hook(sock *sk) +{ + struct rpki_cache *cache = sk->data; + + while (rpki_fire_tx(cache) > 0) + ; +} + +void +rpki_connected_hook(sock *sk) +{ + struct rpki_cache *cache = sk->data; + + CACHE_TRACE(D_EVENTS, cache, "Connected"); + proto_notify_state(&cache->p->p, PS_UP); + + sk->rx_hook = rpki_rx_hook; + sk->tx_hook = rpki_tx_hook; + + rpki_cache_change_state(cache, RPKI_CS_SYNC_START); +} + +/** + * rpki_send_error_pdu - send RPKI Error PDU + * @cache: RPKI connection instance + * @error_code: PDU Error type + * @err_pdu_len: length of @erroneous_pdu + * @erroneous_pdu: optional network byte-order PDU that invokes Error by us or NULL + * @fmt: optional description text of error or NULL + * @args: optional arguments for @fmt + * + * This function prepares Error PDU and sends it to a cache server. + */ +static int +rpki_send_error_pdu(struct rpki_cache *cache, const enum pdu_error_type error_code, const u32 err_pdu_len, const struct pdu_header *erroneous_pdu, const char *fmt, ...) +{ + va_list args; + char msg[128]; + + /* Size including the terminating null byte ('\0') */ + int msg_len = 0; + + /* Don't send errors for erroneous error PDUs */ + if (err_pdu_len >= 2) + { + if (erroneous_pdu->type == ERROR) + return RPKI_SUCCESS; + } + + if (fmt) + { + va_start(args, fmt); + msg_len = bvsnprintf(msg, sizeof(msg), fmt, args) + 1; + } + + u32 pdu_size = 16 + err_pdu_len + msg_len; + byte pdu[pdu_size]; + memset(pdu, 0, sizeof(pdu)); + + struct pdu_error *e = (void *) pdu; + e->ver = cache->version; + e->type = ERROR; + e->error_code = error_code; + e->len = pdu_size; + + e->len_enc_pdu = err_pdu_len; + if (err_pdu_len > 0) + memcpy(e->rest, erroneous_pdu, err_pdu_len); + + *((u32 *)(e->rest + err_pdu_len)) = msg_len; + if (msg_len > 0) + memcpy(e->rest + err_pdu_len + 4, msg, msg_len); + + return rpki_send_pdu(cache, pdu, pdu_size); +} + +int +rpki_send_serial_query(struct rpki_cache *cache) +{ + struct pdu_serial_query pdu = { + .ver = cache->version, + .type = SERIAL_QUERY, + .session_id = cache->session_id, + .len = sizeof(pdu), + .serial_num = cache->serial_num + }; + + if (rpki_send_pdu(cache, &pdu, sizeof(pdu)) != RPKI_SUCCESS) + { + rpki_cache_change_state(cache, RPKI_CS_ERROR_TRANSPORT); + return RPKI_ERROR; + } + + return RPKI_SUCCESS; +} + +int +rpki_send_reset_query(struct rpki_cache *cache) +{ + struct pdu_reset_query pdu = { + .ver = cache->version, + .type = RESET_QUERY, + .len = sizeof(pdu), + }; + + if (rpki_send_pdu(cache, &pdu, sizeof(pdu)) != RPKI_SUCCESS) + { + rpki_cache_change_state(cache, RPKI_CS_ERROR_TRANSPORT); + return RPKI_ERROR; + } + + return RPKI_SUCCESS; +} diff --git a/proto/rpki/packets.h b/proto/rpki/packets.h new file mode 100644 index 00000000..d6f8a249 --- /dev/null +++ b/proto/rpki/packets.h @@ -0,0 +1,45 @@ +/* + * BIRD -- The Resource Public Key Infrastructure (RPKI) to Router Protocol + * + * (c) 2015 CZ.NIC + * (c) 2015 Pavel Tvrdik <pawel.tvrdik@gmail.com> + * + * This file was a part of RTRlib: http://rpki.realmv6.org/ + * + * Can be freely distributed and used under the terms of the GNU GPL. 
+ */ + +#ifndef _BIRD_RPKI_PACKETS_H_ +#define _BIRD_RPKI_PACKETS_H_ + +#include <arpa/inet.h> + +#define RPKI_PDU_HEADER_LEN 8 + +/* A Error PDU size is the biggest (has encapsulate PDU inside): + * +8 bytes (Header size) + * +4 bytes (Length of Encapsulated PDU) + * +32 bytes (Encapsulated PDU IPv6 32) + * +4 bytes (Length of inserted text) + * +800 bytes (UTF-8 text 400*2 bytes) + * ------------ + * = 848 bytes (Maximal expected PDU size) */ +#define RPKI_PDU_MAX_LEN 848 + +/* RX buffer size has a great impact to scheduler granularity */ +#define RPKI_RX_BUFFER_SIZE 4096 +#define RPKI_TX_BUFFER_SIZE RPKI_PDU_MAX_LEN + +/* Return values */ +enum rpki_rtvals { + RPKI_SUCCESS = 0, + RPKI_ERROR = -1 +}; + +int rpki_send_serial_query(struct rpki_cache *cache); +int rpki_send_reset_query(struct rpki_cache *cache); +int rpki_rx_hook(sock *sk, uint size); +void rpki_connected_hook(sock *sk); +void rpki_err_hook(sock *sk, int size); + +#endif diff --git a/proto/rpki/rpki.c b/proto/rpki/rpki.c new file mode 100644 index 00000000..3145399b --- /dev/null +++ b/proto/rpki/rpki.c @@ -0,0 +1,928 @@ +/* + * BIRD -- The Resource Public Key Infrastructure (RPKI) to Router Protocol + * + * (c) 2015 CZ.NIC + * (c) 2015 Pavel Tvrdik <pawel.tvrdik@gmail.com> + * + * Using RTRlib: http://rpki.realmv6.org/ + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +/** + * DOC: RPKI To Router (RPKI-RTR) + * + * The RPKI-RTR protocol is implemented in several files: |rpki.c| containing + * the routes handling, protocol logic, timer events, cache connection, + * reconfiguration, configuration and protocol glue with BIRD core, |packets.c| + * containing the RPKI packets handling and finally all transports files: + * |transport.c|, |tcp_transport.c| and |ssh_transport.c|. + * + * The |transport.c| is a middle layer and interface for each specific + * transport. Transport is a way how to wrap a communication with a cache + * server. There is supported an unprotected TCP transport and an encrypted + * SSHv2 transport. The SSH transport requires LibSSH library. LibSSH is + * loading dynamically using |dlopen()| function. SSH support is integrated in + * |sysdep/unix/io.c|. Each transport must implement an initialization + * function, an open function and a socket identification function. That's all. + * + * This implementation is based on the RTRlib (http://rpki.realmv6.org/). The + * BIRD takes over files |packets.c|, |rtr.c| (inside |rpki.c|), |transport.c|, + * |tcp_transport.c| and |ssh_transport.c| from RTRlib. + * + * A RPKI-RTR connection is described by a structure &rpki_cache. The main + * logic is located in |rpki_cache_change_state()| function. There is a state + * machine. The standard starting state flow looks like |Down| ~> |Connecting| + * ~> |Sync-Start| ~> |Sync-Running| ~> |Established| and then the last three + * states are periodically repeated. + * + * |Connecting| state establishes the transport connection. The state from a + * call |rpki_cache_change_state(CONNECTING)| to a call |rpki_connected_hook()| + * + * |Sync-Start| state starts with sending |Reset Query| or |Serial Query| and + * then waits for |Cache Response|. The state from |rpki_connected_hook()| to + * |rpki_handle_cache_response_pdu()| + * + * During |Sync-Running| BIRD receives data with IPv4/IPv6 Prefixes from cache + * server. The state starts from |rpki_handle_cache_response_pdu()| and ends + * in |rpki_handle_end_of_data_pdu()|. 
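 *
 * A full synchronisation therefore exchanges the following PDUs; incremental
 * refreshes only replace the Reset Query with a Serial Query carrying the
 * serial number learned from the last End of Data:
 *
 *    router                                cache
 *      | --- Reset/Serial Query ------------> |
 *      | <----------------- Cache Response    |
 *      | <--- IPv4/IPv6 Prefix PDUs ...       |
 *      | <--------------------- End of Data   |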
+ * + * |Established| state means that BIRD has synced all data with cache server. + * Schedules a refresh timer event that invokes |Sync-Start|. Schedules Expire + * timer event and stops a Retry timer event. + * + * |Transport Error| state means that we have some troubles with a network + * connection. We cannot connect to a cache server or we wait too long for some + * expected PDU for received - |Cache Response| or |End of Data|. It closes + * current connection and schedules a Retry timer event. + * + * |Fatal Protocol Error| is occurred e.g. by received a bad Session ID. We + * restart a protocol, so all ROAs are flushed immediately. + * + * The RPKI-RTR protocol (RFC 6810 bis) defines configurable refresh, retry and + * expire intervals. For maintaining a connection are used timer events that + * are scheduled by |rpki_schedule_next_refresh()|, + * |rpki_schedule_next_retry()| and |rpki_schedule_next_expire()| functions. + * + * A Refresh timer event performs a sync of |Established| connection. So it + * shifts state to |Sync-Start|. If at the beginning of second call of a + * refresh event is connection in |Sync-Start| state then we didn't receive a + * |Cache Response| from a cache server and we invoke |Transport Error| state. + * + * A Retry timer event attempts to connect cache server. It is activated after + * |Transport Error| state and terminated by reaching |Established| state. + * If cache connection is still connecting to the cache server at the beginning + * of an event call then the Retry timer event invokes |Transport Error| state. + * + * An Expire timer event checks expiration of ROAs. If a last successful sync + * was more ago than the expire interval then the Expire timer event invokes a + * protocol restart thereby removes all ROAs learned from that cache server and + * continue trying to connect to cache server. The Expire event is activated + * by initial successful loading of ROAs, receiving End of Data PDU. + * + * A reconfiguration of cache connection works well without restarting when we + * change only intervals values. 
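 *
 * With the default intervals from rpki.h (refresh 3600 s, retry 600 s,
 * expire 7200 s), an established session is re-synced every hour, a broken
 * connection is re-tried every ten minutes, and the learned ROAs survive at
 * most two hours without a successful End of Data before the protocol is
 * restarted and they are flushed.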
+ * + * Supported standards: + * - RFC 6810 - main RPKI-RTR standard + * - RFC 6810 bis - an explicit timing parameters and protocol version number negotiation + */ + +#include <stdlib.h> +#include <netdb.h> + +#undef LOCAL_DEBUG + +#include "rpki.h" +#include "lib/string.h" +#include "nest/cli.h" + +/* Return values for reconfiguration functions */ +#define NEED_RESTART 0 +#define SUCCESSFUL_RECONF 1 + +static int rpki_open_connection(struct rpki_cache *cache); +static void rpki_close_connection(struct rpki_cache *cache); +static void rpki_schedule_next_refresh(struct rpki_cache *cache); +static void rpki_schedule_next_retry(struct rpki_cache *cache); +static void rpki_schedule_next_expire_check(struct rpki_cache *cache); +static void rpki_stop_refresh_timer_event(struct rpki_cache *cache); +static void rpki_stop_retry_timer_event(struct rpki_cache *cache); +static void rpki_stop_expire_timer_event(struct rpki_cache *cache); + + +/* + * Routes handling + */ + +void +rpki_table_add_roa(struct rpki_cache *cache, struct channel *channel, const net_addr_union *pfxr) +{ + struct rpki_proto *p = cache->p; + + rta a0 = { + .src = p->p.main_source, + .source = RTS_RPKI, + .scope = SCOPE_UNIVERSE, + .dest = RTD_NONE, + }; + + rta *a = rta_lookup(&a0); + rte *e = rte_get_temp(a); + + e->pflags = 0; + + rte_update2(channel, &pfxr->n, e, a0.src); +} + +void +rpki_table_remove_roa(struct rpki_cache *cache, struct channel *channel, const net_addr_union *pfxr) +{ + struct rpki_proto *p = cache->p; + rte_update2(channel, &pfxr->n, NULL, p->p.main_source); +} + + +/* + * RPKI Protocol Logic + */ + +static const char *str_cache_states[] = { + [RPKI_CS_CONNECTING] = "Connecting", + [RPKI_CS_ESTABLISHED] = "Established", + [RPKI_CS_RESET] = "Reseting", + [RPKI_CS_SYNC_START] = "Sync-Start", + [RPKI_CS_SYNC_RUNNING] = "Sync-Running", + [RPKI_CS_FAST_RECONNECT] = "Fast-Reconnect", + [RPKI_CS_NO_INCR_UPDATE_AVAIL]= "No-Increment-Update-Available", + [RPKI_CS_ERROR_NO_DATA_AVAIL] = "Cache-Error-No-Data-Available", + [RPKI_CS_ERROR_FATAL] = "Fatal-Protocol-Error", + [RPKI_CS_ERROR_TRANSPORT] = "Transport-Error", + [RPKI_CS_SHUTDOWN] = "Down" +}; + +/** + * rpki_cache_state_to_str - give a text representation of cache state + * @state: A cache state + * + * The function converts logic cache state into string. + */ +const char * +rpki_cache_state_to_str(enum rpki_cache_state state) +{ + return str_cache_states[state]; +} + +/** + * rpki_start_cache - connect to a cache server + * @cache: RPKI connection instance + * + * This function is a high level method to kick up a connection to a cache server. + */ +static void +rpki_start_cache(struct rpki_cache *cache) +{ + rpki_cache_change_state(cache, RPKI_CS_CONNECTING); +} + +/** + * rpki_force_restart_proto - force shutdown and start protocol again + * @p: RPKI protocol instance + * + * This function calls shutdown and frees all protocol resources as well. + * After calling this function should be no operations with protocol data, + * they could be freed already. + */ +static void +rpki_force_restart_proto(struct rpki_proto *p) +{ + if (p->cache) + { + CACHE_DBG(p->cache, "Connection object destroying"); + } + + /* Sign as freed */ + p->cache = NULL; + + proto_notify_state(&p->p, PS_DOWN); +} + +/** + * rpki_cache_change_state - check and change cache state + * @cache: RPKI cache instance + * @new_state: suggested new state + * + * This function makes transitions between internal states. + * It represents the core of logic management of RPKI protocol. 
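 * A typical start-up therefore chains |RPKI_CS_CONNECTING| ~>
 * |RPKI_CS_SYNC_START| ~> |RPKI_CS_SYNC_RUNNING| ~> |RPKI_CS_ESTABLISHED|,
 * some of the steps being nested calls of this very function.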
+ * Cannot transit into the same state as cache is in already. + */ +void +rpki_cache_change_state(struct rpki_cache *cache, const enum rpki_cache_state new_state) +{ + const enum rpki_cache_state old_state = cache->state; + + if (old_state == new_state) + return; + + cache->state = new_state; + CACHE_TRACE(D_EVENTS, cache, "Changing from %s to %s state", rpki_cache_state_to_str(old_state), rpki_cache_state_to_str(new_state)); + + switch (new_state) + { + case RPKI_CS_CONNECTING: + { + sock *sk = cache->tr_sock->sk; + + if (sk == NULL || sk->fd < 0) + rpki_open_connection(cache); + else + rpki_cache_change_state(cache, RPKI_CS_SYNC_START); + + rpki_schedule_next_retry(cache); + break; + } + + case RPKI_CS_ESTABLISHED: + rpki_schedule_next_refresh(cache); + rpki_schedule_next_expire_check(cache); + rpki_stop_retry_timer_event(cache); + break; + + case RPKI_CS_RESET: + /* Resetting cache connection. */ + cache->request_session_id = 1; + cache->serial_num = 0; + rpki_cache_change_state(cache, RPKI_CS_SYNC_START); + break; + + case RPKI_CS_SYNC_START: + /* Requesting for receive ROAs from a cache server. */ + if (cache->request_session_id) + { + /* Send request for Session ID */ + if (rpki_send_reset_query(cache) != RPKI_SUCCESS) + rpki_cache_change_state(cache, RPKI_CS_ERROR_TRANSPORT); + } + else + { + /* We have already a session_id. So send a Serial Query and start an incremental sync */ + if (rpki_send_serial_query(cache) != RPKI_SUCCESS) + rpki_cache_change_state(cache, RPKI_CS_ERROR_TRANSPORT); + } + break; + + case RPKI_CS_SYNC_RUNNING: + /* The state between Cache Response and End of Data. Only waiting for + * receiving all IP Prefix PDUs and finally a End of Data PDU. */ + break; + + case RPKI_CS_NO_INCR_UPDATE_AVAIL: + /* Server was unable to answer the last Serial Query and sent Cache Reset. */ + rpki_cache_change_state(cache, RPKI_CS_RESET); + break; + + case RPKI_CS_ERROR_NO_DATA_AVAIL: + /* No validation records are available on the cache server. */ + rpki_cache_change_state(cache, RPKI_CS_RESET); + break; + + case RPKI_CS_ERROR_FATAL: + /* Fatal protocol error occurred. */ + rpki_force_restart_proto(cache->p); + break; + + case RPKI_CS_ERROR_TRANSPORT: + /* Error on the transport socket occurred. 
*/ + rpki_close_connection(cache); + rpki_schedule_next_retry(cache); + rpki_stop_refresh_timer_event(cache); + break; + + case RPKI_CS_FAST_RECONNECT: + /* Reconnect without any waiting period */ + rpki_close_connection(cache); + rpki_cache_change_state(cache, RPKI_CS_CONNECTING); + break; + + case RPKI_CS_SHUTDOWN: + bug("This isn't never really called."); + break; + }; +} + + +/* + * RPKI Timer Events + */ + +static void +rpki_schedule_next_refresh(struct rpki_cache *cache) +{ + btime t = cache->refresh_interval S; + + CACHE_DBG(cache, "after %t s", t); + tm_start(cache->refresh_timer, t); +} + +static void +rpki_schedule_next_retry(struct rpki_cache *cache) +{ + btime t = cache->retry_interval S; + + CACHE_DBG(cache, "after %t s", t); + tm_start(cache->retry_timer, t); +} + +static void +rpki_schedule_next_expire_check(struct rpki_cache *cache) +{ + /* A minimum time to wait is 1 second */ + btime t = cache->last_update + cache->expire_interval S - current_time(); + t = MAX(t, 1 S); + + CACHE_DBG(cache, "after %t s", t); + tm_start(cache->expire_timer, t); +} + +static void +rpki_stop_refresh_timer_event(struct rpki_cache *cache) +{ + CACHE_DBG(cache, "Stop"); + tm_stop(cache->refresh_timer); +} + +static void +rpki_stop_retry_timer_event(struct rpki_cache *cache) +{ + CACHE_DBG(cache, "Stop"); + tm_stop(cache->retry_timer); +} + +static void UNUSED +rpki_stop_expire_timer_event(struct rpki_cache *cache) +{ + CACHE_DBG(cache, "Stop"); + tm_stop(cache->expire_timer); +} + +static int +rpki_do_we_recv_prefix_pdu_in_last_seconds(struct rpki_cache *cache) +{ + if (!cache->last_rx_prefix) + return 0; + + return ((current_time() - cache->last_rx_prefix) <= 2 S); +} + +/** + * rpki_refresh_hook - control a scheduling of downloading data from cache server + * @tm: refresh timer with cache connection instance in data + * + * This function is periodically called during &ESTABLISHED or &SYNC* state + * cache connection. The first refresh schedule is invoked after receiving a + * |End of Data| PDU and has run by some &ERROR is occurred. + */ +static void +rpki_refresh_hook(timer *tm) +{ + struct rpki_cache *cache = tm->data; + + CACHE_DBG(cache, "%s", rpki_cache_state_to_str(cache->state)); + + switch (cache->state) + { + case RPKI_CS_ESTABLISHED: + rpki_cache_change_state(cache, RPKI_CS_SYNC_START); + break; + + case RPKI_CS_SYNC_START: + /* We sent Serial/Reset Query in last refresh hook call + * and didn't receive Cache Response yet. It is probably + * troubles with network. */ + case RPKI_CS_SYNC_RUNNING: + /* We sent Serial/Reset Query in last refresh hook call + * and we got Cache Response but didn't get End-Of-Data yet. + * It could be a trouble with network or only too long synchronization. */ + if (!rpki_do_we_recv_prefix_pdu_in_last_seconds(cache)) + { + CACHE_TRACE(D_EVENTS, cache, "Sync takes more time than refresh interval %us, resetting connection", cache->refresh_interval); + rpki_cache_change_state(cache, RPKI_CS_ERROR_TRANSPORT); + } + break; + + default: + break; + } + + if (cache->state != RPKI_CS_SHUTDOWN && cache->state != RPKI_CS_ERROR_TRANSPORT) + rpki_schedule_next_refresh(cache); + else + rpki_stop_refresh_timer_event(cache); +} + +/** + * rpki_retry_hook - control a scheduling of retrying connection to cache server + * @tm: retry timer with cache connection instance in data + * + * This function is periodically called during &ERROR* state cache connection. 
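 * If the previous attempt is still stuck in |Connecting| or a |Sync-*| state
 * when the hook fires again (and no Prefix PDU has arrived within the last
 * few seconds), the connection is reset instead of being waited for.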
+ * The first retry schedule is invoked after any &ERROR* state occurred and + * ends by reaching of &ESTABLISHED state again. + */ +static void +rpki_retry_hook(timer *tm) +{ + struct rpki_cache *cache = tm->data; + + CACHE_DBG(cache, "%s", rpki_cache_state_to_str(cache->state)); + + switch (cache->state) + { + case RPKI_CS_ESTABLISHED: + case RPKI_CS_SHUTDOWN: + break; + + case RPKI_CS_CONNECTING: + case RPKI_CS_SYNC_START: + case RPKI_CS_SYNC_RUNNING: + if (!rpki_do_we_recv_prefix_pdu_in_last_seconds(cache)) + { + /* We tried to establish a connection in last retry hook call and haven't done + * yet. It looks like troubles with network. We are aggressive here. */ + CACHE_TRACE(D_EVENTS, cache, "Sync takes more time than retry interval %us, resetting connection.", cache->retry_interval); + rpki_cache_change_state(cache, RPKI_CS_ERROR_TRANSPORT); + } + break; + + default: + rpki_cache_change_state(cache, RPKI_CS_CONNECTING); + break; + } + + if (cache->state != RPKI_CS_ESTABLISHED) + rpki_schedule_next_retry(cache); + else + rpki_stop_retry_timer_event(cache); +} + +/** + * rpki_expire_hook - control a expiration of ROA entries + * @tm: expire timer with cache connection instance in data + * + * This function is scheduled after received a |End of Data| PDU. + * A waiting interval is calculated dynamically by last update. + * If we reach an expiration time then we invoke a restarting + * of the protocol. + */ +static void +rpki_expire_hook(timer *tm) +{ + struct rpki_cache *cache = tm->data; + + if (!cache->last_update) + return; + + CACHE_DBG(cache, "%s", rpki_cache_state_to_str(cache->state)); + + btime t = cache->last_update + cache->expire_interval S - current_time(); + if (t <= 0) + { + CACHE_TRACE(D_EVENTS, cache, "All ROAs expired"); + rpki_force_restart_proto(cache->p); + } + else + { + CACHE_DBG(cache, "Remains %t seconds to become ROAs obsolete", t); + rpki_schedule_next_expire_check(cache); + } +} + +/** + * rpki_check_refresh_interval - check validity of refresh interval value + * @seconds: suggested value + * + * This function validates value and should return |NULL|. + * If the check doesn't pass then returns error message. + */ +const char * +rpki_check_refresh_interval(uint seconds) +{ + if (seconds < 1) + return "Minimum allowed refresh interval is 1 second"; + if (seconds > 86400) + return "Maximum allowed refresh interval is 86400 seconds"; + return NULL; +} + +/** + * rpki_check_retry_interval - check validity of retry interval value + * @seconds: suggested value + * + * This function validates value and should return |NULL|. + * If the check doesn't pass then returns error message. + */ +const char * +rpki_check_retry_interval(uint seconds) +{ + if (seconds < 1) + return "Minimum allowed retry interval is 1 second"; + if (seconds > 7200) + return "Maximum allowed retry interval is 7200 seconds"; + return NULL; +} + +/** + * rpki_check_expire_interval - check validity of expire interval value + * @seconds: suggested value + * + * This function validates value and should return |NULL|. + * If the check doesn't pass then returns error message. 
+ */ +const char * +rpki_check_expire_interval(uint seconds) +{ + if (seconds < 600) + return "Minimum allowed expire interval is 600 seconds"; + if (seconds > 172800) + return "Maximum allowed expire interval is 172800 seconds"; + return NULL; +} + + +/* + * RPKI Cache + */ + +static struct rpki_cache * +rpki_init_cache(struct rpki_proto *p, struct rpki_config *cf) +{ + pool *pool = rp_new(p->p.pool, cf->hostname); + + struct rpki_cache *cache = mb_allocz(pool, sizeof(struct rpki_cache)); + + cache->pool = pool; + cache->p = p; + + cache->state = RPKI_CS_SHUTDOWN; + cache->request_session_id = 1; + cache->version = RPKI_MAX_VERSION; + + cache->refresh_interval = cf->refresh_interval; + cache->retry_interval = cf->retry_interval; + cache->expire_interval = cf->expire_interval; + cache->refresh_timer = tm_new_init(pool, &rpki_refresh_hook, cache, 0, 0); + cache->retry_timer = tm_new_init(pool, &rpki_retry_hook, cache, 0, 0); + cache->expire_timer = tm_new_init(pool, &rpki_expire_hook, cache, 0, 0); + + cache->tr_sock = mb_allocz(pool, sizeof(struct rpki_tr_sock)); + cache->tr_sock->cache = cache; + + switch (cf->tr_config.type) + { + case RPKI_TR_TCP: rpki_tr_tcp_init(cache->tr_sock); break; + case RPKI_TR_SSH: rpki_tr_ssh_init(cache->tr_sock); break; + }; + + CACHE_DBG(cache, "Connection object created"); + + return cache; +} + +/** + * rpki_get_cache_ident - give a text representation of cache server name + * @cache: RPKI connection instance + * + * The function converts cache connection into string. + */ +const char * +rpki_get_cache_ident(struct rpki_cache *cache) +{ + return rpki_tr_ident(cache->tr_sock); +} + +static int +rpki_open_connection(struct rpki_cache *cache) +{ + CACHE_TRACE(D_EVENTS, cache, "Opening a connection"); + + if (rpki_tr_open(cache->tr_sock) == RPKI_TR_ERROR) + { + rpki_cache_change_state(cache, RPKI_CS_ERROR_TRANSPORT); + return RPKI_TR_ERROR; + } + + return RPKI_TR_SUCCESS; +} + +static void +rpki_close_connection(struct rpki_cache *cache) +{ + CACHE_TRACE(D_EVENTS, cache, "Closing a connection"); + rpki_tr_close(cache->tr_sock); + proto_notify_state(&cache->p->p, PS_START); +} + +static int +rpki_shutdown(struct proto *P) +{ + struct rpki_proto *p = (void *) P; + + rpki_force_restart_proto(p); + + /* Protocol memory pool will be automatically freed */ + return PS_DOWN; +} + + +/* + * RPKI Reconfiguration + */ + +static int +rpki_try_fast_reconnect(struct rpki_cache *cache) +{ + if (cache->state == RPKI_CS_ESTABLISHED) + { + rpki_cache_change_state(cache, RPKI_CS_FAST_RECONNECT); + return SUCCESSFUL_RECONF; + } + + return NEED_RESTART; +} + +/** + * rpki_reconfigure_cache - a cache reconfiguration + * @p: RPKI protocol instance + * @cache: a cache connection + * @new: new RPKI configuration + * @old: old RPKI configuration + * + * This function reconfigures existing single cache server connection with new + * existing configuration. Generally, a change of time intervals could be + * reconfigured without restarting and all others changes requires a restart of + * protocol. Returns |NEED_TO_RESTART| or |SUCCESSFUL_RECONF|. 
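 * For example, changing only the configured intervals is handled by a fast
 * reconnect of an established session, while changing the remote address,
 * port or transport type always restarts the whole protocol.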
+ */ +static int +rpki_reconfigure_cache(struct rpki_proto *p UNUSED, struct rpki_cache *cache, struct rpki_config *new, struct rpki_config *old) +{ + u8 try_fast_reconnect = 0; + + if (strcmp(old->hostname, new->hostname) != 0) + { + CACHE_TRACE(D_EVENTS, cache, "Cache server address changed to %s", new->hostname); + return NEED_RESTART; + } + + if (old->port != new->port) + { + CACHE_TRACE(D_EVENTS, cache, "Cache server port changed to %u", new->port); + return NEED_RESTART; + } + + if (old->tr_config.type != new->tr_config.type) + { + CACHE_TRACE(D_EVENTS, cache, "Transport type changed"); + return NEED_RESTART; + } + else if (new->tr_config.type == RPKI_TR_SSH) + { + struct rpki_tr_ssh_config *ssh_old = (void *) old->tr_config.spec; + struct rpki_tr_ssh_config *ssh_new = (void *) new->tr_config.spec; + if ((strcmp(ssh_old->bird_private_key, ssh_new->bird_private_key) != 0) || + (strcmp(ssh_old->cache_public_key, ssh_new->cache_public_key) != 0) || + (strcmp(ssh_old->user, ssh_new->user) != 0)) + { + CACHE_TRACE(D_EVENTS, cache, "Settings of SSH transport configuration changed"); + try_fast_reconnect = 1; + } + } + +#define TEST_INTERVAL(name, Name) \ + if (cache->name##_interval != new->name##_interval || \ + old->keep_##name##_interval != new->keep_##name##_interval) \ + { \ + cache->name##_interval = new->name##_interval; \ + CACHE_TRACE(D_EVENTS, cache, #Name " interval changed to %u seconds %s", cache->name##_interval, (new->keep_##name##_interval ? "and keep it" : "")); \ + try_fast_reconnect = 1; \ + } + TEST_INTERVAL(refresh, Refresh); + TEST_INTERVAL(retry, Retry); + TEST_INTERVAL(expire, Expire); +#undef TEST_INTERVAL + + if (try_fast_reconnect) + return rpki_try_fast_reconnect(cache); + + return SUCCESSFUL_RECONF; +} + +/** + * rpki_reconfigure - a protocol reconfiguration hook + * @P: a protocol instance + * @CF: a new protocol configuration + * + * This function reconfigures whole protocol. + * It sets new protocol configuration into a protocol structure. + * Returns |NEED_TO_RESTART| or |SUCCESSFUL_RECONF|. 
+ */ +static int +rpki_reconfigure(struct proto *P, struct proto_config *CF) +{ + struct rpki_proto *p = (void *) P; + struct rpki_config *new = (void *) CF; + struct rpki_config *old = (void *) p->p.cf; + struct rpki_cache *cache = p->cache; + + if (!proto_configure_channel(&p->p, &p->roa4_channel, proto_cf_find_channel(CF, NET_ROA4)) || + !proto_configure_channel(&p->p, &p->roa6_channel, proto_cf_find_channel(CF, NET_ROA6))) + return NEED_RESTART; + + if (rpki_reconfigure_cache(p, cache, new, old) != SUCCESSFUL_RECONF) + return NEED_RESTART; + + return SUCCESSFUL_RECONF; +} + + +/* + * RPKI Protocol Glue + */ + +static struct proto * +rpki_init(struct proto_config *CF) +{ + struct proto *P = proto_new(CF); + struct rpki_proto *p = (void *) P; + + proto_configure_channel(&p->p, &p->roa4_channel, proto_cf_find_channel(CF, NET_ROA4)); + proto_configure_channel(&p->p, &p->roa6_channel, proto_cf_find_channel(CF, NET_ROA6)); + + return P; +} + +static int +rpki_start(struct proto *P) +{ + struct rpki_proto *p = (void *) P; + struct rpki_config *cf = (void *) P->cf; + + p->cache = rpki_init_cache(p, cf); + rpki_start_cache(p->cache); + + return PS_START; +} + +static void +rpki_get_status(struct proto *P, byte *buf) +{ + struct rpki_proto *p = (struct rpki_proto *) P; + + if (P->proto_state == PS_DOWN) + { + *buf = 0; + return; + } + + if (p->cache) + bsprintf(buf, "%s", rpki_cache_state_to_str(p->cache->state)); + else + bsprintf(buf, "No cache server configured"); +} + +static void +rpki_show_proto_info_timer(const char *name, uint num, timer *t) +{ + if (tm_active(t)) + cli_msg(-1006, " %-16s: %t/%u", name, tm_remains(t), num); + else + cli_msg(-1006, " %-16s: ---", name); +} + +static void +rpki_show_proto_info(struct proto *P) +{ + struct rpki_proto *p = (struct rpki_proto *) P; + struct rpki_config *cf = (void *) p->p.cf; + struct rpki_cache *cache = p->cache; + + if (P->proto_state == PS_DOWN) + return; + + if (cache) + { + const char *transport_name = "---"; + + switch (cf->tr_config.type) + { + case RPKI_TR_SSH: transport_name = "SSHv2"; break; + case RPKI_TR_TCP: transport_name = "Unprotected over TCP"; break; + }; + + cli_msg(-1006, " Cache server: %s", rpki_get_cache_ident(cache)); + cli_msg(-1006, " Status: %s", rpki_cache_state_to_str(cache->state)); + cli_msg(-1006, " Transport: %s", transport_name); + cli_msg(-1006, " Protocol version: %u", cache->version); + + if (cache->request_session_id) + cli_msg(-1006, " Session ID: ---"); + else + cli_msg(-1006, " Session ID: %u", cache->session_id); + + if (cache->last_update) + { + cli_msg(-1006, " Serial number: %u", cache->serial_num); + cli_msg(-1006, " Last update: before %t s", current_time() - cache->last_update); + } + else + { + cli_msg(-1006, " Serial number: ---"); + cli_msg(-1006, " Last update: ---"); + } + + rpki_show_proto_info_timer("Refresh timer", cache->refresh_interval, cache->refresh_timer); + rpki_show_proto_info_timer("Retry timer", cache->retry_interval, cache->retry_timer); + rpki_show_proto_info_timer("Expire timer", cache->expire_interval, cache->expire_timer); + + if (p->roa4_channel) + channel_show_info(p->roa4_channel); + else + cli_msg(-1006, " No roa4 channel"); + + if (p->roa6_channel) + channel_show_info(p->roa6_channel); + else + cli_msg(-1006, " No roa6 channel"); + } +} + + +/* + * RPKI Protocol Configuration + */ + +/** + * rpki_check_config - check and complete configuration of RPKI protocol + * @cf: RPKI configuration + * + * This function is called at the end of parsing RPKI protocol 
configuration. + */ +void +rpki_check_config(struct rpki_config *cf) +{ + /* Do not check templates at all */ + if (cf->c.class == SYM_TEMPLATE) + return; + + if (ipa_zero(cf->ip) && cf->hostname == NULL) + cf_error("IP address or hostname of cache server must be set"); + + /* Set default transport type */ + if (cf->tr_config.spec == NULL) + { + cf->tr_config.spec = cfg_allocz(sizeof(struct rpki_tr_tcp_config)); + cf->tr_config.type = RPKI_TR_TCP; + } + + if (cf->port == 0) + { + /* Set default port numbers */ + switch (cf->tr_config.type) + { + case RPKI_TR_SSH: + cf->port = RPKI_SSH_PORT; + break; + default: + cf->port = RPKI_TCP_PORT; + } + } +} + +static void +rpki_postconfig(struct proto_config *CF) +{ + /* Define default channel */ + if (EMPTY_LIST(CF->channels)) + channel_config_new(NULL, CF->net_type, CF); +} + +static void +rpki_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUSED) +{ + /* FIXME: Should copy transport */ +} + +struct protocol proto_rpki = { + .name = "RPKI", + .template = "rpki%d", + .preference = DEF_PREF_RPKI, + .proto_size = sizeof(struct rpki_proto), + .config_size = sizeof(struct rpki_config), + .init = rpki_init, + .start = rpki_start, + .postconfig = rpki_postconfig, + .channel_mask = (NB_ROA4 | NB_ROA6), + .show_proto_info = rpki_show_proto_info, + .shutdown = rpki_shutdown, + .copy_config = rpki_copy_config, + .reconfigure = rpki_reconfigure, + .get_status = rpki_get_status, +}; diff --git a/proto/rpki/rpki.h b/proto/rpki/rpki.h new file mode 100644 index 00000000..8972b33a --- /dev/null +++ b/proto/rpki/rpki.h @@ -0,0 +1,165 @@ +/* + * BIRD -- The Resource Public Key Infrastructure (RPKI) to Router Protocol + * + * (c) 2015 CZ.NIC + * (c) 2015 Pavel Tvrdik <pawel.tvrdik@gmail.com> + * + * Using RTRlib: http://rpki.realmv6.org/ + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_RPKI_H_ +#define _BIRD_RPKI_H_ + +#include "nest/bird.h" +#include "nest/route.h" +#include "nest/protocol.h" +#include "lib/socket.h" +#include "lib/ip.h" + +#include "transport.h" +#include "packets.h" + +#define RPKI_TCP_PORT 323 +#define RPKI_SSH_PORT 22 +#define RPKI_RETRY_INTERVAL 600 +#define RPKI_REFRESH_INTERVAL 3600 +#define RPKI_EXPIRE_INTERVAL 7200 + +#define RPKI_VERSION_0 0 +#define RPKI_VERSION_1 1 +#define RPKI_MAX_VERSION RPKI_VERSION_1 + + +/* + * RPKI Cache + */ + +enum rpki_cache_state { + RPKI_CS_CONNECTING, /* Socket is establishing the transport connection. */ + RPKI_CS_ESTABLISHED, /* Connection is established, socket is waiting for a Serial Notify or expiration of the refresh_interval timer */ + RPKI_CS_RESET, /* Resetting RTR connection. */ + RPKI_CS_SYNC_START, /* Sending a Serial/Reset Query PDU and expecting a Cache Response PDU */ + RPKI_CS_SYNC_RUNNING, /* Receiving validation records from the RTR server. A state between Cache Response PDU and End of Data PDU */ + RPKI_CS_FAST_RECONNECT, /* Reconnect without any waiting period */ + RPKI_CS_NO_INCR_UPDATE_AVAIL, /* Server is unable to answer the last Serial Query and sent Cache Reset. */ + RPKI_CS_ERROR_NO_DATA_AVAIL, /* Server is unable to answer either a Serial Query or a Reset Query because it has no useful data available at this time. */ + RPKI_CS_ERROR_FATAL, /* Fatal protocol error occurred. */ + RPKI_CS_ERROR_TRANSPORT, /* Error on the transport socket occurred. */ + RPKI_CS_SHUTDOWN, /* RTR Socket is stopped. 
*/ +}; + +struct rpki_cache { + pool *pool; /* Pool containing cache objects */ + struct rpki_proto *p; + + struct rpki_tr_sock *tr_sock; /* Transport specific socket */ + enum rpki_cache_state state; /* RPKI_CS_* */ + u32 session_id; + u8 request_session_id; /* 1: have to request new session id; 0: we have already received session id */ + u32 serial_num; /* Serial number denotes the logical version of data from cache server */ + u8 version; /* Protocol version */ + btime last_update; /* Last successful synchronization with cache server */ + btime last_rx_prefix; /* Last received prefix PDU */ + + /* Intervals can be changed by cache server on the fly */ + u32 refresh_interval; /* Actual refresh interval (in seconds) */ + u32 retry_interval; + u32 expire_interval; + timer *retry_timer; /* Retry timer event */ + timer *refresh_timer; /* Refresh timer event */ + timer *expire_timer; /* Expire timer event */ +}; + +const char *rpki_get_cache_ident(struct rpki_cache *cache); +const char *rpki_cache_state_to_str(enum rpki_cache_state state); + + +/* + * Routes handling + */ + +void rpki_table_add_roa(struct rpki_cache *cache, struct channel *channel, const net_addr_union *pfxr); +void rpki_table_remove_roa(struct rpki_cache *cache, struct channel *channel, const net_addr_union *pfxr); + + +/* + * RPKI Protocol Logic + */ + +void rpki_cache_change_state(struct rpki_cache *cache, const enum rpki_cache_state new_state); + + +/* + * RPKI Timer Events + */ + +const char *rpki_check_refresh_interval(uint seconds); +const char *rpki_check_retry_interval(uint seconds); +const char *rpki_check_expire_interval(uint seconds); + + +/* + * RPKI Protocol Configuration + */ + +struct rpki_proto { + struct proto p; + struct rpki_cache *cache; + + struct channel *roa4_channel; + struct channel *roa6_channel; + u8 refresh_channels; /* For non-incremental updates using rt_refresh_begin(), rt_refresh_end() */ +}; + +struct rpki_config { + struct proto_config c; + const char *hostname; /* Full domain name or stringified IP address of cache server */ + ip_addr ip; /* IP address of cache server or IPA_NONE */ + u16 port; /* Port number of cache server */ + struct rpki_tr_config tr_config; /* Specific transport configuration structure */ + u32 refresh_interval; /* Time interval (in seconds) for periodical downloading data from cache server */ + u32 retry_interval; /* Time interval (in seconds) for an unreachable server */ + u32 expire_interval; /* Maximal lifetime (in seconds) of ROAs without any successful refreshment */ + u8 keep_refresh_interval:1; /* Do not overwrite refresh interval by cache server update */ + u8 keep_retry_interval:1; /* Do not overwrite retry interval by cache server update */ + u8 keep_expire_interval:1; /* Do not overwrite expire interval by cache server update */ +}; + +void rpki_check_config(struct rpki_config *cf); + + +/* + * Logger + */ + +#define RPKI_LOG(log_level, rpki, msg, args...) \ + do { \ + log(log_level "%s: " msg, (rpki)->p.name , ## args); \ + } while(0) + +#if defined(LOCAL_DEBUG) || defined(GLOBAL_DEBUG) +#define CACHE_DBG(cache,msg,args...) \ + do { \ + RPKI_LOG(L_DEBUG, (cache)->p, "%s [%s] %s " msg, rpki_get_cache_ident(cache), rpki_cache_state_to_str((cache)->state), __func__, ## args); \ + } while(0) +#else +#define CACHE_DBG(cache,msg,args...) do { } while(0) +#endif + +#define RPKI_TRACE(level,rpki,msg,args...) \ + do { \ + if ((rpki)->p.debug & level) \ + RPKI_LOG(L_TRACE, rpki, msg, ## args); \ + } while(0) + +#define CACHE_TRACE(level,cache,msg,args...) 
\ + do { \ + if ((cache)->p->p.debug & level) \ + RPKI_LOG(L_TRACE, (cache)->p, msg, ## args); \ + } while(0) + +#define RPKI_WARN(p, msg, args...) RPKI_LOG(L_WARN, p, msg, ## args); + +#endif /* _BIRD_RPKI_H_ */ diff --git a/proto/rpki/ssh_transport.c b/proto/rpki/ssh_transport.c new file mode 100644 index 00000000..cd49ab90 --- /dev/null +++ b/proto/rpki/ssh_transport.c @@ -0,0 +1,75 @@ +/* + * BIRD -- An implementation of the SSH protocol for the RPKI transport + * + * (c) 2015 CZ.NIC + * (c) 2015 Pavel Tvrdik <pawel.tvrdik@gmail.com> + * + * This file was a part of RTRlib: http://rpki.realmv6.org/ + * This transport implementation uses libssh (http://www.libssh.org/) + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> + +#include "rpki.h" + +static int +rpki_tr_ssh_open(struct rpki_tr_sock *tr) +{ + struct rpki_cache *cache = tr->cache; + struct rpki_config *cf = (void *) cache->p->p.cf; + struct rpki_tr_ssh_config *ssh_cf = (void *) cf->tr_config.spec; + sock *sk = tr->sk; + + sk->type = SK_SSH_ACTIVE; + sk->ssh = mb_allocz(sk->pool, sizeof(struct ssh_sock)); + sk->ssh->username = ssh_cf->user; + sk->ssh->client_privkey_path = ssh_cf->bird_private_key; + sk->ssh->server_hostkey_path = ssh_cf->cache_public_key; + sk->ssh->subsystem = "rpki-rtr"; + sk->ssh->state = SK_SSH_CONNECT; + + if (sk_open(sk) != 0) + return RPKI_TR_ERROR; + + return RPKI_TR_SUCCESS; +} + +static const char * +rpki_tr_ssh_ident(struct rpki_tr_sock *tr) +{ + ASSERT(tr != NULL); + + struct rpki_cache *cache = tr->cache; + struct rpki_config *cf = (void *) cache->p->p.cf; + struct rpki_tr_ssh_config *ssh_cf = (void *) cf->tr_config.spec; + + if (tr->ident != NULL) + return tr->ident; + + const char *username = ssh_cf->user; + const char *host = cf->hostname; + u16 port = cf->port; + + size_t len = strlen(username) + 1 + strlen(host) + 1 + 5 + 1; /* <user> + '@' + <host> + ':' + <port> + '\0' */ + char *ident = mb_alloc(cache->pool, len); + bsnprintf(ident, len, "%s@%s:%u", username, host, port); + tr->ident = ident; + + return tr->ident; +} + +/** + * rpki_tr_ssh_init - initializes the RPKI transport structure for a SSH connection + * @tr: allocated RPKI transport structure + */ +void +rpki_tr_ssh_init(struct rpki_tr_sock *tr) +{ + tr->open_fp = &rpki_tr_ssh_open; + tr->ident_fp = &rpki_tr_ssh_ident; +} diff --git a/proto/rpki/tcp_transport.c b/proto/rpki/tcp_transport.c new file mode 100644 index 00000000..6c05964a --- /dev/null +++ b/proto/rpki/tcp_transport.c @@ -0,0 +1,78 @@ +/* + * BIRD -- An implementation of the TCP protocol for the RPKI protocol transport + * + * (c) 2015 CZ.NIC + * (c) 2015 Pavel Tvrdik <pawel.tvrdik@gmail.com> + * + * This file was a part of RTRlib: http://rpki.realmv6.org/ + * + * Can be freely distributed and used under the terms of the GNU GPL. 
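The *_init() functions above follow one pattern: a transport only has to supply an open hook and an ident hook on the rpki_tr_sock. A hypothetical third transport (the my_tr_* names are illustrative and not part of this patch) would be wired up the same way:

static int
my_tr_open(struct rpki_tr_sock *tr)
{
  sock *sk = tr->sk;
  sk->type = SK_TCP_ACTIVE;   /* transport-specific socket setup goes here */
  return (sk_open(sk) == 0) ? RPKI_TR_SUCCESS : RPKI_TR_ERROR;
}

static const char *
my_tr_ident(struct rpki_tr_sock *tr UNUSED)
{
  return "my-transport";      /* normally built from the cache configuration */
}

void
my_tr_init(struct rpki_tr_sock *tr)
{
  tr->open_fp = &my_tr_open;
  tr->ident_fp = &my_tr_ident;
}
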
+ */ + +#include <errno.h> +#include <netdb.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <unistd.h> + +#include "rpki.h" +#include "sysdep/unix/unix.h" + +static int +rpki_tr_tcp_open(struct rpki_tr_sock *tr) +{ + sock *sk = tr->sk; + + sk->type = SK_TCP_ACTIVE; + + if (sk_open(sk) != 0) + return RPKI_TR_ERROR; + + return RPKI_TR_SUCCESS; +} + +static const char * +rpki_tr_tcp_ident(struct rpki_tr_sock *tr) +{ + ASSERT(tr != NULL); + + struct rpki_cache *cache = tr->cache; + struct rpki_config *cf = (void *) cache->p->p.cf; + + if (tr->ident != NULL) + return tr->ident; + + const char *host = cf->hostname; + ip_addr ip = cf->ip; + u16 port = cf->port; + + size_t colon_and_port_len = 6; /* max ":65535" */ + size_t ident_len; + if (host) + ident_len = strlen(host) + colon_and_port_len + 1; + else + ident_len = IPA_MAX_TEXT_LENGTH + colon_and_port_len + 1; + + char *ident = mb_alloc(cache->pool, ident_len); + if (host) + bsnprintf(ident, ident_len, "%s:%u", host, port); + else + bsnprintf(ident, ident_len, "%I:%u", ip, port); + + tr->ident = ident; + return tr->ident; +} + +/** + * rpki_tr_tcp_init - initializes the RPKI transport structure for a TCP connection + * @tr: allocated RPKI transport structure + */ +void +rpki_tr_tcp_init(struct rpki_tr_sock *tr) +{ + tr->open_fp = &rpki_tr_tcp_open; + tr->ident_fp = &rpki_tr_tcp_ident; +} diff --git a/proto/rpki/transport.c b/proto/rpki/transport.c new file mode 100644 index 00000000..182667be --- /dev/null +++ b/proto/rpki/transport.c @@ -0,0 +1,135 @@ +/* + * BIRD -- The Resource Public Key Infrastructure (RPKI) to Router Protocol + * + * (c) 2015 CZ.NIC + * (c) 2015 Pavel Tvrdik <pawel.tvrdik@gmail.com> + * + * This file was a part of RTRlib: http://rpki.realmv6.org/ + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include <sys/socket.h> +#include <netdb.h> + +#include "rpki.h" +#include "transport.h" +#include "sysdep/unix/unix.h" + +/** + * rpki_hostname_autoresolv - auto-resolve an IP address from a hostname + * @host: domain name of host, e.g. "rpki-validator.realmv6.org" + * + * This function resolves an IP address from a hostname. + * Returns &ip_addr structure with IP address or |IPA_NONE|. + */ +static ip_addr +rpki_hostname_autoresolv(const char *host) +{ + ip_addr addr = {}; + struct addrinfo *res; + struct addrinfo hints = { + .ai_family = AF_UNSPEC, + .ai_socktype = SOCK_STREAM, + .ai_flags = AI_ADDRCONFIG, + }; + + if (!host) + return IPA_NONE; + + int err_code = getaddrinfo(host, NULL, &hints, &res); + if (err_code != 0) + { + log(L_DEBUG "getaddrinfo failed: %s", gai_strerror(err_code)); + return IPA_NONE; + } + + sockaddr sa = { + .sa = *res->ai_addr, + }; + + uint unused; + sockaddr_read(&sa, res->ai_family, &addr, NULL, &unused); + + freeaddrinfo(res); + return addr; +} + +/** + * rpki_tr_open - prepare and open a socket connection + * @tr: initialized transport socket + * + * Prepare and open a socket connection specified by @tr that must be initialized before. + * This function ends with a calling the sk_open() function. + * Returns RPKI_TR_SUCCESS or RPKI_TR_ERROR. 
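A minimal sketch of the expected call sequence, condensed from rpki_init_cache() and rpki_open_connection() earlier in this patch (the wrapper name example_open is made up for the illustration):

static void
example_open(struct rpki_cache *cache)
{
  struct rpki_tr_sock *tr = mb_allocz(cache->pool, sizeof(struct rpki_tr_sock));
  tr->cache = cache;

  rpki_tr_tcp_init(tr);                    /* or rpki_tr_ssh_init(tr) */

  if (rpki_tr_open(tr) == RPKI_TR_ERROR)   /* allocates tr->sk, then calls sk_open() */
    rpki_cache_change_state(cache, RPKI_CS_ERROR_TRANSPORT);
}
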
+ */ +int +rpki_tr_open(struct rpki_tr_sock *tr) +{ + struct rpki_cache *cache = tr->cache; + struct rpki_config *cf = (void *) cache->p->p.cf; + + ASSERT(tr->sk == NULL); + tr->sk = sk_new(cache->pool); + sock *sk = tr->sk; + + /* sk->type -1 is invalid value, a correct value MUST be set in the specific transport layer in open_fp() hook */ + sk->type = -1; + + sk->tx_hook = rpki_connected_hook; + sk->err_hook = rpki_err_hook; + sk->data = cache; + sk->daddr = cf->ip; + sk->dport = cf->port; + sk->host = cf->hostname; + sk->rbsize = RPKI_RX_BUFFER_SIZE; + sk->tbsize = RPKI_TX_BUFFER_SIZE; + sk->tos = IP_PREC_INTERNET_CONTROL; + + if (ipa_zero2(sk->daddr) && sk->host) + { + sk->daddr = rpki_hostname_autoresolv(sk->host); + if (ipa_zero(sk->daddr)) + { + CACHE_TRACE(D_EVENTS, cache, "Cannot resolve the hostname '%s'", sk->host); + return RPKI_TR_ERROR; + } + } + + return tr->open_fp(tr); +} + +/** + * rpki_tr_close - close socket and prepare it for possible next open + * @tr: successfully opened transport socket + * + * Close socket and free resources. + */ +void +rpki_tr_close(struct rpki_tr_sock *tr) +{ + if (tr->ident) + { + mb_free((char *) tr->ident); + tr->ident = NULL; + } + + if (tr->sk) + { + rfree(tr->sk); + tr->sk = NULL; + } +} + +/** + * rpki_tr_ident - Returns a string identifier for the rpki transport socket + * @tr: successfully opened transport socket + * + * Returns a \0 terminated string identifier for the socket endpoint, e.g. "<host>:<port>". + * Memory is allocated inside @tr structure. + */ +inline const char * +rpki_tr_ident(struct rpki_tr_sock *tr) +{ + return tr->ident_fp(tr); +} diff --git a/proto/rpki/transport.h b/proto/rpki/transport.h new file mode 100644 index 00000000..f90b7e42 --- /dev/null +++ b/proto/rpki/transport.h @@ -0,0 +1,79 @@ +/* + * BIRD -- The Resource Public Key Infrastructure (RPKI) to Router Protocol + * + * (c) 2015 CZ.NIC + * (c) 2015 Pavel Tvrdik <pawel.tvrdik@gmail.com> + * + * This file was a part of RTRlib: http://rpki.realmv6.org/ + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +/* + * The RPKI transport sockets implement the communication channel + * (e.g., SSH, TCP, TCP-AO) between an RPKI server and client. + * + * Before using the transport socket, a tr_socket must be + * initialized based on a protocol-dependent init function (e.g., + * rpki_tr_tcp_init()). + * + * The rpki_tr_* functions call the corresponding function pointers, which are + * passed in the rpki_tr_sock structure, and forward the remaining arguments. + */ + +#ifndef _BIRD_RPKI_TRANSPORT_H_ +#define _BIRD_RPKI_TRANSPORT_H_ + +#include <time.h> + +/* The return values for rpki_tr_ functions */ +enum rpki_tr_rtvals { + RPKI_TR_SUCCESS = 0, /* Operation was successful */ + RPKI_TR_ERROR = -1, /* Error occurred */ + RPKI_TR_WOULDBLOCK = -2, /* No data is available on the socket */ + RPKI_TR_INTR = -3, /* Call was interrupted from a signal */ + RPKI_TR_CLOSED = -4 /* Connection closed */ +}; + +/* A transport socket structure */ +struct rpki_tr_sock { + sock *sk; /* Standard BIRD socket */ + struct rpki_cache *cache; /* Cache server */ + int (*open_fp)(struct rpki_tr_sock *); /* Function that establishes the socket connection */ + const char *(*ident_fp)(struct rpki_tr_sock *); /* Function that returns an identifier for the socket endpoint */ + const char *ident; /* Internal. 
Use ident_fp() hook instead of this pointer */ +}; + +int rpki_tr_open(struct rpki_tr_sock *tr); +void rpki_tr_close(struct rpki_tr_sock *tr); +const char *rpki_tr_ident(struct rpki_tr_sock *tr); + +/* Types of supported transports */ +enum rpki_tr_type { + RPKI_TR_TCP, /* Unprotected transport over TCP */ + RPKI_TR_SSH, /* Protected transport by SSHv2 connection */ +}; + +/* Common configure structure for transports */ +struct rpki_tr_config { + enum rpki_tr_type type; /* RPKI_TR_TCP or RPKI_TR_SSH */ + const void *spec; /* Specific transport configuration, i.e. rpki_tr_tcp_config or rpki_tr_ssh_config */ +}; + +struct rpki_tr_tcp_config { + /* No internal configuration data */ +}; + +struct rpki_tr_ssh_config { + const char *bird_private_key; /* Filepath to the BIRD server private key */ + const char *cache_public_key; /* Filepath to the public key of cache server, can be file known_hosts */ + const char *user; /* Username for SSH connection */ +}; + +/* ssh_transport.c */ +void rpki_tr_ssh_init(struct rpki_tr_sock *tr); + +/* tcp_transport.c */ +void rpki_tr_tcp_init(struct rpki_tr_sock *tr); + +#endif /* _BIRD_RPKI_TRANSPORT_H_ */ diff --git a/proto/static/Makefile b/proto/static/Makefile index 61fadbea..e38f9b74 100644 --- a/proto/static/Makefile +++ b/proto/static/Makefile @@ -1,6 +1,6 @@ -source=static.c -root-rel=../../ -dir-name=proto/static - -include ../../Rules +src := static.c +obj := $(src-o-files) +$(all-daemon) +$(cf-local) +tests_objs := $(tests_objs) $(src-o-files)
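A short sketch, using only the transport structures declared above, of what an SSH transport configuration looks like once filled in; in the daemon this is produced by the config parser, and the literal values here are placeholders:

static void
example_ssh_config(struct rpki_config *cf)
{
  struct rpki_tr_ssh_config *ssh = cfg_allocz(sizeof(struct rpki_tr_ssh_config));

  ssh->user = "rtr-client";                            /* SSH username */
  ssh->bird_private_key = "/etc/bird/ssh/id_rsa";      /* BIRD's private key */
  ssh->cache_public_key = "/etc/bird/ssh/known_hosts"; /* cache server host key */

  cf->tr_config.type = RPKI_TR_SSH;
  cf->tr_config.spec = ssh;
}
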
\ No newline at end of file diff --git a/proto/static/config.Y b/proto/static/config.Y index 182721b3..66e5ea4c 100644 --- a/proto/static/config.Y +++ b/proto/static/config.Y @@ -13,98 +13,119 @@ CF_HDR CF_DEFINES #define STATIC_CFG ((struct static_config *) this_proto) -static struct static_route *this_srt, *this_srt_nh, *last_srt_nh; +static struct static_route *this_srt, *this_snh; static struct f_inst **this_srt_last_cmd; +static struct static_route * +static_nexthop_new(void) +{ + struct static_route *nh = this_srt; + + if (this_snh) + { + /* Additional next hop */ + nh = cfg_allocz(sizeof(struct static_route)); + nh->net = this_srt->net; + this_snh->mp_next = nh; + } + + nh->dest = RTD_UNICAST; + nh->mp_head = this_srt; + return nh; +}; + static void static_route_finish(void) { - struct static_route *r; - - /* Update undefined use_bfd entries in multipath nexthops */ - if (this_srt->dest == RTD_MULTIPATH) - for (r = this_srt->mp_next; r; r = r->mp_next) - if (r->use_bfd < 0) - r->use_bfd = this_srt->use_bfd; + if (net_type_match(this_srt->net, NB_DEST) == !this_srt->dest) + cf_error("Unexpected or missing nexthop/type"); } CF_DECLS CF_KEYWORDS(STATIC, ROUTE, VIA, DROP, REJECT, PROHIBIT, PREFERENCE, CHECK, LINK) -CF_KEYWORDS(MULTIPATH, WEIGHT, RECURSIVE, IGP, TABLE, BLACKHOLE, UNREACHABLE, BFD) +CF_KEYWORDS(ONLINK, WEIGHT, RECURSIVE, IGP, TABLE, BLACKHOLE, UNREACHABLE, BFD, MPLS) CF_GRAMMAR CF_ADDTO(proto, static_proto '}') -static_proto_start: proto_start STATIC { - this_proto = proto_config_new(&proto_static, $1); - static_init_config((struct static_config *) this_proto); - } - ; +static_proto_start: proto_start STATIC +{ + this_proto = proto_config_new(&proto_static, $1); + init_list(&STATIC_CFG->routes); +}; static_proto: static_proto_start proto_name '{' | static_proto proto_item ';' + | static_proto proto_channel ';' { this_proto->net_type = $2->net_type; } | static_proto CHECK LINK bool ';' { STATIC_CFG->check_link = $4; } - | static_proto IGP TABLE rtable ';' { STATIC_CFG->igp_table = $4; } + | static_proto IGP TABLE rtable ';' { + if ($4->addr_type == NET_IP4) + STATIC_CFG->igp_table_ip4 = $4; + else if ($4->addr_type == NET_IP6) + STATIC_CFG->igp_table_ip6 = $4; + else + cf_error("Incompatible IGP table type"); + } | static_proto stat_route stat_route_opt_list ';' { static_route_finish(); } ; -stat_route0: ROUTE prefix { - this_srt = cfg_allocz(sizeof(struct static_route)); - add_tail(&STATIC_CFG->other_routes, &this_srt->n); - this_srt->net = $2.addr; - this_srt->masklen = $2.len; - this_srt_last_cmd = &(this_srt->cmds); +stat_nexthop: + VIA ipa ipa_scope { + this_snh = static_nexthop_new(); + this_snh->via = $2; + this_snh->iface = $3; + } + | VIA TEXT { + this_snh = static_nexthop_new(); + this_snh->via = IPA_NONE; + this_snh->iface = if_get_by_name($2); + } + | stat_nexthop MPLS label_stack { + this_snh->mls = $3; } - ; + | stat_nexthop ONLINK bool { + this_snh->onlink = $3; + } + | stat_nexthop WEIGHT expr { + this_snh->weight = $3 - 1; + if (($3<1) || ($3>256)) cf_error("Weight must be in range 1-256"); + } + | stat_nexthop BFD bool { + this_snh->use_bfd = $3; cf_check_bfd($3); + } +; -stat_multipath1: - VIA ipa ipa_scope { - last_srt_nh = this_srt_nh; - this_srt_nh = cfg_allocz(sizeof(struct static_route)); - this_srt_nh->dest = RTD_NONE; - this_srt_nh->via = $2; - this_srt_nh->via_if = $3; - this_srt_nh->if_name = (void *) this_srt; /* really */ - this_srt_nh->use_bfd = -1; /* undefined */ - } - | stat_multipath1 WEIGHT expr { - this_srt_nh->masklen = $3 - 1; /* 
really */ - if (($3<1) || ($3>256)) cf_error("Weight must be in range 1-256"); - } - | stat_multipath1 BFD bool { - this_srt_nh->use_bfd = $3; cf_check_bfd($3); - } - ; +stat_nexthops: + stat_nexthop + | stat_nexthops stat_nexthop +; -stat_multipath: - stat_multipath1 { this_srt->mp_next = this_srt_nh; } - | stat_multipath stat_multipath1 { last_srt_nh->mp_next = this_srt_nh; } +stat_route0: ROUTE net_any { + this_srt = cfg_allocz(sizeof(struct static_route)); + add_tail(&STATIC_CFG->routes, &this_srt->n); + this_srt->net = $2; + this_srt_last_cmd = &(this_srt->cmds); + this_srt->mp_next = NULL; + this_snh = NULL; + } ; stat_route: - stat_route0 VIA ipa ipa_scope { - this_srt->dest = RTD_ROUTER; + stat_route0 stat_nexthops + | stat_route0 RECURSIVE ipa { + this_srt->dest = RTDX_RECURSIVE; this_srt->via = $3; - this_srt->via_if = $4; - } - | stat_route0 VIA TEXT { - this_srt->dest = RTD_DEVICE; - this_srt->if_name = $3; - rem_node(&this_srt->n); - add_tail(&STATIC_CFG->iface_routes, &this_srt->n); } - | stat_route0 MULTIPATH stat_multipath { - this_srt->dest = RTD_MULTIPATH; - } - | stat_route0 RECURSIVE ipa { + | stat_route0 RECURSIVE ipa MPLS label_stack { this_srt->dest = RTDX_RECURSIVE; this_srt->via = $3; + this_srt->mls = $5; } - + | stat_route0 { this_srt->dest = RTD_NONE; } | stat_route0 DROP { this_srt->dest = RTD_BLACKHOLE; } | stat_route0 REJECT { this_srt->dest = RTD_UNREACHABLE; } | stat_route0 BLACKHOLE { this_srt->dest = RTD_BLACKHOLE; } @@ -114,7 +135,6 @@ stat_route: stat_route_item: cmd { *this_srt_last_cmd = $1; this_srt_last_cmd = &($1->next); } - | BFD bool ';' { this_srt->use_bfd = $2; cf_check_bfd($2); } ; stat_route_opts: diff --git a/proto/static/static.c b/proto/static/static.c index 849067b9..ede4c734 100644 --- a/proto/static/static.c +++ b/proto/static/static.c @@ -9,33 +9,32 @@ /** * DOC: Static * - * The Static protocol is implemented in a straightforward way. It keeps - * two lists of static routes: one containing interface routes and one - * holding the remaining ones. Interface routes are inserted and removed according - * to interface events received from the core via the if_notify() hook. Routes - * pointing to a neighboring router use a sticky node in the neighbor cache - * to be notified about gaining or losing the neighbor. Special - * routes like black holes or rejects are inserted all the time. + * The Static protocol is implemented in a straightforward way. It keeps a list + * of static routes. Routes of dest RTD_UNICAST have associated sticky node in + * the neighbor cache to be notified about gaining or losing the neighbor and + * about interface-related events (e.g. link down). They may also have a BFD + * request if associated with a BFD session. When a route is notified, + * static_decide() is used to see whether the route activeness is changed. In + * such case, the route is marked as dirty and scheduled to be announced or + * withdrawn, which is done asynchronously from event hook. Routes of other + * types (e.g. black holes) are announced all the time. * - * Multipath routes are tricky. Because these routes depends on - * several neighbors we need to integrate that to the neighbor - * notification handling, we use dummy static_route nodes, one for - * each nexthop. 
Therefore, a multipath route consists of a master - * static_route node (of dest RTD_MULTIPATH), which specifies prefix - * and is used in most circumstances, and a list of dummy static_route - * nodes (of dest RTD_NONE), which stores info about nexthops and are - * connected to neighbor entries and neighbor notifications. Dummy - * nodes are chained using mp_next, they aren't in other_routes list, - * and abuse some fields (masklen, if_name) for other purposes. + * Multipath routes are a bit tricky. To represent additional next hops, dummy + * static_route nodes are used, which are chained using @mp_next field and link + * to the master node by @mp_head field. Each next hop has a separate neighbor + * entry and an activeness state, but the master node is used for most purposes. + * Note that most functions DO NOT accept dummy nodes as arguments. * * The only other thing worth mentioning is that when asked for reconfiguration, * Static not only compares the two configurations, but it also calculates - * difference between the lists of static routes and it just inserts the - * newly added routes and removes the obsolete ones. + * difference between the lists of static routes and it just inserts the newly + * added routes, removes the obsolete ones and reannounces changed ones. */ #undef LOCAL_DEBUG +#include <stdlib.h> + #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" @@ -50,107 +49,117 @@ static linpool *static_lp; -static inline rtable * -p_igp_table(struct proto *p) -{ - struct static_config *cf = (void *) p->cf; - return cf->igp_table ? cf->igp_table->table : p->table; -} - static void -static_install(struct proto *p, struct static_route *r, struct iface *ifa) +static_announce_rte(struct static_proto *p, struct static_route *r) { - net *n; - rta a; - rte *e; + rta *a = allocz(RTA_MAX_SIZE); + a->src = p->p.main_source; + a->source = RTS_STATIC; + a->scope = SCOPE_UNIVERSE; + a->dest = r->dest; - if (r->installed > 0) - return; + if (r->dest == RTD_UNICAST) + { + struct static_route *r2; + struct nexthop *nhs = NULL; - DBG("Installing static route %I/%d, rtd=%d\n", r->net, r->masklen, r->dest); - bzero(&a, sizeof(a)); - a.src = p->main_source; - a.source = (r->dest == RTD_DEVICE) ? RTS_STATIC_DEVICE : RTS_STATIC; - a.scope = SCOPE_UNIVERSE; - a.cast = RTC_UNICAST; - a.dest = r->dest; - a.gw = r->via; - a.iface = ifa; - - if (r->dest == RTD_MULTIPATH) + for (r2 = r; r2; r2 = r2->mp_next) { - struct static_route *r2; - struct mpnh *nhs = NULL; - - for (r2 = r->mp_next; r2; r2 = r2->mp_next) - if (r2->installed) - { - struct mpnh *nh = alloca(sizeof(struct mpnh)); - nh->gw = r2->via; - nh->iface = r2->neigh->iface; - nh->weight = r2->masklen; /* really */ - mpnh_insert(&nhs, nh); - } - - /* There is at least one nexthop */ - if (!nhs->next) - { - /* Fallback to unipath route for exactly one nexthop */ - a.dest = RTD_ROUTER; - a.gw = nhs->gw; - a.iface = nhs->iface; - } - else - a.nexthops = nhs; + if (!r2->active) + continue; + + struct nexthop *nh = allocz(NEXTHOP_MAX_SIZE); + nh->gw = r2->via; + nh->iface = r2->neigh->iface; + nh->flags = r2->onlink ? RNF_ONLINK : 0; + nh->weight = r2->weight; + if (r2->mls) + { + nh->labels = r2->mls->len; + memcpy(nh->label, r2->mls->stack, r2->mls->len * sizeof(u32)); + } + + nexthop_insert(&nhs, nh); } + if (!nhs) + goto withdraw; + + nexthop_link(a, nhs); + } + if (r->dest == RTDX_RECURSIVE) - rta_set_recursive_next_hop(p->table, &a, p_igp_table(p), &r->via, &r->via); + { + rtable *tab = ipa_is_ip4(r->via) ? 
p->igp_table_ip4 : p->igp_table_ip6; + rta_set_recursive_next_hop(p->p.main_channel->table, a, tab, r->via, IPA_NONE, r->mls); + } - /* We skip rta_lookup() here */ + /* Already announced */ + if (r->state == SRS_CLEAN) + return; - n = net_get(p->table, r->net, r->masklen); - e = rte_get_temp(&a); - e->net = n; + /* We skip rta_lookup() here */ + rte *e = rte_get_temp(a); e->pflags = 0; if (r->cmds) f_eval_rte(r->cmds, &e, static_lp); - rte_update(p, n, e); - r->installed = 1; + rte_update(&p->p, r->net, e); + r->state = SRS_CLEAN; if (r->cmds) lp_flush(static_lp); + + return; + +withdraw: + if (r->state == SRS_DOWN) + return; + + rte_update(&p->p, r->net, NULL); + r->state = SRS_DOWN; } static void -static_remove(struct proto *p, struct static_route *r) +static_mark_rte(struct static_proto *p, struct static_route *r) { - net *n; - - if (!r->installed) + if (r->state == SRS_DIRTY) return; - DBG("Removing static route %I/%d via %I\n", r->net, r->masklen, r->via); - n = net_find(p->table, r->net, r->masklen); - rte_update(p, n, NULL); - r->installed = 0; + r->state = SRS_DIRTY; + BUFFER_PUSH(p->marked) = r; + + if (!ev_active(p->event)) + ev_schedule(p->event); +} + +static void +static_announce_marked(void *P) +{ + struct static_proto *p = P; + + BUFFER_WALK(p->marked, r) + static_announce_rte(P, r); + + BUFFER_FLUSH(p->marked); } static void static_bfd_notify(struct bfd_request *req); static void -static_update_bfd(struct proto *p, struct static_route *r) +static_update_bfd(struct static_proto *p, struct static_route *r) { + /* The @r is a RTD_UNICAST next hop, may be a dummy node */ + struct neighbor *nb = r->neigh; int bfd_up = (nb->scope > 0) && r->use_bfd; if (bfd_up && !r->bfd_req) { // ip_addr local = ipa_nonzero(r->local) ? r->local : nb->ifa->ip; - r->bfd_req = bfd_request_session(p->pool, r->via, nb->ifa->ip, nb->iface, + r->bfd_req = bfd_request_session(p->p.pool, r->via, nb->ifa->ip, nb->iface, static_bfd_notify, r); } @@ -162,212 +171,173 @@ static_update_bfd(struct proto *p, struct static_route *r) } static int -static_decide(struct static_config *cf, struct static_route *r) +static_decide(struct static_proto *p, struct static_route *r) { - /* r->dest != RTD_MULTIPATH, but may be RTD_NONE (part of multipath route) - the route also have to be valid (r->neigh != NULL) */ + /* The @r is a RTD_UNICAST next hop, may be a dummy node */ + + struct static_config *cf = (void *) p->p.cf; + uint old_active = r->active; if (r->neigh->scope < 0) - return 0; + goto fail; if (cf->check_link && !(r->neigh->iface->flags & IF_LINK_UP)) - return 0; + goto fail; - if (r->bfd_req && r->bfd_req->state != BFD_STATE_UP) - return 0; + if (r->bfd_req && (r->bfd_req->state != BFD_STATE_UP)) + goto fail; - return 1; -} + r->active = 1; + return !old_active; +fail: + r->active = 0; + return old_active; +} static void -static_add(struct proto *p, struct static_config *cf, struct static_route *r) +static_add_rte(struct static_proto *p, struct static_route *r) { - DBG("static_add(%I/%d,%d)\n", r->net, r->masklen, r->dest); - switch (r->dest) - { - case RTD_ROUTER: - { - struct neighbor *n = neigh_find2(p, &r->via, r->via_if, NEF_STICKY); - if (n) - { - r->chain = n->data; - n->data = r; - r->neigh = n; - - static_update_bfd(p, r); - if (static_decide(cf, r)) - static_install(p, r, n->iface); - else - static_remove(p, r); - } - else - { - log(L_ERR "Static route destination %I is invalid. 
Ignoring.", r->via); - static_remove(p, r); - } - break; - } + if (r->dest == RTD_UNICAST) + { + struct static_route *r2; + struct neighbor *n; - case RTD_DEVICE: - break; + for (r2 = r; r2; r2 = r2->mp_next) + { + n = ipa_nonzero(r2->via) ? + neigh_find2(&p->p, &r2->via, r2->iface, + NEF_STICKY | (r2->onlink ? NEF_ONLINK : 0)) : + neigh_find_iface(&p->p, r2->iface); - case RTD_MULTIPATH: + if (!n) { - int count = 0; - struct static_route *r2; - - for (r2 = r->mp_next; r2; r2 = r2->mp_next) - { - struct neighbor *n = neigh_find2(p, &r2->via, r2->via_if, NEF_STICKY); - if (n) - { - r2->chain = n->data; - n->data = r2; - r2->neigh = n; - - static_update_bfd(p, r2); - r2->installed = static_decide(cf, r2); - count += r2->installed; - } - else - { - log(L_ERR "Static route destination %I is invalid. Ignoring.", r2->via); - r2->installed = 0; - } - } - - if (count) - static_install(p, r, NULL); - else - static_remove(p, r); - break; + log(L_WARN "Invalid next hop %I of static route %N", r2->via, r2->net); + continue; } - default: - static_install(p, r, NULL); + r2->neigh = n; + r2->chain = n->data; + n->data = r2; + + static_update_bfd(p, r2); + static_decide(p, r2); } + } + + static_announce_rte(p, r); } static void -static_rte_cleanup(struct proto *p UNUSED, struct static_route *r) +static_reset_rte(struct static_proto *p UNUSED, struct static_route *r) { struct static_route *r2; - if (r->bfd_req) + for (r2 = r; r2; r2 = r2->mp_next) { - rfree(r->bfd_req); - r->bfd_req = NULL; - } + r2->neigh = NULL; + r2->chain = NULL; - if (r->dest == RTD_MULTIPATH) - for (r2 = r->mp_next; r2; r2 = r2->mp_next) - if (r2->bfd_req) - { - rfree(r2->bfd_req); - r2->bfd_req = NULL; - } + r2->state = 0; + r2->active = 0; + + rfree(r2->bfd_req); + r2->bfd_req = NULL; + } } -static int -static_start(struct proto *p) +static void +static_remove_rte(struct static_proto *p, struct static_route *r) { - struct static_config *cf = (void *) p->cf; - struct static_route *r; + if (r->state) + rte_update(&p->p, r->net, NULL); - DBG("Static: take off!\n"); + static_reset_rte(p, r); +} - if (!static_lp) - static_lp = lp_new(&root_pool, 1008); - if (cf->igp_table) - rt_lock_table(cf->igp_table->table); +static inline int +static_same_dest(struct static_route *x, struct static_route *y) +{ + if (x->dest != y->dest) + return 0; + + switch (x->dest) + { + case RTD_UNICAST: + for (; x && y; x = x->mp_next, y = y->mp_next) + { + if (!ipa_equal(x->via, y->via) || + (x->iface != y->iface) || + (x->onlink != y->onlink) || + (x->weight != y->weight) || + (x->use_bfd != y->use_bfd) || + (!x->mls != !y->mls) || + ((x->mls) && (y->mls) && (x->mls->len != y->mls->len))) + return 0; + + if (!x->mls) + continue; + + for (uint i = 0; i < x->mls->len; i++) + if (x->mls->stack[i] != y->mls->stack[i]) + return 0; + } + return !x && !y; - /* We have to go UP before routes could be installed */ - proto_notify_state(p, PS_UP); + case RTDX_RECURSIVE: + if (!ipa_equal(x->via, y->via) || + (!x->mls != !y->mls) || + ((x->mls) && (y->mls) && (x->mls->len != y->mls->len))) + return 0; - WALK_LIST(r, cf->other_routes) - static_add(p, cf, r); - return PS_UP; -} + if (!x->mls) + return 1; -static int -static_shutdown(struct proto *p) -{ - struct static_config *cf = (void *) p->cf; - struct static_route *r; + for (uint i = 0; i < x->mls->len; i++) + if (x->mls->stack[i] != y->mls->stack[i]) + return 0; - /* Just reset the flag, the routes will be flushed by the nest */ - WALK_LIST(r, cf->iface_routes) - r->installed = 0; - WALK_LIST(r, cf->other_routes) - { - 
static_rte_cleanup(p, r); - r->installed = 0; - } + return 1; - return PS_DOWN; + default: + return 1; + } } -static void -static_cleanup(struct proto *p) +static inline int +static_same_rte(struct static_route *or, struct static_route *nr) { - struct static_config *cf = (void *) p->cf; - - if (cf->igp_table) - rt_unlock_table(cf->igp_table->table); + /* Note that i_same() requires arguments in (new, old) order */ + return static_same_dest(or, nr) && i_same(nr->cmds, or->cmds); } static void -static_update_rte(struct proto *p, struct static_route *r) +static_reconfigure_rte(struct static_proto *p, struct static_route *or, struct static_route *nr) { - switch (r->dest) - { - case RTD_ROUTER: - if (static_decide((struct static_config *) p->cf, r)) - static_install(p, r, r->neigh->iface); - else - static_remove(p, r); - break; - - case RTD_NONE: /* a part of multipath route */ - { - int decision = static_decide((struct static_config *) p->cf, r); - if (decision == r->installed) - break; /* no change */ - r->installed = decision; - - struct static_route *r1, *r2; - int count = 0; - r1 = (void *) r->if_name; /* really */ - for (r2 = r1->mp_next; r2; r2 = r2->mp_next) - count += r2->installed; - - if (count) - { - /* Set of nexthops changed - force reinstall */ - r1->installed = 0; - static_install(p, r1, NULL); - } - else - static_remove(p, r1); + if ((or->state == SRS_CLEAN) && !static_same_rte(or, nr)) + nr->state = SRS_DIRTY; + else + nr->state = or->state; - break; - } - } + static_add_rte(p, nr); + static_reset_rte(p, or); } + static void static_neigh_notify(struct neighbor *n) { - struct proto *p = n->proto; + struct static_proto *p = (void *) n->proto; struct static_route *r; DBG("Static: neighbor notify for %I: iface %p\n", n->addr, n->iface); - for(r=n->data; r; r=r->chain) + for (r = n->data; r; r = r->chain) { static_update_bfd(p, r); - static_update_rte(p, r); + + if (static_decide(p, r)) + static_mark_rte(p, r->mp_head); } } @@ -375,241 +345,232 @@ static void static_bfd_notify(struct bfd_request *req) { struct static_route *r = req->data; - struct proto *p = r->neigh->proto; + struct static_proto *p = (void *) r->neigh->proto; // if (req->down) TRACE(D_EVENTS, "BFD session down for nbr %I on %s", XXXX); - static_update_rte(p, r); + if (static_decide(p, r)) + static_mark_rte(p, r->mp_head); } -static void -static_dump_rt(struct static_route *r) +static int +static_rte_mergable(rte *pri UNUSED, rte *sec UNUSED) { - debug("%-1I/%2d: ", r->net, r->masklen); - switch (r->dest) - { - case RTD_ROUTER: - debug("via %I\n", r->via); - break; - case RTD_DEVICE: - debug("dev %s\n", r->if_name); - break; - default: - debug("rtd %d\n", r->dest); - break; - } + return 1; } + static void -static_dump(struct proto *p) +static_postconfig(struct proto_config *CF) { - struct static_config *c = (void *) p->cf; + struct static_config *cf = (void *) CF; struct static_route *r; - debug("Independent static routes:\n"); - WALK_LIST(r, c->other_routes) - static_dump_rt(r); - debug("Device static routes:\n"); - WALK_LIST(r, c->iface_routes) - static_dump_rt(r); -} + if (EMPTY_LIST(CF->channels)) + cf_error("Channel not specified"); -static void -static_if_notify(struct proto *p, unsigned flags, struct iface *i) -{ - struct static_route *r; - struct static_config *c = (void *) p->cf; + struct channel_config *cc = proto_cf_main_channel(CF); - if (flags & IF_CHANGE_UP) - { - WALK_LIST(r, c->iface_routes) - if (!strcmp(r->if_name, i->name)) - static_install(p, r, i); - } - else if (flags & IF_CHANGE_DOWN) - { - 
WALK_LIST(r, c->iface_routes) - if (!strcmp(r->if_name, i->name)) - static_remove(p, r); - } -} + if (!cf->igp_table_ip4) + cf->igp_table_ip4 = (cc->table->addr_type == NET_IP4) ? + cc->table : cf->c.global->def_tables[NET_IP4]; -int -static_rte_mergable(rte *pri UNUSED, rte *sec UNUSED) -{ - return 1; -} + if (!cf->igp_table_ip6) + cf->igp_table_ip6 = (cc->table->addr_type == NET_IP6) ? + cc->table : cf->c.global->def_tables[NET_IP6]; -void -static_init_config(struct static_config *c) -{ - init_list(&c->iface_routes); - init_list(&c->other_routes); + WALK_LIST(r, cf->routes) + if (r->net && (r->net->type != CF->net_type)) + cf_error("Route %N incompatible with channel type", r->net); } static struct proto * -static_init(struct proto_config *c) +static_init(struct proto_config *CF) { - struct proto *p = proto_new(c, sizeof(struct proto)); + struct proto *P = proto_new(CF); + struct static_proto *p = (void *) P; + struct static_config *cf = (void *) CF; - p->neigh_notify = static_neigh_notify; - p->if_notify = static_if_notify; - p->rte_mergable = static_rte_mergable; + P->main_channel = proto_add_channel(P, proto_cf_main_channel(CF)); - return p; -} + P->neigh_notify = static_neigh_notify; + P->rte_mergable = static_rte_mergable; -static inline int -static_same_net(struct static_route *x, struct static_route *y) -{ - return ipa_equal(x->net, y->net) && (x->masklen == y->masklen); + if (cf->igp_table_ip4) + p->igp_table_ip4 = cf->igp_table_ip4->table; + + if (cf->igp_table_ip6) + p->igp_table_ip6 = cf->igp_table_ip6->table; + + return P; } -static inline int -static_same_dest(struct static_route *x, struct static_route *y) +static int +static_start(struct proto *P) { - if (x->dest != y->dest) - return 0; + struct static_proto *p = (void *) P; + struct static_config *cf = (void *) P->cf; + struct static_route *r; - switch (x->dest) - { - case RTD_ROUTER: - return ipa_equal(x->via, y->via) && (x->via_if == y->via_if); + if (!static_lp) + static_lp = lp_new(&root_pool, LP_GOOD_SIZE(1024)); - case RTD_DEVICE: - return !strcmp(x->if_name, y->if_name); + if (p->igp_table_ip4) + rt_lock_table(p->igp_table_ip4); - case RTD_MULTIPATH: - for (x = x->mp_next, y = y->mp_next; - x && y; - x = x->mp_next, y = y->mp_next) - if (!ipa_equal(x->via, y->via) || (x->via_if != y->via_if) || (x->use_bfd != y->use_bfd)) - return 0; - return !x && !y; + if (p->igp_table_ip6) + rt_lock_table(p->igp_table_ip6); - case RTDX_RECURSIVE: - return ipa_equal(x->via, y->via); + p->event = ev_new(p->p.pool); + p->event->hook = static_announce_marked; + p->event->data = p; - default: - return 1; - } + BUFFER_INIT(p->marked, p->p.pool, 4); + + /* We have to go UP before routes could be installed */ + proto_notify_state(P, PS_UP); + + WALK_LIST(r, cf->routes) + static_add_rte(p, r); + + return PS_UP; } -static inline int -static_same_rte(struct static_route *x, struct static_route *y) +static int +static_shutdown(struct proto *P) { - /* Note that i_same() requires arguments in (new, old) order */ - return static_same_dest(x, y) && i_same(y->cmds, x->cmds); -} + struct static_proto *p = (void *) P; + struct static_config *cf = (void *) P->cf; + struct static_route *r; + /* Just reset the flag, the routes will be flushed by the nest */ + WALK_LIST(r, cf->routes) + static_reset_rte(p, r); + + return PS_DOWN; +} static void -static_match(struct proto *p, struct static_route *r, struct static_config *n) +static_cleanup(struct proto *P) { - struct static_route *t; - - /* - * For given old route *r we find whether a route to the same 
- * network is also in the new route list. In that case, we keep the - * route and possibly update the route later if destination changed. - * Otherwise, we remove the route. - */ + struct static_proto *p = (void *) P; - if (r->neigh) - r->neigh->data = NULL; + if (p->igp_table_ip4) + rt_unlock_table(p->igp_table_ip4); - WALK_LIST(t, n->iface_routes) - if (static_same_net(r, t)) - goto found; + if (p->igp_table_ip6) + rt_unlock_table(p->igp_table_ip6); +} - WALK_LIST(t, n->other_routes) - if (static_same_net(r, t)) - goto found; +static void +static_dump_rte(struct static_route *r) +{ + debug("%-1N: ", r->net); + if (r->dest == RTD_UNICAST) + if (r->iface && ipa_zero(r->via)) + debug("dev %s\n", r->iface->name); + else + debug("via %I%J\n", r->via, r->iface); + else + debug("rtd %d\n", r->dest); +} - static_remove(p, r); - return; +static void +static_dump(struct proto *P) +{ + struct static_config *c = (void *) P->cf; + struct static_route *r; - found: - /* If destination is different, force reinstall */ - if ((r->installed > 0) && !static_same_rte(r, t)) - t->installed = -1; - else - t->installed = r->installed; + debug("Static routes:\n"); + WALK_LIST(r, c->routes) + static_dump_rte(r); } -static inline rtable * -cf_igp_table(struct static_config *cf) +#define IGP_TABLE(cf, sym) ((cf)->igp_table_##sym ? (cf)->igp_table_##sym ->table : NULL ) + +static inline int +static_cmp_rte(const void *X, const void *Y) { - return cf->igp_table ? cf->igp_table->table : NULL; + struct static_route *x = *(void **)X, *y = *(void **)Y; + return net_compare(x->net, y->net); } static int -static_reconfigure(struct proto *p, struct proto_config *new) +static_reconfigure(struct proto *P, struct proto_config *CF) { - struct static_config *o = (void *) p->cf; - struct static_config *n = (void *) new; - struct static_route *r; + struct static_proto *p = (void *) P; + struct static_config *o = (void *) P->cf; + struct static_config *n = (void *) CF; + struct static_route *r, *r2, *or, *nr; + + /* Check change in IGP tables */ + if ((IGP_TABLE(o, ip4) != IGP_TABLE(n, ip4)) || + (IGP_TABLE(o, ip6) != IGP_TABLE(n, ip6))) + return 0; - if (cf_igp_table(o) != cf_igp_table(n)) + if (!proto_configure_channel(P, &P->main_channel, proto_cf_main_channel(CF))) return 0; - /* Delete all obsolete routes and reset neighbor entries */ - WALK_LIST(r, o->iface_routes) - static_match(p, r, n); - WALK_LIST(r, o->other_routes) - static_match(p, r, n); + p->p.cf = CF; - /* Now add all new routes, those not changed will be ignored by static_install() */ - WALK_LIST(r, n->iface_routes) - { - struct iface *ifa; - if ((ifa = if_find_by_name(r->if_name)) && (ifa->flags & IF_UP)) - static_install(p, r, ifa); - } - WALK_LIST(r, n->other_routes) - static_add(p, n, r); + /* Reset route lists in neighbor entries */ + WALK_LIST(r, o->routes) + for (r2 = r; r2; r2 = r2->mp_next) + if (r2->neigh) + r2->neigh->data = NULL; - WALK_LIST(r, o->other_routes) - static_rte_cleanup(p, r); + /* Reconfigure initial matching sequence */ + for (or = HEAD(o->routes), nr = HEAD(n->routes); + NODE_VALID(or) && NODE_VALID(nr) && net_equal(or->net, nr->net); + or = NODE_NEXT(or), nr = NODE_NEXT(nr)) + static_reconfigure_rte(p, or, nr); - return 1; -} + if (!NODE_VALID(or) && !NODE_VALID(nr)) + return 1; -static void -static_copy_routes(list *dlst, list *slst) -{ - struct static_route *dr, *sr; + /* Reconfigure remaining routes, sort them to find matching pairs */ + struct static_route *or2, *nr2, **orbuf, **nrbuf; + uint ornum = 0, nrnum = 0, orpos = 0, nrpos = 
0, i; - init_list(dlst); - WALK_LIST(sr, *slst) - { - /* copy one route */ - dr = cfg_alloc(sizeof(struct static_route)); - memcpy(dr, sr, sizeof(struct static_route)); - - /* This fn is supposed to be called on fresh src routes, which have 'live' - fields (like .chain, .neigh or .installed) zero, so no need to zero them */ - - /* We need to copy multipath chain, because there are backptrs in 'if_name' */ - if (dr->dest == RTD_MULTIPATH) - { - struct static_route *md, *ms, **mp_last; - - mp_last = &(dr->mp_next); - for (ms = sr->mp_next; ms; ms = ms->mp_next) - { - md = cfg_alloc(sizeof(struct static_route)); - memcpy(md, ms, sizeof(struct static_route)); - md->if_name = (void *) dr; /* really */ - - *mp_last = md; - mp_last = &(md->mp_next); - } - *mp_last = NULL; - } - - add_tail(dlst, (node *) dr); - } + for (or2 = or; NODE_VALID(or2); or2 = NODE_NEXT(or2)) + ornum++; + + for (nr2 = nr; NODE_VALID(nr2); nr2 = NODE_NEXT(nr2)) + nrnum++; + + orbuf = xmalloc(ornum * sizeof(void *)); + nrbuf = xmalloc(nrnum * sizeof(void *)); + + for (i = 0, or2 = or; i < ornum; i++, or2 = NODE_NEXT(or2)) + orbuf[i] = or2; + + for (i = 0, nr2 = nr; i < nrnum; i++, nr2 = NODE_NEXT(nr2)) + nrbuf[i] = nr2; + + qsort(orbuf, ornum, sizeof(struct static_route *), static_cmp_rte); + qsort(nrbuf, nrnum, sizeof(struct static_route *), static_cmp_rte); + + while ((orpos < ornum) && (nrpos < nrnum)) + { + int x = net_compare(orbuf[orpos]->net, nrbuf[nrpos]->net); + if (x < 0) + static_remove_rte(p, orbuf[orpos++]); + else if (x > 0) + static_add_rte(p, nrbuf[nrpos++]); + else + static_reconfigure_rte(p, orbuf[orpos++], nrbuf[nrpos++]); + } + + while (orpos < ornum) + static_remove_rte(p, orbuf[orpos++]); + + while (nrpos < nrnum) + static_add_rte(p, nrbuf[nrpos++]); + + xfree(orbuf); + xfree(nrbuf); + + return 1; } static void @@ -618,53 +579,66 @@ static_copy_config(struct proto_config *dest, struct proto_config *src) struct static_config *d = (struct static_config *) dest; struct static_config *s = (struct static_config *) src; - /* Shallow copy of everything */ - proto_copy_rest(dest, src, sizeof(struct static_config)); + struct static_route *srt, *snh; - /* Copy route lists */ - static_copy_routes(&d->iface_routes, &s->iface_routes); - static_copy_routes(&d->other_routes, &s->other_routes); -} + /* Copy route list */ + init_list(&d->routes); + WALK_LIST(srt, s->routes) + { + struct static_route *drt = NULL, *dnh = NULL, **dnp = &drt; + for (snh = srt; snh; snh = snh->mp_next) + { + dnh = cfg_alloc(sizeof(struct static_route)); + memcpy(dnh, snh, sizeof(struct static_route)); -struct protocol proto_static = { - .name = "Static", - .template = "static%d", - .preference = DEF_PREF_STATIC, - .config_size = sizeof(struct static_config), - .init = static_init, - .dump = static_dump, - .start = static_start, - .shutdown = static_shutdown, - .cleanup = static_cleanup, - .reconfigure = static_reconfigure, - .copy_config = static_copy_config -}; + if (!drt) + add_tail(&d->routes, &(dnh->n)); + + *dnp = dnh; + dnp = &(dnh->mp_next); + + if (snh->mp_head) + dnh->mp_head = drt; + } + } +} static void static_show_rt(struct static_route *r) { - byte via[STD_ADDRESS_P_LENGTH + 16]; - switch (r->dest) + { + case RTD_UNICAST: + { + struct static_route *r2; + + cli_msg(-1009, "%N", r->net); + for (r2 = r; r2; r2 = r2->mp_next) { - case RTD_ROUTER: bsprintf(via, "via %I%J", r->via, r->via_if); break; - case RTD_DEVICE: bsprintf(via, "dev %s", r->if_name); break; - case RTD_BLACKHOLE: bsprintf(via, "blackhole"); break; - case 
RTD_UNREACHABLE: bsprintf(via, "unreachable"); break; - case RTD_PROHIBIT: bsprintf(via, "prohibited"); break; - case RTD_MULTIPATH: bsprintf(via, "multipath"); break; - case RTDX_RECURSIVE: bsprintf(via, "recursive %I", r->via); break; - default: bsprintf(via, "???"); + if (r2->iface && ipa_zero(r2->via)) + cli_msg(-1009, "\tdev %s%s", r2->iface->name, + r2->active ? "" : " (dormant)"); + else + cli_msg(-1009, "\tvia %I%J%s%s%s", r2->via, r2->iface, + r2->onlink ? " onlink" : "", + r2->bfd_req ? " (bfd)" : "", + r2->active ? "" : " (dormant)"); } - cli_msg(-1009, "%I/%d %s%s%s", r->net, r->masklen, via, - r->bfd_req ? " (bfd)" : "", r->installed ? "" : " (dormant)"); + break; + } - struct static_route *r2; - if (r->dest == RTD_MULTIPATH) - for (r2 = r->mp_next; r2; r2 = r2->mp_next) - cli_msg(-1009, "\tvia %I%J weight %d%s%s", r2->via, r2->via_if, r2->masklen + 1, /* really */ - r2->bfd_req ? " (bfd)" : "", r2->installed ? "" : " (dormant)"); + case RTD_NONE: + case RTD_BLACKHOLE: + case RTD_UNREACHABLE: + case RTD_PROHIBIT: + cli_msg(-1009, "%N\t%s", r->net, rta_dest_names[r->dest]); + break; + + case RTDX_RECURSIVE: + cli_msg(-1009, "%N\trecursive %I", r->net, r->via); + break; + } } void @@ -673,9 +647,25 @@ static_show(struct proto *P) struct static_config *c = (void *) P->cf; struct static_route *r; - WALK_LIST(r, c->other_routes) - static_show_rt(r); - WALK_LIST(r, c->iface_routes) + WALK_LIST(r, c->routes) static_show_rt(r); cli_msg(0, ""); } + + +struct protocol proto_static = { + .name = "Static", + .template = "static%d", + .preference = DEF_PREF_STATIC, + .channel_mask = NB_ANY, + .proto_size = sizeof(struct static_proto), + .config_size = sizeof(struct static_config), + .postconfig = static_postconfig, + .init = static_init, + .dump = static_dump, + .start = static_start, + .shutdown = static_shutdown, + .cleanup = static_cleanup, + .reconfigure = static_reconfigure, + .copy_config = static_copy_config +}; diff --git a/proto/static/static.h b/proto/static/static.h index 6b047234..b202c0b1 100644 --- a/proto/static/static.h +++ b/proto/static/static.h @@ -11,41 +11,61 @@ #include "nest/route.h" #include "nest/bfd.h" +#include "lib/buffer.h" struct static_config { struct proto_config c; - list iface_routes; /* Routes to search on interface events */ - list other_routes; /* Routes hooked to neighbor cache and reject routes */ + list routes; /* List of static routes (struct static_route) */ int check_link; /* Whether iface link state is used */ - struct rtable_config *igp_table; /* Table used for recursive next hop lookups */ + struct rtable_config *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ + struct rtable_config *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ }; +struct static_proto { + struct proto p; -void static_init_config(struct static_config *); + struct event *event; /* Event for announcing updated routes */ + BUFFER(struct static_route *) marked; /* Routes marked for reannouncement */ + rtable *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ + rtable *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ +}; struct static_route { node n; - struct static_route *chain; /* Next for the same neighbor */ - ip_addr net; /* Network we route */ - int masklen; /* Mask length */ - int dest; /* Destination type (RTD_*) */ + net_addr *net; /* Network we route */ ip_addr via; /* Destination router */ - struct iface *via_if; /* Destination iface, for link-local vias */ - struct neighbor *neigh; - byte *if_name; /* Name for 
RTD_DEVICE routes */ - struct static_route *mp_next; /* Nexthops for RTD_MULTIPATH routes */ + struct iface *iface; /* Destination iface, for link-local vias or device routes */ + struct neighbor *neigh; /* Associated neighbor entry */ + struct static_route *chain; /* Next for the same neighbor */ + struct static_route *mp_head; /* First nexthop of this route */ + struct static_route *mp_next; /* Nexthops for multipath routes */ struct f_inst *cmds; /* List of commands for setting attributes */ - int installed; /* Installed in rt table, -1 for reinstall */ - int use_bfd; /* Configured to use BFD */ + byte dest; /* Destination type (RTD_*) */ + byte state; /* State of route announcement (SRS_*) */ + byte active; /* Next hop is active (nbr/iface/BFD available) */ + byte onlink; /* Gateway is onlink regardless of IP ranges */ + byte weight; /* Multipath next hop weight */ + byte use_bfd; /* Configured to use BFD */ struct bfd_request *bfd_req; /* BFD request, if BFD is used */ + mpls_label_stack *mls; /* MPLS label stack; may be NULL */ }; -/* Dummy nodes (parts of multipath route) abuses masklen field for weight - and if_name field for a ptr to the master (RTD_MULTIPATH) node. */ - +/* + * Note that data fields neigh, chain, state, active and bfd_req are runtime + * data, not real configuration data. Must be handled carefully. + * + * Regular (i.e. dest == RTD_UNICAST) routes use static_route structure for + * additional next hops (fields mp_head, mp_next). Note that 'state' is for + * whole route, while 'active' is for each next hop. Also note that fields + * mp_head, mp_next, active are zero for other kinds of routes. + */ #define RTDX_RECURSIVE 0x7f /* Phony dest value for recursive routes */ +#define SRS_DOWN 0 /* Route is not announced */ +#define SRS_CLEAN 1 /* Route is active and announced */ +#define SRS_DIRTY 2 /* Route changed since announcement */ + void static_show(struct proto *); #endif |
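A hypothetical illustration (not taken from the patch) of how the fields described in the comment above fit together for a unicast route with two next hops; cfg_allocz() and the field names come from this patch, the rest is made up for the example and mirrors what static_nexthop_new() in config.Y builds:

static struct static_route *
example_route_with_two_nexthops(net_addr *net)
{
  /* master node: carries the prefix and doubles as the first next hop */
  struct static_route *r = cfg_allocz(sizeof(struct static_route));
  r->net = net;
  r->dest = RTD_UNICAST;
  r->mp_head = r;

  /* dummy node for the additional next hop, sharing the prefix */
  struct static_route *nh = cfg_allocz(sizeof(struct static_route));
  nh->net = net;
  nh->dest = RTD_UNICAST;
  nh->mp_head = r;

  r->mp_next = nh;    /* chain of next hops; nh->mp_next stays NULL */
  return r;
}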