diff options
author | Ondrej Zajicek <santiago@crfreenet.org> | 2012-08-14 16:25:22 +0200 |
---|---|---|
committer | Ondrej Zajicek <santiago@crfreenet.org> | 2012-08-14 16:46:43 +0200 |
commit | 094d2bdb79e1ffa0a02761fd651aa0f0b6b0c585 (patch) | |
tree | f7cb65c540403ed152677dde3b803c3dd117d8e5 | |
parent | d760229ab897fa1bf1fd0fe7019cc2431d21a1cc (diff) |
Implements ADD-PATH extension for BGP.
Allows sending and receiving multiple routes for one network over one BGP
session. Also contains the necessary core changes to support this (routing
tables accepting several routes for one network from one protocol).
It needs some more cleanup before merging to the master branch.
-rw-r--r-- | filter/filter.c | 2 | ||||
-rw-r--r-- | lib/string.h | 1 | ||||
-rw-r--r-- | nest/proto.c | 13 | ||||
-rw-r--r-- | nest/protocol.h | 3 | ||||
-rw-r--r-- | nest/route.h | 25 | ||||
-rw-r--r-- | nest/rt-attr.c | 194 | ||||
-rw-r--r-- | nest/rt-dev.c | 26 | ||||
-rw-r--r-- | nest/rt-table.c | 111 | ||||
-rw-r--r-- | proto/bgp/attrs.c | 117 | ||||
-rw-r--r-- | proto/bgp/bgp.c | 31 | ||||
-rw-r--r-- | proto/bgp/bgp.h | 39 | ||||
-rw-r--r-- | proto/bgp/config.Y | 5 | ||||
-rw-r--r-- | proto/bgp/packets.c | 182 | ||||
-rw-r--r-- | proto/ospf/ospf.c | 10 | ||||
-rw-r--r-- | proto/ospf/rt.c | 8 | ||||
-rw-r--r-- | proto/pipe/pipe.c | 20 | ||||
-rw-r--r-- | proto/rip/rip.c | 38 | ||||
-rw-r--r-- | proto/rip/rip.h | 1 | ||||
-rw-r--r-- | proto/static/static.c | 8 | ||||
-rw-r--r-- | sysdep/bsd/krt-sock.c | 2 | ||||
-rw-r--r-- | sysdep/linux/netlink.c | 2 | ||||
-rw-r--r-- | sysdep/unix/krt.c | 13 |
22 files changed, 657 insertions, 194 deletions
diff --git a/filter/filter.c b/filter/filter.c index 49b67391..7c883fff 100644 --- a/filter/filter.c +++ b/filter/filter.c @@ -831,7 +831,7 @@ interpret(struct f_inst *what) res.val.i = * ((char *) rta + what->a2.i); break; case T_STRING: /* Warning: this is a special case for proto attribute */ - res.val.s = rta->proto->name; + res.val.s = rta->src->proto->name; break; case T_PREFIX: /* Warning: this works only for prefix of network */ { diff --git a/lib/string.h b/lib/string.h index 14eaa360..7432d9a4 100644 --- a/lib/string.h +++ b/lib/string.h @@ -11,6 +11,7 @@ #include <stdarg.h> #include <string.h> +#include <strings.h> int bsprintf(char *str, const char *fmt, ...); int bvsprintf(char *str, const char *fmt, va_list args); diff --git a/nest/proto.c b/nest/proto.c index 53d3f1a2..399c02e3 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -219,6 +219,7 @@ proto_free_ahooks(struct proto *p) p->main_ahook = NULL; } + /** * proto_config_new - create a new protocol configuration * @pr: protocol the configuration will belong to @@ -791,11 +792,15 @@ proto_schedule_feed(struct proto *p, int initial) /* Connect protocol to routing table */ if (initial && !p->proto->multitable) { + p->main_source = rt_get_source(p, 0); + rt_lock_source(p->main_source); + p->main_ahook = proto_add_announce_hook(p, p->table, &p->stats); p->main_ahook->in_filter = p->cf->in_filter; p->main_ahook->out_filter = p->cf->out_filter; p->main_ahook->in_limit = p->cf->in_limit; p->main_ahook->out_limit = p->cf->out_limit; + proto_reset_limit(p->main_ahook->in_limit); proto_reset_limit(p->main_ahook->out_limit); } @@ -844,6 +849,8 @@ proto_flush_loop(void *unused UNUSED) return; } + rt_prune_sources(); + again: WALK_LIST(p, flush_proto_list) if (p->flushing) @@ -1040,6 +1047,12 @@ proto_notify_state(struct proto *p, unsigned ps) if ((cs == FS_FEEDING) || (cs == FS_HAPPY)) proto_schedule_flush(p); + if (p->proto->multitable) + { + rt_unlock_source(p->main_source); + p->main_source = NULL; + } + 
neigh_prune(); // FIXME convert neighbors to resource? rfree(p->pool); p->pool = NULL; diff --git a/nest/protocol.h b/nest/protocol.h index 11fcb164..d80201f3 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -183,7 +183,7 @@ struct proto { int (*reload_routes)(struct proto *); /* - * Routing entry hooks (called only for rte's belonging to this protocol): + * Routing entry hooks (called only for routes belonging to this protocol): * * rte_recalculate Called at the beginning of the best route selection * rte_better Compare two rte's and decide which one is better (1=first, 0=second). @@ -199,6 +199,7 @@ struct proto { void (*rte_remove)(struct network *, struct rte *); struct rtable *table; /* Our primary routing table */ + struct rte_src *main_source; /* Primary route source */ struct announce_hook *main_ahook; /* Primary announcement hook */ struct announce_hook *ahooks; /* Announcement hooks for this protocol */ diff --git a/nest/route.h b/nest/route.h index 524e69b3..3b65a855 100644 --- a/nest/route.h +++ b/nest/route.h @@ -237,10 +237,10 @@ void rt_unlock_table(rtable *); void rt_setup(pool *, rtable *, char *, struct rtable_config *); static inline net *net_find(rtable *tab, ip_addr addr, unsigned len) { return (net *) fib_find(&tab->fib, &addr, len); } static inline net *net_get(rtable *tab, ip_addr addr, unsigned len) { return (net *) fib_get(&tab->fib, &addr, len); } -rte *rte_find(net *net, struct proto *p); +rte *rte_find(net *net, struct rte_src *src); rte *rte_get_temp(struct rta *); -void rte_update2(struct announce_hook *ah, net *net, rte *new, struct proto *src); -static inline void rte_update(rtable *tab, net *net, struct proto *p, struct proto *src, rte *new) { rte_update2(p->main_ahook, net, new, src); } +void rte_update2(struct announce_hook *ah, net *net, rte *new, struct rte_src *src); +static inline void rte_update(struct proto *p, net *net, rte *new) { rte_update2(p->main_ahook, net, new, p->main_source); } void rte_discard(rtable *tab, 
rte *old); void rte_dump(rte *); void rte_free(rte *); @@ -286,9 +286,18 @@ struct mpnh { unsigned char weight; }; +struct rte_src { + struct rte_src *next; /* Hash chain */ + struct proto *proto; /* Protocol the source is based on */ + u32 private_id; /* Private ID, assigned by the protocol */ + u32 global_id; /* Globally unique ID of the source */ + unsigned uc; /* Use count */ +}; + + typedef struct rta { struct rta *next, **pprev; /* Hash chain */ - struct proto *proto; /* Protocol instance that originally created the route */ + struct rte_src *src; /* Route source that created the route */ unsigned uc; /* Use count */ byte source; /* Route source (RTS_...) */ byte scope; /* Route scope (SCOPE_... -- see ip.h) */ @@ -403,6 +412,13 @@ typedef struct ea_list { #define EALF_BISECT 2 /* Use interval bisection for searching */ #define EALF_CACHED 4 /* Attributes belonging to cached rta */ +struct rte_src *rt_find_source(struct proto *p, u32 id); +struct rte_src *rt_get_source(struct proto *p, u32 id); +static inline void rt_lock_source(struct rte_src *src) { src->uc++; } +static inline void rt_unlock_source(struct rte_src *src) { src->uc--; } +void rt_prune_sources(void); + + eattr *ea_find(ea_list *, unsigned ea); int ea_get_int(ea_list *, unsigned ea, int def); void ea_dump(ea_list *); @@ -419,6 +435,7 @@ static inline int mpnh_same(struct mpnh *x, struct mpnh *y) void rta_init(void); rta *rta_lookup(rta *); /* Get rta equivalent to this one, uc++ */ +static inline int rta_is_cached(rta *r) { return r->aflags & RTAF_CACHED; } static inline rta *rta_clone(rta *r) { r->uc++; return r; } void rta__free(rta *r); static inline void rta_free(rta *r) { if (r && !--r->uc) rta__free(r); } diff --git a/nest/rt-attr.c b/nest/rt-attr.c index 6aed318b..b2bb152f 100644 --- a/nest/rt-attr.c +++ b/nest/rt-attr.c @@ -58,9 +58,194 @@ pool *rta_pool; static slab *rta_slab; static slab *mpnh_slab; +static slab *rte_src_slab; + +/* rte source ID bitmap */ +static u32 *src_ids; +static 
u32 src_id_size, src_id_used, src_id_pos; +#define SRC_ID_SIZE_DEF 4 + +/* rte source hash */ +static struct rte_src **src_table; +static u32 src_hash_order, src_hash_size, src_hash_count; +#define SRC_HASH_ORDER_DEF 6 +#define SRC_HASH_ORDER_MAX 18 +#define SRC_HASH_ORDER_MIN 10 struct protocol *attr_class_to_protocol[EAP_MAX]; + +static void +rte_src_init(void) +{ + rte_src_slab = sl_new(rta_pool, sizeof(struct rte_src)); + + src_id_pos = 0; + src_id_size = SRC_ID_SIZE_DEF; + src_ids = mb_allocz(rta_pool, src_id_size * sizeof(u32)); + + /* ID 0 is reserved */ + src_ids[0] = 1; + src_id_used = 1; + + src_hash_count = 0; + src_hash_order = SRC_HASH_ORDER_DEF; + src_hash_size = 1 << src_hash_order; + src_table = mb_allocz(rta_pool, src_hash_size * sizeof(struct rte_src *)); +} + +static inline int u32_cto(unsigned int x) { return ffs(~x) - 1; } + +static inline u32 +rte_src_alloc_id(void) +{ + int i, j; + for (i = src_id_pos; i < src_id_size; i++) + if (src_ids[i] != 0xffffffff) + goto found; + + /* If we are at least 7/8 full, expand */ + if (src_id_used > (src_id_size * 28)) + { + src_id_size *= 2; + src_ids = mb_realloc(rta_pool, src_ids, src_id_size * sizeof(u32)); + bzero(src_ids + i, (src_id_size - i) * sizeof(u32)); + goto found; + } + + for (i = 0; i < src_id_pos; i++) + if (src_ids[i] != 0xffffffff) + goto found; + + ASSERT(0); + + found: + ASSERT(i < 0x8000000); + + src_id_pos = i; + j = u32_cto(src_ids[i]); + + src_ids[i] |= (1 << j); + src_id_used++; + return 32 * i + j; +} + +static inline void +rte_src_free_id(u32 id) +{ + int i = id / 32; + int j = id % 32; + + ASSERT((i < src_id_size) && (src_ids[i] & (1 << j))); + src_ids[i] &= ~(1 << j); + src_id_used--; +} + +static inline u32 rte_src_hash(struct proto *p, u32 x, u32 order) +{ return (x * 2902958171u) >> (32 - order); } + +static void +rte_src_rehash(int step) +{ + struct rte_src **old_tab, *src, *src_next; + u32 old_size, hash, i; + + old_tab = src_table; + old_size = src_hash_size; + + 
src_hash_order += step; + src_hash_size = 1 << src_hash_order; + src_table = mb_allocz(rta_pool, src_hash_size * sizeof(struct rte_src *)); + + for (i = 0; i < old_size; i++) + for (src = old_tab[i]; src; src = src_next) + { + src_next = src->next; + hash = rte_src_hash(src->proto, src->private_id, src_hash_order); + src->next = src_table[hash]; + src_table[hash] = src; + } + + mb_free(old_tab); +} + +struct rte_src * +rt_find_source(struct proto *p, u32 id) +{ + struct rte_src *src; + u32 hash = rte_src_hash(p, id, src_hash_order); + + for (src = src_table[hash]; src; src = src->next) + if ((src->proto == p) && (src->private_id == id)) + return src; + + return NULL; +} + +struct rte_src * +rt_get_source(struct proto *p, u32 id) +{ + struct rte_src *src; + u32 hash = rte_src_hash(p, id, src_hash_order); + + for (src = src_table[hash]; src; src = src->next) + if ((src->proto == p) && (src->private_id == id)) + return src; + + src = sl_alloc(rte_src_slab); + src->proto = p; + src->private_id = id; + src->global_id = rte_src_alloc_id(); + src->uc = 0; + + src->next = src_table[hash]; + src_table[hash] = src; + + src_hash_count++; + if ((src_hash_count > src_hash_size) && (src_hash_order < SRC_HASH_ORDER_MAX)) + rte_src_rehash(1); + + return src; +} + +static inline void +rt_remove_source(struct rte_src **sp) +{ + struct rte_src *src = *sp; + + *sp = src->next; + rte_src_free_id(src->global_id); + sl_free(rte_src_slab, src); + src_hash_count--; +} + +void +rt_prune_sources(void) +{ + struct rte_src **sp; + int i; + + for (i = 0; i < src_hash_size; i++) + { + sp = &src_table[i]; + while (*sp) + { + if ((*sp)->uc == 0) + rt_remove_source(sp); + else + sp = &(*sp)->next; + } + } + + while ((src_hash_count < (src_hash_size / 4)) && (src_hash_order > SRC_HASH_ORDER_MIN)) + rte_src_rehash(-1); +} + + +/* + * Multipath Next Hop + */ + static inline unsigned int mpnh_hash(struct mpnh *x) { @@ -682,14 +867,14 @@ rta_alloc_hash(void) static inline unsigned int rta_hash(rta *a) { 
- return (a->proto->hash_key ^ ipa_hash(a->gw) ^ + return (((unsigned) a->src) ^ ipa_hash(a->gw) ^ mpnh_hash(a->nexthops) ^ ea_hash(a->eattrs)) & 0xffff; } static inline int rta_same(rta *x, rta *y) { - return (x->proto == y->proto && + return (x->src == y->src && x->source == y->source && x->scope == y->scope && x->cast == y->cast && @@ -786,6 +971,7 @@ rta_lookup(rta *o) r = rta_copy(o); r->hash_key = h; r->aflags = RTAF_CACHED; + rt_lock_source(r->src); rt_lock_hostentry(r->hostentry); rta_insert(r); @@ -805,6 +991,7 @@ rta__free(rta *a) a->next->pprev = a->pprev; a->aflags = 0; /* Poison the entry */ rt_unlock_hostentry(a->hostentry); + rt_unlock_source(a->src); mpnh_free(a->nexthops); ea_free(a->eattrs); sl_free(rta_slab, a); @@ -827,7 +1014,7 @@ rta_dump(rta *a) static char *rtd[] = { "", " DEV", " HOLE", " UNREACH", " PROHIBIT" }; debug("p=%s uc=%d %s %s%s%s h=%04x", - a->proto->name, a->uc, rts[a->source], ip_scope_text(a->scope), rtc[a->cast], + a->src->proto->name, a->uc, rts[a->source], ip_scope_text(a->scope), rtc[a->cast], rtd[a->dest], a->hash_key); if (!(a->aflags & RTAF_CACHED)) debug(" !CACHED"); @@ -895,6 +1082,7 @@ rta_init(void) rta_slab = sl_new(rta_pool, sizeof(rta)); mpnh_slab = sl_new(rta_pool, sizeof(struct mpnh)); rta_alloc_hash(); + rte_src_init(); } /* diff --git a/nest/rt-dev.c b/nest/rt-dev.c index 54cb14ba..7319018f 100644 --- a/nest/rt-dev.c +++ b/nest/rt-dev.c @@ -48,29 +48,31 @@ dev_ifa_notify(struct proto *p, unsigned c, struct ifa *ad) DBG("dev_if_notify: device shutdown: prefix not found\n"); return; } - rte_update(p->table, n, p, p, NULL); + rte_update(p, n, NULL); } else if (c & IF_CHANGE_UP) { - rta *a, A; + rta *a; net *n; rte *e; DBG("dev_if_notify: %s:%I going up\n", ad->iface->name, ad->ip); - bzero(&A, sizeof(A)); - A.proto = p; - A.source = RTS_DEVICE; - A.scope = SCOPE_UNIVERSE; - A.cast = RTC_UNICAST; - A.dest = RTD_DEVICE; - A.iface = ad->iface; - A.eattrs = NULL; - a = rta_lookup(&A); + + rta a0 = { + .src = 
p->main_source, + .source = RTS_DEVICE, + .scope = SCOPE_UNIVERSE, + .cast = RTC_UNICAST, + .dest = RTD_DEVICE, + .iface = ad->iface + }; + + a = rta_lookup(&a0); n = net_get(p->table, ad->prefix, ad->pxlen); e = rte_get_temp(a); e->net = n; e->pflags = 0; - rte_update(p->table, n, p, p, e); + rte_update(p, n, e); } } diff --git a/nest/rt-table.c b/nest/rt-table.c index 118f4c25..ecd6e324 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -58,6 +58,14 @@ static void rt_next_hop_update(rtable *tab); static inline void rt_schedule_gc(rtable *tab); +static inline struct ea_list * +make_tmp_attrs(struct rte *rt, struct linpool *pool) +{ + struct ea_list *(*mta)(struct rte *rt, struct linpool *pool); + mta = rt->attrs->src->proto->make_tmp_attrs; + return mta ? mta(rt, rte_update_pool) : NULL; +} + /* Like fib_route(), but skips empty net entries */ static net * net_route(rtable *tab, ip_addr a, int len) @@ -88,17 +96,17 @@ rte_init(struct fib_node *N) /** * rte_find - find a route * @net: network node - * @p: protocol + * @src: route source * * The rte_find() function returns a route for destination @net - * which belongs has been defined by protocol @p. + * which is from route source @src. */ rte * -rte_find(net *net, struct proto *p) +rte_find(net *net, struct rte_src *src) { rte *e = net->routes; - while (e && e->attrs->proto != p) + while (e && e->attrs->src != src) e = e->next; return e; } @@ -119,7 +127,7 @@ rte_get_temp(rta *a) e->attrs = a; e->flags = 0; - e->pref = a->proto->preference; + e->pref = a->src->proto->preference; return e; } @@ -145,16 +153,16 @@ rte_better(rte *new, rte *old) return 1; if (new->pref < old->pref) return 0; - if (new->attrs->proto->proto != old->attrs->proto->proto) + if (new->attrs->src->proto->proto != old->attrs->src->proto->proto) { /* * If the user has configured protocol preferences, so that two different protocols * have the same preference, try to break the tie by comparing addresses. 
Not too * useful, but keeps the ordering of routes unambiguous. */ - return new->attrs->proto->proto > old->attrs->proto->proto; + return new->attrs->src->proto->proto > old->attrs->src->proto->proto; } - if (better = new->attrs->proto->rte_better) + if (better = new->attrs->src->proto->rte_better) return better(new, old); return 0; } @@ -198,8 +206,7 @@ export_filter(struct announce_hook *ah, rte *rt0, rte **rt_free, ea_list **tmpa, /* If called does not care for eattrs, we prepare one internally */ if (!tmpa) { - struct proto *src = rt->attrs->proto; - tmpb = src->make_tmp_attrs ? src->make_tmp_attrs(rt, rte_update_pool) : NULL; + tmpb = make_tmp_attrs(rt, rte_update_pool); tmpa = &tmpb; } @@ -536,9 +543,9 @@ rte_announce(rtable *tab, unsigned type, net *net, rte *new, rte *old, rte *befo if (type == RA_OPTIMAL) { if (new) - new->attrs->proto->stats.pref_routes++; + new->attrs->src->proto->stats.pref_routes++; if (old) - old->attrs->proto->stats.pref_routes--; + old->attrs->src->proto->stats.pref_routes--; if (tab->hostcache) rt_notify_hostcache(tab, net); @@ -588,7 +595,7 @@ rte_validate(rte *e) void rte_free(rte *e) { - if (e->attrs->aflags & RTAF_CACHED) + if (rta_is_cached(e->attrs)) rta_free(e->attrs); sl_free(rte_slab, e); } @@ -608,11 +615,11 @@ rte_same(rte *x, rte *y) x->flags == y->flags && x->pflags == y->pflags && x->pref == y->pref && - (!x->attrs->proto->rte_same || x->attrs->proto->rte_same(x, y)); + (!x->attrs->src->proto->rte_same || x->attrs->src->proto->rte_same(x, y)); } static void -rte_recalculate(struct announce_hook *ah, net *net, rte *new, ea_list *tmpa, struct proto *src) +rte_recalculate(struct announce_hook *ah, net *net, rte *new, ea_list *tmpa, struct rte_src *src) { struct proto *p = ah->proto; struct rtable *table = ah->table; @@ -625,7 +632,7 @@ rte_recalculate(struct announce_hook *ah, net *net, rte *new, ea_list *tmpa, str k = &net->routes; /* Find and remove original route from the same protocol */ while (old = *k) { - if 
(old->attrs->proto == src) + if (old->attrs->src == src) { /* If there is the same route in the routing table but from * a different sender, then there are two paths from the @@ -656,7 +663,7 @@ rte_recalculate(struct announce_hook *ah, net *net, rte *new, ea_list *tmpa, str #ifdef CONFIG_RIP /* lastmod is used internally by RIP as the last time when the route was received. */ - if (src->proto == &proto_rip) + if (src->proto->proto == &proto_rip) old->lastmod = now; #endif return; @@ -725,7 +732,7 @@ rte_recalculate(struct announce_hook *ah, net *net, rte *new, ea_list *tmpa, str /* If routes are not sorted, find the best route and move it on the first position. There are several optimized cases. */ - if (src->rte_recalculate && src->rte_recalculate(table, net, new, old, old_best)) + if (src->proto->rte_recalculate && src->proto->rte_recalculate(table, net, new, old, old_best)) goto do_recalculate; if (new && rte_better(new, old_best)) @@ -881,7 +888,7 @@ rte_update_unlock(void) */ void -rte_update2(struct announce_hook *ah, net *net, rte *new, struct proto *src) +rte_update2(struct announce_hook *ah, net *net, rte *new, struct rte_src *src) { struct proto *p = ah->proto; struct proto_stats *stats = ah->stats; @@ -906,8 +913,8 @@ rte_update2(struct announce_hook *ah, net *net, rte *new, struct proto *src) rte_trace_in(D_FILTERS, p, new, "filtered out"); goto drop; } - if (src->make_tmp_attrs) - tmpa = src->make_tmp_attrs(new, rte_update_pool); + + tmpa = make_tmp_attrs(new, rte_update_pool); if (filter) { ea_list *old_tmpa = tmpa; @@ -918,15 +925,25 @@ rte_update2(struct announce_hook *ah, net *net, rte *new, struct proto *src) rte_trace_in(D_FILTERS, p, new, "filtered out"); goto drop; } - if (tmpa != old_tmpa && src->store_tmp_attrs) - src->store_tmp_attrs(new, tmpa); + if (tmpa != old_tmpa && src->proto->store_tmp_attrs) + src->proto->store_tmp_attrs(new, tmpa); } - if (!(new->attrs->aflags & RTAF_CACHED)) /* Need to copy attributes */ + + if 
(!rta_is_cached(new->attrs)) /* Need to copy attributes */ new->attrs = rta_lookup(new->attrs); new->flags |= REF_COW; } else - stats->imp_withdraws_received++; + { + stats->imp_withdraws_received++; + + if (!net || !src) + { + stats->imp_withdraws_ignored++; + rte_update_unlock(); + return; + } + } rte_recalculate(ah, net, new, tmpa, src); rte_update_unlock(); @@ -943,12 +960,10 @@ drop: static inline void rte_announce_i(rtable *tab, unsigned type, net *n, rte *new, rte *old) { - struct proto *src; ea_list *tmpa; rte_update_lock(); - src = new->attrs->proto; - tmpa = src->make_tmp_attrs ? src->make_tmp_attrs(new, rte_update_pool) : NULL; + tmpa = make_tmp_attrs(new, rte_update_pool); rte_announce(tab, type, n, new, old, NULL, tmpa); rte_update_unlock(); } @@ -957,7 +972,7 @@ void rte_discard(rtable *t, rte *old) /* Non-filtered route deletion, used during garbage collection */ { rte_update_lock(); - rte_recalculate(old->sender, old->net, NULL, NULL, old->attrs->proto); + rte_recalculate(old->sender, old->net, NULL, NULL, old->attrs->src); rte_update_unlock(); } @@ -974,8 +989,8 @@ rte_dump(rte *e) debug("%-1I/%2d ", n->n.prefix, n->n.pxlen); debug("KF=%02x PF=%02x pref=%d lm=%d ", n->n.flags, e->pflags, e->pref, now-e->lastmod); rta_dump(e->attrs); - if (e->attrs->proto->proto->dump_attrs) - e->attrs->proto->proto->dump_attrs(e); + if (e->attrs->src->proto->proto->dump_attrs) + e->attrs->src->proto->proto->dump_attrs(e); debug("\n"); } @@ -1096,7 +1111,10 @@ rt_event(void *ptr) rt_next_hop_update(tab); if (tab->gc_scheduled) - rt_prune_nets(tab); + { + rt_prune_nets(tab); + rt_prune_sources(); // FIXME this should be moved to independent event + } } void @@ -1303,8 +1321,8 @@ rt_next_hop_update_net(rtable *tab, net *n) /* Call a pre-comparison hook */ /* Not really an efficient way to compute this */ - if (e->attrs->proto->rte_recalculate) - e->attrs->proto->rte_recalculate(tab, n, new, e, NULL); + if (e->attrs->src->proto->rte_recalculate) + 
e->attrs->src->proto->rte_recalculate(tab, n, new, e, NULL); if (e != old_best) rte_free_quick(e); @@ -1502,11 +1520,10 @@ rt_commit(struct config *new, struct config *old) static inline void do_feed_baby(struct proto *p, int type, struct announce_hook *h, net *n, rte *e) { - struct proto *src = e->attrs->proto; ea_list *tmpa; rte_update_lock(); - tmpa = src->make_tmp_attrs ? src->make_tmp_attrs(e, rte_update_pool) : NULL; + tmpa = make_tmp_attrs(e, rte_update_pool); if (type == RA_ACCEPTED) rt_notify_accepted(h, n, e, NULL, NULL, tmpa, p->refeeding ? 2 : 1); else @@ -1889,7 +1906,7 @@ rt_update_hostcache(rtable *tab) } static struct hostentry * -rt_find_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep) +rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep) { struct hostentry *he; @@ -1910,9 +1927,10 @@ rt_find_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep) void rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr *gw, ip_addr *ll) { - rta_apply_hostentry(a, rt_find_hostentry(tab, *gw, *ll, dep)); + rta_apply_hostentry(a, rt_get_hostentry(tab, *gw, *ll, dep)); } + /* * CLI commands */ @@ -1942,6 +1960,7 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, ea_list *tm rta *a = e->attrs; int primary = (e->net->routes == e); int sync_error = (e->net->n.flags & KRF_SYNC_ERROR); + void (*get_route_info)(struct rte *, byte *buf, struct ea_list *attrs); struct mpnh *nh; rt_format_via(e, via); @@ -1950,7 +1969,9 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, ea_list *tm bsprintf(from, " from %I", a->from); else from[0] = 0; - if (a->proto->proto->get_route_info || d->verbose) + + get_route_info = a->src->proto->proto->get_route_info; + if (get_route_info || d->verbose) { /* Need to normalize the extended attributes */ ea_list *t = tmpa; @@ -1959,11 +1980,11 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, ea_list *tm ea_merge(t, tmpa); ea_sort(tmpa); } - if 
(a->proto->proto->get_route_info) - a->proto->proto->get_route_info(e, info, tmpa); + if (get_route_info) + get_route_info(e, info, tmpa); else bsprintf(info, " (%d)", e->pref); - cli_printf(c, -1007, "%-18s %s [%s %s%s]%s%s", ia, via, a->proto->name, + cli_printf(c, -1007, "%-18s %s [%s %s%s]%s%s", ia, via, a->src->proto->name, tm, from, primary ? (sync_error ? " !" : " *") : "", info); for (nh = a->nexthops; nh; nh = nh->next) cli_printf(c, -1007, "\tvia %I on %s weight %d", nh->gw, nh->iface->name, nh->weight + 1); @@ -1985,15 +2006,15 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) for(e=n->routes; e; e=e->next) { struct ea_list *tmpa; - struct proto *p0 = e->attrs->proto; + struct rte_src *src = e->attrs->src; struct proto *p1 = d->export_protocol; struct proto *p2 = d->show_protocol; d->rt_counter++; ee = e; rte_update_lock(); /* We use the update buffer for filtering */ - tmpa = p0->make_tmp_attrs ? p0->make_tmp_attrs(e, rte_update_pool) : NULL; + tmpa = make_tmp_attrs(e, rte_update_pool); ok = (d->filter == FILTER_ACCEPT || f_run(d->filter, &e, &tmpa, rte_update_pool, FF_FORCE_TMPATTR) <= F_ACCEPT); - if (p2 && p2 != p0) ok = 0; + if (p2 && p2 != src->proto) ok = 0; if (ok && d->export_mode) { int ic; diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index e5bc84dd..837a6861 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -621,12 +621,14 @@ bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains) return -1; } +/* static void bgp_init_prefix(struct fib_node *N) { struct bgp_prefix *p = (struct bgp_prefix *) N; p->bucket_node.next = NULL; } +*/ static int bgp_compare_u32(const u32 *x, const u32 *y) @@ -870,30 +872,125 @@ bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck) mb_free(buck); } + +/* Prefix hash table */ + +static inline u32 prefix_hash(ip_addr prefix, int pxlen, u32 path_id, u32 order) +{ + u32 x = ipa_hash(prefix) + pxlen + path_id; + return (x * 2902958171u) >> (32 - order); +} + +static 
inline u32 px_hash_size(struct bgp_proto *p) +{ return 1 << p->px_hash_order; } + +void +bgp_init_prefix_table(struct bgp_proto *p, u32 order) +{ + p->px_hash_count = 0; + p->px_hash_order = order; + p->prefix_table = mb_allocz(p->p.pool, px_hash_size(p) * sizeof(struct bgp_prefix *)); + p->prefix_slab = sl_new(p->p.pool, sizeof(struct bgp_prefix)); +} + +static void +bgp_rehash_prefix_table(struct bgp_proto *p, int step) +{ + struct bgp_prefix **old_tab, *px, *px_next; + u32 old_size, hash, i; + + old_tab = p->prefix_table; + old_size = px_hash_size(p); + + p->px_hash_order += step; + p->prefix_table = mb_allocz(p->p.pool, px_hash_size(p) * sizeof(struct bgp_prefix *)); + + for (i = 0; i < old_size; i++) + for (px = old_tab[i]; px; px = px_next) + { + px_next = px->next; + hash = prefix_hash(px->n.prefix, px->n.pxlen, px->path_id, p->px_hash_order); + px->next = p->prefix_table[hash]; + p->prefix_table[hash] = px; + } + + mb_free(old_tab); +} + +static struct bgp_prefix * +bgp_get_prefix(struct bgp_proto *p, ip_addr prefix, int pxlen, u32 path_id) +{ + struct bgp_prefix *bp; + u32 hash = prefix_hash(prefix, pxlen, path_id, p->px_hash_order); + + for (bp = p->prefix_table[hash]; bp; bp = bp->next) + if (bp->n.pxlen == pxlen && ipa_equal(bp->n.prefix, prefix) && bp->path_id == path_id) + return bp; + + bp = sl_alloc(p->prefix_slab); + bp->n.prefix = prefix; + bp->n.pxlen = pxlen; + bp->path_id = path_id; + bp->next = p->prefix_table[hash]; + p->prefix_table[hash] = bp; + + bp->bucket_node.next = NULL; + + p->px_hash_count++; + if ((p->px_hash_count > px_hash_size(p)) && (p->px_hash_order < 18)) + bgp_rehash_prefix_table(p, 1); + + return bp; +} + +void +bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp) +{ + struct bgp_prefix **bpp; + u32 hash = prefix_hash(bp->n.prefix, bp->n.pxlen, bp->path_id, p->px_hash_order); + + for (bpp = &p->prefix_table[hash]; *bpp; *bpp = (*bpp)->next) + if (*bpp == bp) + break; + + *bpp = bp->next; + sl_free(p->prefix_slab, bp); 
+ + p->px_hash_count--; + if ((p->px_hash_count < (px_hash_size(p) / 4)) && (p->px_hash_order > 10)) + bgp_rehash_prefix_table(p, -1); +} + + void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs) { struct bgp_proto *p = (struct bgp_proto *) P; struct bgp_bucket *buck; struct bgp_prefix *px; + rte *key; + u32 path_id; DBG("BGP: Got route %I/%d %s\n", n->n.prefix, n->n.pxlen, new ? "up" : "down"); if (new) { + key = new; buck = bgp_get_bucket(p, n, attrs, new->attrs->source != RTS_BGP); if (!buck) /* Inconsistent attribute list */ return; } else { + key = old; if (!(buck = p->withdraw_bucket)) { buck = p->withdraw_bucket = mb_alloc(P->pool, sizeof(struct bgp_bucket)); init_list(&buck->prefixes); } } - px = fib_get(&p->prefix_fib, &n->n.prefix, n->n.pxlen); + path_id = p->add_path_tx ? key->attrs->src->global_id : 0; + px = bgp_get_prefix(p, n->n.prefix, n->n.pxlen, path_id); if (px->bucket_node.next) { DBG("\tRemoving old entry.\n"); @@ -1021,7 +1118,7 @@ bgp_update_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *p if (rr) { /* Handling route reflection, RFC 4456 */ - struct bgp_proto *src = (struct bgp_proto *) e->attrs->proto; + struct bgp_proto *src = (struct bgp_proto *) e->attrs->src->proto; a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID)); if (!a) @@ -1071,7 +1168,8 @@ bgp_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool * { rte *e = *new; struct bgp_proto *p = (struct bgp_proto *) P; - struct bgp_proto *new_bgp = (e->attrs->proto->proto == &proto_bgp) ? (struct bgp_proto *) e->attrs->proto : NULL; + struct bgp_proto *new_bgp = (e->attrs->src->proto->proto == &proto_bgp) ? 
+ (struct bgp_proto *) e->attrs->src->proto : NULL; if (p == new_bgp) /* Poison reverse updates */ return -1; @@ -1110,7 +1208,7 @@ bgp_get_neighbor(rte *r) if (e && as_path_get_first(e->u.ptr, &as)) return as; else - return ((struct bgp_proto *) r->attrs->proto)->remote_as; + return ((struct bgp_proto *) r->attrs->src->proto)->remote_as; } static inline int @@ -1123,8 +1221,8 @@ rte_resolvable(rte *rt) int bgp_rte_better(rte *new, rte *old) { - struct bgp_proto *new_bgp = (struct bgp_proto *) new->attrs->proto; - struct bgp_proto *old_bgp = (struct bgp_proto *) old->attrs->proto; + struct bgp_proto *new_bgp = (struct bgp_proto *) new->attrs->src->proto; + struct bgp_proto *old_bgp = (struct bgp_proto *) old->attrs->src->proto; eattr *x, *y; u32 n, o; @@ -1258,7 +1356,7 @@ same_group(rte *r, u32 lpref, u32 lasn) static inline int use_deterministic_med(rte *r) { - struct proto *P = r->attrs->proto; + struct proto *P = r->attrs->src->proto; return (P->proto == &proto_bgp) && ((struct bgp_proto *) P)->cf->deterministic_med; } @@ -1543,7 +1641,6 @@ bgp_decode_attrs(struct bgp_conn *conn, byte *attr, unsigned int len, struct lin int withdraw = 0; bzero(a, sizeof(rta)); - a->proto = &bgp->p; a->source = RTS_BGP; a->scope = SCOPE_UNIVERSE; a->cast = RTC_UNICAST; @@ -1752,14 +1849,14 @@ bgp_get_attr(eattr *a, byte *buf, int buflen) } void -bgp_attr_init(struct bgp_proto *p) +bgp_init_bucket_table(struct bgp_proto *p) { p->hash_size = 256; p->hash_limit = p->hash_size * 4; p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *)); init_list(&p->bucket_queue); p->withdraw_bucket = NULL; - fib_init(&p->prefix_fib, p->p.pool, sizeof(struct bgp_prefix), 0, bgp_init_prefix); + // fib_init(&p->prefix_fib, p->p.pool, sizeof(struct bgp_prefix), 0, bgp_init_prefix); } void diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index dbc59eea..f290f227 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -362,7 +362,9 @@ bgp_conn_enter_established_state(struct 
bgp_conn *conn) p->conn = conn; p->last_error_class = 0; p->last_error_code = 0; - bgp_attr_init(conn->bgp); + bgp_init_bucket_table(p); + bgp_init_prefix_table(p, 8); + bgp_conn_set_state(conn, BS_ESTABLISHED); proto_notify_state(&p->p, PS_UP); } @@ -410,8 +412,11 @@ static void bgp_send_open(struct bgp_conn *conn) { conn->start_state = conn->bgp->start_state; - conn->want_as4_support = conn->bgp->cf->enable_as4 && (conn->start_state != BSS_CONNECT_NOCAP); - conn->peer_as4_support = 0; // Default value, possibly changed by receiving capability. + + // Default values, possibly changed by receiving capabilities. + conn->peer_refresh_support = 0; + conn->peer_as4_support = 0; + conn->peer_add_path = 0; conn->advertised_as = 0; DBG("BGP: Sending open\n"); @@ -920,19 +925,17 @@ get_igp_table(struct bgp_config *cf) static struct proto * bgp_init(struct proto_config *C) { - struct bgp_config *c = (struct bgp_config *) C; struct proto *P = proto_new(C, sizeof(struct bgp_proto)); + struct bgp_config *c = (struct bgp_config *) C; struct bgp_proto *p = (struct bgp_proto *) P; P->accept_ra_types = c->secondary ? RA_ACCEPTED : RA_OPTIMAL; P->rt_notify = bgp_rt_notify; - P->rte_better = bgp_rte_better; P->import_control = bgp_import_control; P->neigh_notify = bgp_neigh_notify; P->reload_routes = bgp_reload_routes; - - if (c->deterministic_med) - P->rte_recalculate = bgp_rte_recalculate; + P->rte_better = bgp_rte_better; + P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL; p->cf = c; p->local_as = c->local_as; @@ -1176,15 +1179,19 @@ bgp_show_proto_info(struct proto *P) else if (P->proto_state == PS_UP) { cli_msg(-1006, " Neighbor ID: %R", p->remote_id); - cli_msg(-1006, " Neighbor caps: %s%s", + cli_msg(-1006, " Neighbor caps: %s%s%s%s", c->peer_refresh_support ? " refresh" : "", - c->peer_as4_support ? " AS4" : ""); - cli_msg(-1006, " Session: %s%s%s%s%s", + c->peer_as4_support ? " AS4" : "", + (c->peer_add_path & ADD_PATH_RX) ? 
" add-path-rx" : "", + (c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : ""); + cli_msg(-1006, " Session: %s%s%s%s%s%s%s", p->is_internal ? "internal" : "external", p->cf->multihop ? " multihop" : "", p->rr_client ? " route-reflector" : "", p->rs_client ? " route-server" : "", - p->as4_session ? " AS4" : ""); + p->as4_session ? " AS4" : "", + p->add_path_rx ? " add-path-rx" : "", + p->add_path_tx ? " add-path-tx" : ""); cli_msg(-1006, " Source address: %I", p->source_addr); if (P->cf->in_limit) cli_msg(-1006, " Route limit: %d/%d", diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index c3adf254..b87de46e 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -43,6 +43,7 @@ struct bgp_config { int passive; /* Do not initiate outgoing connection */ int interpret_communities; /* Hardwired handling of well-known communities */ int secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */ + int add_path; /* Use ADD-PATH extension [draft] */ unsigned connect_retry_time; unsigned hold_time, initial_hold_time; unsigned keepalive_time; @@ -62,6 +63,11 @@ struct bgp_config { #define GW_DIRECT 1 #define GW_RECURSIVE 2 +#define ADD_PATH_RX 1 +#define ADD_PATH_TX 2 +#define ADD_PATH_FULL 3 + + struct bgp_conn { struct bgp_proto *bgp; struct birdsock *sk; @@ -75,9 +81,9 @@ struct bgp_conn { byte *notify_data; u32 advertised_as; /* Temporary value for AS number received */ int start_state; /* protocol start_state snapshot when connection established */ - int want_as4_support; /* Connection tries to establish AS4 session */ - int peer_as4_support; /* Peer supports 4B AS numbers [RFC4893] */ - int peer_refresh_support; /* Peer supports route refresh [RFC2918] */ + u8 peer_refresh_support; /* Peer supports route refresh [RFC2918] */ + u8 peer_as4_support; /* Peer supports 4B AS numbers [RFC4893] */ + u8 peer_add_path; /* Peer supports ADD-PATH [draft] */ unsigned hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */ }; @@ -86,8 +92,10 @@ struct 
bgp_proto { struct bgp_config *cf; /* Shortcut to BGP configuration */ u32 local_as, remote_as; int start_state; /* Substates that partitions BS_START */ - int is_internal; /* Internal BGP connection (local_as == remote_as) */ - int as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */ + u8 is_internal; /* Internal BGP connection (local_as == remote_as) */ + u8 as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */ + u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */ + u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */ u32 local_id; /* BGP identifier of this router */ u32 remote_id; /* BGP identifier of the neighbor */ u32 rr_cluster_id; /* Route reflector cluster ID */ @@ -104,7 +112,10 @@ struct bgp_proto { struct timer *startup_timer; /* Timer used to delay protocol startup due to previous errors (startup_delay) */ struct bgp_bucket **bucket_hash; /* Hash table of attribute buckets */ unsigned int hash_size, hash_count, hash_limit; - struct fib prefix_fib; /* Prefixes to be sent */ + // struct fib prefix_fib; /* Prefixes to be sent */ + struct bgp_prefix **prefix_table; /* Prefixes to be sent */ + slab *prefix_slab; /* Slab holding prefix nodes */ + u32 px_hash_order, px_hash_count; list bucket_queue; /* Queue of buckets to send */ struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ unsigned startup_delay; /* Time to delay protocol startup by due to errors */ @@ -120,7 +131,12 @@ struct bgp_proto { }; struct bgp_prefix { - struct fib_node n; /* Node in prefix fib */ + struct { + ip_addr prefix; + int pxlen; + } n; + u32 path_id; + struct bgp_prefix *next; node bucket_node; /* Node in per-bucket list */ }; @@ -154,6 +170,9 @@ void bgp_conn_enter_idle_state(struct bgp_conn *conn); void bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code); void bgp_stop(struct bgp_proto *p, unsigned subcode); +struct rte_source *bgp_find_source(struct 
bgp_proto *p, u32 path_id); +struct rte_source *bgp_get_source(struct bgp_proto *p, u32 path_id); + #ifdef LOCAL_DEBUG @@ -189,9 +208,11 @@ int bgp_rte_better(struct rte *, struct rte *); int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best); void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs); int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *); -void bgp_attr_init(struct bgp_proto *); -unsigned int bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains); +void bgp_init_bucket_table(struct bgp_proto *); void bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck); +void bgp_init_prefix_table(struct bgp_proto *p, u32 order); +void bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp); +unsigned int bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains); void bgp_get_route_info(struct rte *, byte *buf, struct ea_list *attrs); inline static void bgp_attach_attr_ip(struct ea_list **to, struct linpool *pool, unsigned attr, ip_addr a) diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 8b80d7fd..0b096339 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -26,7 +26,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC, - SECONDARY) + SECONDARY, ADD, PATHS, RX, TX) CF_GRAMMAR @@ -107,6 +107,9 @@ bgp_proto: | bgp_proto PASSIVE bool ';' { BGP_CFG->passive = $3; } | bgp_proto INTERPRET COMMUNITIES bool ';' { BGP_CFG->interpret_communities = $4; } | bgp_proto SECONDARY bool ';' { BGP_CFG->secondary = $3; } + | bgp_proto ADD PATHS RX ';' { BGP_CFG->add_path = ADD_PATH_RX; } + | bgp_proto ADD PATHS TX ';' { BGP_CFG->add_path = ADD_PATH_TX; } + | bgp_proto ADD PATHS bool ';' { 
BGP_CFG->add_path = $4 ? ADD_PATH_FULL : 0; } | bgp_proto IGP TABLE rtable ';' { BGP_CFG->igp_table = $4; } | bgp_proto TTL SECURITY bool ';' { BGP_CFG->ttl_security = $4; } ; diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index cfa37fb5..3fae2c24 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -159,6 +159,21 @@ bgp_put_cap_as4(struct bgp_conn *conn, byte *buf) } static byte * +bgp_put_cap_add_path(struct bgp_conn *conn, byte *buf) +{ + *buf++ = 69; /* Capability 69: Support for ADD-PATH */ + *buf++ = 4; /* Capability data length */ + + *buf++ = 0; /* Appropriate AF */ + *buf++ = BGP_AF; + *buf++ = 1; /* SAFI 1 */ + + *buf++ = conn->bgp->cf->add_path; + + return buf; +} + +static byte * bgp_create_open(struct bgp_conn *conn, byte *buf) { struct bgp_proto *p = conn->bgp; @@ -194,9 +209,12 @@ bgp_create_open(struct bgp_conn *conn, byte *buf) if (p->cf->enable_refresh) cap = bgp_put_cap_rr(conn, cap); - if (conn->want_as4_support) + if (p->cf->enable_as4) cap = bgp_put_cap_as4(conn, cap); + if (p->cf->add_path) + cap = bgp_put_cap_add_path(conn, cap); + cap_len = cap - buf - 12; if (cap_len > 0) { @@ -223,6 +241,13 @@ bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, unsig { struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes)); DBG("\tDequeued route %I/%d\n", px->n.prefix, px->n.pxlen); + + if (p->add_path_tx) + { + put_u32(w, px->path_id); + w += 4; + } + *w++ = px->n.pxlen; bytes = (px->n.pxlen + 7) / 8; a = px->n.prefix; @@ -231,7 +256,8 @@ bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, unsig w += bytes; remains -= bytes + 1; rem_node(&px->bucket_node); - fib_delete(&p->prefix_fib, px); + bgp_free_prefix(p, px); + // fib_delete(&p->prefix_fib, px); } return w - start; } @@ -244,7 +270,8 @@ bgp_flush_prefixes(struct bgp_proto *p, struct bgp_bucket *buck) struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes)); log(L_ERR "%s: - 
route %I/%d skipped", p->p.name, px->n.prefix, px->n.pxlen); rem_node(&px->bucket_node); - fib_delete(&p->prefix_fib, px); + bgp_free_prefix(p, px); + // fib_delete(&p->prefix_fib, px); } } @@ -626,7 +653,7 @@ void bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) { // struct bgp_proto *p = conn->bgp; - int cl; + int i, cl; while (len > 0) { @@ -643,14 +670,25 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) conn->peer_refresh_support = 1; break; - case 65: /* AS4 capability, RFC 4893 */ + case 65: /* AS4 capability, RFC 4893 */ if (cl != 4) goto err; conn->peer_as4_support = 1; - if (conn->want_as4_support) + if (conn->bgp->cf->enable_as4) conn->advertised_as = get_u32(opt + 2); break; + case 69: /* ADD-PATH capability, draft */ + if (cl % 4) + goto err; + for (i = 0; i < cl; i += 4) + if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */ + conn->peer_add_path = opt[2+i+3]; + if (conn->peer_add_path > ADD_PATH_FULL) + goto err; + + break; + /* We can safely ignore all other capabilities */ } len -= 2 + cl; @@ -789,7 +827,12 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) conn->hold_time = MIN(hold, p->cf->hold_time); conn->keepalive_time = p->cf->keepalive_time ? 
: conn->hold_time / 3; p->remote_id = id; - p->as4_session = conn->want_as4_support && conn->peer_as4_support; + p->as4_session = p->cf->enable_as4 && conn->peer_as4_support; + p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX); + p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX); + + if (p->add_path_tx) + p->p.accept_ra_types = RA_ANY; DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, p->as4_session); @@ -799,6 +842,13 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) } #define DECODE_PREFIX(pp, ll) do { \ + if (p->add_path_rx) \ + { \ + if (ll < 5) { err=1; goto done; } \ + path_id = get_u32(pp); \ + pp += 4; \ + ll -= 4; \ + } \ int b = *pp++; \ int q; \ ll--; \ @@ -813,6 +863,53 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len) pxlen = b; \ } while (0) + +static inline void +bgp_rte_update(struct bgp_proto *p, ip_addr prefix, int pxlen, + u32 path_id, u32 *last_id, struct rte_src **src, + rta *a0, rta **a) +{ + if (path_id != *last_id) + { + *src = rt_get_source(&p->p, path_id); + *last_id = path_id; + + if (*a) + { + rta_free(*a); + *a = NULL; + } + } + + /* Prepare cached route attributes */ + if (!*a) + { + a0->src = *src; + *a = rta_lookup(a0); + } + + net *n = net_get(p->p.table, prefix, pxlen); + rte *e = rte_get_temp(rta_clone(*a)); + e->net = n; + e->pflags = 0; + e->u.bgp.suppressed = 0; + rte_update2(p->p.main_ahook, n, e, *src); +} + +static inline void +bgp_rte_withdraw(struct bgp_proto *p, ip_addr prefix, int pxlen, + u32 path_id, u32 *last_id, struct rte_src **src) +{ + if (path_id != *last_id) + { + *src = rt_find_source(&p->p, path_id); + *last_id = path_id; + } + + net *n = net_find(p->p.table, prefix, pxlen); + rte_update2( p->p.main_ahook, n, NULL, *src); +} + static inline int bgp_set_next_hop(struct bgp_proto *p, rta *a) { @@ -871,18 +968,20 @@ 
bgp_do_rx_update(struct bgp_conn *conn, byte *attrs, int attr_len) { struct bgp_proto *p = conn->bgp; - net *n; - rta *a0, *a = NULL; + struct rte_src *src = p->p.main_source; + rta *a0, *a; ip_addr prefix; int pxlen, err = 0; + u32 path_id = 0; + u32 last_id = 0; /* Withdraw routes */ while (withdrawn_len) { DECODE_PREFIX(withdrawn, withdrawn_len); DBG("Withdraw %I/%d\n", prefix, pxlen); - if (n = net_find(p->p.table, prefix, pxlen)) - rte_update(p->p.table, n, &p->p, &p->p, NULL); + + bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); } if (!attr_len && !nlri_len) /* shortcut */ @@ -893,28 +992,22 @@ bgp_do_rx_update(struct bgp_conn *conn, if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */ return; - if (a0 && nlri_len && bgp_set_next_hop(p, a0)) - a = rta_lookup(a0); + if (a0 && ! bgp_set_next_hop(p, a0)) + a0 = NULL; + + a = NULL; + last_id = 0; + src = p->p.main_source; while (nlri_len) { DECODE_PREFIX(nlri, nlri_len); DBG("Add %I/%d\n", prefix, pxlen); - if (a) - { - rte *e = rte_get_temp(rta_clone(a)); - e->net = net_get(p->p.table, prefix, pxlen); - e->pflags = 0; - e->u.bgp.suppressed = 0; - rte_update(p->p.table, e->net, &p->p, &p->p, e); - } - else - { - /* Forced withdraw as a result of soft error */ - if (n = net_find(p->p.table, prefix, pxlen)) - rte_update(p->p.table, n, &p->p, &p->p, NULL); - } + if (a0) + bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a); + else /* Forced withdraw as a result of soft error */ + bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); } done: @@ -970,13 +1063,15 @@ bgp_do_rx_update(struct bgp_conn *conn, byte *attrs, int attr_len) { struct bgp_proto *p = conn->bgp; + struct rte_src *src = p->p.main_source; byte *start, *x; int len, len0; unsigned af, sub; - net *n; - rta *a0, *a = NULL; + rta *a0, *a; ip_addr prefix; int pxlen, err = 0; + u32 path_id = 0; + u32 last_id = 0; p->mp_reach_len = 0; p->mp_unreach_len = 0; @@ -991,8 +1086,7 @@ bgp_do_rx_update(struct bgp_conn 
*conn, { DECODE_PREFIX(x, len); DBG("Withdraw %I/%d\n", prefix, pxlen); - if (n = net_find(p->p.table, prefix, pxlen)) - rte_update(p->p.table, n, &p->p, &p->p, NULL); + bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); } } @@ -1009,28 +1103,22 @@ bgp_do_rx_update(struct bgp_conn *conn, len -= *x + 2; x += *x + 2; - if (a0 && bgp_set_next_hop(p, a0)) - a = rta_lookup(a0); + if (a0 && ! bgp_set_next_hop(p, a0)) + a0 = NULL; + + a = NULL; + last_id = 0; + src = p->p.main_source; while (len) { DECODE_PREFIX(x, len); DBG("Add %I/%d\n", prefix, pxlen); - if (a) - { - rte *e = rte_get_temp(rta_clone(a)); - e->net = net_get(p->p.table, prefix, pxlen); - e->pflags = 0; - e->u.bgp.suppressed = 0; - rte_update(p->p.table, e->net, &p->p, &p->p, e); - } - else - { - /* Forced withdraw as a result of soft error */ - if (n = net_find(p->p.table, prefix, pxlen)) - rte_update(p->p.table, n, &p->p, &p->p, NULL); - } + if (a0) + bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a); + else /* Forced withdraw as a result of soft error */ + bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); } } diff --git a/proto/ospf/ospf.c b/proto/ospf/ospf.c index aa62da14..1aa7407a 100644 --- a/proto/ospf/ospf.c +++ b/proto/ospf/ospf.c @@ -298,14 +298,14 @@ ospf_init(struct proto_config *c) { struct proto *p = proto_new(c, sizeof(struct proto_ospf)); - p->make_tmp_attrs = ospf_make_tmp_attrs; - p->store_tmp_attrs = ospf_store_tmp_attrs; - p->import_control = ospf_import_control; - p->reload_routes = ospf_reload_routes; p->accept_ra_types = RA_OPTIMAL; p->rt_notify = ospf_rt_notify; p->if_notify = ospf_if_notify; p->ifa_notify = ospf_ifa_notify; + p->import_control = ospf_import_control; + p->reload_routes = ospf_reload_routes; + p->make_tmp_attrs = ospf_make_tmp_attrs; + p->store_tmp_attrs = ospf_store_tmp_attrs; p->rte_better = ospf_rte_better; p->rte_same = ospf_rte_same; @@ -502,7 +502,7 @@ ospf_import_control(struct proto *p, rte ** new, ea_list ** attrs, 
struct ospf_area *oa = ospf_main_area((struct proto_ospf *) p); rte *e = *new; - if (p == e->attrs->proto) + if (e->attrs->src->proto == p) return -1; /* Reject our own routes */ if (oa_is_stub(oa)) diff --git a/proto/ospf/rt.c b/proto/ospf/rt.c index 4b8de4b8..1053fd07 100644 --- a/proto/ospf/rt.c +++ b/proto/ospf/rt.c @@ -1987,10 +1987,10 @@ again1: if (nf->n.type) /* Add the route */ { rta a0 = { - .proto = p, + .src = p->main_source, .source = nf->n.type, .scope = SCOPE_UNIVERSE, - .cast = RTC_UNICAST, + .cast = RTC_UNICAST }; if (nf->n.nhs->next) @@ -2028,7 +2028,7 @@ again1: DBG("Mod rte type %d - %I/%d via %I on iface %s, met %d\n", a0.source, nf->fn.prefix, nf->fn.pxlen, a0.gw, a0.iface ? a0.iface->name : "(none)", nf->n.metric1); - rte_update(p->table, ne, p, p, e); + rte_update(p, ne, e); } } else if (nf->old_rta) @@ -2038,7 +2038,7 @@ again1: nf->old_rta = NULL; net *ne = net_get(p->table, nf->fn.prefix, nf->fn.pxlen); - rte_update(p->table, ne, p, p, NULL); + rte_update(p, ne, NULL); } /* Remove unused rt entry. Entries with fn.x0 == 1 are persistent. */ diff --git a/proto/pipe/pipe.c b/proto/pipe/pipe.c index 6099d284..5bae8614 100644 --- a/proto/pipe/pipe.c +++ b/proto/pipe/pipe.c @@ -49,7 +49,7 @@ pipe_rt_notify(struct proto *P, rtable *src_table, net *n, rte *new, rte *old, e struct pipe_proto *p = (struct pipe_proto *) P; struct announce_hook *ah = (src_table == P->table) ? 
p->peer_ahook : P->main_ahook; rtable *dst_table = ah->table; - struct proto *src; + struct rte_src *src; net *nn; rte *e; @@ -72,7 +72,7 @@ pipe_rt_notify(struct proto *P, rtable *src_table, net *n, rte *new, rte *old, e if (p->mode == PIPE_OPAQUE) { - a.proto = &p->p; + a.src = P->main_source; a.source = RTS_PIPE; } @@ -91,16 +91,16 @@ pipe_rt_notify(struct proto *P, rtable *src_table, net *n, rte *new, rte *old, e e->pflags = new->pflags; } - src = new->attrs->proto; + src = a.src; } else { e = NULL; - src = old->attrs->proto; + src = old->attrs->src; } src_table->pipe_busy = 1; - rte_update2(ah, nn, e, (p->mode == PIPE_OPAQUE) ? &p->p : src); + rte_update2(ah, nn, e, src); src_table->pipe_busy = 0; } @@ -173,6 +173,12 @@ pipe_start(struct proto *P) p->peer_ahook->in_limit = cf->c.out_limit; proto_reset_limit(p->peer_ahook->in_limit); + if (p->mode == PIPE_OPAQUE) + { + P->main_source = rt_get_source(P, 0); + rt_lock_source(P->main_source); + } + return PS_UP; } @@ -187,6 +193,10 @@ pipe_cleanup(struct proto *P) P->main_ahook = NULL; p->peer_ahook = NULL; + if (p->mode == PIPE_OPAQUE) + rt_unlock_source(P->main_source); + P->main_source = NULL; + rt_unlock_table(P->table); rt_unlock_table(p->peer_table); } diff --git a/proto/rip/rip.c b/proto/rip/rip.c index 281296a5..9f4f0856 100644 --- a/proto/rip/rip.c +++ b/proto/rip/rip.c @@ -263,16 +263,18 @@ find_interface(struct proto *p, struct iface *what) * This part is responsible for any updates that come from network */ +static int rip_rte_better(struct rte *new, struct rte *old); + static void rip_rte_update_if_better(rtable *tab, net *net, struct proto *p, rte *new) { rte *old; - old = rte_find(net, p); - if (!old || p->rte_better(new, old) || + old = rte_find(net, p->main_source); + if (!old || rip_rte_better(new, old) || (ipa_equal(old->attrs->from, new->attrs->from) && (old->u.rip.metric != new->u.rip.metric)) ) - rte_update(tab, net, p, p, new); + rte_update(p, net, new); else rte_free(new); } @@ -295,7 
+297,7 @@ advertise_entry( struct proto *p, struct rip_block *b, ip_addr whotoldme, struct int pxlen; bzero(&A, sizeof(A)); - A.proto = p; + A.src= p->main_source; A.source = RTS_RIP; A.scope = SCOPE_UNIVERSE; A.cast = RTC_UNICAST; @@ -604,20 +606,10 @@ rip_start(struct proto *p) add_head( &P->interfaces, NODE rif ); CHK_MAGIC; - rip_init_instance(p); - DBG( "RIP: ...done\n"); return PS_UP; } -static struct proto * -rip_init(struct proto_config *cfg) -{ - struct proto *p = proto_new(cfg, sizeof(struct rip_proto)); - - return p; -} - static void rip_dump(struct proto *p) { @@ -843,7 +835,7 @@ rip_gen_attrs(struct linpool *pool, int metric, u16 tag) static int rip_import_control(struct proto *p, struct rte **rt, struct ea_list **attrs, struct linpool *pool) { - if ((*rt)->attrs->proto == p) /* My own must not be touched */ + if ((*rt)->attrs->src->proto == p) /* My own must not be touched */ return 1; if ((*rt)->attrs->source != RTS_RIP) { @@ -895,7 +887,7 @@ rip_rt_notify(struct proto *p, struct rtable *table UNUSED, struct network *net, if (e->metric > P_CF->infinity) e->metric = P_CF->infinity; - if (new->attrs->proto == p) + if (new->attrs->src->proto == p) e->whotoldme = new->attrs->from; if (!e->metric) /* That's okay: this way user can set his own value for external @@ -917,7 +909,7 @@ rip_rte_same(struct rte *new, struct rte *old) static int rip_rte_better(struct rte *new, struct rte *old) { - struct proto *p = new->attrs->proto; + struct proto *p = new->attrs->src->proto; if (ipa_equal(old->attrs->from, new->attrs->from)) return 1; @@ -928,7 +920,7 @@ rip_rte_better(struct rte *new, struct rte *old) if (old->u.rip.metric > new->u.rip.metric) return 1; - if (old->attrs->proto == new->attrs->proto) /* This does not make much sense for different protocols */ + if (old->attrs->src->proto == new->attrs->src->proto) /* This does not make much sense for different protocols */ if ((old->u.rip.metric == new->u.rip.metric) && ((now - old->lastmod) > 
(P_CF->timeout_time / 2))) return 1; @@ -944,7 +936,7 @@ rip_rte_better(struct rte *new, struct rte *old) static void rip_rte_insert(net *net UNUSED, rte *rte) { - struct proto *p = rte->attrs->proto; + struct proto *p = rte->attrs->src->proto; CHK_MAGIC; DBG( "rip_rte_insert: %p\n", rte ); add_head( &P->garbage, &rte->u.rip.garbage ); @@ -962,9 +954,11 @@ rip_rte_remove(net *net UNUSED, rte *rte) rem_node( &rte->u.rip.garbage ); } -void -rip_init_instance(struct proto *p) +static struct proto * +rip_init(struct proto_config *cfg) { + struct proto *p = proto_new(cfg, sizeof(struct rip_proto)); + p->accept_ra_types = RA_OPTIMAL; p->if_notify = rip_if_notify; p->rt_notify = rip_rt_notify; @@ -975,6 +969,8 @@ rip_init_instance(struct proto *p) p->rte_same = rip_rte_same; p->rte_insert = rip_rte_insert; p->rte_remove = rip_rte_remove; + + return p; } void diff --git a/proto/rip/rip.h b/proto/rip/rip.h index 896fab64..6a8af379 100644 --- a/proto/rip/rip.h +++ b/proto/rip/rip.h @@ -172,7 +172,6 @@ struct rip_proto { #endif -void rip_init_instance(struct proto *p); void rip_init_config(struct rip_proto_config *c); /* Authentication functions */ diff --git a/proto/static/static.c b/proto/static/static.c index 6a027f50..b6c8948f 100644 --- a/proto/static/static.c +++ b/proto/static/static.c @@ -67,7 +67,7 @@ static_install(struct proto *p, struct static_route *r, struct iface *ifa) DBG("Installing static route %I/%d, rtd=%d\n", r->net, r->masklen, r->dest); bzero(&a, sizeof(a)); - a.proto = p; + a.src = p->main_source; a.source = (r->dest == RTD_DEVICE) ? 
RTS_STATIC_DEVICE : RTS_STATIC; a.scope = SCOPE_UNIVERSE; a.cast = RTC_UNICAST; @@ -113,7 +113,7 @@ static_install(struct proto *p, struct static_route *r, struct iface *ifa) e = rte_get_temp(aa); e->net = n; e->pflags = 0; - rte_update(p->table, n, p, p, e); + rte_update(p, n, e); r->installed = 1; } @@ -127,8 +127,7 @@ static_remove(struct proto *p, struct static_route *r) DBG("Removing static route %I/%d\n", r->net, r->masklen); n = net_find(p->table, r->net, r->masklen); - if (n) - rte_update(p->table, n, p, p, NULL); + rte_update(p, n, NULL); r->installed = 0; } @@ -367,6 +366,7 @@ static_init(struct proto_config *c) p->neigh_notify = static_neigh_notify; p->if_notify = static_if_notify; + return p; } diff --git a/sysdep/bsd/krt-sock.c b/sysdep/bsd/krt-sock.c index e970d6bd..8d45cbfe 100644 --- a/sysdep/bsd/krt-sock.c +++ b/sysdep/bsd/krt-sock.c @@ -314,7 +314,7 @@ krt_read_rt(struct ks_msg *msg, struct krt_proto *p, int scan) net = net_get(p->p.table, idst, pxlen); rta a = { - .proto = &p->p, + .src = p->p.main_source, .source = RTS_INHERIT, .scope = SCOPE_UNIVERSE, .cast = RTC_UNICAST diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index d1b203ef..791f715e 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -804,7 +804,7 @@ nl_parse_route(struct nlmsghdr *h, int scan) net *net = net_get(p->p.table, dst, i->rtm_dst_len); rta ra = { - .proto = &p->p, + .src= p->p.main_source, .source = RTS_INHERIT, .scope = SCOPE_UNIVERSE, .cast = RTC_UNICAST diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index 2bd1bc44..497d328d 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -340,15 +340,14 @@ krt_learn_announce_update(struct krt_proto *p, rte *e) ee->pflags = 0; ee->pref = p->p.preference; ee->u.krt = e->u.krt; - rte_update(p->p.table, nn, &p->p, &p->p, ee); + rte_update(&p->p, nn, ee); } static void krt_learn_announce_delete(struct krt_proto *p, net *n) { n = net_find(p->p.table, n->n.prefix, n->n.pxlen); - if (n) - 
rte_update(p->p.table, n, &p->p, &p->p, NULL); + rte_update(&p->p, n, NULL); } /* Called when alien route is discovered during scan */ @@ -692,7 +691,7 @@ krt_export_rte(struct krt_proto *p, rte **new, ea_list **tmpa) if (filter == FILTER_ACCEPT) return 1; - struct proto *src = (*new)->attrs->proto; + struct proto *src = (*new)->attrs->src->proto; *tmpa = src->make_tmp_attrs ? src->make_tmp_attrs(*new, krt_filter_lp) : NULL; return f_run(filter, new, tmpa, krt_filter_lp, FF_FORCE_TMPATTR) <= F_ACCEPT; } @@ -874,7 +873,7 @@ krt_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool * struct krt_proto *p = (struct krt_proto *) P; rte *e = *new; - if (e->attrs->proto == P) + if (e->attrs->src->proto == P) return -1; if (!KRT_CF->devroutes && @@ -926,10 +925,10 @@ krt_init(struct proto_config *c) struct krt_proto *p = proto_new(c, sizeof(struct krt_proto)); p->p.accept_ra_types = RA_OPTIMAL; - p->p.make_tmp_attrs = krt_make_tmp_attrs; - p->p.store_tmp_attrs = krt_store_tmp_attrs; p->p.import_control = krt_import_control; p->p.rt_notify = krt_notify; + p->p.make_tmp_attrs = krt_make_tmp_attrs; + p->p.store_tmp_attrs = krt_store_tmp_attrs; p->p.rte_same = krt_rte_same; krt_sys_init(p); |