diff options
149 files changed, 13922 insertions, 8367 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index d1a824ff..1c8aa869 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -140,18 +140,10 @@ build-fedora-34-amd64: <<: *build-linux image: registry.nic.cz/labs/bird:fedora-33-amd64 -build-centos-7-amd64: - <<: *build-linux - image: registry.nic.cz/labs/bird:centos-7-amd64 - build-centos-8-amd64: <<: *build-linux image: registry.nic.cz/labs/bird:centos-8-amd64 -build-ubuntu-14_04-amd64: - <<: *build-linux - image: registry.nic.cz/labs/bird:ubuntu-14.04-amd64 - build-ubuntu-16_04-amd64: <<: *build-linux image: registry.nic.cz/labs/bird:ubuntu-16.04-amd64 @@ -292,14 +284,7 @@ pkg-fedora-33-amd64: pkg-fedora-34-amd64: <<: *pkg-rpm needs: [build-fedora-34-amd64] - image: registry.nic.cz/labs/bird:fedora-34-amd64 - -pkg-centos-7-amd64: - <<: *pkg-rpm-wa - variables: - LC_ALL: en_US.UTF-8 - needs: [build-centos-7-amd64] - image: registry.nic.cz/labs/bird:centos-7-amd64 + image: registry.labs.nic.cz/labs/bird:fedora-34-amd64 pkg-centos-8-amd64: <<: *pkg-rpm-wa @@ -368,7 +353,7 @@ build-birdlab: script: - cd $TOOLS_DIR/netlab - sudo ./stop - - sudo ./runtest -s v2 -m check $TEST_NAME + - sudo ./runtest -s v3 -m check $TEST_NAME test-ospf-base: <<: *test-base @@ -27,9 +27,9 @@ Requirements For compiling BIRD you need these programs and libraries: - - GNU C Compiler (or LLVM Clang) + - GNU C Compiler (or LLVM Clang) capable of compiling C11 code - GNU Make - - GNU Bison + - GNU Bison (at least 3.0) - GNU M4 - Flex @@ -1,5 +1,32 @@ dnl ** Additional Autoconf tests for BIRD configure script dnl ** (c) 1999 Martin Mares <mj@ucw.cz> +dnl ** (c) 2021 Maria Matejka <mq@jmq.cz> + +AC_DEFUN([BIRD_CHECK_POINTER_ALIGNMENT], +[ + AC_CACHE_CHECK( + [how pointers are aligned], + [bird_cv_pointer_alignment], + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM( + [ + _Static_assert(_Alignof(void *) == 8, "bad"); + ], [] + ) + ], + [bird_cv_pointer_alignment=8], + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM( + [ + _Static_assert(_Alignof(void *) == 4, "bad"); + ], [] + ) + ], + [bird_cv_pointer_alignment=4], + [bird_cv_pointer_alignment=unknown] + )) + ) +]) AC_DEFUN([BIRD_CHECK_THREAD_LOCAL], [ @@ -9,14 +36,23 @@ AC_DEFUN([BIRD_CHECK_THREAD_LOCAL], AC_COMPILE_IFELSE([ AC_LANG_PROGRAM( [ - _Thread_local static int x = 42; + static _Thread_local int x = 42; ], [] ) ], [bird_cv_thread_local=yes], + [AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM( + [ + static __thread int x = 42; + ], + [] + ) + ], + [bird_cv_thread_local=__thread], [bird_cv_thread_local=no] - ) + )]) ) ]) diff --git a/bird-gdb.py b/bird-gdb.py index 3cf65a9c..c53a3b55 100644 --- a/bird-gdb.py +++ b/bird-gdb.py @@ -4,9 +4,10 @@ class BIRDPrinter: @classmethod def lookup(cls, val): - if val.type.code != cls.typeCode: + t = val.type.strip_typedefs() + if t.code != cls.typeCode: return None - if val.type.tag != cls.typeTag: + if t.tag != cls.typeTag: return None return cls(val) @@ -25,7 +26,6 @@ class BIRDFValPrinter(BIRDPrinter): "T_ENUM_RTS": "i", "T_ENUM_BGP_ORIGIN": "i", "T_ENUM_SCOPE": "i", - "T_ENUM_RTC": "i", "T_ENUM_RTD": "i", "T_ENUM_ROA": "i", "T_ENUM_NETTYPE": "i", @@ -123,7 +123,7 @@ class BIRDFLinePrinter(BIRDPrinter): "n": n, "code": str(self.val['items'][n]['fi_code']), } if n % 8 == 0 else str(self.val['items'][n]['fi_code']) for n in range(cnt)])) - + class BIRDFExecStackPrinter(BIRDPrinter): "Print BIRD's struct f_exec_stack" @@ -141,6 +141,317 @@ class BIRDFExecStackPrinter(BIRDPrinter): "n": n } for n in range(cnt-1, -1, -1) ]) + +class BIRD: + def skip_back(t, i, v): + if isinstance(t, str): + t = gdb.lookup_type(t) + elif isinstance(t, gdb.Value): + t = gdb.lookup_type(t.string()) + elif not isinstance(t, gdb.Type): + raise Exception(f"First argument of skip_back(t, i, v) must be a type, got {type(t)}") + + t = t.strip_typedefs() + nullptr = gdb.Value(0).cast(t.pointer()) + + if isinstance(i, gdb.Value): + i = i.string() + elif not isinstance(i, str): + raise Exception(f"Second argument of skip_back(t, i, v) must be a item name, got {type(i)}") + + if not isinstance(v, gdb.Value): + raise Exception(f"Third argument of skip_back(t, i, v) must be a value, got {type(v)}") + if v.type.code != gdb.TYPE_CODE_PTR and v.type.code != gdb.TYPE_CODE_REF: + raise Exception(f"Third argument of skip_back(t, i, v) must be a pointer, is {v.type} ({v.type.code})") + if v.type.target().strip_typedefs() != nullptr[i].type: + raise Exception(f"Third argument of skip_back(t, i, v) points to type {v.type.target().strip_typedefs()}, should be {nullptr[i].type}") + + uintptr_t = gdb.lookup_type("uintptr_t") + taddr = v.dereference().address.cast(uintptr_t) - nullptr[i].address.cast(uintptr_t) + return gdb.Value(taddr).cast(t.pointer()) + + class skip_back_gdb(gdb.Function): + "Given address of a structure item, returns address of the structure, as the SKIP_BACK macro does" + def __init__(self): + gdb.Function.__init__(self, "SKIP_BACK") + + def invoke(self, t, i, v): + return BIRD.skip_back(t, i, v) + + +BIRD.skip_back_gdb() + + +class BIRDList: + def __init__(self, val): + ltype = val.type.strip_typedefs() + if ltype.code != gdb.TYPE_CODE_UNION or ltype.tag != "list": + raise Exception(f"Not a list, is type {ltype}") + + self.head = val["head"] + self.tail_node = val["tail_node"] + + if str(self.head.address) == '0x0': + raise Exception("List head is NULL") + + if str(self.tail_node["prev"].address) == '0x0': + raise Exception("List tail is NULL") + + def walk(self, do): + cur = self.head + while cur.dereference() != self.tail_node: + do(cur) + cur = cur.dereference()["next"] + + +class BIRDListLength(gdb.Function): + """Returns length of the list, as in + print $list_length(routing_tables)""" + def __init__(self): + super(BIRDListLength, self).__init__("list_length") + + def count(self, _): + self.cnt += 1 + + def invoke(self, l): + self.cnt = 0 + BIRDList(l).walk(self.count) + return self.cnt + +BIRDListLength() + +class BIRDListItem(gdb.Function): + """Returns n-th item of the list.""" + def __init__(self): + super(BIRDListItem, self).__init__("list_item") + + class BLException(Exception): + def __init__(self, node, msg): + Exception.__init__(self, msg) + self.node = node + + def count(self, node): + if self.cnt == self.pos: + raise self.BLException(node, "Node found") + + self.cnt += 1 + + def invoke(self, l, n, t=None, item="n"): + self.cnt = 0 + self.pos = n + bl = BIRDList(l) + try: + bl.walk(self.count) + except self.BLException as e: + if t is None: + return e.node + else: + return BIRD.skip_back(t, item, e.node) + + raise Exception("List too short") + +BIRDListItem() + +class BIRDResourceSize(): + def __init__(self, netto, overhead, free): + self.netto = netto + self.overhead = overhead + self.free = free + + def __str__(self): + ns = str(self.netto) + os = str(self.overhead) + fs = str(self.free) + + return "{: >12s} | {: >12s} | {: >12s}".format(ns, os, fs) + + def __add__(self, val): + return BIRDResourceSize(self.netto + val.netto, self.overhead + val.overhead, self.free + val.free) + +class BIRDResource(): + def __init__(self, val): + self.val = val + + def __str__(self): + return f"Item {self.val.address} of class \"{self.val['class']['name'].string()}\"" + + def memsize(self): + if str(self.val["class"]["memsize"]) == '0x0': + size = self.val["class"]["size"] + ressize = gdb.lookup_type("struct resource").sizeof + return BIRDResourceSize(size - ressize, ressize, 0) + else: + raise Exception(f"Resource class {self.val['class']['name']} with defined memsize() not known by Python") + + def parse(self): + pass + +class BIRDMBResource(BIRDResource): + def __init__(self, val): + self.mbtype = gdb.lookup_type("struct mblock") + self.val = val.cast(self.mbtype) + + def memsize(self): + return BIRDResourceSize(self.val["size"], 8 + self.mbtype.sizeof, 0) + + def __str__(self): + return f"Standalone memory block {self.val.address} of size {self.val['size']}, data at {self.val['data'].address}" + +class BIRDLinPoolResource(BIRDResource): + def __init__(self, val): + self.lptype = gdb.lookup_type("struct linpool") + self.val = val.cast(self.lptype) + self.info = None + + def count_chunk(self, which): + cnt = 0 + chunk = self.val[which] + while str(chunk) != '0x0': + cnt += 1 + chunk = chunk.dereference()["next"] + return cnt + + def parse(self): + self.info = { + "std_chunks": self.count_chunk("first"), + "large_chunks": self.count_chunk("first_large"), + } + + def memsize(self): + if self.info is None: + self.parse() + + overhead = (8 - 8*self.val["use_pages"]) + gdb.lookup_type("struct lp_chunk").sizeof + return BIRDResourceSize( + self.val["total"] + self.val["total_large"], + (self.info["std_chunks"] + self.info["large_chunks"]) * overhead, + 0) + + def __str__(self): + if self.info is None: + self.parse() + + return f"Linpool {self.val.address} with {self.info['std_chunks']} standard chunks of size {self.val['chunk_size']} and {self.info['large_chunks']} large chunks" + +class BIRDSlabResource(BIRDResource): + def __init__(self, val): + self.slabtype = gdb.lookup_type("struct slab") + self.val = val.cast(self.slabtype) + self.info = None + + def count_heads_item(self, item): + self.hcnt += 1 + self.used += item.dereference().cast(self.slheadtype)["num_full"] + + def count_heads(self, which): + self.hcnt = 0 + self.used = 0 + BIRDList(self.val[which + "_heads"]).walk(self.count_heads_item) + self.info[which + "_heads"] = self.hcnt + self.info[which + "_used"] = self.used + return (self.hcnt, self.used) + + def parse(self): + self.page_size = gdb.lookup_symbol("page_size")[0].value() + self.slheadtype = gdb.lookup_type("struct sl_head") + self.info = {} + self.count_heads("empty") + self.count_heads("partial") + self.count_heads("full") + + def memsize(self): + if self.info is None: + self.parse() + + total_used = self.info["empty_used"] + self.info["partial_used"] + self.info["full_used"] + total_heads = self.info["empty_heads"] + self.info["partial_heads"] + self.info["full_heads"] + + eff_size = total_used * self.val["obj_size"] + free_size = self.info["empty_heads"] * self.page_size + total_size = total_heads * self.page_size + self.slabtype.sizeof + + return BIRDResourceSize( eff_size, total_size - free_size - eff_size, free_size) + + def __str__(self): + if self.info is None: + self.parse() + + return f"Slab {self.val.address} " + ", ".join([ + f"{self.info[x + '_heads']} {x} heads" for x in [ "empty", "partial", "full" ]]) + \ + f", {self.val['objs_per_slab']} objects of size {self.val['obj_size']} per head" + + +class BIRDPoolResource(BIRDResource): + def __init__(self, val): + self.pooltype = gdb.lookup_type("struct pool") + self.resptrtype = gdb.lookup_type("struct resource").pointer() + self.page_size = gdb.lookup_symbol("page_size")[0].value() + self.val = val.cast(self.pooltype) + self.items = None + + def parse_inside(self, val): + self.items.append(BIRDNewResource(val.cast(self.resptrtype).dereference())) + + def parse(self): + self.items = [] + BIRDList(self.val["inside"]).walk(self.parse_inside) + + def memsize(self): + if self.items is None: + self.parse() + + sum = BIRDResourceSize(0, self.pooltype.sizeof, 0) +# for i in self.items: +# sum += i.memsize() + + return sum + + def __str__(self): + if self.items is None: + self.parse() + +# for i in self.items: +# print(i) + + return f"Resource pool {self.val.address} \"{self.val['name'].string()}\" containing {len(self.items)} items" + +BIRDResourceMap = { + "mbl_memsize": BIRDMBResource, + "pool_memsize": BIRDPoolResource, + "lp_memsize": BIRDLinPoolResource, + "slab_memsize": BIRDSlabResource, + } + +def BIRDNewResource(res): + cms = res["class"].dereference()["memsize"] + for cx in BIRDResourceMap: + if cms == gdb.lookup_symbol(cx)[0].value(): + return BIRDResourceMap[cx](res) + + return BIRDResource(res) + + +class BIRDResourcePrinter(BIRDPrinter): + "Print BIRD's resource" + typeCode = gdb.TYPE_CODE_STRUCT + typeTag = "resource" + + def __init__(self, val): + super(BIRDResourcePrinter, self).__init__(val) + self.resource = BIRDNewResource(val) + self.resource.parse() + self.resourcetype = gdb.lookup_type("struct resource") + + if type(self.resource) == BIRDPoolResource: + self.children = self.pool_children + + def pool_children(self): + return iter([ ("\n", i.val.cast(self.resourcetype)) for i in self.resource.items ]) + + def to_string(self): + return f"[ {str(self.resource.memsize())} ] {str(self.resource)}" + + def register_printers(objfile): objfile.pretty_printers.append(BIRDFInstPrinter.lookup) objfile.pretty_printers.append(BIRDFValPrinter.lookup) @@ -148,6 +459,7 @@ def register_printers(objfile): objfile.pretty_printers.append(BIRDFLineItemPrinter.lookup) objfile.pretty_printers.append(BIRDFLinePrinter.lookup) objfile.pretty_printers.append(BIRDFExecStackPrinter.lookup) + objfile.pretty_printers.append(BIRDResourcePrinter.lookup) register_printers(gdb.current_objfile()) diff --git a/conf/cf-lex.l b/conf/cf-lex.l index ceedee8a..7ce457fe 100644 --- a/conf/cf-lex.l +++ b/conf/cf-lex.l @@ -42,7 +42,7 @@ #define PARSER 1 #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "filter/filter.h" #include "filter/f-inst.h" @@ -77,19 +77,22 @@ static uint cf_hash(const byte *c); #define SYM_NEXT(n) n->next #define SYM_EQ(a,s1,b,s2) !strcmp(a,b) && s1 == s2 #define SYM_FN(k,s) cf_hash(k) -#define SYM_ORDER 6 /* Initial */ +#define SYM_ORDER 4 /* Initial */ #define SYM_REHASH sym_rehash -#define SYM_PARAMS /8, *1, 2, 2, 6, 20 +#define SYM_PARAMS /8, *1, 2, 2, 4, 20 HASH_DEFINE_REHASH_FN(SYM, struct symbol) HASH(struct keyword) kw_hash; - +HASH(struct ea_class) ea_name_hash; struct sym_scope *conf_this_scope; +static struct sym_scope global_root_scope__init = { .active = 1, }; +struct sym_scope *global_root_scope = &global_root_scope__init; + linpool *cfg_mem; int (*cf_read_hook)(byte *buf, unsigned int max, int fd); @@ -347,7 +350,7 @@ else: { return DDOT; } -[={}:;,.()+*/%<>~\[\]?!\|-] { +[={}:;,.()+*/%<>~\[\]?!\|&-] { return yytext[0]; } @@ -591,41 +594,58 @@ cf_new_symbol(const byte *c) *s = (struct symbol) { .scope = conf_this_scope, .class = SYM_VOID, }; strcpy(s->name, c); - if (!new_config->sym_hash.data) - HASH_INIT(new_config->sym_hash, new_config->pool, SYM_ORDER); + if (!conf_this_scope->hash.data) + HASH_INIT(conf_this_scope->hash, new_config->pool, SYM_ORDER); + + HASH_INSERT2(conf_this_scope->hash, SYM, new_config->pool, s); + + if (conf_this_scope == new_config->root_scope) + add_tail(&(new_config->symbols), &(s->n)); + + return s; +} + +static struct symbol * +cf_root_symbol(const byte *c) +{ + uint l = strlen(c); + if (l > SYM_MAX_LEN) + bug("Root symbol %s too long", c); - HASH_INSERT2(new_config->sym_hash, SYM, new_config->pool, s); + struct symbol *s = mb_alloc(&root_pool, sizeof(struct symbol) + l + 1); + *s = (struct symbol) { .scope = global_root_scope, .class = SYM_VOID, }; + memcpy(s->name, c, l+1); - add_tail(&(new_config->symbols), &(s->n)); + if (!global_root_scope->hash.data) + HASH_INIT(global_root_scope->hash, &root_pool, SYM_ORDER); + HASH_INSERT2(global_root_scope->hash, SYM, &root_pool, s); return s; } + /** - * cf_find_symbol - find a symbol by name - * @cfg: specificed config + * cf_find_symbol_scope - find a symbol by name + * @scope: config scope * @c: symbol name * - * This functions searches the symbol table in the config @cfg for a symbol of - * given name. First it examines the current scope, then the second recent one + * This functions searches the symbol table in the scope @scope for a symbol of + * given name. First it examines the current scope, then the underlying one * and so on until it either finds the symbol and returns a pointer to its * &symbol structure or reaches the end of the scope chain and returns %NULL to * signify no match. */ struct symbol * -cf_find_symbol(const struct config *cfg, const byte *c) +cf_find_symbol_scope(const struct sym_scope *scope, const byte *c) { struct symbol *s; - if (cfg->sym_hash.data && - (s = HASH_FIND(cfg->sym_hash, SYM, c, 1))) - return s; - - /* In CLI command parsing, fallback points to the current config, otherwise it is NULL. */ - if (cfg->fallback && - cfg->fallback->sym_hash.data && - (s = HASH_FIND(cfg->fallback->sym_hash, SYM, c, 1))) - return s; + /* Find the symbol here or anywhere below */ + while (scope) + if (scope->hash.data && (s = HASH_FIND(scope->hash, SYM, c, 1))) + return s; + else + scope = scope->next; return NULL; } @@ -642,7 +662,7 @@ cf_find_symbol(const struct config *cfg, const byte *c) struct symbol * cf_get_symbol(const byte *c) { - return cf_find_symbol(new_config, c) ?: cf_new_symbol(c); + return cf_find_symbol_scope(conf_this_scope, c) ?: cf_new_symbol(c); } /** @@ -693,10 +713,7 @@ cf_lex_symbol(const char *data) struct symbol *sym = cf_get_symbol(data); cf_lval.s = sym; - if (sym->class != SYM_VOID) - return CF_SYM_KNOWN; - - /* Is it a keyword? */ + /* Is it a keyword? Prefer the keyword. */ struct keyword *k = HASH_FIND(kw_hash, KW, data); if (k) { @@ -709,9 +726,11 @@ cf_lex_symbol(const char *data) } } - /* OK, undefined symbol */ - cf_lval.s = sym; - return CF_SYM_UNDEFINED; + /* OK, only a symbol. */ + if (sym->class == SYM_VOID) + return CF_SYM_UNDEFINED; + else + return CF_SYM_KNOWN; } static void @@ -724,6 +743,34 @@ cf_lex_init_kh(void) HASH_INSERT(kw_hash, KW, k); } +void +ea_lex_register(struct ea_class *def) +{ + struct symbol *sym = cf_root_symbol(def->name); + sym->class = SYM_ATTRIBUTE; + sym->attribute = def; + def->sym = sym; +} + +void +ea_lex_unregister(struct ea_class *def) +{ + struct symbol *sym = def->sym; + HASH_REMOVE2(global_root_scope->hash, SYM, &root_pool, sym); + mb_free(sym); + def->sym = NULL; +} + +struct ea_class * +ea_class_find_by_name(const char *name) +{ + struct symbol *sym = cf_find_symbol(global_root_scope, name); + if (!sym || (sym->class != SYM_ATTRIBUTE)) + return NULL; + else + return sym->attribute; +} + /** * cf_lex_init - initialize the lexer * @is_cli: true if we're going to parse CLI command, false for configuration @@ -757,6 +804,11 @@ cf_lex_init(int is_cli, struct config *c) c->root_scope = cfg_allocz(sizeof(struct sym_scope)); conf_this_scope = c->root_scope; conf_this_scope->active = 1; + + if (is_cli) + conf_this_scope->next = config->root_scope; + else + conf_this_scope->next = global_root_scope; } /** diff --git a/conf/conf.c b/conf/conf.c index 4e31de29..daac85c1 100644 --- a/conf/conf.c +++ b/conf/conf.c @@ -46,7 +46,7 @@ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "lib/resource.h" @@ -170,7 +170,6 @@ int cli_parse(struct config *c) { int done = 0; - c->fallback = config; new_config = c; cfg_mem = c->mem; if (setjmp(conf_jmpbuf)) @@ -181,7 +180,6 @@ cli_parse(struct config *c) done = 1; cleanup: - c->fallback = NULL; new_config = NULL; cfg_mem = NULL; return done; @@ -197,8 +195,12 @@ cleanup: void config_free(struct config *c) { - if (c) - rfree(c->pool); + if (!c) + return; + + ASSERT(!c->obstacle_count); + + rfree(c->pool); } /** @@ -207,10 +209,14 @@ config_free(struct config *c) * This function frees the old configuration (%old_config) that is saved for the * purpose of undo. It is useful before parsing a new config when reconfig is * requested, to avoid keeping three (perhaps memory-heavy) configs together. + * Configuration is not freed when it is still active during reconfiguration. */ void config_free_old(void) { + if (!old_config || old_config->obstacle_count) + return; + tm_stop(config_timer); undo_available = 0; @@ -543,7 +549,6 @@ order_shutdown(int gr) init_list(&c->tables); init_list(&c->symbols); memset(c->def_tables, 0, sizeof(c->def_tables)); - HASH_INIT(c->sym_hash, c->pool, 4); c->shutdown = 1; c->gr_down = gr; diff --git a/conf/conf.h b/conf/conf.h index b409750e..b30914c1 100644 --- a/conf/conf.h +++ b/conf/conf.h @@ -16,7 +16,6 @@ #include "lib/timer.h" /* Configuration structure */ - struct config { pool *pool; /* Pool the configuration is stored in */ linpool *mem; /* Linear pool containing configuration data */ @@ -28,13 +27,14 @@ struct config { int mrtdump_file; /* Configured MRTDump file (sysdep, fd in unix) */ const char *syslog_name; /* Name used for syslog (NULL -> no syslog) */ - struct rtable_config *def_tables[NET_MAX]; /* Default routing tables for each network */ + struct symbol *def_tables[NET_MAX]; /* Default routing tables for each network */ struct iface_patt *router_id_from; /* Configured list of router ID iface patterns */ u32 router_id; /* Our Router ID */ u32 proto_default_debug; /* Default protocol debug mask */ u32 proto_default_mrtdump; /* Default protocol mrtdump mask */ u32 channel_default_debug; /* Default channel debug mask */ + u16 filter_vstk, filter_estk; /* Filter stack depth */ struct timeformat tf_route; /* Time format for 'show route' */ struct timeformat tf_proto; /* Time format for 'show protocol' */ struct timeformat tf_log; /* Time format for the logfile */ @@ -44,6 +44,7 @@ struct config { int cli_debug; /* Tracing of CLI connections and commands */ int latency_debug; /* I/O loop tracks duration of each event */ + int table_debug; /* Track route propagation through tables */ u32 latency_limit; /* Events with longer duration are logged (us) */ u32 watchdog_warning; /* I/O loop watchdog limit for warning (us) */ u32 watchdog_timeout; /* Watchdog timeout (in seconds, 0 = disabled) */ @@ -53,8 +54,8 @@ struct config { char *err_file_name; /* File name containing error */ char *file_name; /* Name of main configuration file */ int file_fd; /* File descriptor of main configuration file */ - HASH(struct symbol) sym_hash; /* Lexer: symbol hash table */ - struct config *fallback; /* Link to regular config for CLI parsing */ + int thread_count; /* How many worker threads to prefork */ + struct sym_scope *root_scope; /* Scope for root symbols */ int obstacle_count; /* Number of items blocking freeing of this config */ int shutdown; /* This is a pseudo-config for daemon shutdown */ @@ -122,7 +123,7 @@ struct symbol { const struct f_line *function; /* For SYM_FUNCTION */ const struct filter *filter; /* For SYM_FILTER */ struct rtable_config *table; /* For SYM_TABLE */ - struct f_dynamic_attr *attribute; /* For SYM_ATTRIBUTE */ + struct ea_class *attribute; /* For SYM_ATTRIBUTE */ struct f_val *val; /* For SYM_CONSTANT */ uint offset; /* For SYM_VARIABLE */ }; @@ -133,12 +134,17 @@ struct symbol { struct sym_scope { struct sym_scope *next; /* Next on scope stack */ struct symbol *name; /* Name of this scope */ + + HASH(struct symbol) hash; /* Local symbol hash */ + uint slots; /* Variable slots */ byte active; /* Currently entered */ byte block; /* No independent stack frame */ byte soft_scopes; /* Number of soft scopes above */ }; +extern struct sym_scope *global_root_scope; + struct bytestring { size_t length; byte data[]; @@ -187,7 +193,14 @@ int cf_lex(void); void cf_lex_init(int is_cli, struct config *c); void cf_lex_unwind(void); -struct symbol *cf_find_symbol(const struct config *cfg, const byte *c); +struct symbol *cf_find_symbol_scope(const struct sym_scope *scope, const byte *c); +static inline struct symbol *cf_find_symbol_cfg(const struct config *cfg, const byte *c) +{ return cf_find_symbol_scope(cfg->root_scope, c); } + +#define cf_find_symbol(where, what) _Generic(*(where), \ + struct config: cf_find_symbol_cfg, \ + struct sym_scope: cf_find_symbol_scope \ + )((where), (what)) struct symbol *cf_get_symbol(const byte *c); struct symbol *cf_default_name(char *template, int *counter); diff --git a/conf/confbase.Y b/conf/confbase.Y index 1d5738ff..8e5da9e3 100644 --- a/conf/confbase.Y +++ b/conf/confbase.Y @@ -14,11 +14,12 @@ CF_HDR #include "conf/conf.h" #include "lib/resource.h" #include "lib/socket.h" +#include "lib/settle.h" #include "lib/timer.h" #include "lib/string.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/bfd.h" #include "nest/cli.h" #include "filter/filter.h" @@ -71,8 +72,9 @@ CF_DECLS } xp; enum filter_return fret; enum ec_subtype ecs; - struct f_dynamic_attr fda; + struct ea_class *ea_class; struct f_static_attr fsa; + struct f_attr_bit fab; struct f_lval flv; struct f_line *fl; struct f_arg *fa; @@ -92,7 +94,8 @@ CF_DECLS struct proto_spec ps; struct channel_limit cl; struct timeformat *tf; - mpls_label_stack *mls; + struct settle_config settle; + struct adata *ad; struct bytestring *bs; } @@ -110,17 +113,19 @@ CF_DECLS %type <i> expr bool pxlen4 %type <time> expr_us time +%type <settle> settle %type <a> ipa %type <net> net_ip4_ net_ip4 net_ip6_ net_ip6 net_ip_ net_ip net_or_ipa %type <net_ptr> net_ net_any net_vpn4_ net_vpn6_ net_vpn_ net_roa4_ net_roa6_ net_roa_ net_ip6_sadr_ net_mpls_ -%type <mls> label_stack_start label_stack +%type <ad> label_stack_start label_stack %type <t> text opttext -%type <s> symbol +%type <s> symbol symbol_known toksym %nonassoc PREFIX_DUMMY %left AND OR %nonassoc '=' '<' '>' '~' GEQ LEQ NEQ NMA PO PC +%left '|' '&' %left '+' '-' %left '*' '/' '%' %left '!' @@ -152,16 +157,16 @@ conf: definition ; definition: DEFINE symbol '=' term ';' { - struct f_val *val = cfg_allocz(sizeof(struct f_val)); - if (f_eval(f_linearize($4, 1), cfg_mem, val) > F_RETURN) cf_error("Runtime error"); - cf_define_symbol($2, SYM_CONSTANT | val->type, val, val); + struct f_val val; + if (f_eval(f_linearize($4, 1), &val) > F_RETURN) cf_error("Runtime error"); + cf_define_symbol($2, SYM_CONSTANT | val.type, val, lp_val_copy(cfg_mem, &val)); } ; expr: NUM | '(' term ')' { $$ = f_eval_int(f_linearize($2, 1)); } - | CF_SYM_KNOWN { + | symbol_known { if ($1->class != (SYM_CONSTANT | T_INT)) cf_error("Number constant expected"); $$ = SYM_VAL($1).i; } ; @@ -172,7 +177,9 @@ expr_us: | expr US { $$ = $1 US_; } ; -symbol: CF_SYM_UNDEFINED | CF_SYM_KNOWN ; +toksym: FROM | PREFERENCE ; +symbol: CF_SYM_UNDEFINED | CF_SYM_KNOWN | toksym ; +symbol_known: CF_SYM_KNOWN | toksym ; /* Switches */ @@ -357,17 +364,19 @@ net_or_ipa: label_stack_start: NUM { - $$ = cfg_allocz(sizeof(mpls_label_stack)); - $$->len = 1; - $$->stack[0] = $1; + $$ = cfg_allocz(ADATA_SIZE(MPLS_MAX_LABEL_STACK * sizeof(u32))); + $$->length = sizeof(u32); + *((u32 *)$$->data) = $1; }; label_stack: label_stack_start | label_stack '/' NUM { - if ($1->len >= MPLS_MAX_LABEL_STACK) + if ($1->length >= MPLS_MAX_LABEL_STACK * sizeof(u32)) cf_error("Too many labels in stack"); - $1->stack[$1->len++] = $3; + + *((u32 *)($$->data + $1->length)) = $3; + $1->length += sizeof(u32); $$ = $1; } ; @@ -380,6 +389,13 @@ time: } ; +/* Settle timer configuration */ +settle: expr_us expr_us { + if ($1 > $2) cf_error("Minimum settle time %t is bigger than maximum settle time %t", $1, $2); + $$.min = $1; + $$.max = $2; +}; + text: TEXT | CF_SYM_KNOWN { diff --git a/conf/gen_keywords.m4 b/conf/gen_keywords.m4 index 0c1dc545..53226e4d 100644 --- a/conf/gen_keywords.m4 +++ b/conf/gen_keywords.m4 @@ -26,8 +26,7 @@ m4_define(CF_DEFINES, `m4_divert(-1)') m4_define(CF_handle_kw, `m4_divert(1){ "m4_translit($1,[[A-Z]],[[a-z]])", $1, NULL }, m4_divert(-1)') m4_define(CF_keywd, `m4_ifdef([[CF_tok_$1]],,[[m4_define([[CF_tok_$1]],1)CF_handle_kw($1)]])') -m4_define(CF_KEYWORDS, `m4_define([[CF_toks]],[[]])CF_iterate([[CF_keywd]], [[$@]])m4_ifelse(CF_toks,,,%token[[]]CF_toks -)DNL') +m4_define(CF_KEYWORDS, `CF_iterate([[CF_keywd]], [[$@]])DNL') # CLI commands generate keywords as well m4_define(CF_CLI, `CF_KEYWORDS(m4_translit($1, [[ ]], [[,]])) diff --git a/configure.ac b/configure.ac index d73eec28..bf5c6df4 100644 --- a/configure.ac +++ b/configure.ac @@ -42,12 +42,6 @@ AC_ARG_ENABLE([compact-tries], [enable_compact_tries=no] ) -AC_ARG_ENABLE([pthreads], - [AS_HELP_STRING([--enable-pthreads], [enable POSIX threads support @<:@try@:>@])], - [], - [enable_pthreads=try] -) - AC_ARG_ENABLE([libssh], [AS_HELP_STRING([--enable-libssh], [enable LibSSH support in RPKI @<:@try@:>@])], [], @@ -131,25 +125,19 @@ if test -z "$GCC" ; then fi BIRD_CHECK_THREAD_LOCAL -if test "$bird_cv_thread_local" = yes ; then - AC_DEFINE([HAVE_THREAD_LOCAL], [1], [Define to 1 if _Thread_local is available]) +if test "$bird_cv_thread_local" = no ; then + AC_MSG_ERROR([This program requires thread local storage.]) +elif test "$bird_cv_thread_local" != yes ; then + AC_DEFINE_UNQUOTED([_Thread_local], [$bird_cv_thread_local], [Legacy _Thread_local]) fi -if test "$enable_pthreads" != no ; then - BIRD_CHECK_PTHREADS - - if test "$bird_cv_lib_pthreads" = yes ; then - AC_DEFINE([USE_PTHREADS], [1], [Define to 1 if pthreads are enabled]) - CFLAGS="$CFLAGS -pthread" - LDFLAGS="$LDFLAGS -pthread" - proto_bfd=bfd - elif test "$enable_pthreads" = yes ; then - AC_MSG_ERROR([POSIX threads not available.]) - fi +BIRD_CHECK_PTHREADS - if test "$enable_pthreads" = try ; then - enable_pthreads="$bird_cv_lib_pthreads" - fi +if test "$bird_cv_lib_pthreads" = yes ; then + CFLAGS="$CFLAGS -pthread" + LDFLAGS="$LDFLAGS -pthread" +else + AC_MSG_ERROR([POSIX threads not available.]) fi # This is assumed to be necessary for proper BIRD build @@ -296,6 +284,12 @@ if test "$enable_libssh" != no ; then enable_libssh=no fi fi + + AC_CHECK_LIB([ssh], [ssh_session_is_known_server], [ssh_old_server_validation_api=no], [ssh_old_server_validation_api=yes]) + + if test "$ssh_old_server_validation_api" = yes; then + AC_DEFINE([HAVE_SSH_OLD_SERVER_VALIDATION_API], [1], [Define to 1 if ssh_session_is_known_server isn't defined]) + fi fi if test "$enable_mpls_kernel" != no ; then @@ -312,8 +306,8 @@ if test "$enable_mpls_kernel" != no ; then fi fi -all_protocols="$proto_bfd babel bgp mrt ospf perf pipe radv rip rpki static" - +# temporarily removed "mrt" from all_protocols to speed up 3.0-alpha1 release +all_protocols="bfd babel bgp ospf perf pipe radv rip rpki static" all_protocols=`echo $all_protocols | sed 's/ /,/g'` if test "$with_protocols" = all ; then @@ -358,16 +352,29 @@ case $sysdesc in ;; esac -AC_CHECK_HEADERS_ONCE([alloca.h syslog.h]) -AC_CHECK_HEADER([sys/mman.h], [AC_DEFINE([HAVE_MMAP], [1], [Define to 1 if mmap() is available.])]) +AC_CHECK_HEADERS_ONCE([alloca.h syslog.h stdatomic.h]) +AC_CHECK_HEADER([sys/mman.h], [AC_DEFINE([HAVE_MMAP], [1], [Define to 1 if mmap() is available.])], have_mman=no) +AC_CHECK_FUNC([aligned_alloc], [AC_DEFINE([HAVE_ALIGNED_ALLOC], [1], [Define to 1 if aligned_alloc() is available.])], have_aligned_alloc=no) AC_CHECK_MEMBERS([struct sockaddr.sa_len], [], [], [#include <sys/socket.h>]) +if test "$have_aligned_alloc" = "no" && test "$have_mman" = "no" ; then + AC_MSG_ERROR([No means of aligned alloc found. Need mmap() or aligned_alloc().]) +fi + + AC_C_BIGENDIAN( [AC_DEFINE([CPU_BIG_ENDIAN], [1], [Define to 1 if cpu is big endian])], [AC_DEFINE([CPU_LITTLE_ENDIAN], [1], [Define to 1 if cpu is little endian])], [AC_MSG_ERROR([Cannot determine CPU endianity.])] ) +BIRD_CHECK_POINTER_ALIGNMENT +if test "$bird_cv_pointer_alignment" = "unknown" ; then + AC_MSG_ERROR([Couldn't determine pointer alignment]) +else + AC_DEFINE_UNQUOTED([CPU_POINTER_ALIGNMENT], [$bird_cv_pointer_alignment], [Pointer alignment for macro usage]) +fi + BIRD_CHECK_ANDROID_GLOB if test "$bird_cv_lib_glob" = no ; then AC_MSG_ERROR([glob.h not found.]) @@ -480,7 +487,6 @@ AC_MSG_RESULT([ Iproute2 directory: $iproutedir]) AC_MSG_RESULT([ System configuration: $sysdesc]) AC_MSG_RESULT([ Debugging: $enable_debug]) AC_MSG_RESULT([ Compact tries: $enable_compact_tries]) -AC_MSG_RESULT([ POSIX threads: $enable_pthreads]) AC_MSG_RESULT([ Routing protocols: $protocols]) AC_MSG_RESULT([ LibSSH support in RPKI: $enable_libssh]) AC_MSG_RESULT([ Kernel MPLS support: $enable_mpls_kernel]) diff --git a/doc/bird.sgml b/doc/bird.sgml index 6f3fcfc6..9a3cb996 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -505,6 +505,11 @@ include "tablename.conf";; See <ref id="channel-debug" name="debug"> in the channel section. Default: off. + <tag><label id="opt-debug-tables">debug tables all|off|{ states|routes|filters|events [, <m/.../] }</tag> + Set global defaults of table debugging options. + See <ref id="rtable-debug" name="debug"> in the table section. + Default: off. + <tag><label id="opt-debug-commands">debug commands <m/number/</tag> Control logging of client connections (0 for no logging, 1 for logging of connects and disconnects, 2 and higher for logging of all client @@ -527,6 +532,12 @@ include "tablename.conf";; killed by abort signal. The timeout has effective granularity of seconds, zero means disabled. Default: disabled (0). + <tag><label id="opt-threads">threads <m/number/</tag> + Set how many worker threads should BIRD spawn. Tests show that every + thread can utilize one complete CPU core, therefore you probably want to + keep at least one free core. The maximum feasible thread count heavily + depends on the actual workload and must be determined by testing or estimation. Default: 1 + <tag><label id="opt-mrtdump">mrtdump "<m/filename/"</tag> Set MRTdump file name. This option must be specified to allow MRTdump feature. Default: no dump file. @@ -670,21 +681,6 @@ to set options. disadvantage is that trie-enabled routing tables require more memory, which may be an issue especially in multi-table setups. Default: off. - <tag><label id="rtable-min-settle-time">min settle time <m/time/</tag> - Specify a minimum value of the settle time. When a ROA table changes, - automatic <ref id="proto-rpki-reload" name="RPKI reload"> may be - triggered, after a short settle time. Minimum settle time is a delay - from the last ROA table change to wait for more updates. Default: 1 s. - - - <tag><label id="rtable-max-settle-time">max settle time <m/time/</tag> - Specify a maximum value of the settle time. When a ROA table changes, - automatic <ref id="proto-rpki-reload" name="RPKI reload"> may be - triggered, after a short settle time. Maximum settle time is an upper - limit to the settle time from the initial ROA table change even if - there are consecutive updates gradually renewing the settle time. - Default: 20 s. - <tag><label id="rtable-gc-threshold">gc threshold <m/number/</tag> Specify a minimum amount of removed networks that triggers a garbage collection (GC) cycle. Default: 1000. @@ -699,6 +695,40 @@ to set options. periods. Default: adaptive, based on number of routing tables in the configuration. From 10 s (with <= 25 routing tables) up to 600 s (with >= 1500 routing tables). + + <tag><label id="rtable-cork-threshold">cork threshold <m/number/ <m/number/</tag> + Too many pending exports may lead to memory bloating. In such cases, + BIRD tries to relieve the memory pressure by pausing some routines until + the queue sizes get low enough. This option allows the user to set the + thresholds; first value is the low threshold (when to resume), the + second one is the high threshold (when to pause). The higher is the + threshold, the more memory can get used. In most cases, the defaults + should work for you. Default: 1024 8192. + + <tag><label id="rtable-export-settle-time">export settle time <m/time/ <m/time/</tag> + Minimum and maximum settle times, respectively, for export announcements. + When multiple routes are changing, this mechanism waits for the changes + to settle before waking up sleeping export threads but if the changes are coming + steadily, BIRD isn't waiting forever; at most the maximum time. + Default values: <cf/1 ms 100 ms/. You have to always provide both values. + + <tag><label id="rtable-route-refresh-export-settle-time">route refresh export settle time <m/time/ <m/time/</tag> + Minimum and maximum settle times, respectively, for export announcements + (the same as above), valid when any channel is currently doing a route refresh. + This serves a purpose of even more aggresive change bundling, knowing that there + is some active process generating changes in a fast pace. If you don't want + this feature, set this to the same values as <ref id="rtable-export-settle-time" name="export settle time">. + Default values: <cf/100 ms 3 s/. + + <tag><label id="rtable-debug">debug all|off|{ states|routes|events [, <m/.../] }</tag> + Set table debugging options. Each table can write some trace messages + into log with category <cf/trace/. You can request <cf/all/ trace messages + or select some types: <cf/states/ for table state changes and auxiliary + processes, <cf/routes/ for auxiliary route notifications (next hop update, + flowspec revalidation) and <cf/events/ for more detailed auxiliary routine + debug. See also <ref id="channel-debug" name="channel debugging option">. + Default: off. + </descrip> @@ -929,10 +959,12 @@ inherited from templates can be updated by new definitions. <cf/none/ is for dropping all routes. Default: <cf/all/ (except for EBGP). - <tag><label id="proto-export">export <m/filter/</tag> + <tag><label id="proto-export">export [ in <m/prefix/ ] <m/filter/</tag> This is similar to the <cf>import</cf> keyword, except that it works in - the direction from the routing table to the protocol. Default: <cf/none/ - (except for EBGP). + the direction from the routing table to the protocol. If <cf/in/ keyword is used, + only routes inside the given prefix are exported. Other routes are completely + ignored (e.g. no logging and no statistics). + Default: <cf/none/ (except for EBGP). <tag><label id="proto-import-keep-filtered">import keep filtered <m/switch/</tag> Usually, if an import filter rejects a route, the route is forgotten. @@ -954,6 +986,16 @@ inherited from templates can be updated by new definitions. <ref id="bgp-export-table" name="export table"> (for respective direction). Default: on. + <tag><label id="rtable-min-settle-time">roa settle time <m/time/ <m/time/</tag> + Minimum and maximum settle times, respectively, for ROA table changes. + The automatic reload is triggered after the minimum time after the last + ROA table change has been received but not later than the maximum time after + first unprocessed ROA table change. Therefore with default values, the + automatic reload happens 1 second after the ROA table stops updating, yet if it + were to be later than 20 seconds after the ROA table starts updating, + the automatic reload is triggered anyway. Default values: <cf/1 s 20 s/. + You have to always provide both values. + <tag><label id="proto-import-limit">import limit [<m/number/ | off ] [action warn | block | restart | disable]</tag> Specify an import route limit (a maximum number of routes imported from the protocol) and optionally the action to be taken when the limit is @@ -1765,17 +1807,8 @@ Common route attributes are: primary key of the routing table. Read-only. (See the <ref id="routes" name="chapter about routes">.) - <tag><label id="rta-scope"><m/enum/ scope</tag> - The scope of the route. Possible values: <cf/SCOPE_HOST/ for routes - local to this host, <cf/SCOPE_LINK/ for those specific for a physical - link, <cf/SCOPE_SITE/ and <cf/SCOPE_ORGANIZATION/ for private routes and - <cf/SCOPE_UNIVERSE/ for globally visible routes. This attribute is not - interpreted by BIRD and can be used to mark routes in filters. The - default value for new routes is <cf/SCOPE_UNIVERSE/. - <tag><label id="rta-preference"><m/int/ preference</tag> - Preference of the route. Valid values are 0-65535. (See the chapter - about routing tables.) + Preference of the route. <tag><label id="rta-from"><m/ip/ from</tag> The router which the route has originated from. @@ -1816,14 +1849,6 @@ Common route attributes are: creation/removal. Zero is returned for routes with undefined outgoing interfaces. Read-only. - <tag><label id="rta-onlink"><m/bool/ onlink</tag> - Onlink flag means that the specified nexthop is accessible on the - interface regardless of IP prefixes configured on the interface. - The attribute can be used to configure such next hops by first setting - <cf/onlink = true/ and <cf/ifname/, and then setting <cf/gw/. Possible - use case for setting this flag is to automatically build overlay IP-IP - networks on linux. - <tag><label id="rta-weight"><m/int/ weight</tag> Multipath weight of route next hops. Valid values are 1-256. Reading returns the weight of the first next hop, setting it sets weights of all @@ -2996,13 +3021,6 @@ be used in explicit configuration. be examined later by <cf/show route/, and can be used to reconfigure import filters without full route refresh. Default: off. - Note that currently the import table breaks routes with recursive - nexthops (e.g. ones from IBGP, see <ref id="bgp-gateway" name="gateway - recursive">), they are not properly updated after next hop change. For - the same reason, it also breaks re-evaluation of flowspec routes with - <ref id="bgp-validate" name="flowspec validation"> option enabled on - flowspec channels. - <tag><label id="bgp-export-table">export table <m/switch/</tag> A BGP export table contains all routes sent to given BGP neighbor, after application of export filters. It is also called <em/Adj-RIB-Out/ in BGP @@ -4338,6 +4356,14 @@ include standard channel config options; see the example below. <tag><label id="pipe-peer-table">peer table <m/table/</tag> Defines secondary routing table to connect to. The primary one is selected by the <cf/table/ keyword. + + <tag><label id="pipe-max-generation">max generation <m/expr/</tag> + Sets maximal generation of route that may pass through this pipe. + The generation value is increased by one by each pipe on its path. + Not meeting this requirement causes an error message complaining about + an overpiped route. If you have long chains of pipes, you probably want + to raise this value; anyway the default of 16 should be enough for even + most strange uses. Maximum is 254. </descrip> <sect1>Attributes diff --git a/doc/reply_codes b/doc/reply_codes index 02f4e656..bcbbbb03 100644 --- a/doc/reply_codes +++ b/doc/reply_codes @@ -61,6 +61,7 @@ Reply codes of BIRD command-line interface 1023 Show Babel interfaces 1024 Show Babel neighbors 1025 Show Babel entries +1026 Show threads 8000 Reply too long 8001 Route not found diff --git a/filter/config.Y b/filter/config.Y index a1e5e9f1..e6264a83 100644 --- a/filter/config.Y +++ b/filter/config.Y @@ -22,6 +22,16 @@ static inline u32 pair_b(u32 p) { return p & 0xFFFF; } #define f_generate_complex(fi_code, da, arg) \ f_new_inst(FI_EA_SET, f_new_inst(fi_code, f_new_inst(FI_EA_GET, da), arg), da) +#define f_generate_complex_sym(fi_code, sym, arg) ({ \ + if (sym->class != SYM_ATTRIBUTE) \ + cf_error("Can't empty %s: not an attribute", sym->name); \ + f_generate_complex(fi_code, sym->attribute, arg); \ +}) + +#define f_generate_complex_default(fi_code, da, arg, def) \ + f_new_inst(FI_EA_SET, f_new_inst(fi_code, f_new_inst(FI_DEFAULT, f_new_inst(FI_EA_GET, da), f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = def })), arg), da) + + static int f_new_var(struct sym_scope *s) { @@ -191,28 +201,32 @@ f_new_lc_item(u32 f1, u32 t1, u32 f2, u32 t2, u32 f3, u32 t3) } static inline struct f_inst * -f_generate_empty(struct f_dynamic_attr dyn) +f_generate_empty(const struct symbol *sym) { - struct f_val empty; + if (sym->class != SYM_ATTRIBUTE) + cf_error("Can't empty %s: not an attribute", sym->name); - switch (dyn.type & EAF_TYPE_MASK) { - case EAF_TYPE_AS_PATH: - empty = f_const_empty_path; - break; - case EAF_TYPE_INT_SET: - empty = f_const_empty_clist; - break; - case EAF_TYPE_EC_SET: - empty = f_const_empty_eclist; - break; - case EAF_TYPE_LC_SET: - empty = f_const_empty_lclist; - break; - default: - cf_error("Can't empty that attribute"); - } + const struct ea_class *def = sym->attribute; + const struct f_val *empty = f_get_empty(def->type); + if (!empty) + cf_error("Can't empty attribute %s", def->name); + + return f_new_inst(FI_EA_SET, f_new_inst(FI_CONSTANT, *empty), def); +} + +static inline struct f_inst * +f_implicit_roa_check(struct rtable_config *tab) +{ + const struct ea_class *def = ea_class_find("bgp_path"); + if (!def) + cf_error("Fatal: Couldn't find BGP path attribute definition."); + + struct f_static_attr fsa = f_new_static_attr(T_NET, SA_NET, 1); - return f_new_inst(FI_EA_SET, f_new_inst(FI_CONSTANT, empty), dyn); + return f_new_inst(FI_ROA_CHECK, + f_new_inst(FI_RTA_GET, fsa), + f_new_inst(FI_AS_PATH_LAST, f_new_inst(FI_EA_GET, def)), + tab); } /* @@ -305,26 +319,24 @@ CF_KEYWORDS(FUNCTION, PRINT, PRINTN, UNSET, RETURN, IF, THEN, ELSE, CASE, FOR, IN, DO, TRUE, FALSE, RT, RO, UNKNOWN, GENERIC, - FROM, GW, NET, MASK, PROTO, SOURCE, SCOPE, DEST, IFNAME, IFINDEX, WEIGHT, GW_MPLS, ONLINK, - PREFERENCE, + FROM, GW, NET, MASK, PROTO, SCOPE, DEST, IFNAME, IFINDEX, WEIGHT, GW_MPLS, ROA_CHECK, ASN, SRC, DST, IS_V4, IS_V6, LEN, MAXLEN, DATA, DATA1, DATA2, DEFINED, - ADD, DELETE, CONTAINS, RESET, - PREPEND, FIRST, LAST, LAST_NONAGGREGATED, MATCH, + ADD, DELETE, RESET, + PREPEND, FIRST, LAST, LAST_NONAGGREGATED, MIN, MAX, EMPTY, FILTER, WHERE, EVAL, ATTRIBUTE, - BT_ASSERT, BT_TEST_SUITE, BT_CHECK_ASSIGN, BT_TEST_SAME, FORMAT) + BT_ASSERT, BT_TEST_SUITE, BT_CHECK_ASSIGN, BT_TEST_SAME, FORMAT, STACKS) %nonassoc THEN %nonassoc ELSE %type <xp> cmds_int cmd_prep %type <x> term cmd cmd_var cmds cmds_scoped constant constructor print_list var var_init var_list function_call symbol_value bgp_path_expr bgp_path bgp_path_tail -%type <fda> dynamic_attr %type <fsa> static_attr %type <f> filter where_filter %type <fl> filter_body function_body @@ -343,6 +355,12 @@ CF_KEYWORDS(FUNCTION, PRINT, PRINTN, UNSET, RETURN, CF_GRAMMAR +conf: FILTER STACKS expr expr ';' { + new_config->filter_vstk = $3; + new_config->filter_estk = $4; + } + ; + conf: filter_def ; filter_def: FILTER symbol { $2 = cf_define_symbol($2, SYM_FILTER, filter, NULL); cf_push_scope( $2 ); } @@ -362,12 +380,19 @@ filter_eval: conf: custom_attr ; custom_attr: ATTRIBUTE type symbol ';' { - cf_define_symbol($3, SYM_ATTRIBUTE, attribute, ca_lookup(new_config->pool, $3->name, $2)->fda); + if (($3->class == SYM_ATTRIBUTE) && ($3->scope == new_config->root_scope)) + cf_error("Duplicate attribute %s definition", $3->name); + + cf_define_symbol($3, SYM_ATTRIBUTE, attribute, + ea_register_alloc(new_config->pool, (struct ea_class) { + .name = $3->name, + .type = $2, + })->class); }; conf: bt_test_suite ; bt_test_suite: - BT_TEST_SUITE '(' CF_SYM_KNOWN ',' text ')' { + BT_TEST_SUITE '(' symbol_known ',' text ')' { cf_assert_symbol($3, SYM_FUNCTION); struct f_bt_test_suite *t = cfg_allocz(sizeof(struct f_bt_test_suite)); t->fn = $3->function; @@ -380,7 +405,7 @@ bt_test_suite: conf: bt_test_same ; bt_test_same: - BT_TEST_SAME '(' CF_SYM_KNOWN ',' CF_SYM_KNOWN ',' NUM ')' { + BT_TEST_SAME '(' symbol_known ',' symbol_known ',' NUM ')' { cf_assert_symbol($3, SYM_FUNCTION); cf_assert_symbol($5, SYM_FUNCTION); struct f_bt_test_suite *t = cfg_allocz(sizeof(struct f_bt_test_suite)); @@ -461,7 +486,7 @@ function_vars: filter_body: function_body ; filter: - CF_SYM_KNOWN { + symbol_known { cf_assert_symbol($1, SYM_FILTER); $$ = $1->filter; } @@ -571,10 +596,10 @@ set_atom: | VPN_RD { $$.type = T_RD; $$.val.ec = $1; } | ENUM { $$.type = pair_a($1); $$.val.i = pair_b($1); } | '(' term ')' { - if (f_eval(f_linearize($2, 1), cfg_mem, &($$)) > F_RETURN) cf_error("Runtime error"); + if (f_eval(f_linearize($2, 1), &($$)) > F_RETURN) cf_error("Runtime error"); if (!f_valid_set_type($$.type)) cf_error("Set-incompatible type"); } - | CF_SYM_KNOWN { + | symbol_known { cf_assert_symbol($1, SYM_CONSTANT); if (!f_valid_set_type(SYM_TYPE($1))) cf_error("%s: set-incompatible type", $1->name); $$ = *$1->val; @@ -745,7 +770,7 @@ var_list: /* EMPTY */ { $$ = NULL; } | var_list ',' term { $$ = $3; $$->next = $1; } function_call: - CF_SYM_KNOWN '(' var_list ')' + symbol_known '(' var_list ')' { if ($1->class != SYM_FUNCTION) cf_error("You can't call something which is not a function. Really."); @@ -764,7 +789,7 @@ function_call: } ; -symbol_value: CF_SYM_KNOWN +symbol_value: symbol_known { switch ($1->class) { case SYM_CONSTANT_RANGE: @@ -774,7 +799,7 @@ symbol_value: CF_SYM_KNOWN $$ = f_new_inst(FI_VAR_GET, $1); break; case SYM_ATTRIBUTE: - $$ = f_new_inst(FI_EA_GET, *$1->attribute); + $$ = f_new_inst(FI_EA_GET, $1->attribute); break; default: cf_error("Can't get value of symbol %s", $1->name); @@ -783,19 +808,14 @@ symbol_value: CF_SYM_KNOWN ; static_attr: - FROM { $$ = f_new_static_attr(T_IP, SA_FROM, 0); } - | GW { $$ = f_new_static_attr(T_IP, SA_GW, 0); } + GW { $$ = f_new_static_attr(T_IP, SA_GW, 0); } | NET { $$ = f_new_static_attr(T_NET, SA_NET, 1); } | PROTO { $$ = f_new_static_attr(T_STRING, SA_PROTO, 1); } - | SOURCE { $$ = f_new_static_attr(T_ENUM_RTS, SA_SOURCE, 1); } - | SCOPE { $$ = f_new_static_attr(T_ENUM_SCOPE, SA_SCOPE, 0); } | DEST { $$ = f_new_static_attr(T_ENUM_RTD, SA_DEST, 0); } | IFNAME { $$ = f_new_static_attr(T_STRING, SA_IFNAME, 0); } | IFINDEX { $$ = f_new_static_attr(T_INT, SA_IFINDEX, 1); } | WEIGHT { $$ = f_new_static_attr(T_INT, SA_WEIGHT, 0); } - | PREFERENCE { $$ = f_new_static_attr(T_INT, SA_PREF, 0); } | GW_MPLS { $$ = f_new_static_attr(T_INT, SA_GW_MPLS, 0); } - | ONLINK { $$ = f_new_static_attr(T_BOOL, SA_ONLINK, 0); } ; term: @@ -804,6 +824,8 @@ term: | term '-' term { $$ = f_new_inst(FI_SUBTRACT, $1, $3); } | term '*' term { $$ = f_new_inst(FI_MULTIPLY, $1, $3); } | term '/' term { $$ = f_new_inst(FI_DIVIDE, $1, $3); } + | term '&' term { $$ = f_new_inst(FI_BITAND, $1, $3); } + | term '|' term { $$ = f_new_inst(FI_BITOR, $1, $3); } | term AND term { $$ = f_new_inst(FI_AND, $1, $3); } | term OR term { $$ = f_new_inst(FI_OR, $1, $3); } | term '=' term { $$ = f_new_inst(FI_EQ, $1, $3); } @@ -823,8 +845,6 @@ term: | static_attr { $$ = f_new_inst(FI_RTA_GET, $1); } - | dynamic_attr { $$ = f_new_inst(FI_EA_GET, $1); } - | term '.' IS_V4 { $$ = f_new_inst(FI_IS_V4, $1); } | term '.' TYPE { $$ = f_new_inst(FI_TYPE, $1); } | term '.' IP { $$ = f_new_inst(FI_IP, $1); } @@ -861,13 +881,11 @@ term: | DELETE '(' term ',' term ')' { $$ = f_new_inst(FI_CLIST_DEL, $3, $5); } | FILTER '(' term ',' term ')' { $$ = f_new_inst(FI_CLIST_FILTER, $3, $5); } - | ROA_CHECK '(' rtable ')' { $$ = f_new_inst(FI_ROA_CHECK_IMPLICIT, $3); } - | ROA_CHECK '(' rtable ',' term ',' term ')' { $$ = f_new_inst(FI_ROA_CHECK_EXPLICIT, $5, $7, $3); } + | ROA_CHECK '(' rtable ')' { $$ = f_implicit_roa_check($3); } + | ROA_CHECK '(' rtable ',' term ',' term ')' { $$ = f_new_inst(FI_ROA_CHECK, $5, $7, $3); } | FORMAT '(' term ')' { $$ = f_new_inst(FI_FORMAT, $3); } -/* | term '.' LEN { $$->code = P('P','l'); } */ - | function_call ; @@ -925,13 +943,15 @@ cmd: $$ = f_new_inst(FI_FOR_INIT, $6, $3); $$->next = f_new_inst(FI_FOR_NEXT, $3, $9); } - | CF_SYM_KNOWN '=' term ';' { + | symbol_known '=' term ';' { switch ($1->class) { case SYM_VARIABLE_RANGE: $$ = f_new_inst(FI_VAR_SET, $3, $1); break; case SYM_ATTRIBUTE: - $$ = f_new_inst(FI_EA_SET, $3, *$1->attribute); + if ($1->attribute->readonly) + cf_error("Attribute %s is read-only", $1->attribute->name); + $$ = f_new_inst(FI_EA_SET, $3, $1->attribute); break; default: cf_error("Can't assign to symbol %s", $1->name); @@ -941,16 +961,17 @@ cmd: DBG( "Ook, we'll return the value\n" ); $$ = f_new_inst(FI_RETURN, $2); } - | dynamic_attr '=' term ';' { - $$ = f_new_inst(FI_EA_SET, $3, $1); - } | static_attr '=' term ';' { if ($1.readonly) cf_error( "This static attribute is read-only."); $$ = f_new_inst(FI_RTA_SET, $3, $1); } - | UNSET '(' dynamic_attr ')' ';' { - $$ = f_new_inst(FI_EA_UNSET, $3); + | UNSET '(' symbol_known ')' ';' { + if ($3->class != SYM_ATTRIBUTE) + cf_error("Can't unset %s", $3->name); + if ($3->attribute->readonly) + cf_error("Attribute %s is read-only", $3->attribute->name); + $$ = f_new_inst(FI_EA_UNSET, $3->attribute); } | break_command print_list ';' { struct f_inst *breaker = f_new_inst(FI_DIE, $1); @@ -975,11 +996,11 @@ cmd: $$ = f_new_inst(FI_SWITCH, $2, $4); } - | dynamic_attr '.' EMPTY ';' { $$ = f_generate_empty($1); } - | dynamic_attr '.' PREPEND '(' term ')' ';' { $$ = f_generate_complex( FI_PATH_PREPEND, $1, $5 ); } - | dynamic_attr '.' ADD '(' term ')' ';' { $$ = f_generate_complex( FI_CLIST_ADD, $1, $5 ); } - | dynamic_attr '.' DELETE '(' term ')' ';' { $$ = f_generate_complex( FI_CLIST_DEL, $1, $5 ); } - | dynamic_attr '.' FILTER '(' term ')' ';' { $$ = f_generate_complex( FI_CLIST_FILTER, $1, $5 ); } + | symbol_known '.' EMPTY ';' { $$ = f_generate_empty($1); } + | symbol_known '.' PREPEND '(' term ')' ';' { $$ = f_generate_complex_sym( FI_PATH_PREPEND, $1, $5 ); } + | symbol_known '.' ADD '(' term ')' ';' { $$ = f_generate_complex_sym( FI_CLIST_ADD, $1, $5 ); } + | symbol_known '.' DELETE '(' term ')' ';' { $$ = f_generate_complex_sym( FI_CLIST_DEL, $1, $5 ); } + | symbol_known '.' FILTER '(' term ')' ';' { $$ = f_generate_complex_sym( FI_CLIST_FILTER, $1, $5 ); } | BT_ASSERT '(' get_cf_position term get_cf_position ')' ';' { $$ = assert_done($4, $3 + 1, $5 - 1); } | BT_CHECK_ASSIGN '(' get_cf_position lvalue get_cf_position ',' term ')' ';' { $$ = assert_assign(&$4, $7, $3 + 1, $5 - 1); } ; @@ -990,8 +1011,17 @@ get_cf_position: }; lvalue: - CF_SYM_KNOWN { cf_assert_symbol($1, SYM_VARIABLE); $$ = (struct f_lval) { .type = F_LVAL_VARIABLE, .sym = $1 }; } + symbol_known { + switch ($1->class) { + case SYM_VARIABLE_RANGE: + $$ = (struct f_lval) { .type = F_LVAL_VARIABLE, .sym = $1 }; + break; + case SYM_ATTRIBUTE: + $$ = (struct f_lval) { .type = F_LVAL_EA, .da = $1->attribute }; + break; + } + } | static_attr { $$ = (struct f_lval) { .type = F_LVAL_SA, .sa = $1 }; } - | dynamic_attr { $$ = (struct f_lval) { .type = F_LVAL_EA, .da = $1 }; }; + ; CF_END diff --git a/filter/data.c b/filter/data.c index 56d746fd..f104d2f8 100644 --- a/filter/data.c +++ b/filter/data.c @@ -16,10 +16,10 @@ #include "lib/unaligned.h" #include "lib/net.h" #include "lib/ip.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/attrs.h" +#include "lib/attrs.h" #include "conf/conf.h" #include "filter/filter.h" #include "filter/f-inst.h" @@ -27,6 +27,8 @@ static const char * const f_type_str[] = { [T_VOID] = "void", + [T_OPAQUE] = "opaque byte string", + [T_IFACE] = "interface", [T_INT] = "int", [T_BOOL] = "bool", @@ -36,7 +38,6 @@ static const char * const f_type_str[] = { [T_ENUM_RTS] = "enum rts", [T_ENUM_BGP_ORIGIN] = "enum bgp_origin", [T_ENUM_SCOPE] = "enum scope", - [T_ENUM_RTC] = "enum rtc", [T_ENUM_RTD] = "enum rtd", [T_ENUM_ROA] = "enum roa", [T_ENUM_NETTYPE] = "enum nettype", @@ -47,6 +48,7 @@ static const char * const f_type_str[] = { [T_NET] = "prefix", [T_STRING] = "string", [T_PATH_MASK] = "bgpmask", + [T_PATH_MASK_ITEM] = "bgpmask item", [T_PATH] = "bgppath", [T_CLIST] = "clist", [T_EC] = "ec", @@ -54,22 +56,19 @@ static const char * const f_type_str[] = { [T_LC] = "lc", [T_LCLIST] = "lclist", [T_RD] = "rd", + + [T_SET] = "set", + [T_PREFIX_SET] = "prefix set", }; const char * -f_type_name(enum f_type t) +f_type_name(btype t) { - if (t < ARRAY_SIZE(f_type_str)) - return f_type_str[t] ?: "?"; - - if ((t == T_SET) || (t == T_PREFIX_SET)) - return "set"; - - return "?"; + return (t < ARRAY_SIZE(f_type_str)) ? (f_type_str[t] ?: "?") : "?"; } -enum f_type -f_type_element_type(enum f_type t) +btype +f_type_element_type(btype t) { switch(t) { case T_PATH: return T_INT; @@ -99,14 +98,6 @@ const struct f_val f_const_empty_path = { .val.ti = &f_const_empty_trie, }; -static struct adata * -adata_empty(struct linpool *pool, int l) -{ - struct adata *res = lp_alloc(pool, sizeof(struct adata) + l); - res->length = l; - return res; -} - static void pm_format(const struct f_path_mask *p, buffer *buf) { @@ -435,7 +426,7 @@ clist_filter(struct linpool *pool, const struct adata *list, const struct f_val if (nl == list->length) return list; - struct adata *res = adata_empty(pool, nl); + struct adata *res = lp_alloc_adata(pool, nl); memcpy(res->data, tmp, nl); return res; } @@ -469,7 +460,7 @@ eclist_filter(struct linpool *pool, const struct adata *list, const struct f_val if (nl == list->length) return list; - struct adata *res = adata_empty(pool, nl); + struct adata *res = lp_alloc_adata(pool, nl); memcpy(res->data, tmp, nl); return res; } @@ -501,7 +492,7 @@ lclist_filter(struct linpool *pool, const struct adata *list, const struct f_val if (nl == list->length) return list; - struct adata *res = adata_empty(pool, nl); + struct adata *res = lp_alloc_adata(pool, nl); memcpy(res->data, tmp, nl); return res; } @@ -625,3 +616,75 @@ val_dump(const struct f_val *v) { return val_dump_buffer; } + +struct f_val * +lp_val_copy(struct linpool *lp, const struct f_val *v) +{ + switch (v->type) + { + case T_VOID: + case T_BOOL: + case T_INT: + case T_IP: + case T_PAIR: + case T_QUAD: + case T_EC: + case T_LC: + case T_RD: + case T_ENUM: + case T_PATH_MASK_ITEM: + /* These aren't embedded but there is no need to copy them */ + case T_SET: + case T_PREFIX_SET: + case T_PATH_MASK: + case T_IFACE: + { + struct f_val *out = lp_alloc(lp, sizeof(*out)); + *out = *v; + return out; + } + + case T_NET: + { + struct { + struct f_val val; + net_addr net[0]; + } *out = lp_alloc(lp, sizeof(*out) + v->val.net->length); + out->val = *v; + out->val.val.net = out->net; + net_copy(out->net, v->val.net); + return &out->val; + } + + case T_STRING: + { + uint len = strlen(v->val.s); + struct { + struct f_val val; + char buf[0]; + } *out = lp_alloc(lp, sizeof(*out) + len + 1); + out->val = *v; + out->val.val.s = out->buf; + memcpy(out->buf, v->val.s, len+1); + return &out->val; + } + + case T_PATH: + case T_CLIST: + case T_ECLIST: + case T_LCLIST: + { + struct { + struct f_val val; + struct adata ad; + } *out = lp_alloc(lp, sizeof(*out) + v->val.ad->length); + out->val = *v; + out->val.val.ad = &out->ad; + memcpy(&out->ad, v->val.ad, v->val.ad->length); + return &out->val; + } + + default: + bug("Unknown type in value copy: %d", v->type); + } +} diff --git a/filter/data.h b/filter/data.h index b3767f7b..b5db4aec 100644 --- a/filter/data.h +++ b/filter/data.h @@ -11,111 +11,37 @@ #define _BIRD_FILTER_DATA_H_ #include "nest/bird.h" - -/* Type numbers must be in 0..0xff range */ -#define T_MASK 0xff - -/* Internal types */ -enum f_type { -/* Nothing. Simply nothing. */ - T_VOID = 0, - -/* User visible types, which fit in int */ - T_INT = 0x10, - T_BOOL = 0x11, - T_PAIR = 0x12, /* Notice that pair is stored as integer: first << 16 | second */ - T_QUAD = 0x13, - -/* Put enumerational types in 0x30..0x3f range */ - T_ENUM_LO = 0x30, - T_ENUM_HI = 0x3f, - - T_ENUM_RTS = 0x30, - T_ENUM_BGP_ORIGIN = 0x31, - T_ENUM_SCOPE = 0x32, - T_ENUM_RTC = 0x33, - T_ENUM_RTD = 0x34, - T_ENUM_ROA = 0x35, - T_ENUM_NETTYPE = 0x36, - T_ENUM_RA_PREFERENCE = 0x37, - T_ENUM_AF = 0x38, - -/* new enums go here */ - T_ENUM_EMPTY = 0x3f, /* Special hack for atomic_aggr */ - -#define T_ENUM T_ENUM_LO ... T_ENUM_HI - -/* Bigger ones */ - T_IP = 0x20, - T_NET = 0x21, - T_STRING = 0x22, - T_PATH_MASK = 0x23, /* mask for BGP path */ - T_PATH = 0x24, /* BGP path */ - T_CLIST = 0x25, /* Community list */ - T_EC = 0x26, /* Extended community value, u64 */ - T_ECLIST = 0x27, /* Extended community list */ - T_LC = 0x28, /* Large community value, lcomm */ - T_LCLIST = 0x29, /* Large community list */ - T_RD = 0x2a, /* Route distinguisher for VPN addresses */ - T_PATH_MASK_ITEM = 0x2b, /* Path mask item for path mask constructors */ - - T_SET = 0x80, - T_PREFIX_SET = 0x81, -} PACKED; +#include "lib/type.h" /* Filter value; size of this affects filter memory consumption */ struct f_val { - enum f_type type; /* T_* */ - union { - uint i; - u64 ec; - lcomm lc; - ip_addr ip; - const net_addr *net; - const char *s; - const struct f_tree *t; - const struct f_trie *ti; - const struct adata *ad; - const struct f_path_mask *path_mask; - struct f_path_mask_item pmi; - } val; + btype type; /* T_* */ + union bval_long val; }; -/* Dynamic attribute definition (eattrs) */ -struct f_dynamic_attr { - u8 type; /* EA type (EAF_*) */ - u8 bit; /* For bitfield accessors */ - enum f_type f_type; /* Filter type */ - uint ea_code; /* EA code */ -}; +#define fputip(a) ({ ip_addr *ax = falloc(sizeof(*ax)); *ax = (a); ax; }) enum f_sa_code { - SA_FROM = 1, - SA_GW, + SA_GW = 1, SA_NET, SA_PROTO, - SA_SOURCE, - SA_SCOPE, SA_DEST, SA_IFNAME, SA_IFINDEX, SA_WEIGHT, - SA_PREF, SA_GW_MPLS, - SA_ONLINK, } PACKED; /* Static attribute definition (members of struct rta) */ struct f_static_attr { - enum f_type f_type; /* Filter type */ + btype type; /* Data type */ enum f_sa_code sa_code; /* Static attribute id */ - int readonly:1; /* Don't allow writing */ + int readonly:1; /* Don't allow writing */ }; /* Filter l-value type */ enum f_lval_type { F_LVAL_VARIABLE, - F_LVAL_PREFERENCE, F_LVAL_SA, F_LVAL_EA, }; @@ -125,7 +51,7 @@ struct f_lval { enum f_lval_type type; union { struct symbol *sym; - struct f_dynamic_attr da; + const struct ea_class *da; struct f_static_attr sa; }; }; @@ -274,9 +200,9 @@ trie_match_next_longest_ip6(net_addr_ip6 *n, ip6_addr *found) #define F_CMP_ERROR 999 -const char *f_type_name(enum f_type t); +const char *f_type_name(btype t); -enum f_type f_type_element_type(enum f_type t); +enum btype f_type_element_type(btype t); int val_same(const struct f_val *v1, const struct f_val *v2); int val_compare(const struct f_val *v1, const struct f_val *v2); @@ -284,6 +210,8 @@ void val_format(const struct f_val *v, buffer *buf); char *val_format_str(struct linpool *lp, const struct f_val *v); const char *val_dump(const struct f_val *v); +struct f_val *lp_val_copy(struct linpool *lp, const struct f_val *v); + static inline int val_is_ip4(const struct f_val *v) { return (v->type == T_IP) && ipa_is_ip4(v->val.ip); } int val_in_range(const struct f_val *v1, const struct f_val *v2); @@ -315,7 +243,17 @@ undef_value(struct f_val v) } extern const struct f_val f_const_empty_path, f_const_empty_clist, f_const_empty_eclist, f_const_empty_lclist, f_const_empty_prefix_set; +static inline const struct f_val *f_get_empty(btype t) +{ + switch (t) { + case T_PATH: return &f_const_empty_path; + case T_CLIST: return &f_const_empty_clist; + case T_ECLIST: return &f_const_empty_eclist; + case T_LCLIST: return &f_const_empty_lclist; + default: return NULL; + } +} -enum filter_return f_eval(const struct f_line *expr, struct linpool *tmp_pool, struct f_val *pres); +enum filter_return f_eval(const struct f_line *expr, struct f_val *pres); #endif diff --git a/filter/decl.m4 b/filter/decl.m4 index b6026867..5ef3616a 100644 --- a/filter/decl.m4 +++ b/filter/decl.m4 @@ -94,7 +94,7 @@ FID_DUMP_BODY()m4_dnl debug("%s" $4 "\n", INDENT, $5); ]]) FID_INTERPRET_EXEC()m4_dnl -const $1 $2 = whati->$2 +$1 $2 = whati->$2 FID_INTERPRET_BODY') # Instruction arguments are needed only until linearization is done. @@ -201,6 +201,7 @@ FID_INTERPRET_BODY()') # that was needed in the former implementation. m4_define(LINEX, `FID_INTERPRET_EXEC()LINEX_($1)FID_INTERPRET_NEW()return $1 FID_INTERPRET_BODY()') m4_define(LINEX_, `do { + if (fstk->ecnt + 1 >= fstk->elen) runtime("Filter execution stack overflow"); fstk->estk[fstk->ecnt].pos = 0; fstk->estk[fstk->ecnt].line = $1; fstk->estk[fstk->ecnt].ventry = fstk->vcnt; @@ -238,7 +239,7 @@ FID_INTERPRET_BODY()') # state the result and put it to the right place. m4_define(RESULT, `RESULT_TYPE([[$1]]) RESULT_([[$1]],[[$2]],[[$3]])') m4_define(RESULT_, `RESULT_VAL([[ (struct f_val) { .type = $1, .val.$2 = $3 } ]])') -m4_define(RESULT_VAL, `FID_HIC(, [[do { res = $1; fstk->vcnt++; } while (0)]], +m4_define(RESULT_VAL, `FID_HIC(, [[do { res = $1; f_vcnt_check_overflow(1); fstk->vcnt++; } while (0)]], [[return fi_constant(what, $1)]])') m4_define(RESULT_VOID, `RESULT_VAL([[ (struct f_val) { .type = T_VOID } ]])') @@ -265,7 +266,7 @@ FID_INTERPRET_BODY()') m4_define(SYMBOL, `FID_MEMBER(struct symbol *, sym, [[strcmp(f1->sym->name, f2->sym->name) || (f1->sym->class != f2->sym->class)]], "symbol %s", item->sym->name)') m4_define(RTC, `FID_MEMBER(struct rtable_config *, rtc, [[strcmp(f1->rtc->name, f2->rtc->name)]], "route table %s", item->rtc->name)') m4_define(STATIC_ATTR, `FID_MEMBER(struct f_static_attr, sa, f1->sa.sa_code != f2->sa.sa_code,,)') -m4_define(DYNAMIC_ATTR, `FID_MEMBER(struct f_dynamic_attr, da, f1->da.ea_code != f2->da.ea_code,,)') +m4_define(DYNAMIC_ATTR, `FID_MEMBER(const struct ea_class *, da, f1->da != f2->da,,)') m4_define(ACCESS_RTE, `FID_HIC(,[[do { if (!fs->rte) runtime("No route to access"); } while (0)]],NEVER_CONSTANT())') # 2) Code wrapping @@ -502,7 +503,7 @@ fi_constant(struct f_inst *what, struct f_val val) } static int -f_const_promotion(struct f_inst *arg, enum f_type want) +f_const_promotion(struct f_inst *arg, btype want) { if (arg->fi_code != FI_CONSTANT) return 0; @@ -660,7 +661,7 @@ struct f_inst { struct f_inst *next; /* Next instruction */ enum f_instruction_code fi_code; /* Instruction code */ enum f_instruction_flags flags; /* Flags, instruction-specific */ - enum f_type type; /* Type of returned value, if known */ + btype type; /* Type of returned value, if known */ int size; /* How many instructions are underneath */ int lineno; /* Line number */ union { diff --git a/filter/f-inst.c b/filter/f-inst.c index e4b47ff4..caffc2b8 100644 --- a/filter/f-inst.c +++ b/filter/f-inst.c @@ -70,7 +70,6 @@ * m4_dnl DYNAMIC_ATTR; dynamic attribute definition * m4_dnl RTC; route table config * m4_dnl ACCESS_RTE; this instruction needs route - * m4_dnl ACCESS_EATTRS; this instruction needs extended attributes * * m4_dnl FID_MEMBER( custom instruction member * m4_dnl C type, for storage in structs @@ -237,8 +236,6 @@ * m4_dnl fpool -> the current linpool * m4_dnl NEVER_CONSTANT-> don't generate pre-interpretation code at all * m4_dnl ACCESS_RTE -> check that route is available, also NEVER_CONSTANT - * m4_dnl ACCESS_EATTRS -> pre-cache the eattrs; use only with ACCESS_RTE - * m4_dnl f_rta_cow(fs) -> function to call before any change to route should be done * * m4_dnl If you are stymied, see FI_CALL or FI_CONSTANT or just search for * m4_dnl the mentioned macros in this file to see what is happening there in wild. @@ -297,6 +294,16 @@ if (v2.val.i == 0) runtime( "Mother told me not to divide by 0" ); RESULT(T_INT, i, v1.val.i / v2.val.i); } + INST(FI_BITOR, 2, 1) { + ARG(1,T_INT); + ARG(2,T_INT); + RESULT(T_INT, i, v1.val.i | v2.val.i); + } + INST(FI_BITAND, 2, 1) { + ARG(1,T_INT); + ARG(2,T_INT); + RESULT(T_INT, i, v1.val.i & v2.val.i); + } INST(FI_AND, 1, 1) { ARG(1,T_BOOL); ARG_TYPE_STATIC(2,T_BOOL); @@ -559,8 +566,8 @@ /* Static type check */ if (f1->type) { - enum f_type t_var = (sym->class & 0xff); - enum f_type t_arg = f_type_element_type(f1->type); + enum btype t_var = (sym->class & 0xff); + enum btype t_arg = f_type_element_type(f1->type); if (!t_arg) cf_error("Value of expression in FOR must be iterable, got %s", f_type_name(f1->type)); @@ -678,26 +685,43 @@ { STATIC_ATTR; ACCESS_RTE; - struct rta *rta = (*fs->rte)->attrs; switch (sa.sa_code) { - case SA_FROM: RESULT(sa.f_type, ip, rta->from); break; - case SA_GW: RESULT(sa.f_type, ip, rta->nh.gw); break; - case SA_NET: RESULT(sa.f_type, net, (*fs->rte)->net->n.addr); break; - case SA_PROTO: RESULT(sa.f_type, s, (*fs->rte)->src->proto->name); break; - case SA_SOURCE: RESULT(sa.f_type, i, rta->source); break; - case SA_SCOPE: RESULT(sa.f_type, i, rta->scope); break; - case SA_DEST: RESULT(sa.f_type, i, rta->dest); break; - case SA_IFNAME: RESULT(sa.f_type, s, rta->nh.iface ? rta->nh.iface->name : ""); break; - case SA_IFINDEX: RESULT(sa.f_type, i, rta->nh.iface ? rta->nh.iface->index : 0); break; - case SA_WEIGHT: RESULT(sa.f_type, i, rta->nh.weight + 1); break; - case SA_PREF: RESULT(sa.f_type, i, rta->pref); break; - case SA_GW_MPLS: RESULT(sa.f_type, i, rta->nh.labels ? rta->nh.label[0] : MPLS_NULL); break; - case SA_ONLINK: RESULT(sa.f_type, i, rta->nh.flags & RNF_ONLINK ? 1 : 0); break; - + case SA_NET: RESULT(sa.type, net, fs->rte->net); break; + case SA_PROTO: RESULT(sa.type, s, fs->rte->src->owner->name); break; default: - bug("Invalid static attribute access (%u/%u)", sa.f_type, sa.sa_code); + { + struct eattr *nhea = ea_find(fs->rte->attrs, &ea_gen_nexthop); + struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + struct nexthop *nh = nhad ? &nhad->nh : NULL; + + switch (sa.sa_code) + { + case SA_DEST: + RESULT(sa.type, i, nhad ? + (NEXTHOP_IS_REACHABLE(nhad) ? RTD_UNICAST : nhad->dest) + : RTD_NONE); + break; + case SA_GW: + RESULT(sa.type, ip, nh ? nh->gw : IPA_NONE); + break; + case SA_IFNAME: + RESULT(sa.type, s, (nh && nh->iface) ? nh->iface->name : ""); + break; + case SA_IFINDEX: + RESULT(sa.type, i, (nh && nh->iface) ? nh->iface->index : 0); + break; + case SA_WEIGHT: + RESULT(sa.type, i, (nh ? nh->weight : 0) + 1); + break; + case SA_GW_MPLS: + RESULT(sa.type, i, (nh && nh->labels) ? nh->label[0] : MPLS_NULL); + break; + default: + bug("Invalid static attribute access (%u/%u)", sa.type, sa.sa_code); + } + } } } } @@ -706,51 +730,52 @@ ACCESS_RTE; ARG_ANY(1); STATIC_ATTR; - ARG_TYPE(1, sa.f_type); - - f_rta_cow(fs); + ARG_TYPE(1, sa.type); { - struct rta *rta = (*fs->rte)->attrs; + union { + struct nexthop_adata nha; + struct { + struct adata ad; + struct nexthop nh; + u32 label; + }; + } nha; + + nha.ad = (struct adata) { + .length = sizeof (struct nexthop_adata) - sizeof (struct adata), + }; + + eattr *a = NULL; switch (sa.sa_code) { - case SA_FROM: - rta->from = v1.val.ip; - break; + case SA_DEST: + { + int i = v1.val.i; + if ((i != RTD_BLACKHOLE) && (i != RTD_UNREACHABLE) && (i != RTD_PROHIBIT)) + runtime( "Destination can be changed only to blackhole, unreachable or prohibit" ); + nha.nha.dest = i; + nha.ad.length = NEXTHOP_DEST_SIZE; + break; + } case SA_GW: { + struct eattr *nh_ea = ea_find(fs->rte->attrs, &ea_gen_nexthop); + ip_addr ip = v1.val.ip; - struct iface *ifa = ipa_is_link_local(ip) || (rta->nh.flags & RNF_ONLINK) ? rta->nh.iface : NULL; - neighbor *n = neigh_find((*fs->rte)->src->proto, ip, ifa, (rta->nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); + struct iface *ifa = (ipa_is_link_local(ip) && nh_ea) ? + ((struct nexthop_adata *) nh_ea->u.ptr)->nh.iface : NULL; + + /* XXX this code supposes that every owner is a protocol XXX */ + neighbor *n = neigh_find(SKIP_BACK(struct proto, sources, fs->rte->src->owner), ip, ifa, 0); if (!n || (n->scope == SCOPE_HOST)) runtime( "Invalid gw address" ); - rta->dest = RTD_UNICAST; - rta->nh.gw = ip; - rta->nh.iface = n->iface; - rta->nh.next = NULL; - rta->hostentry = NULL; - rta->nh.labels = 0; - } - break; - - case SA_SCOPE: - rta->scope = v1.val.i; - break; - - case SA_DEST: - { - int i = v1.val.i; - if ((i != RTD_BLACKHOLE) && (i != RTD_UNREACHABLE) && (i != RTD_PROHIBIT)) - runtime( "Destination can be changed only to blackhole, unreachable or prohibit" ); - - rta->dest = i; - rta->nh.gw = IPA_NONE; - rta->nh.iface = NULL; - rta->nh.next = NULL; - rta->hostentry = NULL; - rta->nh.labels = 0; + nha.nh = (struct nexthop) { + .gw = ip, + .iface = n->iface, + }; } break; @@ -760,12 +785,9 @@ if (!ifa) runtime( "Invalid iface name" ); - rta->dest = RTD_UNICAST; - rta->nh.gw = IPA_NONE; - rta->nh.iface = ifa; - rta->nh.next = NULL; - rta->hostentry = NULL; - rta->nh.labels = 0; + nha.nh = (struct nexthop) { + .iface = ifa, + }; } break; @@ -774,13 +796,20 @@ if (v1.val.i >= 0x100000) runtime( "Invalid MPLS label" ); + struct eattr *nh_ea = ea_find(fs->rte->attrs, &ea_gen_nexthop); + if (!nh_ea) + runtime( "No nexthop to add a MPLS label to" ); + + nha.nh = ((struct nexthop_adata *) nh_ea->u.ptr)->nh; + if (v1.val.i != MPLS_NULL) { - rta->nh.label[0] = v1.val.i; - rta->nh.labels = 1; + nha.nh.label[0] = v1.val.i; + nha.nh.labels = 1; + nha.ad.length = sizeof nha - sizeof (struct adata); } else - rta->nh.labels = 0; + nha.nh.labels = 0; } break; @@ -789,180 +818,120 @@ int i = v1.val.i; if (i < 1 || i > 256) runtime( "Setting weight value out of bounds" ); - if (rta->dest != RTD_UNICAST) + + struct eattr *nh_ea = ea_find(fs->rte->attrs, &ea_gen_nexthop); + if (!nh_ea) + runtime( "No nexthop to set weight on" ); + + struct nexthop_adata *nhad = (struct nexthop_adata *) nh_ea->u.ptr; + if (!NEXTHOP_IS_REACHABLE(nhad)) runtime( "Setting weight needs regular nexthop " ); + struct nexthop_adata *nhax = (struct nexthop_adata *) tmp_copy_adata(&nhad->ad); + /* Set weight on all next hops */ - for (struct nexthop *nh = &rta->nh; nh; nh = nh->next) + NEXTHOP_WALK(nh, nhax) nh->weight = i - 1; - } - break; - case SA_PREF: - rta->pref = v1.val.i; - break; - - case SA_ONLINK: - { - if (v1.val.i) - rta->nh.flags |= RNF_ONLINK; - else - rta->nh.flags &= ~RNF_ONLINK; - } + a = ea_set_attr(&fs->rte->attrs, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhax->ad)); + } break; default: - bug("Invalid static attribute access (%u/%u)", sa.f_type, sa.sa_code); + bug("Invalid static attribute access (%u/%u)", sa.type, sa.sa_code); } + + if (!a) + a = ea_set_attr(&fs->rte->attrs, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, tmp_copy_adata(&nha.ad))); + + a->originated = 1; + a->fresh = 1; } } INST(FI_EA_GET, 0, 1) { /* Access to extended attributes */ DYNAMIC_ATTR; ACCESS_RTE; - ACCESS_EATTRS; - RESULT_TYPE(da.f_type); + RESULT_TYPE(da->type); { - eattr *e = ea_find(*fs->eattrs, da.ea_code); + const struct f_val *empty; + const eattr *e = ea_find(fs->rte->attrs, da->id); - if (!e) { - /* A special case: undefined as_path looks like empty as_path */ - if (da.type == EAF_TYPE_AS_PATH) { - RESULT_(T_PATH, ad, &null_adata); - break; - } - - /* The same special case for int_set */ - if (da.type == EAF_TYPE_INT_SET) { - RESULT_(T_CLIST, ad, &null_adata); - break; - } - - /* The same special case for ec_set */ - if (da.type == EAF_TYPE_EC_SET) { - RESULT_(T_ECLIST, ad, &null_adata); - break; - } + if (e) + { + ASSERT_DIE(e->type == da->type); - /* The same special case for lc_set */ - if (da.type == EAF_TYPE_LC_SET) { - RESULT_(T_LCLIST, ad, &null_adata); - break; + switch (e->type) { + case T_IP: + RESULT_(T_IP, ip, *((const ip_addr *) e->u.ptr->data)); + break; + default: + RESULT_VAL([[(struct f_val) { + .type = e->type, + .val.bval = e->u, + }]]); } - - /* Undefined value */ - RESULT_VOID; - break; - } - - switch (e->type & EAF_TYPE_MASK) { - case EAF_TYPE_INT: - RESULT_(da.f_type, i, e->u.data); - break; - case EAF_TYPE_ROUTER_ID: - RESULT_(T_QUAD, i, e->u.data); - break; - case EAF_TYPE_OPAQUE: - RESULT_(T_ENUM_EMPTY, i, 0); - break; - case EAF_TYPE_IP_ADDRESS: - RESULT_(T_IP, ip, *((ip_addr *) e->u.ptr->data)); - break; - case EAF_TYPE_AS_PATH: - RESULT_(T_PATH, ad, e->u.ptr); - break; - case EAF_TYPE_BITFIELD: - RESULT_(T_BOOL, i, !!(e->u.data & (1u << da.bit))); - break; - case EAF_TYPE_INT_SET: - RESULT_(T_CLIST, ad, e->u.ptr); - break; - case EAF_TYPE_EC_SET: - RESULT_(T_ECLIST, ad, e->u.ptr); - break; - case EAF_TYPE_LC_SET: - RESULT_(T_LCLIST, ad, e->u.ptr); - break; - default: - bug("Unknown dynamic attribute type"); } + else if (empty = f_get_empty(da->type)) + RESULT_VAL(*empty); + else + RESULT_VOID; } } INST(FI_EA_SET, 1, 0) { ACCESS_RTE; - ACCESS_EATTRS; ARG_ANY(1); DYNAMIC_ATTR; - ARG_TYPE(1, da.f_type); + ARG_TYPE(1, da->type); { - struct ea_list *l = lp_alloc(fs->pool, sizeof(struct ea_list) + sizeof(eattr)); - - l->next = NULL; - l->flags = EALF_SORTED; - l->count = 1; - l->attrs[0].id = da.ea_code; - l->attrs[0].flags = 0; - l->attrs[0].type = da.type; - l->attrs[0].originated = 1; - l->attrs[0].fresh = 1; - l->attrs[0].undef = 0; - - switch (da.type) { - case EAF_TYPE_INT: - case EAF_TYPE_ROUTER_ID: - l->attrs[0].u.data = v1.val.i; - break; + struct eattr *a; - case EAF_TYPE_OPAQUE: - runtime( "Setting opaque attribute is not allowed" ); - break; + if (da->type >= EAF_TYPE__MAX) + bug("Unsupported attribute type"); - case EAF_TYPE_IP_ADDRESS:; - int len = sizeof(ip_addr); - struct adata *ad = lp_alloc(fs->pool, sizeof(struct adata) + len); - ad->length = len; - (* (ip_addr *) ad->data) = v1.val.ip; - l->attrs[0].u.ptr = ad; + switch (da->type) { + case T_OPAQUE: + case T_IFACE: + runtime( "Setting opaque attribute is not allowed" ); break; - case EAF_TYPE_AS_PATH: - case EAF_TYPE_INT_SET: - case EAF_TYPE_EC_SET: - case EAF_TYPE_LC_SET: - l->attrs[0].u.ptr = v1.val.ad; - break; - - case EAF_TYPE_BITFIELD: - { - /* First, we have to find the old value */ - eattr *e = ea_find(*fs->eattrs, da.ea_code); - u32 data = e ? e->u.data : 0; - - if (v1.val.i) - l->attrs[0].u.data = data | (1u << da.bit); - else - l->attrs[0].u.data = data & ~(1u << da.bit); - } + case T_IP: + a = ea_set_attr(&fs->rte->attrs, + EA_LITERAL_STORE_ADATA(da, 0, &v1.val.ip, sizeof(ip_addr))); break; default: - bug("Unknown dynamic attribute type"); + a = ea_set_attr(&fs->rte->attrs, + EA_LITERAL_GENERIC(da->id, da->type, 0, .u = v1.val.bval)); + break; } - f_rta_cow(fs); - l->next = *fs->eattrs; - *fs->eattrs = l; + a->originated = 1; + a->fresh = 1; } } INST(FI_EA_UNSET, 0, 0) { DYNAMIC_ATTR; ACCESS_RTE; - ACCESS_EATTRS; - f_rta_cow(fs); - ea_unset_attr(fs->eattrs, fs->pool, 1, da.ea_code); + ea_unset_attr(&fs->rte->attrs, 1, da); + } + + INST(FI_DEFAULT, 2, 1) { + ARG_ANY(1); + ARG_ANY(2); + RESULT_TYPE(f_type_element_type(v2.type)); + + log(L_INFO "Type of arg 1 is: %d", v1.type); + + if (v1.type == T_VOID) + RESULT_VAL(v2); + else + RESULT_VAL(v1); } INST(FI_LENGTH, 1, 1) { /* Get length of */ @@ -1244,7 +1213,7 @@ struct f_arg *b = sym->function->arg_list; for (uint i = 1; a && b; a = a->next, b = b->next, i++) { - enum f_type b_type = b->arg->class & 0xff; + enum btype b_type = b->arg->class & 0xff; if (a->type && (a->type != b_type) && !f_const_promotion(a, b_type)) cf_error("Argument %u of '%s' must be %s, got %s", @@ -1281,6 +1250,7 @@ fstk->vcnt += sym->function->args; /* Storage for local variables */ + f_vcnt_check_overflow(sym->function->vars); memset(&(fstk->vstk[fstk->vcnt]), 0, sizeof(struct f_val) * sym->function->vars); fstk->vcnt += sym->function->vars; } @@ -1504,42 +1474,12 @@ runtime("Can't filter non-[e|l]clist"); } - INST(FI_ROA_CHECK_IMPLICIT, 0, 1) { /* ROA Check */ - NEVER_CONSTANT; - RTC(1); - struct rtable *table = rtc->table; - ACCESS_RTE; - ACCESS_EATTRS; - const net_addr *net = (*fs->rte)->net->n.addr; - - /* We ignore temporary attributes, probably not a problem here */ - /* 0x02 is a value of BA_AS_PATH, we don't want to include BGP headers */ - eattr *e = ea_find(*fs->eattrs, EA_CODE(PROTOCOL_BGP, 0x02)); - - if (!e || ((e->type & EAF_TYPE_MASK) != EAF_TYPE_AS_PATH)) - runtime("Missing AS_PATH attribute"); - - u32 as = 0; - as_path_get_last(e->u.ptr, &as); - - if (!table) - runtime("Missing ROA table"); - - if (table->addr_type != NET_ROA4 && table->addr_type != NET_ROA6) - runtime("Table type must be either ROA4 or ROA6"); - - if (table->addr_type != (net->type == NET_IP4 ? NET_ROA4 : NET_ROA6)) - RESULT(T_ENUM_ROA, i, ROA_UNKNOWN); /* Prefix and table type mismatch */ - else - RESULT(T_ENUM_ROA, i, [[ net_roa_check(table, net, as) ]]); - } - - INST(FI_ROA_CHECK_EXPLICIT, 2, 1) { /* ROA Check */ + INST(FI_ROA_CHECK, 2, 1) { /* ROA Check */ NEVER_CONSTANT; ARG(1, T_NET); ARG(2, T_INT); RTC(3); - struct rtable *table = rtc->table; + rtable *table = rtc->table; u32 as = v2.val.i; diff --git a/filter/f-inst.h b/filter/f-inst.h index e35f71c6..fbc59de7 100644 --- a/filter/f-inst.h +++ b/filter/f-inst.h @@ -94,15 +94,17 @@ void f_add_lines(const struct f_line_item *what, struct filter_iterator *fit); struct filter *f_new_where(struct f_inst *); -static inline struct f_dynamic_attr f_new_dynamic_attr(u8 type, enum f_type f_type, uint code) /* Type as core knows it, type as filters know it, and code of dynamic attribute */ -{ return (struct f_dynamic_attr) { .type = type, .f_type = f_type, .ea_code = code }; } /* f_type currently unused; will be handy for static type checking */ -static inline struct f_dynamic_attr f_new_dynamic_attr_bit(u8 bit, enum f_type f_type, uint code) /* Type as core knows it, type as filters know it, and code of dynamic attribute */ -{ return (struct f_dynamic_attr) { .type = EAF_TYPE_BITFIELD, .bit = bit, .f_type = f_type, .ea_code = code }; } /* f_type currently unused; will be handy for static type checking */ -static inline struct f_static_attr f_new_static_attr(int f_type, int code, int readonly) -{ return (struct f_static_attr) { .f_type = f_type, .sa_code = code, .readonly = readonly }; } -struct f_inst *f_generate_complex(enum f_instruction_code fi_code, struct f_dynamic_attr da, struct f_inst *argument); +static inline struct f_static_attr f_new_static_attr(btype type, int code, int readonly) +{ return (struct f_static_attr) { .type = type, .sa_code = code, .readonly = readonly }; } struct f_inst *f_generate_roa_check(struct rtable_config *table, struct f_inst *prefix, struct f_inst *asn); +struct f_attr_bit { + const struct ea_class *class; + uint bit; +}; + +#define f_new_dynamic_attr_bit(_bit, _name) ((struct f_attr_bit) { .bit = _bit, .class = ea_class_find(_name) }) + /* Hook for call bt_assert() function in configuration */ extern void (*bt_assert_hook)(int result, const struct f_line_item *assert); diff --git a/filter/f-util.c b/filter/f-util.c index d814493e..82a06bdd 100644 --- a/filter/f-util.c +++ b/filter/f-util.c @@ -2,7 +2,7 @@ * Filters: utility functions * * Copyright 1998 Pavel Machek <pavel@ucw.cz> - * 2017 Jan Maria Matejka <mq@ucw.cz> + * 2017 Maria Matejka <mq@ucw.cz> * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -13,7 +13,7 @@ #include "filter/f-inst.h" #include "lib/idm.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #define P(a,b) ((a<<8) | b) @@ -40,144 +40,3 @@ struct filter *f_new_where(struct f_inst *where) f->root = f_linearize(cond, 0); return f; } - -#define CA_KEY(n) n->name, n->fda.type -#define CA_NEXT(n) n->next -#define CA_EQ(na,ta,nb,tb) (!strcmp(na,nb) && (ta == tb)) -#define CA_FN(n,t) (mem_hash(n, strlen(n)) ^ (t*0xaae99453U)) -#define CA_ORDER 8 /* Fixed */ - -struct ca_storage { - struct ca_storage *next; - struct f_dynamic_attr fda; - u32 uc; - char name[0]; -}; - -HASH(struct ca_storage) ca_hash; - -static struct idm ca_idm; -static struct ca_storage **ca_storage; -static uint ca_storage_max; - -static void -ca_free(resource *r) -{ - struct custom_attribute *ca = (void *) r; - struct ca_storage *cas = HASH_FIND(ca_hash, CA, ca->name, ca->fda->type); - ASSERT(cas); - - ca->name = NULL; - ca->fda = NULL; - if (!--cas->uc) { - uint id = EA_CUSTOM_ID(cas->fda.ea_code); - idm_free(&ca_idm, id); - HASH_REMOVE(ca_hash, CA, cas); - ca_storage[id] = NULL; - mb_free(cas); - } -} - -static void -ca_dump(resource *r) -{ - struct custom_attribute *ca = (void *) r; - debug("name \"%s\" id 0x%04x ea_type 0x%02x f_type 0x%02x\n", - ca->name, ca->fda->ea_code, ca->fda->type, ca->fda->f_type); -} - -static struct resclass ca_class = { - .name = "Custom attribute", - .size = sizeof(struct custom_attribute), - .free = ca_free, - .dump = ca_dump, - .lookup = NULL, - .memsize = NULL, -}; - -struct custom_attribute * -ca_lookup(pool *p, const char *name, int f_type) -{ - int ea_type; - - switch (f_type) { - case T_INT: - ea_type = EAF_TYPE_INT; - break; - case T_IP: - ea_type = EAF_TYPE_IP_ADDRESS; - break; - case T_QUAD: - ea_type = EAF_TYPE_ROUTER_ID; - break; - case T_PATH: - ea_type = EAF_TYPE_AS_PATH; - break; - case T_CLIST: - ea_type = EAF_TYPE_INT_SET; - break; - case T_ECLIST: - ea_type = EAF_TYPE_EC_SET; - break; - case T_LCLIST: - ea_type = EAF_TYPE_LC_SET; - break; - default: - cf_error("Custom route attribute of unsupported type"); - } - - static int inited = 0; - if (!inited) { - idm_init(&ca_idm, config_pool, 8); - HASH_INIT(ca_hash, config_pool, CA_ORDER); - - ca_storage_max = 256; - ca_storage = mb_allocz(config_pool, sizeof(struct ca_storage *) * ca_storage_max); - - inited++; - } - - struct ca_storage *cas = HASH_FIND(ca_hash, CA, name, ea_type); - if (cas) { - cas->uc++; - } else { - - uint id = idm_alloc(&ca_idm); - - if (id >= EA_CUSTOM_BIT) - cf_error("Too many custom attributes."); - - if (id >= ca_storage_max) { - ca_storage_max *= 2; - ca_storage = mb_realloc(ca_storage, sizeof(struct ca_storage *) * ca_storage_max * 2); - } - - cas = mb_allocz(config_pool, sizeof(struct ca_storage) + strlen(name) + 1); - cas->fda = f_new_dynamic_attr(ea_type, f_type, EA_CUSTOM(id)); - cas->uc = 1; - - strcpy(cas->name, name); - ca_storage[id] = cas; - - HASH_INSERT(ca_hash, CA, cas); - } - - struct custom_attribute *ca = ralloc(p, &ca_class); - ca->fda = &(cas->fda); - ca->name = cas->name; - return ca; -} - -const char * -ea_custom_name(uint ea) -{ - uint id = EA_CUSTOM_ID(ea); - if (id >= ca_storage_max) - return NULL; - - if (!ca_storage[id]) - return NULL; - - return ca_storage[id]->name; -} - diff --git a/filter/filter.c b/filter/filter.c index 20a380dc..0aff4d30 100644 --- a/filter/filter.c +++ b/filter/filter.c @@ -35,10 +35,10 @@ #include "lib/ip.h" #include "lib/net.h" #include "lib/flowspec.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/attrs.h" +#include "lib/attrs.h" #include "conf/conf.h" #include "filter/filter.h" #include "filter/f-inst.h" @@ -50,42 +50,31 @@ enum f_exception { FE_RETURN = 0x1, }; - -struct filter_stack { - /* Value stack for execution */ -#define F_VAL_STACK_MAX 4096 - uint vcnt; /* Current value stack size; 0 for empty */ - uint ecnt; /* Current execute stack size; 0 for empty */ - - struct f_val vstk[F_VAL_STACK_MAX]; /* The stack itself */ - - /* Instruction stack for execution */ -#define F_EXEC_STACK_MAX 4096 - struct { - const struct f_line *line; /* The line that is being executed */ - uint pos; /* Instruction index in the line */ - uint ventry; /* Value stack depth on entry */ - uint vbase; /* Where to index variable positions from */ - enum f_exception emask; /* Exception mask */ - } estk[F_EXEC_STACK_MAX]; +struct filter_exec_stack { + const struct f_line *line; /* The line that is being executed */ + uint pos; /* Instruction index in the line */ + uint ventry; /* Value stack depth on entry */ + uint vbase; /* Where to index variable positions from */ + enum f_exception emask; /* Exception mask */ }; /* Internal filter state, to be allocated on stack when executing filters */ struct filter_state { /* Stacks needed for execution */ - struct filter_stack *stack; + struct filter_stack { + /* Current filter stack depth */ - /* The route we are processing. This may be NULL to indicate no route available. */ - struct rte **rte; - - /* The old rta to be freed after filters are done. */ - struct rta *old_rta; + /* Value stack */ + uint vcnt, vlen; + struct f_val *vstk; - /* Cached pointer to ea_list */ - struct ea_list **eattrs; + /* Instruction stack for execution */ + uint ecnt, elen; + struct filter_exec_stack *estk; + } stack; - /* Linpool for adata allocation */ - struct linpool *pool; + /* The route we are processing. This may be NULL to indicate no route available. */ + struct rte *rte; /* Buffer for log output */ struct buffer buf; @@ -95,49 +84,12 @@ struct filter_state { }; _Thread_local static struct filter_state filter_state; -_Thread_local static struct filter_stack filter_stack; void (*bt_assert_hook)(int result, const struct f_line_item *assert); -static inline void f_cache_eattrs(struct filter_state *fs) -{ - fs->eattrs = &((*fs->rte)->attrs->eattrs); -} - -static inline void f_rte_cow(struct filter_state *fs) -{ - if (!((*fs->rte)->flags & REF_COW)) - return; - - *fs->rte = rte_cow(*fs->rte); -} +#define _f_stack_init(fs, px, def) ((fs).stack.px##stk = alloca(sizeof(*(fs).stack.px##stk) * ((fs).stack.px##len = (config && config->filter_##px##stk) ? config->filter_##px##stk : (def)))) -/* - * rta_cow - prepare rta for modification by filter - */ -static void -f_rta_cow(struct filter_state *fs) -{ - if (!rta_is_cached((*fs->rte)->attrs)) - return; - - /* Prepare to modify rte */ - f_rte_cow(fs); - - /* Store old rta to free it later, it stores reference from rte_cow() */ - fs->old_rta = (*fs->rte)->attrs; - - /* - * Get shallow copy of rta. Fields eattrs and nexthops of rta are shared - * with fs->old_rta (they will be copied when the cached rta will be obtained - * at the end of f_run()), also the lock of hostentry is inherited (we - * suppose hostentry is not changed by filters). - */ - (*fs->rte)->attrs = rta_do_cow((*fs->rte)->attrs, fs->pool); - - /* Re-cache the ea_list */ - f_cache_eattrs(fs); -} +#define f_stack_init(fs) ( _f_stack_init(fs, v, 128), _f_stack_init(fs, e, 128) ) static struct tbf rl_runtime_err = TBF_DEFAULT_LOG_LIMITS; @@ -163,15 +115,17 @@ interpret(struct filter_state *fs, const struct f_line *line, struct f_val *val) ASSERT(line->args == 0); /* Initialize the filter stack */ - struct filter_stack *fstk = fs->stack; + struct filter_stack *fstk = &fs->stack; fstk->vcnt = line->vars; memset(fstk->vstk, 0, sizeof(struct f_val) * line->vars); /* The same as with the value stack. Not resetting the stack for performance reasons. */ fstk->ecnt = 1; - fstk->estk[0].line = line; - fstk->estk[0].pos = 0; + fstk->estk[0] = (struct filter_exec_stack) { + .line = line, + .pos = 0, + }; #define curline fstk->estk[fstk->ecnt-1] @@ -191,16 +145,16 @@ interpret(struct filter_state *fs, const struct f_line *line, struct f_val *val) #define v2 vv(1) #define v3 vv(2) +#define f_vcnt_check_overflow(n) do { if (fstk->vcnt + n >= fstk->vlen) runtime("Filter execution stack overflow"); } while (0) + #define runtime(fmt, ...) do { \ if (!(fs->flags & FF_SILENT)) \ log_rl(&rl_runtime_err, L_ERR "filters, line %d: " fmt, what->lineno, ##__VA_ARGS__); \ return F_ERROR; \ } while(0) -#define falloc(size) lp_alloc(fs->pool, size) -#define fpool fs->pool - -#define ACCESS_EATTRS do { if (!fs->eattrs) f_cache_eattrs(fs); } while (0) +#define falloc(size) tmp_alloc(size) +#define fpool tmp_linpool #include "filter/inst-interpret.c" #undef res @@ -210,7 +164,6 @@ interpret(struct filter_state *fs, const struct f_line *line, struct f_val *val) #undef runtime #undef falloc #undef fpool -#undef ACCESS_EATTRS } } @@ -240,29 +193,15 @@ interpret(struct filter_state *fs, const struct f_line *line, struct f_val *val) /** * f_run - run a filter for a route * @filter: filter to run - * @rte: route being filtered, may be modified + * @rte: route being filtered, must be write-able * @tmp_pool: all filter allocations go from this pool * @flags: flags * - * If filter needs to modify the route, there are several - * posibilities. @rte might be read-only (with REF_COW flag), in that - * case rw copy is obtained by rte_cow() and @rte is replaced. If - * @rte is originally rw, it may be directly modified (and it is never - * copied). - * - * The returned rte may reuse the (possibly cached, cloned) rta, or - * (if rta was modified) contains a modified uncached rta, which - * uses parts allocated from @tmp_pool and parts shared from original - * rta. There is one exception - if @rte is rw but contains a cached - * rta and that is modified, rta in returned rte is also cached. - * - * Ownership of cached rtas is consistent with rte, i.e. - * if a new rte is returned, it has its own clone of cached rta - * (and cached rta of read-only source rte is intact), if rte is - * modified in place, old cached rta is possibly freed. + * If @rte->attrs is cached, the returned rte allocates a new rta on + * tmp_pool, otherwise the filters may modify it. */ enum filter_return -f_run(const struct filter *filter, struct rte **rte, struct linpool *tmp_pool, int flags) +f_run(const struct filter *filter, struct rte *rte, int flags) { if (filter == FILTER_ACCEPT) return F_ACCEPT; @@ -270,48 +209,21 @@ f_run(const struct filter *filter, struct rte **rte, struct linpool *tmp_pool, i if (filter == FILTER_REJECT) return F_REJECT; - int rte_cow = ((*rte)->flags & REF_COW); DBG( "Running filter `%s'...", filter->name ); /* Initialize the filter state */ filter_state = (struct filter_state) { - .stack = &filter_stack, .rte = rte, - .pool = tmp_pool, .flags = flags, }; + f_stack_init(filter_state); + LOG_BUFFER_INIT(filter_state.buf); /* Run the interpreter itself */ enum filter_return fret = interpret(&filter_state, filter->root, NULL); - if (filter_state.old_rta) { - /* - * Cached rta was modified and filter_state->rte contains now an uncached one, - * sharing some part with the cached one. The cached rta should - * be freed (if rte was originally COW, filter_state->old_rta is a clone - * obtained during rte_cow()). - * - * This also implements the exception mentioned in f_run() - * description. The reason for this is that rta reuses parts of - * filter_state->old_rta, and these may be freed during rta_free(filter_state->old_rta). - * This is not the problem if rte was COW, because original rte - * also holds the same rta. - */ - if (!rte_cow) { - /* Cache the new attrs */ - (*filter_state.rte)->attrs = rta_lookup((*filter_state.rte)->attrs); - - /* Drop cached ea_list pointer */ - filter_state.eattrs = NULL; - } - - /* Uncache the old attrs and drop the pointer as it is invalid now. */ - rta_free(filter_state.old_rta); - filter_state.old_rta = NULL; - } - /* Process the filter output, log it and return */ if (fret < F_ACCEPT) { if (!(filter_state.flags & FF_SILENT)) @@ -336,18 +248,17 @@ f_run(const struct filter *filter, struct rte **rte, struct linpool *tmp_pool, i */ enum filter_return -f_eval_rte(const struct f_line *expr, struct rte **rte, struct linpool *tmp_pool) +f_eval_rte(const struct f_line *expr, struct rte *rte) { filter_state = (struct filter_state) { - .stack = &filter_stack, .rte = rte, - .pool = tmp_pool, }; + f_stack_init(filter_state); + LOG_BUFFER_INIT(filter_state.buf); - ASSERT(!((*rte)->flags & REF_COW)); - ASSERT(!rta_is_cached((*rte)->attrs)); + ASSERT(!rta_is_cached(rte->attrs)); return interpret(&filter_state, expr, NULL); } @@ -359,12 +270,11 @@ f_eval_rte(const struct f_line *expr, struct rte **rte, struct linpool *tmp_pool * @pres: here the output will be stored */ enum filter_return -f_eval(const struct f_line *expr, struct linpool *tmp_pool, struct f_val *pres) +f_eval(const struct f_line *expr, struct f_val *pres) { - filter_state = (struct filter_state) { - .stack = &filter_stack, - .pool = tmp_pool, - }; + filter_state = (struct filter_state) {}; + + f_stack_init(filter_state); LOG_BUFFER_INIT(filter_state.buf); @@ -381,10 +291,9 @@ uint f_eval_int(const struct f_line *expr) { /* Called independently in parse-time to eval expressions */ - filter_state = (struct filter_state) { - .stack = &filter_stack, - .pool = cfg_mem, - }; + filter_state = (struct filter_state) {}; + + f_stack_init(filter_state); struct f_val val; @@ -403,10 +312,10 @@ f_eval_int(const struct f_line *expr) * f_eval_buf - get a value of a term and print it to the supplied buffer */ enum filter_return -f_eval_buf(const struct f_line *expr, struct linpool *tmp_pool, buffer *buf) +f_eval_buf(const struct f_line *expr, buffer *buf) { struct f_val val; - enum filter_return fret = f_eval(expr, tmp_pool, &val); + enum filter_return fret = f_eval(expr, &val); if (fret <= F_RETURN) val_format(&val, buf); return fret; @@ -474,6 +383,23 @@ filter_commit(struct config *new, struct config *old) } } +void channel_filter_dump(const struct filter *f) +{ + if (f == FILTER_ACCEPT) + debug(" ALL"); + else if (f == FILTER_REJECT) + debug(" NONE"); + else if (f == FILTER_UNDEF) + debug(" UNDEF"); + else if (f->sym) { + ASSERT(f->sym->filter == f); + debug(" named filter %s", f->sym->name); + } else { + debug("\n"); + f_dump_line(f->root, 2); + } +} + void filters_dump_all(void) { struct symbol *sym; @@ -493,19 +419,10 @@ void filters_dump_all(void) struct channel *c; WALK_LIST(c, sym->proto->proto->channels) { debug(" Channel %s (%s) IMPORT", c->name, net_label[c->net_type]); - if (c->in_filter == FILTER_ACCEPT) - debug(" ALL\n"); - else if (c->in_filter == FILTER_REJECT) - debug(" NONE\n"); - else if (c->in_filter == FILTER_UNDEF) - debug(" UNDEF\n"); - else if (c->in_filter->sym) { - ASSERT(c->in_filter->sym->filter == c->in_filter); - debug(" named filter %s\n", c->in_filter->sym->name); - } else { - debug("\n"); - f_dump_line(c->in_filter->root, 2); - } + channel_filter_dump(c->in_filter); + debug(" EXPORT", c->name, net_label[c->net_type]); + channel_filter_dump(c->out_filter); + debug("\n"); } } } diff --git a/filter/filter.h b/filter/filter.h index 26c1037b..3f2e62eb 100644 --- a/filter/filter.h +++ b/filter/filter.h @@ -13,8 +13,8 @@ #include "lib/resource.h" #include "lib/ip.h" #include "lib/macro.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" /* Possible return values of filter execution */ enum filter_return { @@ -51,10 +51,10 @@ struct filter { struct rte; -enum filter_return f_run(const struct filter *filter, struct rte **rte, struct linpool *tmp_pool, int flags); -enum filter_return f_eval_rte(const struct f_line *expr, struct rte **rte, struct linpool *tmp_pool); +enum filter_return f_run(const struct filter *filter, struct rte *rte, int flags); +enum filter_return f_eval_rte(const struct f_line *expr, struct rte *rte); uint f_eval_int(const struct f_line *expr); -enum filter_return f_eval_buf(const struct f_line *expr, struct linpool *tmp_pool, buffer *buf); +enum filter_return f_eval_buf(const struct f_line *expr, buffer *buf); const char *filter_name(const struct filter *filter); int filter_same(const struct filter *new, const struct filter *old); @@ -70,13 +70,4 @@ void filters_dump_all(void); #define FF_SILENT 2 /* Silent filter execution */ -/* Custom route attributes */ -struct custom_attribute { - resource r; - struct f_dynamic_attr *fda; - const char *name; -}; - -struct custom_attribute *ca_lookup(pool *p, const char *name, int ea_type); - #endif diff --git a/filter/filter_test.c b/filter/filter_test.c index e8e8b747..aa325aef 100644 --- a/filter/filter_test.c +++ b/filter/filter_test.c @@ -46,7 +46,7 @@ run_function(const void *arg) if (t->cmp) return t->result == f_same(t->fn, t->cmp); - enum filter_return fret = f_eval(t->fn, tmp_linpool, NULL); + enum filter_return fret = f_eval(t->fn, NULL); return (fret < F_REJECT); } @@ -70,6 +70,7 @@ int main(int argc, char *argv[]) { bt_init(argc, argv); + bt_bird_init(); bt_assert_hook = bt_assert_filter; @@ -78,12 +79,11 @@ main(int argc, char *argv[]) if (!bt_config_file_parse(BT_CONFIG_FILE)) abort(); - bt_test_suite(t_reconfig, "Testing reconfiguration"); + bt_test_suite_extra(t_reconfig, 0, BT_TIMEOUT, "Testing reconfiguration"); struct f_bt_test_suite *t; WALK_LIST(t, config->tests) - bt_test_suite_base(run_function, t->fn_name, t, BT_FORKING, BT_TIMEOUT, "%s", t->dsc); + bt_test_suite_base(run_function, t->fn_name, t, 0, BT_TIMEOUT, "%s", t->dsc); - bt_bird_cleanup(); return bt_exit_value(); } diff --git a/filter/test.conf b/filter/test.conf index 1d291c69..17aaf8cf 100644 --- a/filter/test.conf +++ b/filter/test.conf @@ -9,7 +9,109 @@ router id 62.168.0.1; /* We have to setup any protocol */ protocol device { } - +/* Setting some custom attributes, enough to force BIRD to reallocate the attribute idmap */ +attribute int test_ca_int1; +attribute int test_ca_int2; +attribute int test_ca_int3; +attribute int test_ca_int4; +attribute int test_ca_int5; +attribute int test_ca_int6; +attribute int test_ca_int7; +attribute int test_ca_int8; +attribute int test_ca_int9; +attribute int test_ca_int10; + +attribute ip test_ca_ip1; +attribute ip test_ca_ip2; +attribute ip test_ca_ip3; +attribute ip test_ca_ip4; +attribute ip test_ca_ip5; +attribute ip test_ca_ip6; +attribute ip test_ca_ip7; +attribute ip test_ca_ip8; +attribute ip test_ca_ip9; +attribute ip test_ca_ip10; + +attribute quad test_ca_quad1; +attribute quad test_ca_quad2; +attribute quad test_ca_quad3; +attribute quad test_ca_quad4; +attribute quad test_ca_quad5; +attribute quad test_ca_quad6; +attribute quad test_ca_quad7; +attribute quad test_ca_quad8; +attribute quad test_ca_quad9; +attribute quad test_ca_quad10; + +attribute bgppath test_ca_bgppath1; +attribute bgppath test_ca_bgppath2; +attribute bgppath test_ca_bgppath3; +attribute bgppath test_ca_bgppath4; +attribute bgppath test_ca_bgppath5; +attribute bgppath test_ca_bgppath6; +attribute bgppath test_ca_bgppath7; +attribute bgppath test_ca_bgppath8; +attribute bgppath test_ca_bgppath9; +attribute bgppath test_ca_bgppath10; + +attribute clist test_ca_clist1; +attribute clist test_ca_clist2; +attribute clist test_ca_clist3; +attribute clist test_ca_clist4; +attribute clist test_ca_clist5; +attribute clist test_ca_clist6; +attribute clist test_ca_clist7; +attribute clist test_ca_clist8; +attribute clist test_ca_clist9; +attribute clist test_ca_clist10; + +attribute eclist test_ca_eclist1; +attribute eclist test_ca_eclist2; +attribute eclist test_ca_eclist3; +attribute eclist test_ca_eclist4; +attribute eclist test_ca_eclist5; +attribute eclist test_ca_eclist6; +attribute eclist test_ca_eclist7; +attribute eclist test_ca_eclist8; +attribute eclist test_ca_eclist9; +attribute eclist test_ca_eclist10; + +attribute lclist test_ca_lclist1; +attribute lclist test_ca_lclist2; +attribute lclist test_ca_lclist3; +attribute lclist test_ca_lclist4; +attribute lclist test_ca_lclist5; +attribute lclist test_ca_lclist6; +attribute lclist test_ca_lclist7; +attribute lclist test_ca_lclist8; +attribute lclist test_ca_lclist9; +attribute lclist test_ca_lclist10; + +attribute lclist test_ca_lclist_max1; +attribute lclist test_ca_lclist_max2; +attribute lclist test_ca_lclist_max3; +attribute lclist test_ca_lclist_max4; +attribute lclist test_ca_lclist_max5; +attribute lclist test_ca_lclist_max6; +attribute lclist test_ca_lclist_max7; +attribute lclist test_ca_lclist_max8; +attribute lclist test_ca_lclist_max9; +attribute lclist test_ca_lclist_max10; +attribute lclist test_ca_lclist_max11; +attribute lclist test_ca_lclist_max12; +attribute lclist test_ca_lclist_max13; +attribute lclist test_ca_lclist_max14; +attribute lclist test_ca_lclist_max15; +attribute lclist test_ca_lclist_max16; +attribute lclist test_ca_lclist_max17; +attribute lclist test_ca_lclist_max18; +attribute lclist test_ca_lclist_max19; +attribute lclist test_ca_lclist_max20; +attribute lclist test_ca_lclist_max21; + + +/* Uncomment this to get an error */ +#attribute int bgp_path; /* * Common definitions and functions @@ -110,6 +212,14 @@ function t_int() bt_assert(1 <= 1); bt_assert(!(1234 < 1234)); + bt_assert(10 - 5 = 5); + bt_assert(4294967295 + 1 = 0); + bt_assert(6*9=54); + bt_assert(984/41 = 24); + bt_assert(123/45 = 2); + bt_assert(0xfee1a | 0xbeef = 0xffeff); + bt_assert(0xfee1a & 0xbeef = 0xae0a); + case i { 4200000000: bt_assert(true); else: bt_assert(false); @@ -416,9 +526,9 @@ bt_test_suite(t_ip_set, "Testing sets of ip address"); function t_enum() { - bt_assert(format(RTS_STATIC) = "(enum 30)1"); - bt_assert(format(NET_IP4) = "(enum 36)1"); - bt_assert(format(NET_VPN6) = "(enum 36)4"); + bt_assert(format(RTS_STATIC) = "(enum 31)1"); + bt_assert(format(NET_IP4) = "(enum 3b)1"); + bt_assert(format(NET_VPN6) = "(enum 3b)4"); bt_assert(RTS_STATIC ~ [RTS_STATIC, RTS_DEVICE]); bt_assert(RTS_BGP !~ [RTS_STATIC, RTS_DEVICE]); @@ -1516,6 +1626,7 @@ function __test2() filter testf int j; +bool t; { print "Heya, filtering route to ", net.ip, " prefixlen ", net.len, " source ", source; print "This route was from ", from; @@ -1527,6 +1638,52 @@ int j; rip_metric = 14; unset(rip_metric); + preference = 1234; + + test_ca_int1 = 42; + test_ca_ip2 = 1.3.5.7; + test_ca_quad3 = 2.4.6.8; + test_ca_bgppath4 = +empty+; + test_ca_clist5 = -empty-; + test_ca_eclist6 = --empty--; + test_ca_lclist7 = ---empty---; + + igp_metric = 53; + babel_metric = 64; + t = defined(babel_router_id); + + bgp_origin = ORIGIN_IGP; + bgp_path = +empty+; + bgp_next_hop = 3456:789a:bcde:f012::3456:789a; + bgp_med = 71; + bgp_local_pref = 942; + t = defined(bgp_atomic_aggr); + t = defined(bgp_aggregator); + bgp_community = -empty-; + bgp_originator_id = 9.7.5.3; + bgp_cluster_list = -empty-; + bgp_ext_community = --empty--; + t = defined(bgp_aigp); + bgp_large_community = ---empty---; + t = defined(bgp_mpls_label_stack); + + ospf_metric1 = 64; + ospf_metric2 = 111; + ospf_tag = 654432; + + radv_preference = RA_PREF_LOW; + radv_lifetime = 28; + + rip_metric = 2; + rip_tag = 4; + + krt_source = 17; + krt_metric = 19; + +# krt_lock_mtu = false; +# krt_lock_window = true; +# krt_lock_rtt = krt_lock_rttvar && krt_lock_sstresh || krt_lock_cwnd; + accept "ok I take that"; } diff --git a/filter/test.conf2 b/filter/test.conf2 index e95f9563..9fc8330f 100644 --- a/filter/test.conf2 +++ b/filter/test.conf2 @@ -38,12 +38,6 @@ protocol static { print from; from = 1.2.3.4; print from; - print scope; - scope = SCOPE_HOST; - print scope; - if !(scope ~ [ SCOPE_HOST, SCOPE_SITE ]) then { - print "Failed in test"; - } preference = 15; print preference; diff --git a/filter/tree_test.c b/filter/tree_test.c index 05702f81..655205c2 100644 --- a/filter/tree_test.c +++ b/filter/tree_test.c @@ -15,13 +15,6 @@ #define MAX_TREE_HEIGHT 13 -static void -start_conf_env(void) -{ - bt_bird_init(); - cfg_mem = tmp_linpool; -} - static struct f_tree * new_tree(uint id) { @@ -153,8 +146,6 @@ get_balanced_tree_with_ranged_values(uint nodes_count) static int t_balancing(void) { - start_conf_env(); - uint height; for (height = 1; height < MAX_TREE_HEIGHT; height++) { @@ -170,6 +161,8 @@ t_balancing(void) show_tree(balanced_tree_from_simple); bt_assert(same_tree(balanced_tree_from_simple, expected_balanced_tree)); + + tmp_flush(); } return 1; @@ -179,8 +172,6 @@ t_balancing(void) static int t_balancing_random(void) { - start_conf_env(); - uint height; for (height = 1; height < MAX_TREE_HEIGHT; height++) { @@ -191,6 +182,9 @@ t_balancing_random(void) uint i; for(i = 0; i < 10; i++) { + struct lp_state lps; + lp_save(tmp_linpool, &lps); + struct f_tree *random_degenerated_tree = get_random_degenerated_left_tree(nodes_count); show_tree(random_degenerated_tree); @@ -200,7 +194,11 @@ t_balancing_random(void) show_tree(balanced_tree_from_random); bt_assert(same_tree(balanced_tree_from_random, expected_balanced_tree)); + + lp_restore(tmp_linpool, &lps); } + + tmp_flush(); } return 1; @@ -209,8 +207,6 @@ t_balancing_random(void) static int t_find(void) { - start_conf_env(); - uint height; for (height = 1; height < MAX_TREE_HEIGHT; height++) { @@ -227,6 +223,8 @@ t_find(void) const struct f_tree *found_tree = find_tree(tree, &looking_up_value); bt_assert((val_compare(&looking_up_value, &(found_tree->from)) == 0) && (val_compare(&looking_up_value, &(found_tree->to)) == 0)); } + + tmp_flush(); } return 1; @@ -255,8 +253,6 @@ get_max_value_in_unbalanced_tree(struct f_tree *node, uint max) static int t_find_ranges(void) { - start_conf_env(); - uint height; for (height = 1; height < MAX_TREE_HEIGHT; height++) { @@ -283,6 +279,8 @@ t_find_ranges(void) ((val_compare(&needle, &(found_tree->from)) == 1) && (val_compare(&needle, &(found_tree->to)) == -1)) ); } + + tmp_flush(); } return 1; @@ -292,6 +290,8 @@ int main(int argc, char *argv[]) { bt_init(argc, argv); + bt_bird_init(); + cfg_mem = tmp_linpool; bt_test_suite(t_balancing, "Balancing strong unbalanced trees"); bt_test_suite(t_balancing_random, "Balancing random unbalanced trees"); diff --git a/filter/trie_test.c b/filter/trie_test.c index dc791280..81f38db9 100644 --- a/filter/trie_test.c +++ b/filter/trie_test.c @@ -251,12 +251,12 @@ get_outer_net(net_addr *net, const struct f_prefix *src) static list * make_random_prefix_list(int num, int v6, int tight) { - list *prefixes = lp_allocz(tmp_linpool, sizeof(struct f_prefix_node)); + list *prefixes = tmp_allocz(sizeof(struct f_prefix_node)); init_list(prefixes); for (int i = 0; i < num; i++) { - struct f_prefix_node *px = lp_allocz(tmp_linpool, sizeof(struct f_prefix_node)); + struct f_prefix_node *px = tmp_allocz(sizeof(struct f_prefix_node)); get_random_prefix(&px->prefix, v6, tight); add_tail(prefixes, &px->n); @@ -294,7 +294,7 @@ read_prefix_list(FILE *f, int v6, int plus) char s[32]; int n; - list *pxlist = lp_allocz(tmp_linpool, sizeof(struct f_prefix_node)); + list *pxlist = tmp_allocz(sizeof(struct f_prefix_node)); init_list(pxlist); errno = 0; @@ -308,7 +308,7 @@ read_prefix_list(FILE *f, int v6, int plus) if (n != 5) bt_abort_msg("Invalid content of trie_data"); - struct f_prefix_node *px = lp_allocz(tmp_linpool, sizeof(struct f_prefix_node)); + struct f_prefix_node *px = tmp_allocz(sizeof(struct f_prefix_node)); net_fill_ip4(&px->prefix.net, ip4_build(a0, a1, a2, a3), pl); px->prefix.lo = pl; px->prefix.hi = plus ? IP4_MAX_PREFIX_LENGTH : pl; @@ -432,9 +432,6 @@ test_match_net(list *prefixes, struct f_trie *trie, const net_addr *net) static int t_match_random_net(void) { - bt_bird_init(); - bt_config_parse(BT_CONFIG_SIMPLE); - int v6 = 0; for (int round = 0; round < TESTS_NUM; round++) { @@ -452,16 +449,12 @@ t_match_random_net(void) tmp_flush(); } - bt_bird_cleanup(); return 1; } static int t_match_inner_net(void) { - bt_bird_init(); - bt_config_parse(BT_CONFIG_SIMPLE); - int v6 = 0; for (int round = 0; round < TESTS_NUM; round++) { @@ -482,16 +475,12 @@ t_match_inner_net(void) tmp_flush(); } - bt_bird_cleanup(); return 1; } static int t_match_outer_net(void) { - bt_bird_init(); - bt_config_parse(BT_CONFIG_SIMPLE); - int v6 = 0; for (int round = 0; round < TESTS_NUM; round++) { @@ -513,7 +502,6 @@ t_match_outer_net(void) } v6 = !v6; - bt_bird_cleanup(); return 1; } @@ -574,34 +562,24 @@ benchmark_trie_dataset(const char *filename, int plus) static int UNUSED t_bench_trie_datasets_subset(void) { - bt_bird_init(); - bt_config_parse(BT_CONFIG_SIMPLE); - /* Specific datasets, not included */ benchmark_trie_dataset("trie-data-bgp-1", 0); benchmark_trie_dataset("trie-data-bgp-10", 0); benchmark_trie_dataset("trie-data-bgp-100", 0); benchmark_trie_dataset("trie-data-bgp-1000", 0); - bt_bird_cleanup(); - return 1; } static int UNUSED t_bench_trie_datasets_random(void) { - bt_bird_init(); - bt_config_parse(BT_CONFIG_SIMPLE); - /* Specific datasets, not included */ benchmark_trie_dataset("trie-data-bgp-1", 1); benchmark_trie_dataset("trie-data-bgp-10", 1); benchmark_trie_dataset("trie-data-bgp-100", 1); benchmark_trie_dataset("trie-data-bgp-1000", 1); - bt_bird_cleanup(); - return 1; } @@ -609,9 +587,6 @@ t_bench_trie_datasets_random(void) static int t_trie_same(void) { - bt_bird_init(); - bt_config_parse(BT_CONFIG_SIMPLE); - int v6 = 0; for (int round = 0; round < TESTS_NUM*4; round++) { @@ -632,7 +607,6 @@ t_trie_same(void) tmp_flush(); } - bt_bird_cleanup(); return 1; } @@ -652,9 +626,6 @@ log_networks(const net_addr *a, const net_addr *b) static int t_trie_walk(void) { - bt_bird_init(); - bt_config_parse(BT_CONFIG_SIMPLE); - for (int round = 0; round < TESTS_NUM*8; round++) { int level = round / TESTS_NUM; @@ -763,7 +734,6 @@ t_trie_walk(void) tmp_flush(); } - bt_bird_cleanup(); return 1; } @@ -802,9 +772,6 @@ find_covering_nets(struct f_prefix *prefixes, int num, const net_addr *net, net_ static int t_trie_walk_to_root(void) { - bt_bird_init(); - bt_config_parse(BT_CONFIG_SIMPLE); - for (int round = 0; round < TESTS_NUM * 4; round++) { int level = round / TESTS_NUM; @@ -876,7 +843,6 @@ t_trie_walk_to_root(void) tmp_flush(); } - bt_bird_cleanup(); return 1; } @@ -884,6 +850,8 @@ int main(int argc, char *argv[]) { bt_init(argc, argv); + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); bt_test_suite(t_match_random_net, "Testing random prefix matching"); bt_test_suite(t_match_inner_net, "Testing random inner prefix matching"); diff --git a/lib/Makefile b/lib/Makefile index 812f721c..f4ade9a6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -1,7 +1,7 @@ -src := bitmap.c bitops.c blake2s.c blake2b.c checksum.c event.c flowspec.c idm.c ip.c lists.c mac.c md5.c mempool.c net.c patmatch.c printf.c resource.c sha1.c sha256.c sha512.c slab.c slists.c strtoul.c tbf.c timer.c xmalloc.c +src := a-path.c a-set.c bitmap.c bitops.c blake2s.c blake2b.c checksum.c event.c flowspec.c idm.c ip.c lists.c mac.c md5.c mempool.c net.c patmatch.c printf.c rcu.c resource.c sha1.c sha256.c sha512.c slab.c slists.c strtoul.c tbf.c timer.c xmalloc.c obj := $(src-o-files) $(all-daemon) -tests_src := bitmap_test.c heap_test.c buffer_test.c event_test.c flowspec_test.c bitops_test.c patmatch_test.c fletcher16_test.c slist_test.c checksum_test.c lists_test.c mac_test.c ip_test.c hash_test.c printf_test.c slab_test.c +tests_src := a-set_test.c a-path_test.c bitmap_test.c heap_test.c buffer_test.c event_test.c flowspec_test.c bitops_test.c patmatch_test.c fletcher16_test.c slist_test.c checksum_test.c lists_test.c mac_test.c ip_test.c hash_test.c printf_test.c slab_test.c type_test.c tests_targets := $(tests_targets) $(tests-target-files) tests_objs := $(tests_objs) $(src-o-files) diff --git a/nest/a-path.c b/lib/a-path.c index c421b41f..a7a22e40 100644 --- a/nest/a-path.c +++ b/lib/a-path.c @@ -8,8 +8,8 @@ */ #include "nest/bird.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "lib/resource.h" #include "lib/unaligned.h" #include "lib/string.h" diff --git a/nest/a-path_test.c b/lib/a-path_test.c index a0f3f0e3..08c6c96c 100644 --- a/nest/a-path_test.c +++ b/lib/a-path_test.c @@ -9,8 +9,8 @@ #include "test/birdtest.h" #include "test/bt-utils.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "lib/resource.h" #include "filter/data.h" @@ -78,13 +78,13 @@ t_path_format(void) bt_debug("Prepending ASN: %10u \n", i); } -#define BUFFER_SIZE 120 - byte buf[BUFFER_SIZE] = {}; +#define T_BUFFER_SIZE 120 + byte buf[T_BUFFER_SIZE] = {}; - as_path_format(&empty_as_path, buf, BUFFER_SIZE); + as_path_format(&empty_as_path, buf, T_BUFFER_SIZE); bt_assert_msg(strcmp(buf, "") == 0, "Buffer(%zu): '%s'", strlen(buf), buf); - as_path_format(as_path, buf, BUFFER_SIZE); + as_path_format(as_path, buf, T_BUFFER_SIZE); bt_assert_msg(strcmp(buf, "4294967294 4294967293 4294967292 4294967291 4294967290 4294967289 4294967288 4294967287 4294967286 4294967285") == 0, "Buffer(%zu): '%s'", strlen(buf), buf); #define SMALL_BUFFER_SIZE 25 diff --git a/nest/a-set.c b/lib/a-set.c index 40ed573c..dcb86058 100644 --- a/nest/a-set.c +++ b/lib/a-set.c @@ -10,8 +10,8 @@ #include <stdlib.h> #include "nest/bird.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "lib/resource.h" #include "lib/string.h" diff --git a/nest/a-set_test.c b/lib/a-set_test.c index 904e6764..79b5cf9a 100644 --- a/nest/a-set_test.c +++ b/lib/a-set_test.c @@ -10,8 +10,8 @@ #include "test/bt-utils.h" #include "lib/net.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "lib/resource.h" #define SET_SIZE 10 @@ -20,8 +20,8 @@ static const struct adata *set_sequence_same; /* <0; SET_SIZE) */ static const struct adata *set_sequence_higher; /* <SET_SIZE; 2*SET_SIZE) */ static const struct adata *set_random; -#define BUFFER_SIZE 1000 -static byte buf[BUFFER_SIZE] = {}; +#define T_BUFFER_SIZE 1000 +static byte buf[T_BUFFER_SIZE] = {}; #define SET_SIZE_FOR_FORMAT_OUTPUT 10 @@ -92,11 +92,11 @@ t_set_int_union(void) const struct adata *set_union; set_union = int_set_union(tmp_linpool, set_sequence, set_sequence_same); bt_assert(int_set_get_size(set_union) == SET_SIZE); - bt_assert(int_set_format(set_union, 0, 2, buf, BUFFER_SIZE) == 0); + bt_assert(int_set_format(set_union, 0, 2, buf, T_BUFFER_SIZE) == 0); set_union = int_set_union(tmp_linpool, set_sequence, set_sequence_higher); bt_assert_msg(int_set_get_size(set_union) == SET_SIZE*2, "int_set_get_size(set_union) %d, SET_SIZE*2 %d", int_set_get_size(set_union), SET_SIZE*2); - bt_assert(int_set_format(set_union, 0, 2, buf, BUFFER_SIZE) == 0); + bt_assert(int_set_format(set_union, 0, 2, buf, T_BUFFER_SIZE) == 0); return 1; } @@ -106,15 +106,15 @@ t_set_int_format(void) { generate_set_sequence(SET_TYPE_INT, SET_SIZE_FOR_FORMAT_OUTPUT); - bt_assert(int_set_format(set_sequence, 0, 0, buf, BUFFER_SIZE) == 0); + bt_assert(int_set_format(set_sequence, 0, 0, buf, T_BUFFER_SIZE) == 0); bt_assert(strcmp(buf, "0.0.0.0 0.0.0.1 0.0.0.2 0.0.0.3 0.0.0.4 0.0.0.5 0.0.0.6 0.0.0.7 0.0.0.8 0.0.0.9") == 0); - bzero(buf, BUFFER_SIZE); - bt_assert(int_set_format(set_sequence, 0, 2, buf, BUFFER_SIZE) == 0); + bzero(buf, T_BUFFER_SIZE); + bt_assert(int_set_format(set_sequence, 0, 2, buf, T_BUFFER_SIZE) == 0); bt_assert(strcmp(buf, "0.0.0.2 0.0.0.3 0.0.0.4 0.0.0.5 0.0.0.6 0.0.0.7 0.0.0.8 0.0.0.9") == 0); - bzero(buf, BUFFER_SIZE); - bt_assert(int_set_format(set_sequence, 1, 0, buf, BUFFER_SIZE) == 0); + bzero(buf, T_BUFFER_SIZE); + bt_assert(int_set_format(set_sequence, 1, 0, buf, T_BUFFER_SIZE) == 0); bt_assert(strcmp(buf, "(0,0) (0,1) (0,2) (0,3) (0,4) (0,5) (0,6) (0,7) (0,8) (0,9)") == 0); return 1; @@ -174,11 +174,11 @@ t_set_ec_union(void) const struct adata *set_union; set_union = ec_set_union(tmp_linpool, set_sequence, set_sequence_same); bt_assert(ec_set_get_size(set_union) == SET_SIZE); - bt_assert(ec_set_format(set_union, 0, buf, BUFFER_SIZE) == 0); + bt_assert(ec_set_format(set_union, 0, buf, T_BUFFER_SIZE) == 0); set_union = ec_set_union(tmp_linpool, set_sequence, set_sequence_higher); bt_assert_msg(ec_set_get_size(set_union) == SET_SIZE*2, "ec_set_get_size(set_union) %d, SET_SIZE*2 %d", ec_set_get_size(set_union), SET_SIZE*2); - bt_assert(ec_set_format(set_union, 0, buf, BUFFER_SIZE) == 0); + bt_assert(ec_set_format(set_union, 0, buf, T_BUFFER_SIZE) == 0); return 1; } @@ -194,7 +194,7 @@ t_set_ec_format(void) for (i = 1; i < SET_SIZE_FOR_FORMAT_OUTPUT; i++) set_sequence = ec_set_add(tmp_linpool, set_sequence, i + ((i%2) ? ((u64)EC_RO << 48) : ((u64)EC_RT << 48))); - bt_assert(ec_set_format(set_sequence, 0, buf, BUFFER_SIZE) == 0); + bt_assert(ec_set_format(set_sequence, 0, buf, T_BUFFER_SIZE) == 0); bt_assert_msg(strcmp(buf, "(unknown 0x0, 0, 0) (ro, 0, 1) (rt, 0, 2) (ro, 0, 3) (rt, 0, 4) (ro, 0, 5) (rt, 0, 6) (ro, 0, 7) (rt, 0, 8) (ro, 0, 9)") == 0, "ec_set_format() returns '%s'", buf); @@ -221,6 +221,7 @@ t_set_ec_delete(void) return 1; } + int main(int argc, char *argv[]) { diff --git a/nest/attrs.h b/lib/attrs.h index fcd5ac16..b4dd2e83 100644 --- a/nest/attrs.h +++ b/lib/attrs.h @@ -11,7 +11,39 @@ #include <stdint.h> #include "lib/unaligned.h" -#include "nest/route.h" + +typedef struct adata { + uint length; /* Length of data */ + byte data[0]; +} adata; + +#define ADATA_SIZE(s) BIRD_CPU_ALIGN(sizeof(struct adata) + s) + +extern const adata null_adata; /* adata of length 0 */ + +static inline struct adata * +lp_alloc_adata(struct linpool *pool, uint len) +{ + struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len); + ad->length = len; + return ad; +} + +static inline struct adata * +lp_store_adata(struct linpool *pool, const void *buf, uint len) +{ + struct adata *ad = lp_alloc_adata(pool, len); + memcpy(ad->data, buf, len); + return ad; +} + +#define tmp_alloc_adata(len) lp_alloc_adata(tmp_linpool, len) +#define tmp_store_adata(buf, len) lp_store_adata(tmp_linpool, buf, len) +#define tmp_copy_adata(ad) tmp_store_adata((ad)->data, (ad)->length) + +static inline int adata_same(const struct adata *a, const struct adata *b) +{ return (!a && !b) || (a->length == b->length && !memcmp(a->data, b->data, a->length)); } + /* a-path.c */ diff --git a/lib/birdlib.h b/lib/birdlib.h index e03bd0b2..3f65b8c2 100644 --- a/lib/birdlib.h +++ b/lib/birdlib.h @@ -9,16 +9,39 @@ #ifndef _BIRD_BIRDLIB_H_ #define _BIRD_BIRDLIB_H_ +#include <stddef.h> + +#include "sysdep/config.h" #include "lib/alloca.h" /* Ugly structure offset handling macros */ -struct align_probe { char x; long int y; }; +#define SAME_TYPE(a, b) ({ int _ = ((a) != (b)); !_; }) +#define TYPE_CAST(from, to, what) ( SAME_TYPE(((from) NULL), (what)), ((to) (what))) +#ifdef offsetof +#define OFFSETOF offsetof +#else #define OFFSETOF(s, i) ((size_t) &((s *)0)->i) -#define SKIP_BACK(s, i, p) ((s *)((char *)p - OFFSETOF(s, i))) +#endif + +#define SKIP_BACK(s, i, p) ({ s *_ptr = ((s *)((char *)p - OFFSETOF(s, i))); SAME_TYPE(&_ptr->i, p); _ptr; }) #define BIRD_ALIGN(s, a) (((s)+a-1)&~(a-1)) -#define CPU_STRUCT_ALIGN (sizeof(struct align_probe)) +#define CPU_STRUCT_ALIGN (MAX_(_Alignof(void*), _Alignof(u64))) +#define BIRD_CPU_ALIGN(s) BIRD_ALIGN((s), CPU_STRUCT_ALIGN) + +/* Structure item alignment macros */ + +#define PADDING_NAME(id) _padding_##id +#define PADDING_(id, sz) u8 PADDING_NAME(id)[sz] + +#if CPU_POINTER_ALIGNMENT == 4 +#define PADDING(id, n32, n64) PADDING_(id, n32) +#elif CPU_POINTER_ALIGNMENT == 8 +#define PADDING(id, n32, n64) PADDING_(id, n64) +#else +#error "Strange CPU pointer alignment: " CPU_POINTER_ALIGNMENT +#endif /* Utility macros */ @@ -73,6 +96,7 @@ static inline int u64_cmp(u64 i1, u64 i2) /* Macros for gcc attributes */ #define NORET __attribute__((noreturn)) +#define USE_RESULT __atribute__((warn_unused_result)) #define UNUSED __attribute__((unused)) #define PACKED __attribute__((packed)) #define NONNULL(...) __attribute__((nonnull((__VA_ARGS__)))) @@ -80,10 +104,6 @@ static inline int u64_cmp(u64 i1, u64 i2) #define STATIC_ASSERT(EXP) _Static_assert(EXP, #EXP) #define STATIC_ASSERT_MSG(EXP,MSG) _Static_assert(EXP, MSG) -#ifndef HAVE_THREAD_LOCAL -#define _Thread_local -#endif - /* Microsecond time */ typedef s64 btime; @@ -95,6 +115,7 @@ typedef s64 btime; #define TO_S /1000000 #define TO_MS /1000 #define TO_US /1 +#define TO_NS * (btime) 1000 #ifndef PARSER #define S S_ @@ -166,8 +187,13 @@ void debug_safe(const char *msg); /* Printf to debug output, async-safe */ #if defined(LOCAL_DEBUG) || defined(GLOBAL_DEBUG) #define DBG(x, y...) debug(x, ##y) +#define DBGL(x, y...) debug(x "\n", ##y) +#elif defined(DEBUG_TO_LOG) +#define DBG(...) do { } while (0) +#define DBGL(...) log(L_DEBUG __VA_ARGS__) #else -#define DBG(x, y...) do { } while(0) +#define DBG(...) do { } while(0) +#define DBGL(...) do { } while (0) #endif #define ASSERT_DIE(x) do { if (!(x)) bug("Assertion '%s' failed at %s:%d", #x, __FILE__, __LINE__); } while(0) diff --git a/lib/event.c b/lib/event.c index 33dc00b0..68888a4c 100644 --- a/lib/event.c +++ b/lib/event.c @@ -19,31 +19,163 @@ * events in them and explicitly ask to run them. */ +#undef LOCAL_DEBUG + #include "nest/bird.h" #include "lib/event.h" +#include "lib/io-loop.h" event_list global_event_list; event_list global_work_list; -inline void -ev_postpone(event *e) +//#ifdef DEBUGGING +#if 0 +#define EDL_MAX 16384 +enum edl_caller { + EDL_REMOVE_FROM = 1, + EDL_POSTPONE = 2, + EDL_RUN = 3, + EDL_SEND = 4, + EDL_RUN_LIST = 5, +} caller; +static struct event_debug_log { + event_list *target_list; + event *event; + event *receiver; + uint pos; + uint prev_edl_pos; + uint thread; + enum edl_caller caller; +} edl[EDL_MAX]; +static _Atomic uint edl_cnt; +_Thread_local static uint edl_thread; +_Thread_local static uint prev_edl_pos = ~0; +static inline void edlog(event_list *list, event *e, event *receiver, uint pos, enum edl_caller caller) +{ + uint edl_pos = atomic_fetch_add_explicit(&edl_cnt, 1, memory_order_acq_rel); + if (!edl_thread) + edl_thread = edl_pos; + + edl[edl_pos % EDL_MAX] = (struct event_debug_log) { + .target_list = list, + .event = e, + .receiver = receiver, + .pos = pos, + .prev_edl_pos = prev_edl_pos, + .thread = edl_thread, + .caller = caller, + }; + + prev_edl_pos = edl_pos; +} +#else +#define edlog(...) +#endif + + +void +ev_init_list(event_list *el, struct birdloop *loop, const char *name) { - if (ev_active(e)) + el->name = name; + el->loop = loop; + + atomic_store_explicit(&el->receiver, NULL, memory_order_release); + atomic_store_explicit(&el->_executor, NULL, memory_order_release); +} + +/* + * The event list should work as a message passing point. Sending a message + * must be a fairly fast process with no locks and low waiting times. OTOH, + * processing messages always involves running the assigned code and the + * receiver is always a single one thread with no concurrency at all. There is + * also a postponing requirement to synchronously remove an event from a queue, + * yet we allow this only when the caller has its receiver event loop locked. + * It still means that the event may get postponed from other event in the same + * list, therefore we have to be careful. + */ + +static inline int +ev_remove_from(event *e, event * _Atomic * head) +{ + /* The head pointer stores where cur is pointed to from */ + event * _Atomic *prev = head; + + /* The current event in queue to check */ + event *cur = atomic_load_explicit(prev, memory_order_acquire); + + /* This part of queue is empty! */ + if (!cur) + return 0; + + edlog(NULL, e, cur, 1, EDL_REMOVE_FROM); + while (cur) + { + /* Pre-loaded next pointer */ + event *next = atomic_load_explicit(&cur->next, memory_order_acquire); + + if (e == cur) { - rem_node(&e->n); - e->n.next = NULL; + edlog(NULL, e, next, 3, EDL_REMOVE_FROM); + + /* Check whether we have collided with somebody else + * adding an item to the queue. */ + if (!atomic_compare_exchange_strong_explicit( + prev, &cur, next, + memory_order_acq_rel, memory_order_acquire)) + { + /* This may happen only on list head */ + ASSERT_DIE(prev == head); + + /* Restart. The collision should never happen again. */ + return ev_remove_from(e, head); + } + + /* Successfully removed from the list; inactivate this event. */ + atomic_store_explicit(&cur->next, NULL, memory_order_release); + return 1; } + + edlog(NULL, e, next, 2, EDL_REMOVE_FROM); + + /* Go to the next event. */ + prev = &cur->next; + cur = next; + } + + edlog(NULL, e, cur, 4, EDL_REMOVE_FROM); + + return 0; +} + +inline void +ev_postpone(event *e) +{ + /* Find the list to remove the event from */ + event_list *sl = ev_get_list(e); + edlog(sl, e, NULL, 1, EDL_POSTPONE); + if (!sl) + return; + + /* Postponing allowed only from the target loop */ + ASSERT_DIE(birdloop_inside(sl->loop)); + + /* Remove from one of these lists. */ + ASSERT(ev_remove_from(e, &sl->_executor) || ev_remove_from(e, &sl->receiver)); + + /* Mark as inactive */ + ASSERT_DIE(sl == atomic_exchange_explicit(&e->list, NULL, memory_order_acq_rel)); + edlog(sl, e, NULL, 2, EDL_POSTPONE); } static void -ev_dump(resource *r) +ev_dump(resource *r, unsigned indent UNUSED) { event *e = (event *) r; debug("(code %p, data %p, %s)\n", e->hook, e->data, - e->n.next ? "scheduled" : "inactive"); + atomic_load_explicit(&e->next, memory_order_relaxed) ? "scheduled" : "inactive"); } static struct resclass ev_class = { @@ -82,8 +214,10 @@ ev_new(pool *p) inline void ev_run(event *e) { + edlog(NULL, e, NULL, 1, EDL_RUN); ev_postpone(e); e->hook(e->data); + edlog(NULL, e, NULL, 2, EDL_RUN); } /** @@ -95,40 +229,39 @@ ev_run(event *e) * list @l which can be run by calling ev_run_list(). */ inline void -ev_enqueue(event_list *l, event *e) +ev_send(event_list *l, event *e) { - ev_postpone(e); - add_tail(l, &e->n); -} + edlog(l, e, NULL, 1, EDL_SEND); + /* Set the target list */ + event_list *ol = NULL; + if (!atomic_compare_exchange_strong_explicit( + &e->list, &ol, l, + memory_order_acq_rel, memory_order_acquire)) + if (ol == l) + return; + else + bug("Queuing an already queued event to another queue is not supported."); -/** - * ev_schedule - schedule an event - * @e: an event - * - * This function schedules an event by enqueueing it to a system-wide - * event list which is run by the platform dependent code whenever - * appropriate. - */ -void -ev_schedule(event *e) -{ - ev_enqueue(&global_event_list, e); -} + /* Here should be no concurrent senders */ + event *next = atomic_load_explicit(&l->receiver, memory_order_acquire); + edlog(l, e, next, 2, EDL_SEND); + event *old_next = NULL; + do + if (!atomic_compare_exchange_strong_explicit( + &e->next, &old_next, next, + memory_order_acq_rel, memory_order_acquire)) + bug("Event %p in inconsistent state"); + else + { + old_next = next; + edlog(l, old_next, next, 3, EDL_SEND); + } + while (!atomic_compare_exchange_strong_explicit( + &l->receiver, &next, e, + memory_order_acq_rel, memory_order_acquire)); -/** - * ev_schedule_work - schedule a work-event. - * @e: an event - * - * This function schedules an event by enqueueing it to a system-wide work-event - * list which is run by the platform dependent code whenever appropriate. This - * is designated for work-events instead of regular events. They are executed - * less often in order to not clog I/O loop. - */ -void -ev_schedule_work(event *e) -{ - if (!ev_active(e)) - add_tail(&global_work_list, &e->n); + edlog(l, e, next, 4, EDL_SEND); + if (l->loop) birdloop_ping(l->loop); } void io_log_event(void *hook, void *data); @@ -140,62 +273,66 @@ void io_log_event(void *hook, void *data); * This function calls ev_run() for all events enqueued in the list @l. */ int -ev_run_list(event_list *l) +ev_run_list_limited(event_list *l, uint limit) { - node *n; - list tmp_list; - - init_list(&tmp_list); - add_tail_list(&tmp_list, l); - init_list(l); - WALK_LIST_FIRST(n, tmp_list) - { - event *e = SKIP_BACK(event, n, n); + event * _Atomic *ep = &l->_executor; + edlog(l, NULL, NULL, 1, EDL_RUN_LIST); - /* This is ugly hack, we want to log just events executed from the main I/O loop */ - if ((l == &global_event_list) || (l == &global_work_list)) - io_log_event(e->hook, e->data); + /* No pending events, refill the queue. */ + if (!atomic_load_explicit(ep, memory_order_acquire)) + { + /* Move the current event list aside and create a new one. */ + event *received = atomic_exchange_explicit(&l->receiver, NULL, memory_order_acq_rel); + edlog(l, NULL, received, 2, EDL_RUN_LIST); - ev_run(e); - tmp_flush(); - } + /* No event to run. */ + if (!received) + return 0; - return !EMPTY_LIST(*l); -} + /* Setup the executor queue */ + event *head = NULL; -int -ev_run_list_limited(event_list *l, uint limit) -{ - node *n; - list tmp_list; + /* Flip the order of the events by relinking them one by one (push-pop) */ + while (received) + { + event *cur = received; + received = atomic_exchange_explicit(&cur->next, head, memory_order_acq_rel); + edlog(l, head, received, 3, EDL_RUN_LIST); + head = cur; + } - init_list(&tmp_list); - add_tail_list(&tmp_list, l); - init_list(l); + /* Store the executor queue to its designated place */ + ASSERT_DIE(atomic_exchange_explicit(ep, head, memory_order_acq_rel) == NULL); + edlog(l, NULL, head, 4, EDL_RUN_LIST); + } - WALK_LIST_FIRST(n, tmp_list) + /* Run the events in order. */ + event *e; + while (e = atomic_load_explicit(ep, memory_order_acquire)) { - event *e = SKIP_BACK(event, n, n); - - if (!limit) - break; + edlog(l, e, NULL, 5, EDL_RUN_LIST); + /* Check limit */ + if (!--limit) + return 1; /* This is ugly hack, we want to log just events executed from the main I/O loop */ if ((l == &global_event_list) || (l == &global_work_list)) io_log_event(e->hook, e->data); - ev_run(e); + edlog(l, e, NULL, 6, EDL_RUN_LIST); + /* Inactivate the event */ + event *next = atomic_load_explicit(&e->next, memory_order_relaxed); + ASSERT_DIE(e == atomic_exchange_explicit(ep, next, memory_order_acq_rel)); + ASSERT_DIE(next == atomic_exchange_explicit(&e->next, NULL, memory_order_acq_rel)); + ASSERT_DIE(l == atomic_exchange_explicit(&e->list, NULL, memory_order_acq_rel)); + edlog(l, e, next, 7, EDL_RUN_LIST); + + /* Run the event */ + e->hook(e->data); tmp_flush(); - limit--; - } - if (!EMPTY_LIST(tmp_list)) - { - /* Attach new items after the unprocessed old items */ - add_tail_list(&tmp_list, l); - init_list(l); - add_tail_list(l, &tmp_list); - } + edlog(l, e, next, 8, EDL_RUN_LIST); + } - return !EMPTY_LIST(*l); + return !!atomic_load_explicit(&l->receiver, memory_order_acquire); } diff --git a/lib/event.h b/lib/event.h index 5f3b78d8..6fd9f31c 100644 --- a/lib/event.h +++ b/lib/event.h @@ -10,33 +10,56 @@ #define _BIRD_EVENT_H_ #include "lib/resource.h" +#include "lib/locking.h" + +#include <stdatomic.h> + +struct birdloop; typedef struct event { resource r; void (*hook)(void *); void *data; - node n; /* Internal link */ + struct event * _Atomic next; + struct event_list * _Atomic list; } event; -typedef list event_list; +typedef struct event_list { + event * _Atomic receiver; /* Event receive list */ + event * _Atomic _executor; /* Event execute list */ + const char *name; + struct birdloop *loop; /* The executor loop */ +} event_list; extern event_list global_event_list; extern event_list global_work_list; event *ev_new(pool *); void ev_run(event *); -#define ev_init_list(el) init_list(el) +void ev_init_list(event_list *, struct birdloop *loop, const char *name); void ev_enqueue(event_list *, event *); -void ev_schedule(event *); -void ev_schedule_work(event *); +#define ev_send ev_enqueue +#define ev_send_loop(l, e) ev_send(birdloop_event_list((l)), (e)) + +#define ev_schedule(e) ({ ASSERT_THE_BIRD_LOCKED; if (!ev_active((e))) ev_send(&global_event_list, (e)); }) +#define ev_schedule_work(e) ({ ASSERT_THE_BIRD_LOCKED; if (!ev_active((e))) ev_send(&global_work_list, (e)); }) + void ev_postpone(event *); -int ev_run_list(event_list *); int ev_run_list_limited(event_list *, uint); +#define ev_run_list(l) ev_run_list_limited((l), ~0) + +#define LEGACY_EVENT_LIST(l) (((l) == &global_event_list) || ((l) == &global_work_list)) static inline int ev_active(event *e) { - return e->n.next != NULL; + return atomic_load_explicit(&e->list, memory_order_acquire) != NULL; +} + +static inline event_list * +ev_get_list(event *e) +{ + return atomic_load_explicit(&e->list, memory_order_acquire); } static inline event* diff --git a/lib/event_test.c b/lib/event_test.c index e1fbea8f..612deb25 100644 --- a/lib/event_test.c +++ b/lib/event_test.c @@ -15,7 +15,7 @@ #include "nest/locks.h" #include "sysdep/unix/unix.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #define MAX_NUM 4 @@ -54,9 +54,8 @@ t_ev_run_list(void) int i; olock_init(); - timer_init(); - io_init(); rt_init(); + io_init(); if_init(); // roa_init(); config_init(); diff --git a/lib/fib.h b/lib/fib.h new file mode 100644 index 00000000..bec2a8d4 --- /dev/null +++ b/lib/fib.h @@ -0,0 +1,119 @@ +/* + * BIRD Internet Routing Daemon -- Network prefix storage + * + * (c) 1998--2000 Martin Mares <mj@ucw.cz> + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LIB_FIB_H_ +#define _BIRD_LIB_FIB_H_ + +/* + * BIRD FIBs are generic data structure for storing network prefixes. + * Also used for the master routing table. Currently implemented as + * a hash table. + * + * Available operations: + * - insertion of new entry + * - deletion of entry + * - searching for entry by network prefix + * - asynchronous retrieval of fib contents + */ + +struct fib; + +struct fib_node { + struct fib_node *next; /* Next in hash chain */ + struct fib_iterator *readers; /* List of readers of this node */ + net_addr addr[0]; +}; + +struct fib_iterator { /* See lib/slists.h for an explanation */ + struct fib_iterator *prev, *next; /* Must be synced with struct fib_node! */ + byte efef; /* 0xff to distinguish between iterator and node */ + byte pad[3]; + struct fib_node *node; /* Or NULL if freshly merged */ + uint hash; +}; + +typedef void (*fib_init_fn)(struct fib *, void *); + +struct fib { + pool *fib_pool; /* Pool holding all our data */ + slab *fib_slab; /* Slab holding all fib nodes */ + struct fib_node **hash_table; /* Node hash table */ + uint hash_size; /* Number of hash table entries (a power of two) */ + uint hash_order; /* Binary logarithm of hash_size */ + uint hash_shift; /* 32 - hash_order */ + uint addr_type; /* Type of address data stored in fib (NET_*) */ + uint node_size; /* FIB node size, 0 for nonuniform */ + uint node_offset; /* Offset of fib_node struct inside of user data */ + uint entries; /* Number of entries */ + uint entries_min, entries_max; /* Entry count limits (else start rehashing) */ + fib_init_fn init; /* Constructor */ +}; + +static inline void * fib_node_to_user(struct fib *f, struct fib_node *e) +{ return e ? (void *) ((char *) e - f->node_offset) : NULL; } + +static inline struct fib_node * fib_user_to_node(struct fib *f, void *e) +{ return e ? (void *) ((char *) e + f->node_offset) : NULL; } + +void fib_init(struct fib *f, pool *p, uint addr_type, uint node_size, uint node_offset, uint hash_order, fib_init_fn init); +void *fib_find(struct fib *, const net_addr *); /* Find or return NULL if doesn't exist */ +void *fib_get_chain(struct fib *f, const net_addr *a); /* Find first node in linked list from hash table */ +void *fib_get(struct fib *, const net_addr *); /* Find or create new if nonexistent */ +void *fib_route(struct fib *, const net_addr *); /* Longest-match routing lookup */ +void fib_delete(struct fib *, void *); /* Remove fib entry */ +void fib_free(struct fib *); /* Destroy the fib */ +void fib_check(struct fib *); /* Consistency check for debugging */ + +void fit_init(struct fib_iterator *, struct fib *); /* Internal functions, don't call */ +struct fib_node *fit_get(struct fib *, struct fib_iterator *); +void fit_put(struct fib_iterator *, struct fib_node *); +void fit_put_next(struct fib *f, struct fib_iterator *i, struct fib_node *n, uint hpos); +void fit_put_end(struct fib_iterator *i); +void fit_copy(struct fib *f, struct fib_iterator *dst, struct fib_iterator *src); + + +#define FIB_WALK(fib, type, z) do { \ + struct fib_node *fn_, **ff_ = (fib)->hash_table; \ + uint count_ = (fib)->hash_size; \ + type *z; \ + while (count_--) \ + for (fn_ = *ff_++; z = fib_node_to_user(fib, fn_); fn_=fn_->next) + +#define FIB_WALK_END } while (0) + +#define FIB_ITERATE_INIT(it, fib) fit_init(it, fib) + +#define FIB_ITERATE_START(fib, it, type, z) do { \ + struct fib_node *fn_ = fit_get(fib, it); \ + uint count_ = (fib)->hash_size; \ + uint hpos_ = (it)->hash; \ + type *z; \ + for(;;) { \ + if (!fn_) \ + { \ + if (++hpos_ >= count_) \ + break; \ + fn_ = (fib)->hash_table[hpos_]; \ + continue; \ + } \ + z = fib_node_to_user(fib, fn_); + +#define FIB_ITERATE_END fn_ = fn_->next; } } while(0) + +#define FIB_ITERATE_PUT(it) fit_put(it, fn_) + +#define FIB_ITERATE_PUT_NEXT(it, fib) fit_put_next(fib, it, fn_, hpos_) + +#define FIB_ITERATE_PUT_END(it) fit_put_end(it) + +#define FIB_ITERATE_UNLINK(it, fib) fit_get(fib, it) + +#define FIB_ITERATE_COPY(dst, src, fib) fit_copy(fib, dst, src) + +#endif @@ -10,7 +10,7 @@ #ifndef _BIRD_HASH_H_ #define _BIRD_HASH_H_ -#define HASH(type) struct { type **data; uint count, order; } +#define HASH(type) struct { type **data; uint count; u16 iterators; u8 order; u8 down_requested:1; } #define HASH_TYPE(v) typeof(** (v).data) #define HASH_SIZE(v) (1U << (v).order) @@ -125,20 +125,26 @@ #define HASH_MAY_STEP_DOWN_(v,pool,rehash_fn,args) \ ({ \ - if (((v).count < (HASH_SIZE(v) REHASH_LO_MARK(args))) && \ - ((v).order > (REHASH_LO_BOUND(args)))) \ + if ((v).iterators) \ + (v).down_requested = 1; \ + else if (((v).count < (HASH_SIZE(v) REHASH_LO_MARK(args))) && \ + ((v).order > (REHASH_LO_BOUND(args)))) \ rehash_fn(&(v), pool, -(REHASH_LO_STEP(args))); \ }) #define HASH_MAY_RESIZE_DOWN_(v,pool,rehash_fn,args) \ ({ \ - uint _o = (v).order; \ - while (((v).count < ((1U << _o) REHASH_LO_MARK(args))) && \ - (_o > (REHASH_LO_BOUND(args)))) \ - _o -= (REHASH_LO_STEP(args)); \ - if (_o < (v).order) \ - rehash_fn(&(v), pool, _o - (v).order); \ - }) + if ((v).iterators) \ + (v).down_requested = 1; \ + else { \ + uint _o = (v).order; \ + while (((v).count < ((1U << _o) REHASH_LO_MARK(args))) && \ + (_o > (REHASH_LO_BOUND(args)))) \ + _o -= (REHASH_LO_STEP(args)); \ + if (_o < (v).order) \ + rehash_fn(&(v), pool, _o - (v).order); \ + } \ + }) #define HASH_INSERT2(v,id,pool,node) \ @@ -195,6 +201,20 @@ #define HASH_WALK_FILTER_END } while (0) +#define HASH_WALK_ITER(v, id, n, iter) \ + do { \ + uint _hash_walk_iter_put = 0; \ + uint _shift = 32 - (v).order; \ + for ( ; !_hash_walk_iter_put; (iter) += (1U << _shift)) { \ + _hash_walk_iter_put = ((iter) + (1U << _shift) == 0); \ + for (HASH_TYPE(v) *n = (v).data[(iter) >> _shift]; n; n = id##_NEXT((n)))\ + if (HASH_FN(v, id, id##_KEY(n)) >= ((iter) >> _shift)) \ + +#define HASH_WALK_ITER_PUT (_hash_walk_iter_put = 1) + +#define HASH_WALK_ITER_END } } while (0) + + static inline void mem_hash_init(u64 *h) { diff --git a/lib/hash_test.c b/lib/hash_test.c index 4bce7017..ecfcdd66 100644 --- a/lib/hash_test.c +++ b/lib/hash_test.c @@ -285,6 +285,46 @@ t_walk_filter(void) return 1; } +static int +t_walk_iter(void) +{ + init_hash(); + fill_hash(); + + u32 hit = 0; + + u32 prev_hash = ~0; + for (uint cnt = 0; cnt < MAX_NUM; ) + { + u32 last_hash = ~0; +// printf("PUT!\n"); + HASH_WALK_ITER(hash, TEST, n, hit) + { + cnt++; + u32 cur_hash = HASH_FN(hash, TEST, n->key); + /* + printf("C%08x L%08x P%08x K%08x H%08x N%p S%d I%ld\n", + cur_hash, last_hash, prev_hash, n->key, hit, n, _shift, n - &nodes[0]); + */ + + if (last_hash == ~0U) + { + if (prev_hash != ~0U) + bt_assert(prev_hash < cur_hash); + last_hash = prev_hash = cur_hash; + } + else + bt_assert(last_hash == cur_hash); + + if (cnt < MAX_NUM) + HASH_WALK_ITER_PUT; + } + HASH_WALK_ITER_END; + } + + return 1; +} + int main(int argc, char *argv[]) { @@ -299,6 +339,7 @@ main(int argc, char *argv[]) bt_test_suite(t_walk_delsafe_remove, "HASH_WALK_DELSAFE and HASH_REMOVE"); bt_test_suite(t_walk_delsafe_remove2, "HASH_WALK_DELSAFE and HASH_REMOVE2. HASH_REMOVE2 is HASH_REMOVE and smart auto-resize function"); bt_test_suite(t_walk_filter, "HASH_WALK_FILTER"); + bt_test_suite(t_walk_iter, "HASH_WALK_ITER"); return bt_exit_value(); } diff --git a/lib/io-loop.h b/lib/io-loop.h new file mode 100644 index 00000000..502d77fc --- /dev/null +++ b/lib/io-loop.h @@ -0,0 +1,66 @@ +/* + * BIRD -- I/O and event loop + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_IO_LOOP_H_ +#define _BIRD_IO_LOOP_H_ + +#include "nest/bird.h" +#include "lib/lists.h" +#include "lib/locking.h" +#include "lib/resource.h" +#include "lib/event.h" +#include "lib/socket.h" + +extern struct birdloop main_birdloop; + +/* Start a new birdloop owned by given pool and domain */ +struct birdloop *birdloop_new(pool *p, uint order, const char *name); + +/* Stop the loop. At the end, the @stopped callback is called unlocked in tail + * position to finish cleanup. Run birdloop_free() from that callback to free + * the loop itself. */ +void birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data); +void birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data); +void birdloop_free(struct birdloop *loop); + +/* Get birdloop's event list */ +event_list *birdloop_event_list(struct birdloop *loop); + +/* Get birdloop's time heap */ +struct timeloop *birdloop_time_loop(struct birdloop *loop); + +/* Enter and exit the birdloop */ +void birdloop_enter(struct birdloop *loop); +void birdloop_leave(struct birdloop *loop); + +_Bool birdloop_inside(struct birdloop *loop); + +void birdloop_mask_wakeups(struct birdloop *loop); +void birdloop_unmask_wakeups(struct birdloop *loop); + +void birdloop_link(struct birdloop *loop); +void birdloop_unlink(struct birdloop *loop); + +void birdloop_ping(struct birdloop *loop); + +struct birdloop_flag_handler { + void (*hook)(struct birdloop_flag_handler *, u32 flags); + void *data; +}; + +void birdloop_flag(struct birdloop *loop, u32 flag); +void birdloop_flag_set_handler(struct birdloop *, struct birdloop_flag_handler *); + +/* Setup sockets */ +void birdloop_add_socket(struct birdloop *, struct birdsock *); +void birdloop_remove_socket(struct birdloop *, struct birdsock *); + +void birdloop_init(void); + +/* Yield for a little while. Use only in special cases. */ +void birdloop_yield(void); + +#endif /* _BIRD_IO_LOOP_H_ */ @@ -362,11 +362,7 @@ static inline ip6_addr ip6_hton(ip6_addr a) static inline ip6_addr ip6_ntoh(ip6_addr a) { return _MI6(ntohl(_I0(a)), ntohl(_I1(a)), ntohl(_I2(a)), ntohl(_I3(a))); } -#define MPLS_MAX_LABEL_STACK 8 -typedef struct mpls_label_stack { - uint len; - u32 stack[MPLS_MAX_LABEL_STACK]; -} mpls_label_stack; +#define MPLS_MAX_LABEL_STACK 16 static inline int mpls_get(const char *buf, int buflen, u32 *stack) diff --git a/lib/lists.c b/lib/lists.c index 200576cf..cabfddba 100644 --- a/lib/lists.c +++ b/lib/lists.c @@ -26,7 +26,7 @@ #define _BIRD_LISTS_C_ -#include "nest/bird.h" +#include "lib/birdlib.h" #include "lib/lists.h" LIST_INLINE int @@ -37,9 +37,10 @@ check_list(list *l, node *n) ASSERT_DIE(n); ASSERT_DIE(n->prev); - do { n = n->prev; } while (n->prev); + node *nn = n; + do { nn = nn->prev; } while (nn->prev); - l = SKIP_BACK(list, head_node, n); + l = SKIP_BACK(list, head_node, nn); } int seen = 0; @@ -120,7 +121,7 @@ add_head(list *l, node *n) LIST_INLINE void insert_node(node *n, node *after) { - EXPENSIVE_CHECK(check_list(l, after)); + EXPENSIVE_CHECK((after->prev == NULL) || check_list(NULL, after)); ASSUME(n->prev == NULL); ASSUME(n->next == NULL); @@ -141,7 +142,7 @@ insert_node(node *n, node *after) LIST_INLINE void rem_node(node *n) { - EXPENSIVE_CHECK(check_list(NULL, n)); + EXPENSIVE_CHECK((n->prev == n) && (n->next == n) || check_list(NULL, n)); node *z = n->prev; node *x = n->next; diff --git a/lib/lists.h b/lib/lists.h index 7e6d5467..86ff59c9 100644 --- a/lib/lists.h +++ b/lib/lists.h @@ -69,6 +69,18 @@ typedef union list { /* In fact two overlayed nodes */ #define EMPTY_LIST(list) (!(list).head->next) +static inline _Bool +enlisted(node *n) +{ + switch ((!!n->next) + (!!n->prev)) + { + case 0: return 0; + case 2: return 1; + case 1: bug("Garbled event list node"); + } + + bug("Maths is broken. And you should see a new heaven and a new earth: for the first heaven and the first earth had been passed away."); +} #ifndef _BIRD_LISTS_C_ #define LIST_INLINE static inline diff --git a/lib/locking.h b/lib/locking.h new file mode 100644 index 00000000..6bed14bf --- /dev/null +++ b/lib/locking.h @@ -0,0 +1,70 @@ +/* + * BIRD Library -- Locking + * + * (c) 2020--2021 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LOCKING_H_ +#define _BIRD_LOCKING_H_ + +struct domain_generic; + +/* Here define the global lock order; first to last. */ +struct lock_order { + struct domain_generic *meta; + struct domain_generic *the_bird; + struct domain_generic *control; + struct domain_generic *proto; + struct domain_generic *service; + struct domain_generic *rtable; + struct domain_generic *attrs; + struct domain_generic *resource; +}; + +extern _Thread_local struct lock_order locking_stack; +extern _Thread_local struct domain_generic **last_locked; + +#define DOMAIN(type) struct domain__##type +#define DEFINE_DOMAIN(type) DOMAIN(type) { struct domain_generic *type; } +#define DOMAIN_ORDER(type) OFFSETOF(struct lock_order, type) + +#define DOMAIN_NEW(type, name) (DOMAIN(type)) { .type = domain_new(name, DOMAIN_ORDER(type)) } +struct domain_generic *domain_new(const char *name, uint order); + +#define DOMAIN_FREE(type, d) domain_free((d).type) +void domain_free(struct domain_generic *); + +#define DOMAIN_NAME(type, d) domain_name((d).type) +const char *domain_name(struct domain_generic *); + +#define DOMAIN_NULL(type) (DOMAIN(type)) {} + +#define LOCK_DOMAIN(type, d) do_lock(((d).type), &(locking_stack.type)) +#define UNLOCK_DOMAIN(type, d) do_unlock(((d).type), &(locking_stack.type)) + +#define DOMAIN_IS_LOCKED(type, d) (((d).type) == (locking_stack.type)) +#define DG_IS_LOCKED(d) ((d) == *(DG_LSP(d))) + +/* Internal for locking */ +void do_lock(struct domain_generic *dg, struct domain_generic **lsp); +void do_unlock(struct domain_generic *dg, struct domain_generic **lsp); + +uint dg_order(struct domain_generic *dg); + +#define DG_LSP(d) ((struct domain_generic **) (((void *) &locking_stack) + dg_order(d))) +#define DG_LOCK(d) do_lock(d, DG_LSP(d)) +#define DG_UNLOCK(d) do_unlock(d, DG_LSP(d)) + +/* Use with care. To be removed in near future. */ +DEFINE_DOMAIN(the_bird); +extern DOMAIN(the_bird) the_bird_domain; + +#define the_bird_lock() LOCK_DOMAIN(the_bird, the_bird_domain) +#define the_bird_unlock() UNLOCK_DOMAIN(the_bird, the_bird_domain) +#define the_bird_locked() DOMAIN_IS_LOCKED(the_bird, the_bird_domain) + +#define ASSERT_THE_BIRD_LOCKED ({ if (!the_bird_locked()) bug("The BIRD lock must be locked here: %s:%d", __FILE__, __LINE__); }) + +#endif diff --git a/lib/macro.h b/lib/macro.h index 24fc3393..8f5d4b0e 100644 --- a/lib/macro.h +++ b/lib/macro.h @@ -26,6 +26,8 @@ #define MACRO_DROP(...) #define MACRO_UNPAREN(...) __VA_ARGS__ #define MACRO_SEP(a, b, sep) a sep b +#define MACRO_STR(a) #a +#define MACRO_STR_AFTER(a) MACRO_STR(a) /* Aliases for some special chars */ #define MACRO_COMMA , diff --git a/lib/mempool.c b/lib/mempool.c index 325b1ecf..5200f5e7 100644 --- a/lib/mempool.c +++ b/lib/mempool.c @@ -41,10 +41,8 @@ struct linpool { uint total, total_large; }; -_Thread_local linpool *tmp_linpool; - static void lp_free(resource *); -static void lp_dump(resource *); +static void lp_dump(resource *, unsigned); static resource *lp_lookup(resource *, unsigned long); static struct resmem lp_memsize(resource *r); @@ -233,9 +231,9 @@ lp_restore(linpool *m, lp_state *p) struct lp_chunk *c; /* Move ptr to the saved pos and free all newer large chunks */ - m->current = c = p->current; - m->ptr = p->ptr; - m->end = c ? c->data + LP_DATA_SIZE : NULL; + m->current = c = p->current ?: m->first; + m->ptr = p->ptr ?: (c ? c->data : NULL); + m->end = c ? (c->data + LP_DATA_SIZE) : NULL; m->total_large = p->total_large; while ((c = m->first_large) && (c != p->large)) @@ -264,11 +262,12 @@ lp_free(resource *r) } static void -lp_dump(resource *r) +lp_dump(resource *r, unsigned indent) { linpool *m = (linpool *) r; struct lp_chunk *c; int cnt, cntl; + char x[32]; for(cnt=0, c=m->first; c; c=c->next, cnt++) ; @@ -279,6 +278,14 @@ lp_dump(resource *r) cntl, m->total, m->total_large); + + bsprintf(x, "%%%dschunk %%p\n", indent + 2); + for (c=m->first; c; c=c->next) + debug(x, "", c); + + bsprintf(x, "%%%dslarge %%p\n", indent + 2); + for (c=m->first_large; c; c=c->next) + debug(x, "", c); } static struct resmem diff --git a/lib/rcu.c b/lib/rcu.c new file mode 100644 index 00000000..8ae4f2ab --- /dev/null +++ b/lib/rcu.c @@ -0,0 +1,79 @@ +/* + * BIRD Library -- Read-Copy-Update Basic Operations + * + * (c) 2021 Maria Matejka <mq@jmq.cz> + * (c) 2021 CZ.NIC z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. + * Note: all the relevant patents shall be expired. + * + * Using the Supplementary Material for User-Level Implementations of Read-Copy-Update + * by Matthieu Desnoyers, Paul E. McKenney, Alan S. Stern, Michel R. Dagenais and Jonathan Walpole + * obtained from https://www.efficios.com/pub/rcu/urcu-supp-accepted.pdf + */ + +#include "lib/rcu.h" +#include "lib/io-loop.h" +#include "lib/locking.h" + +_Atomic uint rcu_gp_ctl = RCU_NEST_CNT; +_Thread_local struct rcu_thread *this_rcu_thread = NULL; + +static list rcu_thread_list; + +static struct rcu_thread main_rcu_thread; + +DEFINE_DOMAIN(resource); +static DOMAIN(resource) rcu_domain; + +static int +rcu_gp_ongoing(_Atomic uint *ctl) +{ + uint val = atomic_load(ctl); + return (val & RCU_NEST_CNT) && ((val ^ rcu_gp_ctl) & RCU_GP_PHASE); +} + +static void +update_counter_and_wait(void) +{ + atomic_fetch_xor(&rcu_gp_ctl, RCU_GP_PHASE); + struct rcu_thread *rc; + WALK_LIST(rc, rcu_thread_list) + while (rcu_gp_ongoing(&rc->ctl)) + birdloop_yield(); +} + +void +synchronize_rcu(void) +{ + LOCK_DOMAIN(resource, rcu_domain); + update_counter_and_wait(); + update_counter_and_wait(); + UNLOCK_DOMAIN(resource, rcu_domain); +} + +void +rcu_thread_start(struct rcu_thread *rc) +{ + LOCK_DOMAIN(resource, rcu_domain); + add_tail(&rcu_thread_list, &rc->n); + this_rcu_thread = rc; + UNLOCK_DOMAIN(resource, rcu_domain); +} + +void +rcu_thread_stop(struct rcu_thread *rc) +{ + LOCK_DOMAIN(resource, rcu_domain); + this_rcu_thread = NULL; + rem_node(&rc->n); + UNLOCK_DOMAIN(resource, rcu_domain); +} + +void +rcu_init(void) +{ + rcu_domain = DOMAIN_NEW(resource, "Read-Copy-Update"); + init_list(&rcu_thread_list); + rcu_thread_start(&main_rcu_thread); +} diff --git a/lib/rcu.h b/lib/rcu.h new file mode 100644 index 00000000..30eacc5b --- /dev/null +++ b/lib/rcu.h @@ -0,0 +1,55 @@ +/* + * BIRD Library -- Read-Copy-Update Basic Operations + * + * (c) 2021 Maria Matejka <mq@jmq.cz> + * (c) 2021 CZ.NIC z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. + * Note: all the relevant patents shall be expired. + */ + +#ifndef _BIRD_RCU_H_ +#define _BIRD_RCU_H_ + +#include "lib/birdlib.h" +#include "lib/lists.h" +#include <stdatomic.h> + +#define RCU_GP_PHASE 0x100000 +#define RCU_NEST_MASK 0x0fffff +#define RCU_NEST_CNT 0x000001 + +extern _Atomic uint rcu_gp_ctl; + +struct rcu_thread { + node n; + _Atomic uint ctl; +}; + +extern _Thread_local struct rcu_thread *this_rcu_thread; + +static inline void rcu_read_lock(void) +{ + uint cmp = atomic_load_explicit(&this_rcu_thread->ctl, memory_order_acquire); + + if (cmp & RCU_NEST_MASK) + atomic_store_explicit(&this_rcu_thread->ctl, cmp + RCU_NEST_CNT, memory_order_relaxed); + else + atomic_store(&this_rcu_thread->ctl, atomic_load_explicit(&rcu_gp_ctl, memory_order_acquire)); +} + +static inline void rcu_read_unlock(void) +{ + atomic_fetch_sub(&this_rcu_thread->ctl, RCU_NEST_CNT); +} + +void synchronize_rcu(void); + +/* Registering and unregistering a birdloop. To be called from birdloop implementation */ +void rcu_thread_start(struct rcu_thread *); +void rcu_thread_stop(struct rcu_thread *); + +/* Run this from resource init */ +void rcu_init(void); + +#endif diff --git a/lib/resource.c b/lib/resource.c index 89e559b4..94b8d019 100644 --- a/lib/resource.c +++ b/lib/resource.c @@ -14,6 +14,7 @@ #include "nest/bird.h" #include "lib/resource.h" #include "lib/string.h" +#include "lib/rcu.h" /** * DOC: Resource pools @@ -29,13 +30,7 @@ * is freed upon shutdown of the module. */ -struct pool { - resource r; - list inside; - const char *name; -}; - -static void pool_dump(resource *); +static void pool_dump(resource *, unsigned); static void pool_free(resource *); static resource *pool_lookup(resource *, unsigned long); static struct resmem pool_memsize(resource *P); @@ -51,8 +46,6 @@ static struct resclass pool_class = { pool root_pool; -static int indent; - /** * rp_new - create a resource pool * @p: parent pool @@ -100,16 +93,14 @@ pool_free(resource *P) } static void -pool_dump(resource *P) +pool_dump(resource *P, unsigned indent) { pool *p = (pool *) P; resource *r; debug("%s\n", p->name); - indent += 3; WALK_LIST(r, p->inside) - rdump(r); - indent -= 3; + rdump(r, indent + 3); } static struct resmem @@ -199,7 +190,7 @@ rfree(void *res) * It works by calling a class-specific dump function. */ void -rdump(void *res) +rdump(void *res, unsigned indent) { char x[16]; resource *r = res; @@ -209,7 +200,7 @@ rdump(void *res) if (r) { debug("%s ", r->class->name); - r->class->dump(r); + r->class->dump(r, indent); } else debug("NULL\n"); @@ -269,7 +260,7 @@ rlookup(unsigned long a) debug("Looking up %08lx\n", a); if (r = pool_lookup(&root_pool.r, a)) - rdump(r); + rdump(r, 3); else debug("Not found.\n"); } @@ -284,6 +275,7 @@ rlookup(unsigned long a) void resource_init(void) { + rcu_init(); resource_sys_init(); root_pool.r.class = &pool_class; @@ -292,6 +284,25 @@ resource_init(void) tmp_init(&root_pool); } +_Thread_local struct tmp_resources tmp_res; + +void +tmp_init(pool *p) +{ + tmp_res.lp = lp_new_default(p); + tmp_res.parent = p; + tmp_res.pool = rp_new(p, "TMP"); +} + +void +tmp_flush(void) +{ + lp_flush(tmp_linpool); + rfree(tmp_res.pool); + tmp_res.pool = rp_new(tmp_res.parent, "TMP"); +} + + /** * DOC: Memory blocks * @@ -316,7 +327,7 @@ static void mbl_free(resource *r UNUSED) { } -static void mbl_debug(resource *r) +static void mbl_debug(resource *r, unsigned indent UNUSED) { struct mblock *m = (struct mblock *) r; diff --git a/lib/resource.h b/lib/resource.h index a4e110a5..911b990d 100644 --- a/lib/resource.h +++ b/lib/resource.h @@ -30,7 +30,7 @@ struct resclass { char *name; /* Resource class name */ unsigned size; /* Standard size of single resource */ void (*free)(resource *); /* Freeing function */ - void (*dump)(resource *); /* Dump to debug output */ + void (*dump)(resource *, unsigned indent); /* Dump to debug output */ resource *(*lookup)(resource *, unsigned long); /* Look up address (only for debugging) */ struct resmem (*memsize)(resource *); /* Return size of memory used by the resource, may be NULL */ }; @@ -40,13 +40,18 @@ struct resclass { /* Generic resource manipulation */ -typedef struct pool pool; +typedef struct pool { + resource r; + list inside; + const char *name; +} pool; + void resource_init(void); pool *rp_new(pool *, const char *); /* Create new pool */ pool *rp_newf(pool *, const char *, ...); /* Create a new pool with a formatted string as its name */ void rfree(void *); /* Free single resource */ -void rdump(void *); /* Dump to debug output */ +void rdump(void *, unsigned indent); /* Dump to debug output */ struct resmem rmemsize(void *res); /* Return size of memory used by the resource */ void rlookup(unsigned long); /* Look up address (only for debugging) */ void rmove(void *, pool *); /* Move to a different pool */ @@ -80,14 +85,21 @@ void lp_flush(linpool *); /* Free everything, but leave linpool */ void lp_save(linpool *m, lp_state *p); /* Save state */ void lp_restore(linpool *m, lp_state *p); /* Restore state */ -extern _Thread_local linpool *tmp_linpool; /* Temporary linpool autoflushed regularily */ +struct tmp_resources { + pool *pool, *parent; + linpool *lp; +}; + +extern _Thread_local struct tmp_resources tmp_res; +#define tmp_linpool tmp_res.lp #define tmp_alloc(sz) lp_alloc(tmp_linpool, sz) #define tmp_allocu(sz) lp_allocu(tmp_linpool, sz) #define tmp_allocz(sz) lp_allocz(tmp_linpool, sz) -#define tmp_init(p) tmp_linpool = lp_new_default(p) -#define tmp_flush() lp_flush(tmp_linpool) +void tmp_init(pool *p); +void tmp_flush(void); + #define lp_new_default lp_new @@ -108,9 +120,13 @@ void sl_free(void *); void buffer_realloc(void **buf, unsigned *size, unsigned need, unsigned item_size); /* Allocator of whole pages; for use in slabs and other high-level allocators. */ +#define PAGE_HEAD(x) ((void *) (((uintptr_t) (x)) & ~(page_size-1))) extern long page_size; +extern _Atomic int pages_kept; +extern _Atomic int pages_kept_locally; void *alloc_page(void); void free_page(void *); +void flush_local_pages(void); void resource_sys_init(void); diff --git a/lib/route.h b/lib/route.h new file mode 100644 index 00000000..6189895a --- /dev/null +++ b/lib/route.h @@ -0,0 +1,560 @@ +/* + * BIRD Internet Routing Daemon -- Routing data structures + * + * (c) 1998--2000 Martin Mares <mj@ucw.cz> + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LIB_ROUTE_H_ +#define _BIRD_LIB_ROUTE_H_ + +#undef RT_SOURCE_DEBUG + +#include "lib/type.h" +#include "lib/rcu.h" +#include "lib/hash.h" +#include "lib/event.h" + +struct network; +struct proto; +struct cli; +struct rtable_private; + +typedef struct rte { + struct ea_list *attrs; /* Attributes of this route */ + const net_addr *net; /* Network this RTE belongs to */ + struct rte_src *src; /* Route source that created the route */ + struct rt_import_hook *sender; /* Import hook used to send the route to the routing table */ + btime lastmod; /* Last modified (set by table) */ + u32 id; /* Table specific route id */ + byte flags; /* Table-specific flags */ + byte pflags; /* Protocol-specific flags */ + u8 generation; /* If this route import is based on other previously exported route, + this value should be 1 + MAX(generation of the parent routes). + Otherwise the route is independent and this value is zero. */ + u8 stale_cycle; /* Auxiliary value for route refresh */ +} rte; + +#define REF_FILTERED 2 /* Route is rejected by import filter */ +#define REF_PENDING 32 /* Route has not propagated completely yet */ + +/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ +static inline int rte_is_valid(const rte *r) { return r && !(r->flags & REF_FILTERED); } + +/* Route just has REF_FILTERED flag */ +static inline int rte_is_filtered(const rte *r) { return !!(r->flags & REF_FILTERED); } + +/* Strip the route of the table-specific values */ +static inline rte rte_init_from(const rte *r) +{ + return (rte) { + .attrs = r->attrs, + .net = r->net, + .src = r->src, + }; +} + +struct rte_src { + struct rte_src *next; /* Hash chain */ + struct rte_owner *owner; /* Route source owner */ + u32 private_id; /* Private ID, assigned by the protocol */ + u32 global_id; /* Globally unique ID of the source */ + _Atomic u64 uc; /* Use count */ +}; + +struct rte_owner_class { + void (*get_route_info)(const rte *, byte *buf); /* Get route information (for `show route' command) */ + int (*rte_better)(const rte *, const rte *); + int (*rte_mergable)(const rte *, const rte *); + u32 (*rte_igp_metric)(const rte *); +}; + +struct rte_owner { + struct rte_owner_class *class; + int (*rte_recalculate)(struct rtable_private *, struct network *, struct rte *, struct rte *, struct rte *); + HASH(struct rte_src) hash; + const char *name; + u32 hash_key; + u32 uc; + event_list *list; + event *prune; + event *stop; +}; + +DEFINE_DOMAIN(attrs); +extern DOMAIN(attrs) attrs_domain; + +#define RTA_LOCK LOCK_DOMAIN(attrs, attrs_domain) +#define RTA_UNLOCK UNLOCK_DOMAIN(attrs, attrs_domain) + +#define RTE_SRC_PU_SHIFT 44 +#define RTE_SRC_IN_PROGRESS (1ULL << RTE_SRC_PU_SHIFT) + +/* Get a route source. This also locks the source, therefore the caller has to + * unlock the source after the route has been propagated. */ +struct rte_src *rt_get_source_o(struct rte_owner *o, u32 id); +#define rt_get_source(p, id) rt_get_source_o(&(p)->sources, (id)) + +struct rte_src *rt_find_source_global(u32 id); + +#ifdef RT_SOURCE_DEBUG +#define rt_lock_source _rt_lock_source_internal +#define rt_unlock_source _rt_unlock_source_internal +#endif + +static inline void rt_lock_source(struct rte_src *src) +{ + /* Locking a source is trivial; somebody already holds it so we just increase + * the use count. Nothing can be freed underneath our hands. */ + u64 uc = atomic_fetch_add_explicit(&src->uc, 1, memory_order_acq_rel); + ASSERT_DIE(uc > 0); +} + +static inline void rt_unlock_source(struct rte_src *src) +{ + /* Unlocking is tricky. We do it lockless so at the same time, the prune + * event may be running, therefore if the unlock gets us to zero, it must be + * the last thing in this routine, otherwise the prune routine may find the + * source's usecount zeroed, freeing it prematurely. + * + * The usecount is split into two parts: + * the top 20 bits are an in-progress indicator + * the bottom 44 bits keep the actual usecount. + * + * Therefore at most 1 million of writers can simultaneously unlock the same + * source, while at most ~17T different routes can reference it. Both limits + * are insanely high from the 2022 point of view. Let's suppose that when 17T + * routes or 1M writers get real, we get also 128bit atomic variables in the + * C norm. */ + + /* First, we push the in-progress indicator */ + u64 uc = atomic_fetch_add_explicit(&src->uc, RTE_SRC_IN_PROGRESS, memory_order_acq_rel); + + /* Then we split the indicator to its parts. Remember, we got the value before the operation happened. */ + u64 pending = (uc >> RTE_SRC_PU_SHIFT) + 1; + uc &= RTE_SRC_IN_PROGRESS - 1; + + /* We per-use the RCU critical section indicator to make the prune event wait + * until we finish here in the rare case we get preempted. */ + rcu_read_lock(); + + /* Obviously, there can't be more pending unlocks than the usecount itself */ + if (uc == pending) + /* If we're the last unlocker, schedule the owner's prune event */ + ev_send(src->owner->list, src->owner->prune); + else + ASSERT_DIE(uc > pending); + + /* And now, finally, simultaneously pop the in-progress indicator and the + * usecount, possibly allowing the source pruning routine to free this structure */ + atomic_fetch_sub_explicit(&src->uc, RTE_SRC_IN_PROGRESS + 1, memory_order_acq_rel); + + /* ... and to reduce the load a bit, the source pruning routine will better wait for + * RCU synchronization instead of a busy loop. */ + rcu_read_unlock(); +} + +#ifdef RT_SOURCE_DEBUG +#undef rt_lock_source +#undef rt_unlock_source + +#define rt_lock_source(x) ( log(L_INFO "Lock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_lock_source_internal(x) ) +#define rt_unlock_source(x) ( log(L_INFO "Unlock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_unlock_source_internal(x) ) +#endif + +void rt_init_sources(struct rte_owner *, const char *name, event_list *list); +void rt_destroy_sources(struct rte_owner *, event *); + +/* + * Route Attributes + * + * Beware: All standard BGP attributes must be represented here instead + * of making them local to the route. This is needed to ensure proper + * construction of BGP route attribute lists. + */ + +/* Nexthop structure */ +struct nexthop { + ip_addr gw; /* Next hop */ + struct iface *iface; /* Outgoing interface */ + byte flags; + byte weight; + byte labels; /* Number of all labels */ + u32 label[0]; +}; + +/* For packing one into eattrs */ +struct nexthop_adata { + struct adata ad; + /* There is either a set of nexthops or a special destination (RTD_*) */ + union { + struct nexthop nh; + uint dest; + }; +}; + +#define NEXTHOP_DEST_SIZE (OFFSETOF(struct nexthop_adata, dest) + sizeof(uint) - OFFSETOF(struct adata, data)) +#define NEXTHOP_DEST_LITERAL(x) ((struct nexthop_adata) { \ + .ad.length = NEXTHOP_DEST_SIZE, .dest = (x), }) + +#define RNF_ONLINK 0x1 /* Gateway is onlink regardless of IP ranges */ + + +#define RTS_STATIC 1 /* Normal static route */ +#define RTS_INHERIT 2 /* Route inherited from kernel */ +#define RTS_DEVICE 3 /* Device route */ +#define RTS_STATIC_DEVICE 4 /* Static device route */ +#define RTS_REDIRECT 5 /* Learned via redirect */ +#define RTS_RIP 6 /* RIP route */ +#define RTS_OSPF 7 /* OSPF route */ +#define RTS_OSPF_IA 8 /* OSPF inter-area route */ +#define RTS_OSPF_EXT1 9 /* OSPF external route type 1 */ +#define RTS_OSPF_EXT2 10 /* OSPF external route type 2 */ +#define RTS_BGP 11 /* BGP route */ +#define RTS_PIPE 12 /* Inter-table wormhole */ +#define RTS_BABEL 13 /* Babel route */ +#define RTS_RPKI 14 /* Route Origin Authorization */ +#define RTS_PERF 15 /* Perf checker */ +#define RTS_MAX 16 + +#define RTD_NONE 0 /* Undefined next hop */ +#define RTD_UNICAST 1 /* A standard next hop */ +#define RTD_BLACKHOLE 2 /* Silently drop packets */ +#define RTD_UNREACHABLE 3 /* Reject as unreachable */ +#define RTD_PROHIBIT 4 /* Administratively prohibited */ +#define RTD_MAX 5 + +extern const char * rta_dest_names[RTD_MAX]; + +static inline const char *rta_dest_name(uint n) +{ return (n < RTD_MAX) ? rta_dest_names[n] : "???"; } + + +/* + * Extended Route Attributes + */ + +typedef struct eattr { + word id; /* EA_CODE(PROTOCOL_..., protocol-dependent ID) */ + byte flags; /* Protocol-dependent flags */ + byte type; /* Attribute type */ + byte rfu:5; + byte originated:1; /* The attribute has originated locally */ + byte fresh:1; /* An uncached attribute (e.g. modified in export filter) */ + byte undef:1; /* Explicitly undefined */ + + PADDING(unused, 3, 3); + + union bval u; +} eattr; + + +#define EA_CODE_MASK 0xffff +#define EA_ALLOW_UNDEF 0x10000 /* ea_find: allow EAF_TYPE_UNDEF */ +#define EA_BIT(n) ((n) << 24) /* Used in bitfield accessors */ +#define EA_BIT_GET(ea) ((ea) >> 24) + +typedef struct ea_list { + struct ea_list *next; /* In case we have an override list */ + byte flags; /* Flags: EALF_... */ + byte rfu; + word count; /* Number of attributes */ + eattr attrs[0]; /* Attribute definitions themselves */ +} ea_list; + +struct ea_storage { + struct ea_storage *next_hash; /* Next in hash chain */ + struct ea_storage **pprev_hash; /* Previous in hash chain */ + _Atomic u32 uc; /* Use count */ + u32 hash_key; /* List hash */ + ea_list l[0]; /* The list itself */ +}; + +#define EALF_SORTED 1 /* Attributes are sorted by code */ +#define EALF_BISECT 2 /* Use interval bisection for searching */ +#define EALF_CACHED 4 /* List is cached */ +#define EALF_HUGE 8 /* List is too big to fit into slab */ + +struct ea_class { +#define EA_CLASS_INSIDE \ + const char *name; /* Name (both print and filter) */ \ + struct symbol *sym; /* Symbol to export to configs */ \ + uint id; /* Autoassigned attribute ID */ \ + uint uc; /* Reference count */ \ + btype type; /* Data type ID */ \ + uint readonly:1; /* This attribute can't be changed by filters */ \ + uint conf:1; /* Requested by config */ \ + uint hidden:1; /* Technical attribute, do not show, do not expose to filters */ \ + void (*format)(const eattr *ea, byte *buf, uint size); \ + void (*stored)(const eattr *ea); /* When stored into global hash */ \ + void (*freed)(const eattr *ea); /* When released from global hash */ \ + + EA_CLASS_INSIDE; +}; + +struct ea_class_ref { + resource r; + struct ea_class *class; +}; + +void ea_register_init(struct ea_class *); +struct ea_class_ref *ea_register_alloc(pool *, struct ea_class); + +#define EA_REGISTER_ALL_HELPER(x) ea_register_init(x); +#define EA_REGISTER_ALL(...) MACRO_FOREACH(EA_REGISTER_ALL_HELPER, __VA_ARGS__) + +struct ea_class *ea_class_find_by_id(uint id); +struct ea_class *ea_class_find_by_name(const char *name); +static inline struct ea_class *ea_class_self(struct ea_class *self) { return self; } +#define ea_class_find(_arg) _Generic((_arg), \ + uint: ea_class_find_by_id, \ + word: ea_class_find_by_id, \ + char *: ea_class_find_by_name, \ + const char *: ea_class_find_by_name, \ + struct ea_class *: ea_class_self)(_arg) + +struct ea_walk_state { + ea_list *eattrs; /* Ccurrent ea_list, initially set by caller */ + eattr *ea; /* Current eattr, initially NULL */ + u32 visited[4]; /* Bitfield, limiting max to 128 */ +}; + +#define ea_find(_l, _arg) _Generic((_arg), uint: ea_find_by_id, struct ea_class *: ea_find_by_class, char *: ea_find_by_name)(_l, _arg) +eattr *ea_find_by_id(ea_list *, unsigned ea); +static inline eattr *ea_find_by_class(ea_list *l, const struct ea_class *def) +{ return ea_find_by_id(l, def->id); } +static inline eattr *ea_find_by_name(ea_list *l, const char *name) +{ + const struct ea_class *def = ea_class_find_by_name(name); + return def ? ea_find_by_class(l, def) : NULL; +} + +#define ea_get_int(_l, _ident, _def) ({ \ + struct ea_class *cls = ea_class_find((_ident)); \ + ASSERT_DIE(cls->type & EAF_EMBEDDED); \ + const eattr *ea = ea_find((_l), cls->id); \ + (ea ? ea->u.data : (_def)); \ + }) + +#define ea_get_ip(_l, _ident, _def) ({ \ + struct ea_class *cls = ea_class_find((_ident)); \ + ASSERT_DIE(cls->type == T_IP); \ + const eattr *ea = ea_find((_l), cls->id); \ + (ea ? *((const ip_addr *) ea->u.ptr->data) : (_def)); \ + }) + +eattr *ea_walk(struct ea_walk_state *s, uint id, uint max); +void ea_dump(ea_list *); +int ea_same(ea_list *x, ea_list *y); /* Test whether two ea_lists are identical */ +uint ea_hash(ea_list *e); /* Calculate 16-bit hash value */ +ea_list *ea_append(ea_list *to, ea_list *what); +void ea_format_bitfield(const struct eattr *a, byte *buf, int bufsize, const char **names, int min, int max); + +/* Normalize ea_list; allocates the result from tmp_linpool */ +ea_list *ea_normalize(ea_list *e, int overlay); + +uint ea_list_size(ea_list *); +void ea_list_copy(ea_list *dest, ea_list *src, uint size); + +#define EA_LOCAL_LIST(N) struct { ea_list l; eattr a[N]; } + +#define EA_LITERAL_EMBEDDED(_class, _flags, _val) ({ \ + btype _type = (_class)->type; \ + ASSERT_DIE(_type & EAF_EMBEDDED); \ + EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.i = _val); \ + }) + +#define EA_LITERAL_STORE_ADATA(_class, _flags, _buf, _len) ({ \ + btype _type = (_class)->type; \ + ASSERT_DIE(!(_type & EAF_EMBEDDED)); \ + EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.ad = tmp_store_adata((_buf), (_len))); \ + }) + +#define EA_LITERAL_DIRECT_ADATA(_class, _flags, _adata) ({ \ + btype _type = (_class)->type; \ + ASSERT_DIE(!(_type & EAF_EMBEDDED)); \ + EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.ad = _adata); \ + }) + +#define EA_LITERAL_GENERIC(_id, _type, _flags, ...) \ + ((eattr) { .id = _id, .type = _type, .flags = _flags, __VA_ARGS__ }) + +static inline eattr * +ea_set_attr(ea_list **to, eattr a) +{ + EA_LOCAL_LIST(1) *ea = tmp_alloc(sizeof(*ea)); + *ea = (typeof(*ea)) { + .l.flags = EALF_SORTED, + .l.count = 1, + .l.next = *to, + .a[0] = a, + }; + + *to = &ea->l; + return &ea->a[0]; +} + +static inline void +ea_unset_attr(ea_list **to, _Bool local, const struct ea_class *def) +{ + ea_set_attr(to, EA_LITERAL_GENERIC(def->id, 0, 0, + .fresh = local, .originated = local, .undef = 1)); +} + +static inline void +ea_set_attr_u32(ea_list **to, const struct ea_class *def, uint flags, u64 data) +{ ea_set_attr(to, EA_LITERAL_EMBEDDED(def, flags, data)); } + +static inline void +ea_set_attr_data(ea_list **to, const struct ea_class *def, uint flags, const void *data, uint len) +{ ea_set_attr(to, EA_LITERAL_STORE_ADATA(def, flags, data, len)); } + +static inline void +ea_copy_attr(ea_list **to, ea_list *from, const struct ea_class *def) +{ + eattr *e = ea_find_by_class(from, def); + if (e) + if (e->type & EAF_EMBEDDED) + ea_set_attr_u32(to, def, e->flags, e->u.data); + else + ea_set_attr_data(to, def, e->flags, e->u.ptr->data, e->u.ptr->length); + else + ea_unset_attr(to, 0, def); +} + +/* + * Common route attributes + */ + +/* Preference: first-order comparison */ +extern struct ea_class ea_gen_preference; +static inline u32 rt_get_preference(const rte *rt) +{ return ea_get_int(rt->attrs, &ea_gen_preference, 0); } + +/* IGP metric: second-order comparison */ +extern struct ea_class ea_gen_igp_metric; +u32 rt_get_igp_metric(const rte *rt); +#define IGP_METRIC_UNKNOWN 0x80000000 /* Default igp_metric used when no other + protocol-specific metric is availabe */ + +/* From: Advertising router */ +extern struct ea_class ea_gen_from; + +/* Source: An old method to devise the route source protocol and kind. + * To be superseded in a near future by something more informative. */ +extern struct ea_class ea_gen_source; +static inline u32 rt_get_source_attr(const rte *rt) +{ return ea_get_int(rt->attrs, &ea_gen_source, 0); } + +/* Flowspec validation result */ +enum flowspec_valid { + FLOWSPEC_UNKNOWN = 0, + FLOWSPEC_VALID = 1, + FLOWSPEC_INVALID = 2, + FLOWSPEC__MAX, +}; + +extern const char * flowspec_valid_names[FLOWSPEC__MAX]; +static inline const char *flowspec_valid_name(enum flowspec_valid v) +{ return (v < FLOWSPEC__MAX) ? flowspec_valid_names[v] : "???"; } + +extern struct ea_class ea_gen_flowspec_valid; +static inline enum flowspec_valid rt_get_flowspec_valid(rte *rt) +{ return ea_get_int(rt->attrs, &ea_gen_flowspec_valid, FLOWSPEC_UNKNOWN); } + +/* Next hop: For now, stored as adata */ +extern struct ea_class ea_gen_nexthop; + +static inline void ea_set_dest(struct ea_list **to, uint flags, uint dest) +{ + struct nexthop_adata nhad = NEXTHOP_DEST_LITERAL(dest); + ea_set_attr_data(to, &ea_gen_nexthop, flags, &nhad.ad.data, nhad.ad.length); +} + +/* Next hop structures */ + +#define NEXTHOP_ALIGNMENT (_Alignof(struct nexthop)) +#define NEXTHOP_MAX_SIZE (sizeof(struct nexthop) + sizeof(u32)*MPLS_MAX_LABEL_STACK) +#define NEXTHOP_SIZE(_nh) NEXTHOP_SIZE_CNT(((_nh)->labels)) +#define NEXTHOP_SIZE_CNT(cnt) BIRD_ALIGN((sizeof(struct nexthop) + sizeof(u32) * (cnt)), NEXTHOP_ALIGNMENT) +#define nexthop_size(nh) NEXTHOP_SIZE((nh)) + +#define NEXTHOP_NEXT(_nh) ((void *) (_nh) + NEXTHOP_SIZE(_nh)) +#define NEXTHOP_END(_nhad) ((_nhad)->ad.data + (_nhad)->ad.length) +#define NEXTHOP_VALID(_nh, _nhad) ((void *) (_nh) < (void *) NEXTHOP_END(_nhad)) +#define NEXTHOP_ONE(_nhad) (NEXTHOP_NEXT(&(_nhad)->nh) == NEXTHOP_END(_nhad)) + +#define NEXTHOP_WALK(_iter, _nhad) for ( \ + struct nexthop *_iter = &(_nhad)->nh; \ + (void *) _iter < (void *) NEXTHOP_END(_nhad); \ + _iter = NEXTHOP_NEXT(_iter)) + + +static inline int nexthop_same(struct nexthop_adata *x, struct nexthop_adata *y) +{ return adata_same(&x->ad, &y->ad); } +struct nexthop_adata *nexthop_merge(struct nexthop_adata *x, struct nexthop_adata *y, int max, linpool *lp); +struct nexthop_adata *nexthop_sort(struct nexthop_adata *x, linpool *lp); +int nexthop_is_sorted(struct nexthop_adata *x); + +#define NEXTHOP_IS_REACHABLE(nhad) ((nhad)->ad.length > NEXTHOP_DEST_SIZE) + +/* Route has regular, reachable nexthop (i.e. not RTD_UNREACHABLE and like) */ +static inline int rte_is_reachable(rte *r) +{ + eattr *nhea = ea_find(r->attrs, &ea_gen_nexthop); + if (!nhea) + return 0; + + struct nexthop_adata *nhad = (void *) nhea->u.ptr; + return NEXTHOP_IS_REACHABLE(nhad); +} + +static inline int nhea_dest(eattr *nhea) +{ + if (!nhea) + return RTD_NONE; + + struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + if (NEXTHOP_IS_REACHABLE(nhad)) + return RTD_UNICAST; + else + return nhad->dest; +} + +static inline int rte_dest(const rte *r) +{ + return nhea_dest(ea_find(r->attrs, &ea_gen_nexthop)); +} + +void rta_init(void); +ea_list *ea_lookup(ea_list *, int overlay); /* Get a cached (and normalized) variant of this attribute list */ +static inline int ea_is_cached(const ea_list *r) { return r->flags & EALF_CACHED; } +static inline struct ea_storage *ea_get_storage(ea_list *r) +{ + ASSERT_DIE(ea_is_cached(r)); + return SKIP_BACK(struct ea_storage, l[0], r); +} + +static inline ea_list *ea_clone(ea_list *r) { + ASSERT_DIE(0 < atomic_fetch_add_explicit(&ea_get_storage(r)->uc, 1, memory_order_acq_rel)); + return r; +} +void ea__free(struct ea_storage *r); +static inline void ea_free(ea_list *l) { + if (!l) return; + struct ea_storage *r = ea_get_storage(l); + if (1 == atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel)) ea__free(r); +} + +void ea_dump(ea_list *); +void ea_dump_all(void); +void ea_show_list(struct cli *, ea_list *); + +#define rta_lookup ea_lookup +#define rta_is_cached ea_is_cached +#define rta_clone ea_clone +#define rta_free ea_free + +#endif diff --git a/lib/settle.h b/lib/settle.h new file mode 100644 index 00000000..e721f21f --- /dev/null +++ b/lib/settle.h @@ -0,0 +1,64 @@ +/* + * BIRD -- Settle timer + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * (c) 2022 CZ.NIC z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_SETTLE_H_ +#define _BIRD_SETTLE_H_ + +#include "lib/birdlib.h" +#include "lib/timer.h" + +struct settle_config { + btime min, max; +}; + +struct settle { + union { + /* Timer hook polymorphism. */ + struct { + resource _r; + void (*hook)(struct settle *); + }; + timer tm; + }; + struct settle_config cf; + btime started; +}; + +STATIC_ASSERT(OFFSETOF(struct settle, hook) == OFFSETOF(struct settle, tm) + OFFSETOF(timer, hook)); + +#define SETTLE_INIT(_cfp, _hook, _data) (struct settle) { .tm = { .data = (_data), .hook = TYPE_CAST(void (*)(struct settle *), void (*)(struct timer *), (_hook)), }, .cf = ({ASSERT_DIE((_cfp)->min <= (_cfp)->max); *(_cfp); }), } + + +static inline void settle_init(struct settle *s, struct settle_config *cf, void (*hook)(struct settle *), void *data) +{ + *s = SETTLE_INIT(cf, hook, data); +} + +#define settle_active(s) tm_active(&(s)->tm) + +static inline void settle_kick(struct settle *s, struct birdloop *loop) +{ + if (!tm_active(&s->tm)) + { + s->started = current_time(); + tm_set_in(&s->tm, s->started + s->cf.min, loop); + } + else + { + btime now = current_time(); + tm_set_in(&s->tm, MIN_(now + s->cf.min, s->started + s->cf.max), loop); + } +} + +static inline void settle_cancel(struct settle *s) +{ + tm_stop(&s->tm); +} + +#endif @@ -41,7 +41,7 @@ #endif static void slab_free(resource *r); -static void slab_dump(resource *r); +static void slab_dump(resource *r, unsigned indent); static resource *slab_lookup(resource *r, unsigned long addr); static struct resmem slab_memsize(resource *r); @@ -118,7 +118,7 @@ slab_free(resource *r) } static void -slab_dump(resource *r) +slab_dump(resource *r, unsigned indent UNUSED) { slab *s = (slab *) r; int cnt = 0; @@ -197,7 +197,7 @@ static struct resclass sl_class = { slab_memsize }; -#define SL_GET_HEAD(x) ((struct sl_head *) (((uintptr_t) (x)) & ~(page_size-1))) +#define SL_GET_HEAD(x) PAGE_HEAD(x) #define SL_HEAD_CHANGE_STATE(_s, _h, _from, _to) ({ \ ASSERT_DIE(_h->state == slh_##_from); \ @@ -236,7 +236,7 @@ sl_new(pool *p, uint size) + sizeof(u32) * s->head_bitfield_len + align - 1) / align * align; - } while (s->objs_per_slab * size + s->head_size > page_size); + } while (s->objs_per_slab * size + s->head_size > (size_t) page_size); if (!s->objs_per_slab) bug("Slab: object too large"); @@ -378,7 +378,7 @@ slab_free(resource *r) } static void -slab_dump(resource *r) +slab_dump(resource *r, unsigned indent UNUSED) { slab *s = (slab *) r; int ec=0, pc=0, fc=0; @@ -390,6 +390,15 @@ slab_dump(resource *r) WALK_TLIST(sl_head, h, &s->full_heads) fc++; debug("(%de+%dp+%df blocks per %d objs per %d bytes)\n", ec, pc, fc, s->objs_per_slab, s->obj_size); + + char x[16]; + bsprintf(x, "%%%ds%%s %%p\n", indent + 2); + WALK_TLIST(sl_head, h, &s->full_heads) + debug(x, "", "full", h); + WALK_TLIST(sl_head, h, &s->partial_heads) + debug(x, "", "partial", h); + WALK_TLIST(sl_head, h, &s->empty_heads) + debug(x, "", "empty", h); } static struct resmem diff --git a/lib/socket.h b/lib/socket.h index 0b6ac589..4b169581 100644 --- a/lib/socket.h +++ b/lib/socket.h @@ -12,6 +12,7 @@ #include <errno.h> #include "lib/resource.h" +#include "lib/event.h" #ifdef HAVE_LIBSSH #define LIBSSH_LEGACY_0_4 #include <libssh/libssh.h> @@ -79,16 +80,22 @@ typedef struct birdsock { const char *password; /* Password for MD5 authentication */ const char *err; /* Error message */ struct ssh_sock *ssh; /* Used in SK_SSH */ + struct birdloop *loop; /* BIRDLoop owning this socket */ } sock; sock *sock_new(pool *); /* Allocate new socket */ #define sk_new(X) sock_new(X) /* Wrapper to avoid name collision with OpenSSL */ -int sk_open(sock *); /* Open socket */ +int sk_open(sock *, struct birdloop *); /* Open socket */ +void sk_reloop(sock *, struct birdloop *); /* Move socket to another loop. Both loops must be locked. */ + int sk_rx_ready(sock *s); +_Bool sk_tx_pending(sock *s); int sk_send(sock *, uint len); /* Send data, <0=err, >0=ok, 0=sleep */ int sk_send_to(sock *, uint len, ip_addr to, uint port); /* sk_send to given destination */ void sk_reallocate(sock *); /* Free and allocate tbuf & rbuf */ +void sk_pause_rx(struct birdloop *loop, sock *s); +void sk_resume_rx(struct birdloop *loop, sock *s, int (*hook)(sock *, uint)); void sk_set_rbsize(sock *s, uint val); /* Resize RX buffer */ void sk_set_tbsize(sock *s, uint val); /* Resize TX buffer, keeping content */ void sk_set_tbuf(sock *s, void *tbuf); /* Switch TX buffer, NULL-> return to internal */ @@ -112,6 +119,7 @@ int sk_set_icmp6_filter(sock *s, int p1, int p2); void sk_log_error(sock *s, const char *p); byte * sk_rx_buffer(sock *s, int *len); /* Temporary */ +sock *sk_next(sock *s); extern int sk_priority_control; /* Suggested priority for control traffic, should be sysdep define */ @@ -125,7 +133,6 @@ extern int sk_priority_control; /* Suggested priority for control traffic, shou #define SKF_HIGH_PORT 0x20 /* Choose port from high range if possible */ #define SKF_FREEBIND 0x40 /* Allow socket to bind to a nonlocal address */ -#define SKF_THREAD 0x100 /* Socked used in thread, Do not add to main loop */ #define SKF_TRUNCATED 0x200 /* Received packet was truncated, set by IO layer */ #define SKF_HDRINCL 0x400 /* Used internally */ #define SKF_PKTINFO 0x800 /* Used internally */ diff --git a/lib/timer.c b/lib/timer.c index c47e0bbc..4e4aa55e 100644 --- a/lib/timer.c +++ b/lib/timer.c @@ -36,57 +36,13 @@ #include "lib/resource.h" #include "lib/timer.h" - -struct timeloop main_timeloop; - - -#ifdef USE_PTHREADS - #include <pthread.h> -/* Data accessed and modified from proto/bfd/io.c */ -pthread_key_t current_time_key; - -static inline struct timeloop * -timeloop_current(void) -{ - return pthread_getspecific(current_time_key); -} - -static inline void -timeloop_init_current(void) -{ - pthread_key_create(¤t_time_key, NULL); - pthread_setspecific(current_time_key, &main_timeloop); -} +_Atomic btime last_time; +_Atomic btime real_time; void wakeup_kick_current(void); -#else - -/* Just use main timelooop */ -static inline struct timeloop * timeloop_current(void) { return &main_timeloop; } -static inline void timeloop_init_current(void) { } - -#endif - -btime -current_time(void) -{ - return timeloop_current()->last_time; -} - -btime -current_real_time(void) -{ - struct timeloop *loop = timeloop_current(); - - if (!loop->real_time) - times_update_real_time(loop); - - return loop->real_time; -} - #define TIMER_LESS(a,b) ((a)->expires < (b)->expires) #define TIMER_SWAP(heap,a,b,t) (t = heap[a], heap[a] = heap[b], heap[b] = t, \ @@ -102,7 +58,7 @@ tm_free(resource *r) } static void -tm_dump(resource *r) +tm_dump(resource *r, unsigned indent UNUSED) { timer *t = (void *) r; @@ -112,7 +68,7 @@ tm_dump(resource *r) if (t->recurrent) debug("recur %d, ", t->recurrent); if (t->expires) - debug("expires in %d ms)\n", (t->expires - current_time()) TO_MS); + debug("in loop %p expires in %d ms)\n", t->loop, (t->expires - current_time()) TO_MS); else debug("inactive)\n"); } @@ -135,41 +91,40 @@ tm_new(pool *p) return t; } -void -tm_set(timer *t, btime when) +static void +tm_set_in_tl(timer *t, btime when, struct timeloop *local_timeloop) { - struct timeloop *loop = timeloop_current(); - uint tc = timers_count(loop); + uint tc = timers_count(local_timeloop); if (!t->expires) { t->index = ++tc; t->expires = when; - BUFFER_PUSH(loop->timers) = t; - HEAP_INSERT(loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP); + BUFFER_PUSH(local_timeloop->timers) = t; + HEAP_INSERT(local_timeloop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP); } else if (t->expires < when) { t->expires = when; - HEAP_INCREASE(loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); + HEAP_INCREASE(local_timeloop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); } else if (t->expires > when) { t->expires = when; - HEAP_DECREASE(loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); + HEAP_DECREASE(local_timeloop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); } -#ifdef CONFIG_BFD - /* Hack to notify BFD loops */ - if ((loop != &main_timeloop) && (t->index == 1)) - wakeup_kick_current(); -#endif + t->loop = local_timeloop; + + if (t->index == 1) + birdloop_ping(local_timeloop->loop); } void -tm_start(timer *t, btime after) +tm_set_in(timer *t, btime when, struct birdloop *loop) { - tm_set(t, current_time() + MAX(after, 0)); + ASSERT_DIE(birdloop_inside(loop)); + tm_set_in_tl(t, when, birdloop_time_loop(loop)); } void @@ -178,20 +133,22 @@ tm_stop(timer *t) if (!t->expires) return; - struct timeloop *loop = timeloop_current(); - uint tc = timers_count(loop); + TLOCK_TIMER_ASSERT(t->loop); - HEAP_DELETE(loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); - BUFFER_POP(loop->timers); + uint tc = timers_count(t->loop); + + HEAP_DELETE(t->loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); + BUFFER_POP(t->loop->timers); t->index = -1; t->expires = 0; + t->loop = NULL; } void timers_init(struct timeloop *loop, pool *p) { - times_init(loop); + TLOCK_TIMER_ASSERT(loop); BUFFER_INIT(loop->timers, p, 4); BUFFER_PUSH(loop->timers) = NULL; @@ -200,13 +157,15 @@ timers_init(struct timeloop *loop, pool *p) void io_log_event(void *hook, void *data); void -timers_fire(struct timeloop *loop) +timers_fire(struct timeloop *loop, int io_log) { + TLOCK_TIMER_ASSERT(loop); + btime base_time; timer *t; - times_update(loop); - base_time = loop->last_time; + times_update(); + base_time = current_time(); while (t = timers_first(loop)) { @@ -217,19 +176,19 @@ timers_fire(struct timeloop *loop) { btime when = t->expires + t->recurrent; - if (when <= loop->last_time) - when = loop->last_time + t->recurrent; + if (when <= base_time) + when = base_time + t->recurrent; if (t->randomize) when += random() % (t->randomize + 1); - tm_set(t, when); + tm_set_in_tl(t, when, loop); } else tm_stop(t); /* This is ugly hack, we want to log just timers executed from the main I/O loop */ - if (loop == &main_timeloop) + if (io_log) io_log_event(t->hook, t->data); t->hook(t); @@ -237,13 +196,6 @@ timers_fire(struct timeloop *loop) } } -void -timer_init(void) -{ - timers_init(&main_timeloop, &root_pool); - timeloop_init_current(); -} - /** * tm_parse_time - parse a date and time diff --git a/lib/timer.h b/lib/timer.h index c5ea430c..4a3a2108 100644 --- a/lib/timer.h +++ b/lib/timer.h @@ -12,8 +12,14 @@ #include "nest/bird.h" #include "lib/buffer.h" +#include "lib/io-loop.h" +#include "lib/locking.h" #include "lib/resource.h" +#include <stdatomic.h> + +extern _Atomic btime last_time; +extern _Atomic btime real_time; typedef struct timer { @@ -25,36 +31,42 @@ typedef struct timer uint randomize; /* Amount of randomization */ uint recurrent; /* Timer recurrence */ + struct timeloop *loop; /* Loop where the timer is active */ + int index; } timer; struct timeloop { BUFFER_(timer *) timers; - btime last_time; - btime real_time; + struct domain_generic *domain; + struct birdloop *loop; }; +#define TLOCK_TIMER_ASSERT(loop) ASSERT_DIE((loop)->domain && DG_IS_LOCKED((loop)->domain)) +#define TLOCK_LOCAL_ASSERT(loop) ASSERT_DIE(!(loop)->domain || DG_IS_LOCKED((loop)->domain)) + static inline uint timers_count(struct timeloop *loop) -{ return loop->timers.used - 1; } +{ TLOCK_TIMER_ASSERT(loop); return loop->timers.used - 1; } static inline timer *timers_first(struct timeloop *loop) -{ return (loop->timers.used > 1) ? loop->timers.data[1] : NULL; } - -extern struct timeloop main_timeloop; +{ TLOCK_TIMER_ASSERT(loop); return (loop->timers.used > 1) ? loop->timers.data[1] : NULL; } -btime current_time(void); -btime current_real_time(void); +#define current_time() atomic_load_explicit(&last_time, memory_order_acquire) +#define current_real_time() atomic_load_explicit(&real_time, memory_order_acquire) //#define now (current_time() TO_S) //#define now_real (current_real_time() TO_S) extern btime boot_time; timer *tm_new(pool *p); -void tm_set(timer *t, btime when); -void tm_start(timer *t, btime after); +#define tm_set(t, when) tm_set_in((t), (when), &main_birdloop) +#define tm_start(t, after) tm_start_in((t), (after), &main_birdloop) void tm_stop(timer *t); +void tm_set_in(timer *t, btime when, struct birdloop *loop); +#define tm_start_in(t, after, loop) tm_set_in((t), (current_time() + MAX_((after), 0)), loop) + static inline int tm_active(timer *t) { @@ -87,22 +99,20 @@ tm_set_max(timer *t, btime when) } static inline void -tm_start_max(timer *t, btime after) +tm_start_max_in(timer *t, btime after, struct birdloop *loop) { btime rem = tm_remains(t); - tm_start(t, MAX_(rem, after)); + tm_start_in(t, MAX_(rem, after), loop); } +#define tm_start_max(t, after) tm_start_max_in(t, after, &main_birdloop) + /* In sysdep code */ -void times_init(struct timeloop *loop); -void times_update(struct timeloop *loop); -void times_update_real_time(struct timeloop *loop); +void times_update(void); /* For I/O loop */ void timers_init(struct timeloop *loop, pool *p); -void timers_fire(struct timeloop *loop); - -void timer_init(void); +void timers_fire(struct timeloop *loop, int io_log); struct timeformat { diff --git a/lib/tlists.h b/lib/tlists.h index e1ed79ea..1437e17e 100644 --- a/lib/tlists.h +++ b/lib/tlists.h @@ -147,9 +147,14 @@ static inline void TLIST_NAME(rem_node)(TLIST_LIST_STRUCT *list, TLIST_TYPE *nod #error "You should first include lib/tlists.h without requesting a TLIST" #endif -#define TLIST_NODE(_name, _type) struct _name##_node { _type *next; _type *prev; } +#define TLIST_NODE_CONTENTS(_type) { _type *next; _type *prev; } +#define TLIST_NODE(_name, _type) struct _name##_node TLIST_NODE_CONTENTS(_type) +#define TLIST_DEFAULT_NODE struct MACRO_CONCAT_AFTER(TLIST_PREFIX,_node) \ + TLIST_NODE_CONTENTS(TLIST_TYPE) TLIST_ITEM + #define TLIST_LIST(_name) struct _name##_list + /* Use ->first and ->last to access HEAD and TAIL */ #define THEAD(_name, _list) (_list)->first #define TTAIL(_name, _list) (_list)->last diff --git a/lib/type.h b/lib/type.h new file mode 100644 index 00000000..b54744c1 --- /dev/null +++ b/lib/type.h @@ -0,0 +1,112 @@ +/* + * BIRD Internet Routing Daemon -- Internal Data Types + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_TYPE_H_ +#define _BIRD_TYPE_H_ + +#include "lib/birdlib.h" +#include "lib/attrs.h" + +union bval { +#define BVAL_ITEMS \ + struct { \ + u32 data; /* Integer type inherited from eattrs */ \ + PADDING(data, 0, 4); /* Must be padded on 64-bits */ \ + }; \ + struct { \ + u32 i; /* Integer type inherited from filters */ \ + PADDING(i, 0, 4); /* Must be padded on 64-bits */ \ + }; \ + const struct adata *ptr; /* Generic attribute data inherited from eattrs */ \ + const struct adata *ad; /* Generic attribute data inherited from filters */ \ + + BVAL_ITEMS; +}; + +union bval_long { + union bval bval; /* For direct assignments */ + BVAL_ITEMS; /* For item-wise access */ + + u64 ec; + lcomm lc; + ip_addr ip; + const net_addr *net; + const char *s; + const struct f_tree *t; + const struct f_trie *ti; + const struct f_path_mask *path_mask; + struct f_path_mask_item pmi; +}; + + +/* Internal types */ +enum btype { +/* Nothing. Simply nothing. */ + T_VOID = 0, + +/* Something but inaccessible. */ + T_OPAQUE = 0x02, /* Opaque byte string (not filterable) */ + T_IFACE = 0x0c, /* Pointer to an interface (inside adata) */ + T_NEXTHOP_LIST = 0x2c, /* The whole nexthop block */ + T_HOSTENTRY = 0x2e, /* Hostentry with possible MPLS labels */ + +/* Types shared with eattrs */ + T_INT = 0x01, /* 32-bit unsigned integer number */ + T_IP = 0x04, /* IP address */ + T_QUAD = 0x05, /* Router ID (IPv4 address) */ + T_PATH = 0x06, /* BGP AS path (encoding per RFC 1771:4.3) */ + T_CLIST = 0x0a, /* Set of u32's (e.g., a community list) */ + T_ECLIST = 0x0e, /* Set of pairs of u32's - ext. community list */ + T_LCLIST = 0x08, /* Set of triplets of u32's - large community list */ + + T_ENUM_BGP_ORIGIN = 0x11, /* BGP Origin enum */ + T_ENUM_RA_PREFERENCE = 0x13, /* RA Preference enum */ + T_ENUM_FLOWSPEC_VALID = 0x15, /* Flowspec validation result */ + +#define EAF_TYPE__MAX 0x1f +#define EAF_EMBEDDED 0x01 /* Data stored in eattr.u.data (part of type spec) */ + /* Otherwise, attribute data is adata */ + +/* Other user visible types which fit in int */ + T_BOOL = 0xa0, + T_PAIR = 0xa4, /* Notice that pair is stored as integer: first << 16 | second */ + +/* Put enumerational types in 0x20..0x3f range */ + T_ENUM_LO = 0x10, + T_ENUM_HI = 0x3f, + + T_ENUM_RTS = 0x31, + T_ENUM_SCOPE = 0x33, + T_ENUM_RTD = 0x37, + T_ENUM_ROA = 0x39, + T_ENUM_NETTYPE = 0x3b, + T_ENUM_AF = 0x3d, + +/* new enums go here */ + +#define T_ENUM T_ENUM_LO ... T_ENUM_HI + +/* Bigger ones */ + T_NET = 0xb0, + T_STRING = 0xb4, + T_PATH_MASK = 0xb8, /* mask for BGP path */ + T_EC = 0xbc, /* Extended community value, u64 */ + T_LC = 0xc0, /* Large community value, lcomm */ + T_RD = 0xc4, /* Route distinguisher for VPN addresses */ + T_PATH_MASK_ITEM = 0xc8, /* Path mask item for path mask constructors */ + + T_SET = 0x80, + T_PREFIX_SET = 0x84, +} PACKED; + +typedef enum btype btype; + +STATIC_ASSERT(sizeof(btype) == sizeof(byte)); + + +#endif diff --git a/lib/type_test.c b/lib/type_test.c new file mode 100644 index 00000000..b526db69 --- /dev/null +++ b/lib/type_test.c @@ -0,0 +1,79 @@ +/* + * BIRD Library -- Data Type Alignment Tests + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include "test/birdtest.h" +#include "lib/type.h" +#include "lib/route.h" + +#define CHECK_ONE(val) \ + for (uint i=0; i<sizeof(val); i++) \ + bt_assert(((const u8 *) &val)[i] == (u8) ~0); + +#define SET_PADDING(val, name) \ + for (uint i=0; i<sizeof(val.PADDING_NAME(name)); i++) \ + val.PADDING_NAME(name)[i] = ~0; + + +static int +t_bval(void) +{ + union bval v; + + memset(&v, 0, sizeof(v)); + v.data = ~0; + SET_PADDING(v, data); + CHECK_ONE(v); + + memset(&v, 0, sizeof(v)); + v.i = ~0; + SET_PADDING(v, i); + CHECK_ONE(v); + + memset(&v, 0, sizeof(v)); + v.ptr = (void *) ~0; + CHECK_ONE(v); + + memset(&v, 0, sizeof(v)); + v.ad = (void *) ~0; + CHECK_ONE(v); + + return 1; +} + +static int +t_eattr(void) +{ + struct eattr e; + memset(&e, 0, sizeof(e)); + + e.id = ~0; + e.flags = ~0; + e.type = ~0; + e.rfu = ~0; + e.originated = ~0; + e.fresh = ~0; + e.undef = ~0; + memset(&e.u, ~0, sizeof(e.u)); /* Assumes t_bval passed */ + + SET_PADDING(e, unused); + + CHECK_ONE(e); + + return 1; +} + + +int main(int argc, char *argv[]) +{ + bt_init(argc, argv); + + bt_test_suite(t_bval, "Structure alignment test: bval"); + bt_test_suite(t_eattr, "Structure alignment test: eattr"); + + return bt_exit_value(); +} diff --git a/nest/Makefile b/nest/Makefile index 163a1199..1a71c2fb 100644 --- a/nest/Makefile +++ b/nest/Makefile @@ -1,14 +1,18 @@ -src := a-path.c a-set.c cli.c cmds.c iface.c locks.c neighbor.c password.c proto.c proto-build.c rt-attr.c rt-dev.c rt-fib.c rt-show.c rt-table.c +src := cli.c cmds.c iface.c locks.c neighbor.c password.c proto.c proto-build.c rt-attr.c rt-dev.c rt-fib.c rt-show.c rt-table.c obj := $(src-o-files) $(all-daemon) $(cf-local) $(o)proto-build.c: Makefile $(lastword $(MAKEFILE_LIST)) $(objdir)/.dir-stamp $(E)echo GEN $@ - $(Q)echo "$(patsubst %,void %_build(void); ,$(PROTO_BUILD)) void protos_build_gen(void) { $(patsubst %, %_build(); ,$(PROTO_BUILD))}" > $@ + $(Q)echo "#include \"lib/birdlib.h\"" > $@ + $(Q)$(patsubst %,echo 'void %_build(void);' >> $@;,$(PROTO_BUILD)) + $(Q)echo "void protos_build_gen(void) {" >> $@ + $(Q)$(patsubst %,echo ' %_build();'>>$@;,$(PROTO_BUILD)) + $(Q)echo "}" >> $@ prepare: $(o)proto-build.c -tests_src := a-set_test.c a-path_test.c +tests_src := tests_targets := $(tests_targets) $(tests-target-files) tests_objs := $(tests_objs) $(src-o-files) @@ -35,6 +35,7 @@ struct bfd_request { void (*hook)(struct bfd_request *); void *data; + struct birdloop *target; struct bfd_session *session; @@ -57,14 +58,14 @@ static inline struct bfd_options * bfd_new_options(void) #ifdef CONFIG_BFD -struct bfd_request * bfd_request_session(pool *p, ip_addr addr, ip_addr local, struct iface *iface, struct iface *vrf, void (*hook)(struct bfd_request *), void *data, const struct bfd_options *opts); +struct bfd_request * bfd_request_session(pool *p, ip_addr addr, ip_addr local, struct iface *iface, struct iface *vrf, void (*hook)(struct bfd_request *), void *data, struct birdloop *target, const struct bfd_options *opts); void bfd_update_request(struct bfd_request *req, const struct bfd_options *opts); static inline void cf_check_bfd(int use UNUSED) { } #else -static inline struct bfd_request * bfd_request_session(pool *p UNUSED, ip_addr addr UNUSED, ip_addr local UNUSED, struct iface *iface UNUSED, struct iface *vrf UNUSED, void (*hook)(struct bfd_request *) UNUSED, void *data UNUSED, const struct bfd_options *opts UNUSED) { return NULL; } +static inline struct bfd_request * bfd_request_session(pool *p UNUSED, ip_addr addr UNUSED, ip_addr local UNUSED, struct iface *iface UNUSED, struct iface *vrf UNUSED, void (*hook)(struct bfd_request *) UNUSED, void *data UNUSED, struct birdloop *target UNUSED, const struct bfd_options *opts UNUSED) { return NULL; } static inline void bfd_update_request(struct bfd_request *req UNUSED, const struct bfd_options *opts UNUSED) { }; static inline void cf_check_bfd(int use) { if (use) cf_error("BFD not available"); } diff --git a/nest/bird.h b/nest/bird.h index 55712abe..931974a0 100644 --- a/nest/bird.h +++ b/nest/bird.h @@ -9,7 +9,6 @@ #ifndef _BIRD_BIRD_H_ #define _BIRD_BIRD_H_ -#include "sysdep/config.h" #include "lib/birdlib.h" #include "lib/ip.h" #include "lib/net.h" @@ -302,24 +302,24 @@ cli_event(void *data) cli_command(c); } - cli_write_trigger(c); + if (c->tx_pos) + cli_write_trigger(c); } cli * -cli_new(void *priv) +cli_new(struct birdsock *sock) { pool *p = rp_new(cli_pool, "CLI"); cli *c = mb_alloc(p, sizeof(cli)); bzero(c, sizeof(cli)); c->pool = p; - c->priv = priv; + c->sock = sock; c->event = ev_new(p); c->event->hook = cli_event; c->event->data = c; c->cont = cli_hello; c->parser_pool = lp_new_default(c->pool); - c->show_pool = lp_new_default(c->pool); c->rx_buf = mb_alloc(c->pool, CLI_RX_BUF_SIZE); ev_schedule(c->event); return c; @@ -409,11 +409,19 @@ void cli_free(cli *c) { cli_set_log_echo(c, 0, 0); + int defer = 0; if (c->cleanup) - c->cleanup(c); + defer = c->cleanup(c); if (c == cmd_reconfig_stored_cli) cmd_reconfig_stored_cli = NULL; - rfree(c->pool); + + if (defer) + { + rfree(c->sock); + c->sock = NULL; + } + else + rfree(c->pool); } /** @@ -28,17 +28,17 @@ struct cli_out { typedef struct cli { node n; /* Node in list of all log hooks */ pool *pool; - void *priv; /* Private to sysdep layer */ + struct birdsock *sock; /* Underlying socket */ byte *rx_buf, *rx_pos; /* sysdep */ struct cli_out *tx_buf, *tx_pos, *tx_write; event *event; void (*cont)(struct cli *c); - void (*cleanup)(struct cli *c); + int (*cleanup)(struct cli *c); /* Return 0 if finished and cli may be freed immediately. + Otherwise return 1 and call rfree(c->pool) when appropriate. */ void *rover; /* Private to continuation routine */ int last_reply; int restricted; /* CLI is restricted to read-only commands */ struct linpool *parser_pool; /* Pool used during parsing */ - struct linpool *show_pool; /* Pool used during route show */ byte *ring_buf; /* Ring buffer for asynchronous messages */ byte *ring_end, *ring_read, *ring_write; /* Pointers to the ring buffer */ uint ring_overflow; /* Counter of ring overflows */ @@ -63,7 +63,7 @@ static inline void cli_separator(cli *c) /* Functions provided to sysdep layer */ -cli *cli_new(void *); +cli *cli_new(struct birdsock *); void cli_init(void); void cli_free(cli *); void cli_kick(cli *); diff --git a/nest/cmds.c b/nest/cmds.c index bcc8d6c2..6717be0c 100644 --- a/nest/cmds.c +++ b/nest/cmds.c @@ -8,7 +8,7 @@ #include "nest/bird.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "conf/conf.h" #include "nest/cmds.h" @@ -51,17 +51,18 @@ cmd_show_symbols(struct sym_show_data *sd) cli_msg(1010, "%-8s\t%s", sd->sym->name, cf_symbol_class_name(sd->sym)); else { - HASH_WALK(config->sym_hash, next, sym) - { - if (!sym->scope->active) - continue; + for (const struct sym_scope *scope = config->root_scope; scope; scope = scope->next) + HASH_WALK(scope->hash, next, sym) + { + if (!sym->scope->active) + continue; - if (sd->type && (sym->class != sd->type)) - continue; + if (sd->type && (sym->class != sd->type)) + continue; - cli_msg(-1010, "%-8s\t%s", sym->name, cf_symbol_class_name(sym)); - } - HASH_WALK_END; + cli_msg(-1010, "%-8s\t%s", sym->name, cf_symbol_class_name(sym)); + } + HASH_WALK_END; cli_msg(0, ""); } @@ -108,7 +109,6 @@ print_size(char *dsc, struct resmem vals) extern pool *rt_table_pool; extern pool *rta_pool; -extern uint *pages_kept; void cmd_show_memory(void) @@ -121,8 +121,10 @@ cmd_show_memory(void) print_size("Current config:", rmemsize(config_pool)); struct resmem total = rmemsize(&root_pool); #ifdef HAVE_MMAP - print_size("Standby memory:", (struct resmem) { .overhead = page_size * *pages_kept }); - total.overhead += page_size * *pages_kept; + int pk = atomic_load_explicit(&pages_kept, memory_order_relaxed) + + atomic_load_explicit(&pages_kept_locally, memory_order_relaxed); + print_size("Standby memory:", (struct resmem) { .overhead = page_size * pk }); + total.overhead += page_size * pk; #endif print_size("Total:", total); cli_msg(0, ""); @@ -134,7 +136,7 @@ cmd_eval(const struct f_line *expr) buffer buf; LOG_BUFFER_INIT(buf); - if (f_eval_buf(expr, this_cli->parser_pool, &buf) > F_RETURN) + if (f_eval_buf(expr, &buf) > F_RETURN) { cli_msg(8008, "runtime error"); return; diff --git a/nest/config.Y b/nest/config.Y index b2aa0906..8c3ee8ad 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -112,20 +112,20 @@ proto_postconfig(void) CF_DECLS -CF_KEYWORDS(ROUTER, ID, HOSTNAME, PROTOCOL, TEMPLATE, PREFERENCE, DISABLED, DEBUG, ALL, OFF, DIRECT) +CF_KEYWORDS(ROUTER, ID, HOSTNAME, PROTOCOL, TEMPLATE, PREFERENCE, DISABLED, DEBUG, ALL, OFF, DIRECT, PIPE) CF_KEYWORDS(INTERFACE, IMPORT, EXPORT, FILTER, NONE, VRF, DEFAULT, TABLE, STATES, ROUTES, FILTERS) CF_KEYWORDS(IPV4, IPV6, VPN4, VPN6, ROA4, ROA6, FLOW4, FLOW6, SADR, MPLS) CF_KEYWORDS(RECEIVE, LIMIT, ACTION, WARN, BLOCK, RESTART, DISABLE, KEEP, FILTERED, RPKI) CF_KEYWORDS(PASSWORD, KEY, FROM, PASSIVE, TO, ID, EVENTS, PACKETS, PROTOCOLS, CHANNELS, INTERFACES) CF_KEYWORDS(ALGORITHM, KEYED, HMAC, MD5, SHA1, SHA256, SHA384, SHA512, BLAKE2S128, BLAKE2S256, BLAKE2B256, BLAKE2B512) -CF_KEYWORDS(PRIMARY, STATS, COUNT, BY, FOR, IN, COMMANDS, PREEXPORT, NOEXPORT, EXPORTED, GENERATE) +CF_KEYWORDS(PRIMARY, STATS, COUNT, FOR, IN, COMMANDS, PREEXPORT, NOEXPORT, EXPORTED, GENERATE) CF_KEYWORDS(BGP, PASSWORDS, DESCRIPTION) -CF_KEYWORDS(RELOAD, IN, OUT, MRTDUMP, MESSAGES, RESTRICT, MEMORY, IGP_METRIC, CLASS, DSCP) +CF_KEYWORDS(RELOAD, IN, OUT, MRTDUMP, MESSAGES, RESTRICT, MEMORY, CLASS, DSCP) CF_KEYWORDS(TIMEFORMAT, ISO, SHORT, LONG, ROUTE, PROTOCOL, BASE, LOG, S, MS, US) -CF_KEYWORDS(GRACEFUL, RESTART, WAIT, MAX, FLUSH, AS) +CF_KEYWORDS(GRACEFUL, RESTART, WAIT, MAX, AS) CF_KEYWORDS(MIN, IDLE, RX, TX, INTERVAL, MULTIPLIER, PASSIVE) CF_KEYWORDS(CHECK, LINK) -CF_KEYWORDS(SORTED, TRIE, MIN, MAX, SETTLE, TIME, GC, THRESHOLD, PERIOD) +CF_KEYWORDS(CORK, SORTED, TRIE, MIN, MAX, ROA, ROUTE, REFRESH, SETTLE, TIME, GC, THRESHOLD, PERIOD) /* For r_args_channel */ CF_KEYWORDS(IPV4, IPV4_MC, IPV4_MPLS, IPV6, IPV6_MC, IPV6_MPLS, IPV6_SADR, VPN4, VPN4_MC, VPN4_MPLS, VPN6, VPN6_MC, VPN6_MPLS, ROA4, ROA6, FLOW4, FLOW6, MPLS, PRI, SEC) @@ -133,7 +133,7 @@ CF_KEYWORDS(IPV4, IPV4_MC, IPV4_MPLS, IPV6, IPV6_MC, IPV6_MPLS, IPV6_SADR, VPN4, CF_ENUM(T_ENUM_RTS, RTS_, STATIC, INHERIT, DEVICE, STATIC_DEVICE, REDIRECT, RIP, OSPF, OSPF_IA, OSPF_EXT1, OSPF_EXT2, BGP, PIPE, BABEL) CF_ENUM(T_ENUM_SCOPE, SCOPE_, HOST, LINK, SITE, ORGANIZATION, UNIVERSE, UNDEFINED) -CF_ENUM(T_ENUM_RTD, RTD_, UNICAST, BLACKHOLE, UNREACHABLE, PROHIBIT) +CF_ENUM(T_ENUM_RTD, RTD_, BLACKHOLE, UNREACHABLE, PROHIBIT) CF_ENUM(T_ENUM_ROA, ROA_, UNKNOWN, VALID, INVALID) CF_ENUM_PX(T_ENUM_AF, AF_, AFI_, IPV4, IPV6) @@ -227,10 +227,15 @@ table_opt: cf_error("Trie option not supported for %s table", net_label[this_table->addr_type]); this_table->trie_used = $2; } - | MIN SETTLE TIME expr_us { this_table->min_settle_time = $4; } - | MAX SETTLE TIME expr_us { this_table->max_settle_time = $4; } | GC THRESHOLD expr { this_table->gc_threshold = $3; } | GC PERIOD expr_us { this_table->gc_period = (uint) $3; if ($3 > 3600 S_) cf_error("GC period must be at most 3600 s"); } + | CORK THRESHOLD expr expr { + if ($3 > $4) cf_error("Cork low threshold must be lower than the high threshold."); + this_table->cork_threshold.low = $3; + this_table->cork_threshold.high = $4; } + | EXPORT SETTLE TIME settle { this_table->export_settle = $4; } + | ROUTE REFRESH EXPORT SETTLE TIME settle { this_table->export_rr_settle = $6; } + | DEBUG bool { this_table->debug = $2; } ; table_opts: @@ -291,8 +296,8 @@ proto_item: | MRTDUMP mrtdump_mask { this_proto->mrtdump = $2; } | ROUTER ID idval { this_proto->router_id = $3; } | DESCRIPTION text { this_proto->dsc = $2; } - | VRF text { this_proto->vrf = if_get_by_name($2); this_proto->vrf_set = 1; } - | VRF DEFAULT { this_proto->vrf = NULL; this_proto->vrf_set = 1; } + | VRF text { this_proto->vrf = if_get_by_name($2); } + | VRF DEFAULT { this_proto->vrf = &default_vrf; } ; @@ -308,12 +313,26 @@ channel_item_: this_channel->table = $2; } | IMPORT imexport { this_channel->in_filter = $2; } + | EXPORT IN net_any imexport { + if (this_channel->net_type && ($3->type != this_channel->net_type)) + cf_error("Incompatible export prefilter type"); + this_channel->out_subprefix = $3; + this_channel->out_filter = $4; + } | EXPORT imexport { this_channel->out_filter = $2; } | RECEIVE LIMIT limit_spec { this_channel->rx_limit = $3; } | IMPORT LIMIT limit_spec { this_channel->in_limit = $3; } | EXPORT LIMIT limit_spec { this_channel->out_limit = $3; } + | ROA SETTLE TIME settle { this_channel->roa_settle = $4; } | PREFERENCE expr { this_channel->preference = $2; check_u16($2); } - | IMPORT KEEP FILTERED bool { this_channel->in_keep_filtered = $4; } + | IMPORT KEEP FILTERED bool { + if ($4) + this_channel->in_keep |= RIK_REJECTED; + else if ((this_channel->in_keep & RIK_PREFILTER) == RIK_PREFILTER) + cf_error("Import keep filtered is implied by the import table."); + else + this_channel->in_keep &= ~RIK_REJECTED; + } | RPKI RELOAD bool { this_channel->rpki_reload = $3; } ; @@ -344,7 +363,11 @@ channel_end: proto_channel: channel_start channel_opt_list channel_end; -rtable: CF_SYM_KNOWN { cf_assert_symbol($1, SYM_TABLE); $$ = $1->table; } ; +rtable: CF_SYM_KNOWN { + cf_assert_symbol($1, SYM_TABLE); + if (!$1->table) rt_new_default_table($1); + $$ = $1->table; +} ; imexport: FILTER filter { $$ = $2; } @@ -373,6 +396,7 @@ debug_default: DEBUG PROTOCOLS debug_mask { new_config->proto_default_debug = $3; } | DEBUG CHANNELS debug_mask { new_config->channel_default_debug = $3; } | DEBUG COMMANDS expr { new_config->cli_debug = $3; } + | DEBUG TABLES debug_mask { new_config->table_debug = $3; } ; /* MRTDUMP PROTOCOLS is in systep/unix/config.Y */ @@ -401,7 +425,6 @@ timeformat_base: TIMEFORMAT timeformat_spec ';' ; - /* Interface patterns */ iface_patt_node_init: @@ -458,6 +481,7 @@ proto: dev_proto '}' ; dev_proto_start: proto_start DIRECT { this_proto = proto_config_new(&proto_device, $1); init_list(&DIRECT_CFG->iface_list); + this_proto->late_if_feed = 1; } ; @@ -640,29 +664,31 @@ r_args: $$ = cfg_allocz(sizeof(struct rt_show_data)); init_list(&($$->tables)); $$->filter = FILTER_ACCEPT; - $$->running_on_config = new_config->fallback; + $$->running_on_config = config; + $$->cli = this_cli; } | r_args net_any { $$ = $1; if ($$->addr) cf_error("Only one prefix expected"); $$->addr = $2; - $$->addr_mode = RSD_ADDR_EQUAL; + $$->addr_mode = TE_ADDR_EQUAL; } | r_args FOR r_args_for { $$ = $1; if ($$->addr) cf_error("Only one prefix expected"); $$->addr = $3; - $$->addr_mode = RSD_ADDR_FOR; + $$->addr_mode = TE_ADDR_FOR; } | r_args IN net_any { $$ = $1; if ($$->addr) cf_error("Only one prefix expected"); if (!net_type_match($3, NB_IP)) cf_error("Only IP networks accepted for 'in' argument"); $$->addr = $3; - $$->addr_mode = RSD_ADDR_IN; + $$->addr_mode = TE_ADDR_IN; } -| r_args TABLE CF_SYM_KNOWN { +| r_args TABLE symbol_known { cf_assert_symbol($3, SYM_TABLE); + if (!$3->table) cf_error("Table %s not configured", $3->name); $$ = $1; rt_show_add_table($$, $3->table->table); $$->tables_defined_by = RSD_TDB_DIRECT; @@ -675,13 +701,14 @@ r_args: $$->tables_defined_by = RSD_TDB_ALL; } | r_args IMPORT TABLE channel_arg { - if (!$4->in_table) cf_error("No import table in channel %s.%s", $4->proto->name, $4->name); - rt_show_add_table($$, $4->in_table); + if (!($4->in_keep & RIK_PREFILTER)) cf_error("No import table in channel %s.%s", $4->proto->name, $4->name); + RT_LOCKED($4->table, tab) + rt_show_add_exporter($$, &tab->exporter.e, "import")->prefilter = $4; $$->tables_defined_by = RSD_TDB_DIRECT; } | r_args EXPORT TABLE channel_arg { if (!$4->out_table) cf_error("No export table in channel %s.%s", $4->proto->name, $4->name); - rt_show_add_table($$, $4->out_table); + rt_show_add_exporter($$, $4->out_table, "export"); $$->tables_defined_by = RSD_TDB_DIRECT; } | r_args FILTER filter { @@ -706,7 +733,7 @@ r_args: $$ = $1; $$->filtered = 1; } - | r_args export_mode CF_SYM_KNOWN { + | r_args export_mode symbol_known { cf_assert_symbol($3, SYM_PROTO); struct proto_config *c = (struct proto_config *) $3->proto; $$ = $1; @@ -723,7 +750,7 @@ r_args: $$->export_channel = $3; $$->tables_defined_by = RSD_TDB_INDIRECT; } - | r_args PROTOCOL CF_SYM_KNOWN { + | r_args PROTOCOL symbol_known { cf_assert_symbol($3, SYM_PROTO); struct proto_config *c = (struct proto_config *) $3->proto; $$ = $1; @@ -841,7 +868,7 @@ sym_args: CF_CLI_HELP(DUMP, ..., [[Dump debugging information]]) CF_CLI(DUMP RESOURCES,,, [[Dump all allocated resource]]) -{ rdump(&root_pool); cli_msg(0, ""); } ; +{ rdump(&root_pool, 0); cli_msg(0, ""); } ; CF_CLI(DUMP SOCKETS,,, [[Dump open sockets]]) { sk_dump_all(); cli_msg(0, ""); } ; CF_CLI(DUMP EVENTS,,, [[Dump event log]]) @@ -851,9 +878,11 @@ CF_CLI(DUMP INTERFACES,,, [[Dump interface information]]) CF_CLI(DUMP NEIGHBORS,,, [[Dump neighbor cache]]) { neigh_dump_all(); cli_msg(0, ""); } ; CF_CLI(DUMP ATTRIBUTES,,, [[Dump attribute cache]]) -{ rta_dump_all(); cli_msg(0, ""); } ; -CF_CLI(DUMP ROUTES,,, [[Dump routing table]]) +{ ea_dump_all(); cli_msg(0, ""); } ; +CF_CLI(DUMP ROUTES,,, [[Dump routes]]) { rt_dump_all(); cli_msg(0, ""); } ; +CF_CLI(DUMP TABLES,,, [[Dump table connections]]) +{ rt_dump_hooks_all(); cli_msg(0, ""); } ; CF_CLI(DUMP PROTOCOLS,,, [[Dump protocol information]]) { protos_dump_all(); cli_msg(0, ""); } ; CF_CLI(DUMP FILTER ALL,,, [[Dump all filters in linearized form]]) @@ -923,9 +952,6 @@ proto_patt2: | TEXT { $$.ptr = $1; $$.patt = 1; } ; -dynamic_attr: IGP_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_GEN_IGP_METRIC); } ; - - CF_CODE CF_END diff --git a/nest/iface.c b/nest/iface.c index 682340c5..f1938664 100644 --- a/nest/iface.c +++ b/nest/iface.c @@ -31,15 +31,51 @@ #include "nest/cli.h" #include "lib/resource.h" #include "lib/string.h" +#include "lib/locking.h" #include "conf/conf.h" #include "sysdep/unix/krt.h" +DOMAIN(attrs) iface_domain; + +#define IFACE_LOCK LOCK_DOMAIN(attrs, iface_domain) +#define IFACE_UNLOCK UNLOCK_DOMAIN(attrs, iface_domain) +#define IFACE_ASSERT_LOCKED ASSERT_DIE(DOMAIN_IS_LOCKED(attrs, iface_domain)) + +static TLIST_LIST(ifsub) iface_sub_list; +static slab *iface_sub_slab; static pool *if_pool; -list iface_list; +list global_iface_list; +struct iface default_vrf; static void if_recalc_preferred(struct iface *i); +static void ifa_dump_locked(struct ifa *); +static void if_dump_locked(struct iface *); + +struct iface * +if_walk_first(void) +{ + IFACE_LOCK; + struct iface *i = HEAD(global_iface_list); + return NODE_VALID(i) ? i : NULL; +} + +struct iface * +if_walk_next(struct iface *i) +{ + IFACE_ASSERT_LOCKED; + i = NODE_NEXT(i); + return NODE_VALID(i) ? i : NULL; +} + +void +if_walk_done(void) +{ + IFACE_ASSERT_LOCKED; + IFACE_UNLOCK; +} + /** * ifa_dump - dump interface address * @a: interface address descriptor @@ -49,6 +85,14 @@ static void if_recalc_preferred(struct iface *i); void ifa_dump(struct ifa *a) { + IFACE_LOCK; + ifa_dump_locked(a); + IFACE_UNLOCK; +} + +static void +ifa_dump_locked(struct ifa *a) +{ debug("\t%I, net %N bc %I -> %I%s%s%s%s\n", a->ip, &a->prefix, a->brd, a->opposite, (a->flags & IA_PRIMARY) ? " PRIMARY" : "", (a->flags & IA_SECONDARY) ? " SEC" : "", @@ -66,6 +110,14 @@ ifa_dump(struct ifa *a) void if_dump(struct iface *i) { + IFACE_LOCK; + if_dump_locked(i); + IFACE_UNLOCK; +} + +static void +if_dump_locked(struct iface *i) +{ struct ifa *a; debug("IF%d: %s", i->index, i->name); @@ -92,7 +144,7 @@ if_dump(struct iface *i) debug(" MTU=%d\n", i->mtu); WALK_LIST(a, i->addrs) { - ifa_dump(a); + ifa_dump_locked(a); ASSERT(!!(a->flags & IA_PRIMARY) == ((a == i->addr4) || (a == i->addr6) || (a == i->llv6))); } @@ -107,14 +159,60 @@ if_dump(struct iface *i) void if_dump_all(void) { - struct iface *i; - debug("Known network interfaces:\n"); - WALK_LIST(i, iface_list) + IFACE_WALK(i) if_dump(i); debug("Router ID: %08x\n", config->router_id); } +void +if_link(struct iface *i) +{ + IFACE_ASSERT_LOCKED; + + if (i) + i->uc++; +} + +void +if_unlink(struct iface *i) +{ + IFACE_ASSERT_LOCKED; + + if (i) + i->uc--; + /* TODO: Do some interface object cleanup */ +} + +void ifa_link(struct ifa *a) +{ + IFACE_ASSERT_LOCKED; + + if (a) + { + debug("ifa_link: %p %d\n", a, a->uc); + a->uc++; + } +} + +void ifa_unlink(struct ifa *a) +{ + IFACE_ASSERT_LOCKED; + + if (!a) + return; + + debug("ifa_unlink: %p %d\n", a, a->uc); + if (--a->uc) + return; + + if_unlink(a->iface); +#if DEBUGGING + memset(a, 0x5b, sizeof(struct ifa)); +#endif + mb_free(a); +} + static inline unsigned if_what_changed(struct iface *i, struct iface *j) { @@ -139,33 +237,79 @@ if_copy(struct iface *to, struct iface *from) to->flags = from->flags | (to->flags & IF_TMP_DOWN); to->mtu = from->mtu; to->master_index = from->master_index; - to->master = from->master; + + if_unlink(to->master); + if_link(to->master = from->master); +} + +void +if_enqueue_notify_to(struct iface_notification x, struct iface_subscription *s) +{ + IFACE_ASSERT_LOCKED; + + switch (x.type) { + case IFNOT_ADDRESS: + if (!s->ifa_notify) return; + ifa_link(x.a); + break; + case IFNOT_INTERFACE: + if (!s->if_notify) return; + if_link(x.i); + break; + case IFNOT_NEIGHBOR: + if (!s->neigh_notify) return; + neigh_link(x.n); + break; + default: + bug("Unknown interface notification type: %d", x.type); + } + + struct iface_notification *in = sl_alloc(iface_sub_slab); + *in = x; + + debug("Enqueue notify %d/%p (%p) to %p\n", x.type, x.a, in, s); + + ifnot_add_tail(&s->queue, in); + ev_send(s->target, &s->event); +} + +void +if_enqueue_notify(struct iface_notification x) +{ + IFACE_ASSERT_LOCKED; + + WALK_TLIST(ifsub, s, &iface_sub_list) + if_enqueue_notify_to(x, s); } static inline void -ifa_send_notify(struct proto *p, unsigned c, struct ifa *a) +ifa_send_notify(struct iface_subscription *s, unsigned c, struct ifa *a) { - if (p->ifa_notify && + struct proto *p = SKIP_BACK(struct proto, iface_sub, s); + + if (s->ifa_notify && (p->proto_state != PS_DOWN) && - (!p->vrf_set || p->vrf == a->iface->master)) + (!p->vrf || p->vrf == a->iface->master)) { if (p->debug & D_IFACES) log(L_TRACE "%s < address %N on interface %s %s", p->name, &a->prefix, a->iface->name, (c & IF_CHANGE_UP) ? "added" : "removed"); - p->ifa_notify(p, c, a); + s->ifa_notify(p, c, a); } } static void ifa_notify_change_(unsigned c, struct ifa *a) { - struct proto *p; - DBG("IFA change notification (%x) for %s:%I\n", c, a->iface->name, a->ip); - WALK_LIST(p, proto_list) - ifa_send_notify(p, c, a); + if_enqueue_notify((struct iface_notification) { + .type = IFNOT_ADDRESS, + .a = a, + .flags = c, + }); + } static inline void @@ -181,11 +325,13 @@ ifa_notify_change(unsigned c, struct ifa *a) } static inline void -if_send_notify(struct proto *p, unsigned c, struct iface *i) +if_send_notify(struct iface_subscription *s, unsigned c, struct iface *i) { - if (p->if_notify && + struct proto *p = SKIP_BACK(struct proto, iface_sub, s); + + if (s->if_notify && (p->proto_state != PS_DOWN) && - (!p->vrf_set || p->vrf == i->master)) + (!p->vrf || p->vrf == i->master)) { if (p->debug & D_IFACES) log(L_TRACE "%s < interface %s %s", p->name, i->name, @@ -196,14 +342,13 @@ if_send_notify(struct proto *p, unsigned c, struct iface *i) (c & IF_CHANGE_PREFERRED) ? "changes preferred address" : (c & IF_CHANGE_CREATE) ? "created" : "sends unknown event"); - p->if_notify(p, c, i); + s->if_notify(p, c, i); } } static void if_notify_change(unsigned c, struct iface *i) { - struct proto *p; struct ifa *a; if (i->flags & IF_JUST_CREATED) @@ -214,7 +359,7 @@ if_notify_change(unsigned c, struct iface *i) DBG("Interface change notification (%x) for %s\n", c, i->name); #ifdef LOCAL_DEBUG - if_dump(i); + if_dump_locked(i); #endif if (c & IF_CHANGE_DOWN) @@ -224,8 +369,11 @@ if_notify_change(unsigned c, struct iface *i) WALK_LIST(a, i->addrs) ifa_notify_change_(IF_CHANGE_DOWN, a); - WALK_LIST(p, proto_list) - if_send_notify(p, c, i); + if_enqueue_notify((struct iface_notification) { + .type = IFNOT_INTERFACE, + .i = i, + .flags = c, + }); if (c & IF_CHANGE_UP) WALK_LIST(a, i->addrs) @@ -243,7 +391,7 @@ if_recalc_flags(struct iface *i UNUSED, uint flags) { if ((flags & IF_ADMIN_UP) && !(flags & (IF_SHUTDOWN | IF_TMP_DOWN)) && - !(i->master_index && !i->master)) + !(i->master_index && i->master == &default_vrf)) flags |= IF_UP; else flags &= ~IF_UP; @@ -273,10 +421,12 @@ if_change_flags(struct iface *i, uint flags) void if_delete(struct iface *old) { + IFACE_LOCK; struct iface f = {}; strncpy(f.name, old->name, sizeof(f.name)-1); f.flags = IF_SHUTDOWN; - if_update(&f); + if_update_locked(&f); + IFACE_UNLOCK; } /** @@ -298,10 +448,22 @@ if_delete(struct iface *old) struct iface * if_update(struct iface *new) { + IFACE_LOCK; + struct iface *i = if_update_locked(new); + IFACE_UNLOCK; + return i; +} + +struct iface * +if_update_locked(struct iface *new) +{ struct iface *i; unsigned c; - WALK_LIST(i, iface_list) + if (!new->master) + new->master = &default_vrf; + + WALK_LIST(i, global_iface_list) if (!strcmp(new->name, i->name)) { new->flags = if_recalc_flags(new, new->flags); @@ -316,6 +478,7 @@ if_update(struct iface *new) new->llv6 = i->llv6; new->sysdep = i->sysdep; memcpy(&new->addrs, &i->addrs, sizeof(i->addrs)); + memcpy(&new->neighbors, &i->neighbors, sizeof(i->neighbors)); memcpy(i, new, sizeof(*i)); i->flags &= ~IF_UP; /* IF_TMP_DOWN will be added later */ goto newif; @@ -330,21 +493,21 @@ if_update(struct iface *new) } i = mb_alloc(if_pool, sizeof(struct iface)); memcpy(i, new, sizeof(*i)); + if_link(i->master); init_list(&i->addrs); -newif: init_list(&i->neighbors); +newif: i->flags |= IF_UPDATED | IF_TMP_DOWN; /* Tmp down as we don't have addresses yet */ - add_tail(&iface_list, &i->n); + add_tail(&global_iface_list, &i->n); return i; } void if_start_update(void) { - struct iface *i; struct ifa *a; - WALK_LIST(i, iface_list) + IFACE_WALK(i) { i->flags &= ~IF_UPDATED; WALK_LIST(a, i->addrs) @@ -352,8 +515,8 @@ if_start_update(void) } } -void -if_end_partial_update(struct iface *i) +static void +if_end_partial_update_locked(struct iface *i) { if (i->flags & IF_NEEDS_RECALC) if_recalc_preferred(i); @@ -363,12 +526,19 @@ if_end_partial_update(struct iface *i) } void +if_end_partial_update(struct iface *i) +{ + IFACE_LOCK; + if_end_partial_update_locked(i); + IFACE_UNLOCK; +} + +void if_end_update(void) { - struct iface *i; struct ifa *a, *b; - WALK_LIST(i, iface_list) + IFACE_WALK(i) { if (!(i->flags & IF_UPDATED)) if_change_flags(i, (i->flags & ~IF_ADMIN_UP) | IF_SHUTDOWN); @@ -377,43 +547,145 @@ if_end_update(void) WALK_LIST_DELSAFE(a, b, i->addrs) if (!(a->flags & IA_UPDATED)) ifa_delete(a); - if_end_partial_update(i); + if_end_partial_update_locked(i); } } } -void -if_flush_ifaces(struct proto *p) +static void +iface_notify_hook(void *_s) { - if (p->debug & D_EVENTS) - log(L_TRACE "%s: Flushing interfaces", p->name); - if_start_update(); - if_end_update(); + struct iface_subscription *s = _s; + + IFACE_LOCK; + + while (!EMPTY_TLIST(ifnot, &s->queue)) + { + struct iface_notification *n = THEAD(ifnot, &s->queue); + debug("Process notify %d/%p (%p) to %p\n", n->type, n->a, n, s); + IFACE_UNLOCK; + + switch (n->type) { + case IFNOT_ADDRESS: + ifa_send_notify(s, n->flags, n->a); + IFACE_LOCK; + ifa_unlink(n->a); + IFACE_UNLOCK; + break; + case IFNOT_INTERFACE: + if_send_notify(s, n->flags, n->i); + IFACE_LOCK; + if_unlink(n->i); + IFACE_UNLOCK; + break; + case IFNOT_NEIGHBOR: + s->neigh_notify(n->n); + IFACE_LOCK; + neigh_unlink(n->n); + IFACE_UNLOCK; + break; + default: + bug("Bad interface notification type: %d", n->type); + } + + IFACE_LOCK; + ifnot_rem_node(&s->queue, n); + sl_free(n); + } + + IFACE_UNLOCK; } + /** - * if_feed_baby - advertise interfaces to a new protocol - * @p: protocol to feed + * iface_subscribe - request interface updates + * @s: subscription structure * * When a new protocol starts, this function sends it a series * of notifications about all existing interfaces. */ void -if_feed_baby(struct proto *p) +iface_subscribe(struct iface_subscription *s) { - struct iface *i; - struct ifa *a; - - if (!p->if_notify && !p->ifa_notify) /* shortcut */ + IFACE_LOCK; + ifsub_add_tail(&iface_sub_list, s); + s->event = (event) { + .hook = iface_notify_hook, + .data = s, + }; + + if (!s->if_notify && !s->ifa_notify) /* shortcut */ + { + IFACE_UNLOCK; return; + } + + struct iface *i; DBG("Announcing interfaces to new protocol %s\n", p->name); - WALK_LIST(i, iface_list) + WALK_LIST(i, global_iface_list) { - if_send_notify(p, IF_CHANGE_CREATE | ((i->flags & IF_UP) ? IF_CHANGE_UP : 0), i); + if_enqueue_notify_to( + (struct iface_notification) { + .type = IFNOT_INTERFACE, + .i = i, + .flags = IF_CHANGE_CREATE | ((i->flags & IF_UP) ? IF_CHANGE_UP : 0), + }, s); + + struct ifa *a; if (i->flags & IF_UP) WALK_LIST(a, i->addrs) - ifa_send_notify(p, IF_CHANGE_CREATE | IF_CHANGE_UP, a); + if_enqueue_notify_to( + (struct iface_notification) { + .type = IFNOT_ADDRESS, + .a = a, + .flags = IF_CHANGE_CREATE | IF_CHANGE_UP, + }, s); + } + + IFACE_UNLOCK; +} + +/** + * iface_unsubscribe - unsubscribe from interface updates + * @s: subscription structure + */ +void +iface_unsubscribe(struct iface_subscription *s) +{ + IFACE_LOCK; + + struct proto *p = SKIP_BACK(struct proto, iface_sub, s); + WALK_TLIST_DELSAFE(proto_neigh, n, &p->neighbors) + neigh_unlink(n); + + ifsub_rem_node(&iface_sub_list, s); + ev_postpone(&s->event); + + WALK_TLIST_DELSAFE(ifnot, n, &s->queue) + { + debug("Drop notify %d/%p (%p) to %p\n", n->type, n->a, n, s); + switch (n->type) + { + case IFNOT_ADDRESS: + ifa_unlink(n->a); + break; + case IFNOT_INTERFACE: + if_unlink(n->i); + break; + case IFNOT_NEIGHBOR: + neigh_unlink(n->n); + break; + default: + bug("Bad interface notification type: %d", n->type); } + + ifnot_rem_node(&s->queue, n); + sl_free(n); + } + + ASSERT_DIE(EMPTY_TLIST(proto_neigh, &p->neighbors)); + + IFACE_UNLOCK; } /** @@ -425,16 +697,26 @@ if_feed_baby(struct proto *p) * if no such structure exists. */ struct iface * -if_find_by_index(unsigned idx) +if_find_by_index_locked(unsigned idx) { struct iface *i; - WALK_LIST(i, iface_list) + WALK_LIST(i, global_iface_list) if (i->index == idx && !(i->flags & IF_SHUTDOWN)) return i; + return NULL; } +struct iface * +if_find_by_index(unsigned idx) +{ + IFACE_LOCK; + struct iface *i = if_find_by_index_locked(idx); + IFACE_UNLOCK; + return i; +} + /** * if_find_by_name - find interface by name * @name: interface name @@ -448,9 +730,15 @@ if_find_by_name(const char *name) { struct iface *i; - WALK_LIST(i, iface_list) + IFACE_LOCK; + WALK_LIST(i, global_iface_list) if (!strcmp(i->name, name) && !(i->flags & IF_SHUTDOWN)) + { + IFACE_UNLOCK; return i; + } + + IFACE_UNLOCK; return NULL; } @@ -459,9 +747,13 @@ if_get_by_name(const char *name) { struct iface *i; - WALK_LIST(i, iface_list) + IFACE_LOCK; + WALK_LIST(i, global_iface_list) if (!strcmp(i->name, name)) + { + IFACE_UNLOCK; return i; + } /* No active iface, create a dummy */ i = mb_allocz(if_pool, sizeof(struct iface)); @@ -469,7 +761,9 @@ if_get_by_name(const char *name) i->flags = IF_SHUTDOWN; init_list(&i->addrs); init_list(&i->neighbors); - add_tail(&iface_list, &i->n); + add_tail(&global_iface_list, &i->n); + + IFACE_UNLOCK; return i; } @@ -553,9 +847,7 @@ if_recalc_preferred(struct iface *i) void if_recalc_all_preferred_addresses(void) { - struct iface *i; - - WALK_LIST(i, iface_list) + IFACE_WALK(i) { if_recalc_preferred(i); @@ -582,6 +874,8 @@ ifa_same(struct ifa *a, struct ifa *b) struct ifa * ifa_update(struct ifa *a) { + IFACE_LOCK; + struct iface *i = a->iface; struct ifa *b; @@ -594,6 +888,8 @@ ifa_update(struct ifa *a) !((b->flags ^ a->flags) & (IA_SECONDARY | IA_PEER | IA_HOST))) { b->flags |= IA_UPDATED; + + IFACE_UNLOCK; return b; } ifa_delete(b); @@ -605,12 +901,16 @@ ifa_update(struct ifa *a) b = mb_alloc(if_pool, sizeof(struct ifa)); memcpy(b, a, sizeof(struct ifa)); + ifa_link(b); + if_link(i); add_tail(&i->addrs, &b->n); b->flags |= IA_UPDATED; i->flags |= IF_NEEDS_RECALC; if (i->flags & IF_UP) ifa_notify_change(IF_CHANGE_CREATE | IF_CHANGE_UP, b); + + IFACE_UNLOCK; return b; } @@ -628,6 +928,8 @@ ifa_delete(struct ifa *a) struct iface *i = a->iface; struct ifa *b; + IFACE_LOCK; + WALK_LIST(b, i->addrs) if (ifa_same(b, a)) { @@ -651,19 +953,24 @@ ifa_delete(struct ifa *a) if (i->flags & IF_UP) ifa_notify_change(IF_CHANGE_DOWN, b); - mb_free(b); + ifa_unlink(b); + IFACE_UNLOCK; return; } + + IFACE_UNLOCK; } u32 if_choose_router_id(struct iface_patt *mask, u32 old_id) { + IFACE_LOCK; + struct iface *i; struct ifa *a, *b; b = NULL; - WALK_LIST(i, iface_list) + WALK_LIST(i, global_iface_list) { if (!(i->flags & IF_ADMIN_UP) || (i->flags & IF_SHUTDOWN)) @@ -690,6 +997,8 @@ if_choose_router_id(struct iface_patt *mask, u32 old_id) } } + IFACE_UNLOCK; + if (!b) return 0; @@ -710,8 +1019,11 @@ void if_init(void) { if_pool = rp_new(&root_pool, "Interfaces"); - init_list(&iface_list); + init_list(&global_iface_list); + iface_sub_slab = sl_new(if_pool, sizeof(struct iface_notification)); + strcpy(default_vrf.name, "default"); neigh_init(if_pool); + iface_domain = DOMAIN_NEW(attrs, "Interfaces"); } /* @@ -833,17 +1145,16 @@ if_show_addr(struct ifa *a) void if_show(void) { - struct iface *i; struct ifa *a; char *type; - WALK_LIST(i, iface_list) + IFACE_WALK(i) { if (i->flags & IF_SHUTDOWN) continue; char mbuf[16 + sizeof(i->name)] = {}; - if (i->master) + if (i->master != &default_vrf) bsprintf(mbuf, " master=%s", i->master->name); else if (i->master_index) bsprintf(mbuf, " master=#%u", i->master_index); @@ -877,10 +1188,8 @@ if_show(void) void if_show_summary(void) { - struct iface *i; - cli_msg(-2005, "%-10s %-6s %-18s %s", "Interface", "State", "IPv4 address", "IPv6 address"); - WALK_LIST(i, iface_list) + IFACE_WALK(i) { byte a4[IPA_MAX_TEXT_LENGTH + 17]; byte a6[IPA_MAX_TEXT_LENGTH + 17]; diff --git a/nest/iface.h b/nest/iface.h index 1189cdd4..05898a55 100644 --- a/nest/iface.h +++ b/nest/iface.h @@ -9,11 +9,12 @@ #ifndef _BIRD_IFACE_H_ #define _BIRD_IFACE_H_ +#include "lib/locking.h" +#include "lib/event.h" #include "lib/lists.h" +#include "lib/tlists.h" #include "lib/ip.h" -extern list iface_list; - struct proto; struct pool; @@ -26,8 +27,11 @@ struct ifa { /* Interface address */ ip_addr opposite; /* Opposite end of a point-to-point link */ unsigned scope; /* Interface address scope */ unsigned flags; /* Analogous to iface->flags */ + unsigned uc; /* Use (link) count */ }; +extern struct iface default_vrf; + struct iface { node n; char name[16]; @@ -42,6 +46,7 @@ struct iface { struct ifa *llv6; /* Primary link-local address for IPv6 */ ip4_addr sysdep; /* Arbitrary IPv4 address for internal sysdep use */ list neighbors; /* All neighbors on this interface */ + unsigned uc; /* Use (link) count */ }; #define IF_UP 1 /* Currently just IF_ADMIN_UP */ @@ -106,25 +111,31 @@ void ifa_dump(struct ifa *); void if_show(void); void if_show_summary(void); struct iface *if_update(struct iface *); +struct iface *if_update_locked(struct iface *); void if_delete(struct iface *old); struct ifa *ifa_update(struct ifa *); void ifa_delete(struct ifa *); void if_start_update(void); void if_end_partial_update(struct iface *); void if_end_update(void); -void if_flush_ifaces(struct proto *p); -void if_feed_baby(struct proto *); struct iface *if_find_by_index(unsigned); +struct iface *if_find_by_index_locked(unsigned); struct iface *if_find_by_name(const char *); struct iface *if_get_by_name(const char *); void if_recalc_all_preferred_addresses(void); +struct iface *if_walk_first(void); +struct iface *if_walk_next(struct iface *); +void if_walk_done(void); + +#define IFACE_WALK(_i) for (struct iface *_i = if_walk_first(); _i || (if_walk_done(), 0); _i = if_walk_next(_i)) /* The Neighbor Cache */ typedef struct neighbor { node n; /* Node in neighbor hash table chain */ node if_n; /* Node in per-interface neighbor list */ + TLIST_NODE(proto_neigh, struct neighbor) proto_n; ip_addr addr; /* Address of the neighbor */ struct ifa *ifa; /* Ifa on related iface */ struct iface *iface; /* Interface it's connected to */ @@ -135,8 +146,16 @@ typedef struct neighbor { u16 flags; /* NEF_* flags */ s16 scope; /* Address scope, -1 for unreachable neighbors, SCOPE_HOST when it's our own address */ + uint uc; /* Use (link) count */ } neighbor; +#define TLIST_PREFIX proto_neigh +#define TLIST_TYPE struct neighbor +#define TLIST_ITEM proto_n +#define TLIST_WANT_WALK +#define TLIST_WANT_ADD_TAIL +#include "lib/tlists.h" + #define NEF_STICKY 1 #define NEF_ONLINK 2 #define NEF_IFACE 4 /* Entry for whole iface */ @@ -144,9 +163,7 @@ typedef struct neighbor { neighbor *neigh_find(struct proto *p, ip_addr a, struct iface *ifa, uint flags); -void neigh_dump(neighbor *); void neigh_dump_all(void); -void neigh_prune(void); void neigh_if_up(struct iface *); void neigh_if_down(struct iface *); void neigh_if_link(struct iface *); @@ -154,6 +171,64 @@ void neigh_ifa_up(struct ifa *a); void neigh_ifa_down(struct ifa *a); void neigh_init(struct pool *); +void neigh_link(neighbor *); +void neigh_unlink(neighbor *); + +/* + * Notification mechanism + */ + +#define TLIST_PREFIX ifnot +#define TLIST_TYPE struct iface_notification +#define TLIST_ITEM nn +#define TLIST_WANT_WALK +#define TLIST_WANT_ADD_TAIL + +struct iface_notification { + TLIST_DEFAULT_NODE; + enum { + IFNOT_INVALID, + IFNOT_ADDRESS, + IFNOT_INTERFACE, + IFNOT_NEIGHBOR, + } type; + unsigned flags; + union { + struct ifa *a; + struct iface *i; + neighbor *n; + }; +}; + +#include "lib/tlists.h" + +#define TLIST_PREFIX ifsub +#define TLIST_TYPE struct iface_subscription +#define TLIST_ITEM n +#define TLIST_WANT_WALK +#define TLIST_WANT_ADD_TAIL + +struct iface_subscription { + TLIST_DEFAULT_NODE; + + event event; + event_list *target; + TLIST_LIST(ifnot) queue; + + void (*if_notify)(struct proto *, unsigned flags, struct iface *i); + void (*ifa_notify)(struct proto *, unsigned flags, struct ifa *a); + void (*neigh_notify)(struct neighbor *neigh); +}; + +#include "lib/tlists.h" + +void if_enqueue_notify(struct iface_notification); +void if_enqueue_notify_to(struct iface_notification x, struct iface_subscription *s); + +void iface_flush_notifications(struct iface_subscription *); +void iface_subscribe(struct iface_subscription *); +void iface_unsubscribe(struct iface_subscription *); + /* * Interface Pattern Lists */ diff --git a/nest/limit.h b/nest/limit.h new file mode 100644 index 00000000..f8d4b212 --- /dev/null +++ b/nest/limit.h @@ -0,0 +1,50 @@ +/* + * BIRD Internet Routing Daemon -- Limits + * + * (c) 1998--2000 Martin Mares <mj@ucw.cz> + * (c) 2021 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LIMIT_H_ +#define _BIRD_LIMIT_H_ + +struct limit { + u32 max; + u32 count; + int (*action)(struct limit *, void *data); +}; + +static inline int limit_do_action(struct limit *l, void *data) +{ + return l->action ? l->action(l, data) : 1; +} + +static inline int limit_push(struct limit *l, void *data) +{ + if ((l->count >= l->max) && limit_do_action(l, data)) + return 1; + + l->count++; + return 0; +} + +static inline void limit_pop(struct limit *l) +{ + ASSERT_DIE(l->count > 0); + --l->count; +} + +static inline void limit_reset(struct limit *l) +{ + l->count = 0; +} + +static inline void limit_update(struct limit *l, void *data, u32 max) +{ + if (l->count > (l->max = max)) + limit_do_action(l, data); +} + +#endif diff --git a/nest/locks.c b/nest/locks.c index 812a6534..247d0c56 100644 --- a/nest/locks.c +++ b/nest/locks.c @@ -37,7 +37,11 @@ #include "nest/iface.h" static list olock_list; -static event *olock_event; + +DEFINE_DOMAIN(attrs); +static DOMAIN(attrs) olock_domain; +#define OBJ_LOCK LOCK_DOMAIN(attrs, olock_domain) +#define OBJ_UNLOCK UNLOCK_DOMAIN(attrs, olock_domain) static inline int olock_same(struct object_lock *x, struct object_lock *y) @@ -54,39 +58,56 @@ olock_same(struct object_lock *x, struct object_lock *y) static void olock_free(resource *r) { - struct object_lock *q, *l = (struct object_lock *) r; + /* Called externally from rfree() */ + struct object_lock *l = SKIP_BACK(struct object_lock, r, r); node *n; + OBJ_LOCK; DBG("olock: Freeing %p\n", l); switch (l->state) { case OLOCK_STATE_FREE: break; case OLOCK_STATE_LOCKED: - case OLOCK_STATE_EVENT: + /* Remove myself from the olock_list */ rem_node(&l->n); + + /* Maybe the notification is still pending. */ + ev_postpone(&l->event); + + /* Get new lock candidate */ n = HEAD(l->waiters); - if (n->next) + if (NODE_VALID(n)) { - DBG("olock: -> %p becomes locked\n", n); - q = SKIP_BACK(struct object_lock, n, n); + struct object_lock *q = SKIP_BACK(struct object_lock, n, n); + + /* Remove this candidate from waiters list */ rem_node(n); + + /* Move waiter lists */ + DBG("olock: -> %p becomes locked\n", n); add_tail_list(&q->waiters, &l->waiters); - q->state = OLOCK_STATE_EVENT; + + /* Add the new olock to olock_list */ add_head(&olock_list, n); - ev_schedule(olock_event); + + /* Inform */ + q->state = OLOCK_STATE_LOCKED; + ev_send(q->target, &q->event); } break; case OLOCK_STATE_WAITING: + /* Remove from the waiters list */ rem_node(&l->n); break; default: ASSERT(0); } + OBJ_UNLOCK; } static void -olock_dump(resource *r) +olock_dump(resource *r, unsigned indent UNUSED) { struct object_lock *l = (struct object_lock *) r; static char *olock_states[] = { "free", "locked", "waiting", "event" }; @@ -140,6 +161,8 @@ olock_acquire(struct object_lock *l) node *n; struct object_lock *q; + OBJ_LOCK; + WALK_LIST(n, olock_list) { q = SKIP_BACK(struct object_lock, n, n); @@ -148,36 +171,19 @@ olock_acquire(struct object_lock *l) l->state = OLOCK_STATE_WAITING; add_tail(&q->waiters, &l->n); DBG("olock: %p waits\n", l); + + OBJ_UNLOCK; return; } } + DBG("olock: %p acquired immediately\n", l); - l->state = OLOCK_STATE_EVENT; add_head(&olock_list, &l->n); - ev_schedule(olock_event); -} -static void -olock_run_event(void *unused UNUSED) -{ - node *n; - struct object_lock *q; + l->state = OLOCK_STATE_LOCKED; + ev_send(l->target, &l->event); - DBG("olock: Processing events\n"); - for(;;) - { - n = HEAD(olock_list); - if (!n->next) - break; - q = SKIP_BACK(struct object_lock, n, n); - if (q->state != OLOCK_STATE_EVENT) - break; - DBG("olock: %p locked\n", q); - q->state = OLOCK_STATE_LOCKED; - rem_node(&q->n); - add_tail(&olock_list, &q->n); - q->hook(q); - } + OBJ_UNLOCK; } /** @@ -191,5 +197,5 @@ olock_init(void) { DBG("olock: init\n"); init_list(&olock_list); - olock_event = ev_new_init(&root_pool, olock_run_event, NULL); + olock_domain = DOMAIN_NEW(attrs, "Object lock"); } diff --git a/nest/locks.h b/nest/locks.h index 37026c68..04571e69 100644 --- a/nest/locks.h +++ b/nest/locks.h @@ -31,8 +31,8 @@ struct object_lock { uint inst; /* ... instance ID */ struct iface *iface; /* ... interface */ struct iface *vrf; /* ... or VRF (if iface is unknown) */ - void (*hook)(struct object_lock *); /* Called when the lock succeeds */ - void *data; /* User data */ + event event; /* Enqueued when the lock succeeds */ + event_list *target; /* Where to put the event */ /* ... internal to lock manager, don't touch ... */ node n; /* Node in list of olocks */ int state; /* OLOCK_STATE_xxx */ @@ -50,6 +50,5 @@ void olock_init(void); #define OLOCK_STATE_FREE 0 #define OLOCK_STATE_LOCKED 1 #define OLOCK_STATE_WAITING 2 -#define OLOCK_STATE_EVENT 3 /* waiting for unlock processing */ #endif diff --git a/nest/neighbor.c b/nest/neighbor.c index 7cf9c85d..934cd35c 100644 --- a/nest/neighbor.c +++ b/nest/neighbor.c @@ -60,6 +60,19 @@ static slab *neigh_slab; static list neigh_hash_table[NEIGH_HASH_SIZE], sticky_neigh_list; +void if_link(struct iface *); +void if_unlink(struct iface *); +void ifa_link(struct ifa *); +void ifa_unlink(struct ifa *); + +extern list global_iface_list; + +extern DOMAIN(attrs) iface_domain; + +#define IFACE_LOCK LOCK_DOMAIN(attrs, iface_domain) +#define IFACE_UNLOCK UNLOCK_DOMAIN(attrs, iface_domain) +#define IFACE_ASSERT_LOCKED ASSERT_DIE(DOMAIN_IS_LOCKED(attrs, iface_domain)) + static inline uint neigh_hash(struct proto *p, ip_addr a, struct iface *i) { @@ -142,7 +155,7 @@ if_connected(ip_addr a, struct iface *i, struct ifa **ap, uint flags) } static inline int -if_connected_any(ip_addr a, struct iface *vrf, uint vrf_set, struct iface **iface, struct ifa **addr, uint flags) +if_connected_any(ip_addr a, struct iface *vrf, struct iface **iface, struct ifa **addr, uint flags) { struct iface *i; struct ifa *b; @@ -152,8 +165,8 @@ if_connected_any(ip_addr a, struct iface *vrf, uint vrf_set, struct iface **ifac *addr = NULL; /* Prefer SCOPE_HOST or longer prefix */ - WALK_LIST(i, iface_list) - if ((!vrf_set || vrf == i->master) && ((s = if_connected(a, i, &b, flags)) >= 0)) + WALK_LIST(i, global_iface_list) + if ((!vrf || vrf == i->master) && ((s = if_connected(a, i, &b, flags)) >= 0)) if (scope_better(s, scope) || (scope_remote(s, scope) && ifa_better(b, *addr))) { *iface = i; @@ -210,6 +223,8 @@ if_intersect(struct iface *ia, struct iface *ib) neighbor * neigh_find(struct proto *p, ip_addr a, struct iface *iface, uint flags) { + IFACE_LOCK; + neighbor *n; int class, scope = -1; uint h = neigh_hash(p, a, iface); @@ -218,26 +233,29 @@ neigh_find(struct proto *p, ip_addr a, struct iface *iface, uint flags) WALK_LIST(n, neigh_hash_table[h]) /* Search the cache */ if ((n->proto == p) && ipa_equal(n->addr, a) && (n->ifreq == iface)) + { + IFACE_UNLOCK; return n; + } if (flags & NEF_IFACE) { if (ipa_nonzero(a) || !iface) - return NULL; + goto bad; } else { class = ipa_classify(a); if (class < 0) /* Invalid address */ - return NULL; + goto bad; if (((class & IADDR_SCOPE_MASK) == SCOPE_HOST) || (((class & IADDR_SCOPE_MASK) == SCOPE_LINK) && !iface) || !(class & IADDR_HOST)) - return NULL; /* Bad scope or a somecast */ + goto bad; /* Bad scope or a somecast */ } if ((flags & NEF_ONLINK) && !iface) - return NULL; + goto bad; if (iface) { @@ -245,26 +263,34 @@ neigh_find(struct proto *p, ip_addr a, struct iface *iface, uint flags) iface = (scope < 0) ? NULL : iface; } else - scope = if_connected_any(a, p->vrf, p->vrf_set, &iface, &addr, flags); + scope = if_connected_any(a, p->vrf, &iface, &addr, flags); /* scope < 0 means i don't know neighbor */ /* scope >= 0 <=> iface != NULL */ if ((scope < 0) && !(flags & NEF_STICKY)) - return NULL; + goto bad; n = sl_allocz(neigh_slab); add_tail(&neigh_hash_table[h], &n->n); add_tail((scope >= 0) ? &iface->neighbors : &sticky_neigh_list, &n->if_n); + proto_neigh_add_tail(&p->neighbors, n); n->addr = a; - n->ifa = addr; - n->iface = iface; - n->ifreq = ifreq; + ifa_link(n->ifa = addr); + if_link(n->iface = iface); + if_link(n->ifreq = ifreq); n->proto = p; n->flags = flags; n->scope = scope; + neigh_link(n); + + IFACE_UNLOCK; return n; + +bad: + IFACE_UNLOCK; + return NULL; } /** @@ -273,7 +299,7 @@ neigh_find(struct proto *p, ip_addr a, struct iface *iface, uint flags) * * This functions dumps the contents of a given neighbor entry to debug output. */ -void +static void neigh_dump(neighbor *n) { debug("%p %I %s %s ", n, n->addr, @@ -295,6 +321,8 @@ neigh_dump(neighbor *n) void neigh_dump_all(void) { + IFACE_LOCK; + neighbor *n; int i; @@ -303,24 +331,28 @@ neigh_dump_all(void) WALK_LIST(n, neigh_hash_table[i]) neigh_dump(n); debug("\n"); + + IFACE_UNLOCK; } static inline void neigh_notify(neighbor *n) { - if (n->proto->neigh_notify && (n->proto->proto_state != PS_STOP)) - n->proto->neigh_notify(n); + IFACE_ASSERT_LOCKED; + if_enqueue_notify_to((struct iface_notification) { .type = IFNOT_NEIGHBOR, .n = n, }, &n->proto->iface_sub); } static void neigh_up(neighbor *n, struct iface *i, struct ifa *a, int scope) { DBG("Waking up sticky neighbor %I\n", n->addr); - n->iface = i; - n->ifa = a; + if_link(n->iface = i); + ifa_link(n->ifa = a); + n->scope = scope; - rem_node(&n->if_n); + rem_node(&n->if_n); /* HACK: Here the neighbor is always in the sticky list, + regardless whether it is sticky or not */ add_tail(&i->neighbors, &n->if_n); neigh_notify(n); @@ -330,21 +362,50 @@ static void neigh_down(neighbor *n) { DBG("Flushing neighbor %I on %s\n", n->addr, n->iface->name); - n->iface = NULL; - n->ifa = NULL; + n->scope = -1; rem_node(&n->if_n); add_tail(&sticky_neigh_list, &n->if_n); + ifa_unlink(n->ifa); + n->ifa = NULL; + + if_unlink(n->iface); + n->iface = NULL; + neigh_notify(n); } -static inline void -neigh_free(neighbor *n) +void +neigh_link(neighbor *n) +{ + IFACE_ASSERT_LOCKED; + n->uc++; +} + +void +neigh_unlink(neighbor *n) { + IFACE_ASSERT_LOCKED; + if (--n->uc) + return; + + struct proto *p = n->proto; + proto_neigh_rem_node(&p->neighbors, n); + + if ((p->proto_state == PS_DOWN) && EMPTY_TLIST(proto_neigh, &p->neighbors)) + proto_send_event(p, p->event); + + n->proto = NULL; + rem_node(&n->n); rem_node(&n->if_n); + + ifa_unlink(n->ifa); + if_unlink(n->iface); + if_unlink(n->ifreq); + sl_free(n); } @@ -360,6 +421,8 @@ neigh_free(neighbor *n) void neigh_update(neighbor *n, struct iface *iface) { + IFACE_ASSERT_LOCKED; + struct proto *p = n->proto; struct ifa *ifa = NULL; int scope = -1; @@ -369,7 +432,7 @@ neigh_update(neighbor *n, struct iface *iface) return; /* VRF-bound neighbors ignore changes in other VRFs */ - if (p->vrf_set && (p->vrf != iface->master)) + if (p->vrf && (p->vrf != iface->master)) return; scope = if_connected(n->addr, iface, &ifa, n->flags); @@ -379,7 +442,7 @@ neigh_update(neighbor *n, struct iface *iface) { /* When neighbor is going down, try to respawn it on other ifaces */ if ((scope < 0) && (n->scope >= 0) && !n->ifreq && (n->flags & NEF_STICKY)) - scope = if_connected_any(n->addr, p->vrf, p->vrf_set, &iface, &ifa, n->flags); + scope = if_connected_any(n->addr, p->vrf, &iface, &ifa, n->flags); } else { @@ -394,7 +457,8 @@ neigh_update(neighbor *n, struct iface *iface) { if (ifa != n->ifa) { - n->ifa = ifa; + ifa_unlink(n->ifa); + ifa_link(n->ifa = ifa); neigh_notify(n); } @@ -408,7 +472,7 @@ neigh_update(neighbor *n, struct iface *iface) if ((n->scope < 0) && !(n->flags & NEF_STICKY)) { - neigh_free(n); + neigh_unlink(n); return; } @@ -429,12 +493,13 @@ neigh_update(neighbor *n, struct iface *iface) void neigh_if_up(struct iface *i) { + IFACE_ASSERT_LOCKED; struct iface *ii; neighbor *n; node *x, *y; /* Update neighbors that might be better off with the new iface */ - WALK_LIST(ii, iface_list) + WALK_LIST(ii, global_iface_list) if (!EMPTY_LIST(ii->neighbors) && (ii != i) && if_intersect(i, ii)) WALK_LIST2_DELSAFE(n, x, y, ii->neighbors, if_n) neigh_update(n, i); @@ -454,6 +519,7 @@ neigh_if_up(struct iface *i) void neigh_if_down(struct iface *i) { + IFACE_ASSERT_LOCKED; neighbor *n; node *x, *y; @@ -471,6 +537,7 @@ neigh_if_down(struct iface *i) void neigh_if_link(struct iface *i) { + IFACE_ASSERT_LOCKED; neighbor *n; node *x, *y; @@ -491,12 +558,13 @@ neigh_if_link(struct iface *i) void neigh_ifa_up(struct ifa *a) { + IFACE_ASSERT_LOCKED; struct iface *i = a->iface, *ii; neighbor *n; node *x, *y; /* Update neighbors that might be better off with the new ifa */ - WALK_LIST(ii, iface_list) + WALK_LIST(ii, global_iface_list) if (!EMPTY_LIST(ii->neighbors) && ifa_intersect(a, ii)) WALK_LIST2_DELSAFE(n, x, y, ii->neighbors, if_n) neigh_update(n, i); @@ -509,6 +577,7 @@ neigh_ifa_up(struct ifa *a) void neigh_ifa_down(struct ifa *a) { + IFACE_ASSERT_LOCKED; struct iface *i = a->iface; neighbor *n; node *x, *y; @@ -519,35 +588,6 @@ neigh_ifa_down(struct ifa *a) neigh_update(n, i); } -static inline void -neigh_prune_one(neighbor *n) -{ - if (n->proto->proto_state != PS_DOWN) - return; - - neigh_free(n); -} - -/** - * neigh_prune - prune neighbor cache - * - * neigh_prune() examines all neighbor entries cached and removes those - * corresponding to inactive protocols. It's called whenever a protocol - * is shut down to get rid of all its heritage. - */ -void -neigh_prune(void) -{ - neighbor *n; - node *m; - int i; - - DBG("Pruning neighbors\n"); - for(i=0; i<NEIGH_HASH_SIZE; i++) - WALK_LIST_DELSAFE(n, m, neigh_hash_table[i]) - neigh_prune_one(n); -} - /** * neigh_init - initialize the neighbor cache. * @if_pool: resource pool to be used for neighbor entries. diff --git a/nest/proto.c b/nest/proto.c index 885a0b75..21460e56 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -16,7 +16,7 @@ #include "lib/timer.h" #include "lib/string.h" #include "conf/conf.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/iface.h" #include "nest/cli.h" #include "filter/filter.h" @@ -26,12 +26,10 @@ pool *proto_pool; list STATIC_LIST_INIT(proto_list); static list STATIC_LIST_INIT(protocol_list); -struct protocol *class_to_protocol[PROTOCOL__MAX]; #define CD(c, msg, args...) ({ if (c->debug & D_STATES) log(L_TRACE "%s.%s: " msg, c->proto->name, c->name ?: "?", ## args); }) #define PD(p, msg, args...) ({ if (p->debug & D_STATES) log(L_TRACE "%s: " msg, p->name, ## args); }) -static timer *proto_shutdown_timer; static timer *gr_wait_timer; #define GRS_NONE 0 @@ -43,24 +41,26 @@ static int graceful_restart_state; static u32 graceful_restart_locks; static char *p_states[] = { "DOWN", "START", "UP", "STOP" }; -static char *c_states[] = { "DOWN", "START", "UP", "FLUSHING" }; -static char *e_states[] = { "DOWN", "FEEDING", "READY" }; +static char *c_states[] = { "DOWN", "START", "UP", "STOP", "RESTART" }; extern struct protocol proto_unix_iface; static void channel_request_reload(struct channel *c); -static void proto_shutdown_loop(timer *); static void proto_rethink_goal(struct proto *p); static char *proto_state_name(struct proto *p); -static void channel_verify_limits(struct channel *c); -static inline void channel_reset_limit(struct channel_limit *l); - +static void channel_init_limit(struct channel *c, struct limit *l, int dir, struct channel_limit *cf); +static void channel_update_limit(struct channel *c, struct limit *l, int dir, struct channel_limit *cf); +static void channel_reset_limit(struct channel *c, struct limit *l, int dir); +static void channel_feed_end(struct channel *c); +static void channel_stop_export(struct channel *c); +static void channel_export_stopped(struct rt_export_request *req); +static void channel_check_stopped(struct channel *c); static inline int proto_is_done(struct proto *p) -{ return (p->proto_state == PS_DOWN) && (p->active_channels == 0); } +{ return (p->proto_state == PS_DOWN) && proto_is_inactive(p); } static inline int channel_is_active(struct channel *c) -{ return (c->channel_state == CS_START) || (c->channel_state == CS_UP); } +{ return (c->channel_state != CS_DOWN); } static inline int channel_reloadable(struct channel *c) { return c->proto->reload_routes && c->reloadable; } @@ -68,10 +68,46 @@ static inline int channel_reloadable(struct channel *c) static inline void channel_log_state_change(struct channel *c) { - if (c->export_state) - CD(c, "State changed to %s/%s", c_states[c->channel_state], e_states[c->export_state]); - else - CD(c, "State changed to %s", c_states[c->channel_state]); + CD(c, "State changed to %s", c_states[c->channel_state]); +} + +void +channel_import_log_state_change(struct rt_import_request *req, u8 state) +{ + struct channel *c = SKIP_BACK(struct channel, in_req, req); + CD(c, "Channel import state changed to %s", rt_import_state_name(state)); +} + +void +channel_export_log_state_change(struct rt_export_request *req, u8 state) +{ + struct channel *c = SKIP_BACK(struct channel, out_req, req); + CD(c, "Channel export state changed to %s", rt_export_state_name(state)); + + switch (state) + { + case TES_FEEDING: + if (c->proto->feed_begin) + c->proto->feed_begin(c, !c->refeeding); + break; + case TES_READY: + channel_feed_end(c); + break; + } +} + +static void +channel_dump_import_req(struct rt_import_request *req) +{ + struct channel *c = SKIP_BACK(struct channel, in_req, req); + debug(" Channel %s.%s import request %p\n", c->proto->name, c->name, req); +} + +static void +channel_dump_export_req(struct rt_export_request *req) +{ + struct channel *c = SKIP_BACK(struct channel, out_req, req); + debug(" Channel %s.%s export request %p\n", c->proto->name, c->name, req); } static void @@ -111,7 +147,7 @@ proto_cf_find_channel(struct proto_config *pc, uint net_type) * Returns pointer to channel or NULL */ struct channel * -proto_find_channel_by_table(struct proto *p, struct rtable *t) +proto_find_channel_by_table(struct proto *p, rtable *t) { struct channel *c; @@ -165,23 +201,25 @@ proto_add_channel(struct proto *p, struct channel_config *cf) c->channel = cf->channel; c->proto = p; c->table = cf->table->table; + rt_lock_table(c->table); c->in_filter = cf->in_filter; c->out_filter = cf->out_filter; - c->rx_limit = cf->rx_limit; - c->in_limit = cf->in_limit; - c->out_limit = cf->out_limit; + c->out_subprefix = cf->out_subprefix; + + channel_init_limit(c, &c->rx_limit, PLD_RX, &cf->rx_limit); + channel_init_limit(c, &c->in_limit, PLD_IN, &cf->in_limit); + channel_init_limit(c, &c->out_limit, PLD_OUT, &cf->out_limit); c->net_type = cf->net_type; c->ra_mode = cf->ra_mode; c->preference = cf->preference; c->debug = cf->debug; c->merge_limit = cf->merge_limit; - c->in_keep_filtered = cf->in_keep_filtered; + c->in_keep = cf->in_keep; c->rpki_reload = cf->rpki_reload; c->channel_state = CS_DOWN; - c->export_state = ES_DOWN; c->last_state_change = current_time(); c->reloadable = 1; @@ -203,6 +241,7 @@ proto_remove_channel(struct proto *p UNUSED, struct channel *c) CD(c, "Removed", c->name); + rt_unlock_table(c->table); rem_node(&c->n); mb_free(c); } @@ -223,7 +262,7 @@ proto_pause_channels(struct proto *p) struct channel *c; WALK_LIST(c, p->channels) if (!c->disabled && channel_is_active(c)) - channel_set_state(c, CS_START); + channel_set_state(c, CS_PAUSE); } static void @@ -232,7 +271,7 @@ proto_stop_channels(struct proto *p) struct channel *c; WALK_LIST(c, p->channels) if (!c->disabled && channel_is_active(c)) - channel_set_state(c, CS_FLUSHING); + channel_set_state(c, CS_STOP); } static void @@ -243,122 +282,76 @@ proto_remove_channels(struct proto *p) proto_remove_channel(p, c); } +struct roa_subscription { + node roa_node; + struct settle settle; + struct channel *c; + struct rt_export_request req; +}; + static void -channel_schedule_feed(struct channel *c, int initial) +channel_roa_in_changed(struct settle *se) { - // DBG("%s: Scheduling meal\n", p->name); - ASSERT(c->channel_state == CS_UP); - - c->export_state = ES_FEEDING; - c->refeeding = !initial; + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, settle, se); + struct channel *c = s->c; - ev_schedule_work(c->feed_event); + CD(c, "Reload triggered by RPKI change"); + channel_request_reload(c); } static void -channel_feed_loop(void *ptr) +channel_roa_out_changed(struct settle *se) { - struct channel *c = ptr; - - if (c->export_state != ES_FEEDING) - return; - - /* Start feeding */ - if (!c->feed_active) - { - if (c->proto->feed_begin) - c->proto->feed_begin(c, !c->refeeding); - - c->refeed_pending = 0; - } - - // DBG("Feeding protocol %s continued\n", p->name); - if (!rt_feed_channel(c)) - { - ev_schedule_work(c->feed_event); - return; - } - - /* Reset export limit if the feed ended with acceptable number of exported routes */ - struct channel_limit *l = &c->out_limit; - if (c->refeeding && - (l->state == PLS_BLOCKED) && - (c->refeed_count <= l->limit) && - (c->stats.exp_routes <= l->limit)) - { - log(L_INFO "Protocol %s resets route export limit (%u)", c->proto->name, l->limit); - channel_reset_limit(&c->out_limit); - - /* Continue in feed - it will process routing table again from beginning */ - c->refeed_count = 0; - ev_schedule_work(c->feed_event); - return; - } - - // DBG("Feeding protocol %s finished\n", p->name); - c->export_state = ES_READY; - channel_log_state_change(c); + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, settle, se); + struct channel *c = s->c; - if (c->proto->feed_end) - c->proto->feed_end(c); + CD(c, "Feeding triggered by RPKI change"); - /* Restart feeding */ - if (c->refeed_pending) - channel_request_feeding(c); + c->refeed_pending = 1; + channel_stop_export(c); } - static void -channel_roa_in_changed(struct rt_subscription *s) +channel_export_one_roa(struct rt_export_request *req, const net_addr *net UNUSED, struct rt_pending_export *first) { - struct channel *c = s->data; - int active = c->reload_event && ev_active(c->reload_event); + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, req, req); - CD(c, "Reload triggered by RPKI change%s", active ? " - already active" : ""); + /* TODO: use the information about what roa has changed */ + settle_kick(&s->settle, s->c->proto->loop); - if (!active) - channel_request_reload(c); - else - c->reload_pending = 1; + rpe_mark_seen_all(req->hook, first, NULL, NULL); } static void -channel_roa_out_changed(struct rt_subscription *s) +channel_dump_roa_req(struct rt_export_request *req) { - struct channel *c = s->data; - int active = (c->export_state == ES_FEEDING); + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, req, req); + struct channel *c = s->c; + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter.e, req->hook->table); - CD(c, "Feeding triggered by RPKI change%s", active ? " - already active" : ""); - - if (!active) - channel_request_feeding(c); - else - c->refeed_pending = 1; + debug(" Channel %s.%s ROA %s change notifier from table %s request %p\n", + c->proto->name, c->name, + (s->settle.hook == channel_roa_in_changed) ? "import" : "export", + tab->name, req); } -/* Temporary code, subscriptions should be changed to resources */ -struct roa_subscription { - struct rt_subscription s; - node roa_node; -}; - static int channel_roa_is_subscribed(struct channel *c, rtable *tab, int dir) { - void (*hook)(struct rt_subscription *) = + void (*hook)(struct settle *) = dir ? channel_roa_in_changed : channel_roa_out_changed; struct roa_subscription *s; node *n; WALK_LIST2(s, n, c->roa_subscriptions, roa_node) - if ((s->s.tab == tab) && (s->s.hook == hook)) + if ((tab == SKIP_BACK(rtable, priv.exporter.e, s->req.hook->table)) + && (s->settle.hook == hook)) return 1; return 0; } - static void channel_roa_subscribe(struct channel *c, rtable *tab, int dir) { @@ -367,26 +360,47 @@ channel_roa_subscribe(struct channel *c, rtable *tab, int dir) struct roa_subscription *s = mb_allocz(c->proto->pool, sizeof(struct roa_subscription)); - s->s.hook = dir ? channel_roa_in_changed : channel_roa_out_changed; - s->s.data = c; - rt_subscribe(tab, &s->s); + *s = (struct roa_subscription) { + .settle = SETTLE_INIT(&c->roa_settle, dir ? channel_roa_in_changed : channel_roa_out_changed, NULL), + .c = c, + .req = { + .name = mb_sprintf(c->proto->pool, "%s.%s.roa-%s.%s", + c->proto->name, c->name, dir ? "in" : "out", tab->name), + .list = proto_work_list(c->proto), + .trace_routes = c->debug | c->proto->debug, + .dump_req = channel_dump_roa_req, + .export_one = channel_export_one_roa, + }, + }; add_tail(&c->roa_subscriptions, &s->roa_node); + rt_request_export(tab, &s->req); } static void -channel_roa_unsubscribe(struct roa_subscription *s) +channel_roa_unsubscribed(struct rt_export_request *req) { - rt_unsubscribe(&s->s); + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, req, req); + struct channel *c = s->c; + rem_node(&s->roa_node); mb_free(s); + + channel_check_stopped(c); +} + +static void +channel_roa_unsubscribe(struct roa_subscription *s) +{ + rt_stop_export(&s->req, channel_roa_unsubscribed); + settle_cancel(&s->settle); } static void channel_roa_subscribe_filter(struct channel *c, int dir) { const struct filter *f = dir ? c->in_filter : c->out_filter; - struct rtable *tab; + rtable *tab; int valid = 1, found = 0; if ((f == FILTER_ACCEPT) || (f == FILTER_REJECT)) @@ -399,7 +413,7 @@ channel_roa_subscribe_filter(struct channel *c, int dir) #ifdef CONFIG_BGP /* No automatic reload for BGP channels without in_table / out_table */ if (c->channel == &channel_bgp) - valid = dir ? !!c->in_table : !!c->out_table; + valid = dir ? ((c->in_keep & RIK_PREFILTER) == RIK_PREFILTER) : !!c->out_table; #endif struct filter_iterator fit; @@ -409,14 +423,8 @@ channel_roa_subscribe_filter(struct channel *c, int dir) { switch (fi->fi_code) { - case FI_ROA_CHECK_IMPLICIT: - tab = fi->i_FI_ROA_CHECK_IMPLICIT.rtc->table; - if (valid) channel_roa_subscribe(c, tab, dir); - found = 1; - break; - - case FI_ROA_CHECK_EXPLICIT: - tab = fi->i_FI_ROA_CHECK_EXPLICIT.rtc->table; + case FI_ROA_CHECK: + tab = fi->i_FI_ROA_CHECK.rtc->table; if (valid) channel_roa_subscribe(c, tab, dir); found = 1; break; @@ -445,119 +453,257 @@ channel_roa_unsubscribe_all(struct channel *c) } static void -channel_start_export(struct channel *c) +channel_start_import(struct channel *c) { + if (c->in_req.hook) + { + log(L_WARN "%s.%s: Attempted to start channel's already started import", c->proto->name, c->name); + return; + } + + c->in_req = (struct rt_import_request) { + .name = mb_sprintf(c->proto->pool, "%s.%s", c->proto->name, c->name), + .trace_routes = c->debug | c->proto->debug, + .list = proto_work_list(c->proto), + .dump_req = channel_dump_import_req, + .log_state_change = channel_import_log_state_change, + .preimport = channel_preimport, + }; + ASSERT(c->channel_state == CS_UP); - ASSERT(c->export_state == ES_DOWN); - channel_schedule_feed(c, 1); /* Sets ES_FEEDING */ + channel_reset_limit(c, &c->rx_limit, PLD_RX); + channel_reset_limit(c, &c->in_limit, PLD_IN); + + memset(&c->import_stats, 0, sizeof(struct channel_import_stats)); + + DBG("%s.%s: Channel start import req=%p\n", c->proto->name, c->name, &c->in_req); + rt_request_import(c->table, &c->in_req); } static void -channel_stop_export(struct channel *c) +channel_start_export(struct channel *c) { - /* Need to abort feeding */ - if (c->export_state == ES_FEEDING) - rt_feed_channel_abort(c); + if (c->out_req.hook) + { + log(L_WARN "%s.%s: Attempted to start channel's already started export", c->proto->name, c->name); + return; + } + + ASSERT(c->channel_state == CS_UP); + + c->out_req = (struct rt_export_request) { + .name = mb_sprintf(c->proto->pool, "%s.%s", c->proto->name, c->name), + .list = proto_work_list(c->proto), + .addr = c->out_subprefix, + .addr_mode = c->out_subprefix ? TE_ADDR_IN : TE_ADDR_NONE, + .trace_routes = c->debug | c->proto->debug, + .dump_req = channel_dump_export_req, + .log_state_change = channel_export_log_state_change, + }; + + bmap_init(&c->export_map, c->proto->pool, 16); + bmap_init(&c->export_reject_map, c->proto->pool, 16); + + channel_reset_limit(c, &c->out_limit, PLD_OUT); + + memset(&c->export_stats, 0, sizeof(struct channel_export_stats)); + + switch (c->ra_mode) { + case RA_OPTIMAL: + c->out_req.export_one = rt_notify_optimal; + break; + case RA_ANY: + c->out_req.export_one = rt_notify_any; + c->out_req.export_bulk = rt_feed_any; + break; + case RA_ACCEPTED: + c->out_req.export_bulk = rt_notify_accepted; + break; + case RA_MERGED: + c->out_req.export_bulk = rt_notify_merged; + break; + default: + bug("Unknown route announcement mode"); + } - c->export_state = ES_DOWN; - c->stats.exp_routes = 0; - bmap_reset(&c->export_map, 1024); + DBG("%s.%s: Channel start export req=%p\n", c->proto->name, c->name, &c->out_req); + rt_request_export(c->table, &c->out_req); } +static void +channel_check_stopped(struct channel *c) +{ + switch (c->channel_state) + { + case CS_STOP: + if (!EMPTY_LIST(c->roa_subscriptions) || c->out_req.hook || c->in_req.hook || c->reload_req.hook) + return; + + channel_set_state(c, CS_DOWN); + proto_send_event(c->proto, c->proto->event); + + break; + case CS_PAUSE: + if (!EMPTY_LIST(c->roa_subscriptions) || c->out_req.hook || c->reload_req.hook) + return; + + channel_set_state(c, CS_START); + break; + } + + DBG("%s.%s: Channel requests/hooks stopped (in state %s)\n", c->proto->name, c->name, c_states[c->channel_state]); +} -/* Called by protocol for reload from in_table */ void -channel_schedule_reload(struct channel *c) +channel_import_stopped(struct rt_import_request *req) { - ASSERT(c->channel_state == CS_UP); + struct channel *c = SKIP_BACK(struct channel, in_req, req); + + mb_free(c->in_req.name); + c->in_req.name = NULL; - rt_reload_channel_abort(c); - ev_schedule_work(c->reload_event); + channel_check_stopped(c); } static void -channel_reload_loop(void *ptr) +channel_export_stopped(struct rt_export_request *req) { - struct channel *c = ptr; + struct channel *c = SKIP_BACK(struct channel, out_req, req); - /* Start reload */ - if (!c->reload_active) - c->reload_pending = 0; + /* The hook has already stopped */ + req->hook = NULL; - if (!rt_reload_channel(c)) + if (c->refeed_pending) { - ev_schedule_work(c->reload_event); + c->refeeding = 1; + c->refeed_pending = 0; + + channel_reset_limit(c, &c->out_limit, PLD_OUT); + + bmap_reset(&c->export_map, 16); + bmap_reset(&c->export_reject_map, 16); + + rt_request_export(c->table, req); return; } - /* Restart reload */ - if (c->reload_pending) - channel_request_reload(c); + mb_free(c->out_req.name); + c->out_req.name = NULL; + + bmap_free(&c->export_map); + bmap_free(&c->export_reject_map); + + channel_check_stopped(c); } static void -channel_reset_import(struct channel *c) +channel_feed_end(struct channel *c) { - /* Need to abort feeding */ - ev_postpone(c->reload_event); - rt_reload_channel_abort(c); + /* Reset export limit if the feed ended with acceptable number of exported routes */ + struct limit *l = &c->out_limit; + if (c->refeeding && + (c->limit_active & (1 << PLD_OUT)) && + (c->refeed_count <= l->max) && + (l->count <= l->max)) + { + log(L_INFO "Protocol %s resets route export limit (%u)", c->proto->name, l->max); + channel_reset_limit(c, &c->out_limit, PLD_OUT); + + c->refeed_pending = 1; + channel_stop_export(c); + return; + } + + if (c->proto->feed_end) + c->proto->feed_end(c); - rt_prune_sync(c->in_table, 1); + if (c->refeed_pending) + channel_stop_export(c); + else + c->refeeding = 0; } -static void -channel_reset_export(struct channel *c) +/* Called by protocol for reload from in_table */ +void +channel_schedule_reload(struct channel *c) { - /* Just free the routes */ - rt_prune_sync(c->out_table, 1); + ASSERT(c->in_req.hook); + + if (c->reload_req.hook) + { + CD(c, "Reload triggered before the previous one has finished"); + c->reload_pending = 1; + return; + } + + rt_refresh_begin(&c->in_req); + rt_request_export(c->table, &c->reload_req); } -/* Called by protocol to activate in_table */ -void -channel_setup_in_table(struct channel *c) +static void +channel_reload_stopped(struct rt_export_request *req) { - struct rtable_config *cf = mb_allocz(c->proto->pool, sizeof(struct rtable_config)); + struct channel *c = SKIP_BACK(struct channel, reload_req, req); + + /* Restart reload */ + if (c->reload_pending) + { + c->reload_pending = 0; + channel_request_reload(c); + } + + if (c->channel_state != CS_UP) + channel_check_stopped(c); +} - cf->name = "import"; - cf->addr_type = c->net_type; - cf->internal = 1; +static void +channel_reload_log_state_change(struct rt_export_request *req, u8 state) +{ + struct channel *c = SKIP_BACK(struct channel, reload_req, req); - c->in_table = rt_setup(c->proto->pool, cf); + if (state == TES_READY) + { + if (c->channel_state == CS_UP) + rt_refresh_end(&c->in_req); - c->reload_event = ev_new_init(c->proto->pool, channel_reload_loop, c); + rt_stop_export(req, channel_reload_stopped); + } } -/* Called by protocol to activate out_table */ -void -channel_setup_out_table(struct channel *c) +static void +channel_reload_dump_req(struct rt_export_request *req) { - struct rtable_config *cf = mb_allocz(c->proto->pool, sizeof(struct rtable_config)); - cf->name = "export"; - cf->addr_type = c->net_type; - cf->internal = 1; + struct channel *c = SKIP_BACK(struct channel, reload_req, req); + debug(" Channel %s.%s import reload request %p\n", c->proto->name, c->name, req); +} - c->out_table = rt_setup(c->proto->pool, cf); +/* Called by protocol to activate in_table */ +static void +channel_setup_in_table(struct channel *c) +{ + c->reload_req = (struct rt_export_request) { + .name = mb_sprintf(c->proto->pool, "%s.%s.import", c->proto->name, c->name), + .list = proto_work_list(c->proto), + .trace_routes = c->debug | c->proto->debug, + .export_bulk = channel_reload_export_bulk, + .dump_req = channel_reload_dump_req, + .log_state_change = channel_reload_log_state_change, + }; } static void channel_do_start(struct channel *c) { - rt_lock_table(c->table); - add_tail(&c->table->channels, &c->table_node); c->proto->active_channels++; - c->feed_event = ev_new_init(c->proto->pool, channel_feed_loop, c); - - bmap_init(&c->export_map, c->proto->pool, 1024); - memset(&c->stats, 0, sizeof(struct proto_stats)); - - channel_reset_limit(&c->rx_limit); - channel_reset_limit(&c->in_limit); - channel_reset_limit(&c->out_limit); + if ((c->in_keep & RIK_PREFILTER) == RIK_PREFILTER) + channel_setup_in_table(c); CALL(c->channel->start, c); + + channel_start_import(c); } static void @@ -572,9 +718,28 @@ channel_do_up(struct channel *c) } static void -channel_do_flush(struct channel *c) +channel_do_pause(struct channel *c) +{ + /* Drop ROA subscriptions */ + channel_roa_unsubscribe_all(c); + + /* Need to abort feeding */ + c->reload_pending = 0; + + if (c->reload_req.hook && c->reload_req.hook->export_state != TES_STOP) + rt_stop_export(&c->reload_req, channel_reload_stopped); + + /* Stop export */ + c->refeed_pending = 0; + channel_stop_export(c); +} + +static void +channel_do_stop(struct channel *c) { - rt_schedule_prune(c->table); + /* Stop import */ + if (c->in_req.hook) + rt_stop_import(&c->in_req, channel_import_stopped); c->gr_wait = 0; if (c->gr_lock) @@ -582,32 +747,18 @@ channel_do_flush(struct channel *c) CALL(c->channel->shutdown, c); - /* This have to be done in here, as channel pool is freed before channel_do_down() */ - bmap_free(&c->export_map); - c->in_table = NULL; - c->reload_event = NULL; - c->out_table = NULL; - - channel_roa_unsubscribe_all(c); } static void channel_do_down(struct channel *c) { - ASSERT(!c->feed_active && !c->reload_active); + ASSERT(!c->reload_req.hook); - rem_node(&c->table_node); - rt_unlock_table(c->table); c->proto->active_channels--; - if ((c->stats.imp_routes + c->stats.filt_routes) != 0) - log(L_ERR "%s: Channel %s is down but still has some routes", c->proto->name, c->name); + memset(&c->import_stats, 0, sizeof(struct channel_import_stats)); + memset(&c->export_stats, 0, sizeof(struct channel_export_stats)); - // bmap_free(&c->export_map); - memset(&c->stats, 0, sizeof(struct proto_stats)); - - c->in_table = NULL; - c->reload_event = NULL; c->out_table = NULL; /* The in_table and out_table are going to be freed by freeing their resource pools. */ @@ -616,14 +767,13 @@ channel_do_down(struct channel *c) /* Schedule protocol shutddown */ if (proto_is_done(c->proto)) - ev_schedule(c->proto->event); + proto_send_event(c->proto, c->proto->event); } void channel_set_state(struct channel *c, uint state) { uint cs = c->channel_state; - uint es = c->export_state; DBG("%s reporting channel %s state transition %s -> %s\n", c->proto->name, c->name, c_states[cs], c_states[state]); if (state == cs) @@ -635,20 +785,11 @@ channel_set_state(struct channel *c, uint state) switch (state) { case CS_START: - ASSERT(cs == CS_DOWN || cs == CS_UP); + ASSERT(cs == CS_DOWN || cs == CS_PAUSE); if (cs == CS_DOWN) channel_do_start(c); - if (es != ES_DOWN) - channel_stop_export(c); - - if (c->in_table && (cs == CS_UP)) - channel_reset_import(c); - - if (c->out_table && (cs == CS_UP)) - channel_reset_export(c); - break; case CS_UP: @@ -663,23 +804,24 @@ channel_set_state(struct channel *c, uint state) channel_do_up(c); break; - case CS_FLUSHING: - ASSERT(cs == CS_START || cs == CS_UP); + case CS_PAUSE: + ASSERT(cs == CS_UP); - if (es != ES_DOWN) - channel_stop_export(c); + if (cs == CS_UP) + channel_do_pause(c); + break; - if (c->in_table && (cs == CS_UP)) - channel_reset_import(c); + case CS_STOP: + ASSERT(cs == CS_UP || cs == CS_START || cs == CS_PAUSE); - if (c->out_table && (cs == CS_UP)) - channel_reset_export(c); + if (cs == CS_UP) + channel_do_pause(c); - channel_do_flush(c); + channel_do_stop(c); break; case CS_DOWN: - ASSERT(cs == CS_FLUSHING); + ASSERT(cs == CS_STOP); channel_do_down(c); break; @@ -704,47 +846,36 @@ channel_set_state(struct channel *c, uint state) void channel_request_feeding(struct channel *c) { - ASSERT(c->channel_state == CS_UP); + ASSERT(c->out_req.hook); - CD(c, "Feeding requested"); - - /* Do nothing if we are still waiting for feeding */ - if (c->export_state == ES_DOWN) + if (c->refeed_pending) return; - /* If we are already feeding, we want to restart it */ - if (c->export_state == ES_FEEDING) - { - /* Unless feeding is in initial state */ - if (!c->feed_active) - return; - - rt_feed_channel_abort(c); - } + c->refeed_pending = 1; + channel_stop_export(c); +} - /* Track number of exported routes during refeed */ - c->refeed_count = 0; +static void +channel_stop_export(struct channel *c) +{ + if (!c->out_req.hook || (c->out_req.hook->export_state == TES_STOP)) + return; - channel_schedule_feed(c, 0); /* Sets ES_FEEDING */ - channel_log_state_change(c); + rt_stop_export(&c->out_req, channel_export_stopped); } static void channel_request_reload(struct channel *c) { - ASSERT(c->channel_state == CS_UP); + ASSERT(c->in_req.hook); ASSERT(channel_reloadable(c)); CD(c, "Reload requested"); - c->proto->reload_routes(c); - - /* - * Should this be done before reload_routes() hook? - * Perhaps, but routes are updated asynchronously. - */ - channel_reset_limit(&c->rx_limit); - channel_reset_limit(&c->in_limit); + if ((c->in_keep & RIK_PREFILTER) == RIK_PREFILTER) + channel_schedule_reload(c); + else + c->proto->reload_routes(c); } const struct channel_class channel_basic = { @@ -766,7 +897,7 @@ channel_config_new(const struct channel_class *cc, const char *name, uint net_ty if (proto->net_type && (net_type != proto->net_type)) cf_error("Different channel type"); - tab = new_config->def_tables[net_type]; + tab = rt_get_default_table(new_config, net_type); } if (!cc) @@ -785,6 +916,11 @@ channel_config_new(const struct channel_class *cc, const char *name, uint net_ty cf->debug = new_config->channel_default_debug; cf->rpki_reload = 1; + cf->roa_settle = (struct settle_config) { + .min = 1 S, + .max = 20 S, + }; + add_tail(&proto->channels, &cf->n); return cf; @@ -834,7 +970,12 @@ channel_reconfigure(struct channel *c, struct channel_config *cf) c->stale = 0; /* FIXME: better handle these changes, also handle in_keep_filtered */ - if ((c->table != cf->table->table) || (cf->ra_mode && (c->ra_mode != cf->ra_mode))) + if ((c->table != cf->table->table) || + (cf->ra_mode && (c->ra_mode != cf->ra_mode)) || + (cf->in_keep != c->in_keep) || + cf->out_subprefix && c->out_subprefix && + !net_equal(cf->out_subprefix, c->out_subprefix) || + (!cf->out_subprefix != !c->out_subprefix)) return 0; /* Note that filter_same() requires arguments in (new, old) order */ @@ -851,18 +992,34 @@ channel_reconfigure(struct channel *c, struct channel_config *cf) /* Reconfigure channel fields */ c->in_filter = cf->in_filter; c->out_filter = cf->out_filter; - c->rx_limit = cf->rx_limit; - c->in_limit = cf->in_limit; - c->out_limit = cf->out_limit; + + channel_update_limit(c, &c->rx_limit, PLD_RX, &cf->rx_limit); + channel_update_limit(c, &c->in_limit, PLD_IN, &cf->in_limit); + channel_update_limit(c, &c->out_limit, PLD_OUT, &cf->out_limit); // c->ra_mode = cf->ra_mode; c->merge_limit = cf->merge_limit; c->preference = cf->preference; + c->out_req.addr = c->out_subprefix = cf->out_subprefix; c->debug = cf->debug; - c->in_keep_filtered = cf->in_keep_filtered; + c->in_req.trace_routes = c->out_req.trace_routes = c->debug | c->proto->debug; c->rpki_reload = cf->rpki_reload; - channel_verify_limits(c); + if ( (c->roa_settle.min != cf->roa_settle.min) + || (c->roa_settle.max != cf->roa_settle.max)) + { + c->roa_settle = cf->roa_settle; + + struct roa_subscription *s; + node *n; + + WALK_LIST2(s, n, c->roa_subscriptions, roa_node) + { + s->settle.cf = cf->roa_settle; + if (settle_active(&s->settle)) + settle_kick(&s->settle, &main_birdloop); + } + } /* Execute channel-specific reconfigure hook */ if (c->channel->reconfigure && !c->channel->reconfigure(c, cf, &import_changed, &export_changed)) @@ -954,34 +1111,50 @@ proto_configure_channel(struct proto *p, struct channel **pc, struct channel_con return 1; } +static void +proto_cleanup(struct proto *p) +{ + CALL(p->proto->cleanup, p); + + rfree(p->pool); + p->pool = NULL; + + p->active = 0; + proto_log_state_change(p); + proto_rethink_goal(p); +} static void -proto_event(void *ptr) +proto_loop_stopped(void *ptr) { struct proto *p = ptr; - if (p->do_start) - { - if_feed_baby(p); - p->do_start = 0; - } + birdloop_enter(&main_birdloop); + + birdloop_free(p->loop); + p->loop = &main_birdloop; + proto_cleanup(p); + + birdloop_leave(&main_birdloop); +} + +static void +proto_event(void *ptr) +{ + struct proto *p = ptr; if (p->do_stop) { - if (p->proto == &proto_unix_iface) - if_flush_ifaces(p); + iface_unsubscribe(&p->iface_sub); + p->do_stop = 0; } if (proto_is_done(p)) - { - if (p->proto->cleanup) - p->proto->cleanup(p); - - p->active = 0; - proto_log_state_change(p); - proto_rethink_goal(p); - } + if (p->loop != &main_birdloop) + birdloop_stop_self(p->loop, proto_loop_stopped, p); + else + proto_cleanup(p); } @@ -1022,10 +1195,10 @@ proto_init(struct proto_config *c, node *n) struct protocol *pr = c->protocol; struct proto *p = pr->init(c); + p->loop = &main_birdloop; p->proto_state = PS_DOWN; p->last_state_change = current_time(); p->vrf = c->vrf; - p->vrf_set = c->vrf_set; insert_node(&p->n, n); p->event = ev_new_init(proto_pool, proto_event, p); @@ -1038,11 +1211,21 @@ proto_init(struct proto_config *c, node *n) static void proto_start(struct proto *p) { - /* Here we cannot use p->cf->name since it won't survive reconfiguration */ - p->pool = rp_new(proto_pool, p->proto->name); + DBG("Kicking %s up\n", p->name); + PD(p, "Starting"); + + p->pool = rp_newf(proto_pool, "Protocol %s", p->cf->name); if (graceful_restart_state == GRS_INIT) p->gr_recovery = 1; + + if (p->cf->loop_order != DOMAIN_ORDER(the_bird)) + p->loop = birdloop_new(p->pool, p->cf->loop_order, p->pool->name); + + p->iface_sub.target = proto_event_list(p); + + PROTO_LOCKED_FROM_MAIN(p) + proto_notify_state(p, (p->proto->start ? p->proto->start(p) : PS_UP)); } @@ -1078,6 +1261,7 @@ proto_config_new(struct protocol *pr, int class) cf->class = class; cf->debug = new_config->proto_default_debug; cf->mrtdump = new_config->proto_default_mrtdump; + cf->loop_order = DOMAIN_ORDER(the_bird); init_list(&cf->channels); @@ -1193,11 +1377,10 @@ proto_reconfigure(struct proto *p, struct proto_config *oc, struct proto_config if ((nc->protocol != oc->protocol) || (nc->net_type != oc->net_type) || (nc->disabled != p->disabled) || - (nc->vrf != oc->vrf) || - (nc->vrf_set != oc->vrf_set)) + (nc->vrf != oc->vrf)) return 0; - p->name = nc->name; + p->sources.name = p->name = nc->name; p->debug = nc->debug; p->mrtdump = nc->mrtdump; reconfigure_type = type; @@ -1258,6 +1441,8 @@ protos_commit(struct config *new, struct config *old, int force_reconfig, int ty p = oc->proto; sym = cf_find_symbol(new, oc->name); + struct birdloop *proto_loop = PROTO_ENTER_FROM_MAIN(p); + /* Handle dynamic protocols */ if (!sym && oc->parent && !new->shutdown) { @@ -1283,8 +1468,11 @@ protos_commit(struct config *new, struct config *old, int force_reconfig, int ty nc->proto = p; /* We will try to reconfigure protocol p */ - if (! force_reconfig && proto_reconfigure(p, oc, nc, type)) + if (!force_reconfig && proto_reconfigure(p, oc, nc, type)) + { + PROTO_LEAVE_FROM_MAIN(proto_loop); continue; + } if (nc->parent) { @@ -1322,6 +1510,8 @@ protos_commit(struct config *new, struct config *old, int force_reconfig, int ty } p->reconfiguring = 1; + PROTO_LEAVE_FROM_MAIN(proto_loop); + config_add_obstacle(old); proto_rethink_goal(p); } @@ -1367,11 +1557,20 @@ protos_commit(struct config *new, struct config *old, int force_reconfig, int ty } static void -proto_rethink_goal(struct proto *p) +proto_shutdown(struct proto *p) { - struct protocol *q; - byte goal; + if (p->proto_state == PS_START || p->proto_state == PS_UP) + { + /* Going down */ + DBG("Kicking %s down\n", p->name); + PD(p, "Shutting down"); + proto_notify_state(p, (p->proto->shutdown ? p->proto->shutdown(p) : PS_DOWN)); + } +} +static void +proto_rethink_goal(struct proto *p) +{ if (p->reconfiguring && !p->active) { struct proto_config *nc = p->cf_new; @@ -1391,32 +1590,12 @@ proto_rethink_goal(struct proto *p) /* Determine what state we want to reach */ if (p->disabled || p->reconfiguring) - goal = PS_DOWN; - else - goal = PS_UP; - - q = p->proto; - if (goal == PS_UP) { - if (!p->active) - { - /* Going up */ - DBG("Kicking %s up\n", p->name); - PD(p, "Starting"); - proto_start(p); - proto_notify_state(p, (q->start ? q->start(p) : PS_UP)); - } - } - else - { - if (p->proto_state == PS_START || p->proto_state == PS_UP) - { - /* Going down */ - DBG("Kicking %s down\n", p->name); - PD(p, "Shutting down"); - proto_notify_state(p, (q->shutdown ? q->shutdown(p) : PS_DOWN)); - } + PROTO_LOCKED_FROM_MAIN(p) + proto_shutdown(p); } + else if (!p->active) + proto_start(p); } struct proto * @@ -1528,7 +1707,7 @@ graceful_restart_done(timer *t UNUSED) WALK_LIST(c, p->channels) { /* Resume postponed export of routes */ - if ((c->channel_state == CS_UP) && c->gr_wait && c->proto->rt_notify) + if ((c->channel_state == CS_UP) && c->gr_wait && p->rt_notify) channel_start_export(c); /* Cleanup */ @@ -1616,9 +1795,13 @@ protos_dump_all(void) debug("Protocols:\n"); struct proto *p; - WALK_LIST(p, proto_list) + WALK_LIST(p, proto_list) PROTO_LOCKED_FROM_MAIN(p) { - debug(" protocol %s state %s\n", p->name, p_states[p->proto_state]); +#define DPF(x) (p->x ? " " #x : "") + debug(" protocol %s (%p) state %s with %d active channels flags: %s%s%s%s\n", + p->name, p, p_states[p->proto_state], p->active_channels, + DPF(disabled), DPF(active), DPF(do_stop), DPF(reconfiguring)); +#undef DPF struct channel *c; WALK_LIST(c, p->channels) @@ -1628,6 +1811,9 @@ protos_dump_all(void) debug("\tInput filter: %s\n", filter_name(c->in_filter)); if (c->out_filter) debug("\tOutput filter: %s\n", filter_name(c->out_filter)); + debug("\tChannel state: %s/%s/%s\n", c_states[c->channel_state], + c->in_req.hook ? rt_import_state_name(rt_import_get_state(c->in_req.hook)) : "-", + c->out_req.hook ? rt_export_state_name(rt_export_get_state(c->out_req.hook)) : "-"); } if (p->proto->dump && (p->proto_state != PS_DOWN)) @@ -1647,9 +1833,6 @@ void proto_build(struct protocol *p) { add_tail(&protocol_list, &p->n); - ASSERT(p->class); - ASSERT(!class_to_protocol[p->class]); - class_to_protocol[p->class] = p; } /* FIXME: convert this call to some protocol hook */ @@ -1672,8 +1855,6 @@ protos_build(void) protos_build_gen(); proto_pool = rp_new(&root_pool, "Protocols"); - proto_shutdown_timer = tm_new(proto_pool); - proto_shutdown_timer->hook = proto_shutdown_loop; } @@ -1681,23 +1862,39 @@ protos_build(void) int proto_restart; static void -proto_shutdown_loop(timer *t UNUSED) +proto_restart_event_hook(void *_p) { - struct proto *p, *p_next; + struct proto *p = _p; + if (!p->down_sched) + return; - WALK_LIST_DELSAFE(p, p_next, proto_list) - if (p->down_sched) - { - proto_restart = (p->down_sched == PDS_RESTART); + proto_restart = (p->down_sched == PDS_RESTART); + p->disabled = 1; + proto_rethink_goal(p); - p->disabled = 1; - proto_rethink_goal(p); - if (proto_restart) - { - p->disabled = 0; - proto_rethink_goal(p); - } - } + p->restart_event = NULL; + p->restart_timer = NULL; + + if (proto_restart) + /* No need to call proto_rethink_goal() here again as the proto_cleanup() routine will + * call it after the protocol stops ... and both these routines are fixed to main_birdloop. + */ + p->disabled = 0; +} + +static void +proto_send_restart_event(struct proto *p) +{ + if (!p->restart_event) + p->restart_event = ev_new_init(p->pool, proto_restart_event_hook, p); + + ev_send(&global_event_list, p->restart_event); +} + +static void +proto_send_restart_event_from_timer(struct timer *t) +{ + proto_send_restart_event((struct proto *) t->data); } static inline void @@ -1712,7 +1909,21 @@ proto_schedule_down(struct proto *p, byte restart, byte code) p->down_sched = restart ? PDS_RESTART : PDS_DISABLE; p->down_code = code; - tm_start_max(proto_shutdown_timer, restart ? 250 MS : 0); + + if (!restart) + { + if (p->restart_timer && tm_active(p->restart_timer)) + tm_stop(p->restart_timer); + + proto_send_restart_event(p); + } + else + { + if (!p->restart_timer) + p->restart_timer = tm_new_init(p->pool, proto_send_restart_event_from_timer, p, 0, 0); + + tm_start_max_in(p->restart_timer, 250 MS, p->loop); + } } /** @@ -1749,108 +1960,132 @@ proto_set_message(struct proto *p, char *msg, int len) } -static const char * -channel_limit_name(struct channel_limit *l) -{ - const char *actions[] = { - [PLA_WARN] = "warn", - [PLA_BLOCK] = "block", - [PLA_RESTART] = "restart", - [PLA_DISABLE] = "disable", - }; +static const char * channel_limit_name[] = { + [PLA_WARN] = "warn", + [PLA_BLOCK] = "block", + [PLA_RESTART] = "restart", + [PLA_DISABLE] = "disable", +}; - return actions[l->action]; -} -/** - * channel_notify_limit: notify about limit hit and take appropriate action - * @c: channel - * @l: limit being hit - * @dir: limit direction (PLD_*) - * @rt_count: the number of routes - * - * The function is called by the route processing core when limit @l - * is breached. It activates the limit and tooks appropriate action - * according to @l->action. - */ -void -channel_notify_limit(struct channel *c, struct channel_limit *l, int dir, u32 rt_count) +static void +channel_log_limit(struct channel *c, struct limit *l, int dir) { const char *dir_name[PLD_MAX] = { "receive", "import" , "export" }; - const byte dir_down[PLD_MAX] = { PDC_RX_LIMIT_HIT, PDC_IN_LIMIT_HIT, PDC_OUT_LIMIT_HIT }; - struct proto *p = c->proto; + log(L_WARN "Channel %s.%s hits route %s limit (%d), action: %s", + c->proto->name, c->name, dir_name[dir], l->max, channel_limit_name[c->limit_actions[dir]]); +} - if (l->state == PLS_BLOCKED) +static void +channel_activate_limit(struct channel *c, struct limit *l, int dir) +{ + if (c->limit_active & (1 << dir)) return; - /* For warning action, we want the log message every time we hit the limit */ - if (!l->state || ((l->action == PLA_WARN) && (rt_count == l->limit))) - log(L_WARN "Protocol %s hits route %s limit (%d), action: %s", - p->name, dir_name[dir], l->limit, channel_limit_name(l)); + c->limit_active |= (1 << dir); + channel_log_limit(c, l, dir); +} - switch (l->action) - { - case PLA_WARN: - l->state = PLS_ACTIVE; - break; +static int +channel_limit_warn(struct limit *l, void *data) +{ + struct channel_limit_data *cld = data; + struct channel *c = cld->c; + int dir = cld->dir; - case PLA_BLOCK: - l->state = PLS_BLOCKED; - break; + channel_log_limit(c, l, dir); - case PLA_RESTART: - case PLA_DISABLE: - l->state = PLS_BLOCKED; - if (p->proto_state == PS_UP) - proto_schedule_down(p, l->action == PLA_RESTART, dir_down[dir]); - break; - } + return 0; } -static void -channel_verify_limits(struct channel *c) +static int +channel_limit_block(struct limit *l, void *data) +{ + struct channel_limit_data *cld = data; + struct channel *c = cld->c; + int dir = cld->dir; + + channel_activate_limit(c, l, dir); + + return 1; +} + +static const byte chl_dir_down[PLD_MAX] = { PDC_RX_LIMIT_HIT, PDC_IN_LIMIT_HIT, PDC_OUT_LIMIT_HIT }; + +static int +channel_limit_down(struct limit *l, void *data) { - struct channel_limit *l; - u32 all_routes = c->stats.imp_routes + c->stats.filt_routes; + struct channel_limit_data *cld = data; + struct channel *c = cld->c; + struct proto *p = c->proto; + int dir = cld->dir; - l = &c->rx_limit; - if (l->action && (all_routes > l->limit)) - channel_notify_limit(c, l, PLD_RX, all_routes); + channel_activate_limit(c, l, dir); - l = &c->in_limit; - if (l->action && (c->stats.imp_routes > l->limit)) - channel_notify_limit(c, l, PLD_IN, c->stats.imp_routes); + if (p->proto_state == PS_UP) + proto_schedule_down(p, c->limit_actions[dir] == PLA_RESTART, chl_dir_down[dir]); - l = &c->out_limit; - if (l->action && (c->stats.exp_routes > l->limit)) - channel_notify_limit(c, l, PLD_OUT, c->stats.exp_routes); + return 1; } -static inline void -channel_reset_limit(struct channel_limit *l) +static int (*channel_limit_action[])(struct limit *, void *) = { + [PLA_NONE] = NULL, + [PLA_WARN] = channel_limit_warn, + [PLA_BLOCK] = channel_limit_block, + [PLA_RESTART] = channel_limit_down, + [PLA_DISABLE] = channel_limit_down, +}; + +static void +channel_update_limit(struct channel *c, struct limit *l, int dir, struct channel_limit *cf) { - if (l->action) - l->state = PLS_INITIAL; + l->action = channel_limit_action[cf->action]; + c->limit_actions[dir] = cf->action; + + struct channel_limit_data cld = { .c = c, .dir = dir }; + limit_update(l, &cld, cf->action ? cf->limit : ~((u32) 0)); } +static void +channel_init_limit(struct channel *c, struct limit *l, int dir, struct channel_limit *cf) +{ + channel_reset_limit(c, l, dir); + channel_update_limit(c, l, dir, cf); +} + +static void +channel_reset_limit(struct channel *c, struct limit *l, int dir) +{ + limit_reset(l); + c->limit_active &= ~(1 << dir); +} + +static struct rte_owner_class default_rte_owner_class; + static inline void proto_do_start(struct proto *p) { p->active = 1; - p->do_start = 1; - ev_schedule(p->event); + + rt_init_sources(&p->sources, p->name, proto_event_list(p)); + if (!p->sources.class) + p->sources.class = &default_rte_owner_class; + + if (!p->cf->late_if_feed) + iface_subscribe(&p->iface_sub); } static void proto_do_up(struct proto *p) { if (!p->main_source) - { p->main_source = rt_get_source(p, 0); - rt_lock_source(p->main_source); - } + // Locked automaticaly proto_start_channels(p); + + if (p->cf->late_if_feed) + iface_subscribe(&p->iface_sub); } static inline void @@ -1865,9 +2100,6 @@ proto_do_stop(struct proto *p) p->down_sched = 0; p->gr_recovery = 0; - p->do_stop = 1; - ev_schedule(p->event); - if (p->main_source) { rt_unlock_source(p->main_source); @@ -1875,19 +2107,20 @@ proto_do_stop(struct proto *p) } proto_stop_channels(p); + rt_destroy_sources(&p->sources, p->event); + + p->do_stop = 1; + proto_send_event(p, p->event); } static void proto_do_down(struct proto *p) { p->down_code = 0; - neigh_prune(); - rfree(p->pool); - p->pool = NULL; /* Shutdown is finished in the protocol event */ if (proto_is_done(p)) - ev_schedule(p->event); + proto_send_event(p, p->event); } @@ -1978,38 +2211,58 @@ proto_state_name(struct proto *p) static void channel_show_stats(struct channel *c) { - struct proto_stats *s = &c->stats; + struct channel_import_stats *ch_is = &c->import_stats; + struct channel_export_stats *ch_es = &c->export_stats; + struct rt_import_stats *rt_is = c->in_req.hook ? &c->in_req.hook->stats : NULL; + struct rt_export_stats *rt_es = c->out_req.hook ? &c->out_req.hook->stats : NULL; - if (c->in_keep_filtered) +#define SON(ie, item) ((ie) ? (ie)->item : 0) +#define SCI(item) SON(ch_is, item) +#define SCE(item) SON(ch_es, item) +#define SRI(item) SON(rt_is, item) +#define SRE(item) SON(rt_es, item) + + u32 rx_routes = c->rx_limit.count; + u32 in_routes = c->in_limit.count; + u32 out_routes = c->out_limit.count; + + if (c->in_keep) cli_msg(-1006, " Routes: %u imported, %u filtered, %u exported, %u preferred", - s->imp_routes, s->filt_routes, s->exp_routes, s->pref_routes); + in_routes, (rx_routes - in_routes), out_routes, SRI(pref)); else cli_msg(-1006, " Routes: %u imported, %u exported, %u preferred", - s->imp_routes, s->exp_routes, s->pref_routes); - - cli_msg(-1006, " Route change stats: received rejected filtered ignored accepted"); - cli_msg(-1006, " Import updates: %10u %10u %10u %10u %10u", - s->imp_updates_received, s->imp_updates_invalid, - s->imp_updates_filtered, s->imp_updates_ignored, - s->imp_updates_accepted); - cli_msg(-1006, " Import withdraws: %10u %10u --- %10u %10u", - s->imp_withdraws_received, s->imp_withdraws_invalid, - s->imp_withdraws_ignored, s->imp_withdraws_accepted); - cli_msg(-1006, " Export updates: %10u %10u %10u --- %10u", - s->exp_updates_received, s->exp_updates_rejected, - s->exp_updates_filtered, s->exp_updates_accepted); - cli_msg(-1006, " Export withdraws: %10u --- --- --- %10u", - s->exp_withdraws_received, s->exp_withdraws_accepted); + in_routes, out_routes, SRI(pref)); + + cli_msg(-1006, " Route change stats: received rejected filtered ignored RX limit IN limit accepted"); + cli_msg(-1006, " Import updates: %10u %10u %10u %10u %10u %10u %10u", + SCI(updates_received), SCI(updates_invalid), + SCI(updates_filtered), SRI(updates_ignored), + SCI(updates_limited_rx), SCI(updates_limited_in), + SRI(updates_accepted)); + cli_msg(-1006, " Import withdraws: %10u %10u --- %10u --- %10u", + SCI(withdraws_received), SCI(withdraws_invalid), + SRI(withdraws_ignored), SRI(withdraws_accepted)); + cli_msg(-1006, " Export updates: %10u %10u %10u --- %10u %10u", + SRE(updates_received), SCE(updates_rejected), + SCE(updates_filtered), SCE(updates_limited), SCE(updates_accepted)); + cli_msg(-1006, " Export withdraws: %10u --- --- --- ---%10u", + SRE(withdraws_received), SCE(withdraws_accepted)); + +#undef SRI +#undef SRE +#undef SCI +#undef SCE +#undef SON } void -channel_show_limit(struct channel_limit *l, const char *dsc) +channel_show_limit(struct limit *l, const char *dsc, int active, int action) { if (!l->action) return; - cli_msg(-1006, " %-16s%d%s", dsc, l->limit, l->state ? " [HIT]" : ""); - cli_msg(-1006, " Action: %s", channel_limit_name(l)); + cli_msg(-1006, " %-16s%d%s", dsc, l->max, active ? " [HIT]" : ""); + cli_msg(-1006, " Action: %s", channel_limit_name[action]); } void @@ -2017,6 +2270,8 @@ channel_show_info(struct channel *c) { cli_msg(-1006, " Channel %s", c->name); cli_msg(-1006, " State: %s", c_states[c->channel_state]); + cli_msg(-1006, " Import state: %s", rt_import_state_name(rt_import_get_state(c->in_req.hook))); + cli_msg(-1006, " Export state: %s", rt_export_state_name(rt_export_get_state(c->out_req.hook))); cli_msg(-1006, " Table: %s", c->table->name); cli_msg(-1006, " Preference: %d", c->preference); cli_msg(-1006, " Input filter: %s", filter_name(c->in_filter)); @@ -2027,9 +2282,9 @@ channel_show_info(struct channel *c) c->gr_lock ? " pending" : "", c->gr_wait ? " waiting" : ""); - channel_show_limit(&c->rx_limit, "Receive limit:"); - channel_show_limit(&c->in_limit, "Import limit:"); - channel_show_limit(&c->out_limit, "Export limit:"); + channel_show_limit(&c->rx_limit, "Receive limit:", c->limit_active & (1 << PLD_RX), c->limit_actions[PLD_RX]); + channel_show_limit(&c->in_limit, "Import limit:", c->limit_active & (1 << PLD_IN), c->limit_actions[PLD_IN]); + channel_show_limit(&c->out_limit, "Export limit:", c->limit_active & (1 << PLD_OUT), c->limit_actions[PLD_OUT]); if (c->channel_state != CS_DOWN) channel_show_stats(c); @@ -2075,8 +2330,8 @@ proto_cmd_show(struct proto *p, uintptr_t verbose, int cnt) cli_msg(-1006, " Message: %s", p->message); if (p->cf->router_id) cli_msg(-1006, " Router ID: %R", p->cf->router_id); - if (p->vrf_set) - cli_msg(-1006, " VRF: %s", p->vrf ? p->vrf->name : "default"); + if (p->vrf) + cli_msg(-1006, " VRF: %s", p->vrf->name); if (p->proto->show_proto_info) p->proto->show_proto_info(p); @@ -2104,7 +2359,7 @@ proto_cmd_disable(struct proto *p, uintptr_t arg, int cnt UNUSED) p->disabled = 1; p->down_code = PDC_CMD_DISABLE; proto_set_message(p, (char *) arg, -1); - proto_rethink_goal(p); + proto_shutdown(p); cli_msg(-9, "%s: disabled", p->name); } @@ -2137,9 +2392,9 @@ proto_cmd_restart(struct proto *p, uintptr_t arg, int cnt UNUSED) p->disabled = 1; p->down_code = PDC_CMD_RESTART; proto_set_message(p, (char *) arg, -1); - proto_rethink_goal(p); + proto_shutdown(p); p->disabled = 0; - proto_rethink_goal(p); + /* After the protocol shuts down, proto_rethink_goal() is run from proto_event. */ cli_msg(-12, "%s: restarted", p->name); } @@ -2214,7 +2469,9 @@ proto_apply_cmd_symbol(const struct symbol *s, void (* cmd)(struct proto *, uint if (s->proto->proto) { - cmd(s->proto->proto, arg, 0); + struct proto *p = s->proto->proto; + PROTO_LOCKED_FROM_MAIN(p) + cmd(p, arg, 0); cli_msg(0, ""); } else @@ -2229,7 +2486,8 @@ proto_apply_cmd_patt(const char *patt, void (* cmd)(struct proto *, uintptr_t, i WALK_LIST(p, proto_list) if (!patt || patmatch(patt, p->name)) - cmd(p, arg, cnt++); + PROTO_LOCKED_FROM_MAIN(p) + cmd(p, arg, cnt++); if (!cnt) cli_msg(8003, "No protocols match"); diff --git a/nest/protocol.h b/nest/protocol.h index fcbf0539..fe987d17 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -9,15 +9,17 @@ #ifndef _BIRD_PROTOCOL_H_ #define _BIRD_PROTOCOL_H_ -#include "lib/lists.h" +#include "lib/tlists.h" #include "lib/resource.h" #include "lib/event.h" -#include "nest/route.h" +#include "nest/iface.h" +#include "lib/settle.h" +#include "nest/rt.h" +#include "nest/limit.h" #include "conf/conf.h" struct iface; struct ifa; -struct rtable; struct rte; struct neighbor; struct rta; @@ -37,38 +39,20 @@ struct symbol; * Routing Protocol */ -enum protocol_class { - PROTOCOL_NONE, - PROTOCOL_BABEL, - PROTOCOL_BFD, - PROTOCOL_BGP, - PROTOCOL_DEVICE, - PROTOCOL_DIRECT, - PROTOCOL_KERNEL, - PROTOCOL_OSPF, - PROTOCOL_MRT, - PROTOCOL_PERF, - PROTOCOL_PIPE, - PROTOCOL_RADV, - PROTOCOL_RIP, - PROTOCOL_RPKI, - PROTOCOL_STATIC, - PROTOCOL__MAX -}; - -extern struct protocol *class_to_protocol[PROTOCOL__MAX]; struct protocol { node n; char *name; char *template; /* Template for automatic generation of names */ int name_counter; /* Counter for automatic name generation */ - enum protocol_class class; /* Machine readable protocol class */ uint preference; /* Default protocol preference */ uint channel_mask; /* Mask of accepted channel types (NB_*) */ uint proto_size; /* Size of protocol data structure */ uint config_size; /* Size of protocol config data structure */ + uint eattr_begin; /* First ID of registered eattrs */ + uint eattr_end; /* End of eattr id zone */ + void (*preconfig)(struct protocol *, struct config *); /* Just before configuring */ void (*postconfig)(struct proto_config *); /* After configuring each instance */ struct proto * (*init)(struct proto_config *); /* Create new instance */ @@ -76,10 +60,9 @@ struct protocol { void (*dump)(struct proto *); /* Debugging dump */ int (*start)(struct proto *); /* Start the instance */ int (*shutdown)(struct proto *); /* Stop the instance */ - void (*cleanup)(struct proto *); /* Called after shutdown when protocol became hungry/down */ + void (*cleanup)(struct proto *); /* Cleanup the instance right before tearing it all down */ void (*get_status)(struct proto *, byte *buf); /* Get instance status (for `show protocols' command) */ - void (*get_route_info)(struct rte *, byte *buf); /* Get route information (for `show route' command) */ - int (*get_attr)(const struct eattr *, byte *buf, int buflen); /* ASCIIfy dynamic attribute (returns GA_*) */ +// int (*get_attr)(const struct eattr *, byte *buf, int buflen); /* ASCIIfy dynamic attribute (returns GA_*) */ void (*show_proto_info)(struct proto *); /* Show protocol info (for `show protocols all' command) */ void (*copy_config)(struct proto_config *, struct proto_config *); /* Copy config from given protocol instance */ }; @@ -94,7 +77,6 @@ void protos_dump_all(void); #define GA_UNKNOWN 0 /* Attribute not recognized */ #define GA_NAME 1 /* Result = name */ #define GA_FULL 2 /* Result = both name and value */ -#define GA_HIDDEN 3 /* Attribute should not be printed */ /* * Known protocols @@ -120,9 +102,10 @@ struct proto_config { int class; /* SYM_PROTO or SYM_TEMPLATE */ u8 net_type; /* Protocol network type (NET_*), 0 for undefined */ u8 disabled; /* Protocol enabled/disabled by default */ - u8 vrf_set; /* Related VRF instance (below) is defined */ + u8 late_if_feed; /* Delay interface feed after channels are up */ u32 debug, mrtdump; /* Debugging bitfields, both use D_* constants */ u32 router_id; /* Protocol specific router ID */ + uint loop_order; /* Launch a birdloop on this locking level; use DOMAIN_ORDER(the_bird) for mainloop */ list channels; /* List of channel configs (struct channel_config) */ struct iface *vrf; /* Related VRF instance, NULL if global */ @@ -133,31 +116,6 @@ struct proto_config { }; /* Protocol statistics */ -struct proto_stats { - /* Import - from protocol to core */ - u32 imp_routes; /* Number of routes successfully imported to the (adjacent) routing table */ - u32 filt_routes; /* Number of routes rejected in import filter but kept in the routing table */ - u32 pref_routes; /* Number of routes selected as best in the (adjacent) routing table */ - u32 imp_updates_received; /* Number of route updates received */ - u32 imp_updates_invalid; /* Number of route updates rejected as invalid */ - u32 imp_updates_filtered; /* Number of route updates rejected by filters */ - u32 imp_updates_ignored; /* Number of route updates rejected as already in route table */ - u32 imp_updates_accepted; /* Number of route updates accepted and imported */ - u32 imp_withdraws_received; /* Number of route withdraws received */ - u32 imp_withdraws_invalid; /* Number of route withdraws rejected as invalid */ - u32 imp_withdraws_ignored; /* Number of route withdraws rejected as already not in route table */ - u32 imp_withdraws_accepted; /* Number of route withdraws accepted and processed */ - - /* Export - from core to protocol */ - u32 exp_routes; /* Number of routes successfully exported to the protocol */ - u32 exp_updates_received; /* Number of route updates received */ - u32 exp_updates_rejected; /* Number of route updates rejected by protocol */ - u32 exp_updates_filtered; /* Number of route updates rejected by filters */ - u32 exp_updates_accepted; /* Number of route updates accepted and exported */ - u32 exp_withdraws_received; /* Number of route withdraws received */ - u32 exp_withdraws_accepted; /* Number of route withdraws accepted and processed */ -}; - struct proto { node n; /* Node in global proto_list */ struct protocol *proto; /* Protocol */ @@ -165,22 +123,27 @@ struct proto { struct proto_config *cf_new; /* Configuration we want to switch to after shutdown (NULL=delete) */ pool *pool; /* Pool containing local objects */ event *event; /* Protocol event */ + timer *restart_timer; /* Timer to restart the protocol from limits */ + event *restart_event; /* Event to restart/shutdown the protocol from limits */ + struct birdloop *loop; /* BIRDloop running this protocol */ list channels; /* List of channels to rtables (struct channel) */ struct channel *main_channel; /* Primary channel */ struct rte_src *main_source; /* Primary route source */ + struct rte_owner sources; /* Route source owner structure */ struct iface *vrf; /* Related VRF instance, NULL if global */ + TLIST_LIST(proto_neigh) neighbors; /* List of neighbor structures */ + struct iface_subscription iface_sub; /* Interface notification subscription */ - const char *name; /* Name of this instance (== cf->name) */ + const char *name; /* Name of this instance (== cf->name) */ u32 debug; /* Debugging flags */ u32 mrtdump; /* MRTDump flags */ uint active_channels; /* Number of active channels */ + uint active_loops; /* Number of active IO loops */ byte net_type; /* Protocol network type (NET_*), 0 for undefined */ byte disabled; /* Manually disabled */ - byte vrf_set; /* Related VRF instance (above) is defined */ byte proto_state; /* Protocol state machine (PS_*, see below) */ byte active; /* From PS_START to cleanup after PS_STOP */ - byte do_start; /* Start actions are scheduled */ byte do_stop; /* Stop actions are scheduled */ byte reconfiguring; /* We're shutting down due to reconfiguration */ byte gr_recovery; /* Protocol should participate in graceful restart recovery */ @@ -210,10 +173,7 @@ struct proto { * feed_end Notify channel about finish of route feeding. */ - void (*if_notify)(struct proto *, unsigned flags, struct iface *i); - void (*ifa_notify)(struct proto *, unsigned flags, struct ifa *a); - void (*rt_notify)(struct proto *, struct channel *, struct network *net, struct rte *new, struct rte *old); - void (*neigh_notify)(struct neighbor *neigh); + void (*rt_notify)(struct proto *, struct channel *, const net_addr *net, struct rte *new, const struct rte *old); int (*preexport)(struct channel *, struct rte *rt); void (*reload_routes)(struct channel *); void (*feed_begin)(struct channel *, int initial); @@ -223,20 +183,16 @@ struct proto { * Routing entry hooks (called only for routes belonging to this protocol): * * rte_recalculate Called at the beginning of the best route selection - * rte_better Compare two rte's and decide which one is better (1=first, 0=second). - * rte_same Compare two rte's and decide whether they are identical (1=yes, 0=no). * rte_mergable Compare two rte's and decide whether they could be merged (1=yes, 0=no). * rte_insert Called whenever a rte is inserted to a routing table. * rte_remove Called whenever a rte is removed from the routing table. */ - int (*rte_recalculate)(struct rtable *, struct network *, struct rte *, struct rte *, struct rte *); - int (*rte_better)(struct rte *, struct rte *); + int (*rte_recalculate)(struct rtable_private *, struct network *, struct rte *, struct rte *, struct rte *); int (*rte_mergable)(struct rte *, struct rte *); - struct rte * (*rte_modify)(struct rte *, struct linpool *); void (*rte_insert)(struct network *, struct rte *); void (*rte_remove)(struct network *, struct rte *); - u32 (*rte_igp_metric)(struct rte *); + u32 (*rte_igp_metric)(const struct rte *); /* Hic sunt protocol-specific data */ }; @@ -276,7 +232,16 @@ void channel_graceful_restart_unlock(struct channel *c); #define DEFAULT_GR_WAIT 240 -void channel_show_limit(struct channel_limit *l, const char *dsc); +static inline event_list *proto_event_list(struct proto *p) +{ return p->loop == &main_birdloop ? &global_event_list : birdloop_event_list(p->loop); } + +static inline event_list *proto_work_list(struct proto *p) +{ return p->loop == &main_birdloop ? &global_work_list : birdloop_event_list(p->loop); } + +static inline void proto_send_event(struct proto *p, event *e) +{ ev_send(proto_event_list(p), e); } + +void channel_show_limit(struct limit *l, const char *dsc, int active, int action); void channel_show_info(struct channel *c); void channel_cmd_debug(struct channel *c, uint mask); @@ -294,6 +259,17 @@ struct proto *proto_iterate_named(struct symbol *sym, struct protocol *proto, st #define PROTO_WALK_CMD(sym,pr,p) for(struct proto *p = NULL; p = proto_iterate_named(sym, pr, p); ) +#define PROTO_ENTER_FROM_MAIN(p) ({ \ + ASSERT_DIE(birdloop_inside(&main_birdloop)); \ + struct birdloop *_loop = (p)->loop; \ + if (_loop != &main_birdloop) birdloop_enter(_loop); \ + _loop; \ + }) + +#define PROTO_LEAVE_FROM_MAIN(loop) ({ if (loop != &main_birdloop) birdloop_leave(loop); }) + +#define PROTO_LOCKED_FROM_MAIN(p) for (struct birdloop *_proto_loop = PROTO_ENTER_FROM_MAIN(p); _proto_loop; PROTO_LEAVE_FROM_MAIN(_proto_loop), (_proto_loop = NULL)) + #define CMD_RELOAD 0 #define CMD_RELOAD_IN 1 @@ -383,6 +359,14 @@ void proto_notify_state(struct proto *p, unsigned state); * as a result of received ROUTE-REFRESH request). */ +static inline int proto_is_inactive(struct proto *p) +{ + return (p->active_channels == 0) + && (p->active_loops == 0) + && (p->sources.uc == 0) + && EMPTY_TLIST(proto_neigh, &p->neighbors) + ; +} /* @@ -431,19 +415,30 @@ extern struct proto_config *cf_dev_proto; #define PLA_RESTART 4 /* Force protocol restart */ #define PLA_DISABLE 5 /* Shutdown and disable protocol */ -#define PLS_INITIAL 0 /* Initial limit state after protocol start */ -#define PLS_ACTIVE 1 /* Limit was hit */ -#define PLS_BLOCKED 2 /* Limit is active and blocking new routes */ - struct channel_limit { u32 limit; /* Maximum number of prefixes */ u8 action; /* Action to take (PLA_*) */ - u8 state; /* State of limit (PLS_*) */ }; -void channel_notify_limit(struct channel *c, struct channel_limit *l, int dir, u32 rt_count); +struct channel_limit_data { + struct channel *c; + int dir; +}; + +#define CLP__RX(_c) (&(_c)->rx_limit) +#define CLP__IN(_c) (&(_c)->in_limit) +#define CLP__OUT(_c) (&(_c)->out_limit) +#if 0 +#define CHANNEL_LIMIT_LOG(_c, _dir, _op) log(L_TRACE "%s.%s: %s limit %s %u", (_c)->proto->name, (_c)->name, #_dir, _op, (CLP__##_dir(_c))->count) +#else +#define CHANNEL_LIMIT_LOG(_c, _dir, _op) +#endif + +#define CHANNEL_LIMIT_PUSH(_c, _dir) ({ CHANNEL_LIMIT_LOG(_c, _dir, "push from"); struct channel_limit_data cld = { .c = (_c), .dir = PLD_##_dir }; limit_push(CLP__##_dir(_c), &cld); }) +#define CHANNEL_LIMIT_POP(_c, _dir) ({ limit_pop(CLP__##_dir(_c)); CHANNEL_LIMIT_LOG(_c, _dir, "pop to"); }) + /* * Channels */ @@ -485,40 +480,71 @@ struct channel_config { struct proto_config *parent; /* Where channel is defined (proto or template) */ struct rtable_config *table; /* Table we're attached to */ const struct filter *in_filter, *out_filter; /* Attached filters */ + const net_addr *out_subprefix; /* Export only subprefixes of this net */ + struct channel_limit rx_limit; /* Limit for receiving routes from protocol - (relevant when in_keep_filtered is active) */ + (relevant when in_keep & RIK_REJECTED) */ struct channel_limit in_limit; /* Limit for importing routes from protocol */ struct channel_limit out_limit; /* Limit for exporting routes to protocol */ + struct settle_config roa_settle; /* Settle times for ROA-induced reload */ + u8 net_type; /* Routing table network type (NET_*), 0 for undefined */ u8 ra_mode; /* Mode of received route advertisements (RA_*) */ u16 preference; /* Default route preference */ u32 debug; /* Debugging flags (D_*) */ u8 copy; /* Value from channel_config_get() is new (0) or from template (1) */ u8 merge_limit; /* Maximal number of nexthops for RA_MERGED */ - u8 in_keep_filtered; /* Routes rejected in import filter are kept */ + u8 in_keep; /* Which states of routes to keep (RIK_*) */ u8 rpki_reload; /* RPKI changes trigger channel reload */ }; struct channel { node n; /* Node in proto->channels */ - node table_node; /* Node in table->channels */ const char *name; /* Channel name (may be NULL) */ const struct channel_class *channel; struct proto *proto; - struct rtable *table; + rtable *table; const struct filter *in_filter; /* Input filter */ const struct filter *out_filter; /* Output filter */ - struct bmap export_map; /* Keeps track which routes passed export filter */ - struct channel_limit rx_limit; /* Receive limit (for in_keep_filtered) */ - struct channel_limit in_limit; /* Input limit */ - struct channel_limit out_limit; /* Output limit */ - - struct event *feed_event; /* Event responsible for feeding */ - struct fib_iterator feed_fit; /* Routing table iterator used during feeding */ - struct proto_stats stats; /* Per-channel protocol statistics */ + const net_addr *out_subprefix; /* Export only subprefixes of this net */ + struct bmap export_map; /* Keeps track which routes were really exported */ + struct bmap export_reject_map; /* Keeps track which routes were rejected by export filter */ + + struct limit rx_limit; /* Receive limit (for in_keep & RIK_REJECTED) */ + struct limit in_limit; /* Input limit */ + struct limit out_limit; /* Output limit */ + + struct settle_config roa_settle; /* Settle times for ROA-induced reload */ + + u8 limit_actions[PLD_MAX]; /* Limit actions enum */ + u8 limit_active; /* Flags for active limits */ + + struct channel_import_stats { + /* Import - from protocol to core */ + u32 updates_received; /* Number of route updates received */ + u32 updates_invalid; /* Number of route updates rejected as invalid */ + u32 updates_filtered; /* Number of route updates rejected by filters */ + u32 updates_limited_rx; /* Number of route updates exceeding the rx_limit */ + u32 updates_limited_in; /* Number of route updates exceeding the in_limit */ + u32 withdraws_received; /* Number of route withdraws received */ + u32 withdraws_invalid; /* Number of route withdraws rejected as invalid */ + } import_stats; + + struct channel_export_stats { + /* Export - from core to protocol */ + u32 updates_rejected; /* Number of route updates rejected by protocol */ + u32 updates_filtered; /* Number of route updates rejected by filters */ + u32 updates_accepted; /* Number of route updates accepted and exported */ + u32 updates_limited; /* Number of route updates exceeding the out_limit */ + u32 withdraws_accepted; /* Number of route withdraws accepted and processed */ + } export_stats; + + struct rt_import_request in_req; /* Table import connection */ + struct rt_export_request out_req; /* Table export connection */ + u32 refeed_count; /* Number of routes exported during refeed regardless of out_limit */ u8 net_type; /* Routing table network type (NET_*), 0 for undefined */ @@ -526,36 +552,31 @@ struct channel { u16 preference; /* Default route preference */ u32 debug; /* Debugging flags (D_*) */ u8 merge_limit; /* Maximal number of nexthops for RA_MERGED */ - u8 in_keep_filtered; /* Routes rejected in import filter are kept */ + u8 in_keep; /* Which states of routes to keep (RIK_*) */ u8 disabled; u8 stale; /* Used in reconfiguration */ u8 channel_state; - u8 export_state; /* Route export state (ES_*, see below) */ - u8 feed_active; - u8 flush_active; - u8 refeeding; /* We are refeeding (valid only if export_state == ES_FEEDING) */ + u8 refeeding; /* Refeeding the channel. */ u8 reloadable; /* Hook reload_routes() is allowed on the channel */ u8 gr_lock; /* Graceful restart mechanism should wait for this channel */ u8 gr_wait; /* Route export to channel is postponed until graceful restart */ btime last_state_change; /* Time of last state transition */ - struct rtable *in_table; /* Internal table for received routes */ - struct event *reload_event; /* Event responsible for reloading from in_table */ - struct fib_iterator reload_fit; /* FIB iterator in in_table used during reloading */ - struct rte *reload_next_rte; /* Route iterator in in_table used during reloading */ - u8 reload_active; /* Iterator reload_fit is linked */ + struct rt_export_request reload_req; /* Feeder for import reload */ u8 reload_pending; /* Reloading and another reload is scheduled */ u8 refeed_pending; /* Refeeding and another refeed is scheduled */ u8 rpki_reload; /* RPKI changes trigger channel reload */ - struct rtable *out_table; /* Internal table for exported routes */ + struct rt_exporter *out_table; /* Internal table for exported routes */ - list roa_subscriptions; /* List of active ROA table subscriptions based on filters roa_check() */ + list roa_subscriptions; /* List of active ROA table subscriptions based on filters' roa_check() calls */ }; +#define RIK_REJECTED 1 /* Routes rejected in import filter are kept */ +#define RIK_PREFILTER (2 | RIK_REJECTED) /* All routes' attribute state before import filter is kept */ /* * Channel states @@ -582,70 +603,54 @@ struct channel { * restricted by that and is on volition of the protocol. Generally, channels * are opened in protocols' start() hooks when going to PS_UP. * - * CS_FLUSHING - The transitional state between initialized channel and closed + * CS_STOP - The transitional state between initialized channel and closed * channel. The channel is still initialized, but no route exchange is allowed. * Instead, the associated table is running flush loop to remove routes imported * through the channel. After that, the channel changes state to CS_DOWN and * is detached from the table (the table is unlocked and the channel is unlinked - * from it). Unlike other states, the CS_FLUSHING state is not explicitly + * from it). Unlike other states, the CS_STOP state is not explicitly * entered or left by the protocol. A protocol may request to close a channel * (by calling channel_close()), which causes the channel to change state to - * CS_FLUSHING and later to CS_DOWN. Also note that channels are closed + * CS_STOP and later to CS_DOWN. Also note that channels are closed * automatically by the core when the protocol is going down. * + * CS_PAUSE - Almost the same as CS_STOP, just the table import is kept and + * the table export is stopped before transitioning to CS_START. + * * Allowed transitions: * * CS_DOWN -> CS_START / CS_UP - * CS_START -> CS_UP / CS_FLUSHING - * CS_UP -> CS_START / CS_FLUSHING - * CS_FLUSHING -> CS_DOWN (automatic) + * CS_START -> CS_UP / CS_STOP + * CS_UP -> CS_PAUSE / CS_STOP + * CS_PAUSE -> CS_START (automatic) + * CS_STOP -> CS_DOWN (automatic) */ #define CS_DOWN 0 #define CS_START 1 #define CS_UP 2 -#define CS_FLUSHING 3 - -#define ES_DOWN 0 -#define ES_FEEDING 1 -#define ES_READY 2 - +#define CS_STOP 3 +#define CS_PAUSE 4 struct channel_config *proto_cf_find_channel(struct proto_config *p, uint net_type); static inline struct channel_config *proto_cf_main_channel(struct proto_config *pc) { return proto_cf_find_channel(pc, pc->net_type); } -struct channel *proto_find_channel_by_table(struct proto *p, struct rtable *t); +struct channel *proto_find_channel_by_table(struct proto *p, rtable *t); struct channel *proto_find_channel_by_name(struct proto *p, const char *n); struct channel *proto_add_channel(struct proto *p, struct channel_config *cf); int proto_configure_channel(struct proto *p, struct channel **c, struct channel_config *cf); void channel_set_state(struct channel *c, uint state); -void channel_setup_in_table(struct channel *c); -void channel_setup_out_table(struct channel *c); void channel_schedule_reload(struct channel *c); static inline void channel_init(struct channel *c) { channel_set_state(c, CS_START); } static inline void channel_open(struct channel *c) { channel_set_state(c, CS_UP); } -static inline void channel_close(struct channel *c) { channel_set_state(c, CS_FLUSHING); } +static inline void channel_close(struct channel *c) { channel_set_state(c, CS_STOP); } void channel_request_feeding(struct channel *c); void *channel_config_new(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto); void *channel_config_get(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto); int channel_reconfigure(struct channel *c, struct channel_config *cf); - -/* Moved from route.h to avoid dependency conflicts */ -static inline void rte_update(struct proto *p, const net_addr *n, rte *new) { rte_update2(p->main_channel, n, new, p->main_source); } - -static inline void -rte_update3(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) -{ - if (c->in_table && !rte_update_in(c, n, new, src)) - return; - - rte_update2(c, n, new, src); -} - - #endif diff --git a/nest/route.h b/nest/route.h deleted file mode 100644 index 7aec7117..00000000 --- a/nest/route.h +++ /dev/null @@ -1,765 +0,0 @@ -/* - * BIRD Internet Routing Daemon -- Routing Table - * - * (c) 1998--2000 Martin Mares <mj@ucw.cz> - * - * Can be freely distributed and used under the terms of the GNU GPL. - */ - -#ifndef _BIRD_ROUTE_H_ -#define _BIRD_ROUTE_H_ - -#include "lib/lists.h" -#include "lib/bitmap.h" -#include "lib/resource.h" -#include "lib/net.h" - -struct ea_list; -struct protocol; -struct proto; -struct rte_src; -struct symbol; -struct timer; -struct fib; -struct filter; -struct f_trie; -struct f_trie_walk_state; -struct cli; - -/* - * Generic data structure for storing network prefixes. Also used - * for the master routing table. Currently implemented as a hash - * table. - * - * Available operations: - * - insertion of new entry - * - deletion of entry - * - searching for entry by network prefix - * - asynchronous retrieval of fib contents - */ - -struct fib_node { - struct fib_node *next; /* Next in hash chain */ - struct fib_iterator *readers; /* List of readers of this node */ - net_addr addr[0]; -}; - -struct fib_iterator { /* See lib/slists.h for an explanation */ - struct fib_iterator *prev, *next; /* Must be synced with struct fib_node! */ - byte efef; /* 0xff to distinguish between iterator and node */ - byte pad[3]; - struct fib_node *node; /* Or NULL if freshly merged */ - uint hash; -}; - -typedef void (*fib_init_fn)(struct fib *, void *); - -struct fib { - pool *fib_pool; /* Pool holding all our data */ - slab *fib_slab; /* Slab holding all fib nodes */ - struct fib_node **hash_table; /* Node hash table */ - uint hash_size; /* Number of hash table entries (a power of two) */ - uint hash_order; /* Binary logarithm of hash_size */ - uint hash_shift; /* 32 - hash_order */ - uint addr_type; /* Type of address data stored in fib (NET_*) */ - uint node_size; /* FIB node size, 0 for nonuniform */ - uint node_offset; /* Offset of fib_node struct inside of user data */ - uint entries; /* Number of entries */ - uint entries_min, entries_max; /* Entry count limits (else start rehashing) */ - fib_init_fn init; /* Constructor */ -}; - -static inline void * fib_node_to_user(struct fib *f, struct fib_node *e) -{ return e ? (void *) ((char *) e - f->node_offset) : NULL; } - -static inline struct fib_node * fib_user_to_node(struct fib *f, void *e) -{ return e ? (void *) ((char *) e + f->node_offset) : NULL; } - -void fib_init(struct fib *f, pool *p, uint addr_type, uint node_size, uint node_offset, uint hash_order, fib_init_fn init); -void *fib_find(struct fib *, const net_addr *); /* Find or return NULL if doesn't exist */ -void *fib_get_chain(struct fib *f, const net_addr *a); /* Find first node in linked list from hash table */ -void *fib_get(struct fib *, const net_addr *); /* Find or create new if nonexistent */ -void *fib_route(struct fib *, const net_addr *); /* Longest-match routing lookup */ -void fib_delete(struct fib *, void *); /* Remove fib entry */ -void fib_free(struct fib *); /* Destroy the fib */ -void fib_check(struct fib *); /* Consistency check for debugging */ - -void fit_init(struct fib_iterator *, struct fib *); /* Internal functions, don't call */ -struct fib_node *fit_get(struct fib *, struct fib_iterator *); -void fit_put(struct fib_iterator *, struct fib_node *); -void fit_put_next(struct fib *f, struct fib_iterator *i, struct fib_node *n, uint hpos); -void fit_put_end(struct fib_iterator *i); -void fit_copy(struct fib *f, struct fib_iterator *dst, struct fib_iterator *src); - - -#define FIB_WALK(fib, type, z) do { \ - struct fib_node *fn_, **ff_ = (fib)->hash_table; \ - uint count_ = (fib)->hash_size; \ - type *z; \ - while (count_--) \ - for (fn_ = *ff_++; z = fib_node_to_user(fib, fn_); fn_=fn_->next) - -#define FIB_WALK_END } while (0) - -#define FIB_ITERATE_INIT(it, fib) fit_init(it, fib) - -#define FIB_ITERATE_START(fib, it, type, z) do { \ - struct fib_node *fn_ = fit_get(fib, it); \ - uint count_ = (fib)->hash_size; \ - uint hpos_ = (it)->hash; \ - type *z; \ - for(;;) { \ - if (!fn_) \ - { \ - if (++hpos_ >= count_) \ - break; \ - fn_ = (fib)->hash_table[hpos_]; \ - continue; \ - } \ - z = fib_node_to_user(fib, fn_); - -#define FIB_ITERATE_END fn_ = fn_->next; } } while(0) - -#define FIB_ITERATE_PUT(it) fit_put(it, fn_) - -#define FIB_ITERATE_PUT_NEXT(it, fib) fit_put_next(fib, it, fn_, hpos_) - -#define FIB_ITERATE_PUT_END(it) fit_put_end(it) - -#define FIB_ITERATE_UNLINK(it, fib) fit_get(fib, it) - -#define FIB_ITERATE_COPY(dst, src, fib) fit_copy(fib, dst, src) - - -/* - * Master Routing Tables. Generally speaking, each of them contains a FIB - * with each entry pointing to a list of route entries representing routes - * to given network (with the selected one at the head). - * - * Each of the RTE's contains variable data (the preference and protocol-dependent - * metrics) and a pointer to a route attribute block common for many routes). - * - * It's guaranteed that there is at most one RTE for every (prefix,proto) pair. - */ - -struct rtable_config { - node n; - char *name; - struct rtable *table; - struct proto_config *krt_attached; /* Kernel syncer attached to this table */ - uint addr_type; /* Type of address data stored in table (NET_*) */ - uint gc_threshold; /* Maximum number of operations before GC is run */ - uint gc_period; /* Approximate time between two consecutive GC runs */ - byte sorted; /* Routes of network are sorted according to rte_better() */ - byte internal; /* Internal table of a protocol */ - byte trie_used; /* Rtable has attached trie */ - btime min_settle_time; /* Minimum settle time for notifications */ - btime max_settle_time; /* Maximum settle time for notifications */ -}; - -typedef struct rtable { - resource r; - node n; /* Node in list of all tables */ - pool *rp; /* Resource pool to allocate everything from, including itself */ - struct fib fib; - struct f_trie *trie; /* Trie of prefixes defined in fib */ - char *name; /* Name of this table */ - list channels; /* List of attached channels (struct channel) */ - uint addr_type; /* Type of address data stored in table (NET_*) */ - int pipe_busy; /* Pipe loop detection */ - int use_count; /* Number of protocols using this table */ - u32 rt_count; /* Number of routes in the table */ - - byte internal; /* Internal table of a protocol */ - - struct hmap id_map; - struct hostcache *hostcache; - struct rtable_config *config; /* Configuration of this table */ - struct config *deleted; /* Table doesn't exist in current configuration, - * delete as soon as use_count becomes 0 and remove - * obstacle from this routing table. - */ - struct event *rt_event; /* Routing table event */ - struct timer *prune_timer; /* Timer for periodic pruning / GC */ - btime last_rt_change; /* Last time when route changed */ - btime base_settle_time; /* Start time of rtable settling interval */ - btime gc_time; /* Time of last GC */ - uint gc_counter; /* Number of operations since last GC */ - byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ - byte prune_trie; /* Prune prefix trie during next table prune */ - byte hcu_scheduled; /* Hostcache update is scheduled */ - byte nhu_state; /* Next Hop Update state */ - struct fib_iterator prune_fit; /* Rtable prune FIB iterator */ - struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ - struct f_trie *trie_new; /* New prefix trie defined during pruning */ - struct f_trie *trie_old; /* Old prefix trie waiting to be freed */ - u32 trie_lock_count; /* Prefix trie locked by walks */ - u32 trie_old_lock_count; /* Old prefix trie locked by walks */ - - list subscribers; /* Subscribers for notifications */ - struct timer *settle_timer; /* Settle time for notifications */ - list flowspec_links; /* List of flowspec links, src for NET_IPx and dst for NET_FLOWx */ - struct f_trie *flowspec_trie; /* Trie for evaluation of flowspec notifications */ -} rtable; - -struct rt_subscription { - node n; - rtable *tab; - void (*hook)(struct rt_subscription *b); - void *data; -}; - -struct rt_flowspec_link { - node n; - rtable *src; - rtable *dst; - u32 uc; -}; - -#define NHU_CLEAN 0 -#define NHU_SCHEDULED 1 -#define NHU_RUNNING 2 -#define NHU_DIRTY 3 - -typedef struct network { - struct rte *routes; /* Available routes for this network */ - struct fib_node n; /* FIB flags reserved for kernel syncer */ -} net; - -struct hostcache { - slab *slab; /* Slab holding all hostentries */ - struct hostentry **hash_table; /* Hash table for hostentries */ - unsigned hash_order, hash_shift; - unsigned hash_max, hash_min; - unsigned hash_items; - linpool *lp; /* Linpool for trie */ - struct f_trie *trie; /* Trie of prefixes that might affect hostentries */ - list hostentries; /* List of all hostentries */ - byte update_hostcache; -}; - -struct hostentry { - node ln; - ip_addr addr; /* IP address of host, part of key */ - ip_addr link; /* (link-local) IP address of host, used as gw - if host is directly attached */ - struct rtable *tab; /* Dependent table, part of key */ - struct hostentry *next; /* Next in hash chain */ - unsigned hash_key; /* Hash key */ - unsigned uc; /* Use count */ - struct rta *src; /* Source rta entry */ - byte dest; /* Chosen route destination type (RTD_...) */ - byte nexthop_linkable; /* Nexthop list is completely non-device */ - u32 igp_metric; /* Chosen route IGP metric */ -}; - -typedef struct rte { - struct rte *next; - net *net; /* Network this RTE belongs to */ - struct rte_src *src; /* Route source that created the route */ - struct channel *sender; /* Channel used to send the route to the routing table */ - struct rta *attrs; /* Attributes of this route */ - u32 id; /* Table specific route id */ - byte flags; /* Flags (REF_...) */ - byte pflags; /* Protocol-specific flags */ - btime lastmod; /* Last modified */ -} rte; - -#define REF_COW 1 /* Copy this rte on write */ -#define REF_FILTERED 2 /* Route is rejected by import filter */ -#define REF_STALE 4 /* Route is stale in a refresh cycle */ -#define REF_DISCARD 8 /* Route is scheduled for discard */ -#define REF_MODIFY 16 /* Route is scheduled for modify */ - -/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ -static inline int rte_is_valid(rte *r) { return r && !(r->flags & REF_FILTERED); } - -/* Route just has REF_FILTERED flag */ -static inline int rte_is_filtered(rte *r) { return !!(r->flags & REF_FILTERED); } - - -/* Types of route announcement, also used as flags */ -#define RA_UNDEF 0 /* Undefined RA type */ -#define RA_OPTIMAL 1 /* Announcement of optimal route change */ -#define RA_ACCEPTED 2 /* Announcement of first accepted route */ -#define RA_ANY 3 /* Announcement of any route change */ -#define RA_MERGED 4 /* Announcement of optimal route merged with next ones */ - -/* Return value of preexport() callback */ -#define RIC_ACCEPT 1 /* Accepted by protocol */ -#define RIC_PROCESS 0 /* Process it through import filter */ -#define RIC_REJECT -1 /* Rejected by protocol */ -#define RIC_DROP -2 /* Silently dropped by protocol */ - -extern list routing_tables; -struct config; - -void rt_init(void); -void rt_preconfig(struct config *); -void rt_postconfig(struct config *); -void rt_commit(struct config *new, struct config *old); -void rt_lock_table(rtable *); -void rt_unlock_table(rtable *); -struct f_trie * rt_lock_trie(rtable *tab); -void rt_unlock_trie(rtable *tab, struct f_trie *trie); -void rt_subscribe(rtable *tab, struct rt_subscription *s); -void rt_unsubscribe(struct rt_subscription *s); -void rt_flowspec_link(rtable *src, rtable *dst); -void rt_flowspec_unlink(rtable *src, rtable *dst); -rtable *rt_setup(pool *, struct rtable_config *); -static inline void rt_shutdown(rtable *r) { rfree(r->rp); } - -static inline net *net_find(rtable *tab, const net_addr *addr) { return (net *) fib_find(&tab->fib, addr); } -static inline net *net_find_valid(rtable *tab, const net_addr *addr) -{ net *n = net_find(tab, addr); return (n && rte_is_valid(n->routes)) ? n : NULL; } -static inline net *net_get(rtable *tab, const net_addr *addr) { return (net *) fib_get(&tab->fib, addr); } -net *net_get(rtable *tab, const net_addr *addr); -net *net_route(rtable *tab, const net_addr *n); -int net_roa_check(rtable *tab, const net_addr *n, u32 asn); -rte *rte_find(net *net, struct rte_src *src); -rte *rte_get_temp(struct rta *, struct rte_src *src); -void rte_update2(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); -/* rte_update() moved to protocol.h to avoid dependency conflicts */ -int rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter); -rte *rt_export_merged(struct channel *c, net *net, rte **rt_free, linpool *pool, int silent); -void rt_refresh_begin(rtable *t, struct channel *c); -void rt_refresh_end(rtable *t, struct channel *c); -void rt_modify_stale(rtable *t, struct channel *c); -void rt_schedule_prune(rtable *t); -void rte_dump(rte *); -void rte_free(rte *); -rte *rte_do_cow(rte *); -static inline rte * rte_cow(rte *r) { return (r->flags & REF_COW) ? rte_do_cow(r) : r; } -rte *rte_cow_rta(rte *r, linpool *lp); -void rt_dump(rtable *); -void rt_dump_all(void); -int rt_feed_channel(struct channel *c); -void rt_feed_channel_abort(struct channel *c); -int rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); -int rt_reload_channel(struct channel *c); -void rt_reload_channel_abort(struct channel *c); -void rt_prune_sync(rtable *t, int all); -int rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed); -struct rtable_config *rt_new_table(struct symbol *s, uint addr_type); - -static inline int rt_is_ip(rtable *tab) -{ return (tab->addr_type == NET_IP4) || (tab->addr_type == NET_IP6); } - -static inline int rt_is_vpn(rtable *tab) -{ return (tab->addr_type == NET_VPN4) || (tab->addr_type == NET_VPN6); } - -static inline int rt_is_roa(rtable *tab) -{ return (tab->addr_type == NET_ROA4) || (tab->addr_type == NET_ROA6); } - -static inline int rt_is_flow(rtable *tab) -{ return (tab->addr_type == NET_FLOW4) || (tab->addr_type == NET_FLOW6); } - - -/* Default limit for ECMP next hops, defined in sysdep code */ -extern const int rt_default_ecmp; - -struct rt_show_data_rtable { - node n; - rtable *table; - struct channel *export_channel; -}; - -struct rt_show_data { - net_addr *addr; - list tables; - struct rt_show_data_rtable *tab; /* Iterator over table list */ - struct rt_show_data_rtable *last_table; /* Last table in output */ - struct fib_iterator fit; /* Iterator over networks in table */ - struct f_trie_walk_state *walk_state; /* Iterator over networks in trie */ - struct f_trie *walk_lock; /* Locked trie for walking */ - int verbose, tables_defined_by; - const struct filter *filter; - struct proto *show_protocol; - struct proto *export_protocol; - struct channel *export_channel; - struct config *running_on_config; - struct krt_proto *kernel; - int export_mode, addr_mode, primary_only, filtered, stats; - - int table_open; /* Iteration (fit) is open */ - int trie_walk; /* Current table is iterated using trie */ - int net_counter, rt_counter, show_counter, table_counter; - int net_counter_last, rt_counter_last, show_counter_last; -}; - -void rt_show(struct rt_show_data *); -struct rt_show_data_rtable * rt_show_add_table(struct rt_show_data *d, rtable *t); - -/* Value of table definition mode in struct rt_show_data */ -#define RSD_TDB_DEFAULT 0 /* no table specified */ -#define RSD_TDB_INDIRECT 0 /* show route ... protocol P ... */ -#define RSD_TDB_ALL RSD_TDB_SET /* show route ... table all ... */ -#define RSD_TDB_DIRECT RSD_TDB_SET | RSD_TDB_NMN /* show route ... table X table Y ... */ - -#define RSD_TDB_SET 0x1 /* internal: show empty tables */ -#define RSD_TDB_NMN 0x2 /* internal: need matching net */ - -/* Value of addr_mode */ -#define RSD_ADDR_EQUAL 1 /* Exact query - show route <addr> */ -#define RSD_ADDR_FOR 2 /* Longest prefix match - show route for <addr> */ -#define RSD_ADDR_IN 3 /* Interval query - show route in <addr> */ - -/* Value of export_mode in struct rt_show_data */ -#define RSEM_NONE 0 /* Export mode not used */ -#define RSEM_PREEXPORT 1 /* Routes ready for export, before filtering */ -#define RSEM_EXPORT 2 /* Routes accepted by export filter */ -#define RSEM_NOEXPORT 3 /* Routes rejected by export filter */ -#define RSEM_EXPORTED 4 /* Routes marked in export map */ - -/* - * Route Attributes - * - * Beware: All standard BGP attributes must be represented here instead - * of making them local to the route. This is needed to ensure proper - * construction of BGP route attribute lists. - */ - -/* Nexthop structure */ -struct nexthop { - ip_addr gw; /* Next hop */ - struct iface *iface; /* Outgoing interface */ - struct nexthop *next; - byte flags; - byte weight; - byte labels_orig; /* Number of labels before hostentry was applied */ - byte labels; /* Number of all labels */ - u32 label[0]; -}; - -#define RNF_ONLINK 0x1 /* Gateway is onlink regardless of IP ranges */ - - -struct rte_src { - struct rte_src *next; /* Hash chain */ - struct proto *proto; /* Protocol the source is based on */ - u32 private_id; /* Private ID, assigned by the protocol */ - u32 global_id; /* Globally unique ID of the source */ - unsigned uc; /* Use count */ -}; - - -typedef struct rta { - struct rta *next, **pprev; /* Hash chain */ - u32 uc; /* Use count */ - u32 hash_key; /* Hash over important fields */ - struct ea_list *eattrs; /* Extended Attribute chain */ - struct hostentry *hostentry; /* Hostentry for recursive next-hops */ - ip_addr from; /* Advertising router */ - u32 igp_metric; /* IGP metric to next hop (for iBGP routes) */ - u16 cached:1; /* Are attributes cached? */ - u16 source:7; /* Route source (RTS_...) */ - u16 scope:4; /* Route scope (SCOPE_... -- see ip.h) */ - u16 dest:4; /* Route destination type (RTD_...) */ - word pref; - struct nexthop nh; /* Next hop */ -} rta; - -#define RTS_STATIC 1 /* Normal static route */ -#define RTS_INHERIT 2 /* Route inherited from kernel */ -#define RTS_DEVICE 3 /* Device route */ -#define RTS_STATIC_DEVICE 4 /* Static device route */ -#define RTS_REDIRECT 5 /* Learned via redirect */ -#define RTS_RIP 6 /* RIP route */ -#define RTS_OSPF 7 /* OSPF route */ -#define RTS_OSPF_IA 8 /* OSPF inter-area route */ -#define RTS_OSPF_EXT1 9 /* OSPF external route type 1 */ -#define RTS_OSPF_EXT2 10 /* OSPF external route type 2 */ -#define RTS_BGP 11 /* BGP route */ -#define RTS_PIPE 12 /* Inter-table wormhole */ -#define RTS_BABEL 13 /* Babel route */ -#define RTS_RPKI 14 /* Route Origin Authorization */ -#define RTS_PERF 15 /* Perf checker */ -#define RTS_MAX 16 - -#define RTD_NONE 0 /* Undefined next hop */ -#define RTD_UNICAST 1 /* Next hop is neighbor router */ -#define RTD_BLACKHOLE 2 /* Silently drop packets */ -#define RTD_UNREACHABLE 3 /* Reject as unreachable */ -#define RTD_PROHIBIT 4 /* Administratively prohibited */ -#define RTD_MAX 5 - -#define IGP_METRIC_UNKNOWN 0x80000000 /* Default igp_metric used when no other - protocol-specific metric is availabe */ - - -extern const char * rta_dest_names[RTD_MAX]; - -static inline const char *rta_dest_name(uint n) -{ return (n < RTD_MAX) ? rta_dest_names[n] : "???"; } - -/* Route has regular, reachable nexthop (i.e. not RTD_UNREACHABLE and like) */ -static inline int rte_is_reachable(rte *r) -{ return r->attrs->dest == RTD_UNICAST; } - - -/* - * Extended Route Attributes - */ - -typedef struct eattr { - word id; /* EA_CODE(PROTOCOL_..., protocol-dependent ID) */ - byte flags; /* Protocol-dependent flags */ - byte type:5; /* Attribute type */ - byte originated:1; /* The attribute has originated locally */ - byte fresh:1; /* An uncached attribute (e.g. modified in export filter) */ - byte undef:1; /* Explicitly undefined */ - union { - uintptr_t data; - const struct adata *ptr; /* Attribute data elsewhere */ - } u; -} eattr; - - -#define EA_CODE(proto,id) (((proto) << 8) | (id)) -#define EA_ID(ea) ((ea) & 0xff) -#define EA_PROTO(ea) ((ea) >> 8) -#define EA_CUSTOM(id) ((id) | EA_CUSTOM_BIT) -#define EA_IS_CUSTOM(ea) ((ea) & EA_CUSTOM_BIT) -#define EA_CUSTOM_ID(ea) ((ea) & ~EA_CUSTOM_BIT) - -const char *ea_custom_name(uint ea); - -#define EA_GEN_IGP_METRIC EA_CODE(PROTOCOL_NONE, 0) - -#define EA_CODE_MASK 0xffff -#define EA_CUSTOM_BIT 0x8000 -#define EA_ALLOW_UNDEF 0x10000 /* ea_find: allow EAF_TYPE_UNDEF */ -#define EA_BIT(n) ((n) << 24) /* Used in bitfield accessors */ -#define EA_BIT_GET(ea) ((ea) >> 24) - -#define EAF_TYPE_MASK 0x1f /* Mask with this to get type */ -#define EAF_TYPE_INT 0x01 /* 32-bit unsigned integer number */ -#define EAF_TYPE_OPAQUE 0x02 /* Opaque byte string (not filterable) */ -#define EAF_TYPE_IP_ADDRESS 0x04 /* IP address */ -#define EAF_TYPE_ROUTER_ID 0x05 /* Router ID (IPv4 address) */ -#define EAF_TYPE_AS_PATH 0x06 /* BGP AS path (encoding per RFC 1771:4.3) */ -#define EAF_TYPE_BITFIELD 0x09 /* 32-bit embedded bitfield */ -#define EAF_TYPE_INT_SET 0x0a /* Set of u32's (e.g., a community list) */ -#define EAF_TYPE_EC_SET 0x0e /* Set of pairs of u32's - ext. community list */ -#define EAF_TYPE_LC_SET 0x12 /* Set of triplets of u32's - large community list */ -#define EAF_TYPE_IFACE 0x16 /* Interface pointer stored in adata */ -#define EAF_EMBEDDED 0x01 /* Data stored in eattr.u.data (part of type spec) */ -#define EAF_VAR_LENGTH 0x02 /* Attribute length is variable (part of type spec) */ - -typedef struct adata { - uint length; /* Length of data */ - byte data[0]; -} adata; - -extern const adata null_adata; /* adata of length 0 */ - -static inline struct adata * -lp_alloc_adata(struct linpool *pool, uint len) -{ - struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len); - ad->length = len; - return ad; -} - -static inline int adata_same(const struct adata *a, const struct adata *b) -{ return (a->length == b->length && !memcmp(a->data, b->data, a->length)); } - - -typedef struct ea_list { - struct ea_list *next; /* In case we have an override list */ - byte flags; /* Flags: EALF_... */ - byte rfu; - word count; /* Number of attributes */ - eattr attrs[0]; /* Attribute definitions themselves */ -} ea_list; - -#define EALF_SORTED 1 /* Attributes are sorted by code */ -#define EALF_BISECT 2 /* Use interval bisection for searching */ -#define EALF_CACHED 4 /* Attributes belonging to cached rta */ - -struct rte_src *rt_find_source(struct proto *p, u32 id); -struct rte_src *rt_get_source(struct proto *p, u32 id); -static inline void rt_lock_source(struct rte_src *src) { src->uc++; } -static inline void rt_unlock_source(struct rte_src *src) { src->uc--; } -void rt_prune_sources(void); - -struct ea_walk_state { - ea_list *eattrs; /* Ccurrent ea_list, initially set by caller */ - eattr *ea; /* Current eattr, initially NULL */ - u32 visited[4]; /* Bitfield, limiting max to 128 */ -}; - -eattr *ea_find(ea_list *, unsigned ea); -eattr *ea_walk(struct ea_walk_state *s, uint id, uint max); -uintptr_t ea_get_int(ea_list *, unsigned ea, uintptr_t def); -void ea_dump(ea_list *); -void ea_sort(ea_list *); /* Sort entries in all sub-lists */ -unsigned ea_scan(ea_list *); /* How many bytes do we need for merged ea_list */ -void ea_merge(ea_list *from, ea_list *to); /* Merge sub-lists to allocated buffer */ -int ea_same(ea_list *x, ea_list *y); /* Test whether two ea_lists are identical */ -uint ea_hash(ea_list *e); /* Calculate 16-bit hash value */ -ea_list *ea_append(ea_list *to, ea_list *what); -void ea_format_bitfield(const struct eattr *a, byte *buf, int bufsize, const char **names, int min, int max); - -#define ea_normalize(ea) do { \ - if (ea->next) { \ - ea_list *t = alloca(ea_scan(ea)); \ - ea_merge(ea, t); \ - ea = t; \ - } \ - ea_sort(ea); \ - if (ea->count == 0) \ - ea = NULL; \ -} while(0) \ - -struct ea_one_attr_list { - ea_list l; - eattr a; -}; - -static inline eattr * -ea_set_attr(ea_list **to, struct linpool *pool, uint id, uint flags, uint type, uintptr_t val) -{ - struct ea_one_attr_list *ea = lp_alloc(pool, sizeof(*ea)); - *ea = (struct ea_one_attr_list) { - .l.flags = EALF_SORTED, - .l.count = 1, - .l.next = *to, - - .a.id = id, - .a.type = type, - .a.flags = flags, - }; - - if (type & EAF_EMBEDDED) - ea->a.u.data = val; - else - ea->a.u.ptr = (struct adata *) val; - - *to = &ea->l; - - return &ea->a; -} - -static inline void -ea_unset_attr(ea_list **to, struct linpool *pool, _Bool local, uint code) -{ - struct ea_one_attr_list *ea = lp_alloc(pool, sizeof(*ea)); - *ea = (struct ea_one_attr_list) { - .l.flags = EALF_SORTED, - .l.count = 1, - .l.next = *to, - .a.id = code, - .a.fresh = local, - .a.originated = local, - .a.undef = 1, - }; - - *to = &ea->l; -} - -static inline void -ea_set_attr_u32(ea_list **to, struct linpool *pool, uint id, uint flags, uint type, u32 val) -{ ea_set_attr(to, pool, id, flags, type, (uintptr_t) val); } - -static inline void -ea_set_attr_ptr(ea_list **to, struct linpool *pool, uint id, uint flags, uint type, struct adata *val) -{ ea_set_attr(to, pool, id, flags, type, (uintptr_t) val); } - -static inline void -ea_set_attr_data(ea_list **to, struct linpool *pool, uint id, uint flags, uint type, void *data, uint len) -{ - struct adata *a = lp_alloc_adata(pool, len); - memcpy(a->data, data, len); - ea_set_attr(to, pool, id, flags, type, (uintptr_t) a); -} - - -#define NEXTHOP_MAX_SIZE (sizeof(struct nexthop) + sizeof(u32)*MPLS_MAX_LABEL_STACK) - -static inline size_t nexthop_size(const struct nexthop *nh) -{ return sizeof(struct nexthop) + sizeof(u32)*nh->labels; } -int nexthop__same(struct nexthop *x, struct nexthop *y); /* Compare multipath nexthops */ -static inline int nexthop_same(struct nexthop *x, struct nexthop *y) -{ return (x == y) || nexthop__same(x, y); } -struct nexthop *nexthop_merge(struct nexthop *x, struct nexthop *y, int rx, int ry, int max, linpool *lp); -struct nexthop *nexthop_sort(struct nexthop *x); -static inline void nexthop_link(struct rta *a, struct nexthop *from) -{ memcpy(&a->nh, from, nexthop_size(from)); } -void nexthop_insert(struct nexthop **n, struct nexthop *y); -int nexthop_is_sorted(struct nexthop *x); - -void rta_init(void); -static inline size_t rta_size(const rta *a) { return sizeof(rta) + sizeof(u32)*a->nh.labels; } -#define RTA_MAX_SIZE (sizeof(rta) + sizeof(u32)*MPLS_MAX_LABEL_STACK) -rta *rta_lookup(rta *); /* Get rta equivalent to this one, uc++ */ -static inline int rta_is_cached(rta *r) { return r->cached; } -static inline rta *rta_clone(rta *r) { r->uc++; return r; } -void rta__free(rta *r); -static inline void rta_free(rta *r) { if (r && !--r->uc) rta__free(r); } -rta *rta_do_cow(rta *o, linpool *lp); -static inline rta * rta_cow(rta *r, linpool *lp) { return rta_is_cached(r) ? rta_do_cow(r, lp) : r; } -void rta_dump(rta *); -void rta_dump_all(void); -void rta_show(struct cli *, rta *); - -u32 rt_get_igp_metric(rte *rt); -struct hostentry * rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep); -void rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls); - -static inline void -rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr gw, ip_addr ll, mpls_label_stack *mls) -{ - rta_apply_hostentry(a, rt_get_hostentry(tab, gw, ll, dep), mls); -} - -/* - * rta_set_recursive_next_hop() acquires hostentry from hostcache and fills - * rta->hostentry field. New hostentry has zero use count. Cached rta locks its - * hostentry (increases its use count), uncached rta does not lock it. Hostentry - * with zero use count is removed asynchronously during host cache update, - * therefore it is safe to hold such hostentry temorarily. Hostentry holds a - * lock for a 'source' rta, mainly to share multipath nexthops. - * - * There is no need to hold a lock for hostentry->dep table, because that table - * contains routes responsible for that hostentry, and therefore is non-empty if - * given hostentry has non-zero use count. If the hostentry has zero use count, - * the entry is removed before dep is referenced. - * - * The protocol responsible for routes with recursive next hops should hold a - * lock for a 'source' table governing that routes (argument tab to - * rta_set_recursive_next_hop()), because its routes reference hostentries - * (through rta) related to the governing table. When all such routes are - * removed, rtas are immediately removed achieving zero uc. Then the 'source' - * table lock could be immediately released, although hostentries may still - * exist - they will be freed together with the 'source' table. - */ - -static inline void rt_lock_hostentry(struct hostentry *he) { if (he) he->uc++; } -static inline void rt_unlock_hostentry(struct hostentry *he) { if (he) he->uc--; } - -int rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior); - - -/* - * Default protocol preferences - */ - -#define DEF_PREF_DIRECT 240 /* Directly connected */ -#define DEF_PREF_STATIC 200 /* Static route */ -#define DEF_PREF_OSPF 150 /* OSPF intra-area, inter-area and type 1 external routes */ -#define DEF_PREF_BABEL 130 /* Babel */ -#define DEF_PREF_RIP 120 /* RIP */ -#define DEF_PREF_BGP 100 /* BGP */ -#define DEF_PREF_RPKI 100 /* RPKI */ -#define DEF_PREF_INHERITED 10 /* Routes inherited from other routing daemons */ - -/* - * Route Origin Authorization - */ - -#define ROA_UNKNOWN 0 -#define ROA_VALID 1 -#define ROA_INVALID 2 - -#endif diff --git a/nest/rt-attr.c b/nest/rt-attr.c index d793c72e..903926f6 100644 --- a/nest/rt-attr.c +++ b/nest/rt-attr.c @@ -45,11 +45,11 @@ */ #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "nest/cli.h" -#include "nest/attrs.h" +#include "lib/attrs.h" #include "lib/alloca.h" #include "lib/hash.h" #include "lib/idm.h" @@ -57,9 +57,25 @@ #include "lib/string.h" #include <stddef.h> +#include <stdlib.h> const adata null_adata; /* adata of length 0 */ +struct ea_class ea_gen_igp_metric = { + .name = "igp_metric", + .type = T_INT, +}; + +struct ea_class ea_gen_preference = { + .name = "preference", + .type = T_INT, +}; + +struct ea_class ea_gen_from = { + .name = "from", + .type = T_IP, +}; + const char * const rta_src_names[RTS_MAX] = { [RTS_STATIC] = "static", [RTS_INHERIT] = "inherit", @@ -77,6 +93,71 @@ const char * const rta_src_names[RTS_MAX] = { [RTS_RPKI] = "RPKI", }; +static void +ea_gen_source_format(const eattr *a, byte *buf, uint size) +{ + if ((a->u.data >= RTS_MAX) || !rta_src_names[a->u.data]) + bsnprintf(buf, size, "unknown"); + else + bsnprintf(buf, size, "%s", rta_src_names[a->u.data]); +} + +struct ea_class ea_gen_source = { + .name = "source", + .type = T_ENUM_RTS, + .readonly = 1, + .format = ea_gen_source_format, +}; + +struct ea_class ea_gen_nexthop = { + .name = "nexthop", + .type = T_NEXTHOP_LIST, +}; + +/* + * ea_set_hostentry() acquires hostentry from hostcache. + * New hostentry has zero use count. Cached rta locks its + * hostentry (increases its use count), uncached rta does not lock it. + * Hostentry with zero use count is removed asynchronously + * during host cache update, therefore it is safe to hold + * such hostentry temporarily as long as you hold the table lock. + * + * There is no need to hold a lock for hostentry->dep table, because that table + * contains routes responsible for that hostentry, and therefore is non-empty if + * given hostentry has non-zero use count. If the hostentry has zero use count, + * the entry is removed before dep is referenced. + * + * The protocol responsible for routes with recursive next hops should hold a + * lock for a 'source' table governing that routes (argument tab), + * because its routes reference hostentries related to the governing table. + * When all such routes are + * removed, rtas are immediately removed achieving zero uc. Then the 'source' + * table lock could be immediately released, although hostentries may still + * exist - they will be freed together with the 'source' table. + */ + + static void +ea_gen_hostentry_stored(const eattr *ea) +{ + struct hostentry_adata *had = (struct hostentry_adata *) ea->u.ptr; + had->he->uc++; +} + +static void +ea_gen_hostentry_freed(const eattr *ea) +{ + struct hostentry_adata *had = (struct hostentry_adata *) ea->u.ptr; + had->he->uc--; +} + +struct ea_class ea_gen_hostentry = { + .name = "hostentry", + .type = T_HOSTENTRY, + .readonly = 1, + .stored = ea_gen_hostentry_stored, + .freed = ea_gen_hostentry_freed, +}; + const char * rta_dest_names[RTD_MAX] = { [RTD_NONE] = "", [RTD_UNICAST] = "unicast", @@ -85,10 +166,26 @@ const char * rta_dest_names[RTD_MAX] = { [RTD_PROHIBIT] = "prohibited", }; +struct ea_class ea_gen_flowspec_valid = { + .name = "flowspec_valid", + .type = T_ENUM_FLOWSPEC_VALID, + .readonly = 1, +}; + +const char * flowspec_valid_names[FLOWSPEC__MAX] = { + [FLOWSPEC_UNKNOWN] = "unknown", + [FLOWSPEC_VALID] = "", + [FLOWSPEC_INVALID] = "invalid", +}; + +DOMAIN(attrs) attrs_domain; + pool *rta_pool; -static slab *rta_slab_[4]; -static slab *nexthop_slab_[4]; +/* Assuming page size of 4096, these are magic values for slab allocation */ +static const uint ea_slab_sizes[] = { 56, 112, 168, 288, 448, 800, 1344 }; +static slab *ea_slab[ARRAY_SIZE(ea_slab_sizes)]; + static slab *rte_src_slab; static struct idm src_ids; @@ -96,121 +193,176 @@ static struct idm src_ids; /* rte source hash */ -#define RSH_KEY(n) n->proto, n->private_id +#define RSH_KEY(n) n->private_id #define RSH_NEXT(n) n->next -#define RSH_EQ(p1,n1,p2,n2) p1 == p2 && n1 == n2 -#define RSH_FN(p,n) p->hash_key ^ u32_hash(n) +#define RSH_EQ(n1,n2) n1 == n2 +#define RSH_FN(n) u32_hash(n) #define RSH_REHASH rte_src_rehash #define RSH_PARAMS /2, *2, 1, 1, 8, 20 -#define RSH_INIT_ORDER 6 - -static HASH(struct rte_src) src_hash; +#define RSH_INIT_ORDER 2 +static struct rte_src **rte_src_global; +static uint rte_src_global_max = SRC_ID_INIT_SIZE; static void rte_src_init(void) { rte_src_slab = sl_new(rta_pool, sizeof(struct rte_src)); + rte_src_global = mb_allocz(rta_pool, sizeof(struct rte_src *) * rte_src_global_max); idm_init(&src_ids, rta_pool, SRC_ID_INIT_SIZE); - - HASH_INIT(src_hash, rta_pool, RSH_INIT_ORDER); } - HASH_DEFINE_REHASH_FN(RSH, struct rte_src) -struct rte_src * -rt_find_source(struct proto *p, u32 id) +static struct rte_src * +rt_find_source(struct rte_owner *p, u32 id) { - return HASH_FIND(src_hash, RSH, p, id); + return HASH_FIND(p->hash, RSH, id); } struct rte_src * -rt_get_source(struct proto *p, u32 id) +rt_get_source_o(struct rte_owner *p, u32 id) { + if (p->stop) + bug("Stopping route owner asked for another source."); + struct rte_src *src = rt_find_source(p, id); if (src) + { + UNUSED u64 uc = atomic_fetch_add_explicit(&src->uc, 1, memory_order_acq_rel); return src; + } + RTA_LOCK; src = sl_allocz(rte_src_slab); - src->proto = p; + src->owner = p; src->private_id = id; src->global_id = idm_alloc(&src_ids); - src->uc = 0; - HASH_INSERT2(src_hash, RSH, rta_pool, src); + atomic_store_explicit(&src->uc, 1, memory_order_release); + p->uc++; + + HASH_INSERT2(p->hash, RSH, rta_pool, src); + if (config->table_debug) + log(L_TRACE "Allocated new rte_src for %s, ID %uL %uG, have %u sources now", + p->name, src->private_id, src->global_id, p->uc); + + if (src->global_id >= rte_src_global_max) + { + rte_src_global = mb_realloc(rte_src_global, sizeof(struct rte_src *) * (rte_src_global_max *= 2)); + memset(&rte_src_global[rte_src_global_max / 2], 0, + sizeof(struct rte_src *) * (rte_src_global_max / 2)); + } + + rte_src_global[src->global_id] = src; + RTA_UNLOCK; return src; } +struct rte_src * +rt_find_source_global(u32 id) +{ + if (id >= rte_src_global_max) + return NULL; + else + return rte_src_global[id]; +} + +static inline void +rt_done_sources(struct rte_owner *o) +{ + ev_send(o->list, o->stop); +} + void -rt_prune_sources(void) +rt_prune_sources(void *data) { - HASH_WALK_FILTER(src_hash, next, src, sp) + struct rte_owner *o = data; + + HASH_WALK_FILTER(o->hash, next, src, sp) { - if (src->uc == 0) + u64 uc; + while ((uc = atomic_load_explicit(&src->uc, memory_order_acquire)) >> RTE_SRC_PU_SHIFT) + synchronize_rcu(); + + if (uc == 0) { - HASH_DO_REMOVE(src_hash, RSH, sp); + o->uc--; + + HASH_DO_REMOVE(o->hash, RSH, sp); + + RTA_LOCK; + rte_src_global[src->global_id] = NULL; idm_free(&src_ids, src->global_id); sl_free(src); + RTA_UNLOCK; } } HASH_WALK_FILTER_END; - HASH_MAY_RESIZE_DOWN(src_hash, RSH, rta_pool); -} - + RTA_LOCK; + HASH_MAY_RESIZE_DOWN(o->hash, RSH, rta_pool); -/* - * Multipath Next Hop - */ - -static inline u32 -nexthop_hash(struct nexthop *x) -{ - u32 h = 0; - for (; x; x = x->next) + if (o->stop && !o->uc) { - h ^= ipa_hash(x->gw) ^ (h << 5) ^ (h >> 9); + rfree(o->prune); + RTA_UNLOCK; - for (int i = 0; i < x->labels; i++) - h ^= x->label[i] ^ (h << 6) ^ (h >> 7); + if (config->table_debug) + log(L_TRACE "All rte_src's for %s pruned, scheduling stop event", o->name); + + rt_done_sources(o); } + else + RTA_UNLOCK; +} - return h; +void +rt_init_sources(struct rte_owner *o, const char *name, event_list *list) +{ + RTA_LOCK; + HASH_INIT(o->hash, rta_pool, RSH_INIT_ORDER); + o->hash_key = random_u32(); + o->uc = 0; + o->name = name; + o->prune = ev_new_init(rta_pool, rt_prune_sources, o); + o->stop = NULL; + o->list = list; + RTA_UNLOCK; } -int -nexthop__same(struct nexthop *x, struct nexthop *y) +void +rt_destroy_sources(struct rte_owner *o, event *done) { - for (; x && y; x = x->next, y = y->next) + o->stop = done; + + if (!o->uc) { - if (!ipa_equal(x->gw, y->gw) || (x->iface != y->iface) || - (x->flags != y->flags) || (x->weight != y->weight) || - (x->labels_orig != y->labels_orig) || (x->labels != y->labels)) - return 0; + if (config->table_debug) + log(L_TRACE "Source owner %s destroy requested. All rte_src's already pruned, scheduling stop event", o->name); - for (int i = 0; i < x->labels; i++) - if (x->label[i] != y->label[i]) - return 0; - } + RTA_LOCK; + rfree(o->prune); + RTA_UNLOCK; - return x == y; + rt_done_sources(o); + } + else + if (config->table_debug) + log(L_TRACE "Source owner %s destroy requested. Remaining %u rte_src's to prune.", o->name, o->uc); } +/* + * Multipath Next Hop + */ + static int nexthop_compare_node(const struct nexthop *x, const struct nexthop *y) { int r; - - if (!x) - return 1; - - if (!y) - return -1; - /* Should we also compare flags ? */ r = ((int) y->weight) - ((int) x->weight); @@ -235,23 +387,16 @@ nexthop_compare_node(const struct nexthop *x, const struct nexthop *y) return ((int) x->iface->index) - ((int) y->iface->index); } -static inline struct nexthop * -nexthop_copy_node(const struct nexthop *src, linpool *lp) +static int +nexthop_compare_qsort(const void *x, const void *y) { - struct nexthop *n = lp_alloc(lp, nexthop_size(src)); - - memcpy(n, src, nexthop_size(src)); - n->next = NULL; - - return n; + return nexthop_compare_node( *(const struct nexthop **) x, *(const struct nexthop **) y ); } /** * nexthop_merge - merge nexthop lists * @x: list 1 * @y: list 2 - * @rx: reusability of list @x - * @ry: reusability of list @y * @max: max number of nexthops * @lp: linpool for allocating nexthops * @@ -268,138 +413,229 @@ nexthop_copy_node(const struct nexthop *src, linpool *lp) * resulting list is no longer needed. When reusability is not set, the * corresponding lists are not modified nor linked from the resulting list. */ -struct nexthop * -nexthop_merge(struct nexthop *x, struct nexthop *y, int rx, int ry, int max, linpool *lp) +struct nexthop_adata * +nexthop_merge(struct nexthop_adata *xin, struct nexthop_adata *yin, int max, linpool *lp) { - struct nexthop *root = NULL; - struct nexthop **n = &root; + uint outlen = ADATA_SIZE(xin->ad.length) + ADATA_SIZE(yin->ad.length); + struct nexthop_adata *out = lp_alloc(lp, outlen); + out->ad.length = outlen - sizeof (struct adata); + + struct nexthop *x = &xin->nh, *y = &yin->nh, *cur = &out->nh; + int xvalid, yvalid; - while ((x || y) && max--) + while (max--) { - int cmp = nexthop_compare_node(x, y); + xvalid = NEXTHOP_VALID(x, xin); + yvalid = NEXTHOP_VALID(y, yin); + + if (!xvalid && !yvalid) + break; + + ASSUME(NEXTHOP_VALID(cur, out)); + + int cmp = !xvalid ? 1 : !yvalid ? -1 : nexthop_compare_node(x, y); if (cmp < 0) { - ASSUME(x); - *n = rx ? x : nexthop_copy_node(x, lp); - x = x->next; + ASSUME(NEXTHOP_VALID(x, xin)); + memcpy(cur, x, nexthop_size(x)); + x = NEXTHOP_NEXT(x); } else if (cmp > 0) { - ASSUME(y); - *n = ry ? y : nexthop_copy_node(y, lp); - y = y->next; + ASSUME(NEXTHOP_VALID(y, yin)); + memcpy(cur, y, nexthop_size(y)); + y = NEXTHOP_NEXT(y); } else { - ASSUME(x && y); - *n = rx ? x : (ry ? y : nexthop_copy_node(x, lp)); - x = x->next; - y = y->next; + ASSUME(NEXTHOP_VALID(x, xin)); + memcpy(cur, x, nexthop_size(x)); + x = NEXTHOP_NEXT(x); + + ASSUME(NEXTHOP_VALID(y, yin)); + y = NEXTHOP_NEXT(y); } - n = &((*n)->next); + cur = NEXTHOP_NEXT(cur); } - *n = NULL; - return root; + out->ad.length = (void *) cur - (void *) out->ad.data; + + return out; } -void -nexthop_insert(struct nexthop **n, struct nexthop *x) +struct nexthop_adata * +nexthop_sort(struct nexthop_adata *nhad, linpool *lp) { - for (; *n; n = &((*n)->next)) - { - int cmp = nexthop_compare_node(*n, x); + /* Count the nexthops */ + uint cnt = 0; + NEXTHOP_WALK(nh, nhad) + cnt++; - if (cmp < 0) - continue; - else if (cmp > 0) - break; - else - return; - } + if (cnt <= 1) + return nhad; - x->next = *n; - *n = x; -} + /* Get pointers to them */ + struct nexthop **sptr = tmp_alloc(cnt * sizeof(struct nexthop *)); -struct nexthop * -nexthop_sort(struct nexthop *x) -{ - struct nexthop *s = NULL; + uint i = 0; + NEXTHOP_WALK(nh, nhad) + sptr[i++] = nh; + + /* Sort the pointers */ + qsort(sptr, cnt, sizeof(struct nexthop *), nexthop_compare_qsort); - /* Simple insert-sort */ - while (x) + /* Allocate the output */ + struct nexthop_adata *out = (struct nexthop_adata *) lp_alloc_adata(lp, nhad->ad.length); + struct nexthop *dest = &out->nh; + + /* Deduplicate nexthops while storing them */ + for (uint i = 0; i < cnt; i++) { - struct nexthop *n = x; - x = n->next; - n->next = NULL; + if (i && !nexthop_compare_node(sptr[i], sptr[i-1])) + continue; - nexthop_insert(&s, n); + memcpy(dest, sptr[i], NEXTHOP_SIZE(sptr[i])); + dest = NEXTHOP_NEXT(dest); } - return s; + out->ad.length = (void *) dest - (void *) out->ad.data; + return out; } int -nexthop_is_sorted(struct nexthop *x) +nexthop_is_sorted(struct nexthop_adata *nhad) { - for (; x && x->next; x = x->next) - if (nexthop_compare_node(x, x->next) >= 0) + struct nexthop *prev = NULL; + NEXTHOP_WALK(nh, nhad) + { + if (prev && (nexthop_compare_node(prev, nh) >= 0)) return 0; + prev = nh; + } + return 1; } -static inline slab * -nexthop_slab(struct nexthop *nh) +/* + * Extended Attributes + */ + +#define EA_CLASS_INITIAL_MAX 128 +static struct ea_class **ea_class_global = NULL; +static uint ea_class_max; +static struct idm ea_class_idm; + +/* Config parser lex register function */ +void ea_lex_register(struct ea_class *def); +void ea_lex_unregister(struct ea_class *def); + +static void +ea_class_free(struct ea_class *cl) { - return nexthop_slab_[MIN(nh->labels, 3)]; + /* No more ea class references. Unregister the attribute. */ + idm_free(&ea_class_idm, cl->id); + ea_class_global[cl->id] = NULL; + if (!cl->hidden) + ea_lex_unregister(cl); } -static struct nexthop * -nexthop_copy(struct nexthop *o) +static void +ea_class_ref_free(resource *r) { - struct nexthop *first = NULL; - struct nexthop **last = &first; - - for (; o; o = o->next) - { - struct nexthop *n = sl_allocz(nexthop_slab(o)); - n->gw = o->gw; - n->iface = o->iface; - n->next = NULL; - n->flags = o->flags; - n->weight = o->weight; - n->labels_orig = o->labels_orig; - n->labels = o->labels; - for (int i=0; i<o->labels; i++) - n->label[i] = o->label[i]; - - *last = n; - last = &(n->next); - } + struct ea_class_ref *ref = SKIP_BACK(struct ea_class_ref, r, r); + if (!--ref->class->uc) + ea_class_free(ref->class); +} - return first; +static void +ea_class_ref_dump(resource *r, unsigned indent UNUSED) +{ + struct ea_class_ref *ref = SKIP_BACK(struct ea_class_ref, r, r); + debug("name \"%s\", type=%d\n", ref->class->name, ref->class->type); } +static struct resclass ea_class_ref_class = { + .name = "Attribute class reference", + .size = sizeof(struct ea_class_ref), + .free = ea_class_ref_free, + .dump = ea_class_ref_dump, + .lookup = NULL, + .memsize = NULL, +}; + static void -nexthop_free(struct nexthop *o) +ea_class_init(void) { - struct nexthop *n; + ASSERT_DIE(ea_class_global == NULL); - while (o) - { - n = o->next; - sl_free(o); - o = n; - } + idm_init(&ea_class_idm, rta_pool, EA_CLASS_INITIAL_MAX); + ea_class_global = mb_allocz(rta_pool, + sizeof(*ea_class_global) * (ea_class_max = EA_CLASS_INITIAL_MAX)); +} + +static struct ea_class_ref * +ea_ref_class(pool *p, struct ea_class *def) +{ + def->uc++; + struct ea_class_ref *ref = ralloc(p, &ea_class_ref_class); + ref->class = def; + return ref; } +static struct ea_class_ref * +ea_register(pool *p, struct ea_class *def) +{ + def->id = idm_alloc(&ea_class_idm); -/* - * Extended Attributes - */ + ASSERT_DIE(ea_class_global); + while (def->id >= ea_class_max) + ea_class_global = mb_realloc(ea_class_global, sizeof(*ea_class_global) * (ea_class_max *= 2)); + + ASSERT_DIE(def->id < ea_class_max); + ea_class_global[def->id] = def; + + if (!def->hidden) + ea_lex_register(def); + + return ea_ref_class(p, def); +} + +struct ea_class_ref * +ea_register_alloc(pool *p, struct ea_class cl) +{ + struct ea_class *clp = ea_class_find_by_name(cl.name); + if (clp && clp->type == cl.type) + return ea_ref_class(p, clp); + + uint namelen = strlen(cl.name) + 1; + + struct { + struct ea_class cl; + char name[0]; + } *cla = mb_alloc(rta_pool, sizeof(struct ea_class) + namelen); + cla->cl = cl; + memcpy(cla->name, cl.name, namelen); + cla->cl.name = cla->name; + + return ea_register(p, &cla->cl); +} + +void +ea_register_init(struct ea_class *clp) +{ + ASSERT_DIE(!ea_class_find_by_name(clp->name)); + ea_register(&root_pool, clp); +} + +struct ea_class * +ea_class_find_by_id(uint id) +{ + ASSERT_DIE(id < ea_class_max); + ASSERT_DIE(ea_class_global[id]); + return ea_class_global[id]; +} static inline eattr * ea__find(ea_list *e, unsigned id) @@ -444,7 +680,7 @@ ea__find(ea_list *e, unsigned id) * to its &eattr structure or %NULL if no such attribute exists. */ eattr * -ea_find(ea_list *e, unsigned id) +ea_find_by_id(ea_list *e, unsigned id) { eattr *a = ea__find(e, id & EA_CODE_MASK); @@ -529,25 +765,6 @@ ea_walk(struct ea_walk_state *s, uint id, uint max) return NULL; } -/** - * ea_get_int - fetch an integer attribute - * @e: attribute list - * @id: attribute ID - * @def: default value - * - * This function is a shortcut for retrieving a value of an integer attribute - * by calling ea_find() to find the attribute, extracting its value or returning - * a provided default if no such attribute is present. - */ -uintptr_t -ea_get_int(ea_list *e, unsigned id, uintptr_t def) -{ - eattr *a = ea_find(e, id); - if (!a) - return def; - return a->u.data; -} - static inline void ea_do_sort(ea_list *e) { @@ -614,8 +831,8 @@ ea_do_prune(ea_list *e) s++; /* Now s0 is the most recent version, s[-1] the oldest one */ - /* Drop undefs */ - if (s0->undef) + /* Drop undefs unless this is a true overlay */ + if (s0->undef && (s[-1].undef || !e->next)) continue; /* Copy the newest version to destination */ @@ -645,21 +862,18 @@ ea_do_prune(ea_list *e) * If an attribute occurs multiple times in a single &ea_list, * ea_sort() leaves only the first (the only significant) occurrence. */ -void +static void ea_sort(ea_list *e) { - while (e) - { - if (!(e->flags & EALF_SORTED)) - { - ea_do_sort(e); - ea_do_prune(e); - e->flags |= EALF_SORTED; - } - if (e->count > 5) - e->flags |= EALF_BISECT; - e = e->next; - } + if (!(e->flags & EALF_SORTED)) + { + ea_do_sort(e); + ea_do_prune(e); + e->flags |= EALF_SORTED; + } + + if (e->count > 5) + e->flags |= EALF_BISECT; } /** @@ -669,8 +883,8 @@ ea_sort(ea_list *e) * This function calculates an upper bound of the size of * a given &ea_list after merging with ea_merge(). */ -unsigned -ea_scan(ea_list *e) +static unsigned +ea_scan(const ea_list *e, int overlay) { unsigned cnt = 0; @@ -678,6 +892,8 @@ ea_scan(ea_list *e) { cnt += e->count; e = e->next; + if (e && overlay && ea_is_cached(e)) + break; } return sizeof(ea_list) + sizeof(eattr)*cnt; } @@ -696,21 +912,36 @@ ea_scan(ea_list *e) * segments with ea_merge() and finally sort and prune the result * by calling ea_sort(). */ -void -ea_merge(ea_list *e, ea_list *t) +static void +ea_merge(ea_list *e, ea_list *t, int overlay) { eattr *d = t->attrs; t->flags = 0; t->count = 0; - t->next = NULL; + while (e) { memcpy(d, e->attrs, sizeof(eattr)*e->count); t->count += e->count; d += e->count; e = e->next; + + if (e && overlay && ea_is_cached(e)) + break; } + + t->next = e; +} + +ea_list * +ea_normalize(ea_list *e, int overlay) +{ + ea_list *t = tmp_alloc(ea_scan(e, overlay)); + ea_merge(e, t, overlay); + ea_sort(t); + + return t->count ? t : t->next; } /** @@ -728,7 +959,8 @@ ea_same(ea_list *x, ea_list *y) if (!x || !y) return x == y; - ASSERT(!x->next && !y->next); + if (x->next != y->next) + return 0; if (x->count != y->count) return 0; for(c=0; c<x->count; c++) @@ -748,33 +980,37 @@ ea_same(ea_list *x, ea_list *y) return 1; } -static inline ea_list * -ea_list_copy(ea_list *o) +uint +ea_list_size(ea_list *o) { - ea_list *n; - unsigned i, adpos, elen; + unsigned i, elen; - if (!o) - return NULL; - ASSERT(!o->next); - elen = adpos = sizeof(ea_list) + sizeof(eattr) * o->count; + ASSERT_DIE(o); + elen = BIRD_CPU_ALIGN(sizeof(ea_list) + sizeof(eattr) * o->count); for(i=0; i<o->count; i++) { eattr *a = &o->attrs[i]; - if (!(a->type & EAF_EMBEDDED)) - elen += sizeof(struct adata) + a->u.ptr->length; + if (!a->undef && !(a->type & EAF_EMBEDDED)) + elen += ADATA_SIZE(a->u.ptr->length); } - n = mb_alloc(rta_pool, elen); + return elen; +} + +void +ea_list_copy(ea_list *n, ea_list *o, uint elen) +{ + uint adpos = sizeof(ea_list) + sizeof(eattr) * o->count; memcpy(n, o, adpos); - n->flags |= EALF_CACHED; - for(i=0; i<o->count; i++) + adpos = BIRD_CPU_ALIGN(adpos); + + for(uint i=0; i<o->count; i++) { eattr *a = &n->attrs[i]; - if (!(a->type & EAF_EMBEDDED)) + if (!a->undef && !(a->type & EAF_EMBEDDED)) { - unsigned size = sizeof(struct adata) + a->u.ptr->length; + unsigned size = ADATA_SIZE(a->u.ptr->length); ASSERT_DIE(adpos + size <= elen); struct adata *d = ((void *) n) + adpos; @@ -784,30 +1020,58 @@ ea_list_copy(ea_list *o) adpos += size; } } + ASSERT_DIE(adpos == elen); - return n; } -static inline void -ea_free(ea_list *o) +static void +ea_list_ref(ea_list *l) { - if (o) + for(uint i=0; i<l->count; i++) { - ASSERT(!o->next); - mb_free(o); + eattr *a = &l->attrs[i]; + ASSERT_DIE(a->id < ea_class_max); + + if (a->undef) + continue; + + struct ea_class *cl = ea_class_global[a->id]; + ASSERT_DIE(cl && cl->uc); + + CALL(cl->stored, a); + cl->uc++; } + + if (l->next) + { + ASSERT_DIE(ea_is_cached(l->next)); + ea_clone(l->next); + } } -static int -get_generic_attr(const eattr *a, byte **buf, int buflen UNUSED) +static void ea_free_nested(ea_list *l); + +static void +ea_list_unref(ea_list *l) { - if (a->id == EA_GEN_IGP_METRIC) + for(uint i=0; i<l->count; i++) { - *buf += bsprintf(*buf, "igp_metric"); - return GA_NAME; + eattr *a = &l->attrs[i]; + ASSERT_DIE(a->id < ea_class_max); + + if (a->undef) + continue; + + struct ea_class *cl = ea_class_global[a->id]; + ASSERT_DIE(cl && cl->uc); + + CALL(cl->freed, a); + if (!--cl->uc) + ea_class_free(cl); } - return GA_UNKNOWN; + if (l->next) + ea_free_nested(l->next); } void @@ -860,41 +1124,90 @@ opaque_format(const struct adata *ad, byte *buf, uint size) } static inline void -ea_show_int_set(struct cli *c, const struct adata *ad, int way, byte *pos, byte *buf, byte *end) +ea_show_int_set(struct cli *c, const char *name, const struct adata *ad, int way, byte *buf) { - int i = int_set_format(ad, way, 0, pos, end - pos); - cli_printf(c, -1012, "\t%s", buf); + int nlen = strlen(name); + int i = int_set_format(ad, way, 0, buf, CLI_MSG_SIZE - nlen - 3); + cli_printf(c, -1012, "\t%s: %s", name, buf); while (i) { - i = int_set_format(ad, way, i, buf, end - buf - 1); + i = int_set_format(ad, way, i, buf, CLI_MSG_SIZE - 1); cli_printf(c, -1012, "\t\t%s", buf); } } static inline void -ea_show_ec_set(struct cli *c, const struct adata *ad, byte *pos, byte *buf, byte *end) +ea_show_ec_set(struct cli *c, const char *name, const struct adata *ad, byte *buf) { - int i = ec_set_format(ad, 0, pos, end - pos); - cli_printf(c, -1012, "\t%s", buf); + int nlen = strlen(name); + int i = ec_set_format(ad, 0, buf, CLI_MSG_SIZE - nlen - 3); + cli_printf(c, -1012, "\t%s: %s", name, buf); while (i) { - i = ec_set_format(ad, i, buf, end - buf - 1); + i = ec_set_format(ad, i, buf, CLI_MSG_SIZE - 1); cli_printf(c, -1012, "\t\t%s", buf); } } static inline void -ea_show_lc_set(struct cli *c, const struct adata *ad, byte *pos, byte *buf, byte *end) +ea_show_lc_set(struct cli *c, const char *name, const struct adata *ad, byte *buf) { - int i = lc_set_format(ad, 0, pos, end - pos); - cli_printf(c, -1012, "\t%s", buf); + int nlen = strlen(name); + int i = lc_set_format(ad, 0, buf, CLI_MSG_SIZE - nlen - 3); + cli_printf(c, -1012, "\t%s: %s", name, buf); while (i) { - i = lc_set_format(ad, i, buf, end - buf - 1); + i = lc_set_format(ad, i, buf, CLI_MSG_SIZE - 1); cli_printf(c, -1012, "\t\t%s", buf); } } +void +ea_show_nexthop_list(struct cli *c, struct nexthop_adata *nhad) +{ + if (!NEXTHOP_IS_REACHABLE(nhad)) + return; + + NEXTHOP_WALK(nh, nhad) + { + char mpls[MPLS_MAX_LABEL_STACK*12 + 5], *lsp = mpls; + char *onlink = (nh->flags & RNF_ONLINK) ? " onlink" : ""; + char weight[16] = ""; + + if (nh->labels) + { + lsp += bsprintf(lsp, " mpls %d", nh->label[0]); + for (int i=1;i<nh->labels; i++) + lsp += bsprintf(lsp, "/%d", nh->label[i]); + } + *lsp = '\0'; + + if (!NEXTHOP_ONE(nhad)) + bsprintf(weight, " weight %d", nh->weight + 1); + + if (ipa_nonzero(nh->gw)) + if (nh->iface) + cli_printf(c, -1007, "\tvia %I on %s%s%s%s", + nh->gw, nh->iface->name, mpls, onlink, weight); + else + cli_printf(c, -1007, "\tvia %I", nh->gw); + else + cli_printf(c, -1007, "\tdev %s%s%s", + nh->iface->name, mpls, onlink, weight); + } +} + +void +ea_show_hostentry(const struct adata *ad, byte *buf, uint size) +{ + const struct hostentry_adata *had = (const struct hostentry_adata *) ad; + + if (ipa_nonzero(had->he->link) && !ipa_equal(had->he->link, had->he->addr)) + bsnprintf(buf, size, "via %I %I table %s", had->he->addr, had->he->link, had->he->tab->name); + else + bsnprintf(buf, size, "via %I table %s", had->he->addr, had->he->tab->name); +} + /** * ea_show - print an &eattr to CLI * @c: destination CLI @@ -906,84 +1219,80 @@ ea_show_lc_set(struct cli *c, const struct adata *ad, byte *pos, byte *buf, byte * If the protocol defining the attribute provides its own * get_attr() hook, it's consulted first. */ -void +static void ea_show(struct cli *c, const eattr *e) { - struct protocol *p; - int status = GA_UNKNOWN; const struct adata *ad = (e->type & EAF_EMBEDDED) ? NULL : e->u.ptr; byte buf[CLI_MSG_SIZE]; byte *pos = buf, *end = buf + sizeof(buf); - if (EA_IS_CUSTOM(e->id)) - { - const char *name = ea_custom_name(e->id); - if (name) - { - pos += bsprintf(pos, "%s", name); - status = GA_NAME; - } - else - pos += bsprintf(pos, "%02x.", EA_PROTO(e->id)); - } - else if (p = class_to_protocol[EA_PROTO(e->id)]) - { - pos += bsprintf(pos, "%s.", p->name); - if (p->get_attr) - status = p->get_attr(e, pos, end - pos); - pos += strlen(pos); - } - else if (EA_PROTO(e->id)) - pos += bsprintf(pos, "%02x.", EA_PROTO(e->id)); - else - status = get_generic_attr(e, &pos, end - pos); + ASSERT_DIE(e->id < ea_class_max); - if (status < GA_NAME) - pos += bsprintf(pos, "%02x", EA_ID(e->id)); - if (status < GA_FULL) - { - *pos++ = ':'; - *pos++ = ' '; + struct ea_class *cls = ea_class_global[e->id]; + ASSERT_DIE(cls); + + if (e->undef || cls->hidden) + return; + else if (cls->format) + cls->format(e, buf, end - buf); + else + switch (e->type) + { + case T_INT: + if ((cls == &ea_gen_igp_metric) && e->u.data >= IGP_METRIC_UNKNOWN) + return; - if (e->undef) - bsprintf(pos, "undefined"); - else - switch (e->type & EAF_TYPE_MASK) - { - case EAF_TYPE_INT: bsprintf(pos, "%u", e->u.data); break; - case EAF_TYPE_OPAQUE: + case T_OPAQUE: opaque_format(ad, pos, end - pos); break; - case EAF_TYPE_IP_ADDRESS: + case T_IP: bsprintf(pos, "%I", *(ip_addr *) ad->data); break; - case EAF_TYPE_ROUTER_ID: + case T_QUAD: bsprintf(pos, "%R", e->u.data); break; - case EAF_TYPE_AS_PATH: + case T_PATH: as_path_format(ad, pos, end - pos); break; - case EAF_TYPE_BITFIELD: - bsprintf(pos, "%08x", e->u.data); - break; - case EAF_TYPE_INT_SET: - ea_show_int_set(c, ad, 1, pos, buf, end); + case T_CLIST: + ea_show_int_set(c, cls->name, ad, 1, buf); return; - case EAF_TYPE_EC_SET: - ea_show_ec_set(c, ad, pos, buf, end); + case T_ECLIST: + ea_show_ec_set(c, cls->name, ad, buf); return; - case EAF_TYPE_LC_SET: - ea_show_lc_set(c, ad, pos, buf, end); + case T_LCLIST: + ea_show_lc_set(c, cls->name, ad, buf); return; + case T_NEXTHOP_LIST: + ea_show_nexthop_list(c, (struct nexthop_adata *) e->u.ptr); + return; + case T_HOSTENTRY: + ea_show_hostentry(ad, pos, end - pos); + break; default: bsprintf(pos, "<type %02x>", e->type); - } - } + } - if (status != GA_HIDDEN) - cli_printf(c, -1012, "\t%s", buf); + cli_printf(c, -1012, "\t%s: %s", cls->name, buf); +} + +static void +nexthop_dump(const struct adata *ad) +{ + struct nexthop_adata *nhad = (struct nexthop_adata *) ad; + + debug(":"); + + NEXTHOP_WALK(nh, nhad) + { + if (ipa_nonzero(nh->gw)) debug(" ->%I", nh->gw); + if (nh->labels) debug(" L %d", nh->label[0]); + for (int i=1; i<nh->labels; i++) + debug("/%d", nh->label[i]); + debug(" [%s]", nh->iface ? nh->iface->name : "???"); + } } /** @@ -1005,19 +1314,26 @@ ea_dump(ea_list *e) } while (e) { - debug("[%c%c%c]", + struct ea_storage *s = ea_is_cached(e) ? ea_get_storage(e) : NULL; + debug("[%c%c%c] uc=%d h=%08x", (e->flags & EALF_SORTED) ? 'S' : 's', (e->flags & EALF_BISECT) ? 'B' : 'b', - (e->flags & EALF_CACHED) ? 'C' : 'c'); + (e->flags & EALF_CACHED) ? 'C' : 'c', + s ? s->uc : 0, s ? s->hash_key : 0); for(i=0; i<e->count; i++) { eattr *a = &e->attrs[i]; - debug(" %02x:%02x.%02x", EA_PROTO(a->id), EA_ID(a->id), a->flags); - debug("=%c", "?iO?I?P???S?????" [a->type & EAF_TYPE_MASK]); + debug(" %04x.%02x", a->id, a->flags); + debug("=%c", + "?iO?IRP???S??pE?" + "??L???N?????????" + "?o???r??????????" [a->type]); if (a->originated) debug("o"); if (a->type & EAF_EMBEDDED) debug(":%08x", a->u.data); + else if (a->id == ea_gen_nexthop.id) + nexthop_dump(a->u.ptr); else { int j, len = a->u.ptr->length; @@ -1047,10 +1363,13 @@ ea_hash(ea_list *e) if (e) /* Assuming chain of length 1 */ { + h ^= mem_hash(&e->next, sizeof(e->next)); for(i=0; i<e->count; i++) { struct eattr *a = &e->attrs[i]; h ^= a->id; h *= mul; + if (a->undef) + continue; if (a->type & EAF_EMBEDDED) h ^= a->u.data; else @@ -1094,12 +1413,12 @@ static uint rta_cache_count; static uint rta_cache_size = 32; static uint rta_cache_limit; static uint rta_cache_mask; -static rta **rta_hash_table; +static struct ea_storage **rta_hash_table; static void rta_alloc_hash(void) { - rta_hash_table = mb_allocz(rta_pool, sizeof(rta *) * rta_cache_size); + rta_hash_table = mb_allocz(rta_pool, sizeof(struct ea_storage *) * rta_cache_size); if (rta_cache_size < 32768) rta_cache_limit = rta_cache_size * 2; else @@ -1107,64 +1426,14 @@ rta_alloc_hash(void) rta_cache_mask = rta_cache_size - 1; } -static inline uint -rta_hash(rta *a) -{ - u64 h; - mem_hash_init(&h); -#define MIX(f) mem_hash_mix(&h, &(a->f), sizeof(a->f)); -#define BMIX(f) mem_hash_mix_num(&h, a->f); - MIX(hostentry); - MIX(from); - MIX(igp_metric); - BMIX(source); - BMIX(scope); - BMIX(dest); - MIX(pref); -#undef MIX - - return mem_hash_value(&h) ^ nexthop_hash(&(a->nh)) ^ ea_hash(a->eattrs); -} - -static inline int -rta_same(rta *x, rta *y) -{ - return (x->source == y->source && - x->scope == y->scope && - x->dest == y->dest && - x->igp_metric == y->igp_metric && - ipa_equal(x->from, y->from) && - x->hostentry == y->hostentry && - nexthop_same(&(x->nh), &(y->nh)) && - ea_same(x->eattrs, y->eattrs)); -} - -static inline slab * -rta_slab(rta *a) -{ - return rta_slab_[a->nh.labels > 2 ? 3 : a->nh.labels]; -} - -static rta * -rta_copy(rta *o) -{ - rta *r = sl_alloc(rta_slab(o)); - - memcpy(r, o, rta_size(o)); - r->uc = 1; - r->nh.next = nexthop_copy(o->nh.next); - r->eattrs = ea_list_copy(o->eattrs); - return r; -} - static inline void -rta_insert(rta *r) +rta_insert(struct ea_storage *r) { uint h = r->hash_key & rta_cache_mask; - r->next = rta_hash_table[h]; - if (r->next) - r->next->pprev = &r->next; - r->pprev = &rta_hash_table[h]; + r->next_hash = rta_hash_table[h]; + if (r->next_hash) + r->next_hash->pprev_hash = &r->next_hash; + r->pprev_hash = &rta_hash_table[h]; rta_hash_table[h] = r; } @@ -1173,8 +1442,8 @@ rta_rehash(void) { uint ohs = rta_cache_size; uint h; - rta *r, *n; - rta **oht = rta_hash_table; + struct ea_storage *r, *n; + struct ea_storage **oht = rta_hash_table; rta_cache_size = 2*rta_cache_size; DBG("Rehashing rta cache from %d to %d entries.\n", ohs, rta_cache_size); @@ -1182,7 +1451,7 @@ rta_rehash(void) for(h=0; h<ohs; h++) for(r=oht[h]; r; r=n) { - n = r->next; + n = r->next_hash; rta_insert(r); } mb_free(oht); @@ -1201,100 +1470,89 @@ rta_rehash(void) * The extended attribute lists attached to the &rta are automatically * converted to the normalized form. */ -rta * -rta_lookup(rta *o) +ea_list * +ea_lookup(ea_list *o, int overlay) { - rta *r; + struct ea_storage *r; uint h; - ASSERT(!o->cached); - if (o->eattrs) - ea_normalize(o->eattrs); + ASSERT(!ea_is_cached(o)); + o = ea_normalize(o, overlay); + h = ea_hash(o); + + RTA_LOCK; - h = rta_hash(o); - for(r=rta_hash_table[h & rta_cache_mask]; r; r=r->next) - if (r->hash_key == h && rta_same(r, o)) - return rta_clone(r); + for(r=rta_hash_table[h & rta_cache_mask]; r; r=r->next_hash) + if (r->hash_key == h && ea_same(r->l, o)) + { + atomic_fetch_add_explicit(&r->uc, 1, memory_order_acq_rel); + RTA_UNLOCK; + return r->l; + } - r = rta_copy(o); + uint elen = ea_list_size(o); + uint sz = elen + sizeof(struct ea_storage); + for (uint i=0; i<ARRAY_SIZE(ea_slab_sizes); i++) + if (sz <= ea_slab_sizes[i]) + { + r = sl_alloc(ea_slab[i]); + break; + } + + int huge = r ? 0 : EALF_HUGE;; + if (huge) + r = mb_alloc(rta_pool, sz); + + ea_list_copy(r->l, o, elen); + ea_list_ref(r->l); + + r->l->flags |= EALF_CACHED | huge; r->hash_key = h; - r->cached = 1; - rt_lock_hostentry(r->hostentry); + r->uc = 1; + rta_insert(r); if (++rta_cache_count > rta_cache_limit) rta_rehash(); - return r; + RTA_UNLOCK; + return r->l; } -void -rta__free(rta *a) +static void +ea_free_locked(struct ea_storage *a) { - ASSERT(rta_cache_count && a->cached); + /* Somebody has cloned this rta inbetween. This sometimes happens. */ + if (atomic_load_explicit(&a->uc, memory_order_acquire)) + return; + + ASSERT(rta_cache_count); rta_cache_count--; - *a->pprev = a->next; - if (a->next) - a->next->pprev = a->pprev; - rt_unlock_hostentry(a->hostentry); - if (a->nh.next) - nexthop_free(a->nh.next); - ea_free(a->eattrs); - a->cached = 0; - sl_free(a); + *a->pprev_hash = a->next_hash; + if (a->next_hash) + a->next_hash->pprev_hash = a->pprev_hash; + + ea_list_unref(a->l); + if (a->l->flags & EALF_HUGE) + mb_free(a); + else + sl_free(a); } -rta * -rta_do_cow(rta *o, linpool *lp) +static void +ea_free_nested(struct ea_list *l) { - rta *r = lp_alloc(lp, rta_size(o)); - memcpy(r, o, rta_size(o)); - for (struct nexthop **nhn = &(r->nh.next), *nho = o->nh.next; nho; nho = nho->next) - { - *nhn = lp_alloc(lp, nexthop_size(nho)); - memcpy(*nhn, nho, nexthop_size(nho)); - nhn = &((*nhn)->next); - } - r->cached = 0; - r->uc = 0; - return r; + struct ea_storage *r = ea_get_storage(l); + if (1 == atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel)) + ea_free_locked(r); } -/** - * rta_dump - dump route attributes - * @a: attribute structure to dump - * - * This function takes a &rta and dumps its contents to the debug output. - */ void -rta_dump(rta *a) +ea__free(struct ea_storage *a) { - static char *rts[] = { "", "RTS_STATIC", "RTS_INHERIT", "RTS_DEVICE", - "RTS_STAT_DEV", "RTS_REDIR", "RTS_RIP", - "RTS_OSPF", "RTS_OSPF_IA", "RTS_OSPF_EXT1", - "RTS_OSPF_EXT2", "RTS_BGP", "RTS_PIPE", "RTS_BABEL" }; - static char *rtd[] = { "", " DEV", " HOLE", " UNREACH", " PROHIBIT" }; - - debug("pref=%d uc=%d %s %s%s h=%04x", - a->pref, a->uc, rts[a->source], ip_scope_text(a->scope), - rtd[a->dest], a->hash_key); - if (!a->cached) - debug(" !CACHED"); - debug(" <-%I", a->from); - if (a->dest == RTD_UNICAST) - for (struct nexthop *nh = &(a->nh); nh; nh = nh->next) - { - if (ipa_nonzero(nh->gw)) debug(" ->%I", nh->gw); - if (nh->labels) debug(" L %d", nh->label[0]); - for (int i=1; i<nh->labels; i++) - debug("/%d", nh->label[i]); - debug(" [%s]", nh->iface ? nh->iface->name : "???"); - } - if (a->eattrs) - { - debug(" EA: "); - ea_dump(a->eattrs); - } + RTA_LOCK; + ea_free_locked(a); + RTA_UNLOCK; } /** @@ -1304,30 +1562,29 @@ rta_dump(rta *a) * to the debug output. */ void -rta_dump_all(void) +ea_dump_all(void) { - rta *a; - uint h; + RTA_LOCK; debug("Route attribute cache (%d entries, rehash at %d):\n", rta_cache_count, rta_cache_limit); - for(h=0; h<rta_cache_size; h++) - for(a=rta_hash_table[h]; a; a=a->next) + for (uint h=0; h < rta_cache_size; h++) + for (struct ea_storage *a = rta_hash_table[h]; a; a = a->next_hash) { debug("%p ", a); - rta_dump(a); + ea_dump(a->l); debug("\n"); } debug("\n"); + + RTA_UNLOCK; } void -rta_show(struct cli *c, rta *a) +ea_show_list(struct cli *c, ea_list *eal) { - cli_printf(c, -1008, "\tType: %s %s", rta_src_names[a->source], ip_scope_text(a->scope)); - - for(ea_list *eal = a->eattrs; eal; eal=eal->next) - for(int i=0; i<eal->count; i++) - ea_show(c, &eal->attrs[i]); + ea_list *n = ea_normalize(eal, 0); + for (int i =0; i < n->count; i++) + ea_show(c, &n->attrs[i]); } /** @@ -1339,20 +1596,27 @@ rta_show(struct cli *c, rta *a) void rta_init(void) { - rta_pool = rp_new(&root_pool, "Attributes"); + attrs_domain = DOMAIN_NEW(attrs, "Attributes"); - rta_slab_[0] = sl_new(rta_pool, sizeof(rta)); - rta_slab_[1] = sl_new(rta_pool, sizeof(rta) + sizeof(u32)); - rta_slab_[2] = sl_new(rta_pool, sizeof(rta) + sizeof(u32)*2); - rta_slab_[3] = sl_new(rta_pool, sizeof(rta) + sizeof(u32)*MPLS_MAX_LABEL_STACK); + rta_pool = rp_new(&root_pool, "Attributes"); - nexthop_slab_[0] = sl_new(rta_pool, sizeof(struct nexthop)); - nexthop_slab_[1] = sl_new(rta_pool, sizeof(struct nexthop) + sizeof(u32)); - nexthop_slab_[2] = sl_new(rta_pool, sizeof(struct nexthop) + sizeof(u32)*2); - nexthop_slab_[3] = sl_new(rta_pool, sizeof(struct nexthop) + sizeof(u32)*MPLS_MAX_LABEL_STACK); + for (uint i=0; i<ARRAY_SIZE(ea_slab_sizes); i++) + ea_slab[i] = sl_new(rta_pool, ea_slab_sizes[i]); rta_alloc_hash(); rte_src_init(); + ea_class_init(); + + /* These attributes are required to be first for nice "show route" output */ + ea_register_init(&ea_gen_nexthop); + ea_register_init(&ea_gen_hostentry); + + /* Other generic route attributes */ + ea_register_init(&ea_gen_preference); + ea_register_init(&ea_gen_igp_metric); + ea_register_init(&ea_gen_from); + ea_register_init(&ea_gen_source); + ea_register_init(&ea_gen_flowspec_valid); } /* diff --git a/nest/rt-dev.c b/nest/rt-dev.c index 7932b8b7..8ae563b5 100644 --- a/nest/rt-dev.c +++ b/nest/rt-dev.c @@ -18,7 +18,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/rt-dev.h" #include "conf/conf.h" #include "lib/resource.h" @@ -67,13 +67,11 @@ dev_ifa_notify(struct proto *P, uint flags, struct ifa *ad) /* Use iface ID as local source ID */ struct rte_src *src = rt_get_source(P, ad->iface->index); - rte_update2(c, net, NULL, src); + rte_update(c, net, NULL, src); + rt_unlock_source(src); } else if (flags & IF_CHANGE_UP) { - rta *a; - rte *e; - DBG("dev_if_notify: %s:%I going up\n", ad->iface->name, ad->ip); if (cf->check_link && !(ad->iface->flags & IF_LINK_UP)) @@ -82,17 +80,23 @@ dev_ifa_notify(struct proto *P, uint flags, struct ifa *ad) /* Use iface ID as local source ID */ struct rte_src *src = rt_get_source(P, ad->iface->index); - rta a0 = { - .pref = c->preference, - .source = RTS_DEVICE, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, - .nh.iface = ad->iface, + ea_list *ea = NULL; + struct nexthop_adata nhad = { + .nh = { .iface = ad->iface, }, + .ad = { .length = (void *) NEXTHOP_NEXT(&nhad.nh) - (void *) nhad.ad.data, }, + }; + + ea_set_attr_u32(&ea, &ea_gen_preference, 0, c->preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_DEVICE); + ea_set_attr_data(&ea, &ea_gen_nexthop, 0, nhad.ad.data, nhad.ad.length); + + rte e0 = { + .attrs = ea, + .src = src, }; - a = rta_lookup(&a0); - e = rte_get_temp(a, src); - rte_update2(c, net, e, src); + rte_update(c, net, &e0, src); + rt_unlock_source(src); } } @@ -141,8 +145,8 @@ dev_init(struct proto_config *CF) proto_configure_channel(P, &p->ip4_channel, cf->ip4_channel); proto_configure_channel(P, &p->ip6_channel, cf->ip6_channel); - P->if_notify = dev_if_notify; - P->ifa_notify = dev_ifa_notify; + P->iface_sub.if_notify = dev_if_notify; + P->iface_sub.ifa_notify = dev_ifa_notify; return P; } @@ -184,7 +188,6 @@ dev_copy_config(struct proto_config *dest, struct proto_config *src) struct protocol proto_device = { .name = "Direct", .template = "direct%d", - .class = PROTOCOL_DIRECT, .preference = DEF_PREF_DIRECT, .channel_mask = NB_IP | NB_IP6_SADR, .proto_size = sizeof(struct rt_dev_proto), diff --git a/nest/rt-fib.c b/nest/rt-fib.c index 43e3039d..801561da 100644 --- a/nest/rt-fib.c +++ b/nest/rt-fib.c @@ -55,7 +55,7 @@ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "lib/string.h" /* diff --git a/nest/rt-show.c b/nest/rt-show.c index 183d023c..a5c7dc8f 100644 --- a/nest/rt-show.c +++ b/nest/rt-show.c @@ -10,7 +10,7 @@ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/cli.h" #include "nest/iface.h" @@ -19,90 +19,83 @@ #include "sysdep/unix/krt.h" static void -rt_show_table(struct cli *c, struct rt_show_data *d) +rt_show_table(struct rt_show_data *d) { + struct cli *c = d->cli; + /* No table blocks in 'show route count' */ if (d->stats == 2) return; if (d->last_table) cli_printf(c, -1007, ""); - cli_printf(c, -1007, "Table %s:", d->tab->table->name); + cli_printf(c, -1007, "Table %s:", + d->tab->name); d->last_table = d->tab; } -static inline struct krt_proto * -rt_show_get_kernel(struct rt_show_data *d) -{ - struct proto_config *krt = d->tab->table->config->krt_attached; - return krt ? (struct krt_proto *) krt->proto : NULL; -} - static void rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, int primary) { byte from[IPA_MAX_TEXT_LENGTH+8]; byte tm[TM_DATETIME_BUFFER_SIZE], info[256]; - rta *a = e->attrs; - int sync_error = d->kernel ? krt_get_sync_error(d->kernel, e) : 0; - void (*get_route_info)(struct rte *, byte *buf); - struct nexthop *nh; + ea_list *a = e->attrs; + int sync_error = d->tab->kernel ? krt_get_sync_error(d->tab->kernel, e) : 0; + void (*get_route_info)(const rte *, byte *buf); + eattr *nhea = net_type_match(e->net, NB_DEST) ? + ea_find(a, &ea_gen_nexthop) : NULL; + struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhad ? (NEXTHOP_IS_REACHABLE(nhad) ? RTD_UNICAST : nhad->dest) : RTD_NONE; + int flowspec_valid = net_is_flow(e->net) ? rt_get_flowspec_valid(e) : FLOWSPEC_UNKNOWN; tm_format_time(tm, &config->tf_route, e->lastmod); - if (ipa_nonzero(a->from) && !ipa_equal(a->from, a->nh.gw)) - bsprintf(from, " from %I", a->from); + ip_addr a_from = ea_get_ip(a, &ea_gen_from, IPA_NONE); + if (ipa_nonzero(a_from) && (!nhad || !ipa_equal(a_from, nhad->nh.gw))) + bsprintf(from, " from %I", a_from); else from[0] = 0; /* Need to normalize the extended attributes */ - if (d->verbose && !rta_is_cached(a) && a->eattrs) - ea_normalize(a->eattrs); + if (d->verbose && !rta_is_cached(a) && a) + a = ea_normalize(a, 0); - get_route_info = e->src->proto->proto->get_route_info; + get_route_info = e->src->owner->class ? e->src->owner->class->get_route_info : NULL; if (get_route_info) get_route_info(e, info); else - bsprintf(info, " (%d)", a->pref); + bsprintf(info, " (%d)", rt_get_preference(e)); if (d->last_table != d->tab) - rt_show_table(c, d); - - cli_printf(c, -1007, "%-20s %s [%s %s%s]%s%s", ia, rta_dest_name(a->dest), - e->src->proto->name, tm, from, primary ? (sync_error ? " !" : " *") : "", info); - - if (a->dest == RTD_UNICAST) - for (nh = &(a->nh); nh; nh = nh->next) - { - char mpls[MPLS_MAX_LABEL_STACK*12 + 5], *lsp = mpls; - char *onlink = (nh->flags & RNF_ONLINK) ? " onlink" : ""; - char weight[16] = ""; - - if (nh->labels) - { - lsp += bsprintf(lsp, " mpls %d", nh->label[0]); - for (int i=1;i<nh->labels; i++) - lsp += bsprintf(lsp, "/%d", nh->label[i]); - } - *lsp = '\0'; + rt_show_table(d); - if (a->nh.next) - bsprintf(weight, " weight %d", nh->weight + 1); + eattr *heea; + struct hostentry_adata *had = NULL; + if (!net_is_flow(e->net) && (dest == RTD_NONE) && (heea = ea_find(a, &ea_gen_hostentry))) + had = (struct hostentry_adata *) heea->u.ptr; - if (ipa_nonzero(nh->gw)) - cli_printf(c, -1007, "\tvia %I on %s%s%s%s", - nh->gw, nh->iface->name, mpls, onlink, weight); - else - cli_printf(c, -1007, "\tdev %s%s%s", - nh->iface->name, mpls, onlink, weight); - } + cli_printf(c, -1007, "%-20s %s [%s %s%s]%s%s", ia, + net_is_flow(e->net) ? flowspec_valid_name(flowspec_valid) : had ? "recursive" : rta_dest_name(dest), + e->src->owner->name, tm, from, primary ? (sync_error ? " !" : " *") : "", info); if (d->verbose) - rta_show(c, a); + { + ea_show_list(c, a); + cli_printf(c, -1008, "\tInternal route handling values: %uL %uG %uS id %u", + e->src->private_id, e->src->global_id, e->stale_cycle, e->id); + } + else if (dest == RTD_UNICAST) + ea_show_nexthop_list(c, nhad); + else if (had) + { + char hetext[256]; + ea_show_hostentry(&had->ad, hetext, sizeof hetext); + cli_printf(c, -1007, "\t%s", hetext); + } } static void -rt_show_net(struct cli *c, net *n, struct rt_show_data *d) +rt_show_net(struct rt_show_data *d, const net_addr *n, const rte **feed, uint count) { - rte *e, *ee; + struct cli *c = d->cli; byte ia[NET_MAX_TEXT_LENGTH+1]; struct channel *ec = d->tab->export_channel; @@ -114,9 +107,9 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) int first_show = 1; int pass = 0; - for (e = n->routes; e; e = e->next) + for (uint i = 0; i < count; i++) { - if (rte_is_filtered(e) != d->filtered) + if (!d->tab->prefilter && (rte_is_filtered(feed[i]) != d->filtered)) continue; d->rt_counter++; @@ -126,15 +119,20 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) if (pass) continue; - ee = e; + struct rte e = *feed[i]; + if (d->tab->prefilter) + if (e.sender != d->tab->prefilter->in_req.hook) + continue; + else while (e.attrs->next) + e.attrs = e.attrs->next; /* Export channel is down, do not try to export routes to it */ - if (ec && (ec->export_state == ES_DOWN)) + if (ec && !ec->out_req.hook) goto skip; if (d->export_mode == RSEM_EXPORTED) { - if (!bmap_test(&ec->export_map, ee->id)) + if (!bmap_test(&ec->export_map, e.id)) goto skip; // if (ec->ra_mode != RA_ANY) @@ -143,17 +141,18 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) else if ((d->export_mode == RSEM_EXPORT) && (ec->ra_mode == RA_MERGED)) { /* Special case for merged export */ - rte *rt_free; - e = rt_export_merged(ec, n, &rt_free, c->show_pool, 1); pass = 1; + rte *em = rt_export_merged(ec, feed, count, tmp_linpool, 1); - if (!e) - { e = ee; goto skip; } + if (em) + e = *em; + else + goto skip; } else if (d->export_mode) { struct proto *ep = ec->proto; - int ic = ep->preexport ? ep->preexport(ec, e) : 0; + int ic = ep->preexport ? ep->preexport(ec, &e) : 0; if (ec->ra_mode == RA_OPTIMAL || ec->ra_mode == RA_MERGED) pass = 1; @@ -169,7 +168,7 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) * command may change the export filter and do not update routes. */ int do_export = (ic > 0) || - (f_run(ec->out_filter, &e, c->show_pool, FF_SILENT) <= F_ACCEPT); + (f_run(ec->out_filter, &e, FF_SILENT) <= F_ACCEPT); if (do_export != (d->export_mode == RSEM_EXPORT)) goto skip; @@ -179,184 +178,197 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) } } - if (d->show_protocol && (d->show_protocol != e->src->proto)) + if (d->show_protocol && (&d->show_protocol->sources != e.src->owner)) goto skip; - if (f_run(d->filter, &e, c->show_pool, 0) > F_ACCEPT) + if (f_run(d->filter, &e, 0) > F_ACCEPT) goto skip; if (d->stats < 2) { if (first_show) - net_format(n->n.addr, ia, sizeof(ia)); + net_format(n, ia, sizeof(ia)); else ia[0] = 0; - rt_show_rte(c, ia, e, d, (e->net->routes == ee)); + rt_show_rte(c, ia, &e, d, !d->tab->prefilter && !i); first_show = 0; } d->show_counter++; skip: - if (e != ee) - { - rte_free(e); - e = ee; - } - lp_flush(c->show_pool); - if (d->primary_only) break; } + + if ((d->show_counter - d->show_counter_last_flush) > 64) + { + d->show_counter_last_flush = d->show_counter; + cli_write_trigger(d->cli); + } } static void -rt_show_cleanup(struct cli *c) +rt_show_net_export_bulk(struct rt_export_request *req, const net_addr *n, + struct rt_pending_export *first UNUSED, struct rt_pending_export *last UNUSED, + const rte **feed, uint count) { - struct rt_show_data *d = c->rover; - struct rt_show_data_rtable *tab; + struct rt_show_data *d = SKIP_BACK(struct rt_show_data, req, req); + return rt_show_net(d, n, feed, count); +} - /* Unlink the iterator */ - if (d->table_open && !d->trie_walk) - fit_get(&d->tab->table->fib, &d->fit); +static void +rt_show_export_stopped_cleanup(struct rt_export_request *req) +{ + struct rt_show_data *d = SKIP_BACK(struct rt_show_data, req, req); - if (d->walk_lock) - rt_unlock_trie(d->tab->table, d->walk_lock); + /* The hook is now invalid */ + req->hook = NULL; - /* Unlock referenced tables */ - WALK_LIST(tab, d->tables) - rt_unlock_table(tab->table); + /* And free the CLI (deferred) */ + rfree(d->cli->pool); } -static void -rt_show_cont(struct cli *c) +static int +rt_show_cleanup(struct cli *c) { struct rt_show_data *d = c->rover; - struct rtable *tab = d->tab->table; -#ifdef DEBUGGING - unsigned max = 4; -#else - unsigned max = 64; -#endif - struct fib *fib = &tab->fib; - struct fib_iterator *it = &d->fit; + c->cleanup = NULL; - if (d->running_on_config && (d->running_on_config != config)) + /* Cancel the feed */ + if (d->req.hook) { - cli_printf(c, 8004, "Stopped due to reconfiguration"); - goto done; + rt_stop_export(&d->req, rt_show_export_stopped_cleanup); + return 1; } + else + return 0; +} - if (!d->table_open) - { - /* We use either trie-based walk or fib-based walk */ - d->trie_walk = tab->trie && - (d->addr_mode == RSD_ADDR_IN) && - net_val_match(tab->addr_type, NB_IP); +static void rt_show_export_stopped(struct rt_export_request *req); - if (d->trie_walk && !d->walk_state) - d->walk_state = lp_allocz(c->parser_pool, sizeof (struct f_trie_walk_state)); +static void +rt_show_log_state_change(struct rt_export_request *req, u8 state) +{ + if (state == TES_READY) + rt_stop_export(req, rt_show_export_stopped); +} - if (d->trie_walk) - { - d->walk_lock = rt_lock_trie(tab); - trie_walk_init(d->walk_state, tab->trie, d->addr); - } - else - FIB_ITERATE_INIT(&d->fit, &tab->fib); +static void +rt_show_dump_req(struct rt_export_request *req) +{ + debug(" CLI Show Route Feed %p\n", req); +} - d->table_open = 1; - d->table_counter++; - d->kernel = rt_show_get_kernel(d); +static void +rt_show_done(struct rt_show_data *d) +{ + /* No more action */ + d->cli->cleanup = NULL; + d->cli->cont = NULL; + d->cli->rover = NULL; - d->show_counter_last = d->show_counter; - d->rt_counter_last = d->rt_counter; - d->net_counter_last = d->net_counter; + /* Write pending messages */ + cli_write_trigger(d->cli); +} - if (d->tables_defined_by & RSD_TDB_SET) - rt_show_table(c, d); - } +static void +rt_show_cont(struct rt_show_data *d) +{ + struct cli *c = d->cli; - if (d->trie_walk) + if (d->running_on_config && (d->running_on_config != config)) { - /* Trie-based walk */ - net_addr addr; - while (trie_walk_next(d->walk_state, &addr)) - { - net *n = net_find(tab, &addr); - if (!n) - continue; + cli_printf(c, 8004, "Stopped due to reconfiguration"); + return rt_show_done(d); + } - rt_show_net(c, n, d); + d->req = (struct rt_export_request) { + .addr = d->addr, + .name = "CLI Show Route", + .list = &global_work_list, + .export_bulk = rt_show_net_export_bulk, + .dump_req = rt_show_dump_req, + .log_state_change = rt_show_log_state_change, + .addr_mode = d->addr_mode, + }; - if (!--max) - return; - } + d->table_counter++; - rt_unlock_trie(tab, d->walk_lock); - d->walk_lock = NULL; - } - else - { - /* fib-based walk */ - FIB_ITERATE_START(fib, it, net, n) - { - if ((d->addr_mode == RSD_ADDR_IN) && (!net_in_netX(n->n.addr, d->addr))) - goto next; + d->show_counter_last = d->show_counter; + d->rt_counter_last = d->rt_counter; + d->net_counter_last = d->net_counter; - if (!max--) - { - FIB_ITERATE_PUT(it); - return; - } - rt_show_net(c, n, d); + if (d->tables_defined_by & RSD_TDB_SET) + rt_show_table(d); - next:; - } - FIB_ITERATE_END; - } + rt_request_export_other(d->tab->table, &d->req); +} + +static void +rt_show_export_stopped(struct rt_export_request *req) +{ + struct rt_show_data *d = SKIP_BACK(struct rt_show_data, req, req); + + /* The hook is now invalid */ + req->hook = NULL; if (d->stats) { if (d->last_table != d->tab) - rt_show_table(c, d); + rt_show_table(d); - cli_printf(c, -1007, "%d of %d routes for %d networks in table %s", + cli_printf(d->cli, -1007, "%d of %d routes for %d networks in table %s", d->show_counter - d->show_counter_last, d->rt_counter - d->rt_counter_last, - d->net_counter - d->net_counter_last, tab->name); + d->net_counter - d->net_counter_last, d->tab->name); } - d->kernel = NULL; - d->table_open = 0; d->tab = NODE_NEXT(d->tab); if (NODE_VALID(d->tab)) - return; + return rt_show_cont(d); + /* Printout total stats */ if (d->stats && (d->table_counter > 1)) { - if (d->last_table) cli_printf(c, -1007, ""); - cli_printf(c, 14, "Total: %d of %d routes for %d networks in %d tables", + if (d->last_table) cli_printf(d->cli, -1007, ""); + cli_printf(d->cli, 14, "Total: %d of %d routes for %d networks in %d tables", d->show_counter, d->rt_counter, d->net_counter, d->table_counter); } + else if (!d->rt_counter && ((d->addr_mode == TE_ADDR_EQUAL) || (d->addr_mode == TE_ADDR_FOR))) + cli_printf(d->cli, 8001, "Network not found"); else - cli_printf(c, 0, ""); + cli_printf(d->cli, 0, ""); -done: - rt_show_cleanup(c); - c->cont = c->cleanup = NULL; + /* No more route showing */ + rt_show_done(d); } struct rt_show_data_rtable * -rt_show_add_table(struct rt_show_data *d, rtable *t) +rt_show_add_exporter(struct rt_show_data *d, struct rt_exporter *t, const char *name) { struct rt_show_data_rtable *tab = cfg_allocz(sizeof(struct rt_show_data_rtable)); tab->table = t; + tab->name = name; add_tail(&(d->tables), &(tab->n)); return tab; } +struct rt_show_data_rtable * +rt_show_add_table(struct rt_show_data *d, rtable *t) +{ + struct rt_show_data_rtable *rsdr; + RT_LOCKED(t, tp) + rsdr = rt_show_add_exporter(d, &tp->exporter.e, t->name); + + struct proto_config *krt = t->config->krt_attached; + if (krt) + rsdr->kernel = (struct krt_proto *) krt->proto; + + return rsdr; +} + static inline void rt_show_get_default_tables(struct rt_show_data *d) { @@ -375,7 +387,7 @@ rt_show_get_default_tables(struct rt_show_data *d) { WALK_LIST(c, d->export_protocol->channels) { - if (c->export_state == ES_DOWN) + if (!c->out_req.hook) continue; tab = rt_show_add_table(d, c->table); @@ -392,8 +404,8 @@ rt_show_get_default_tables(struct rt_show_data *d) } for (int i=1; i<NET_MAX; i++) - if (config->def_tables[i] && config->def_tables[i]->table) - rt_show_add_table(d, config->def_tables[i]->table); + if (config->def_tables[i] && config->def_tables[i]->table && config->def_tables[i]->table->table) + rt_show_add_table(d, config->def_tables[i]->table->table); } static inline void @@ -410,17 +422,18 @@ rt_show_prepare_tables(struct rt_show_data *d) /* Ensure there is defined export_channel for each table */ if (d->export_mode) { + rtable *rt = SKIP_BACK(rtable, priv.exporter.e, tab->table); if (!tab->export_channel && d->export_channel && - (tab->table == d->export_channel->table)) + (rt == d->export_channel->table)) tab->export_channel = d->export_channel; if (!tab->export_channel && d->export_protocol) - tab->export_channel = proto_find_channel_by_table(d->export_protocol, tab->table); + tab->export_channel = proto_find_channel_by_table(d->export_protocol, rt); if (!tab->export_channel) { if (d->tables_defined_by & RSD_TDB_NMN) - cf_error("No export channel for table %s", tab->table->name); + cf_error("No export channel for table %s", tab->name); rem_node(&(tab->n)); continue; @@ -431,7 +444,7 @@ rt_show_prepare_tables(struct rt_show_data *d) if (d->addr && (tab->table->addr_type != d->addr->type)) { if (d->tables_defined_by & RSD_TDB_NMN) - cf_error("Incompatible type of prefix/ip for table %s", tab->table->name); + cf_error("Incompatible type of prefix/ip for table %s", tab->name); rem_node(&(tab->n)); continue; @@ -443,48 +456,29 @@ rt_show_prepare_tables(struct rt_show_data *d) cf_error("No valid tables"); } +static void +rt_show_dummy_cont(struct cli *c UNUSED) +{ + /* Explicitly do nothing to prevent CLI from trying to parse another command. */ +} + void rt_show(struct rt_show_data *d) { - struct rt_show_data_rtable *tab; - net *n; - /* Filtered routes are neither exported nor have sensible ordering */ if (d->filtered && (d->export_mode || d->primary_only)) cf_error("Incompatible show route options"); rt_show_prepare_tables(d); - if (!d->addr || (d->addr_mode == RSD_ADDR_IN)) - { - WALK_LIST(tab, d->tables) - rt_lock_table(tab->table); - - /* There is at least one table */ - d->tab = HEAD(d->tables); - this_cli->cont = rt_show_cont; - this_cli->cleanup = rt_show_cleanup; - this_cli->rover = d; - } - else - { - WALK_LIST(tab, d->tables) - { - d->tab = tab; - d->kernel = rt_show_get_kernel(d); + if (EMPTY_LIST(d->tables)) + cf_error("No suitable tables found"); - if (d->addr_mode == RSD_ADDR_FOR) - n = net_route(tab->table, d->addr); - else - n = net_find(tab->table, d->addr); + d->tab = HEAD(d->tables); - if (n) - rt_show_net(this_cli, n, d); - } + this_cli->cleanup = rt_show_cleanup; + this_cli->rover = d; + this_cli->cont = rt_show_dummy_cont; - if (d->rt_counter) - cli_msg(0, ""); - else - cli_msg(8001, "Network not found"); - } + rt_show_cont(d); } diff --git a/nest/rt-table.c b/nest/rt-table.c index e4b27814..54aa90a6 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -47,10 +47,10 @@ * all prefixes that may influence resolving of tracked next hops. * * When a best route changes in the src table, the hostcache is notified using - * rt_notify_hostcache(), which immediately checks using the trie whether the + * an auxiliary export request, which checks using the trie whether the * change is relevant and if it is, then it schedules asynchronous hostcache * recomputation. The recomputation is done by rt_update_hostcache() (called - * from rt_event() of src table), it walks through all hostentries and resolves + * as an event of src table), it walks through all hostentries and resolves * them (by rt_update_hostentry()). It also updates the trie. If a change in * hostentry resolution was found, then it schedules asynchronous nexthop * recomputation of associated dst table. That is done by rt_next_hop_update() @@ -64,15 +64,14 @@ * routes depends of resolving their network prefixes in IP routing tables. This * is similar to the recursive next hop mechanism, but simpler as there are no * intermediate hostcache and hostentries (because flows are less likely to - * share common net prefix than routes sharing a common next hop). In src table, - * there is a list of dst tables (list flowspec_links), this list is updated by - * flowpsec channels (by rt_flowspec_link() and rt_flowspec_unlink() during - * channel start/stop). Each dst table has its own trie of prefixes that may - * influence validation of flowspec routes in it (flowspec_trie). + * share common net prefix than routes sharing a common next hop). Every dst + * table has its own export request in every src table. Each dst table has its + * own trie of prefixes that may influence validation of flowspec routes in it + * (flowspec_trie). * - * When a best route changes in the src table, rt_flowspec_notify() immediately - * checks all dst tables from the list using their tries to see whether the - * change is relevant for them. If it is, then an asynchronous re-validation of + * When a best route changes in the src table, the notification mechanism is + * invoked by the export request which checks its dst table's trie to see + * whether the change is relevant, and if so, an asynchronous re-validation of * flowspec routes in the dst table is scheduled. That is also done by function * rt_next_hop_update(), like nexthop recomputation above. It iterates over all * flowspec routes and re-validates them. It also recalculates the trie. @@ -87,15 +86,14 @@ * will be re-validated later in this round anyway. * * The third mechanism is used for RPKI re-validation of IP routes and it is the - * simplest. It is just a list of subscribers in src table, who are notified - * when any change happened, but only after a settle time. Also, in RPKI case - * the dst is not a table, but a channel, who refeeds routes through a filter. + * simplest. It is also an auxiliary export request belonging to the + * appropriate channel, triggering its reload/refeed timer after a settle time. */ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "lib/resource.h" @@ -109,32 +107,103 @@ #include "lib/string.h" #include "lib/alloca.h" #include "lib/flowspec.h" +#include "lib/idm.h" #ifdef CONFIG_BGP #include "proto/bgp/bgp.h" #endif -pool *rt_table_pool; +#include <stdatomic.h> -static slab *rte_slab; -static linpool *rte_update_pool; +pool *rt_table_pool; list routing_tables; +list deleted_routing_tables; + +struct rt_cork rt_cork; + +/* Data structures for export journal */ +#define RT_PENDING_EXPORT_ITEMS (page_size - sizeof(struct rt_export_block)) / sizeof(struct rt_pending_export) + +struct rt_export_block { + node n; + _Atomic u32 end; + _Atomic _Bool not_last; + struct rt_pending_export export[]; +}; + +static void rt_free_hostcache(struct rtable_private *tab); +static void rt_update_hostcache(void *tab); +static void rt_next_hop_update(struct rtable_private *tab); +static void rt_nhu_uncork(void *_tab); +static inline void rt_next_hop_resolve_rte(rte *r); +static inline void rt_flowspec_resolve_rte(rte *r, struct channel *c); +static inline void rt_prune_table(struct rtable_private *tab); +static void rt_kick_prune_timer(struct rtable_private *tab); +static void rt_feed_by_fib(void *); +static void rt_feed_by_trie(void *); +static void rt_feed_equal(void *); +static void rt_feed_for(void *); +static void rt_check_cork_low(struct rtable_private *tab); +static void rt_check_cork_high(struct rtable_private *tab); +static void rt_cork_release_hook(void *); +static void rt_shutdown(void *); +static void rt_delete(void *); + +static void rt_export_used(struct rt_table_exporter *, const char *, const char *); +static void rt_export_cleanup(struct rtable_private *tab); + +static int rte_same(rte *x, rte *y); + +const char *rt_import_state_name_array[TIS_MAX] = { + [TIS_DOWN] = "DOWN", + [TIS_UP] = "UP", + [TIS_STOP] = "STOP", + [TIS_FLUSHING] = "FLUSHING", + [TIS_WAITING] = "WAITING", + [TIS_CLEARED] = "CLEARED", +}; + +const char *rt_export_state_name_array[TES_MAX] = { + [TES_DOWN] = "DOWN", + [TES_HUNGRY] = "HUNGRY", + [TES_FEEDING] = "FEEDING", + [TES_READY] = "READY", + [TES_STOP] = "STOP" +}; + +const char *rt_import_state_name(u8 state) +{ + if (state >= TIS_MAX) + return "!! INVALID !!"; + else + return rt_import_state_name_array[state]; +} -static void rt_free_hostcache(rtable *tab); -static void rt_notify_hostcache(rtable *tab, net *net); -static void rt_update_hostcache(rtable *tab); -static void rt_next_hop_update(rtable *tab); -static inline void rt_prune_table(rtable *tab); -static inline void rt_schedule_notify(rtable *tab); -static void rt_flowspec_notify(rtable *tab, net *net); -static void rt_kick_prune_timer(rtable *tab); +const char *rt_export_state_name(u8 state) +{ + if (state >= TES_MAX) + return "!! INVALID !!"; + else + return rt_export_state_name_array[state]; +} +static struct hostentry *rt_get_hostentry(struct rtable_private *tab, ip_addr a, ip_addr ll, rtable *dep); + +static inline rtable *rt_priv_to_pub(struct rtable_private *tab) { return RT_PUB(tab); } +static inline rtable *rt_pub_to_pub(rtable *tab) { return tab; } +#define RT_ANY_TO_PUB(tab) _Generic((tab),rtable*:rt_pub_to_pub,struct rtable_private*:rt_priv_to_pub)((tab)) + +#define rt_trace(tab, level, fmt, args...) do {\ + rtable *t = RT_ANY_TO_PUB((tab)); \ + if (t->config->debug & (level)) \ + log(L_TRACE "%s: " fmt, t->name, ##args); \ +} while (0) static void net_init_with_trie(struct fib *f, void *N) { - rtable *tab = SKIP_BACK(rtable, fib, f); + struct rtable_private *tab = SKIP_BACK(struct rtable_private, fib, f); net *n = N; if (tab->trie) @@ -145,7 +214,7 @@ net_init_with_trie(struct fib *f, void *N) } static inline net * -net_route_ip4_trie(rtable *t, const net_addr_ip4 *n0) +net_route_ip4_trie(struct rtable_private *t, const net_addr_ip4 *n0) { TRIE_WALK_TO_ROOT_IP4(t->trie, n0, n) { @@ -159,7 +228,7 @@ net_route_ip4_trie(rtable *t, const net_addr_ip4 *n0) } static inline net * -net_route_vpn4_trie(rtable *t, const net_addr_vpn4 *n0) +net_route_vpn4_trie(struct rtable_private *t, const net_addr_vpn4 *n0) { TRIE_WALK_TO_ROOT_IP4(t->trie, (const net_addr_ip4 *) n0, px) { @@ -175,7 +244,7 @@ net_route_vpn4_trie(rtable *t, const net_addr_vpn4 *n0) } static inline net * -net_route_ip6_trie(rtable *t, const net_addr_ip6 *n0) +net_route_ip6_trie(struct rtable_private *t, const net_addr_ip6 *n0) { TRIE_WALK_TO_ROOT_IP6(t->trie, n0, n) { @@ -189,7 +258,7 @@ net_route_ip6_trie(rtable *t, const net_addr_ip6 *n0) } static inline net * -net_route_vpn6_trie(rtable *t, const net_addr_vpn6 *n0) +net_route_vpn6_trie(struct rtable_private *t, const net_addr_vpn6 *n0) { TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px) { @@ -205,7 +274,7 @@ net_route_vpn6_trie(rtable *t, const net_addr_vpn6 *n0) } static inline void * -net_route_ip6_sadr_trie(rtable *t, const net_addr_ip6_sadr *n0) +net_route_ip6_sadr_trie(struct rtable_private *t, const net_addr_ip6_sadr *n0) { TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px) { @@ -238,7 +307,7 @@ net_route_ip6_sadr_trie(rtable *t, const net_addr_ip6_sadr *n0) } static inline net * -net_route_ip4_fib(rtable *t, const net_addr_ip4 *n0) +net_route_ip4_fib(struct rtable_private *t, const net_addr_ip4 *n0) { net_addr_ip4 n; net_copy_ip4(&n, n0); @@ -254,7 +323,7 @@ net_route_ip4_fib(rtable *t, const net_addr_ip4 *n0) } static inline net * -net_route_vpn4_fib(rtable *t, const net_addr_vpn4 *n0) +net_route_vpn4_fib(struct rtable_private *t, const net_addr_vpn4 *n0) { net_addr_vpn4 n; net_copy_vpn4(&n, n0); @@ -270,7 +339,7 @@ net_route_vpn4_fib(rtable *t, const net_addr_vpn4 *n0) } static inline net * -net_route_ip6_fib(rtable *t, const net_addr_ip6 *n0) +net_route_ip6_fib(struct rtable_private *t, const net_addr_ip6 *n0) { net_addr_ip6 n; net_copy_ip6(&n, n0); @@ -286,7 +355,7 @@ net_route_ip6_fib(rtable *t, const net_addr_ip6 *n0) } static inline net * -net_route_vpn6_fib(rtable *t, const net_addr_vpn6 *n0) +net_route_vpn6_fib(struct rtable_private *t, const net_addr_vpn6 *n0) { net_addr_vpn6 n; net_copy_vpn6(&n, n0); @@ -302,7 +371,7 @@ net_route_vpn6_fib(rtable *t, const net_addr_vpn6 *n0) } static inline void * -net_route_ip6_sadr_fib(rtable *t, const net_addr_ip6_sadr *n0) +net_route_ip6_sadr_fib(struct rtable_private *t, const net_addr_ip6_sadr *n0) { net_addr_ip6_sadr n; net_copy_ip6_sadr(&n, n0); @@ -342,7 +411,7 @@ net_route_ip6_sadr_fib(rtable *t, const net_addr_ip6_sadr *n0) } net * -net_route(rtable *tab, const net_addr *n) +net_route(struct rtable_private *tab, const net_addr *n) { ASSERT(tab->addr_type == n->type); @@ -385,7 +454,7 @@ net_route(rtable *tab, const net_addr *n) static int -net_roa_check_ip4_trie(rtable *tab, const net_addr_ip4 *px, u32 asn) +net_roa_check_ip4_trie(struct rtable_private *tab, const net_addr_ip4 *px, u32 asn) { int anything = 0; @@ -399,7 +468,7 @@ net_roa_check_ip4_trie(rtable *tab, const net_addr_ip4 *px, u32 asn) net_addr_roa4 *roa = (void *) fn->addr; net *r = fib_node_to_user(&tab->fib, fn); - if (net_equal_prefix_roa4(roa, &roa0) && rte_is_valid(r->routes)) + if (net_equal_prefix_roa4(roa, &roa0) && r->routes && rte_is_valid(&r->routes->rte)) { anything = 1; if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) @@ -413,7 +482,7 @@ net_roa_check_ip4_trie(rtable *tab, const net_addr_ip4 *px, u32 asn) } static int -net_roa_check_ip4_fib(rtable *tab, const net_addr_ip4 *px, u32 asn) +net_roa_check_ip4_fib(struct rtable_private *tab, const net_addr_ip4 *px, u32 asn) { struct net_addr_roa4 n = NET_ADDR_ROA4(px->prefix, px->pxlen, 0, 0); struct fib_node *fn; @@ -426,7 +495,7 @@ net_roa_check_ip4_fib(rtable *tab, const net_addr_ip4 *px, u32 asn) net_addr_roa4 *roa = (void *) fn->addr; net *r = fib_node_to_user(&tab->fib, fn); - if (net_equal_prefix_roa4(roa, &n) && rte_is_valid(r->routes)) + if (net_equal_prefix_roa4(roa, &n) && r->routes && rte_is_valid(&r->routes->rte)) { anything = 1; if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) @@ -445,7 +514,7 @@ net_roa_check_ip4_fib(rtable *tab, const net_addr_ip4 *px, u32 asn) } static int -net_roa_check_ip6_trie(rtable *tab, const net_addr_ip6 *px, u32 asn) +net_roa_check_ip6_trie(struct rtable_private *tab, const net_addr_ip6 *px, u32 asn) { int anything = 0; @@ -459,7 +528,7 @@ net_roa_check_ip6_trie(rtable *tab, const net_addr_ip6 *px, u32 asn) net_addr_roa6 *roa = (void *) fn->addr; net *r = fib_node_to_user(&tab->fib, fn); - if (net_equal_prefix_roa6(roa, &roa0) && rte_is_valid(r->routes)) + if (net_equal_prefix_roa6(roa, &roa0) && r->routes && rte_is_valid(&r->routes->rte)) { anything = 1; if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) @@ -473,7 +542,7 @@ net_roa_check_ip6_trie(rtable *tab, const net_addr_ip6 *px, u32 asn) } static int -net_roa_check_ip6_fib(rtable *tab, const net_addr_ip6 *px, u32 asn) +net_roa_check_ip6_fib(struct rtable_private *tab, const net_addr_ip6 *px, u32 asn) { struct net_addr_roa6 n = NET_ADDR_ROA6(px->prefix, px->pxlen, 0, 0); struct fib_node *fn; @@ -486,7 +555,7 @@ net_roa_check_ip6_fib(rtable *tab, const net_addr_ip6 *px, u32 asn) net_addr_roa6 *roa = (void *) fn->addr; net *r = fib_node_to_user(&tab->fib, fn); - if (net_equal_prefix_roa6(roa, &n) && rte_is_valid(r->routes)) + if (net_equal_prefix_roa6(roa, &n) && r->routes && rte_is_valid(&r->routes->rte)) { anything = 1; if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) @@ -520,24 +589,30 @@ net_roa_check_ip6_fib(rtable *tab, const net_addr_ip6 *px, u32 asn) * must have type NET_IP4 or NET_IP6, respectively. */ int -net_roa_check(rtable *tab, const net_addr *n, u32 asn) +net_roa_check(rtable *tp, const net_addr *n, u32 asn) { - if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4)) - { - if (tab->trie) - return net_roa_check_ip4_trie(tab, (const net_addr_ip4 *) n, asn); - else - return net_roa_check_ip4_fib (tab, (const net_addr_ip4 *) n, asn); - } - else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6)) + int out = ROA_UNKNOWN; + + RT_LOCKED(tp, tab) { - if (tab->trie) - return net_roa_check_ip6_trie(tab, (const net_addr_ip6 *) n, asn); + if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4)) + { + if (tab->trie) + out = net_roa_check_ip4_trie(tab, (const net_addr_ip4 *) n, asn); + else + out = net_roa_check_ip4_fib (tab, (const net_addr_ip4 *) n, asn); + } + else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6)) + { + if (tab->trie) + out = net_roa_check_ip6_trie(tab, (const net_addr_ip6 *) n, asn); + else + out = net_roa_check_ip6_fib (tab, (const net_addr_ip6 *) n, asn); + } else - return net_roa_check_ip6_fib (tab, (const net_addr_ip6 *) n, asn); + out = ROA_UNKNOWN; /* Should not happen */ } - else - return ROA_UNKNOWN; /* Should not happen */ + return out; } /** @@ -545,287 +620,285 @@ net_roa_check(rtable *tab, const net_addr *n, u32 asn) * @net: network node * @src: route source * - * The rte_find() function returns a route for destination @net - * which is from route source @src. + * The rte_find() function returns a pointer to a route for destination @net + * which is from route source @src. List end pointer is returned if no route is found. */ -rte * +static struct rte_storage ** rte_find(net *net, struct rte_src *src) { - rte *e = net->routes; + struct rte_storage **e = &net->routes; + + while ((*e) && (*e)->rte.src != src) + e = &(*e)->next; - while (e && e->src != src) - e = e->next; return e; } -/** - * rte_get_temp - get a temporary &rte - * @a: attributes to assign to the new route (a &rta; in case it's - * un-cached, rte_update() will create a cached copy automatically) - * @src: route source - * - * Create a temporary &rte and bind it with the attributes @a. - */ -rte * -rte_get_temp(rta *a, struct rte_src *src) + +struct rte_storage * +rte_store(const rte *r, net *net, struct rtable_private *tab) { - rte *e = sl_alloc(rte_slab); + struct rte_storage *e = sl_alloc(tab->rte_slab); - e->attrs = a; - e->id = 0; - e->flags = 0; - e->pflags = 0; - rt_lock_source(e->src = src); - return e; -} + e->rte = *r; + e->rte.net = net->n.addr; -rte * -rte_do_cow(rte *r) -{ - rte *e = sl_alloc(rte_slab); + rt_lock_source(e->rte.src); - memcpy(e, r, sizeof(rte)); + if (ea_is_cached(e->rte.attrs)) + e->rte.attrs = rta_clone(e->rte.attrs); + else + e->rte.attrs = rta_lookup(e->rte.attrs, 1); - rt_lock_source(e->src); - e->attrs = rta_clone(r->attrs); - e->flags = 0; return e; } /** - * rte_cow_rta - get a private writable copy of &rte with writable &rta - * @r: a route entry to be copied - * @lp: a linpool from which to allocate &rta - * - * rte_cow_rta() takes a &rte and prepares it and associated &rta for - * modification. There are three possibilities: First, both &rte and &rta are - * private copies, in that case they are returned unchanged. Second, &rte is - * private copy, but &rta is cached, in that case &rta is duplicated using - * rta_do_cow(). Third, both &rte is shared and &rta is cached, in that case - * both structures are duplicated by rte_do_cow() and rta_do_cow(). - * - * Note that in the second case, cached &rta loses one reference, while private - * copy created by rta_do_cow() is a shallow copy sharing indirect data (eattrs, - * nexthops, ...) with it. To work properly, original shared &rta should have - * another reference during the life of created private copy. + * rte_free - delete a &rte + * @e: &struct rte_storage to be deleted + * @tab: the table which the rte belongs to * - * Result: a pointer to the new writable &rte with writable &rta. + * rte_free() deletes the given &rte from the routing table it's linked to. */ -rte * -rte_cow_rta(rte *r, linpool *lp) -{ - if (!rta_is_cached(r->attrs)) - return r; - r = rte_cow(r); - rta *a = rta_do_cow(r->attrs, lp); - rta_free(r->attrs); - r->attrs = a; - return r; +void +rte_free(struct rte_storage *e) +{ + rt_unlock_source(e->rte.src); + rta_free(e->rte.attrs); + sl_free(e); } static int /* Actually better or at least as good as */ -rte_better(rte *new, rte *old) +rte_better(const rte *new, const rte *old) { - int (*better)(rte *, rte *); + int (*better)(const rte *, const rte *); if (!rte_is_valid(old)) return 1; if (!rte_is_valid(new)) return 0; - if (new->attrs->pref > old->attrs->pref) + u32 np = rt_get_preference(new); + u32 op = rt_get_preference(old); + + if (np > op) return 1; - if (new->attrs->pref < old->attrs->pref) + if (np < op) return 0; - if (new->src->proto->proto != old->src->proto->proto) + if (new->src->owner->class != old->src->owner->class) { /* * If the user has configured protocol preferences, so that two different protocols * have the same preference, try to break the tie by comparing addresses. Not too * useful, but keeps the ordering of routes unambiguous. */ - return new->src->proto->proto > old->src->proto->proto; + return new->src->owner->class > old->src->owner->class; } - if (better = new->src->proto->rte_better) + if (better = new->src->owner->class->rte_better) return better(new, old); return 0; } static int -rte_mergable(rte *pri, rte *sec) +rte_mergable(const rte *pri, const rte *sec) { - int (*mergable)(rte *, rte *); + int (*mergable)(const rte *, const rte *); if (!rte_is_valid(pri) || !rte_is_valid(sec)) return 0; - if (pri->attrs->pref != sec->attrs->pref) + if (rt_get_preference(pri) != rt_get_preference(sec)) return 0; - if (pri->src->proto->proto != sec->src->proto->proto) + if (pri->src->owner->class != sec->src->owner->class) return 0; - if (mergable = pri->src->proto->rte_mergable) + if (mergable = pri->src->owner->class->rte_mergable) return mergable(pri, sec); return 0; } static void -rte_trace(struct channel *c, rte *e, int dir, char *msg) +rte_trace(const char *name, const rte *e, int dir, const char *msg) { - log(L_TRACE "%s.%s %c %s %N %uL %uG %s", - c->proto->name, c->name ?: "?", dir, msg, e->net->n.addr, e->src->private_id, e->src->global_id, - rta_dest_name(e->attrs->dest)); + log(L_TRACE "%s %c %s %N src %uL %uG %uS id %u %s", + name, dir, msg, e->net, + e->src->private_id, e->src->global_id, e->stale_cycle, e->id, + rta_dest_name(rte_dest(e))); } static inline void -rte_trace_in(uint flag, struct channel *c, rte *e, char *msg) +channel_rte_trace_in(uint flag, struct channel *c, const rte *e, const char *msg) { if ((c->debug & flag) || (c->proto->debug & flag)) - rte_trace(c, e, '>', msg); + rte_trace(c->in_req.name, e, '>', msg); } static inline void -rte_trace_out(uint flag, struct channel *c, rte *e, char *msg) +channel_rte_trace_out(uint flag, struct channel *c, const rte *e, const char *msg) { if ((c->debug & flag) || (c->proto->debug & flag)) - rte_trace(c, e, '<', msg); + rte_trace(c->out_req.name, e, '<', msg); +} + +static inline void +rt_rte_trace_in(uint flag, struct rt_import_request *req, const rte *e, const char *msg) +{ + if (req->trace_routes & flag) + rte_trace(req->name, e, '>', msg); +} + +#if 0 +// seems to be unused at all +static inline void +rt_rte_trace_out(uint flag, struct rt_export_request *req, const rte *e, const char *msg) +{ + if (req->trace_routes & flag) + rte_trace(req->name, e, '<', msg); +} +#endif + +static uint +rte_feed_count(net *n) +{ + uint count = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + count++; + + return count; +} + +static void +rte_feed_obtain(net *n, const rte **feed, uint count) +{ + uint i = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + { + ASSERT_DIE(i < count); + feed[i++] = &e->rte; + } + + ASSERT_DIE(i == count); } static rte * -export_filter_(struct channel *c, rte *rt0, rte **rt_free, linpool *pool, int silent) +export_filter(struct channel *c, rte *rt, int silent) { struct proto *p = c->proto; const struct filter *filter = c->out_filter; - struct proto_stats *stats = &c->stats; - rte *rt; - int v; + struct channel_export_stats *stats = &c->export_stats; - rt = rt0; - *rt_free = NULL; + /* Do nothing if we have already rejected the route */ + if (silent && bmap_test(&c->export_reject_map, rt->id)) + goto reject_noset; - v = p->preexport ? p->preexport(c, rt) : 0; + int v = p->preexport ? p->preexport(c, rt) : 0; if (v < 0) { if (silent) - goto reject; + goto reject_noset; - stats->exp_updates_rejected++; + stats->updates_rejected++; if (v == RIC_REJECT) - rte_trace_out(D_FILTERS, c, rt, "rejected by protocol"); + channel_rte_trace_out(D_FILTERS, c, rt, "rejected by protocol"); goto reject; + } if (v > 0) { if (!silent) - rte_trace_out(D_FILTERS, c, rt, "forced accept by protocol"); + channel_rte_trace_out(D_FILTERS, c, rt, "forced accept by protocol"); goto accept; } v = filter && ((filter == FILTER_REJECT) || - (f_run(filter, &rt, pool, + (f_run(filter, rt, (silent ? FF_SILENT : 0)) > F_ACCEPT)); if (v) { if (silent) goto reject; - stats->exp_updates_filtered++; - rte_trace_out(D_FILTERS, c, rt, "filtered out"); + stats->updates_filtered++; + channel_rte_trace_out(D_FILTERS, c, rt, "filtered out"); goto reject; } accept: - if (rt != rt0) - *rt_free = rt; + /* We have accepted the route */ + bmap_clear(&c->export_reject_map, rt->id); return rt; reject: + /* We have rejected the route by filter */ + bmap_set(&c->export_reject_map, rt->id); + +reject_noset: /* Discard temporary rte */ - if (rt != rt0) - rte_free(rt); return NULL; } -static inline rte * -export_filter(struct channel *c, rte *rt0, rte **rt_free, int silent) -{ - return export_filter_(c, rt0, rt_free, rte_update_pool, silent); -} - static void -do_rt_notify(struct channel *c, net *net, rte *new, rte *old, int refeed) +do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) { struct proto *p = c->proto; - struct proto_stats *stats = &c->stats; + struct channel_export_stats *stats = &c->export_stats; - if (refeed && new) + if (c->refeeding && new) c->refeed_count++; - /* Apply export limit */ - struct channel_limit *l = &c->out_limit; - if (l->action && !old && new) - { - if (stats->exp_routes >= l->limit) - channel_notify_limit(c, l, PLD_OUT, stats->exp_routes); - - if (l->state == PLS_BLOCKED) + if (!old && new) + if (CHANNEL_LIMIT_PUSH(c, OUT)) { - stats->exp_updates_rejected++; - rte_trace_out(D_FILTERS, c, new, "rejected [limit]"); + stats->updates_rejected++; + channel_rte_trace_out(D_FILTERS, c, new, "rejected [limit]"); return; } - } - /* Apply export table */ - if (c->out_table && !rte_update_out(c, net->n.addr, new, old, refeed)) - return; + if (!new && old) + CHANNEL_LIMIT_POP(c, OUT); if (new) - stats->exp_updates_accepted++; + stats->updates_accepted++; else - stats->exp_withdraws_accepted++; + stats->withdraws_accepted++; if (old) - { bmap_clear(&c->export_map, old->id); - stats->exp_routes--; - } if (new) - { bmap_set(&c->export_map, new->id); - stats->exp_routes++; - } if (p->debug & D_ROUTES) { if (new && old) - rte_trace_out(D_ROUTES, c, new, "replaced"); + channel_rte_trace_out(D_ROUTES, c, new, "replaced"); else if (new) - rte_trace_out(D_ROUTES, c, new, "added"); + channel_rte_trace_out(D_ROUTES, c, new, "added"); else if (old) - rte_trace_out(D_ROUTES, c, old, "removed"); + channel_rte_trace_out(D_ROUTES, c, old, "removed"); } p->rt_notify(p, c, net, new, old); } static void -rt_notify_basic(struct channel *c, net *net, rte *new, rte *old, int refeed) +rt_notify_basic(struct channel *c, const net_addr *net, rte *new, rte *old) { - // struct proto *p = c->proto; - rte *new_free = NULL; - - if (new) - c->stats.exp_updates_received++; - else - c->stats.exp_withdraws_received++; + if (new && old && rte_same(new, old)) + { + if ((new->id != old->id) && bmap_test(&c->export_map, old->id)) + { + bmap_set(&c->export_map, new->id); + bmap_clear(&c->export_map, old->id); + } + return; + } if (new) - new = export_filter(c, new, &new_free, 0); + new = export_filter(c, new, 0); if (old && !bmap_test(&c->export_map, old->id)) old = NULL; @@ -833,197 +906,368 @@ rt_notify_basic(struct channel *c, net *net, rte *new, rte *old, int refeed) if (!new && !old) return; - do_rt_notify(c, net, new, old, refeed); - - /* Discard temporary rte */ - if (new_free) - rte_free(new_free); + do_rt_notify(c, net, new, old); } static void -rt_notify_accepted(struct channel *c, net *net, rte *new_changed, rte *old_changed, int refeed) +channel_rpe_mark_seen(struct rt_export_request *req, struct rt_pending_export *rpe) { - // struct proto *p = c->proto; - rte *new_best = NULL; - rte *old_best = NULL; - rte *new_free = NULL; - int new_first = 0; - - /* - * We assume that there are no changes in net route order except (added) - * new_changed and (removed) old_changed. Therefore, the function is not - * compatible with deterministic_med (where nontrivial reordering can happen - * as a result of a route change) and with recomputation of recursive routes - * due to next hop update (where many routes can be changed in one step). - * - * Note that we need this assumption just for optimizations, we could just - * run full new_best recomputation otherwise. - * - * There are three cases: - * feed or old_best is old_changed -> we need to recompute new_best - * old_best is before new_changed -> new_best is old_best, ignore - * old_best is after new_changed -> try new_changed, otherwise old_best - */ + struct channel *c = SKIP_BACK(struct channel, out_req, req); - if (net->routes) - c->stats.exp_updates_received++; - else - c->stats.exp_withdraws_received++; + rpe_mark_seen(req->hook, rpe); + if (rpe->old) + bmap_clear(&c->export_reject_map, rpe->old->rte.id); +} - /* Find old_best - either old_changed, or route for net->routes */ - if (old_changed && bmap_test(&c->export_map, old_changed->id)) - old_best = old_changed; - else +void +rt_notify_accepted(struct rt_export_request *req, const net_addr *n, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) +{ + struct channel *c = SKIP_BACK(struct channel, out_req, req); + + rte nb0, *new_best = NULL; + const rte *old_best = NULL; + + for (uint i = 0; i < count; i++) { - for (rte *r = net->routes; rte_is_valid(r); r = r->next) + if (!rte_is_valid(feed[i])) + continue; + + /* Has been already rejected, won't bother with it */ + if (!c->refeeding && bmap_test(&c->export_reject_map, feed[i]->id)) + continue; + + /* Previously exported */ + if (!old_best && bmap_test(&c->export_map, feed[i]->id)) { - if (bmap_test(&c->export_map, r->id)) + /* is still best */ + if (!new_best) { - old_best = r; - break; + DBG("rt_notify_accepted: idempotent\n"); + goto done; } - /* Note if new_changed found before old_best */ - if (r == new_changed) - new_first = 1; + /* is superseded */ + old_best = feed[i]; + break; } - } - /* Find new_best */ - if ((new_changed == old_changed) || (old_best == old_changed)) - { - /* Feed or old_best changed -> find first accepted by filters */ - for (rte *r = net->routes; rte_is_valid(r); r = r->next) - if (new_best = export_filter(c, r, &new_free, 0)) - break; + /* Have no new best route yet */ + if (!new_best) + { + /* Try this route not seen before */ + nb0 = *feed[i]; + new_best = export_filter(c, &nb0, 0); + DBG("rt_notify_accepted: checking route id %u: %s\n", feed[i]->id, new_best ? "ok" : "no"); + } } - else + +done: + /* Check obsolete routes for previously exported */ + RPE_WALK(first, rpe, NULL) { - /* Other cases -> either new_changed, or old_best (and nothing changed) */ - if (new_first && (new_changed = export_filter(c, new_changed, &new_free, 0))) - new_best = new_changed; - else - return; + channel_rpe_mark_seen(req, rpe); + if (rpe->old) + { + if (bmap_test(&c->export_map, rpe->old->rte.id)) + { + ASSERT_DIE(old_best == NULL); + old_best = &rpe->old->rte; + } + } + if (rpe == last) + break; } - if (!new_best && !old_best) - return; - - do_rt_notify(c, net, new_best, old_best, refeed); - - /* Discard temporary rte */ - if (new_free) - rte_free(new_free); -} - - -static struct nexthop * -nexthop_merge_rta(struct nexthop *nhs, rta *a, linpool *pool, int max) -{ - return nexthop_merge(nhs, &(a->nh), 1, 0, max, pool); + /* Nothing to export */ + if (new_best || old_best) + do_rt_notify(c, n, new_best, old_best); + else + DBG("rt_notify_accepted: nothing to export\n"); } rte * -rt_export_merged(struct channel *c, net *net, rte **rt_free, linpool *pool, int silent) +rt_export_merged(struct channel *c, const rte **feed, uint count, linpool *pool, int silent) { - // struct proto *p = c->proto; - struct nexthop *nhs = NULL; - rte *best0, *best, *rt0, *rt, *tmp; + _Thread_local static rte rloc; - best0 = net->routes; - *rt_free = NULL; + // struct proto *p = c->proto; + struct nexthop_adata *nhs = NULL; + const rte *best0 = feed[0]; + rte *best = NULL; if (!rte_is_valid(best0)) return NULL; - best = export_filter_(c, best0, rt_free, pool, silent); + /* Already rejected, no need to re-run the filter */ + if (!c->refeeding && bmap_test(&c->export_reject_map, best0->id)) + return NULL; + + rloc = *best0; + best = export_filter(c, &rloc, silent); + + if (!best) + /* Best route doesn't pass the filter */ + return NULL; - if (!best || !rte_is_reachable(best)) + if (!rte_is_reachable(best)) + /* Unreachable routes can't be merged */ return best; - for (rt0 = best0->next; rt0; rt0 = rt0->next) + for (uint i = 1; i < count; i++) { - if (!rte_mergable(best0, rt0)) + if (!rte_mergable(best0, feed[i])) continue; - rt = export_filter_(c, rt0, &tmp, pool, 1); + rte tmp0 = *feed[i]; + rte *tmp = export_filter(c, &tmp0, 1); - if (!rt) + if (!tmp || !rte_is_reachable(tmp)) continue; - if (rte_is_reachable(rt)) - nhs = nexthop_merge_rta(nhs, rt->attrs, pool, c->merge_limit); + eattr *nhea = ea_find(tmp->attrs, &ea_gen_nexthop); + ASSERT_DIE(nhea); - if (tmp) - rte_free(tmp); + if (nhs) + nhs = nexthop_merge(nhs, (struct nexthop_adata *) nhea->u.ptr, c->merge_limit, pool); + else + nhs = (struct nexthop_adata *) nhea->u.ptr; } if (nhs) { - nhs = nexthop_merge_rta(nhs, best->attrs, pool, c->merge_limit); + eattr *nhea = ea_find(best->attrs, &ea_gen_nexthop); + ASSERT_DIE(nhea); - if (nhs->next) - { - best = rte_cow_rta(best, pool); - nexthop_link(best->attrs, nhs); - } - } + nhs = nexthop_merge(nhs, (struct nexthop_adata *) nhea->u.ptr, c->merge_limit, pool); - if (best != best0) - *rt_free = best; + ea_set_attr(&best->attrs, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhs->ad)); + } return best; } - -static void -rt_notify_merged(struct channel *c, net *net, rte *new_changed, rte *old_changed, - rte *new_best, rte *old_best, int refeed) +void +rt_notify_merged(struct rt_export_request *req, const net_addr *n, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) { - // struct proto *p = c->proto; - rte *new_free = NULL; - - /* We assume that all rte arguments are either NULL or rte_is_valid() */ + struct channel *c = SKIP_BACK(struct channel, out_req, req); - /* This check should be done by the caller */ - if (!new_best && !old_best) - return; + // struct proto *p = c->proto; +#if 0 /* TODO: Find whether this check is possible when processing multiple changes at once. */ /* Check whether the change is relevant to the merged route */ if ((new_best == old_best) && (new_changed != old_changed) && !rte_mergable(new_best, new_changed) && !rte_mergable(old_best, old_changed)) return; +#endif - if (new_best) - c->stats.exp_updates_received++; - else - c->stats.exp_withdraws_received++; + const rte *old_best = NULL; + /* Find old best route */ + for (uint i = 0; i < count; i++) + if (bmap_test(&c->export_map, feed[i]->id)) + { + old_best = feed[i]; + break; + } + + /* Check obsolete routes for previously exported */ + RPE_WALK(first, rpe, NULL) + { + channel_rpe_mark_seen(req, rpe); + if (rpe->old) + { + if (bmap_test(&c->export_map, rpe->old->rte.id)) + { + ASSERT_DIE(old_best == NULL); + old_best = &rpe->old->rte; + } + } + if (rpe == last) + break; + } /* Prepare new merged route */ - if (new_best) - new_best = rt_export_merged(c, net, &new_free, rte_update_pool, 0); + rte *new_merged = count ? rt_export_merged(c, feed, count, tmp_linpool, 0) : NULL; + + if (new_merged || old_best) + do_rt_notify(c, n, new_merged, old_best); +} + +void +rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) +{ + struct channel *c = SKIP_BACK(struct channel, out_req, req); + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; + + RPE_WALK(first, rpe, NULL) + { + channel_rpe_mark_seen(req, rpe); + new_best = rpe->new_best; + } + + rte n0 = RTE_COPY_VALID(new_best); + if (n0.src || o) + rt_notify_basic(c, net, n0.src ? &n0 : NULL, o); +} - /* Check old merged route */ - if (old_best && !bmap_test(&c->export_map, old_best->id)) - old_best = NULL; +void +rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) +{ + struct channel *c = SKIP_BACK(struct channel, out_req, req); - if (!new_best && !old_best) + rte *n = RTE_VALID_OR_NULL(first->new); + rte *o = RTE_VALID_OR_NULL(first->old); + + if (!n && !o) + { + channel_rpe_mark_seen(req, first); return; + } - do_rt_notify(c, net, new_best, old_best, refeed); + struct rte_src *src = n ? n->src : o->src; + struct rte_storage *new_latest = first->new; - /* Discard temporary rte */ - if (new_free) - rte_free(new_free); + RPE_WALK(first, rpe, src) + { + channel_rpe_mark_seen(req, rpe); + new_latest = rpe->new; + } + + rte n0 = RTE_COPY_VALID(new_latest); + if (n0.src || o) + rt_notify_basic(c, net, n0.src ? &n0 : NULL, o); } +void +rt_feed_any(struct rt_export_request *req, const net_addr *net, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) +{ + struct channel *c = SKIP_BACK(struct channel, out_req, req); + + for (uint i=0; i<count; i++) + if (rte_is_valid(feed[i])) + { + rte n0 = *feed[i]; + rt_notify_basic(c, net, &n0, NULL); + } + + RPE_WALK(first, rpe, NULL) + { + channel_rpe_mark_seen(req, rpe); + if (rpe == last) + break; + } +} + +void +rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe) +{ + bmap_set(&hook->seq_map, rpe->seq); +} + +struct rt_pending_export * +rpe_next(struct rt_pending_export *rpe, struct rte_src *src) +{ + struct rt_pending_export *next = atomic_load_explicit(&rpe->next, memory_order_acquire); + + if (!next) + return NULL; + + if (!src) + return next; + + while (rpe = next) + if (src == (rpe->new ? rpe->new->rte.src : rpe->old->rte.src)) + return rpe; + else + next = atomic_load_explicit(&rpe->next, memory_order_acquire); + + return NULL; +} + +static struct rt_pending_export * rt_next_export_fast(struct rt_pending_export *last); +static int +rte_export(struct rt_table_export_hook *th, struct rt_pending_export *rpe) +{ + rtable *tab = RT_PUB(SKIP_BACK(struct rtable_private, exporter, th->table)); + struct rt_export_hook *hook = &th->h; + if (bmap_test(&hook->seq_map, rpe->seq)) + goto ignore; /* Seen already */ + + const net_addr *n = rpe->new_best ? rpe->new_best->rte.net : rpe->old_best->rte.net; + + switch (hook->req->addr_mode) + { + case TE_ADDR_NONE: + break; + + case TE_ADDR_IN: + if (!net_in_netX(n, hook->req->addr)) + goto ignore; + break; + + case TE_ADDR_EQUAL: + if (!net_equal(n, hook->req->addr)) + goto ignore; + break; + + case TE_ADDR_FOR: + bug("Continuos export of best prefix match not implemented yet."); + + default: + bug("Strange table export address mode: %d", hook->req->addr_mode); + } + + if (rpe->new) + hook->stats.updates_received++; + else + hook->stats.withdraws_received++; + + if (hook->req->export_one) + hook->req->export_one(hook->req, n, rpe); + else if (hook->req->export_bulk) + { + net *net = SKIP_BACK(struct network, n.addr, (net_addr (*)[0]) n); + RT_LOCK(tab); + struct rt_pending_export *last = net->last; + uint count = rte_feed_count(net); + const rte **feed = NULL; + if (count) + { + feed = alloca(count * sizeof(rte *)); + rte_feed_obtain(net, feed, count); + } + RT_UNLOCK(tab); + hook->req->export_bulk(hook->req, n, rpe, last, feed, count); + } + else + bug("Export request must always provide an export method"); + +ignore: + /* Get the next export if exists */ + th->rpe_next = rt_next_export_fast(rpe); + + /* The last block may be available to free */ + int used = (PAGE_HEAD(th->rpe_next) != PAGE_HEAD(rpe)); + + /* Releasing this export for cleanup routine */ + DBG("store hook=%p last_export=%p seq=%lu\n", hook, rpe, rpe->seq); + atomic_store_explicit(&th->last_export, rpe, memory_order_release); + + return used; +} /** * rte_announce - announce a routing table change * @tab: table the route has been added to - * @type: type of route announcement (RA_UNDEF or RA_ANY) * @net: network in question * @new: the new or changed route * @old: the previous route replaced by the new one @@ -1039,13 +1283,6 @@ rt_notify_merged(struct channel *c, net *net, rte *new_changed, rte *old_changed * and @new_best and @old_best describes best routes. Other routes are not * affected, but in sorted table the order of other routes might change. * - * Second, There is a bulk change of multiple routes in @net, with shared best - * route selection. In such case separate route changes are described using - * @type of %RA_ANY, with @new and @old specifying the changed route, while - * @new_best and @old_best are NULL. After that, another notification is done - * where @new_best and @old_best are filled (may be the same), but @new and @old - * are NULL. - * * The function announces the change to all associated channels. For each * channel, an appropriate preprocessing is done according to channel &ra_mode. * For example, %RA_OPTIMAL channels receive just changes of best routes. @@ -1060,152 +1297,320 @@ rt_notify_merged(struct channel *c, net *net, rte *new_changed, rte *old_changed * done outside of scope of rte_announce(). */ static void -rte_announce(rtable *tab, uint type, net *net, rte *new, rte *old, - rte *new_best, rte *old_best) +rte_announce(struct rtable_private *tab, net *net, struct rte_storage *new, struct rte_storage *old, + struct rte_storage *new_best, struct rte_storage *old_best) { - if (!rte_is_valid(new)) - new = NULL; + int new_best_valid = rte_is_valid(RTE_OR_NULL(new_best)); + int old_best_valid = rte_is_valid(RTE_OR_NULL(old_best)); - if (!rte_is_valid(old)) - old = NULL; + if ((new == old) && (new_best == old_best)) + return; - if (!rte_is_valid(new_best)) - new_best = NULL; + if (new_best_valid) + new_best->rte.sender->stats.pref++; + if (old_best_valid) + old_best->rte.sender->stats.pref--; - if (!rte_is_valid(old_best)) - old_best = NULL; + if (EMPTY_LIST(tab->exporter.e.hooks) && EMPTY_LIST(tab->exporter.pending)) + { + /* No export hook and no pending exports to cleanup. We may free the route immediately. */ + if (!old) + return; - if (!new && !old && !new_best && !old_best) + hmap_clear(&tab->id_map, old->rte.id); + rte_free(old); return; + } - if (new_best != old_best) + /* Get the pending export structure */ + struct rt_export_block *rpeb = NULL, *rpebsnl = NULL; + u32 end = 0; + + if (!EMPTY_LIST(tab->exporter.pending)) { - if (new_best) - new_best->sender->stats.pref_routes++; - if (old_best) - old_best->sender->stats.pref_routes--; + rpeb = TAIL(tab->exporter.pending); + end = atomic_load_explicit(&rpeb->end, memory_order_relaxed); + if (end >= RT_PENDING_EXPORT_ITEMS) + { + ASSERT_DIE(end == RT_PENDING_EXPORT_ITEMS); + rpebsnl = rpeb; - if (tab->hostcache) - rt_notify_hostcache(tab, net); + rpeb = NULL; + end = 0; + } + } - if (!EMPTY_LIST(tab->flowspec_links)) - rt_flowspec_notify(tab, net); + if (!rpeb) + { + rpeb = alloc_page(); + *rpeb = (struct rt_export_block) {}; + add_tail(&tab->exporter.pending, &rpeb->n); } - rt_schedule_notify(tab); + /* Fill the pending export */ + struct rt_pending_export *rpe = &rpeb->export[rpeb->end]; + *rpe = (struct rt_pending_export) { + .new = new, + .new_best = new_best, + .old = old, + .old_best = old_best, + .seq = tab->exporter.next_seq++, + }; - struct channel *c; node *n; - WALK_LIST2(c, n, tab->channels, table_node) + DBGL("rte_announce: table=%s net=%N new=%p id %u from %s old=%p id %u from %s new_best=%p id %u old_best=%p id %u seq=%lu", + tab->name, net->n.addr, + new, new ? new->rte.id : 0, new ? new->rte.sender->req->name : NULL, + old, old ? old->rte.id : 0, old ? old->rte.sender->req->name : NULL, + new_best, old_best, rpe->seq); + + ASSERT_DIE(atomic_fetch_add_explicit(&rpeb->end, 1, memory_order_release) == end); + + if (rpebsnl) { - if (c->export_state == ES_DOWN) - continue; + _Bool f = 0; + ASSERT_DIE(atomic_compare_exchange_strong_explicit(&rpebsnl->not_last, &f, 1, + memory_order_release, memory_order_relaxed)); + } - if (type && (type != c->ra_mode)) - continue; + /* Append to the same-network squasher list */ + if (net->last) + { + struct rt_pending_export *rpenull = NULL; + ASSERT_DIE(atomic_compare_exchange_strong_explicit( + &net->last->next, &rpenull, rpe, + memory_order_relaxed, + memory_order_relaxed)); + + } + + net->last = rpe; + + if (!net->first) + net->first = rpe; + + if (tab->exporter.first == NULL) + tab->exporter.first = rpe; + + rt_check_cork_high(tab); +} + +static struct rt_pending_export * +rt_next_export_fast(struct rt_pending_export *last) +{ + /* Get the whole export block and find our position in there. */ + struct rt_export_block *rpeb = PAGE_HEAD(last); + u32 pos = (last - &rpeb->export[0]); + u32 end = atomic_load_explicit(&rpeb->end, memory_order_acquire); + ASSERT_DIE(pos < end); + + /* Next is in the same block. */ + if (++pos < end) + return &rpeb->export[pos]; + + /* There is another block. */ + if (atomic_load_explicit(&rpeb->not_last, memory_order_acquire)) + { + /* This is OK to do non-atomically because of the not_last flag. */ + rpeb = NODE_NEXT(rpeb); + return &rpeb->export[0]; + } + + /* There is nothing more. */ + return NULL; +} - switch (c->ra_mode) +static struct rt_pending_export * +rt_next_export(struct rt_table_export_hook *hook, struct rt_table_exporter *tab) +{ + ASSERT_DIE(RT_IS_LOCKED(SKIP_BACK(struct rtable_private, exporter, tab))); + + /* As the table is locked, it is safe to reload the last export pointer */ + struct rt_pending_export *last = atomic_load_explicit(&hook->last_export, memory_order_acquire); + + /* It is still valid, let's reuse it */ + if (last) + return rt_next_export_fast(last); + + /* No, therefore we must process the table's first pending export */ + else + return tab->first; +} + +static inline void +rt_send_export_event(struct rt_export_hook *hook) +{ + ev_send(hook->req->list, &hook->event); +} + +static void +rt_announce_exports(struct settle *s) +{ + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, export_settle, s)), tab) + if (!EMPTY_LIST(tab->exporter.pending)) { - case RA_OPTIMAL: - if (new_best != old_best) - rt_notify_basic(c, net, new_best, old_best, 0); - break; + struct rt_export_hook *c; node *n; + WALK_LIST2(c, n, tab->exporter.e.hooks, n) + { + if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_READY) + continue; - case RA_ANY: - if (new != old) - rt_notify_basic(c, net, new, old, 0); - break; + rt_send_export_event(c); + } + } +} - case RA_ACCEPTED: - /* - * The (new != old) condition is problematic here, as it would break - * the second usage pattern (announcement after bulk change, used in - * rt_next_hop_update_net(), which sends both new and old as NULL). - * - * But recursive next hops do not work with sorted tables anyways, - * such configuration is forbidden in BGP and not supported in - * rt_notify_accepted(). - * - * The condition is needed to eliminate spurious announcements where - * both old and new routes are not valid (so they are NULL). - */ - if (new != old) - rt_notify_accepted(c, net, new, old, 0); - break; +static void +rt_kick_export_settle(struct rtable_private *tab) +{ + tab->export_settle.cf = tab->rr_counter ? tab->config->export_rr_settle : tab->config->export_settle; + settle_kick(&tab->export_settle, tab->loop); +} + +static void +rt_import_announce_exports(void *_hook) +{ + struct rt_import_hook *hook = _hook; + if (hook->import_state == TIS_CLEARED) + { + void (*stopped)(struct rt_import_request *) = hook->stopped; + struct rt_import_request *req = hook->req; + + RT_LOCKED(hook->table, tab) + { + req->hook = NULL; + + rt_trace(tab, D_EVENTS, "Hook %s stopped", req->name); + rem_node(&hook->n); + mb_free(hook); + rt_unlock_table(tab); + } + + stopped(req); + return; + } - case RA_MERGED: - rt_notify_merged(c, net, new, old, new_best, old_best, 0); + rt_trace(hook->table, D_EVENTS, "Announcing exports after imports from %s", hook->req->name); + birdloop_flag(hook->table->loop, RTF_EXPORT); +} + +static struct rt_pending_export * +rt_last_export(struct rt_table_exporter *tab) +{ + struct rt_pending_export *rpe = NULL; + + if (!EMPTY_LIST(tab->pending)) + { + /* We'll continue processing exports from this export on */ + struct rt_export_block *reb = TAIL(tab->pending); + ASSERT_DIE(reb->end); + rpe = &reb->export[reb->end - 1]; + } + + return rpe; +} + +#define RT_EXPORT_BULK 1024 + +static void +rt_export_hook(void *_data) +{ + struct rt_table_export_hook *c = _data; + rtable *tab = SKIP_BACK(rtable, priv.exporter, c->table); + + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_READY); + + if (!c->rpe_next) + { + RT_LOCK(tab); + c->rpe_next = rt_next_export(c, c->table); + + if (!c->rpe_next) + { + rt_export_used(c->table, c->h.req->name, "done exporting"); + RT_UNLOCK(tab); + return; + } + + RT_UNLOCK(tab); + } + + int used = 0; + int no_next = 0; + + /* Process the export */ + for (uint i=0; i<RT_EXPORT_BULK; i++) + { + used += rte_export(c, c->rpe_next); + + if (!c->rpe_next) + { + no_next = 1; break; } } + + if (used) + RT_LOCKED(tab, t) + if (no_next || t->cork_active) + rt_export_used(c->table, c->h.req->name, no_next ? "finished export bulk" : "cork active"); + + rt_send_export_event(&c->h); } + static inline int -rte_validate(rte *e) +rte_validate(struct channel *ch, rte *e) { int c; - net *n = e->net; + const net_addr *n = e->net; - if (!net_validate(n->n.addr)) + if (!net_validate(n)) { log(L_WARN "Ignoring bogus prefix %N received via %s", - n->n.addr, e->sender->proto->name); + n, ch->proto->name); return 0; } /* FIXME: better handling different nettypes */ - c = !net_is_flow(n->n.addr) ? - net_classify(n->n.addr): (IADDR_HOST | SCOPE_UNIVERSE); + c = !net_is_flow(n) ? + net_classify(n): (IADDR_HOST | SCOPE_UNIVERSE); if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK)) { log(L_WARN "Ignoring bogus route %N received via %s", - n->n.addr, e->sender->proto->name); + n, ch->proto->name); return 0; } - if (net_type_match(n->n.addr, NB_DEST) == !e->attrs->dest) + if (net_type_match(n, NB_DEST)) { - /* Exception for flowspec that failed validation */ - if (net_is_flow(n->n.addr) && (e->attrs->dest == RTD_UNREACHABLE)) - return 1; + eattr *nhea = ea_find(e->attrs, &ea_gen_nexthop); + int dest = nhea_dest(nhea); - log(L_WARN "Ignoring route %N with invalid dest %d received via %s", - n->n.addr, e->attrs->dest, e->sender->proto->name); - return 0; - } + if (dest == RTD_NONE) + { + log(L_WARN "Ignoring route %N with no destination received via %s", + n, ch->proto->name); + return 0; + } - if ((e->attrs->dest == RTD_UNICAST) && !nexthop_is_sorted(&(e->attrs->nh))) + if ((dest == RTD_UNICAST) && + !nexthop_is_sorted((struct nexthop_adata *) nhea->u.ptr)) + { + log(L_WARN "Ignoring unsorted multipath route %N received via %s", + n, ch->proto->name); + return 0; + } + } + else if (ea_find(e->attrs, &ea_gen_nexthop)) { - log(L_WARN "Ignoring unsorted multipath route %N received via %s", - n->n.addr, e->sender->proto->name); + log(L_WARN "Ignoring route %N having a nexthop attribute received via %s", + n, ch->proto->name); return 0; } return 1; } -/** - * rte_free - delete a &rte - * @e: &rte to be deleted - * - * rte_free() deletes the given &rte from the routing table it's linked to. - */ -void -rte_free(rte *e) -{ - rt_unlock_source(e->src); - if (rta_is_cached(e->attrs)) - rta_free(e->attrs); - sl_free(e); -} - -static inline void -rte_free_quick(rte *e) -{ - rt_unlock_source(e->src); - rta_free(e->attrs); - sl_free(e); -} - static int rte_same(rte *x, rte *y) { @@ -1218,168 +1623,109 @@ rte_same(rte *x, rte *y) static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); } -static void -rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src) +static int +rte_recalculate(struct rtable_private *table, struct rt_import_hook *c, net *net, rte *new, struct rte_src *src) { - struct proto *p = c->proto; - struct rtable *table = c->table; - struct proto_stats *stats = &c->stats; - static struct tbf rl_pipe = TBF_DEFAULT_LOG_LIMITS; - rte *before_old = NULL; - rte *old_best = net->routes; + struct rt_import_request *req = c->req; + struct rt_import_stats *stats = &c->stats; + struct rte_storage *old_best_stored = net->routes, *old_stored = NULL; + rte *old_best = old_best_stored ? &old_best_stored->rte : NULL; rte *old = NULL; - rte **k; - k = &net->routes; /* Find and remove original route from the same protocol */ - while (old = *k) + /* If the new route is identical to the old one, we find the attributes in + * cache and clone these with no performance drop. OTOH, if we were to lookup + * the attributes, such a route definitely hasn't been anywhere yet, + * therefore it's definitely worth the time. */ + struct rte_storage *new_stored = NULL; + if (new) + new = &(new_stored = rte_store(new, net, table))->rte; + + /* Find and remove original route from the same protocol */ + struct rte_storage **before_old = rte_find(net, src); + + if (*before_old) { - if (old->src == src) + old = &(old_stored = (*before_old))->rte; + + /* If there is the same route in the routing table but from + * a different sender, then there are two paths from the + * source protocol to this routing table through transparent + * pipes, which is not allowed. + * We log that and ignore the route. */ + if (old->sender != c) { - /* If there is the same route in the routing table but from - * a different sender, then there are two paths from the - * source protocol to this routing table through transparent - * pipes, which is not allowed. - * - * We log that and ignore the route. If it is withdraw, we - * ignore it completely (there might be 'spurious withdraws', - * see FIXME in do_rte_announce()) - */ - if (old->sender->proto != p) - { - if (new) - { - log_rl(&rl_pipe, L_ERR "Pipe collision detected when sending %N to table %s", - net->n.addr, table->name); - rte_free_quick(new); - } - return; - } + if (!old->generation && !new->generation) + bug("Two protocols claim to author a route with the same rte_src in table %s: %N %s/%u:%u", + c->table->name, net->n.addr, old->src->owner->name, old->src->private_id, old->src->global_id); - if (new && rte_same(old, new)) + log_rl(&table->rl_pipe, L_ERR "Route source collision in table %s: %N %s/%u:%u", + c->table->name, net->n.addr, old->src->owner->name, old->src->private_id, old->src->global_id); + } + + if (new && rte_same(old, &new_stored->rte)) { /* No changes, ignore the new route and refresh the old one */ - - old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY); + old->stale_cycle = new->stale_cycle; if (!rte_is_filtered(new)) { - stats->imp_updates_ignored++; - rte_trace_in(D_ROUTES, c, new, "ignored"); + stats->updates_ignored++; + rt_rte_trace_in(D_ROUTES, req, new, "ignored"); } - rte_free_quick(new); - return; - } - *k = old->next; - table->rt_count--; - break; - } - k = &old->next; - before_old = old; - } - - /* Save the last accessed position */ - rte **pos = k; + /* We need to free the already stored route here before returning */ + rte_free(new_stored); + return 0; + } - if (!old) - before_old = NULL; + *before_old = (*before_old)->next; + table->rt_count--; + } if (!old && !new) { - stats->imp_withdraws_ignored++; - return; + stats->withdraws_ignored++; + return 0; } + /* If rejected by import limit, we need to pretend there is no route */ + if (req->preimport && (req->preimport(req, new, old) == 0)) + { + rte_free(new_stored); + new_stored = NULL; + new = NULL; + } + int new_ok = rte_is_ok(new); int old_ok = rte_is_ok(old); - struct channel_limit *l = &c->rx_limit; - if (l->action && !old && new && !c->in_table) - { - u32 all_routes = stats->imp_routes + stats->filt_routes; - - if (all_routes >= l->limit) - channel_notify_limit(c, l, PLD_RX, all_routes); - - if (l->state == PLS_BLOCKED) - { - /* In receive limit the situation is simple, old is NULL so - we just free new and exit like nothing happened */ - - stats->imp_updates_ignored++; - rte_trace_in(D_FILTERS, c, new, "ignored [limit]"); - rte_free_quick(new); - return; - } - } - - l = &c->in_limit; - if (l->action && !old_ok && new_ok) - { - if (stats->imp_routes >= l->limit) - channel_notify_limit(c, l, PLD_IN, stats->imp_routes); - - if (l->state == PLS_BLOCKED) - { - /* In import limit the situation is more complicated. We - shouldn't just drop the route, we should handle it like - it was filtered. We also have to continue the route - processing if old or new is non-NULL, but we should exit - if both are NULL as this case is probably assumed to be - already handled. */ - - stats->imp_updates_ignored++; - rte_trace_in(D_FILTERS, c, new, "ignored [limit]"); - - if (c->in_keep_filtered) - new->flags |= REF_FILTERED; - else - { rte_free_quick(new); new = NULL; } - - /* Note that old && !new could be possible when - c->in_keep_filtered changed in the recent past. */ - - if (!old && !new) - return; - - new_ok = 0; - goto skip_stats1; - } - } - if (new_ok) - stats->imp_updates_accepted++; + stats->updates_accepted++; else if (old_ok) - stats->imp_withdraws_accepted++; + stats->withdraws_accepted++; else - stats->imp_withdraws_ignored++; + stats->withdraws_ignored++; if (old_ok || new_ok) table->last_rt_change = current_time(); - skip_stats1: - - if (new) - rte_is_filtered(new) ? stats->filt_routes++ : stats->imp_routes++; - if (old) - rte_is_filtered(old) ? stats->filt_routes-- : stats->imp_routes--; - if (table->config->sorted) { /* If routes are sorted, just insert new route to appropriate position */ - if (new) + if (new_stored) { - if (before_old && !rte_better(new, before_old)) - k = &before_old->next; + struct rte_storage **k; + if ((before_old != &net->routes) && !rte_better(new, &SKIP_BACK(struct rte_storage, next, before_old)->rte)) + k = before_old; else k = &net->routes; for (; *k; k=&(*k)->next) - if (rte_better(new, *k)) + if (rte_better(new, &(*k)->rte)) break; - new->next = *k; - *k = new; + new_stored->next = *k; + *k = new_stored; table->rt_count++; } @@ -1389,16 +1735,17 @@ rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src) /* If routes are not sorted, find the best route and move it on the first position. There are several optimized cases. */ - if (src->proto->rte_recalculate && src->proto->rte_recalculate(table, net, new, old, old_best)) + if (src->owner->rte_recalculate && + src->owner->rte_recalculate(table, net, new_stored ? &new_stored->rte : NULL, old, old_best)) goto do_recalculate; - if (new && rte_better(new, old_best)) + if (new_stored && rte_better(&new_stored->rte, old_best)) { /* The first case - the new route is cleary optimal, we link it at the first position */ - new->next = net->routes; - net->routes = new; + new_stored->next = net->routes; + net->routes = new_stored; table->rt_count++; } @@ -1412,10 +1759,10 @@ rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src) do_recalculate: /* Add the new route to the list */ - if (new) + if (new_stored) { - new->next = *pos; - *pos = new; + new_stored->next = *before_old; + *before_old = new_stored; table->rt_count++; } @@ -1423,299 +1770,560 @@ rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src) /* Find a new optimal route (if there is any) */ if (net->routes) { - rte **bp = &net->routes; - for (k=&(*bp)->next; *k; k=&(*k)->next) - if (rte_better(*k, *bp)) + struct rte_storage **bp = &net->routes; + for (struct rte_storage **k=&(*bp)->next; *k; k=&(*k)->next) + if (rte_better(&(*k)->rte, &(*bp)->rte)) bp = k; /* And relink it */ - rte *best = *bp; + struct rte_storage *best = *bp; *bp = best->next; best->next = net->routes; net->routes = best; } } - else if (new) + else if (new_stored) { /* The third case - the new route is not better than the old best route (therefore old_best != NULL) and the old best route was not removed (therefore old_best == net->routes). We just link the new route to the old/last position. */ - new->next = *pos; - *pos = new; + new_stored->next = *before_old; + *before_old = new_stored; table->rt_count++; } /* The fourth (empty) case - suboptimal route was removed, nothing to do */ } - if (new) + if (new_stored) { - new->lastmod = current_time(); + new_stored->rte.lastmod = current_time(); + new_stored->rte.id = hmap_first_zero(&table->id_map); + hmap_set(&table->id_map, new_stored->rte.id); + } - if (!old) - { - new->id = hmap_first_zero(&table->id_map); - hmap_set(&table->id_map, new->id); - } + /* Log the route change */ + if (new_ok) + rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, new_stored == net->routes ? "added [best]" : "added"); + else if (old_ok) + { + if (old != old_best) + rt_rte_trace_in(D_ROUTES, req, old, "removed"); + else if (net->routes && rte_is_ok(&net->routes->rte)) + rt_rte_trace_in(D_ROUTES, req, old, "removed [replaced]"); else - new->id = old->id; + rt_rte_trace_in(D_ROUTES, req, old, "removed [sole]"); } + else + if (req->trace_routes & D_ROUTES) + log(L_TRACE "%s > ignored %N %s->%s", req->name, net->n.addr, old ? "filtered" : "none", new ? "filtered" : "none"); - /* Log the route change */ - if ((c->debug & D_ROUTES) || (p->debug & D_ROUTES)) + /* Propagate the route change */ + rte_announce(table, net, new_stored, old_stored, + net->routes, old_best_stored); + + return 1; +} + +int +channel_preimport(struct rt_import_request *req, rte *new, rte *old) +{ + struct channel *c = SKIP_BACK(struct channel, in_req, req); + + if (new && !old) + if (CHANNEL_LIMIT_PUSH(c, RX)) + return 0; + + if (!new && old) + CHANNEL_LIMIT_POP(c, RX); + + int new_in = new && !rte_is_filtered(new); + int old_in = old && !rte_is_filtered(old); + + if (new_in && !old_in) + if (CHANNEL_LIMIT_PUSH(c, IN)) + if (c->in_keep & RIK_REJECTED) + { + new->flags |= REF_FILTERED; + return 1; + } + else + return 0; + + if (!new_in && old_in) + CHANNEL_LIMIT_POP(c, IN); + + return 1; +} + +void +rte_update(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) +{ + if (!c->in_req.hook) + { + log(L_WARN "%s.%s: Called rte_update without import hook", c->proto->name, c->name); + return; + } + + ASSERT(c->channel_state == CS_UP); + + /* The import reloader requires prefilter routes to be the first layer */ + if (new && (c->in_keep & RIK_PREFILTER)) + if (ea_is_cached(new->attrs) && !new->attrs->next) + new->attrs = ea_clone(new->attrs); + else + new->attrs = ea_lookup(new->attrs, 0); + + const struct filter *filter = c->in_filter; + struct channel_import_stats *stats = &c->import_stats; + + if (new) { - if (new_ok) - rte_trace(c, new, '>', new == net->routes ? "added [best]" : "added"); - else if (old_ok) + new->net = n; + + int fr; + + stats->updates_received++; + if ((filter == FILTER_REJECT) || + ((fr = f_run(filter, new, 0)) > F_ACCEPT)) { - if (old != old_best) - rte_trace(c, old, '>', "removed"); - else if (rte_is_ok(net->routes)) - rte_trace(c, old, '>', "removed [replaced]"); + stats->updates_filtered++; + channel_rte_trace_in(D_FILTERS, c, new, "filtered out"); + + if (c->in_keep & RIK_REJECTED) + new->flags |= REF_FILTERED; else - rte_trace(c, old, '>', "removed [sole]"); + new = NULL; + } + + if (new) + if (net_is_flow(n)) + rt_flowspec_resolve_rte(new, c); + else + rt_next_hop_resolve_rte(new); + + if (new && !rte_validate(c, new)) + { + channel_rte_trace_in(D_FILTERS, c, new, "invalid"); + stats->updates_invalid++; + new = NULL; } + } + else + stats->withdraws_received++; - /* Propagate the route change */ - rte_announce(table, RA_UNDEF, net, new, old, net->routes, old_best); + rte_import(&c->in_req, n, new, src); + + /* Now the route attributes are kept by the in-table cached version + * and we may drop the local handle */ + if (new && (c->in_keep & RIK_PREFILTER)) + { + /* There may be some updates on top of the original attribute block */ + ea_list *a = new->attrs; + while (a->next) + a = a->next; - if (!net->routes && - (table->gc_counter++ >= table->config->gc_threshold)) - rt_kick_prune_timer(table); + ea_free(a); + } - if (old_ok && p->rte_remove) - p->rte_remove(net, old); - if (new_ok && p->rte_insert) - p->rte_insert(net, new); +} - if (old) +void +rte_import(struct rt_import_request *req, const net_addr *n, rte *new, struct rte_src *src) +{ + struct rt_import_hook *hook = req->hook; + if (!hook) + { + log(L_WARN "%s: Called rte_import without import hook", req->name); + return; + } + + RT_LOCKED(hook->table, tab) + { + net *nn; + if (new) { - if (!new) - hmap_clear(&table->id_map, old->id); + /* Use the actual struct network, not the dummy one */ + nn = net_get(tab, n); + new->net = nn->n.addr; + new->sender = hook; - rte_free_quick(old); + /* Set the stale cycle */ + new->stale_cycle = hook->stale_set; + } + else if (!(nn = net_find(tab, n))) + { + req->hook->stats.withdraws_ignored++; + if (req->trace_routes & D_ROUTES) + log(L_TRACE "%s > ignored %N withdraw", req->name, n); + RT_RETURN(tab); } + + /* Recalculate the best route */ + if (rte_recalculate(tab, hook, nn, new, src)) + ev_send(req->list, &hook->announce_event); + } } -static int rte_update_nest_cnt; /* Nesting counter to allow recursive updates */ +/* Check rtable for best route to given net whether it would be exported do p */ +int +rt_examine(rtable *tp, net_addr *a, struct channel *c, const struct filter *filter) +{ + rte rt = {}; -static inline void -rte_update_lock(void) + RT_LOCKED(tp, t) + { + net *n = net_find(t, a); + if (n) + rt = RTE_COPY_VALID(n->routes); + } + + if (!rt.src) + return 0; + + int v = c->proto->preexport ? c->proto->preexport(c, &rt) : 0; + if (v == RIC_PROCESS) + v = (f_run(filter, &rt, FF_SILENT) <= F_ACCEPT); + + return v > 0; +} + +static void +rt_table_export_done(void *hh) +{ + struct rt_table_export_hook *hook = hh; + struct rt_export_request *req = hook->h.req; + void (*stopped)(struct rt_export_request *) = hook->h.stopped; + rtable *t = SKIP_BACK(rtable, priv.exporter, hook->table); + + RT_LOCKED(t, tab) + { + DBG("Export hook %p in table %s finished uc=%u\n", hook, tab->name, tab->use_count); + + /* Drop pending exports */ + rt_export_used(&tab->exporter, hook->h.req->name, "stopped"); + + /* Do the common code; this frees the hook */ + rt_export_stopped(&hook->h); + } + + /* Report the channel as stopped. */ + CALL(stopped, req); + + /* Unlock the table; this may free it */ + rt_unlock_table(t); +} + +void +rt_export_stopped(struct rt_export_hook *hook) { - rte_update_nest_cnt++; + /* Unlink from the request */ + hook->req->hook = NULL; + + /* Unlist */ + rem_node(&hook->n); + + /* Free the hook itself together with its pool */ + rfree(hook->pool); } static inline void -rte_update_unlock(void) +rt_set_import_state(struct rt_import_hook *hook, u8 state) { - if (!--rte_update_nest_cnt) - lp_flush(rte_update_pool); + hook->last_state_change = current_time(); + hook->import_state = state; + + CALL(hook->req->log_state_change, hook->req, state); } -/** - * rte_update - enter a new update to a routing table - * @table: table to be updated - * @c: channel doing the update - * @net: network node - * @p: protocol submitting the update - * @src: protocol originating the update - * @new: a &rte representing the new route or %NULL for route removal. - * - * This function is called by the routing protocols whenever they discover - * a new route or wish to update/remove an existing route. The right announcement - * sequence is to build route attributes first (either un-cached with @aflags set - * to zero or a cached one using rta_lookup(); in this case please note that - * you need to increase the use count of the attributes yourself by calling - * rta_clone()), call rte_get_temp() to obtain a temporary &rte, fill in all - * the appropriate data and finally submit the new &rte by calling rte_update(). - * - * @src specifies the protocol that originally created the route and the meaning - * of protocol-dependent data of @new. If @new is not %NULL, @src have to be the - * same value as @new->attrs->proto. @p specifies the protocol that called - * rte_update(). In most cases it is the same protocol as @src. rte_update() - * stores @p in @new->sender; - * - * When rte_update() gets any route, it automatically validates it (checks, - * whether the network and next hop address are valid IP addresses and also - * whether a normal routing protocol doesn't try to smuggle a host or link - * scope route to the table), converts all protocol dependent attributes stored - * in the &rte to temporary extended attributes, consults import filters of the - * protocol to see if the route should be accepted and/or its attributes modified, - * stores the temporary attributes back to the &rte. - * - * Now, having a "public" version of the route, we - * automatically find any old route defined by the protocol @src - * for network @n, replace it by the new one (or removing it if @new is %NULL), - * recalculate the optimal route for this destination and finally broadcast - * the change (if any) to all routing protocols by calling rte_announce(). - * - * All memory used for attribute lists and other temporary allocations is taken - * from a special linear pool @rte_update_pool and freed when rte_update() - * finishes. - */ +void +rt_set_export_state(struct rt_export_hook *hook, u8 state) +{ + hook->last_state_change = current_time(); + u8 old = atomic_exchange_explicit(&hook->export_state, state, memory_order_release); + + if (old != state) + CALL(hook->req->log_state_change, hook->req, state); +} void -rte_update2(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) +rt_request_import(rtable *t, struct rt_import_request *req) { - // struct proto *p = c->proto; - struct proto_stats *stats = &c->stats; - const struct filter *filter = c->in_filter; - net *nn; + RT_LOCKED(t, tab) + { + rt_lock_table(tab); - ASSERT(c->channel_state == CS_UP); + struct rt_import_hook *hook = req->hook = mb_allocz(tab->rp, sizeof(struct rt_import_hook)); - rte_update_lock(); - if (new) - { - /* Create a temporary table node */ - nn = alloca(sizeof(net) + n->length); - memset(nn, 0, sizeof(net) + n->length); - net_copy(nn->n.addr, n); + hook->announce_event = (event) { .hook = rt_import_announce_exports, .data = hook }; - new->net = nn; - new->sender = c; + DBG("Lock table %s for import %p req=%p uc=%u\n", tab->name, hook, req, tab->use_count); - stats->imp_updates_received++; - if (!rte_validate(new)) - { - rte_trace_in(D_FILTERS, c, new, "invalid"); - stats->imp_updates_invalid++; - goto drop; - } + hook->req = req; + hook->table = t; - if (filter == FILTER_REJECT) - { - stats->imp_updates_filtered++; - rte_trace_in(D_FILTERS, c, new, "filtered out"); + rt_set_import_state(hook, TIS_UP); + add_tail(&tab->imports, &hook->n); + } +} - if (! c->in_keep_filtered) - goto drop; +void +rt_stop_import(struct rt_import_request *req, void (*stopped)(struct rt_import_request *)) +{ + ASSERT_DIE(req->hook); + struct rt_import_hook *hook = req->hook; - /* new is a private copy, i could modify it */ - new->flags |= REF_FILTERED; - } - else if (filter) - { - int fr = f_run(filter, &new, rte_update_pool, 0); - if (fr > F_ACCEPT) - { - stats->imp_updates_filtered++; - rte_trace_in(D_FILTERS, c, new, "filtered out"); + RT_LOCKED(hook->table, tab) + { + rt_schedule_prune(tab); + rt_set_import_state(hook, TIS_STOP); + hook->stopped = stopped; - if (! c->in_keep_filtered) - goto drop; + /* Cancel table rr_counter */ + if (hook->stale_set != hook->stale_pruned) + tab->rr_counter -= (hook->stale_set - hook->stale_pruned); - new->flags |= REF_FILTERED; - } - } - if (!rta_is_cached(new->attrs)) /* Need to copy attributes */ - new->attrs = rta_lookup(new->attrs); - new->flags |= REF_COW; + tab->rr_counter++; - /* Use the actual struct network, not the dummy one */ - nn = net_get(c->table, n); - new->net = nn; - } + hook->stale_set = hook->stale_pruned = hook->stale_pruning = hook->stale_valid = 0; + } +} + +static void rt_table_export_start_feed(struct rtable_private *tab, struct rt_table_export_hook *hook); +static void +rt_table_export_uncork(void *_hook) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); + + struct rt_table_export_hook *hook = _hook; + struct birdloop *loop = hook->h.req->list->loop; + + if (loop != &main_birdloop) + birdloop_enter(loop); + + u8 state; + switch (state = atomic_load_explicit(&hook->h.export_state, memory_order_relaxed)) + { + case TES_HUNGRY: + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, hook->table)), tab) + if ((state = atomic_load_explicit(&hook->h.export_state, memory_order_relaxed)) == TES_HUNGRY) + rt_table_export_start_feed(tab, hook); + if (state != TES_STOP) + break; + /* fall through */ + case TES_STOP: + rt_stop_export_common(&hook->h); + break; + default: + bug("Uncorking a table export in a strange state: %u", state); + } + + if (loop != &main_birdloop) + birdloop_leave(loop); +} + +static void +rt_table_export_start_locked(struct rtable_private *tab, struct rt_export_request *req) +{ + struct rt_exporter *re = &tab->exporter.e; + rt_lock_table(tab); + + req->hook = rt_alloc_export(re, sizeof(struct rt_table_export_hook)); + req->hook->req = req; + + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, req->hook); + hook->h.event = (event) { + .hook = rt_table_export_uncork, + .data = hook, + }; + + if (rt_cork_check(&hook->h.event)) + rt_set_export_state(&hook->h, TES_HUNGRY); else - { - stats->imp_withdraws_received++; + rt_table_export_start_feed(tab, hook); +} - if (!(nn = net_find(c->table, n)) || !src) - { - stats->imp_withdraws_ignored++; - rte_update_unlock(); - return; - } - } +static void +rt_table_export_start_feed(struct rtable_private *tab, struct rt_table_export_hook *hook) +{ + struct rt_exporter *re = &tab->exporter.e; + struct rt_export_request *req = hook->h.req; - recalc: - /* And recalculate the best route */ - rte_recalculate(c, nn, new, src); + /* stats zeroed by mb_allocz */ + switch (req->addr_mode) + { + case TE_ADDR_IN: + if (tab->trie && net_val_match(tab->addr_type, NB_IP)) + { + hook->walk_state = mb_allocz(hook->h.pool, sizeof (struct f_trie_walk_state)); + hook->walk_lock = rt_lock_trie(tab); + trie_walk_init(hook->walk_state, tab->trie, req->addr); + hook->h.event.hook = rt_feed_by_trie; + hook->walk_last.type = 0; + break; + } + /* fall through */ + case TE_ADDR_NONE: + FIB_ITERATE_INIT(&hook->feed_fit, &tab->fib); + hook->h.event.hook = rt_feed_by_fib; + break; - rte_update_unlock(); - return; + case TE_ADDR_EQUAL: + hook->h.event.hook = rt_feed_equal; + break; - drop: - rte_free(new); - new = NULL; - if (nn = net_find(c->table, n)) - goto recalc; + case TE_ADDR_FOR: + hook->h.event.hook = rt_feed_for; + break; - rte_update_unlock(); + default: + bug("Requested an unknown export address mode"); + } + + DBG("New export hook %p req %p in table %s uc=%u\n", hook, req, tab->name, tab->use_count); + + struct rt_pending_export *rpe = rt_last_export(hook->table); + DBG("store hook=%p last_export=%p seq=%lu\n", hook, rpe, rpe ? rpe->seq : 0); + atomic_store_explicit(&hook->last_export, rpe, memory_order_relaxed); + + rt_init_export(re, req->hook); } -/* Independent call to rte_announce(), used from next hop - recalculation, outside of rte_update(). new must be non-NULL */ -static inline void -rte_announce_i(rtable *tab, uint type, net *net, rte *new, rte *old, - rte *new_best, rte *old_best) +static void +rt_table_export_start(struct rt_exporter *re, struct rt_export_request *req) +{ + RT_LOCKED(SKIP_BACK(rtable, priv.exporter.e, re), tab) + rt_table_export_start_locked(tab, req); +} + +void rt_request_export(rtable *t, struct rt_export_request *req) { - rte_update_lock(); - rte_announce(tab, type, net, new, old, new_best, old_best); - rte_update_unlock(); + RT_LOCKED(t, tab) + rt_table_export_start_locked(tab, req); /* Is locked inside */ } -static inline void -rte_discard(rte *old) /* Non-filtered route deletion, used during garbage collection */ +void +rt_request_export_other(struct rt_exporter *re, struct rt_export_request *req) { - rte_update_lock(); - rte_recalculate(old->sender, old->net, NULL, old->src); - rte_update_unlock(); + return re->class->start(re, req); } -/* Modify existing route by protocol hook, used for long-lived graceful restart */ -static inline void -rte_modify(rte *old) +struct rt_export_hook * +rt_alloc_export(struct rt_exporter *re, uint size) +{ + pool *p = rp_new(re->rp, "Export hook"); + struct rt_export_hook *hook = mb_allocz(p, size); + + hook->pool = p; + hook->table = re; + + hook->n = (node) {}; + add_tail(&re->hooks, &hook->n); + + return hook; +} + +void +rt_init_export(struct rt_exporter *re UNUSED, struct rt_export_hook *hook) { - rte_update_lock(); + hook->event.data = hook; + + bmap_init(&hook->seq_map, hook->pool, 16); + + /* Regular export */ + rt_set_export_state(hook, TES_FEEDING); + rt_send_export_event(hook); +} - rte *new = old->sender->proto->rte_modify(old, rte_update_pool); - if (new != old) +static int +rt_table_export_stop_locked(struct rt_export_hook *hh) +{ + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, hh); + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter, hook->table); + + switch (atomic_load_explicit(&hh->export_state, memory_order_relaxed)) { - if (new) - { - if (!rta_is_cached(new->attrs)) - new->attrs = rta_lookup(new->attrs); - new->flags = (old->flags & ~REF_MODIFY) | REF_COW; - } + case TES_HUNGRY: + rt_trace(tab, D_EVENTS, "Stopping export hook %s must wait for uncorking", hook->h.req->name); + return 0; + case TES_FEEDING: + switch (hh->req->addr_mode) + { + case TE_ADDR_IN: + if (hook->walk_lock) + { + rt_unlock_trie(tab, hook->walk_lock); + hook->walk_lock = NULL; + mb_free(hook->walk_state); + hook->walk_state = NULL; + break; + } + /* fall through */ + case TE_ADDR_NONE: + fit_get(&tab->fib, &hook->feed_fit); + break; + } + break; - rte_recalculate(old->sender, old->net, new, old->src); + case TES_STOP: + bug("Tried to repeatedly stop the same export hook %s", hook->h.req->name); } - rte_update_unlock(); + rt_trace(tab, D_EVENTS, "Stopping export hook %s right now", hook->h.req->name); + return 1; } -/* Check rtable for best route to given net whether it would be exported do p */ -int -rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter) +static void +rt_table_export_stop(struct rt_export_hook *hh) { - struct proto *p = c->proto; - net *n = net_find(t, a); - rte *rt = n ? n->routes : NULL; + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, hh); + int ok = 0; + rtable *t = SKIP_BACK(rtable, priv.exporter, hook->table); + if (RT_IS_LOCKED(t)) + ok = rt_table_export_stop_locked(hh); + else + RT_LOCKED(t, tab) + ok = rt_table_export_stop_locked(hh); - if (!rte_is_valid(rt)) - return 0; + if (ok) + rt_stop_export_common(hh); + else + rt_set_export_state(&hook->h, TES_STOP); +} - rte_update_lock(); +void +rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_request *)) +{ + ASSERT_DIE(birdloop_inside(req->list->loop)); + ASSERT_DIE(req->hook); + struct rt_export_hook *hook = req->hook; - /* Rest is stripped down export_filter() */ - int v = p->preexport ? p->preexport(c, rt) : 0; - if (v == RIC_PROCESS) - v = (f_run(filter, &rt, rte_update_pool, FF_SILENT) <= F_ACCEPT); + /* Set the stopped callback */ + hook->stopped = stopped; - /* Discard temporary rte */ - if (rt != n->routes) - rte_free(rt); + /* Run the stop code */ + if (hook->table->class->stop) + hook->table->class->stop(hook); + else + rt_stop_export_common(hook); +} + +void +rt_stop_export_common(struct rt_export_hook *hook) +{ + /* Update export state */ + rt_set_export_state(hook, TES_STOP); - rte_update_unlock(); + /* Reset the event as the stopped event */ + hook->event.hook = hook->table->class->done; - return v > 0; + /* Run the stopped event */ + rt_send_export_event(hook); } - /** * rt_refresh_begin - start a refresh cycle * @t: related routing table @@ -1731,16 +2339,43 @@ rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filte * flag in rt_refresh_end() and then removing such routes in the prune loop. */ void -rt_refresh_begin(rtable *t, struct channel *c) +rt_refresh_begin(struct rt_import_request *req) { - FIB_WALK(&t->fib, net, n) - { - rte *e; - for (e = n->routes; e; e = e->next) - if (e->sender == c) - e->flags |= REF_STALE; - } - FIB_WALK_END; + struct rt_import_hook *hook = req->hook; + ASSERT_DIE(hook); + + RT_LOCKED(hook->table, tab) + { + + /* If the pruning routine is too slow */ + if (((hook->stale_set - hook->stale_pruned) & 0xff) >= 240) + { + log(L_WARN "Route refresh flood in table %s (stale_set=%u, stale_pruned=%u)", hook->table->name, hook->stale_set, hook->stale_pruned); + + /* Forcibly set all old routes' stale cycle to zero. */ + FIB_WALK(&tab->fib, net, n) + { + for (struct rte_storage *e = n->routes; e; e = e->next) + if (e->rte.sender == req->hook) + e->rte.stale_cycle = 0; + } + FIB_WALK_END; + + /* Smash the route refresh counter and zero everything. */ + tab->rr_counter -= hook->stale_set - hook->stale_pruned; + hook->stale_set = hook->stale_valid = hook->stale_pruning = hook->stale_pruned = 0; + } + + /* Now we can safely increase the stale_set modifier */ + hook->stale_set++; + + /* The table must know that we're route-refreshing */ + tab->rr_counter++; + + if (req->trace_routes & D_STATES) + log(L_TRACE "%s: route refresh begin [%u]", req->name, hook->stale_set); + + } } /** @@ -1752,45 +2387,26 @@ rt_refresh_begin(rtable *t, struct channel *c) * hook. See rt_refresh_begin() for description of refresh cycles. */ void -rt_refresh_end(rtable *t, struct channel *c) +rt_refresh_end(struct rt_import_request *req) { - int prune = 0; + struct rt_import_hook *hook = req->hook; + ASSERT_DIE(hook); - FIB_WALK(&t->fib, net, n) - { - rte *e; - for (e = n->routes; e; e = e->next) - if ((e->sender == c) && (e->flags & REF_STALE)) - { - e->flags |= REF_DISCARD; - prune = 1; - } - } - FIB_WALK_END; - - if (prune) - rt_schedule_prune(t); -} - -void -rt_modify_stale(rtable *t, struct channel *c) -{ - int prune = 0; + RT_LOCKED(hook->table, tab) + { + /* Now valid routes are only those one with the latest stale_set value */ + uint cnt = hook->stale_set - hook->stale_valid; + hook->stale_valid = hook->stale_set; - FIB_WALK(&t->fib, net, n) - { - rte *e; - for (e = n->routes; e; e = e->next) - if ((e->sender == c) && (e->flags & REF_STALE) && !(e->flags & REF_FILTERED)) - { - e->flags |= REF_MODIFY; - prune = 1; - } - } - FIB_WALK_END; + /* Here we can't kick the timer as we aren't in the table service loop */ + rt_schedule_prune(tab); - if (prune) - rt_schedule_prune(t); + if (req->trace_routes & D_STATES) + if (cnt > 1) + log(L_TRACE "%s: route refresh end (x%u) [%u]", req->name, cnt, hook->stale_valid); + else + log(L_TRACE "%s: route refresh end [%u]", req->name, hook->stale_valid); + } } /** @@ -1800,12 +2416,11 @@ rt_modify_stale(rtable *t, struct channel *c) * This functions dumps contents of a &rte to debug output. */ void -rte_dump(rte *e) +rte_dump(struct rte_storage *e) { - net *n = e->net; - debug("%-1N ", n->n.addr); - debug("PF=%02x ", e->pflags); - rta_dump(e->attrs); + debug("%-1N ", e->rte.net); + debug("PF=%02x ", e->rte.pflags); + ea_dump(e->rte.attrs); debug("\n"); } @@ -1816,20 +2431,24 @@ rte_dump(rte *e) * This function dumps contents of a given routing table to debug output. */ void -rt_dump(rtable *t) +rt_dump(rtable *tp) { - debug("Dump of routing table <%s>\n", t->name); + RT_LOCKED(tp, t) + { + + debug("Dump of routing table <%s>%s\n", t->name, t->deleted ? " (deleted)" : ""); #ifdef DEBUGGING fib_check(&t->fib); #endif FIB_WALK(&t->fib, net, n) { - rte *e; - for(e=n->routes; e; e=e->next) + for(struct rte_storage *e=n->routes; e; e=e->next) rte_dump(e); } FIB_WALK_END; debug("\n"); + + } } /** @@ -1845,73 +2464,153 @@ rt_dump_all(void) WALK_LIST2(t, n, routing_tables, n) rt_dump(t); + + WALK_LIST2(t, n, deleted_routing_tables, n) + rt_dump(t); } -static inline void -rt_schedule_hcu(rtable *tab) +void +rt_dump_hooks(rtable *tp) { - if (tab->hcu_scheduled) - return; + RT_LOCKED(tp, tab) + { + + debug("Dump of hooks in routing table <%s>%s\n", tab->name, tab->deleted ? " (deleted)" : ""); + debug(" nhu_state=%u use_count=%d rt_count=%u\n", + tab->nhu_state, tab->use_count, tab->rt_count); + debug(" last_rt_change=%t gc_time=%t gc_counter=%d prune_state=%u\n", + tab->last_rt_change, tab->gc_time, tab->gc_counter, tab->prune_state); - tab->hcu_scheduled = 1; - ev_schedule(tab->rt_event); + struct rt_import_hook *ih; + WALK_LIST(ih, tab->imports) + { + ih->req->dump_req(ih->req); + debug(" Import hook %p requested by %p: pref=%u" + " last_state_change=%t import_state=%u stopped=%p\n", + ih, ih->req, ih->stats.pref, + ih->last_state_change, ih->import_state, ih->stopped); + } + + struct rt_table_export_hook *eh; + WALK_LIST(eh, tab->exporter.e.hooks) + { + eh->h.req->dump_req(eh->h.req); + debug(" Export hook %p requested by %p:" + " refeed_pending=%u last_state_change=%t export_state=%u\n", + eh, eh->h.req, eh->refeed_pending, eh->h.last_state_change, + atomic_load_explicit(&eh->h.export_state, memory_order_relaxed)); + } + debug("\n"); + + } } -static inline void -rt_schedule_nhu(rtable *tab) +void +rt_dump_hooks_all(void) { - if (tab->nhu_state == NHU_CLEAN) - ev_schedule(tab->rt_event); + rtable *t; + node *n; - /* state change: - * NHU_CLEAN -> NHU_SCHEDULED - * NHU_RUNNING -> NHU_DIRTY - */ - tab->nhu_state |= NHU_SCHEDULED; + debug("Dump of all table hooks\n"); + + WALK_LIST2(t, n, routing_tables, n) + rt_dump_hooks(t); + + WALK_LIST2(t, n, deleted_routing_tables, n) + rt_dump_hooks(t); +} + +static inline void +rt_schedule_nhu(struct rtable_private *tab) +{ + if (tab->nhu_corked) + { + if (!(tab->nhu_corked & NHU_SCHEDULED)) + tab->nhu_corked |= NHU_SCHEDULED; + } + else if (!(tab->nhu_state & NHU_SCHEDULED)) + { + rt_trace(tab, D_EVENTS, "Scheduling NHU"); + + /* state change: + * NHU_CLEAN -> NHU_SCHEDULED + * NHU_RUNNING -> NHU_DIRTY + */ + if ((tab->nhu_state |= NHU_SCHEDULED) == NHU_SCHEDULED) + birdloop_flag(tab->loop, RTF_NHU); + } } void -rt_schedule_prune(rtable *tab) +rt_schedule_prune(struct rtable_private *tab) { if (tab->prune_state == 0) - ev_schedule(tab->rt_event); + birdloop_flag(tab->loop, RTF_CLEANUP); /* state change 0->1, 2->3 */ tab->prune_state |= 1; } +static void +rt_export_used(struct rt_table_exporter *e, const char *who, const char *why) +{ + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter, e); + ASSERT_DIE(RT_IS_LOCKED(tab)); + + rt_trace(tab, D_EVENTS, "Export cleanup requested by %s %s", who, why); + + if (tab->export_used) + return; + + tab->export_used = 1; + birdloop_flag(tab->loop, RTF_CLEANUP); +} static void -rt_event(void *ptr) +rt_flag_handler(struct birdloop_flag_handler *fh, u32 flags) { - rtable *tab = ptr; + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, fh, fh)), tab) + { + ASSERT_DIE(birdloop_inside(tab->loop)); + rt_lock_table(tab); - rt_lock_table(tab); + if (flags & RTF_NHU) + rt_next_hop_update(tab); - if (tab->hcu_scheduled) - rt_update_hostcache(tab); + if (flags & RTF_EXPORT) + rt_kick_export_settle(tab); - if (tab->nhu_state) - rt_next_hop_update(tab); + if (flags & RTF_CLEANUP) + { + if (tab->export_used) + rt_export_cleanup(tab); - if (tab->prune_state) - rt_prune_table(tab); + if (tab->prune_state) + rt_prune_table(tab); + } - rt_unlock_table(tab); -} + if (flags & RTF_DELETE) + { + if (tab->hostcache) + rt_stop_export(&tab->hostcache->req, NULL); + rt_unlock_table(tab); + } + + rt_unlock_table(tab); + } +} static void rt_prune_timer(timer *t) { - rtable *tab = t->data; - - if (tab->gc_counter >= tab->config->gc_threshold) - rt_schedule_prune(tab); + RT_LOCKED((rtable *) t->data, tab) + if (tab->gc_counter >= tab->config->gc_threshold) + rt_schedule_prune(tab); } static void -rt_kick_prune_timer(rtable *tab) +rt_kick_prune_timer(struct rtable_private *tab) { /* Return if prune is already scheduled */ if (tm_active(tab->prune_timer) || (tab->prune_state & 1)) @@ -1920,156 +2619,154 @@ rt_kick_prune_timer(rtable *tab) /* Randomize GC period to +/- 50% */ btime gc_period = tab->config->gc_period; gc_period = (gc_period / 2) + (random_u32() % (uint) gc_period); - tm_start(tab->prune_timer, gc_period); + tm_start_in(tab->prune_timer, gc_period, tab->loop); } -static inline btime -rt_settled_time(rtable *tab) -{ - ASSUME(tab->base_settle_time != 0); - - return MIN(tab->last_rt_change + tab->config->min_settle_time, - tab->base_settle_time + tab->config->max_settle_time); -} - static void -rt_settle_timer(timer *t) +rt_flowspec_export_one(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) { - rtable *tab = t->data; - - if (!tab->base_settle_time) - return; - - btime settled_time = rt_settled_time(tab); - if (current_time() < settled_time) + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rtable *dst_pub = ln->dst; + ASSUME(rt_is_flow(dst_pub)); + struct rtable_private *dst = RT_LOCK(dst_pub); + + /* No need to inspect it further if recalculation is already scheduled */ + if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY) + || !trie_match_net(dst->flowspec_trie, net)) { - tm_set(tab->settle_timer, settled_time); + RT_UNLOCK(dst_pub); + rpe_mark_seen_all(req->hook, first, NULL, NULL); return; } - /* Settled */ - tab->base_settle_time = 0; - - struct rt_subscription *s; - WALK_LIST(s, tab->subscribers) - s->hook(s); -} - -static void -rt_kick_settle_timer(rtable *tab) -{ - tab->base_settle_time = current_time(); - - if (!tab->settle_timer) - tab->settle_timer = tm_new_init(tab->rp, rt_settle_timer, tab, 0, 0); - - if (!tm_active(tab->settle_timer)) - tm_set(tab->settle_timer, rt_settled_time(tab)); -} + /* This net may affect some flowspecs, check the actual change */ + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; -static inline void -rt_schedule_notify(rtable *tab) -{ - if (EMPTY_LIST(tab->subscribers)) - return; + RPE_WALK(first, rpe, NULL) + { + rpe_mark_seen(req->hook, rpe); + new_best = rpe->new_best; + } - if (tab->base_settle_time) - return; + /* Yes, something has actually changed. Schedule the update. */ + if (o != RTE_VALID_OR_NULL(new_best)) + rt_schedule_nhu(dst); - rt_kick_settle_timer(tab); + RT_UNLOCK(dst_pub); } -void -rt_subscribe(rtable *tab, struct rt_subscription *s) +static void +rt_flowspec_dump_req(struct rt_export_request *req) { - s->tab = tab; - rt_lock_table(tab); - add_tail(&tab->subscribers, &s->n); + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + debug(" Flowspec link for table %s (%p)\n", ln->dst->name, req); } -void -rt_unsubscribe(struct rt_subscription *s) +static void +rt_flowspec_log_state_change(struct rt_export_request *req, u8 state) { - rem_node(&s->n); - rt_unlock_table(s->tab); + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rt_trace(ln->dst, D_STATES, "Flowspec link from %s export state changed to %s", + ln->src->name, rt_export_state_name(state)); } static struct rt_flowspec_link * -rt_flowspec_find_link(rtable *src, rtable *dst) +rt_flowspec_find_link(struct rtable_private *src, rtable *dst) { - struct rt_flowspec_link *ln; - WALK_LIST(ln, src->flowspec_links) - if ((ln->src == src) && (ln->dst == dst)) - return ln; + struct rt_table_export_hook *hook; node *n; + WALK_LIST2(hook, n, src->exporter.e.hooks, h.n) + switch (atomic_load_explicit(&hook->h.export_state, memory_order_acquire)) + { + case TES_HUNGRY: + case TES_FEEDING: + case TES_READY: + if (hook->h.req->export_one == rt_flowspec_export_one) + { + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, hook->h.req); + if (ln->dst == dst) + return ln; + } + } return NULL; } void -rt_flowspec_link(rtable *src, rtable *dst) +rt_flowspec_link(rtable *src_pub, rtable *dst_pub) { - ASSERT(rt_is_ip(src)); - ASSERT(rt_is_flow(dst)); + ASSERT(rt_is_ip(src_pub)); + ASSERT(rt_is_flow(dst_pub)); - struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst); + int lock_dst = 0; - if (!ln) + birdloop_enter(dst_pub->loop); + + RT_LOCKED(src_pub, src) { - rt_lock_table(src); - rt_lock_table(dst); + struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst_pub); - ln = mb_allocz(src->rp, sizeof(struct rt_flowspec_link)); - ln->src = src; - ln->dst = dst; - add_tail(&src->flowspec_links, &ln->n); + if (!ln) + { + pool *p = src->rp; + ln = mb_allocz(p, sizeof(struct rt_flowspec_link)); + ln->src = src_pub; + ln->dst = dst_pub; + ln->req = (struct rt_export_request) { + .name = mb_sprintf(p, "%s.flowspec.notifier", dst_pub->name), + .list = birdloop_event_list(dst_pub->loop), + .trace_routes = src->config->debug, + .dump_req = rt_flowspec_dump_req, + .log_state_change = rt_flowspec_log_state_change, + .export_one = rt_flowspec_export_one, + }; + + rt_table_export_start_locked(src, &ln->req); + + lock_dst = 1; + } + + ln->uc++; } - ln->uc++; + if (lock_dst) + rt_lock_table(dst_pub); + + birdloop_leave(dst_pub->loop); } -void -rt_flowspec_unlink(rtable *src, rtable *dst) +static void +rt_flowspec_link_stopped(struct rt_export_request *req) { - struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst); + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rtable *dst = ln->dst; - ASSERT(ln && (ln->uc > 0)); - - ln->uc--; - - if (!ln->uc) - { - rem_node(&ln->n); - mb_free(ln); - - rt_unlock_table(src); - rt_unlock_table(dst); - } + mb_free(ln); + rt_unlock_table(dst); } -static void -rt_flowspec_notify(rtable *src, net *net) +void +rt_flowspec_unlink(rtable *src, rtable *dst) { - /* Only IP tables are src links */ - ASSERT(rt_is_ip(src)); + birdloop_enter(dst->loop); struct rt_flowspec_link *ln; - WALK_LIST(ln, src->flowspec_links) + RT_LOCKED(src, t) { - rtable *dst = ln->dst; - ASSERT(rt_is_flow(dst)); + ln = rt_flowspec_find_link(t, dst); - /* No need to inspect it further if recalculation is already active */ - if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY)) - continue; + ASSERT(ln && (ln->uc > 0)); - if (trie_match_net(dst->flowspec_trie, net->n.addr)) - rt_schedule_nhu(dst); + if (!--ln->uc) + rt_stop_export(&ln->req, rt_flowspec_link_stopped); } + + birdloop_leave(dst->loop); } static void -rt_flowspec_reset_trie(rtable *tab) +rt_flowspec_reset_trie(struct rtable_private *tab) { linpool *lp = tab->flowspec_trie->lp; int ipv4 = tab->flowspec_trie->ipv4; @@ -2082,14 +2779,13 @@ rt_flowspec_reset_trie(rtable *tab) static void rt_free(resource *_r) { - rtable *r = (rtable *) _r; + struct rtable_private *r = SKIP_BACK(struct rtable_private, r, _r); + + DOMAIN_FREE(rtable, r->lock); DBG("Deleting routing table %s\n", r->name); ASSERT_DIE(r->use_count == 0); - if (r->internal) - return; - r->config->table = NULL; rem_node(&r->n); @@ -2100,39 +2796,70 @@ rt_free(resource *_r) fib_free(&r->fib); hmap_free(&r->id_map); rfree(r->rt_event); - rfree(r->settle_timer); mb_free(r); */ } static void -rt_res_dump(resource *_r) +rt_res_dump(resource *_r, unsigned indent) { - rtable *r = (rtable *) _r; + struct rtable_private *r = SKIP_BACK(struct rtable_private, r, _r); + debug("name \"%s\", addr_type=%s, rt_count=%u, use_count=%d\n", r->name, net_label[r->addr_type], r->rt_count, r->use_count); + + char x[32]; + bsprintf(x, "%%%dspending export %%p\n", indent + 2); + + node *n; + WALK_LIST(n, r->exporter.pending) + debug(x, "", n); } static struct resclass rt_class = { .name = "Routing table", - .size = sizeof(struct rtable), + .size = sizeof(rtable), .free = rt_free, .dump = rt_res_dump, .lookup = NULL, .memsize = NULL, }; +static const struct rt_exporter_class rt_table_exporter_class = { + .start = rt_table_export_start, + .stop = rt_table_export_stop, + .done = rt_table_export_done, +}; + +void +rt_exporter_init(struct rt_exporter *e) +{ + init_list(&e->hooks); +} + +static struct idm rtable_idm; +uint rtable_max_id = 0; + rtable * rt_setup(pool *pp, struct rtable_config *cf) { + ASSERT_DIE(birdloop_inside(&main_birdloop)); + pool *p = rp_newf(pp, "Routing table %s", cf->name); - rtable *t = ralloc(p, &rt_class); + struct rtable_private *t = ralloc(p, &rt_class); t->rp = p; + t->rte_slab = sl_new(p, sizeof(struct rte_storage)); + t->name = cf->name; t->config = cf; t->addr_type = cf->addr_type; + t->id = idm_alloc(&rtable_idm); + if (t->id >= rtable_max_id) + rtable_max_id = t->id + 1; + + t->lock = DOMAIN_NEW(rtable, t->name); fib_init(&t->fib, p, t->addr_type, sizeof(net), OFFSETOF(net, n), 0, NULL); @@ -2144,27 +2871,48 @@ rt_setup(pool *pp, struct rtable_config *cf) t->fib.init = net_init_with_trie; } - init_list(&t->channels); - init_list(&t->flowspec_links); - init_list(&t->subscribers); + init_list(&t->imports); - if (!(t->internal = cf->internal)) - { - hmap_init(&t->id_map, p, 1024); - hmap_set(&t->id_map, 0); + hmap_init(&t->id_map, p, 1024); + hmap_set(&t->id_map, 0); - t->rt_event = ev_new_init(p, rt_event, t); - t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0); - t->last_rt_change = t->gc_time = current_time(); + t->fh = (struct birdloop_flag_handler) { .hook = rt_flag_handler, }; + t->nhu_uncork_event = ev_new_init(p, rt_nhu_uncork, t); + t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0); + t->last_rt_change = t->gc_time = current_time(); - if (rt_is_flow(t)) - { - t->flowspec_trie = f_new_trie(lp_new_default(p), 0); - t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4); - } + t->export_settle = SETTLE_INIT(&cf->export_settle, rt_announce_exports, NULL); + + t->exporter = (struct rt_table_exporter) { + .e = { + .class = &rt_table_exporter_class, + .addr_type = t->addr_type, + .rp = t->rp, + }, + .next_seq = 1, + }; + + rt_exporter_init(&t->exporter.e); + + init_list(&t->exporter.pending); + + t->cork_threshold = cf->cork_threshold; + + t->rl_pipe = (struct tbf) TBF_DEFAULT_LOG_LIMITS; + + if (rt_is_flow(RT_PUB(t))) + { + t->flowspec_trie = f_new_trie(lp_new_default(p), 0); + t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4); } - return t; + /* Start the service thread */ + t->loop = birdloop_new(p, DOMAIN_ORDER(service), mb_sprintf(p, "Routing table %s", t->name)); + birdloop_enter(t->loop); + birdloop_flag_set_handler(t->loop, &t->fh); + birdloop_leave(t->loop); + + return RT_PUB(t); } /** @@ -2178,9 +2926,11 @@ rt_init(void) { rta_init(); rt_table_pool = rp_new(&root_pool, "Routing tables"); - rte_update_pool = lp_new_default(rt_table_pool); - rte_slab = sl_new(rt_table_pool, sizeof(rte)); init_list(&routing_tables); + init_list(&deleted_routing_tables); + ev_init_list(&rt_cork.queue, &main_birdloop, "Route cork release"); + rt_cork.run = (event) { .hook = rt_cork_release_hook }; + idm_init(&rtable_idm, rt_table_pool, 256); } @@ -2199,15 +2949,15 @@ rt_init(void) * iteration. */ static void -rt_prune_table(rtable *tab) +rt_prune_table(struct rtable_private *tab) { struct fib_iterator *fit = &tab->prune_fit; int limit = 2000; - struct channel *c; + struct rt_import_hook *ih; node *n, *x; - DBG("Pruning route table %s\n", tab->name); + rt_trace(tab, D_STATES, "Pruning"); #ifdef DEBUGGING fib_check(&tab->fib); #endif @@ -2218,9 +2968,16 @@ rt_prune_table(rtable *tab) if (tab->prune_state == 1) { /* Mark channels to flush */ - WALK_LIST2(c, n, tab->channels, table_node) - if (c->channel_state == CS_FLUSHING) - c->flush_active = 1; + WALK_LIST2(ih, n, tab->imports, n) + if (ih->import_state == TIS_STOP) + rt_set_import_state(ih, TIS_FLUSHING); + else if ((ih->stale_valid != ih->stale_pruning) && (ih->stale_pruning == ih->stale_pruned)) + { + ih->stale_pruning = ih->stale_valid; + + if (ih->req->trace_routes & D_STATES) + log(L_TRACE "%s: table prune after refresh begin [%u]", ih->req->name, ih->stale_pruning); + } FIB_ITERATE_INIT(fit, &tab->fib); tab->prune_state = 2; @@ -2239,36 +2996,29 @@ rt_prune_table(rtable *tab) again: FIB_ITERATE_START(&tab->fib, fit, net, n) { - rte *e; - rescan: if (limit <= 0) { FIB_ITERATE_PUT(fit); - ev_schedule(tab->rt_event); + birdloop_flag(tab->loop, RTF_CLEANUP); return; } - for (e=n->routes; e; e=e->next) + for (struct rte_storage *e=n->routes; e; e=e->next) { - if (e->sender->flush_active || (e->flags & REF_DISCARD)) + struct rt_import_hook *s = e->rte.sender; + if ((s->import_state == TIS_FLUSHING) || + (e->rte.stale_cycle < s->stale_valid) || + (e->rte.stale_cycle > s->stale_set)) { - rte_discard(e); - limit--; - - goto rescan; - } - - if (e->flags & REF_MODIFY) - { - rte_modify(e); + rte_recalculate(tab, e->rte.sender, n, NULL, e->rte.src); limit--; goto rescan; } } - if (!n->routes) /* Orphaned FIB entry */ + if (!n->routes && !n->first) /* Orphaned FIB entry */ { FIB_ITERATE_PUT(fit); fib_delete(&tab->fib, n); @@ -2283,12 +3033,16 @@ again: } FIB_ITERATE_END; + rt_trace(tab, D_EVENTS, "Prune done, scheduling export timer"); + rt_kick_export_settle(tab); + #ifdef DEBUGGING fib_check(&tab->fib); #endif /* state change 2->0, 3->1 */ - tab->prune_state &= 1; + if (tab->prune_state &= 1) + birdloop_flag(tab->loop, RTF_CLEANUP); if (tab->trie_new) { @@ -2321,21 +3075,215 @@ again: } } - if (tab->prune_state > 0) - ev_schedule(tab->rt_event); + /* Close flushed channels */ + WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) + if (ih->import_state == TIS_FLUSHING) + { + DBG("flushing %s %s rr %u", ih->req->name, tab->name, tab->rr_counter); + ih->flush_seq = tab->exporter.next_seq; + rt_set_import_state(ih, TIS_WAITING); + tab->rr_counter--; + tab->wait_counter++; + } + else if (ih->stale_pruning != ih->stale_pruned) + { + DBG("pruning %s %s rr %u set %u valid %u pruning %u pruned %u", ih->req->name, tab->name, tab->rr_counter, ih->stale_set, ih->stale_valid, ih->stale_pruning, ih->stale_pruned); + tab->rr_counter -= (ih->stale_pruning - ih->stale_pruned); + ih->stale_pruned = ih->stale_pruning; + if (ih->req->trace_routes & D_STATES) + log(L_TRACE "%s: table prune after refresh end [%u]", ih->req->name, ih->stale_pruned); + } - /* FIXME: This should be handled in a better way */ - rt_prune_sources(); + /* In some cases, we may want to directly proceed to export cleanup */ + if (tab->wait_counter && (EMPTY_LIST(tab->exporter.e.hooks) || !tab->exporter.first)) + rt_export_cleanup(tab); +} - /* Close flushed channels */ - WALK_LIST2_DELSAFE(c, n, x, tab->channels, table_node) - if (c->flush_active) +static void +rt_export_cleanup(struct rtable_private *tab) +{ + tab->export_used = 0; + + u64 min_seq = ~((u64) 0); + struct rt_pending_export *last_export_to_free = NULL; + struct rt_pending_export *first = tab->exporter.first; + int want_prune = 0; + + struct rt_table_export_hook *eh; + node *n; + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) + { + switch (atomic_load_explicit(&eh->h.export_state, memory_order_acquire)) + { + /* Export cleanup while feeding isn't implemented */ + case TES_FEEDING: + goto done; + + /* States not interfering with export cleanup */ + case TES_DOWN: /* This should not happen at all */ + log(L_WARN "%s: Export cleanup found hook %s in explicit state TES_DOWN", tab->name, eh->h.req->name); + /* fall through */ + case TES_HUNGRY: /* Feeding waiting for uncork */ + case TES_STOP: /* No more export will happen on this hook */ + continue; + + /* Regular export */ + case TES_READY: + { + struct rt_pending_export *last = atomic_load_explicit(&eh->last_export, memory_order_acquire); + if (!last) + /* No last export means that the channel has exported nothing since last cleanup */ + goto done; + + else if (min_seq > last->seq) + { + min_seq = last->seq; + last_export_to_free = last; + } + continue; + } + + default: + bug("%s: Strange export state of hook %s: %d", tab->name, eh->h.req->name, atomic_load_explicit(&eh->h.export_state, memory_order_relaxed)); + } + } + + tab->exporter.first = last_export_to_free ? rt_next_export_fast(last_export_to_free) : NULL; + + rt_trace(tab, D_STATES, "Export cleanup, old exporter.first seq %lu, new %lu, min_seq %ld", + first ? first->seq : 0, + tab->exporter.first ? tab->exporter.first->seq : 0, + min_seq); + + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) + { + if (atomic_load_explicit(&eh->h.export_state, memory_order_acquire) != TES_READY) + continue; + + struct rt_pending_export *last = atomic_load_explicit(&eh->last_export, memory_order_acquire); + if (last == last_export_to_free) + { + /* This may fail when the channel managed to export more inbetween. This is OK. */ + atomic_compare_exchange_strong_explicit( + &eh->last_export, &last, NULL, + memory_order_release, + memory_order_relaxed); + + DBG("store hook=%p last_export=NULL\n", eh); + } + } + + while (first && (first->seq <= min_seq)) + { + ASSERT_DIE(first->new || first->old); + + const net_addr *n = first->new ? + first->new->rte.net : + first->old->rte.net; + net *net = SKIP_BACK(struct network, n.addr, (net_addr (*)[0]) n); + + ASSERT_DIE(net->first == first); + + if (first == net->last) + /* The only export here */ + net->last = net->first = NULL; + else + /* First is now the next one */ + net->first = atomic_load_explicit(&first->next, memory_order_relaxed); + + want_prune += !net->routes && !net->first; + + /* For now, the old route may be finally freed */ + if (first->old) + { + rt_rte_trace_in(D_ROUTES, first->old->rte.sender->req, &first->old->rte, "freed"); + hmap_clear(&tab->id_map, first->old->rte.id); + rte_free(first->old); + } + +#ifdef LOCAL_DEBUG + memset(first, 0xbd, sizeof(struct rt_pending_export)); +#endif + + struct rt_export_block *reb = HEAD(tab->exporter.pending); + ASSERT_DIE(reb == PAGE_HEAD(first)); + + u32 pos = (first - &reb->export[0]); + u32 end = atomic_load_explicit(&reb->end, memory_order_relaxed); + ASSERT_DIE(pos < end); + + struct rt_pending_export *next = NULL; + + if (++pos < end) + next = &reb->export[pos]; + else + { + rem_node(&reb->n); + +#ifdef LOCAL_DEBUG + memset(reb, 0xbe, page_size); +#endif + + free_page(reb); + + if (EMPTY_LIST(tab->exporter.pending)) { - c->flush_active = 0; - channel_set_state(c, CS_DOWN); + rt_trace(tab, D_EVENTS, "Resetting export seq"); + + node *n; + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) + { + if (atomic_load_explicit(&eh->h.export_state, memory_order_acquire) != TES_READY) + continue; + + ASSERT_DIE(atomic_load_explicit(&eh->last_export, memory_order_acquire) == NULL); + bmap_reset(&eh->h.seq_map, 16); + } + + tab->exporter.next_seq = 1; } + else + { + reb = HEAD(tab->exporter.pending); + next = &reb->export[0]; + } + } + + first = next; + } + + rt_check_cork_low(tab); + +done:; + struct rt_import_hook *ih; node *x; + if (tab->wait_counter) + WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) + if (ih->import_state == TIS_WAITING) + if (!first || (first->seq >= ih->flush_seq)) + { + ih->import_state = TIS_CLEARED; + tab->wait_counter--; + ev_send(ih->req->list, &ih->announce_event); + } + + if ((tab->gc_counter += want_prune) >= tab->config->gc_threshold) + rt_kick_prune_timer(tab); - return; + if (tab->export_used) + birdloop_flag(tab->loop, RTF_CLEANUP); + + if (EMPTY_LIST(tab->exporter.pending)) + settle_cancel(&tab->export_settle); +} + +static void +rt_cork_release_hook(void *data UNUSED) +{ + do synchronize_rcu(); + while ( + !atomic_load_explicit(&rt_cork.active, memory_order_acquire) && + ev_run_list(&rt_cork.queue) + ); } /** @@ -2352,7 +3300,7 @@ again: * */ struct f_trie * -rt_lock_trie(rtable *tab) +rt_lock_trie(struct rtable_private *tab) { ASSERT(tab->trie); @@ -2369,7 +3317,7 @@ rt_lock_trie(rtable *tab) * It may free the trie and schedule next trie pruning. */ void -rt_unlock_trie(rtable *tab, struct f_trie *trie) +rt_unlock_trie(struct rtable_private *tab, struct f_trie *trie) { ASSERT(trie); @@ -2395,7 +3343,7 @@ rt_unlock_trie(rtable *tab, struct f_trie *trie) if (tab->trie && (tab->trie->prefix_count > (2 * tab->fib.entries))) { tab->prune_trie = 1; - rt_schedule_prune(tab); + rt_kick_prune_timer(tab); } } } @@ -2409,8 +3357,8 @@ rt_preconfig(struct config *c) { init_list(&c->tables); - rt_new_table(cf_get_symbol("master4"), NET_IP4); - rt_new_table(cf_get_symbol("master6"), NET_IP6); + c->def_tables[NET_IP4] = cf_define_symbol(cf_get_symbol("master4"), SYM_TABLE, table, NULL); + c->def_tables[NET_IP6] = cf_define_symbol(cf_get_symbol("master6"), SYM_TABLE, table, NULL); } void @@ -2425,6 +3373,13 @@ rt_postconfig(struct config *c) WALK_LIST(rc, c->tables) if (rc->gc_period == (uint) -1) rc->gc_period = (uint) def_gc_period; + + for (uint net_type = 0; net_type < NET_MAX; net_type++) + if (c->def_tables[net_type] && !c->def_tables[net_type]->table) + { + c->def_tables[net_type]->class = SYM_VOID; + c->def_tables[net_type] = NULL; + } } @@ -2434,135 +3389,168 @@ rt_postconfig(struct config *c) */ void -rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls) +ea_set_hostentry(ea_list **to, rtable *dep, rtable *src, ip_addr gw, ip_addr ll, u32 lnum, u32 labels[lnum]) { - a->hostentry = he; - a->dest = he->dest; - a->igp_metric = he->igp_metric; + struct { + struct adata ad; + struct hostentry *he; + u32 labels[0]; + } *head = (void *) tmp_alloc_adata(sizeof *head + sizeof(u32) * lnum - sizeof(struct adata)); + + RT_LOCKED(src, tab) + head->he = rt_get_hostentry(tab, gw, ll, dep); + memcpy(head->labels, labels, lnum * sizeof(u32)); + + ea_set_attr(to, EA_LITERAL_DIRECT_ADATA( + &ea_gen_hostentry, 0, &head->ad)); +} - if (a->dest != RTD_UNICAST) + +static void +rta_apply_hostentry(ea_list **to, struct hostentry_adata *head) +{ + struct hostentry *he = head->he; + u32 *labels = head->labels; + u32 lnum = (u32 *) (head->ad.data + head->ad.length) - labels; + + ea_set_attr_u32(to, &ea_gen_igp_metric, 0, he->igp_metric); + + if (!he->src) { - /* No nexthop */ -no_nexthop: - a->nh = (struct nexthop) {}; - if (mls) - { /* Store the label stack for later changes */ - a->nh.labels_orig = a->nh.labels = mls->len; - memcpy(a->nh.label, mls->stack, mls->len * sizeof(u32)); - } + ea_set_dest(to, 0, RTD_UNREACHABLE); return; } - if (((!mls) || (!mls->len)) && he->nexthop_linkable) + eattr *he_nh_ea = ea_find(he->src, &ea_gen_nexthop); + ASSERT_DIE(he_nh_ea); + + struct nexthop_adata *nhad = (struct nexthop_adata *) he_nh_ea->u.ptr; + int idest = nhea_dest(he_nh_ea); + + if ((idest != RTD_UNICAST) || + !lnum && he->nexthop_linkable) { /* Just link the nexthop chain, no label append happens. */ - memcpy(&(a->nh), &(he->src->nh), nexthop_size(&(he->src->nh))); + ea_copy_attr(to, he->src, &ea_gen_nexthop); return; } - struct nexthop *nhp = NULL, *nhr = NULL; - int skip_nexthop = 0; + uint total_size = OFFSETOF(struct nexthop_adata, nh); - for (struct nexthop *nh = &(he->src->nh); nh; nh = nh->next) + NEXTHOP_WALK(nh, nhad) { - if (skip_nexthop) - skip_nexthop--; - else + if (nh->labels + lnum > MPLS_MAX_LABEL_STACK) { - nhr = nhp; - nhp = (nhp ? (nhp->next = lp_alloc(rte_update_pool, NEXTHOP_MAX_SIZE)) : &(a->nh)); + log(L_WARN "Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)", + nh->labels, lnum, nh->labels + lnum, MPLS_MAX_LABEL_STACK); + continue; } - memset(nhp, 0, NEXTHOP_MAX_SIZE); - nhp->iface = nh->iface; - nhp->weight = nh->weight; + total_size += NEXTHOP_SIZE_CNT(nh->labels + lnum); + } - if (mls) - { - nhp->labels = nh->labels + mls->len; - nhp->labels_orig = mls->len; - if (nhp->labels <= MPLS_MAX_LABEL_STACK) - { - memcpy(nhp->label, nh->label, nh->labels * sizeof(u32)); /* First the hostentry labels */ - memcpy(&(nhp->label[nh->labels]), mls->stack, mls->len * sizeof(u32)); /* Then the bottom labels */ - } - else - { - log(L_WARN "Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)", - nh->labels, mls->len, nhp->labels, MPLS_MAX_LABEL_STACK); - skip_nexthop++; - continue; - } - } - else if (nh->labels) + if (total_size == OFFSETOF(struct nexthop_adata, nh)) + { + log(L_WARN "No valid nexthop remaining, setting route unreachable"); + + struct nexthop_adata nha = { + .ad.length = NEXTHOP_DEST_SIZE, + .dest = RTD_UNREACHABLE, + }; + + ea_set_attr_data(to, &ea_gen_nexthop, 0, &nha.ad.data, nha.ad.length); + return; + } + + struct nexthop_adata *new = (struct nexthop_adata *) tmp_alloc_adata(total_size); + struct nexthop *dest = &new->nh; + + NEXTHOP_WALK(nh, nhad) + { + if (nh->labels + lnum > MPLS_MAX_LABEL_STACK) + continue; + + memcpy(dest, nh, NEXTHOP_SIZE(nh)); + if (lnum) { - nhp->labels = nh->labels; - nhp->labels_orig = 0; - memcpy(nhp->label, nh->label, nh->labels * sizeof(u32)); + memcpy(&(dest->label[dest->labels]), labels, lnum * sizeof labels[0]); + dest->labels += lnum; } if (ipa_nonzero(nh->gw)) - { - nhp->gw = nh->gw; /* Router nexthop */ - nhp->flags |= (nh->flags & RNF_ONLINK); - } + /* Router nexthop */ + dest->flags = (dest->flags & RNF_ONLINK); else if (!(nh->iface->flags & IF_MULTIACCESS) || (nh->iface->flags & IF_LOOPBACK)) - nhp->gw = IPA_NONE; /* PtP link - no need for nexthop */ + dest->gw = IPA_NONE; /* PtP link - no need for nexthop */ else if (ipa_nonzero(he->link)) - nhp->gw = he->link; /* Device nexthop with link-local address known */ + dest->gw = he->link; /* Device nexthop with link-local address known */ else - nhp->gw = he->addr; /* Device nexthop with link-local address unknown */ + dest->gw = he->addr; /* Device nexthop with link-local address unknown */ + + dest = NEXTHOP_NEXT(dest); } - if (skip_nexthop) - if (nhr) - nhr->next = NULL; - else - { - a->dest = RTD_UNREACHABLE; - log(L_WARN "No valid nexthop remaining, setting route unreachable"); - goto no_nexthop; - } + /* Fix final length */ + new->ad.length = (void *) dest - (void *) new->ad.data; + ea_set_attr(to, EA_LITERAL_DIRECT_ADATA( + &ea_gen_nexthop, 0, &new->ad)); } -static inline int -rta_next_hop_outdated(rta *a) +static inline struct hostentry_adata * +rta_next_hop_outdated(ea_list *a) { - struct hostentry *he = a->hostentry; + /* First retrieve the hostentry */ + eattr *heea = ea_find(a, &ea_gen_hostentry); + if (!heea) + return NULL; - if (!he) - return 0; + struct hostentry_adata *head = (struct hostentry_adata *) heea->u.ptr; - if (!he->src) - return a->dest != RTD_UNREACHABLE; + /* If no nexthop is present, we have to create one */ + eattr *a_nh_ea = ea_find(a, &ea_gen_nexthop); + if (!a_nh_ea) + return head; + + struct nexthop_adata *nhad = (struct nexthop_adata *) a_nh_ea->u.ptr; + + /* Shortcut for unresolvable hostentry */ + if (!head->he->src) + return NEXTHOP_IS_REACHABLE(nhad) ? head : NULL; - return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) || - (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh)); + /* Comparing our nexthop with the hostentry nexthop */ + eattr *he_nh_ea = ea_find(head->he->src, &ea_gen_nexthop); + + return ( + (ea_get_int(a, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN) != head->he->igp_metric) || + (!head->he->nexthop_linkable) || + (!he_nh_ea != !a_nh_ea) || + (he_nh_ea && a_nh_ea && !adata_same(he_nh_ea->u.ptr, a_nh_ea->u.ptr))) + ? head : NULL; } -static inline rte * -rt_next_hop_update_rte(rtable *tab UNUSED, rte *old) +static inline int +rt_next_hop_update_rte(rte *old, rte *new) { - if (!rta_next_hop_outdated(old->attrs)) - return NULL; - - rta *a = alloca(RTA_MAX_SIZE); - memcpy(a, old->attrs, rta_size(old->attrs)); + struct hostentry_adata *head = rta_next_hop_outdated(old->attrs); + if (!head) + return 0; - mpls_label_stack mls = { .len = a->nh.labels_orig }; - memcpy(mls.stack, &a->nh.label[a->nh.labels - mls.len], mls.len * sizeof(u32)); + *new = *old; + rta_apply_hostentry(&new->attrs, head); + return 1; +} - rta_apply_hostentry(a, old->attrs->hostentry, &mls); - a->cached = 0; +static inline void +rt_next_hop_resolve_rte(rte *r) +{ + eattr *heea = ea_find(r->attrs, &ea_gen_hostentry); + if (!heea) + return; - rte *e = sl_alloc(rte_slab); - memcpy(e, old, sizeof(rte)); - e->attrs = rta_lookup(a); - rt_lock_source(e->src); + struct hostentry_adata *head = (struct hostentry_adata *) heea->u.ptr; - return e; + rta_apply_hostentry(&r->attrs, head); } - #ifdef CONFIG_BGP static inline int @@ -2586,35 +3574,34 @@ net_flow_has_dst_prefix(const net_addr *n) } static inline int -rta_as_path_is_empty(rta *a) +rta_as_path_is_empty(ea_list *a) { - eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + eattr *e = ea_find(a, "bgp_path"); return !e || (as_path_getlen(e->u.ptr) == 0); } static inline u32 -rta_get_first_asn(rta *a) +rta_get_first_asn(ea_list *a) { - eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + eattr *e = ea_find(a, "bgp_path"); u32 asn; return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0; } -int -rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior) +static inline enum flowspec_valid +rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, ea_list *a, int interior) { ASSERT(rt_is_ip(tab_ip)); ASSERT(rt_is_flow(tab_flow)); - ASSERT(tab_ip->trie); /* RFC 8955 6. a) Flowspec has defined dst prefix */ if (!net_flow_has_dst_prefix(n)) - return 0; + return FLOWSPEC_INVALID; /* RFC 9117 4.1. Accept AS_PATH is empty (fr */ if (interior && rta_as_path_is_empty(a)) - return 1; + return FLOWSPEC_VALID; /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */ @@ -2626,146 +3613,239 @@ rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, i else net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n)); - /* Find best-match BGP unicast route for flowspec dst prefix */ - net *nb = net_route(tab_ip, &dst); - rte *rb = nb ? nb->routes : NULL; + rte rb = {}; + net_addr_union nau; + RT_LOCKED(tab_ip, tip) + { + ASSERT(tip->trie); + /* Find best-match BGP unicast route for flowspec dst prefix */ + net *nb = net_route(tip, &dst); + if (nb) + { + rb = RTE_COPY_VALID(nb->routes); + rta_clone(rb.attrs); + net_copy(&nau.n, nb->n.addr); + rb.net = &nau.n; + } + } /* Register prefix to trie for tracking further changes */ int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH; - trie_add_prefix(tab_flow->flowspec_trie, &dst, (nb ? nb->n.addr->pxlen : 0), max_pxlen); + RT_LOCKED(tab_flow, tfl) + trie_add_prefix(tfl->flowspec_trie, &dst, (rb.net ? rb.net->pxlen : 0), max_pxlen); /* No best-match BGP route -> no flowspec */ - if (!rb || (rb->attrs->source != RTS_BGP)) - return 0; + if (!rb.attrs || (rt_get_source_attr(&rb) != RTS_BGP)) + return FLOWSPEC_INVALID; /* Find ORIGINATOR_ID values */ - u32 orig_a = ea_get_int(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0); - u32 orig_b = ea_get_int(rb->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0); + u32 orig_a = ea_get_int(a, "bgp_originator_id", 0); + u32 orig_b = ea_get_int(rb.attrs, "bgp_originator_id", 0); /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */ - if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal(a->from, rb->attrs->from))) - return 0; + if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal( + ea_get_ip(a, &ea_gen_from, IPA_NONE), + ea_get_ip(rb.attrs, &ea_gen_from, IPA_NONE) + ))) + return FLOWSPEC_INVALID; /* Find ASN of the best-match route, for use in next checks */ - u32 asn_b = rta_get_first_asn(rb->attrs); + u32 asn_b = rta_get_first_asn(rb.attrs); if (!asn_b) - return 0; + return FLOWSPEC_INVALID; /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */ if (!interior && (rta_get_first_asn(a) != asn_b)) - return 0; + return FLOWSPEC_INVALID; /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */ - TRIE_WALK(tab_ip->trie, subnet, &dst) + RT_LOCKED(tab_ip, tip) { - net *nc = net_find_valid(tab_ip, &subnet); - if (!nc) - continue; + TRIE_WALK(tip->trie, subnet, &dst) + { + net *nc = net_find_valid(tip, &subnet); + if (!nc) + continue; - rte *rc = nc->routes; - if (rc->attrs->source != RTS_BGP) - return 0; + const rte *rc = &nc->routes->rte; + if (rt_get_source_attr(rc) != RTS_BGP) + RT_RETURN(tip, FLOWSPEC_INVALID); - if (rta_get_first_asn(rc->attrs) != asn_b) - return 0; + if (rta_get_first_asn(rc->attrs) != asn_b) + RT_RETURN(tip, FLOWSPEC_INVALID); + } + TRIE_WALK_END; } - TRIE_WALK_END; - return 1; + return FLOWSPEC_VALID; } #endif /* CONFIG_BGP */ -static rte * -rt_flowspec_update_rte(rtable *tab, rte *r) +static int +rt_flowspec_update_rte(rtable *tab, rte *r, rte *new) { #ifdef CONFIG_BGP - if ((r->attrs->source != RTS_BGP) || (r->sender->proto != r->src->proto)) - return NULL; + if (r->generation || (rt_get_source_attr(r) != RTS_BGP)) + return 0; - struct bgp_channel *bc = (struct bgp_channel *) r->sender; + struct bgp_channel *bc = (struct bgp_channel *) SKIP_BACK(struct channel, in_req, r->sender->req); if (!bc->base_table) - return NULL; - - const net_addr *n = r->net->n.addr; - struct bgp_proto *p = (void *) r->src->proto; - int valid = rt_flowspec_check(bc->base_table, tab, n, r->attrs, p->is_interior); - int dest = valid ? RTD_NONE : RTD_UNREACHABLE; + return 0; - if (dest == r->attrs->dest) - return NULL; + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, p, bc->c.proto); - rta *a = alloca(RTA_MAX_SIZE); - memcpy(a, r->attrs, rta_size(r->attrs)); - a->dest = dest; - a->cached = 0; + enum flowspec_valid old = rt_get_flowspec_valid(r), + valid = rt_flowspec_check(bc->base_table, tab, r->net, r->attrs, p->is_interior); - rte *new = sl_alloc(rte_slab); - memcpy(new, r, sizeof(rte)); - new->attrs = rta_lookup(a); - rt_lock_source(new->src); + if (old == valid) + return 0; - return new; + *new = *r; + ea_set_attr_u32(&new->attrs, &ea_gen_flowspec_valid, 0, valid); + return 1; #else - return NULL; + return 0; #endif } +static inline void +rt_flowspec_resolve_rte(rte *r, struct channel *c) +{ +#ifdef CONFIG_BGP + enum flowspec_valid valid, old = rt_get_flowspec_valid(r); + struct bgp_channel *bc = (struct bgp_channel *) c; + + if ( (rt_get_source_attr(r) == RTS_BGP) + && (c->channel == &channel_bgp) + && (bc->base_table)) + { + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, p, bc->c.proto); + valid = rt_flowspec_check( + bc->base_table, + c->in_req.hook->table, + r->net, r->attrs, p->is_interior); + } + else + valid = FLOWSPEC_UNKNOWN; + + if (valid == old) + return; + + if (valid == FLOWSPEC_UNKNOWN) + ea_unset_attr(&r->attrs, 0, &ea_gen_flowspec_valid); + else + ea_set_attr_u32(&r->attrs, &ea_gen_flowspec_valid, 0, valid); +#endif +} static inline int -rt_next_hop_update_net(rtable *tab, net *n) +rt_next_hop_update_net(struct rtable_private *tab, net *n) { - rte **k, *e, *new, *old_best, **new_best; - int count = 0; - int free_old_best = 0; + uint count = 0; + int is_flow = net_is_flow(n->n.addr); - old_best = n->routes; + struct rte_storage *old_best = n->routes; if (!old_best) return 0; - for (k = &n->routes; e = *k; k = &e->next) + for (struct rte_storage *e, **k = &n->routes; e = *k; k = &e->next) + count++; + + if (!count) + return 0; + + struct rte_multiupdate { + struct rte_storage *old, *new_stored; + rte new; + } *updates = tmp_allocz(sizeof(struct rte_multiupdate) * (count+1)); + + struct rt_pending_export *last_pending = n->last; + + uint pos = 0; + for (struct rte_storage *e, **k = &n->routes; e = *k; k = &e->next) + updates[pos++].old = e; + + /* This is an exceptional place where table can be unlocked while keeping its data: + * the reason why this is safe is that NHU must be always run from the same + * thread as cleanup routines, therefore the only real problem may arise when + * some importer does a change on this particular net (destination) while NHU + * is being computed. Statistically, this should almost never happen. In such + * case, we just drop all the computed changes and do it once again. + * */ + RT_UNLOCK(tab); + + uint mod = 0; + if (is_flow) + for (uint i = 0; i < pos; i++) + mod += rt_flowspec_update_rte(RT_PUB(tab), &updates[i].old->rte, &updates[i].new); + + else + for (uint i = 0; i < pos; i++) + mod += rt_next_hop_update_rte(&updates[i].old->rte, &updates[i].new); + + RT_LOCK(RT_PUB(tab)); + + if (!mod) + return 0; + + /* Something has changed inbetween, retry NHU. */ + if (last_pending != n->last) + return rt_next_hop_update_net(tab, n); + + /* Now we reconstruct the original linked list */ + struct rte_storage **nptr = &n->routes; + for (uint i = 0; i < pos; i++) { - if (!net_is_flow(n->n.addr)) - new = rt_next_hop_update_rte(tab, e); + updates[i].old->next = NULL; + + struct rte_storage *put; + if (updates[i].new.attrs) + put = updates[i].new_stored = rte_store(&updates[i].new, n, tab); else - new = rt_flowspec_update_rte(tab, e); + put = updates[i].old; - if (new) - { - *k = new; + *nptr = put; + nptr = &put->next; + } + *nptr = NULL; - rte_trace_in(D_ROUTES, new->sender, new, "updated"); - rte_announce_i(tab, RA_ANY, n, new, e, NULL, NULL); + /* Call the pre-comparison hooks */ + for (uint i = 0; i < pos; i++) + if (updates[i].new_stored) + { + /* Get a new ID for the route */ + updates[i].new_stored->rte.lastmod = current_time(); + updates[i].new_stored->rte.id = hmap_first_zero(&tab->id_map); + hmap_set(&tab->id_map, updates[i].new_stored->rte.id); /* Call a pre-comparison hook */ /* Not really an efficient way to compute this */ - if (e->src->proto->rte_recalculate) - e->src->proto->rte_recalculate(tab, n, new, e, NULL); - - if (e != old_best) - rte_free_quick(e); - else /* Freeing of the old best rte is postponed */ - free_old_best = 1; - - e = new; - count++; + if (updates[i].old->rte.src->owner->rte_recalculate) + updates[i].old->rte.src->owner->rte_recalculate(tab, n, &updates[i].new_stored->rte, &updates[i].old->rte, &old_best->rte); } - } - if (!count) - return 0; +#if DEBUGGING + { + uint t = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + t++; + ASSERT_DIE(t == pos); + ASSERT_DIE(pos == count); + } +#endif /* Find the new best route */ - new_best = NULL; - for (k = &n->routes; e = *k; k = &e->next) + struct rte_storage **new_best = NULL; + for (struct rte_storage *e, **k = &n->routes; e = *k; k = &e->next) { - if (!new_best || rte_better(e, *new_best)) + if (!new_best || rte_better(&e->rte, &(*new_best)->rte)) new_best = k; } /* Relink the new best route to the first position */ - new = *new_best; + struct rte_storage *new = *new_best; if (new != n->routes) { *new_best = new->next; @@ -2773,84 +3853,170 @@ rt_next_hop_update_net(rtable *tab, net *n) n->routes = new; } - /* Announce the new best route */ - if (new != old_best) - rte_trace_in(D_ROUTES, new->sender, new, "updated [best]"); + uint total = 0; + /* Announce the changes */ + for (uint i=0; i<count; i++) + { + if (!updates[i].new_stored) + continue; - /* Propagate changes */ - rte_announce_i(tab, RA_UNDEF, n, NULL, NULL, n->routes, old_best); + _Bool nb = (new->rte.src == updates[i].new.src), ob = (i == 0); + const char *best_indicator[2][2] = { + { "autoupdated", "autoupdated [-best]" }, + { "autoupdated [+best]", "autoupdated [best]" } + }; + rt_rte_trace_in(D_ROUTES, updates[i].new.sender->req, &updates[i].new, best_indicator[nb][ob]); + rte_announce(tab, n, updates[i].new_stored, updates[i].old, new, old_best); - if (free_old_best) - rte_free_quick(old_best); + total++; + } - return count; + return total; } static void -rt_next_hop_update(rtable *tab) +rt_nhu_uncork(void *_tab) { - struct fib_iterator *fit = &tab->nhu_fit; - int max_feed = 32; + RT_LOCKED((rtable *) _tab, tab) + { + ASSERT_DIE(tab->nhu_corked); + ASSERT_DIE(tab->nhu_state == 0); + + /* Reset the state */ + tab->nhu_state = tab->nhu_corked; + tab->nhu_corked = 0; + rt_trace(tab, D_STATES, "Next hop updater uncorked"); + + birdloop_flag(tab->loop, RTF_NHU); + } +} + +static void +rt_next_hop_update(struct rtable_private *tab) +{ + ASSERT_DIE(birdloop_inside(tab->loop)); - if (tab->nhu_state == NHU_CLEAN) + if (tab->nhu_corked) return; + if (!tab->nhu_state) + return; + + /* Check corkedness */ + if (rt_cork_check(tab->nhu_uncork_event)) + { + rt_trace(tab, D_STATES, "Next hop updater corked"); + if ((tab->nhu_state & NHU_RUNNING) + && !EMPTY_LIST(tab->exporter.pending)) + rt_kick_export_settle(tab); + + tab->nhu_corked = tab->nhu_state; + tab->nhu_state = 0; + return; + } + + struct fib_iterator *fit = &tab->nhu_fit; + int max_feed = 32; + + /* Initialize a new run */ if (tab->nhu_state == NHU_SCHEDULED) - { - FIB_ITERATE_INIT(fit, &tab->fib); - tab->nhu_state = NHU_RUNNING; + { + FIB_ITERATE_INIT(fit, &tab->fib); + tab->nhu_state = NHU_RUNNING; - if (tab->flowspec_trie) - rt_flowspec_reset_trie(tab); - } + if (tab->flowspec_trie) + rt_flowspec_reset_trie(tab); + } + /* Walk the fib one net after another */ FIB_ITERATE_START(&tab->fib, fit, net, n) { if (max_feed <= 0) { FIB_ITERATE_PUT(fit); - ev_schedule(tab->rt_event); + birdloop_flag(tab->loop, RTF_NHU); return; } + lp_state lps; + lp_save(tmp_linpool, &lps); max_feed -= rt_next_hop_update_net(tab, n); + lp_restore(tmp_linpool, &lps); } FIB_ITERATE_END; + /* Finished NHU, cleanup */ + rt_trace(tab, D_EVENTS, "NHU done, scheduling export timer"); + rt_kick_export_settle(tab); + /* State change: * NHU_DIRTY -> NHU_SCHEDULED * NHU_RUNNING -> NHU_CLEAN */ - tab->nhu_state &= 1; + if ((tab->nhu_state &= NHU_SCHEDULED) == NHU_SCHEDULED) + birdloop_flag(tab->loop, RTF_NHU); +} + +void +rt_new_default_table(struct symbol *s) +{ + for (uint addr_type = 0; addr_type < NET_MAX; addr_type++) + if (s == new_config->def_tables[addr_type]) + { + ASSERT_DIE(!s->table); + s->table = rt_new_table(s, addr_type); + return; + } - if (tab->nhu_state != NHU_CLEAN) - ev_schedule(tab->rt_event); + bug("Requested an unknown new default table: %s", s->name); } +struct rtable_config * +rt_get_default_table(struct config *cf, uint addr_type) +{ + struct symbol *ts = cf->def_tables[addr_type]; + if (!ts) + return NULL; + + if (!ts->table) + rt_new_default_table(ts); + + return ts->table; +} struct rtable_config * rt_new_table(struct symbol *s, uint addr_type) { - /* Hack that allows to 'redefine' the master table */ - if ((s->class == SYM_TABLE) && - (s->table == new_config->def_tables[addr_type]) && - ((addr_type == NET_IP4) || (addr_type == NET_IP6))) - return s->table; + if (s->table) + cf_error("Duplicate configuration of table %s", s->name); struct rtable_config *c = cfg_allocz(sizeof(struct rtable_config)); - cf_define_symbol(s, SYM_TABLE, table, c); + if (s == new_config->def_tables[addr_type]) + s->table = c; + else + cf_define_symbol(s, SYM_TABLE, table, c); + c->name = s->name; c->addr_type = addr_type; c->gc_threshold = 1000; c->gc_period = (uint) -1; /* set in rt_postconfig() */ - c->min_settle_time = 1 S; - c->max_settle_time = 20 S; + c->cork_threshold.low = 1024; + c->cork_threshold.high = 8192; + c->export_settle = (struct settle_config) { + .min = 1 MS, + .max = 100 MS, + }; + c->export_rr_settle = (struct settle_config) { + .min = 100 MS, + .max = 3 S, + }; + c->debug = new_config->table_debug; add_tail(&new_config->tables, &c->n); /* First table of each type is kept as default */ if (! new_config->def_tables[addr_type]) - new_config->def_tables[addr_type] = c; + new_config->def_tables[addr_type] = s; return c; } @@ -2864,8 +4030,9 @@ rt_new_table(struct symbol *s, uint addr_type) * configuration. */ void -rt_lock_table(rtable *r) +rt_lock_table_priv(struct rtable_private *r, const char *file, uint line) { + rt_trace(r, D_STATES, "Locked at %s:%d", file, line); r->use_count++; } @@ -2878,20 +4045,73 @@ rt_lock_table(rtable *r) * for deletion by configuration changes. */ void -rt_unlock_table(rtable *r) +rt_unlock_table_priv(struct rtable_private *r, const char *file, uint line) { + rt_trace(r, D_STATES, "Unlocked at %s:%d", file, line); if (!--r->use_count && r->deleted) - { - struct config *conf = r->deleted; + /* Stop the service thread to finish this up */ + ev_send(&global_event_list, ev_new_init(r->rp, rt_shutdown, r)); +} - /* Delete the routing table by freeing its pool */ - rt_shutdown(r); - config_del_obstacle(conf); - } +static void +rt_shutdown(void *tab_) +{ + struct rtable_private *r = tab_; + birdloop_stop(r->loop, rt_delete, r); } +static void +rt_delete(void *tab_) +{ + birdloop_enter(&main_birdloop); + + /* We assume that nobody holds the table reference now as use_count is zero. + * Anyway the last holder may still hold the lock. Therefore we lock and + * unlock it the last time to be sure that nobody is there. */ + struct rtable_private *tab = RT_LOCK((rtable *) tab_); + struct config *conf = tab->deleted; + + RT_UNLOCK(RT_PUB(tab)); + + birdloop_free(tab->loop); + rfree(tab->rp); + config_del_obstacle(conf); + + birdloop_leave(&main_birdloop); +} + + +static void +rt_check_cork_low(struct rtable_private *tab) +{ + if (!tab->cork_active) + return; + + if (tab->deleted || !tab->exporter.first || (tab->exporter.first->seq + tab->cork_threshold.low > tab->exporter.next_seq)) + { + tab->cork_active = 0; + rt_cork_release(); + + rt_trace(tab, D_STATES, "Uncorked"); + } +} + +static void +rt_check_cork_high(struct rtable_private *tab) +{ + if (!tab->deleted && !tab->cork_active && tab->exporter.first && (tab->exporter.first->seq + tab->cork_threshold.high <= tab->exporter.next_seq)) + { + tab->cork_active = 1; + rt_cork_acquire(); + rt_export_used(&tab->exporter, tab->name, "corked"); + + rt_trace(tab, D_STATES, "Corked"); + } +} + + static int -rt_reconfigure(rtable *tab, struct rtable_config *new, struct rtable_config *old) +rt_reconfigure(struct rtable_private *tab, struct rtable_config *new, struct rtable_config *old) { if ((new->addr_type != old->addr_type) || (new->sorted != old->sorted) || @@ -2899,10 +4119,26 @@ rt_reconfigure(rtable *tab, struct rtable_config *new, struct rtable_config *old return 0; DBG("\t%s: same\n", new->name); - new->table = tab; + new->table = RT_PUB(tab); tab->name = new->name; tab->config = new; + if (tab->hostcache) + tab->hostcache->req.trace_routes = new->debug; + + struct rt_table_export_hook *hook; node *n; + WALK_LIST2(hook, n, tab->exporter.e.hooks, h.n) + if (hook->h.req->export_one == rt_flowspec_export_one) + hook->h.req->trace_routes = new->debug; + + tab->cork_threshold = new->cork_threshold; + + if (new->cork_threshold.high != old->cork_threshold.high) + rt_check_cork_high(tab); + + if (new->cork_threshold.low != old->cork_threshold.low) + rt_check_cork_low(tab); + return 1; } @@ -2935,19 +4171,34 @@ rt_commit(struct config *new, struct config *old) { WALK_LIST(o, old->tables) { - rtable *tab = o->table; + struct rtable_private *tab = RT_LOCK(o->table); + if (tab->deleted) + { + RT_UNLOCK(tab); continue; + } r = rt_find_table_config(new, o->name); if (r && !new->shutdown && rt_reconfigure(tab, r, o)) + { + RT_UNLOCK(tab); continue; + } DBG("\t%s: deleted\n", o->name); tab->deleted = old; config_add_obstacle(old); rt_lock_table(tab); - rt_unlock_table(tab); + + rt_check_cork_low(tab); + + if (tab->hostcache && ev_get_list(&tab->hostcache->update) == &rt_cork.queue) + ev_postpone(&tab->hostcache->update); + + /* Force one more loop run */ + birdloop_flag(tab->loop, RTF_DELETE); + RT_UNLOCK(tab); } } @@ -2961,387 +4212,254 @@ rt_commit(struct config *new, struct config *old) DBG("\tdone\n"); } -static inline void -do_feed_channel(struct channel *c, net *n, rte *e) +static void +rt_feed_done(struct rt_export_hook *c) { - rte_update_lock(); - if (c->ra_mode == RA_ACCEPTED) - rt_notify_accepted(c, n, NULL, NULL, c->refeeding); - else if (c->ra_mode == RA_MERGED) - rt_notify_merged(c, n, NULL, NULL, e, e, c->refeeding); - else /* RA_BASIC */ - rt_notify_basic(c, n, e, e, c->refeeding); - rte_update_unlock(); -} + c->event.hook = rt_export_hook; -/** - * rt_feed_channel - advertise all routes to a channel - * @c: channel to be fed - * - * This function performs one pass of advertisement of routes to a channel that - * is in the ES_FEEDING state. It is called by the protocol code as long as it - * has something to do. (We avoid transferring all the routes in single pass in - * order not to monopolize CPU time.) - */ -int -rt_feed_channel(struct channel *c) -{ - struct fib_iterator *fit = &c->feed_fit; - int max_feed = 256; + rt_set_export_state(c, TES_READY); - ASSERT(c->export_state == ES_FEEDING); + rt_send_export_event(c); +} - if (!c->feed_active) - { - FIB_ITERATE_INIT(fit, &c->table->fib); - c->feed_active = 1; - } +#define MAX_FEED_BLOCK 1024 +typedef struct { + uint cnt, pos; + union { + struct rt_pending_export *rpe; + struct { + const rte **feed; + struct rt_feed_block_aux { + struct rt_pending_export *first, *last; + uint start; + } *aux; + }; + }; +} rt_feed_block; - FIB_ITERATE_START(&c->table->fib, fit, net, n) +static int +rt_prepare_feed(struct rt_table_export_hook *c, net *n, rt_feed_block *b) +{ + if (n->routes) + { + if (c->h.req->export_bulk) { - rte *e = n->routes; - if (max_feed <= 0) - { - FIB_ITERATE_PUT(fit); - return 0; - } + uint cnt = rte_feed_count(n); + if (b->cnt && (b->cnt + cnt > MAX_FEED_BLOCK)) + return 0; - if ((c->ra_mode == RA_OPTIMAL) || - (c->ra_mode == RA_ACCEPTED) || - (c->ra_mode == RA_MERGED)) - if (rte_is_valid(e)) - { - /* In the meantime, the protocol may fell down */ - if (c->export_state != ES_FEEDING) - goto done; + if (!b->cnt) + { + b->feed = tmp_alloc(sizeof(rte *) * MAX(MAX_FEED_BLOCK, cnt)); - do_feed_channel(c, n, e); - max_feed--; - } + uint aux_block_size = (cnt >= MAX_FEED_BLOCK) ? 2 : (MAX_FEED_BLOCK + 2 - cnt); + b->aux = tmp_alloc(sizeof(struct rt_feed_block_aux) * aux_block_size); + } - if (c->ra_mode == RA_ANY) - for(e = n->routes; e; e = e->next) - { - /* In the meantime, the protocol may fell down */ - if (c->export_state != ES_FEEDING) - goto done; + rte_feed_obtain(n, &b->feed[b->cnt], cnt); - if (!rte_is_valid(e)) - continue; + b->aux[b->pos++] = (struct rt_feed_block_aux) { + .start = b->cnt, + .first = n->first, + .last = n->last, + }; - do_feed_channel(c, n, e); - max_feed--; - } + b->cnt += cnt; } - FIB_ITERATE_END; + else if (b->pos == MAX_FEED_BLOCK) + return 0; + else + { + if (!b->pos) + b->rpe = tmp_alloc(sizeof(struct rt_pending_export) * MAX_FEED_BLOCK); + + b->rpe[b->pos++] = (struct rt_pending_export) { .new = n->routes, .new_best = n->routes }; + } + } -done: - c->feed_active = 0; return 1; } -/** - * rt_feed_baby_abort - abort protocol feeding - * @c: channel - * - * This function is called by the protocol code when the protocol stops or - * ceases to exist during the feeding. - */ -void -rt_feed_channel_abort(struct channel *c) +static void +rt_process_feed(struct rt_table_export_hook *c, rt_feed_block *b) { - if (c->feed_active) + if (!b->pos) + return; + + if (c->h.req->export_bulk) + { + b->aux[b->pos].start = b->cnt; + for (uint p = 0; p < b->pos; p++) { - /* Unlink the iterator */ - fit_get(&c->table->fib, &c->feed_fit); - c->feed_active = 0; + struct rt_feed_block_aux *aux = &b->aux[p]; + const rte **feed = &b->feed[aux->start]; + + c->h.req->export_bulk(c->h.req, feed[0]->net, aux->first, aux->last, feed, (aux+1)->start - aux->start); } + } + else + for (uint p = 0; p < b->pos; p++) + c->h.req->export_one(c->h.req, b->rpe[p].new->rte.net, &b->rpe[p]); } - -/* - * Import table +/** + * rt_feed_by_fib - advertise all routes to a channel by walking a fib + * @c: channel to be fed + * + * This function performs one pass of advertisement of routes to a channel that + * is in the ES_FEEDING state. It is called by the protocol code as long as it + * has something to do. (We avoid transferring all the routes in single pass in + * order not to monopolize CPU time.) */ - -int -rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) +static void +rt_feed_by_fib(void *data) { - struct rtable *tab = c->in_table; - rte *old, **pos; - net *net; + struct rt_table_export_hook *c = data; + struct fib_iterator *fit = &c->feed_fit; + rt_feed_block block = {}; - if (new) - { - net = net_get(tab, n); + ASSERT(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); - if (!rta_is_cached(new->attrs)) - new->attrs = rta_lookup(new->attrs); - } - else + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) { - net = net_find(tab, n); - - if (!net) - goto drop_withdraw; - } - /* Find the old rte */ - for (pos = &net->routes; old = *pos; pos = &old->next) - if (old->src == src) + FIB_ITERATE_START(&tab->fib, fit, net, n) { - if (new && rte_same(old, new)) + if ((c->h.req->addr_mode == TE_ADDR_NONE) || net_in_netX(n->n.addr, c->h.req->addr)) { - /* Refresh the old rte, continue with update to main rtable */ - if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY)) + if (!rt_prepare_feed(c, n, &block)) { - old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY); - return 1; + FIB_ITERATE_PUT(fit); + RT_UNLOCK(tab); + rt_process_feed(c, &block); + rt_send_export_event(&c->h); + return; } - - goto drop_update; } + } + FIB_ITERATE_END; + } - /* Move iterator if needed */ - if (old == c->reload_next_rte) - c->reload_next_rte = old->next; - - /* Remove the old rte */ - *pos = old->next; - rte_free_quick(old); - tab->rt_count--; + rt_process_feed(c, &block); + rt_feed_done(&c->h); +} - break; - } +static void +rt_feed_by_trie(void *data) +{ + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; - if (!new) + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) { - if (!old) - goto drop_withdraw; - if (!net->routes) - fib_delete(&tab->fib, net); + ASSERT_DIE(c->walk_state); + struct f_trie_walk_state *ws = c->walk_state; - return 1; - } + ASSERT(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); - struct channel_limit *l = &c->rx_limit; - if (l->action && !old) - { - if (tab->rt_count >= l->limit) - channel_notify_limit(c, l, PLD_RX, tab->rt_count); + do { + if (!c->walk_last.type) + continue; - if (l->state == PLS_BLOCKED) - { - /* Required by rte_trace_in() */ - new->net = net; + net *n = net_find(tab, &c->walk_last); + if (!n) + continue; - rte_trace_in(D_FILTERS, c, new, "ignored [limit]"); - goto drop_update; + if (!rt_prepare_feed(c, n, &block)) + { + RT_UNLOCK(tab); + rt_process_feed(c, &block); + rt_send_export_event(&c->h); + return; } } + while (trie_walk_next(ws, &c->walk_last)); - /* Insert the new rte */ - rte *e = rte_do_cow(new); - e->flags |= REF_COW; - e->net = net; - e->sender = c; - e->lastmod = current_time(); - e->next = *pos; - *pos = e; - tab->rt_count++; - return 1; + rt_unlock_trie(tab, c->walk_lock); + c->walk_lock = NULL; -drop_update: - c->stats.imp_updates_received++; - c->stats.imp_updates_ignored++; - rte_free(new); + mb_free(c->walk_state); + c->walk_state = NULL; - if (!net->routes) - fib_delete(&tab->fib, net); + c->walk_last.type = 0; - return 0; + } -drop_withdraw: - c->stats.imp_withdraws_received++; - c->stats.imp_withdraws_ignored++; - return 0; + rt_process_feed(c, &block); + rt_feed_done(&c->h); } -int -rt_reload_channel(struct channel *c) +static void +rt_feed_equal(void *data) { - struct rtable *tab = c->in_table; - struct fib_iterator *fit = &c->reload_fit; - int max_feed = 64; + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; + net *n; - ASSERT(c->channel_state == CS_UP); - - if (!c->reload_active) + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) { - FIB_ITERATE_INIT(fit, &tab->fib); - c->reload_active = 1; - } - - do { - for (rte *e = c->reload_next_rte; e; e = e->next) - { - if (max_feed-- <= 0) - { - c->reload_next_rte = e; - debug("%s channel reload burst split (max_feed=%d)", c->proto->name, max_feed); - return 0; - } - - rte_update2(c, e->net->n.addr, rte_do_cow(e), e->src); - } - - c->reload_next_rte = NULL; + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); + ASSERT_DIE(c->h.req->addr_mode == TE_ADDR_EQUAL); - FIB_ITERATE_START(&tab->fib, fit, net, n) - { - if (c->reload_next_rte = n->routes) - { - FIB_ITERATE_PUT_NEXT(fit, &tab->fib); - break; - } - } - FIB_ITERATE_END; + if (n = net_find(tab, c->h.req->addr)) + ASSERT_DIE(rt_prepare_feed(c, n, &block)); } - while (c->reload_next_rte); - c->reload_active = 0; - return 1; -} + if (n) + rt_process_feed(c, &block); -void -rt_reload_channel_abort(struct channel *c) -{ - if (c->reload_active) - { - /* Unlink the iterator */ - fit_get(&c->in_table->fib, &c->reload_fit); - c->reload_next_rte = NULL; - c->reload_active = 0; - } + rt_feed_done(&c->h); } -void -rt_prune_sync(rtable *t, int all) +static void +rt_feed_for(void *data) { - struct fib_iterator fit; + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; + net *n; - FIB_ITERATE_INIT(&fit, &t->fib); - -again: - FIB_ITERATE_START(&t->fib, &fit, net, n) + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) { - rte *e, **ee = &n->routes; + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); + ASSERT_DIE(c->h.req->addr_mode == TE_ADDR_FOR); - while (e = *ee) - { - if (all || (e->flags & (REF_STALE | REF_DISCARD))) - { - *ee = e->next; - rte_free_quick(e); - t->rt_count--; - } - else - ee = &e->next; - } - - if (all || !n->routes) - { - FIB_ITERATE_PUT(&fit); - fib_delete(&t->fib, n); - goto again; - } + if (n = net_route(tab, c->h.req->addr)) + ASSERT_DIE(rt_prepare_feed(c, n, &block)); } - FIB_ITERATE_END; + + if (n) + rt_process_feed(c, &block); + + rt_feed_done(&c->h); } /* - * Export table + * Import table */ -int -rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed) +void channel_reload_export_bulk(struct rt_export_request *req, const net_addr *net, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) { - struct rtable *tab = c->out_table; - struct rte_src *src; - rte *old, **pos; - net *net; + struct channel *c = SKIP_BACK(struct channel, reload_req, req); - if (new) - { - net = net_get(tab, n); - src = new->src; - - if (!rta_is_cached(new->attrs)) - new->attrs = rta_lookup(new->attrs); - } - else - { - net = net_find(tab, n); - src = old0->src; - - if (!net) - goto drop_withdraw; - } - - /* Find the old rte */ - for (pos = &net->routes; old = *pos; pos = &old->next) - if ((c->ra_mode != RA_ANY) || (old->src == src)) + for (uint i=0; i<count; i++) + if (feed[i]->sender == c->in_req.hook) { - if (new && rte_same(old, new)) - { - /* REF_STALE / REF_DISCARD not used in export table */ - /* - if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY)) - { - old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY); - return 1; - } - */ + /* Strip the table-specific information */ + rte new = rte_init_from(feed[i]); - goto drop_update; - } - - /* Remove the old rte */ - *pos = old->next; - rte_free_quick(old); - tab->rt_count--; + /* Strip the later attribute layers */ + while (new.attrs->next) + new.attrs = new.attrs->next; - break; + /* And reload the route */ + rte_update(c, net, &new, new.src); } - if (!new) - { - if (!old) - goto drop_withdraw; - - if (!net->routes) - fib_delete(&tab->fib, net); - - return 1; - } - - /* Insert the new rte */ - rte *e = rte_do_cow(new); - e->flags |= REF_COW; - e->net = net; - e->sender = c; - e->lastmod = current_time(); - e->next = *pos; - *pos = e; - tab->rt_count++; - return 1; - -drop_update: - return refeed; - -drop_withdraw: - return 0; + rpe_mark_seen_all(req->hook, first, last, NULL); } @@ -3448,7 +4566,56 @@ hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he) } static void -rt_init_hostcache(rtable *tab) +hc_notify_dump_req(struct rt_export_request *req) +{ + debug(" Table %s (%p)\n", req->name, req); +} + +static void +hc_notify_log_state_change(struct rt_export_request *req, u8 state) +{ + struct hostcache *hc = SKIP_BACK(struct hostcache, req, req); + rt_trace((rtable *) hc->update.data, D_STATES, "HCU Export state changed to %s", rt_export_state_name(state)); +} + +static void +hc_notify_export_one(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) +{ + struct hostcache *hc = SKIP_BACK(struct hostcache, req, req); + + /* No interest in this update, mark seen only */ + int interested = 1; + RT_LOCKED((rtable *) hc->update.data, tab) + if (ev_active(&hc->update) || !trie_match_net(hc->trie, net)) + { + rpe_mark_seen_all(req->hook, first, NULL, NULL); + interested = 0; + } + + if (!interested) + return; + + /* This net may affect some hostentries, check the actual change */ + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; + + RPE_WALK(first, rpe, NULL) + { + rpe_mark_seen(req->hook, rpe); + new_best = rpe->new_best; + } + + /* Yes, something has actually changed. Do the hostcache update. */ + if (o != RTE_VALID_OR_NULL(new_best)) + RT_LOCKED((rtable *) hc->update.data, tab) + if ((atomic_load_explicit(&req->hook->export_state, memory_order_acquire) == TES_READY) + && !ev_active(&hc->update)) + ev_send_loop(tab->loop, &hc->update); +} + + +static void +rt_init_hostcache(struct rtable_private *tab) { struct hostcache *hc = mb_allocz(tab->rp, sizeof(struct hostcache)); init_list(&hc->hostentries); @@ -3460,11 +4627,27 @@ rt_init_hostcache(rtable *tab) hc->lp = lp_new(tab->rp); hc->trie = f_new_trie(hc->lp, 0); + hc->update = (event) { + .hook = rt_update_hostcache, + .data = tab, + }; + + hc->req = (struct rt_export_request) { + .name = mb_sprintf(tab->rp, "%s.hcu.notifier", tab->name), + .list = birdloop_event_list(tab->loop), + .trace_routes = tab->config->debug, + .dump_req = hc_notify_dump_req, + .log_state_change = hc_notify_log_state_change, + .export_one = hc_notify_export_one, + }; + + rt_table_export_start_locked(tab, &hc->req); + tab->hostcache = hc; } static void -rt_free_hostcache(rtable *tab) +rt_free_hostcache(struct rtable_private *tab) { struct hostcache *hc = tab->hostcache; @@ -3486,16 +4669,6 @@ rt_free_hostcache(rtable *tab) */ } -static void -rt_notify_hostcache(rtable *tab, net *net) -{ - if (tab->hcu_scheduled) - return; - - if (trie_match_net(tab->hostcache->trie, net->n.addr)) - rt_schedule_hcu(tab); -} - static int if_local_addr(ip_addr a, struct iface *i) { @@ -3509,32 +4682,31 @@ if_local_addr(ip_addr a, struct iface *i) } u32 -rt_get_igp_metric(rte *rt) +rt_get_igp_metric(const rte *rt) { - eattr *ea = ea_find(rt->attrs->eattrs, EA_GEN_IGP_METRIC); + eattr *ea = ea_find(rt->attrs, "igp_metric"); if (ea) return ea->u.data; - if (rt->attrs->source == RTS_DEVICE) + if (rt_get_source_attr(rt) == RTS_DEVICE) return 0; - if (rt->src->proto->rte_igp_metric) - return rt->src->proto->rte_igp_metric(rt); + if (rt->src->owner->class->rte_igp_metric) + return rt->src->owner->class->rte_igp_metric(rt); return IGP_METRIC_UNKNOWN; } static int -rt_update_hostentry(rtable *tab, struct hostentry *he) +rt_update_hostentry(struct rtable_private *tab, struct hostentry *he) { - rta *old_src = he->src; + ea_list *old_src = he->src; int direct = 0; int pxlen = 0; /* Reset the hostentry */ he->src = NULL; - he->dest = RTD_UNREACHABLE; he->nexthop_linkable = 0; he->igp_metric = 0; @@ -3543,12 +4715,14 @@ rt_update_hostentry(rtable *tab, struct hostentry *he) net *n = net_route(tab, &he_addr); if (n) { - rte *e = n->routes; - rta *a = e->attrs; - word pref = a->pref; - - for (rte *ee = n->routes; ee; ee = ee->next) - if ((ee->attrs->pref >= pref) && ee->attrs->hostentry) + struct rte_storage *e = n->routes; + ea_list *a = e->rte.attrs; + u32 pref = rt_get_preference(&e->rte); + + for (struct rte_storage *ee = n->routes; ee; ee = ee->next) + if (rte_is_valid(&ee->rte) && + (rt_get_preference(&ee->rte) >= pref) && + ea_find(ee->rte.attrs, &ea_gen_hostentry)) { /* Recursive route should not depend on another recursive route */ log(L_WARN "Next hop address %I resolvable through recursive route for %N", @@ -3558,9 +4732,12 @@ rt_update_hostentry(rtable *tab, struct hostentry *he) pxlen = n->n.addr->pxlen; - if (a->dest == RTD_UNICAST) - { - for (struct nexthop *nh = &(a->nh); nh; nh = nh->next) + eattr *nhea = ea_find(a, &ea_gen_nexthop); + ASSERT_DIE(nhea); + struct nexthop_adata *nhad = (void *) nhea->u.ptr; + + if (NEXTHOP_IS_REACHABLE(nhad)) + NEXTHOP_WALK(nh, nhad) if (ipa_zero(nh->gw)) { if (if_local_addr(he->addr, nh->iface)) @@ -3573,12 +4750,10 @@ rt_update_hostentry(rtable *tab, struct hostentry *he) direct++; } - } he->src = rta_clone(a); - he->dest = a->dest; he->nexthop_linkable = !direct; - he->igp_metric = rt_get_igp_metric(e); + he->igp_metric = rt_get_igp_metric(&e->rte); } done: @@ -3590,9 +4765,28 @@ done: } static void -rt_update_hostcache(rtable *tab) +rt_update_hostcache(void *data) { + rtable **nhu_pending; + + RT_LOCKED((rtable *) data, tab) + { + struct hostcache *hc = tab->hostcache; + + /* Shutdown shortcut */ + if (!hc->req.hook) + RT_RETURN(tab); + + if (rt_cork_check(&hc->update)) + { + rt_trace(tab, D_STATES, "Hostcache update corked"); + RT_RETURN(tab); + } + + /* Destination schedule map */ + nhu_pending = tmp_allocz(sizeof(rtable *) * rtable_max_id); + struct hostentry *he; node *n, *x; @@ -3610,14 +4804,18 @@ rt_update_hostcache(rtable *tab) } if (rt_update_hostentry(tab, he)) - rt_schedule_nhu(he->tab); + nhu_pending[he->tab->id] = he->tab; } + } - tab->hcu_scheduled = 0; + for (uint i=0; i<rtable_max_id; i++) + if (nhu_pending[i]) + RT_LOCKED(nhu_pending[i], dst) + rt_schedule_nhu(dst); } -struct hostentry * -rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep) +static struct hostentry * +rt_get_hostentry(struct rtable_private *tab, ip_addr a, ip_addr ll, rtable *dep) { ip_addr link = ipa_zero(ll) ? a : ll; struct hostentry *he; diff --git a/nest/rt.h b/nest/rt.h new file mode 100644 index 00000000..01372b8c --- /dev/null +++ b/nest/rt.h @@ -0,0 +1,706 @@ +/* + * BIRD Internet Routing Daemon -- Routing Table + * + * (c) 1998--2000 Martin Mares <mj@ucw.cz> + * (c) 2019--2021 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_NEST_RT_H_ +#define _BIRD_NEST_RT_H_ + +#include "lib/lists.h" +#include "lib/bitmap.h" +#include "lib/resource.h" +#include "lib/net.h" +#include "lib/type.h" +#include "lib/fib.h" +#include "lib/route.h" +#include "lib/event.h" +#include "lib/rcu.h" +#include "lib/io-loop.h" +#include "lib/settle.h" + +#include <stdatomic.h> + +struct ea_list; +struct protocol; +struct proto; +struct channel; +struct rte_src; +struct hostcache; +struct symbol; +struct timer; +struct filter; +struct f_trie; +struct f_trie_walk_state; +struct cli; + +struct rt_cork_threshold { + u64 low, high; +}; + +/* + * Master Routing Tables. Generally speaking, each of them contains a FIB + * with each entry pointing to a list of route entries representing routes + * to given network (with the selected one at the head). + * + * Each of the RTE's contains variable data (the preference and protocol-dependent + * metrics) and a pointer to a route attribute block common for many routes). + * + * It's guaranteed that there is at most one RTE for every (prefix,proto) pair. + */ + +struct rtable_config { + node n; + char *name; + union rtable *table; + struct proto_config *krt_attached; /* Kernel syncer attached to this table */ + uint addr_type; /* Type of address data stored in table (NET_*) */ + uint gc_threshold; /* Maximum number of operations before GC is run */ + uint gc_period; /* Approximate time between two consecutive GC runs */ + byte sorted; /* Routes of network are sorted according to rte_better() */ + byte trie_used; /* Rtable has attached trie */ + byte debug; /* Whether to log */ + struct rt_cork_threshold cork_threshold; /* Cork threshold values */ + struct settle_config export_settle; /* Export announcement settler */ + struct settle_config export_rr_settle;/* Export announcement settler config valid when any + route refresh is running */ +}; + +struct rt_export_hook; +struct rt_export_request; +struct rt_exporter; + +struct rt_exporter_class { + void (*start)(struct rt_exporter *, struct rt_export_request *); + void (*stop)(struct rt_export_hook *); + void (*done)(void *_rt_export_hook); +}; + +struct rt_exporter { + const struct rt_exporter_class *class; + pool *rp; + list hooks; /* Registered route export hooks */ + uint addr_type; /* Type of address data exported (NET_*) */ +}; + +struct rt_table_exporter { + struct rt_exporter e; + list pending; /* List of packed struct rt_pending_export */ + + struct rt_pending_export *first; /* First export to announce */ + u64 next_seq; /* The next export will have this ID */ +}; + +extern uint rtable_max_id; + +DEFINE_DOMAIN(rtable); + +/* The public part of rtable structure */ +#define RTABLE_PUBLIC \ + resource r; \ + node n; /* Node in list of all tables */ \ + char *name; /* Name of this table */ \ + uint addr_type; /* Type of address data stored in table (NET_*) */ \ + uint id; /* Integer table ID for fast lookup */ \ + DOMAIN(rtable) lock; /* Lock to take to access the private parts */ \ + struct rtable_config *config; /* Configuration of this table */ \ + struct birdloop *loop; /* Service thread */ \ + +/* The complete rtable structure */ +struct rtable_private { + /* Once more the public part */ + RTABLE_PUBLIC; + + /* Here the private items not to be accessed without locking */ + pool *rp; /* Resource pool to allocate everything from, including itself */ + struct slab *rte_slab; /* Slab to allocate route objects */ + struct fib fib; + struct f_trie *trie; /* Trie of prefixes defined in fib */ + int use_count; /* Number of protocols using this table */ + u32 rt_count; /* Number of routes in the table */ + + list imports; /* Registered route importers */ + struct rt_table_exporter exporter; /* Exporter API structure */ + + struct hmap id_map; + struct hostcache *hostcache; + struct config *deleted; /* Table doesn't exist in current configuration, + * delete as soon as use_count becomes 0 and remove + * obstacle from this routing table. + */ + struct event *nhu_uncork_event; /* Helper event to schedule NHU on uncork */ + struct settle export_settle; /* Export batching settle timer */ + struct timer *prune_timer; /* Timer for periodic pruning / GC */ + struct birdloop_flag_handler fh; /* Handler for simple events */ + btime last_rt_change; /* Last time when route changed */ + btime gc_time; /* Time of last GC */ + uint gc_counter; /* Number of operations since last GC */ + uint rr_counter; /* Number of currently running route refreshes, + in fact sum of (stale_set - stale_pruned) over all importers + + one for each TIS_FLUSHING importer */ + uint wait_counter; /* Number of imports in TIS_WAITING state */ + byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ + byte prune_trie; /* Prune prefix trie during next table prune */ + byte imports_flushing; /* Some imports are being flushed right now */ + byte nhu_state; /* Next Hop Update state */ + byte nhu_corked; /* Next Hop Update is corked with this state */ + byte export_used; /* Pending Export pruning is scheduled */ + byte cork_active; /* Cork has been activated */ + struct rt_cork_threshold cork_threshold; /* Threshold for table cork */ + struct fib_iterator prune_fit; /* Rtable prune FIB iterator */ + struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ + struct f_trie *trie_new; /* New prefix trie defined during pruning */ + struct f_trie *trie_old; /* Old prefix trie waiting to be freed */ + u32 trie_lock_count; /* Prefix trie locked by walks */ + u32 trie_old_lock_count; /* Old prefix trie locked by walks */ + struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */ + + struct f_trie *flowspec_trie; /* Trie for evaluation of flowspec notifications */ +}; + +/* The final union private-public rtable structure */ +typedef union rtable { + struct { + RTABLE_PUBLIC; + }; + struct rtable_private priv; +} rtable; + +#define RT_IS_LOCKED(tab) DOMAIN_IS_LOCKED(rtable, (tab)->lock) + +#define RT_LOCK(tab) ({ LOCK_DOMAIN(rtable, (tab)->lock); &(tab)->priv; }) +#define RT_UNLOCK(tab) UNLOCK_DOMAIN(rtable, (tab)->lock) +#define RT_PRIV(tab) ({ ASSERT_DIE(RT_IS_LOCKED((tab))); &(tab)->priv; }) +#define RT_PUB(tab) SKIP_BACK(rtable, priv, tab) + +#define RT_LOCKED(tpub, tpriv) for (struct rtable_private *tpriv = RT_LOCK(tpub); tpriv; RT_UNLOCK(tpriv), (tpriv = NULL)) +#define RT_RETURN(tpriv, ...) do { RT_UNLOCK(tpriv); return __VA_ARGS__; } while (0) + +#define RT_PRIV_SAME(tpriv, tpub) (&(tpub)->priv == (tpriv)) + +/* Flags for birdloop_flag() */ +#define RTF_CLEANUP 1 +#define RTF_NHU 2 +#define RTF_EXPORT 4 +#define RTF_DELETE 8 + +extern struct rt_cork { + _Atomic uint active; + event_list queue; + event run; +} rt_cork; + +static inline void rt_cork_acquire(void) +{ + atomic_fetch_add_explicit(&rt_cork.active, 1, memory_order_acq_rel); +} + +static inline void rt_cork_release(void) +{ + if (atomic_fetch_sub_explicit(&rt_cork.active, 1, memory_order_acq_rel) == 1) + { + synchronize_rcu(); + ev_send(&global_work_list, &rt_cork.run); + } +} + +static inline int rt_cork_check(event *e) +{ + rcu_read_lock(); + + int corked = (atomic_load_explicit(&rt_cork.active, memory_order_acquire) > 0); + if (corked) + ev_send(&rt_cork.queue, e); + + rcu_read_unlock(); + + return corked; +} + + +typedef struct network { + struct rte_storage *routes; /* Available routes for this network */ + struct rt_pending_export *first, *last; + struct fib_node n; /* FIB flags reserved for kernel syncer */ +} net; + +struct rte_storage { + struct rte_storage *next; /* Next in chain */ + struct rte rte; /* Route data */ +}; + +#define RTE_COPY(r) ((r) ? (r)->rte : (rte) {}) +#define RTE_COPY_VALID(r) (((r) && (rte_is_valid(&(r)->rte))) ? (r)->rte : (rte) {}) +#define RTE_OR_NULL(r) ((r) ? &((r)->rte) : NULL) +#define RTE_VALID_OR_NULL(r) (((r) && (rte_is_valid(&(r)->rte))) ? &((r)->rte) : NULL) + +/* Table-channel connections */ + +struct rt_import_request { + struct rt_import_hook *hook; /* The table part of importer */ + char *name; + u8 trace_routes; + + event_list *list; /* Where to schedule announce events */ + + void (*dump_req)(struct rt_import_request *req); + void (*log_state_change)(struct rt_import_request *req, u8 state); + /* Preimport is called when the @new route is just-to-be inserted, replacing @old. + * Return a route (may be different or modified in-place) to continue or NULL to withdraw. */ + int (*preimport)(struct rt_import_request *req, struct rte *new, struct rte *old); +}; + +struct rt_import_hook { + node n; + rtable *table; /* The connected table */ + struct rt_import_request *req; /* The requestor */ + + struct rt_import_stats { + /* Import - from protocol to core */ + u32 pref; /* Number of routes selected as best in the (adjacent) routing table */ + u32 updates_ignored; /* Number of route updates rejected as already in route table */ + u32 updates_accepted; /* Number of route updates accepted and imported */ + u32 withdraws_ignored; /* Number of route withdraws rejected as already not in route table */ + u32 withdraws_accepted; /* Number of route withdraws accepted and processed */ + } stats; + + u64 flush_seq; /* Table export seq when the channel announced flushing */ + btime last_state_change; /* Time of last state transition */ + + u8 import_state; /* IS_* */ + u8 stale_set; /* Set this stale_cycle to imported routes */ + u8 stale_valid; /* Routes with this stale_cycle and bigger are considered valid */ + u8 stale_pruned; /* Last prune finished when this value was set at stale_valid */ + u8 stale_pruning; /* Last prune started when this value was set at stale_valid */ + + void (*stopped)(struct rt_import_request *); /* Stored callback when import is stopped */ + event announce_event; /* This event announces table updates */ +}; + +struct rt_pending_export { + struct rt_pending_export * _Atomic next; /* Next export for the same destination */ + struct rte_storage *new, *new_best, *old, *old_best; + u64 seq; /* Sequential ID (table-local) of the pending export */ +}; + +struct rt_export_request { + struct rt_export_hook *hook; /* Table part of the export */ + char *name; + const net_addr *addr; /* Network prefilter address */ + u8 trace_routes; + u8 addr_mode; /* Network prefilter mode (TE_ADDR_*) */ + + event_list *list; /* Where to schedule export events */ + + /* There are two methods of export. You can either request feeding every single change + * or feeding the whole route feed. In case of regular export, &export_one is preferred. + * Anyway, when feeding, &export_bulk is preferred, falling back to &export_one. + * Thus, for RA_OPTIMAL, &export_one is only set, + * for RA_MERGED and RA_ACCEPTED, &export_bulk is only set + * and for RA_ANY, both are set to accomodate for feeding all routes but receiving single changes + */ + void (*export_one)(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); + void (*export_bulk)(struct rt_export_request *req, const net_addr *net, + struct rt_pending_export *rpe, struct rt_pending_export *last, + const rte **feed, uint count); + + void (*dump_req)(struct rt_export_request *req); + void (*log_state_change)(struct rt_export_request *req, u8); +}; + +struct rt_export_hook { + node n; + struct rt_exporter *table; /* The connected table */ + + pool *pool; + + struct rt_export_request *req; /* The requestor */ + + struct rt_export_stats { + /* Export - from core to protocol */ + u32 updates_received; /* Number of route updates received */ + u32 withdraws_received; /* Number of route withdraws received */ + } stats; + + btime last_state_change; /* Time of last state transition */ + + _Atomic u8 export_state; /* Route export state (TES_*, see below) */ + struct event event; /* Event running all the export operations */ + + struct bmap seq_map; /* Keep track which exports were already procesed */ + + void (*stopped)(struct rt_export_request *); /* Stored callback when export is stopped */ +}; + +struct rt_table_export_hook { + union { + struct rt_export_hook h; + struct { /* Overriding the parent structure beginning */ + node _n; + struct rt_table_exporter *table; + }; + }; + + union { + struct fib_iterator feed_fit; /* Routing table iterator used during feeding */ + struct { + struct f_trie_walk_state *walk_state; /* Iterator over networks in trie */ + struct f_trie *walk_lock; /* Locked trie for walking */ + union { /* Last net visited but not processed */ + net_addr walk_last; + net_addr_ip4 walk_last_ip4; + net_addr_ip6 walk_last_ip6; + }; + }; + }; + + struct rt_pending_export *_Atomic last_export;/* Last export processed */ + struct rt_pending_export *rpe_next; /* Next pending export to process */ + + u8 refeed_pending; /* Refeeding and another refeed is scheduled */ + u8 feed_type; /* Which feeding method is used (TFT_*, see below) */ + +}; + +#define TIS_DOWN 0 +#define TIS_UP 1 +#define TIS_STOP 2 +#define TIS_FLUSHING 3 +#define TIS_WAITING 4 +#define TIS_CLEARED 5 +#define TIS_MAX 6 + +#define TES_DOWN 0 +#define TES_HUNGRY 1 +#define TES_FEEDING 2 +#define TES_READY 3 +#define TES_STOP 4 +#define TES_MAX 5 + +/* Value of addr_mode */ +#define TE_ADDR_NONE 0 /* No address matching */ +#define TE_ADDR_EQUAL 1 /* Exact query - show route <addr> */ +#define TE_ADDR_FOR 2 /* Longest prefix match - show route for <addr> */ +#define TE_ADDR_IN 3 /* Interval query - show route in <addr> */ + + +#define TFT_FIB 1 +#define TFT_TRIE 2 +#define TFT_HASH 3 + +void rt_request_import(rtable *tab, struct rt_import_request *req); +void rt_request_export(rtable *tab, struct rt_export_request *req); +void rt_request_export_other(struct rt_exporter *tab, struct rt_export_request *req); + +void rt_export_once(struct rt_exporter *tab, struct rt_export_request *req); + +void rt_stop_import(struct rt_import_request *, void (*stopped)(struct rt_import_request *)); +void rt_stop_export(struct rt_export_request *, void (*stopped)(struct rt_export_request *)); + +const char *rt_import_state_name(u8 state); +const char *rt_export_state_name(u8 state); + +static inline u8 rt_import_get_state(struct rt_import_hook *ih) { return ih ? ih->import_state : TIS_DOWN; } +static inline u8 rt_export_get_state(struct rt_export_hook *eh) { return eh ? eh->export_state : TES_DOWN; } + +void rt_set_export_state(struct rt_export_hook *hook, u8 state); + +void rte_import(struct rt_import_request *req, const net_addr *net, rte *new, struct rte_src *src); + +/* + * For table export processing + */ + +/* Get next rpe. If src is given, it must match. */ +struct rt_pending_export *rpe_next(struct rt_pending_export *rpe, struct rte_src *src); + +/* Walk all rpe's */ +#define RPE_WALK(first, it, src) \ + for (struct rt_pending_export *it = (first); it; it = rpe_next(it, (src))) + +/* Mark the pending export processed */ +void rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); + +#define rpe_mark_seen_all(hook, first, last, src) do { \ + RPE_WALK((first), _rpe, (src)) { \ + rpe_mark_seen((hook), _rpe); \ + if (_rpe == last) break; \ + }} while (0) + +/* Get pending export seen status */ +int rpe_get_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); + +/* + * For rt_export_hook and rt_exporter inheritance + */ + +void rt_init_export(struct rt_exporter *re, struct rt_export_hook *hook); +struct rt_export_hook *rt_alloc_export(struct rt_exporter *re, uint size); +void rt_stop_export_common(struct rt_export_hook *hook); +void rt_export_stopped(struct rt_export_hook *hook); +void rt_exporter_init(struct rt_exporter *re); + +/* + * Channel export hooks. To be refactored out. + */ + +int channel_preimport(struct rt_import_request *req, rte *new, rte *old); + +void channel_reload_export_bulk(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first, struct rt_pending_export *last, const rte **feed, uint count); + +void rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); +void rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); +void rt_feed_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first, struct rt_pending_export *last, const rte **feed, uint count); +void rt_notify_accepted(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first, struct rt_pending_export *last, const rte **feed, uint count); +void rt_notify_merged(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first, struct rt_pending_export *last, const rte **feed, uint count); + + + +/* Types of route announcement, also used as flags */ +#define RA_UNDEF 0 /* Undefined RA type */ +#define RA_OPTIMAL 1 /* Announcement of optimal route change */ +#define RA_ACCEPTED 2 /* Announcement of first accepted route */ +#define RA_ANY 3 /* Announcement of any route change */ +#define RA_MERGED 4 /* Announcement of optimal route merged with next ones */ + +/* Return value of preexport() callback */ +#define RIC_ACCEPT 1 /* Accepted by protocol */ +#define RIC_PROCESS 0 /* Process it through import filter */ +#define RIC_REJECT -1 /* Rejected by protocol */ +#define RIC_DROP -2 /* Silently dropped by protocol */ + +/* + * Next hop update data structures + */ + +#define NHU_CLEAN 0 +#define NHU_SCHEDULED 1 +#define NHU_RUNNING 2 +#define NHU_DIRTY 3 + +struct hostentry { + node ln; + ip_addr addr; /* IP address of host, part of key */ + ip_addr link; /* (link-local) IP address of host, used as gw + if host is directly attached */ + rtable *tab; /* Dependent table, part of key */ + struct hostentry *next; /* Next in hash chain */ + unsigned hash_key; /* Hash key */ + unsigned uc; /* Use count */ + ea_list *src; /* Source attributes */ + byte nexthop_linkable; /* Nexthop list is completely non-device */ + u32 igp_metric; /* Chosen route IGP metric */ +}; + +struct hostcache { + slab *slab; /* Slab holding all hostentries */ + struct hostentry **hash_table; /* Hash table for hostentries */ + unsigned hash_order, hash_shift; + unsigned hash_max, hash_min; + unsigned hash_items; + linpool *lp; /* Linpool for trie */ + struct f_trie *trie; /* Trie of prefixes that might affect hostentries */ + list hostentries; /* List of all hostentries */ + event update; + struct rt_export_request req; /* Notifier */ +}; + +struct rt_flowspec_link { + rtable *src; + rtable *dst; + u32 uc; + struct rt_export_request req; +}; + +#define rte_update channel_rte_import +/** + * rte_update - enter a new update to a routing table + * @c: channel doing the update + * @net: network address + * @rte: a &rte representing the new route + * @src: old route source identifier + * + * This function imports a new route to the appropriate table (via the channel). + * Table keys are @net (obligatory) and @rte->attrs->src. + * Both the @net and @rte pointers can be local. + * + * The route attributes (@rte->attrs) are obligatory. They can be also allocated + * locally. Anyway, if you use an already-cached attribute object, you shall + * call rta_clone() on that object yourself. (This semantics may change in future.) + * + * If the route attributes are local, you may set @rte->attrs->src to NULL, then + * the protocol's default route source will be supplied. + * + * When rte_update() gets a route, it automatically validates it. This includes + * checking for validity of the given network and next hop addresses and also + * checking for host-scope or link-scope routes. Then the import filters are + * processed and if accepted, the route is passed to route table recalculation. + * + * The accepted routes are then inserted into the table, replacing the old route + * for the same @net identified by @src. Then the route is announced + * to all the channels connected to the table using the standard export mechanism. + * Setting @rte to NULL makes this a withdraw, otherwise @rte->src must be the same + * as @src. + * + * All memory used for temporary allocations is taken from a special linpool + * @rte_update_pool and freed when rte_update() finishes. + */ +void rte_update(struct channel *c, const net_addr *net, struct rte *rte, struct rte_src *src); + +extern list routing_tables; +struct config; + +void rt_init(void); +void rt_preconfig(struct config *); +void rt_postconfig(struct config *); +void rt_commit(struct config *new, struct config *old); +void rt_lock_table_priv(struct rtable_private *, const char *file, uint line); +void rt_unlock_table_priv(struct rtable_private *, const char *file, uint line); +static inline void rt_lock_table_pub(rtable *t, const char *file, uint line) +{ RT_LOCKED(t, tt) rt_lock_table_priv(tt, file, line); } +static inline void rt_unlock_table_pub(rtable *t, const char *file, uint line) +{ RT_LOCKED(t, tt) rt_unlock_table_priv(tt, file, line); } + +#define rt_lock_table(t) _Generic((t), rtable *: rt_lock_table_pub, \ + struct rtable_private *: rt_lock_table_priv)((t), __FILE__, __LINE__) +#define rt_unlock_table(t) _Generic((t), rtable *: rt_unlock_table_pub, \ + struct rtable_private *: rt_unlock_table_priv)((t), __FILE__, __LINE__) + +struct f_trie * rt_lock_trie(struct rtable_private *tab); +void rt_unlock_trie(struct rtable_private *tab, struct f_trie *trie); +void rt_flowspec_link(rtable *src, rtable *dst); +void rt_flowspec_unlink(rtable *src, rtable *dst); +rtable *rt_setup(pool *, struct rtable_config *); + +static inline net *net_find(struct rtable_private *tab, const net_addr *addr) { return (net *) fib_find(&tab->fib, addr); } +static inline net *net_find_valid(struct rtable_private *tab, const net_addr *addr) +{ net *n = net_find(tab, addr); return (n && n->routes && rte_is_valid(&n->routes->rte)) ? n : NULL; } +static inline net *net_get(struct rtable_private *tab, const net_addr *addr) { return (net *) fib_get(&tab->fib, addr); } +net *net_route(struct rtable_private *tab, const net_addr *n); +int rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter); +rte *rt_export_merged(struct channel *c, const rte ** feed, uint count, linpool *pool, int silent); +void rt_refresh_begin(struct rt_import_request *); +void rt_refresh_end(struct rt_import_request *); +void rt_modify_stale(rtable *t, struct rt_import_request *); +void rt_schedule_prune(struct rtable_private *t); +void rte_dump(struct rte_storage *); +void rt_dump(rtable *); +void rt_dump_all(void); +void rt_dump_hooks(rtable *); +void rt_dump_hooks_all(void); +int rt_reload_channel(struct channel *c); +void rt_reload_channel_abort(struct channel *c); +void rt_refeed_channel(struct channel *c); +void rt_prune_sync(rtable *t, int all); +struct rtable_config *rt_new_table(struct symbol *s, uint addr_type); +void rt_new_default_table(struct symbol *s); +struct rtable_config *rt_get_default_table(struct config *cf, uint addr_type); + +static inline int rt_is_ip(rtable *tab) +{ return (tab->addr_type == NET_IP4) || (tab->addr_type == NET_IP6); } + +static inline int rt_is_vpn(rtable *tab) +{ return (tab->addr_type == NET_VPN4) || (tab->addr_type == NET_VPN6); } + +static inline int rt_is_roa(rtable *tab) +{ return (tab->addr_type == NET_ROA4) || (tab->addr_type == NET_ROA6); } + +static inline int rt_is_flow(rtable *tab) +{ return (tab->addr_type == NET_FLOW4) || (tab->addr_type == NET_FLOW6); } + + +/* Default limit for ECMP next hops, defined in sysdep code */ +extern const int rt_default_ecmp; + +struct rt_show_data_rtable { + node n; + const char *name; + struct rt_exporter *table; + struct channel *export_channel; + struct channel *prefilter; + struct krt_proto *kernel; +}; + +struct rt_show_data { + struct cli *cli; /* Pointer back to the CLI */ + net_addr *addr; + list tables; + struct rt_show_data_rtable *tab; /* Iterator over table list */ + struct rt_show_data_rtable *last_table; /* Last table in output */ + struct rt_export_request req; /* Export request in use */ + int verbose, tables_defined_by; + const struct filter *filter; + struct proto *show_protocol; + struct proto *export_protocol; + struct channel *export_channel; + struct config *running_on_config; + struct rt_export_hook *kernel_export_hook; + int export_mode, addr_mode, primary_only, filtered, stats; + + int net_counter, rt_counter, show_counter, table_counter; + int net_counter_last, rt_counter_last, show_counter_last; + int show_counter_last_flush; +}; + +void rt_show(struct rt_show_data *); +struct rt_show_data_rtable * rt_show_add_exporter(struct rt_show_data *d, struct rt_exporter *t, const char *name); +struct rt_show_data_rtable * rt_show_add_table(struct rt_show_data *d, rtable *t); + +/* Value of table definition mode in struct rt_show_data */ +#define RSD_TDB_DEFAULT 0 /* no table specified */ +#define RSD_TDB_INDIRECT 0 /* show route ... protocol P ... */ +#define RSD_TDB_ALL RSD_TDB_SET /* show route ... table all ... */ +#define RSD_TDB_DIRECT RSD_TDB_SET | RSD_TDB_NMN /* show route ... table X table Y ... */ + +#define RSD_TDB_SET 0x1 /* internal: show empty tables */ +#define RSD_TDB_NMN 0x2 /* internal: need matching net */ + +/* Value of export_mode in struct rt_show_data */ +#define RSEM_NONE 0 /* Export mode not used */ +#define RSEM_PREEXPORT 1 /* Routes ready for export, before filtering */ +#define RSEM_EXPORT 2 /* Routes accepted by export filter */ +#define RSEM_NOEXPORT 3 /* Routes rejected by export filter */ +#define RSEM_EXPORTED 4 /* Routes marked in export map */ + +/* Host entry: Resolve hook for recursive nexthops */ +extern struct ea_class ea_gen_hostentry; +struct hostentry_adata { + adata ad; + struct hostentry *he; + u32 labels[0]; +}; + +void +ea_set_hostentry(ea_list **to, rtable *dep, rtable *tab, ip_addr gw, ip_addr ll, u32 lnum, u32 labels[lnum]); + +void ea_show_hostentry(const struct adata *ad, byte *buf, uint size); +void ea_show_nexthop_list(struct cli *c, struct nexthop_adata *nhad); + +/* + * Default protocol preferences + */ + +#define DEF_PREF_DIRECT 240 /* Directly connected */ +#define DEF_PREF_STATIC 200 /* Static route */ +#define DEF_PREF_OSPF 150 /* OSPF intra-area, inter-area and type 1 external routes */ +#define DEF_PREF_BABEL 130 /* Babel */ +#define DEF_PREF_RIP 120 /* RIP */ +#define DEF_PREF_BGP 100 /* BGP */ +#define DEF_PREF_RPKI 100 /* RPKI */ +#define DEF_PREF_INHERITED 10 /* Routes inherited from other routing daemons */ +#define DEF_PREF_UNKNOWN 0 /* Routes with no preference set */ + +/* + * Route Origin Authorization + */ + +#define ROA_UNKNOWN 0 +#define ROA_VALID 1 +#define ROA_INVALID 2 + +int net_roa_check(rtable *tab, const net_addr *n, u32 asn); + +#endif diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 6842d61b..cf625819 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -37,6 +37,7 @@ #include <stdlib.h> #include "babel.h" +#include "lib/macro.h" #define LOG_PKT_AUTH(msg, args...) \ log_rl(&p->log_pkt_tbf, L_AUTH "%s: " msg, p->p.name, args) @@ -62,6 +63,8 @@ static void babel_update_cost(struct babel_neighbor *n); static inline void babel_kick_timer(struct babel_proto *p); static inline void babel_iface_kick_timer(struct babel_iface *ifa); +static struct ea_class ea_babel_metric, ea_babel_router_id, ea_babel_seqno; + /* * Functions to maintain data structures */ @@ -661,37 +664,14 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) if (r) { - rta a0 = { - .source = RTS_BABEL, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, - .pref = c->preference, - .from = r->neigh->addr, - .nh.gw = r->next_hop, - .nh.iface = r->neigh->ifa->iface, - .eattrs = alloca(sizeof(ea_list) + 3*sizeof(eattr)), - }; - - *a0.eattrs = (ea_list) { .count = 3 }; - a0.eattrs->attrs[0] = (eattr) { - .id = EA_BABEL_METRIC, - .type = EAF_TYPE_INT, - .u.data = r->metric, - }; - - struct adata *ad = alloca(sizeof(struct adata) + sizeof(u64)); - ad->length = sizeof(u64); - memcpy(ad->data, &(r->router_id), sizeof(u64)); - a0.eattrs->attrs[1] = (eattr) { - .id = EA_BABEL_ROUTER_ID, - .type = EAF_TYPE_OPAQUE, - .u.ptr = ad, - }; - - a0.eattrs->attrs[2] = (eattr) { - .id = EA_BABEL_SEQNO, - .type = EAF_TYPE_INT, - .u.data = r->seqno, + struct nexthop_adata nhad = { + .nh = { + .gw = r->next_hop, + .iface = r->neigh->ifa->iface, + }, + .ad = { + .length = sizeof nhad - sizeof nhad.ad, + }, }; /* @@ -700,35 +680,54 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) * have routing work. */ if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) - a0.nh.flags = RNF_ONLINK; + nhad.nh.flags = RNF_ONLINK; + + struct { + ea_list l; + eattr a[7]; + } eattrs = { + .l.count = ARRAY_SIZE(eattrs.a), + .a = { + EA_LITERAL_EMBEDDED(&ea_gen_preference, 0, c->preference), + EA_LITERAL_STORE_ADATA(&ea_gen_from, 0, &r->neigh->addr, sizeof(r->neigh->addr)), + EA_LITERAL_EMBEDDED(&ea_gen_source, 0, RTS_BABEL), + EA_LITERAL_STORE_ADATA(&ea_gen_nexthop, 0, nhad.ad.data, nhad.ad.length), + EA_LITERAL_EMBEDDED(&ea_babel_metric, 0, r->metric), + EA_LITERAL_STORE_ADATA(&ea_babel_router_id, 0, &r->router_id, sizeof(r->router_id)), + EA_LITERAL_EMBEDDED(&ea_babel_seqno, 0, r->seqno), + } + }; - rta *a = rta_lookup(&a0); - rte *rte = rte_get_temp(a, p->p.main_source); + rte e0 = { + .attrs = &eattrs.l, + .src = p->p.main_source, + }; e->unreachable = 0; - rte_update2(c, e->n.addr, rte, p->p.main_source); + rte_update(c, e->n.addr, &e0, p->p.main_source); } else if (e->valid && (e->router_id != p->router_id)) { /* Unreachable */ - rta a0 = { - .source = RTS_BABEL, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNREACHABLE, - .pref = 1, - }; + ea_list *ea = NULL; + + ea_set_attr_u32(&ea, &ea_gen_preference, 0, 1); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_BABEL); + ea_set_dest(&ea, 0, RTD_UNREACHABLE); - rta *a = rta_lookup(&a0); - rte *rte = rte_get_temp(a, p->p.main_source); + rte e0 = { + .attrs = ea, + .src = p->p.main_source, + }; e->unreachable = 1; - rte_update2(c, e->n.addr, rte, p->p.main_source); + rte_update(c, e->n.addr, &e0, p->p.main_source); } else { /* Retraction */ e->unreachable = 0; - rte_update2(c, e->n.addr, NULL, p->p.main_source); + rte_update(c, e->n.addr, NULL, p->p.main_source); } } @@ -738,7 +737,7 @@ babel_announce_retraction(struct babel_proto *p, struct babel_entry *e) { struct channel *c = (e->n.addr->type == NET_IP4) ? p->ip4_channel : p->ip6_channel; e->unreachable = 0; - rte_update2(c, e->n.addr, NULL, p->p.main_source); + rte_update(c, e->n.addr, NULL, p->p.main_source); } @@ -1800,9 +1799,9 @@ babel_find_iface(struct babel_proto *p, struct iface *what) } static void -babel_iface_locked(struct object_lock *lock) +babel_iface_locked(void *_ifa) { - struct babel_iface *ifa = lock->data; + struct babel_iface *ifa = _ifa; struct babel_proto *p = ifa->proto; if (!babel_open_socket(ifa)) @@ -1857,8 +1856,11 @@ babel_add_iface(struct babel_proto *p, struct iface *new, struct babel_iface_con lock->addr = IP6_BABEL_ROUTERS; lock->port = ifa->cf->port; lock->iface = ifa->iface; - lock->hook = babel_iface_locked; - lock->data = ifa; + lock->event = (event) { + .hook = babel_iface_locked, + .data = ifa, + }; + lock->target = &global_event_list; olock_acquire(lock); } @@ -1980,11 +1982,9 @@ babel_reconfigure_iface(struct babel_proto *p, struct babel_iface *ifa, struct b static void babel_reconfigure_ifaces(struct babel_proto *p, struct babel_config *cf) { - struct iface *iface; - - WALK_LIST(iface, iface_list) + IFACE_WALK(iface) { - if (p->p.vrf_set && p->p.vrf != iface->master) + if (p->p.vrf && p->p.vrf != iface->master) continue; if (!(iface->flags & IF_UP)) @@ -2106,41 +2106,45 @@ babel_dump(struct proto *P) } static void -babel_get_route_info(rte *rte, byte *buf) +babel_get_route_info(const rte *rte, byte *buf) { u64 rid = 0; - eattr *e = ea_find(rte->attrs->eattrs, EA_BABEL_ROUTER_ID); + eattr *e = ea_find(rte->attrs, &ea_babel_router_id); if (e) memcpy(&rid, e->u.ptr->data, sizeof(u64)); - buf += bsprintf(buf, " (%d/%d) [%lR]", rte->attrs->pref, - ea_get_int(rte->attrs->eattrs, EA_BABEL_METRIC, BABEL_INFINITY), rid); + buf += bsprintf(buf, " (%d/%d) [%lR]", + rt_get_preference(rte), + ea_get_int(rte->attrs, &ea_babel_metric, BABEL_INFINITY), rid); } -static int -babel_get_attr(const eattr *a, byte *buf, int buflen UNUSED) +static void +babel_router_id_format(const eattr *a, byte *buf, uint len) { - switch (a->id) - { - case EA_BABEL_METRIC: - bsprintf(buf, "metric: %d", a->u.data); - return GA_FULL; + u64 rid = 0; + memcpy(&rid, a->u.ptr->data, sizeof(u64)); + bsnprintf(buf, len, "%lR", rid); +} - case EA_BABEL_ROUTER_ID: - { - u64 rid = 0; - memcpy(&rid, a->u.ptr->data, sizeof(u64)); - bsprintf(buf, "router_id: %lR", rid); - return GA_FULL; - } +static struct ea_class ea_babel_metric = { + .name = "babel_metric", + .type = T_INT, +}; - case EA_BABEL_SEQNO: - return GA_HIDDEN; +static struct ea_class ea_babel_router_id = { + .name = "babel_router_id", + .type = T_OPAQUE, + .readonly = 1, + .format = babel_router_id_format, +}; + +static struct ea_class ea_babel_seqno = { + .name = "babel_seqno", + .type = T_INT, + .readonly = 1, + .hidden = 1, +}; - default: - return GA_UNKNOWN; - } -} void babel_show_interfaces(struct proto *P, const char *iff) @@ -2340,9 +2344,13 @@ babel_kick_timer(struct babel_proto *p) static int babel_preexport(struct channel *C, struct rte *new) { - struct rta *a = new->attrs; + if (new->src->owner != &C->proto->sources) + return 0; + /* Reject our own unreachable routes */ - if ((a->dest == RTD_UNREACHABLE) && (new->src->proto == C->proto)) + eattr *ea = ea_find(new->attrs, &ea_gen_nexthop); + struct nexthop_adata *nhad = (void *) ea->u.ptr; + if (!NEXTHOP_IS_REACHABLE(nhad)) return -1; return 0; @@ -2353,8 +2361,8 @@ babel_preexport(struct channel *C, struct rte *new) * so store it into our data structures. */ static void -babel_rt_notify(struct proto *P, struct channel *c UNUSED, struct network *net, - struct rte *new, struct rte *old UNUSED) +babel_rt_notify(struct proto *P, struct channel *c UNUSED, const net_addr *net, + struct rte *new, const struct rte *old UNUSED) { struct babel_proto *p = (void *) P; struct babel_entry *e; @@ -2363,13 +2371,13 @@ babel_rt_notify(struct proto *P, struct channel *c UNUSED, struct network *net, { /* Update */ uint rt_seqno; - uint rt_metric = ea_get_int(new->attrs->eattrs, EA_BABEL_METRIC, 0); + uint rt_metric = ea_get_int(new->attrs, &ea_babel_metric, 0); u64 rt_router_id = 0; - if (new->src->proto == P) + if (new->src->owner == &P->sources) { - rt_seqno = ea_find(new->attrs->eattrs, EA_BABEL_SEQNO)->u.data; - eattr *e = ea_find(new->attrs->eattrs, EA_BABEL_ROUTER_ID); + rt_seqno = ea_get_int(new->attrs, &ea_babel_seqno, 0); + eattr *e = ea_find(new->attrs, &ea_babel_router_id); if (e) memcpy(&rt_router_id, e->u.ptr->data, sizeof(u64)); } @@ -2382,11 +2390,11 @@ babel_rt_notify(struct proto *P, struct channel *c UNUSED, struct network *net, if (rt_metric > BABEL_INFINITY) { log(L_WARN "%s: Invalid babel_metric value %u for route %N", - p->p.name, rt_metric, net->n.addr); + p->p.name, rt_metric, net); rt_metric = BABEL_INFINITY; } - e = babel_get_entry(p, net->n.addr); + e = babel_get_entry(p, net); /* Activate triggered updates */ if ((e->valid != BABEL_ENTRY_VALID) || @@ -2404,7 +2412,7 @@ babel_rt_notify(struct proto *P, struct channel *c UNUSED, struct network *net, else { /* Withdraw */ - e = babel_find_entry(p, net->n.addr); + e = babel_find_entry(p, net); if (!e || e->valid != BABEL_ENTRY_VALID) return; @@ -2418,18 +2426,18 @@ babel_rt_notify(struct proto *P, struct channel *c UNUSED, struct network *net, } static int -babel_rte_better(struct rte *new, struct rte *old) +babel_rte_better(const rte *new, const rte *old) { - uint new_metric = ea_get_int(new->attrs->eattrs, EA_BABEL_METRIC, BABEL_INFINITY); - uint old_metric = ea_get_int(old->attrs->eattrs, EA_BABEL_METRIC, BABEL_INFINITY); + uint new_metric = ea_get_int(new->attrs, &ea_babel_metric, BABEL_INFINITY); + uint old_metric = ea_get_int(old->attrs, &ea_babel_metric, BABEL_INFINITY); return new_metric < old_metric; } static u32 -babel_rte_igp_metric(struct rte *rt) +babel_rte_igp_metric(const rte *rt) { - return ea_get_int(rt->attrs->eattrs, EA_BABEL_METRIC, BABEL_INFINITY); + return ea_get_int(rt->attrs, &ea_babel_metric, BABEL_INFINITY); } @@ -2450,6 +2458,12 @@ babel_postconfig(struct proto_config *CF) cf->ip6_channel = ip6 ?: ip6_sadr; } +static struct rte_owner_class babel_rte_owner_class = { + .get_route_info = babel_get_route_info, + .rte_better = babel_rte_better, + .rte_igp_metric = babel_rte_igp_metric, +}; + static struct proto * babel_init(struct proto_config *CF) { @@ -2460,11 +2474,11 @@ babel_init(struct proto_config *CF) proto_configure_channel(P, &p->ip4_channel, cf->ip4_channel); proto_configure_channel(P, &p->ip6_channel, cf->ip6_channel); - P->if_notify = babel_if_notify; + P->iface_sub.if_notify = babel_if_notify; P->rt_notify = babel_rt_notify; P->preexport = babel_preexport; - P->rte_better = babel_rte_better; - P->rte_igp_metric = babel_rte_igp_metric; + + P->sources.class = &babel_rte_owner_class; return P; } @@ -2562,11 +2576,9 @@ babel_reconfigure(struct proto *P, struct proto_config *CF) return 1; } - struct protocol proto_babel = { .name = "Babel", .template = "babel%d", - .class = PROTOCOL_BABEL, .preference = DEF_PREF_BABEL, .channel_mask = NB_IP | NB_IP6_SADR, .proto_size = sizeof(struct babel_proto), @@ -2577,12 +2589,16 @@ struct protocol proto_babel = { .start = babel_start, .shutdown = babel_shutdown, .reconfigure = babel_reconfigure, - .get_route_info = babel_get_route_info, - .get_attr = babel_get_attr }; void babel_build(void) { proto_build(&proto_babel); + + EA_REGISTER_ALL( + &ea_babel_metric, + &ea_babel_router_id, + &ea_babel_seqno + ); } diff --git a/proto/babel/babel.h b/proto/babel/babel.h index dcd303e1..562abac2 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -16,7 +16,7 @@ #include "nest/bird.h" #include "nest/cli.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/locks.h" #include "nest/password.h" @@ -26,10 +26,6 @@ #include "lib/string.h" #include "lib/timer.h" -#define EA_BABEL_METRIC EA_CODE(PROTOCOL_BABEL, 0) -#define EA_BABEL_ROUTER_ID EA_CODE(PROTOCOL_BABEL, 1) -#define EA_BABEL_SEQNO EA_CODE(PROTOCOL_BABEL, 2) - #define BABEL_MAGIC 42 #define BABEL_VERSION 2 #define BABEL_PORT 6696 diff --git a/proto/babel/config.Y b/proto/babel/config.Y index 1b4dc6f5..6d1ad7d0 100644 --- a/proto/babel/config.Y +++ b/proto/babel/config.Y @@ -24,7 +24,7 @@ CF_DECLS CF_KEYWORDS(BABEL, INTERFACE, METRIC, RXCOST, HELLO, UPDATE, INTERVAL, PORT, TYPE, WIRED, WIRELESS, RX, TX, BUFFER, PRIORITY, LENGTH, CHECK, LINK, - NEXT, HOP, IPV4, IPV6, BABEL_METRIC, SHOW, INTERFACES, NEIGHBORS, + NEXT, HOP, IPV4, IPV6, SHOW, INTERFACES, NEIGHBORS, ENTRIES, RANDOMIZE, ROUTER, ID, AUTHENTICATION, NONE, MAC, PERMISSIVE, EXTENDED) @@ -166,8 +166,6 @@ babel_iface_opt_list: babel_iface: babel_iface_start iface_patt_list_nopx babel_iface_opt_list babel_iface_finish; -dynamic_attr: BABEL_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_BABEL_METRIC); } ; - CF_CLI_HELP(SHOW BABEL, ..., [[Show information about Babel protocol]]); CF_CLI(SHOW BABEL INTERFACES, optproto opttext, [<name>] [\"<interface>\"], [[Show information about Babel interfaces]]) diff --git a/proto/babel/packets.c b/proto/babel/packets.c index 61c94cc5..d26ee5c6 100644 --- a/proto/babel/packets.c +++ b/proto/babel/packets.c @@ -1666,7 +1666,7 @@ babel_open_socket(struct babel_iface *ifa) sk->ttl = 1; sk->flags = SKF_LADDR_RX; - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; if (sk_setup_multicast(sk) < 0) diff --git a/proto/bfd/Makefile b/proto/bfd/Makefile index d9aecfa9..267dff98 100644 --- a/proto/bfd/Makefile +++ b/proto/bfd/Makefile @@ -1,4 +1,4 @@ -src := bfd.c io.c packets.c +src := bfd.c packets.c obj := $(src-o-files) $(all-daemon) $(cf-local) diff --git a/proto/bfd/bfd.c b/proto/bfd/bfd.c index 873e2ed5..c88c1cb2 100644 --- a/proto/bfd/bfd.c +++ b/proto/bfd/bfd.c @@ -82,7 +82,7 @@ * BFD thread to the main thread. This is done in an asynchronous way, sesions * with pending notifications are linked (in the BFD thread) to @notify_list in * &bfd_proto, and then bfd_notify_hook() in the main thread is activated using - * bfd_notify_kick() and a pipe. The hook then processes scheduled sessions and + * a standard event sending code. The hook then processes scheduled sessions and * calls hooks from associated BFD requests. This @notify_list (and state fields * in structure &bfd_session) is protected by a spinlock in &bfd_proto and * functions bfd_lock_sessions() / bfd_unlock_sessions(). @@ -113,15 +113,22 @@ #define HASH_IP_EQ(a1,n1,a2,n2) ipa_equal(a1, a2) && n1 == n2 #define HASH_IP_FN(a,n) ipa_hash(a) ^ u32_hash(n) -static list STATIC_LIST_INIT(bfd_proto_list); -static list STATIC_LIST_INIT(bfd_wait_list); +#define BFD_LOCK LOCK_DOMAIN(rtable, bfd_global.lock) +#define BFD_UNLOCK UNLOCK_DOMAIN(rtable, bfd_global.lock) + +static struct { + DOMAIN(rtable) lock; + list wait_list; + list pickup_list; + list proto_list; + uint pickup_reload; +} bfd_global; const char *bfd_state_names[] = { "AdminDown", "Down", "Init", "Up" }; static void bfd_session_set_min_tx(struct bfd_session *s, u32 val); static struct bfd_iface *bfd_get_iface(struct bfd_proto *p, ip_addr local, struct iface *iface); static void bfd_free_iface(struct bfd_iface *ifa); -static inline void bfd_notify_kick(struct bfd_proto *p); /* @@ -170,7 +177,7 @@ bfd_session_update_state(struct bfd_session *s, uint state, uint diag) bfd_session_set_min_tx(s, s->cf.idle_tx_int); if (notify) - bfd_notify_kick(p); + ev_send(&global_event_list, &p->notify_event); } static void @@ -188,7 +195,7 @@ bfd_session_update_tx_interval(struct bfd_session *s) return; /* Set timer relative to last tx_timer event */ - tm_set(s->tx_timer, s->last_tx + tx_int_l); + tm_set_in(s->tx_timer, s->last_tx + tx_int_l, s->ifa->bfd->p.loop); } static void @@ -202,7 +209,7 @@ bfd_session_update_detection_time(struct bfd_session *s, int kick) if (!s->last_rx) return; - tm_set(s->hold_timer, s->last_rx + timeout); + tm_set_in(s->hold_timer, s->last_rx + timeout, s->ifa->bfd->p.loop); } static void @@ -226,7 +233,7 @@ bfd_session_control_tx_timer(struct bfd_session *s, int reset) if (reset || !tm_active(s->tx_timer)) { s->last_tx = 0; - tm_start(s->tx_timer, 0); + tm_start_in(s->tx_timer, 0, s->ifa->bfd->p.loop); } return; @@ -419,7 +426,7 @@ bfd_get_free_id(struct bfd_proto *p) static struct bfd_session * bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface *iface, struct bfd_options *opts) { - birdloop_enter(p->loop); + ASSERT_DIE(birdloop_inside(p->p.loop)); struct bfd_iface *ifa = bfd_get_iface(p, local, iface); @@ -454,8 +461,6 @@ bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface * TRACE(D_EVENTS, "Session to %I added", s->addr); - birdloop_leave(p->loop); - return s; } @@ -463,38 +468,34 @@ bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface * static void bfd_open_session(struct bfd_proto *p, struct bfd_session *s, ip_addr local, struct iface *ifa) { - birdloop_enter(p->loop); + birdloop_enter(p->p.loop); s->opened = 1; bfd_session_control_tx_timer(s); - birdloop_leave(p->loop); + birdloop_leave(p->p.loop); } static void bfd_close_session(struct bfd_proto *p, struct bfd_session *s) { - birdloop_enter(p->loop); + birdloop_enter(p->p.loop); s->opened = 0; bfd_session_update_state(s, BFD_STATE_DOWN, BFD_DIAG_PATH_DOWN); bfd_session_control_tx_timer(s); - birdloop_leave(p->loop); + birdloop_leave(p->p.loop); } */ static void -bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) +bfd_remove_session_locked(struct bfd_proto *p, struct bfd_session *s) { - ip_addr ip = s->addr; - /* Caller should ensure that request list is empty */ - birdloop_enter(p->loop); - /* Remove session from notify list if scheduled for notification */ /* No need for bfd_lock_sessions(), we are already protected by birdloop_enter() */ if (NODE_VALID(&s->n)) @@ -508,11 +509,17 @@ bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) HASH_REMOVE(p->session_hash_id, HASH_ID, s); HASH_REMOVE(p->session_hash_ip, HASH_IP, s); - sl_free(s); + TRACE(D_EVENTS, "Session to %I removed", s->addr); - TRACE(D_EVENTS, "Session to %I removed", ip); + sl_free(s); +} - birdloop_leave(p->loop); +static void +bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) +{ + birdloop_enter(p->p.loop); + bfd_remove_session_locked(p, s); + birdloop_leave(p->p.loop); } static void @@ -521,7 +528,7 @@ bfd_reconfigure_session(struct bfd_proto *p, struct bfd_session *s) if (EMPTY_LIST(s->request_list)) return; - birdloop_enter(p->loop); + birdloop_enter(p->p.loop); struct bfd_request *req = SKIP_BACK(struct bfd_request, n, HEAD(s->request_list)); s->cf = bfd_merge_options(s->ifa->cf, &req->opts); @@ -534,7 +541,7 @@ bfd_reconfigure_session(struct bfd_proto *p, struct bfd_session *s) bfd_session_control_tx_timer(s, 0); - birdloop_leave(p->loop); + birdloop_leave(p->p.loop); TRACE(D_EVENTS, "Session to %I reconfigured", s->addr); } @@ -597,16 +604,10 @@ bfd_free_iface(struct bfd_iface *ifa) return; if (ifa->sk) - { - sk_stop(ifa->sk); rfree(ifa->sk); - } if (ifa->rx) - { - sk_stop(ifa->rx); rfree(ifa->rx); - } rem_node(&ifa->n); mb_free(ifa); @@ -627,9 +628,9 @@ bfd_reconfigure_iface(struct bfd_proto *p, struct bfd_iface *ifa, struct bfd_con (new->passive != old->passive); /* This should be probably changed to not access ifa->cf from the BFD thread */ - birdloop_enter(p->loop); + birdloop_enter(p->p.loop); ifa->cf = new; - birdloop_leave(p->loop); + birdloop_leave(p->p.loop); } @@ -651,7 +652,17 @@ bfd_request_notify(struct bfd_request *req, u8 state, u8 diag) req->down = (old_state == BFD_STATE_UP) && (state == BFD_STATE_DOWN); if (req->hook) + { + struct birdloop *target = !birdloop_inside(req->target) ? req->target : NULL; + + if (target) + birdloop_enter(target); + req->hook(req); + + if (target) + birdloop_leave(target); + } } static int @@ -659,20 +670,30 @@ bfd_add_request(struct bfd_proto *p, struct bfd_request *req) { struct bfd_config *cf = (struct bfd_config *) (p->p.cf); - if (p->p.vrf_set && (p->p.vrf != req->vrf)) + if (p->p.vrf && (p->p.vrf != req->vrf)) + { + TRACE(D_EVENTS, "Not accepting request to %I with different VRF", req->addr); return 0; + } if (ipa_is_ip4(req->addr) ? !cf->accept_ipv4 : !cf->accept_ipv6) + { + TRACE(D_EVENTS, "Not accepting request to %I (AF limit)", req->addr); return 0; + } if (req->iface ? !cf->accept_direct : !cf->accept_multihop) + { + TRACE(D_EVENTS, "Not accepting %s request to %I", req->iface ? "direct" : "multihop", req->addr); return 0; + } uint ifindex = req->iface ? req->iface->index : 0; struct bfd_session *s = bfd_find_session_by_addr(p, req->addr, ifindex); - u8 state, diag; - if (!s) + if (s) + TRACE(D_EVENTS, "Session to %I reused", s->addr); + else s = bfd_add_session(p, req->addr, req->local, req->iface, &req->opts); rem_node(&req->n); @@ -680,51 +701,129 @@ bfd_add_request(struct bfd_proto *p, struct bfd_request *req) req->session = s; bfd_lock_sessions(p); - state = s->loc_state; - diag = s->loc_diag; + + int notify = !NODE_VALID(&s->n); + if (notify) + add_tail(&p->notify_list, &s->n); + bfd_unlock_sessions(p); - bfd_request_notify(req, state, diag); + if (notify) + ev_send(&global_event_list, &p->notify_event); return 1; } static void -bfd_submit_request(struct bfd_request *req) +bfd_pickup_requests(void *_data UNUSED) { - node *n; + /* NOTE TO MY FUTURE SELF + * + * Functions bfd_take_requests() and bfd_drop_requests() need to have + * consistent &bfd_global.wait_list and this is ensured only by having these + * functions called from bfd_start() and bfd_shutdown() which are both called + * in PROTO_LOCKED_FROM_MAIN context, i.e. always from &main_birdloop. + * + * This pickup event is also called in &main_birdloop, therefore we can + * freely do BFD_LOCK/BFD_UNLOCK while processing all the requests. All BFD + * protocols capable of bfd_add_request() are either started before this code + * happens or after that. + * + * If BFD protocols could start in parallel with this routine, they might + * miss some of the waiting requests, thus if anybody tries to start + * protocols or run this pickup event outside &main_birdloop in future, they + * shall ensure that this race condition is mitigated somehow. + * + * Thank you, my future self, for understanding. Have a nice day! + */ + + DBG("BFD pickup loop starting"); + + BFD_LOCK; + do { + bfd_global.pickup_reload = 0; + BFD_UNLOCK; + + node *n; + WALK_LIST(n, bfd_global.proto_list) + { + struct bfd_proto *p = SKIP_BACK(struct bfd_proto, bfd_node, n); + birdloop_enter(p->p.loop); + BFD_LOCK; + + TRACE(D_EVENTS, "Picking up new requests (%d available)", list_length(&bfd_global.pickup_list)); + + node *rn, *rnxt; + WALK_LIST_DELSAFE(rn, rnxt, bfd_global.pickup_list) + bfd_add_request(p, SKIP_BACK(struct bfd_request, n, rn)); + + BFD_UNLOCK; + + /* Remove sessions with no requests */ + HASH_WALK_DELSAFE(p->session_hash_id, next_id, s) + { + if (EMPTY_LIST(s->request_list)) + bfd_remove_session_locked(p, s); + } + HASH_WALK_END; - WALK_LIST(n, bfd_proto_list) - if (bfd_add_request(SKIP_BACK(struct bfd_proto, bfd_node, n), req)) - return; + birdloop_leave(p->p.loop); + } - rem_node(&req->n); - add_tail(&bfd_wait_list, &req->n); - req->session = NULL; - bfd_request_notify(req, BFD_STATE_ADMIN_DOWN, 0); + BFD_LOCK; + } while (bfd_global.pickup_reload); + + list tmp_list; + init_list(&tmp_list); + add_tail_list(&tmp_list, &bfd_global.pickup_list); + + init_list(&bfd_global.pickup_list); + BFD_UNLOCK; + + log(L_TRACE "No protocol for %d BFD requests", list_length(&tmp_list)); + + node *n; + WALK_LIST(n, tmp_list) + bfd_request_notify(SKIP_BACK(struct bfd_request, n, n), BFD_STATE_ADMIN_DOWN, 0); + + BFD_LOCK; + add_tail_list(&bfd_global.wait_list, &tmp_list); + BFD_UNLOCK; } +static event bfd_pickup_event = { .hook = bfd_pickup_requests }; + static void bfd_take_requests(struct bfd_proto *p) { node *n, *nn; - - WALK_LIST_DELSAFE(n, nn, bfd_wait_list) + BFD_LOCK; + WALK_LIST_DELSAFE(n, nn, bfd_global.wait_list) bfd_add_request(p, SKIP_BACK(struct bfd_request, n, n)); + BFD_UNLOCK; } static void bfd_drop_requests(struct bfd_proto *p) { node *n; - - HASH_WALK(p->session_hash_id, next_id, s) + BFD_LOCK; + HASH_WALK_DELSAFE(p->session_hash_id, next_id, s) { - /* We assume that p is not in bfd_proto_list */ WALK_LIST_FIRST(n, s->request_list) - bfd_submit_request(SKIP_BACK(struct bfd_request, n, n)); + { + struct bfd_request *req = SKIP_BACK(struct bfd_request, n, n); + rem_node(&req->n); + add_tail(&bfd_global.pickup_list, &req->n); + req->session = NULL; + } + + ev_send(&global_event_list, &bfd_pickup_event); + + bfd_remove_session_locked(p, s); } HASH_WALK_END; + BFD_UNLOCK; } static struct resclass bfd_request_class; @@ -733,13 +832,11 @@ struct bfd_request * bfd_request_session(pool *p, ip_addr addr, ip_addr local, struct iface *iface, struct iface *vrf, void (*hook)(struct bfd_request *), void *data, + struct birdloop *target, const struct bfd_options *opts) { struct bfd_request *req = ralloc(p, &bfd_request_class); - /* Hack: self-link req->n, we will call rem_node() on it */ - req->n.prev = req->n.next = &req->n; - req->addr = addr; req->local = local; req->iface = iface; @@ -748,10 +845,19 @@ bfd_request_session(pool *p, ip_addr addr, ip_addr local, if (opts) req->opts = *opts; - bfd_submit_request(req); - + ASSERT_DIE(target || !hook); req->hook = hook; req->data = data; + req->target = target; + + req->session = NULL; + + BFD_LOCK; + bfd_global.pickup_reload++; + add_tail(&bfd_global.pickup_list, &req->n); + ev_send(&global_event_list, &bfd_pickup_event); + DBG("New BFD request enlisted.\n"); + BFD_UNLOCK; return req; } @@ -774,19 +880,16 @@ static void bfd_request_free(resource *r) { struct bfd_request *req = (struct bfd_request *) r; - struct bfd_session *s = req->session; + BFD_LOCK; rem_node(&req->n); + BFD_UNLOCK; - /* Remove the session if there is no request for it. Skip that if - inside notify hooks, will be handled by bfd_notify_hook() itself */ - - if (s && EMPTY_LIST(s->request_list) && !s->notify_running) - bfd_remove_session(s->ifa->bfd, s); + ev_send(&global_event_list, &bfd_pickup_event); } static void -bfd_request_dump(resource *r) +bfd_request_dump(resource *r, unsigned indent UNUSED) { struct bfd_request *req = (struct bfd_request *) r; @@ -819,7 +922,7 @@ bfd_neigh_notify(struct neighbor *nb) if ((nb->scope > 0) && !n->req) { ip_addr local = ipa_nonzero(n->local) ? n->local : nb->ifa->ip; - n->req = bfd_request_session(p->p.pool, n->addr, local, nb->iface, p->p.vrf, NULL, NULL, NULL); + n->req = bfd_request_session(p->p.pool, n->addr, local, nb->iface, p->p.vrf, NULL, NULL, NULL, NULL); } if ((nb->scope <= 0) && n->req) @@ -836,7 +939,7 @@ bfd_start_neighbor(struct bfd_proto *p, struct bfd_neighbor *n) if (n->multihop) { - n->req = bfd_request_session(p->p.pool, n->addr, n->local, NULL, p->p.vrf, NULL, NULL, NULL); + n->req = bfd_request_session(p->p.pool, n->addr, n->local, NULL, p->p.vrf, NULL, NULL, NULL, NULL); return; } @@ -916,21 +1019,15 @@ bfd_reconfigure_neighbors(struct bfd_proto *p, struct bfd_config *new) /* This core notify code should be replaced after main loop transition to birdloop */ -int pipe(int pipefd[2]); -void pipe_drain(int fd); -void pipe_kick(int fd); - -static int -bfd_notify_hook(sock *sk, uint len UNUSED) +static void +bfd_notify_hook(void *data) { - struct bfd_proto *p = sk->data; + struct bfd_proto *p = data; struct bfd_session *s; list tmp_list; u8 state, diag; node *n, *nn; - pipe_drain(sk->fd); - bfd_lock_sessions(p); init_list(&tmp_list); add_tail_list(&tmp_list, &p->notify_list); @@ -945,64 +1042,15 @@ bfd_notify_hook(sock *sk, uint len UNUSED) diag = s->loc_diag; bfd_unlock_sessions(p); - s->notify_running = 1; WALK_LIST_DELSAFE(n, nn, s->request_list) bfd_request_notify(SKIP_BACK(struct bfd_request, n, n), state, diag); - s->notify_running = 0; /* Remove the session if all requests were removed in notify hooks */ if (EMPTY_LIST(s->request_list)) bfd_remove_session(p, s); } - - return 0; } -static inline void -bfd_notify_kick(struct bfd_proto *p) -{ - pipe_kick(p->notify_ws->fd); -} - -static void -bfd_noterr_hook(sock *sk, int err) -{ - struct bfd_proto *p = sk->data; - log(L_ERR "%s: Notify socket error: %m", p->p.name, err); -} - -static void -bfd_notify_init(struct bfd_proto *p) -{ - int pfds[2]; - sock *sk; - - int rv = pipe(pfds); - if (rv < 0) - die("pipe: %m"); - - sk = sk_new(p->p.pool); - sk->type = SK_MAGIC; - sk->rx_hook = bfd_notify_hook; - sk->err_hook = bfd_noterr_hook; - sk->fd = pfds[0]; - sk->data = p; - if (sk_open(sk) < 0) - die("bfd: sk_open failed"); - p->notify_rs = sk; - - /* The write sock is not added to any event loop */ - sk = sk_new(p->p.pool); - sk->type = SK_MAGIC; - sk->fd = pfds[1]; - sk->data = p; - sk->flags = SKF_THREAD; - if (sk_open(sk) < 0) - die("bfd: sk_open failed"); - p->notify_ws = sk; -} - - /* * BFD protocol glue */ @@ -1012,7 +1060,7 @@ bfd_init(struct proto_config *c) { struct proto *p = proto_new(c); - p->neigh_notify = bfd_neigh_notify; + p->iface_sub.neigh_notify = bfd_neigh_notify; return p; } @@ -1023,10 +1071,10 @@ bfd_start(struct proto *P) struct bfd_proto *p = (struct bfd_proto *) P; struct bfd_config *cf = (struct bfd_config *) (P->cf); - p->loop = birdloop_new(); - p->tpool = rp_new(NULL, "BFD thread root"); pthread_spin_init(&p->lock, PTHREAD_PROCESS_PRIVATE); + p->tpool = rp_new(P->pool, "BFD loop pool"); + p->session_slab = sl_new(P->pool, sizeof(struct bfd_session)); HASH_INIT(p->session_hash_id, P->pool, 8); HASH_INIT(p->session_hash_ip, P->pool, 8); @@ -1034,11 +1082,12 @@ bfd_start(struct proto *P) init_list(&p->iface_list); init_list(&p->notify_list); - bfd_notify_init(p); - - add_tail(&bfd_proto_list, &p->bfd_node); + p->notify_event = (event) { + .hook = bfd_notify_hook, + .data = p, + }; - birdloop_enter(p->loop); + add_tail(&bfd_global.proto_list, &p->bfd_node); if (!cf->strict_bind) { @@ -1055,43 +1104,29 @@ bfd_start(struct proto *P) p->rx6_m = bfd_open_rx_sk(p, 1, SK_IPV6); } - birdloop_leave(p->loop); - bfd_take_requests(p); struct bfd_neighbor *n; WALK_LIST(n, cf->neigh_list) bfd_start_neighbor(p, n); - birdloop_start(p->loop); - return PS_UP; } - static int bfd_shutdown(struct proto *P) { struct bfd_proto *p = (struct bfd_proto *) P; - struct bfd_config *cf = (struct bfd_config *) (P->cf); + struct bfd_config *cf = (struct bfd_config *) (p->p.cf); rem_node(&p->bfd_node); - birdloop_stop(p->loop); - - struct bfd_neighbor *n; - WALK_LIST(n, cf->neigh_list) - bfd_stop_neighbor(p, n); + struct bfd_neighbor *bn; + WALK_LIST(bn, cf->neigh_list) + bfd_stop_neighbor(p, bn); bfd_drop_requests(p); - /* FIXME: This is hack */ - birdloop_enter(p->loop); - rfree(p->tpool); - birdloop_leave(p->loop); - - birdloop_free(p->loop); - return PS_DOWN; } @@ -1111,7 +1146,7 @@ bfd_reconfigure(struct proto *P, struct proto_config *c) (new->strict_bind != old->strict_bind)) return 0; - birdloop_mask_wakeups(p->loop); + birdloop_mask_wakeups(p->p.loop); WALK_LIST(ifa, p->iface_list) bfd_reconfigure_iface(p, ifa, new); @@ -1125,7 +1160,7 @@ bfd_reconfigure(struct proto *P, struct proto_config *c) bfd_reconfigure_neighbors(p, new); - birdloop_unmask_wakeups(p->loop); + birdloop_unmask_wakeups(p->p.loop); return 1; } @@ -1183,7 +1218,6 @@ bfd_show_sessions(struct proto *P) struct protocol proto_bfd = { .name = "BFD", .template = "bfd%d", - .class = PROTOCOL_BFD, .proto_size = sizeof(struct bfd_proto), .config_size = sizeof(struct bfd_config), .init = bfd_init, @@ -1197,4 +1231,9 @@ void bfd_build(void) { proto_build(&proto_bfd); + + bfd_global.lock = DOMAIN_NEW(rtable, "BFD Global"); + init_list(&bfd_global.wait_list); + init_list(&bfd_global.pickup_list); + init_list(&bfd_global.proto_list); } diff --git a/proto/bfd/bfd.h b/proto/bfd/bfd.h index 7caf9f66..a4b7d63c 100644 --- a/proto/bfd/bfd.h +++ b/proto/bfd/bfd.h @@ -13,16 +13,16 @@ #include "nest/cli.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/password.h" #include "conf/conf.h" #include "lib/hash.h" +#include "lib/io-loop.h" #include "lib/resource.h" #include "lib/socket.h" #include "lib/string.h" #include "nest/bfd.h" -#include "io.h" #define BFD_CONTROL_PORT 3784 @@ -88,17 +88,18 @@ struct bfd_neighbor struct bfd_proto { struct proto p; - struct birdloop *loop; - pool *tpool; + pthread_spinlock_t lock; + + pool *tpool; + node bfd_node; slab *session_slab; HASH(struct bfd_session) session_hash_id; HASH(struct bfd_session) session_hash_ip; - sock *notify_rs; - sock *notify_ws; + event notify_event; list notify_list; sock *rx4_1; @@ -164,7 +165,6 @@ struct bfd_session list request_list; /* List of client requests (struct bfd_request) */ btime last_state_change; /* Time of last state change */ - u8 notify_running; /* 1 if notify hooks are running */ u8 rx_csn_known; /* Received crypto sequence number is known */ u32 rx_csn; /* Last received crypto sequence number */ diff --git a/proto/bfd/config.Y b/proto/bfd/config.Y index 70461872..0d6e33fa 100644 --- a/proto/bfd/config.Y +++ b/proto/bfd/config.Y @@ -37,6 +37,7 @@ proto: bfd_proto ; bfd_proto_start: proto_start BFD { this_proto = proto_config_new(&proto_bfd, $1); + this_proto->loop_order = DOMAIN_ORDER(proto); init_list(&BFD_CFG->patt_list); init_list(&BFD_CFG->neigh_list); BFD_CFG->accept_ipv4 = BFD_CFG->accept_ipv6 = 1; diff --git a/proto/bfd/io.c b/proto/bfd/io.c deleted file mode 100644 index e696cc89..00000000 --- a/proto/bfd/io.c +++ /dev/null @@ -1,537 +0,0 @@ -/* - * BIRD -- I/O and event loop - * - * Can be freely distributed and used under the terms of the GNU GPL. - */ - -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <errno.h> -#include <fcntl.h> -#include <poll.h> -#include <pthread.h> -#include <time.h> -#include <sys/time.h> - -#include "nest/bird.h" -#include "proto/bfd/io.h" - -#include "lib/buffer.h" -#include "lib/lists.h" -#include "lib/resource.h" -#include "lib/event.h" -#include "lib/timer.h" -#include "lib/socket.h" - - -struct birdloop -{ - pool *pool; - pthread_t thread; - pthread_mutex_t mutex; - - u8 stop_called; - u8 poll_active; - u8 wakeup_masked; - int wakeup_fds[2]; - - struct timeloop time; - list event_list; - list sock_list; - uint sock_num; - - BUFFER(sock *) poll_sk; - BUFFER(struct pollfd) poll_fd; - u8 poll_changed; - u8 close_scheduled; -}; - - -/* - * Current thread context - */ - -static pthread_key_t current_loop_key; -extern pthread_key_t current_time_key; - -static inline struct birdloop * -birdloop_current(void) -{ - return pthread_getspecific(current_loop_key); -} - -static inline void -birdloop_set_current(struct birdloop *loop) -{ - pthread_setspecific(current_loop_key, loop); - pthread_setspecific(current_time_key, loop ? &loop->time : &main_timeloop); -} - -static inline void -birdloop_init_current(void) -{ - pthread_key_create(¤t_loop_key, NULL); -} - - -/* - * Wakeup code for birdloop - */ - -static void -pipe_new(int *pfds) -{ - int rv = pipe(pfds); - if (rv < 0) - die("pipe: %m"); - - if (fcntl(pfds[0], F_SETFL, O_NONBLOCK) < 0) - die("fcntl(O_NONBLOCK): %m"); - - if (fcntl(pfds[1], F_SETFL, O_NONBLOCK) < 0) - die("fcntl(O_NONBLOCK): %m"); -} - -void -pipe_drain(int fd) -{ - char buf[64]; - int rv; - - try: - rv = read(fd, buf, 64); - if (rv < 0) - { - if (errno == EINTR) - goto try; - if (errno == EAGAIN) - return; - die("wakeup read: %m"); - } - if (rv == 64) - goto try; -} - -void -pipe_kick(int fd) -{ - u64 v = 1; - int rv; - - try: - rv = write(fd, &v, sizeof(u64)); - if (rv < 0) - { - if (errno == EINTR) - goto try; - if (errno == EAGAIN) - return; - die("wakeup write: %m"); - } -} - -static inline void -wakeup_init(struct birdloop *loop) -{ - pipe_new(loop->wakeup_fds); -} - -static inline void -wakeup_drain(struct birdloop *loop) -{ - pipe_drain(loop->wakeup_fds[0]); -} - -static inline void -wakeup_do_kick(struct birdloop *loop) -{ - pipe_kick(loop->wakeup_fds[1]); -} - -static inline void -wakeup_kick(struct birdloop *loop) -{ - if (!loop->wakeup_masked) - wakeup_do_kick(loop); - else - loop->wakeup_masked = 2; -} - -/* For notifications from outside */ -void -wakeup_kick_current(void) -{ - struct birdloop *loop = birdloop_current(); - - if (loop && loop->poll_active) - wakeup_kick(loop); -} - - -/* - * Events - */ - -static inline uint -events_waiting(struct birdloop *loop) -{ - return !EMPTY_LIST(loop->event_list); -} - -static inline void -events_init(struct birdloop *loop) -{ - init_list(&loop->event_list); -} - -static void -events_fire(struct birdloop *loop) -{ - times_update(&loop->time); - ev_run_list(&loop->event_list); -} - -void -ev2_schedule(event *e) -{ - struct birdloop *loop = birdloop_current(); - - if (loop->poll_active && EMPTY_LIST(loop->event_list)) - wakeup_kick(loop); - - if (e->n.next) - rem_node(&e->n); - - add_tail(&loop->event_list, &e->n); -} - - -/* - * Sockets - */ - -static void -sockets_init(struct birdloop *loop) -{ - init_list(&loop->sock_list); - loop->sock_num = 0; - - BUFFER_INIT(loop->poll_sk, loop->pool, 4); - BUFFER_INIT(loop->poll_fd, loop->pool, 4); - loop->poll_changed = 1; /* add wakeup fd */ -} - -static void -sockets_add(struct birdloop *loop, sock *s) -{ - add_tail(&loop->sock_list, &s->n); - loop->sock_num++; - - s->index = -1; - loop->poll_changed = 1; - - if (loop->poll_active) - wakeup_kick(loop); -} - -void -sk_start(sock *s) -{ - struct birdloop *loop = birdloop_current(); - - sockets_add(loop, s); -} - -static void -sockets_remove(struct birdloop *loop, sock *s) -{ - rem_node(&s->n); - loop->sock_num--; - - if (s->index >= 0) - loop->poll_sk.data[s->index] = NULL; - - s->index = -1; - loop->poll_changed = 1; - - /* Wakeup moved to sk_stop() */ -} - -void -sk_stop(sock *s) -{ - struct birdloop *loop = birdloop_current(); - - sockets_remove(loop, s); - - if (loop->poll_active) - { - loop->close_scheduled = 1; - wakeup_kick(loop); - } - else - close(s->fd); - - s->fd = -1; -} - -static inline uint sk_want_events(sock *s) -{ return (s->rx_hook ? POLLIN : 0) | ((s->ttx != s->tpos) ? POLLOUT : 0); } - -/* -FIXME: this should be called from sock code - -static void -sockets_update(struct birdloop *loop, sock *s) -{ - if (s->index >= 0) - loop->poll_fd.data[s->index].events = sk_want_events(s); -} -*/ - -static void -sockets_prepare(struct birdloop *loop) -{ - BUFFER_SET(loop->poll_sk, loop->sock_num + 1); - BUFFER_SET(loop->poll_fd, loop->sock_num + 1); - - struct pollfd *pfd = loop->poll_fd.data; - sock **psk = loop->poll_sk.data; - uint i = 0; - node *n; - - WALK_LIST(n, loop->sock_list) - { - sock *s = SKIP_BACK(sock, n, n); - - ASSERT(i < loop->sock_num); - - s->index = i; - *psk = s; - pfd->fd = s->fd; - pfd->events = sk_want_events(s); - pfd->revents = 0; - - pfd++; - psk++; - i++; - } - - ASSERT(i == loop->sock_num); - - /* Add internal wakeup fd */ - *psk = NULL; - pfd->fd = loop->wakeup_fds[0]; - pfd->events = POLLIN; - pfd->revents = 0; - - loop->poll_changed = 0; -} - -static void -sockets_close_fds(struct birdloop *loop) -{ - struct pollfd *pfd = loop->poll_fd.data; - sock **psk = loop->poll_sk.data; - int poll_num = loop->poll_fd.used - 1; - - int i; - for (i = 0; i < poll_num; i++) - if (psk[i] == NULL) - close(pfd[i].fd); - - loop->close_scheduled = 0; -} - -int sk_read(sock *s, int revents); -int sk_write(sock *s); - -static void -sockets_fire(struct birdloop *loop) -{ - struct pollfd *pfd = loop->poll_fd.data; - sock **psk = loop->poll_sk.data; - int poll_num = loop->poll_fd.used - 1; - - times_update(&loop->time); - - /* Last fd is internal wakeup fd */ - if (pfd[poll_num].revents & POLLIN) - wakeup_drain(loop); - - int i; - for (i = 0; i < poll_num; pfd++, psk++, i++) - { - int e = 1; - - if (! pfd->revents) - continue; - - if (pfd->revents & POLLNVAL) - die("poll: invalid fd %d", pfd->fd); - - if (pfd->revents & POLLIN) - while (e && *psk && (*psk)->rx_hook) - e = sk_read(*psk, 0); - - e = 1; - if (pfd->revents & POLLOUT) - while (e && *psk) - e = sk_write(*psk); - } -} - - -/* - * Birdloop - */ - -static void * birdloop_main(void *arg); - -struct birdloop * -birdloop_new(void) -{ - /* FIXME: this init should be elsewhere and thread-safe */ - static int init = 0; - if (!init) - { birdloop_init_current(); init = 1; } - - pool *p = rp_new(NULL, "Birdloop root"); - struct birdloop *loop = mb_allocz(p, sizeof(struct birdloop)); - loop->pool = p; - pthread_mutex_init(&loop->mutex, NULL); - - wakeup_init(loop); - - events_init(loop); - timers_init(&loop->time, p); - sockets_init(loop); - - return loop; -} - -void -birdloop_start(struct birdloop *loop) -{ - int rv = pthread_create(&loop->thread, NULL, birdloop_main, loop); - if (rv) - die("pthread_create(): %M", rv); -} - -void -birdloop_stop(struct birdloop *loop) -{ - pthread_mutex_lock(&loop->mutex); - loop->stop_called = 1; - wakeup_do_kick(loop); - pthread_mutex_unlock(&loop->mutex); - - int rv = pthread_join(loop->thread, NULL); - if (rv) - die("pthread_join(): %M", rv); -} - -void -birdloop_free(struct birdloop *loop) -{ - rfree(loop->pool); -} - - -void -birdloop_enter(struct birdloop *loop) -{ - /* TODO: these functions could save and restore old context */ - pthread_mutex_lock(&loop->mutex); - birdloop_set_current(loop); -} - -void -birdloop_leave(struct birdloop *loop) -{ - /* TODO: these functions could save and restore old context */ - birdloop_set_current(NULL); - pthread_mutex_unlock(&loop->mutex); -} - -void -birdloop_mask_wakeups(struct birdloop *loop) -{ - pthread_mutex_lock(&loop->mutex); - loop->wakeup_masked = 1; - pthread_mutex_unlock(&loop->mutex); -} - -void -birdloop_unmask_wakeups(struct birdloop *loop) -{ - pthread_mutex_lock(&loop->mutex); - if (loop->wakeup_masked == 2) - wakeup_do_kick(loop); - loop->wakeup_masked = 0; - pthread_mutex_unlock(&loop->mutex); -} - -static void * -birdloop_main(void *arg) -{ - struct birdloop *loop = arg; - timer *t; - int rv, timeout; - - birdloop_set_current(loop); - - tmp_init(loop->pool); - - pthread_mutex_lock(&loop->mutex); - while (1) - { - events_fire(loop); - timers_fire(&loop->time); - - times_update(&loop->time); - if (events_waiting(loop)) - timeout = 0; - else if (t = timers_first(&loop->time)) - timeout = (tm_remains(t) TO_MS) + 1; - else - timeout = -1; - - if (loop->poll_changed) - sockets_prepare(loop); - - loop->poll_active = 1; - pthread_mutex_unlock(&loop->mutex); - - try: - rv = poll(loop->poll_fd.data, loop->poll_fd.used, timeout); - if (rv < 0) - { - if (errno == EINTR || errno == EAGAIN) - goto try; - die("poll: %m"); - } - - pthread_mutex_lock(&loop->mutex); - loop->poll_active = 0; - - if (loop->close_scheduled) - sockets_close_fds(loop); - - if (loop->stop_called) - break; - - if (rv) - sockets_fire(loop); - - timers_fire(&loop->time); - } - - loop->stop_called = 0; - pthread_mutex_unlock(&loop->mutex); - - return NULL; -} - - diff --git a/proto/bfd/io.h b/proto/bfd/io.h deleted file mode 100644 index ec706e9a..00000000 --- a/proto/bfd/io.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * BIRD -- I/O and event loop - * - * Can be freely distributed and used under the terms of the GNU GPL. - */ - -#ifndef _BIRD_BFD_IO_H_ -#define _BIRD_BFD_IO_H_ - -#include "nest/bird.h" -#include "lib/lists.h" -#include "lib/resource.h" -#include "lib/event.h" -#include "lib/timer.h" -#include "lib/socket.h" - - -void ev2_schedule(event *e); - -void sk_start(sock *s); -void sk_stop(sock *s); - -struct birdloop *birdloop_new(void); -void birdloop_start(struct birdloop *loop); -void birdloop_stop(struct birdloop *loop); -void birdloop_free(struct birdloop *loop); - -void birdloop_enter(struct birdloop *loop); -void birdloop_leave(struct birdloop *loop); -void birdloop_mask_wakeups(struct birdloop *loop); -void birdloop_unmask_wakeups(struct birdloop *loop); - - -#endif /* _BIRD_BFD_IO_H_ */ diff --git a/proto/bfd/packets.c b/proto/bfd/packets.c index cb5f0d89..a22f223b 100644 --- a/proto/bfd/packets.c +++ b/proto/bfd/packets.c @@ -416,7 +416,7 @@ bfd_err_hook(sock *sk, int err) sock * bfd_open_rx_sk(struct bfd_proto *p, int multihop, int af) { - sock *sk = sk_new(p->tpool); + sock *sk = sk_new(p->p.pool); sk->type = SK_UDP; sk->subtype = af; sk->sport = !multihop ? BFD_CONTROL_PORT : BFD_MULTI_CTL_PORT; @@ -430,12 +430,11 @@ bfd_open_rx_sk(struct bfd_proto *p, int multihop, int af) /* TODO: configurable ToS and priority */ sk->tos = IP_PREC_INTERNET_CONTROL; sk->priority = sk_priority_control; - sk->flags = SKF_THREAD | SKF_LADDR_RX | (!multihop ? SKF_TTL_RX : 0); + sk->flags = SKF_LADDR_RX | (!multihop ? SKF_TTL_RX : 0); - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; - sk_start(sk); return sk; err: @@ -462,12 +461,11 @@ bfd_open_rx_sk_bound(struct bfd_proto *p, ip_addr local, struct iface *ifa) /* TODO: configurable ToS and priority */ sk->tos = IP_PREC_INTERNET_CONTROL; sk->priority = sk_priority_control; - sk->flags = SKF_THREAD | SKF_BIND | (ifa ? SKF_TTL_RX : 0); + sk->flags = SKF_BIND | (ifa ? SKF_TTL_RX : 0); - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; - sk_start(sk); return sk; err: @@ -479,7 +477,7 @@ bfd_open_rx_sk_bound(struct bfd_proto *p, ip_addr local, struct iface *ifa) sock * bfd_open_tx_sk(struct bfd_proto *p, ip_addr local, struct iface *ifa) { - sock *sk = sk_new(p->tpool); + sock *sk = sk_new(p->p.pool); sk->type = SK_UDP; sk->saddr = local; sk->dport = ifa ? BFD_CONTROL_PORT : BFD_MULTI_CTL_PORT; @@ -494,12 +492,11 @@ bfd_open_tx_sk(struct bfd_proto *p, ip_addr local, struct iface *ifa) sk->tos = IP_PREC_INTERNET_CONTROL; sk->priority = sk_priority_control; sk->ttl = ifa ? 255 : -1; - sk->flags = SKF_THREAD | SKF_BIND | SKF_HIGH_PORT; + sk->flags = SKF_BIND | SKF_HIGH_PORT; - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; - sk_start(sk); return sk; err: diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 1e234b16..039fdff5 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -15,12 +15,13 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "conf/conf.h" #include "lib/resource.h" #include "lib/string.h" #include "lib/unaligned.h" +#include "lib/macro.h" #include "bgp.h" @@ -64,37 +65,72 @@ * format - Optional hook that converts eattr to textual representation. */ - -struct bgp_attr_desc { - const char *name; - uint type; - uint flags; - void (*export)(struct bgp_export_state *s, eattr *a); - int (*encode)(struct bgp_write_state *s, eattr *a, byte *buf, uint size); - void (*decode)(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to); - void (*format)(const eattr *ea, byte *buf, uint size); +union bgp_attr_desc { + struct ea_class class; + struct { + EA_CLASS_INSIDE; + uint flags; + void (*export)(struct bgp_export_state *s, eattr *a); + int (*encode)(struct bgp_write_state *s, eattr *a, byte *buf, uint size); + void (*decode)(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to); + }; }; -static const struct bgp_attr_desc bgp_attr_table[]; +static union bgp_attr_desc bgp_attr_table[]; +static inline const union bgp_attr_desc *bgp_find_attr_desc(eattr *a) +{ + const struct ea_class *class = ea_class_find(a->id); -static inline int bgp_attr_known(uint code); + if ((class < &bgp_attr_table[0].class) || (class >= &bgp_attr_table[BGP_ATTR_MAX].class)) + return NULL; -eattr * -bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintptr_t val) + return (const union bgp_attr_desc *) class; +} + +#define BGP_EA_ID(code) (bgp_attr_table[code].id) +#define EA_BGP_ID(code) (((union bgp_attr_desc *) ea_class_find(code)) - bgp_attr_table) + +void bgp_set_attr_u32(ea_list **to, uint code, uint flags, u32 val) +{ + const union bgp_attr_desc *desc = &bgp_attr_table[code]; + + ea_set_attr(to, EA_LITERAL_EMBEDDED( + &desc->class, + flags & ~BAF_EXT_LEN, + val + )); +} + +void bgp_set_attr_ptr(ea_list **to, uint code, uint flags, const struct adata *ad) { - ASSERT(bgp_attr_known(code)); + const union bgp_attr_desc *desc = &bgp_attr_table[code]; - return ea_set_attr( - attrs, - pool, - EA_CODE(PROTOCOL_BGP, code), - flags & ~BAF_EXT_LEN, - bgp_attr_table[code].type, - val - ); + ea_set_attr(to, EA_LITERAL_DIRECT_ADATA( + &desc->class, + flags & ~BAF_EXT_LEN, + ad + )); } +void +bgp_set_attr_data(ea_list **to, uint code, uint flags, void *data, uint len) +{ + const union bgp_attr_desc *desc = &bgp_attr_table[code]; + + ea_set_attr(to, EA_LITERAL_STORE_ADATA( + &desc->class, + flags & ~BAF_EXT_LEN, + data, + len + )); +} +void +bgp_unset_attr(ea_list **to, uint code) +{ + const union bgp_attr_desc *desc = &bgp_attr_table[code]; + ea_unset_attr(to, 0, &desc->class); +} #define REPORT(msg, args...) \ ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); }) @@ -151,7 +187,7 @@ bgp_encode_u8(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) if (size < (3+1)) return -1; - bgp_put_attr_hdr3(buf, EA_ID(a->id), a->flags, 1); + bgp_put_attr_hdr3(buf, EA_BGP_ID(a->id), a->flags, 1); buf[3] = a->u.data; return 3+1; @@ -163,7 +199,7 @@ bgp_encode_u32(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) if (size < (3+4)) return -1; - bgp_put_attr_hdr3(buf, EA_ID(a->id), a->flags, 4); + bgp_put_attr_hdr3(buf, EA_BGP_ID(a->id), a->flags, 4); put_u32(buf+3, a->u.data); return 3+4; @@ -177,7 +213,7 @@ bgp_encode_u32s(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size if (size < (4+len)) return -1; - uint hdr = bgp_put_attr_hdr(buf, EA_ID(a->id), a->flags, len); + uint hdr = bgp_put_attr_hdr(buf, EA_BGP_ID(a->id), a->flags, len); put_u32s(buf + hdr, (u32 *) a->u.ptr->data, len / 4); return hdr + len; @@ -198,7 +234,7 @@ bgp_put_attr(byte *buf, uint size, uint code, uint flags, const byte *data, uint static int bgp_encode_raw(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) { - return bgp_put_attr(buf, size, EA_ID(a->id), a->flags, a->u.ptr->data, a->u.ptr->length); + return bgp_put_attr(buf, size, EA_BGP_ID(a->id), a->flags, a->u.ptr->data, a->u.ptr->length); } @@ -336,9 +372,9 @@ bgp_aigp_set_metric(struct linpool *pool, const struct adata *ad, u64 metric) } int -bgp_total_aigp_metric_(rte *e, u64 *metric, const struct adata **ad) +bgp_total_aigp_metric_(const rte *e, u64 *metric, const struct adata **ad) { - eattr *a = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AIGP)); + eattr *a = ea_find(e->attrs, BGP_EA_ID(BA_AIGP)); if (!a) return 0; @@ -347,7 +383,7 @@ bgp_total_aigp_metric_(rte *e, u64 *metric, const struct adata **ad) return 0; u64 aigp = get_u64(b + 3); - u64 step = e->attrs->igp_metric; + u64 step = rt_get_igp_metric(e); if (!rte_resolvable(e) || (step >= IGP_METRIC_UNKNOWN)) step = BGP_AIGP_MAX; @@ -366,7 +402,7 @@ bgp_total_aigp_metric_(rte *e, u64 *metric, const struct adata **ad) static inline int bgp_init_aigp_metric(rte *e, u64 *metric, const struct adata **ad) { - if (e->attrs->source == RTS_BGP) + if (rt_get_source_attr(e) == RTS_BGP) return 0; *metric = rt_get_igp_metric(e); @@ -375,7 +411,7 @@ bgp_init_aigp_metric(rte *e, u64 *metric, const struct adata **ad) } u32 -bgp_rte_igp_metric(struct rte *rt) +bgp_rte_igp_metric(const rte *rt) { u64 metric = bgp_total_aigp_metric(rt); return (u32) MIN(metric, (u64) IGP_METRIC_UNKNOWN); @@ -402,7 +438,7 @@ bgp_decode_origin(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte if (data[0] > 2) WITHDRAW(BAD_VALUE, "ORIGIN", data[0]); - bgp_set_attr_u32(to, s->pool, BA_ORIGIN, flags, data[0]); + bgp_set_attr_u32(to, BA_ORIGIN, flags, data[0]); } static void @@ -470,7 +506,7 @@ bgp_decode_as_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte !bgp_as_path_first_as_equal(data, len, p->remote_as)) WITHDRAW("Malformed AS_PATH attribute - %s", "First AS differs from neigbor AS"); - bgp_set_attr_data(to, s->pool, BA_AS_PATH, flags, data, len); + bgp_set_attr_data(to, BA_AS_PATH, flags, data, len); } @@ -542,7 +578,7 @@ bgp_decode_med(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *da WITHDRAW(BAD_LENGTH, "MULTI_EXIT_DISC", len); u32 val = get_u32(data); - bgp_set_attr_u32(to, s->pool, BA_MULTI_EXIT_DISC, flags, val); + bgp_set_attr_u32(to, BA_MULTI_EXIT_DISC, flags, val); } @@ -563,7 +599,7 @@ bgp_decode_local_pref(struct bgp_parse_state *s, uint code UNUSED, uint flags, b WITHDRAW(BAD_LENGTH, "LOCAL_PREF", len); u32 val = get_u32(data); - bgp_set_attr_u32(to, s->pool, BA_LOCAL_PREF, flags, val); + bgp_set_attr_u32(to, BA_LOCAL_PREF, flags, val); } @@ -573,7 +609,7 @@ bgp_decode_atomic_aggr(struct bgp_parse_state *s, uint code UNUSED, uint flags, if (len != 0) DISCARD(BAD_LENGTH, "ATOMIC_AGGR", len); - bgp_set_attr_data(to, s->pool, BA_ATOMIC_AGGR, flags, NULL, 0); + bgp_set_attr_data(to, BA_ATOMIC_AGGR, flags, NULL, 0); } static int @@ -607,7 +643,7 @@ bgp_decode_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flags, b len = aggregator_16to32(data, src); } - bgp_set_attr_data(to, s->pool, BA_AGGREGATOR, flags, data, len); + bgp_set_attr_data(to, BA_AGGREGATOR, flags, data, len); } static void @@ -636,7 +672,7 @@ bgp_decode_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, by struct adata *ad = lp_alloc_adata(s->pool, len); get_u32s(data, (u32 *) ad->data, len / 4); - bgp_set_attr_ptr(to, s->pool, BA_COMMUNITY, flags, ad); + bgp_set_attr_ptr(to, BA_COMMUNITY, flags, ad); } @@ -657,7 +693,7 @@ bgp_decode_originator_id(struct bgp_parse_state *s, uint code UNUSED, uint flags WITHDRAW(BAD_LENGTH, "ORIGINATOR_ID", len); u32 val = get_u32(data); - bgp_set_attr_u32(to, s->pool, BA_ORIGINATOR_ID, flags, val); + bgp_set_attr_u32(to, BA_ORIGINATOR_ID, flags, val); } @@ -682,7 +718,7 @@ bgp_decode_cluster_list(struct bgp_parse_state *s, uint code UNUSED, uint flags, struct adata *ad = lp_alloc_adata(s->pool, len); get_u32s(data, (u32 *) ad->data, len / 4); - bgp_set_attr_ptr(to, s->pool, BA_CLUSTER_LIST, flags, ad); + bgp_set_attr_ptr(to, BA_CLUSTER_LIST, flags, ad); } static void @@ -801,7 +837,7 @@ bgp_decode_ext_community(struct bgp_parse_state *s, uint code UNUSED, uint flags struct adata *ad = lp_alloc_adata(s->pool, len); get_u32s(data, (u32 *) ad->data, len / 4); - bgp_set_attr_ptr(to, s->pool, BA_EXT_COMMUNITY, flags, ad); + bgp_set_attr_ptr(to, BA_EXT_COMMUNITY, flags, ad); } @@ -814,7 +850,7 @@ bgp_decode_as4_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flag if (len != 8) DISCARD(BAD_LENGTH, "AS4_AGGREGATOR", len); - bgp_set_attr_data(to, s->pool, BA_AS4_AGGREGATOR, flags, data, len); + bgp_set_attr_data(to, BA_AS4_AGGREGATOR, flags, data, len); } static void @@ -844,7 +880,7 @@ bgp_decode_as4_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byt a = as_path_strip_confed(s->pool, a); } - bgp_set_attr_ptr(to, s->pool, BA_AS4_PATH, flags, a); + bgp_set_attr_ptr(to, BA_AS4_PATH, flags, a); } @@ -868,7 +904,7 @@ bgp_decode_aigp(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *d if (!bgp_aigp_valid(data, len, err, sizeof(err))) DISCARD("Malformed AIGP attribute - %s", err); - bgp_set_attr_data(to, s->pool, BA_AIGP, flags, data, len); + bgp_set_attr_data(to, BA_AIGP, flags, data, len); } static void @@ -900,7 +936,7 @@ bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint fla struct adata *ad = lp_alloc_adata(s->pool, len); get_u32s(data, (u32 *) ad->data, len / 4); - bgp_set_attr_ptr(to, s->pool, BA_LARGE_COMMUNITY, flags, ad); + bgp_set_attr_ptr(to, BA_LARGE_COMMUNITY, flags, ad); } @@ -911,14 +947,14 @@ bgp_decode_otc(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *da WITHDRAW(BAD_LENGTH, "OTC", len); u32 val = get_u32(data); - bgp_set_attr_u32(to, s->pool, BA_ONLY_TO_CUSTOMER, flags, val); + bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, flags, val); } static void bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a) { - net_addr *n = s->route->net->n.addr; + const net_addr *n = s->route->net; u32 *labels = (u32 *) a->u.ptr->data; uint lnum = a->u.ptr->length / 4; @@ -985,10 +1021,29 @@ bgp_format_mpls_label_stack(const eattr *a, byte *buf, uint size) } static inline void -bgp_decode_unknown(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to) +bgp_export_unknown(struct bgp_export_state *s UNUSED, eattr *a) +{ + if (!(a->flags & BAF_TRANSITIVE)) + UNSET(a); + + a->flags |= BAF_PARTIAL; +} + +static inline void +bgp_decode_unknown(struct bgp_parse_state *s UNUSED, uint code, uint flags, byte *data, uint len, ea_list **to) { + if (!(flags & BAF_OPTIONAL)) + WITHDRAW("Unknown attribute (code %u) - conflicting flags (%02x)", code, flags); + /* Cannot use bgp_set_attr_data() as it works on known attributes only */ - ea_set_attr_data(to, s->pool, EA_CODE(PROTOCOL_BGP, code), flags, EAF_TYPE_OPAQUE, data, len); + ea_set_attr_data(to, &bgp_attr_table[code].class, flags, data, len); +} + +static inline void +bgp_format_unknown(const eattr *a, byte *buf, uint size) +{ + if (a->flags & BAF_TRANSITIVE) + bsnprintf(buf, size, "(transitive)"); } @@ -996,10 +1051,10 @@ bgp_decode_unknown(struct bgp_parse_state *s, uint code, uint flags, byte *data, * Attribute table */ -static const struct bgp_attr_desc bgp_attr_table[] = { +static union bgp_attr_desc bgp_attr_table[BGP_ATTR_MAX] = { [BA_ORIGIN] = { - .name = "origin", - .type = EAF_TYPE_INT, + .name = "bgp_origin", + .type = T_ENUM_BGP_ORIGIN, .flags = BAF_TRANSITIVE, .export = bgp_export_origin, .encode = bgp_encode_u8, @@ -1007,69 +1062,69 @@ static const struct bgp_attr_desc bgp_attr_table[] = { .format = bgp_format_origin, }, [BA_AS_PATH] = { - .name = "as_path", - .type = EAF_TYPE_AS_PATH, + .name = "bgp_path", + .type = T_PATH, .flags = BAF_TRANSITIVE, .encode = bgp_encode_as_path, .decode = bgp_decode_as_path, }, [BA_NEXT_HOP] = { - .name = "next_hop", - .type = EAF_TYPE_IP_ADDRESS, + .name = "bgp_next_hop", + .type = T_IP, .flags = BAF_TRANSITIVE, .encode = bgp_encode_next_hop, .decode = bgp_decode_next_hop, .format = bgp_format_next_hop, }, [BA_MULTI_EXIT_DISC] = { - .name = "med", - .type = EAF_TYPE_INT, + .name = "bgp_med", + .type = T_INT, .flags = BAF_OPTIONAL, .encode = bgp_encode_u32, .decode = bgp_decode_med, }, [BA_LOCAL_PREF] = { - .name = "local_pref", - .type = EAF_TYPE_INT, + .name = "bgp_local_pref", + .type = T_INT, .flags = BAF_TRANSITIVE, .export = bgp_export_local_pref, .encode = bgp_encode_u32, .decode = bgp_decode_local_pref, }, [BA_ATOMIC_AGGR] = { - .name = "atomic_aggr", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_atomic_aggr", + .type = T_OPAQUE, .flags = BAF_TRANSITIVE, .encode = bgp_encode_raw, .decode = bgp_decode_atomic_aggr, }, [BA_AGGREGATOR] = { - .name = "aggregator", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_aggregator", + .type = T_OPAQUE, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .encode = bgp_encode_aggregator, .decode = bgp_decode_aggregator, .format = bgp_format_aggregator, }, [BA_COMMUNITY] = { - .name = "community", - .type = EAF_TYPE_INT_SET, + .name = "bgp_community", + .type = T_CLIST, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .export = bgp_export_community, .encode = bgp_encode_u32s, .decode = bgp_decode_community, }, [BA_ORIGINATOR_ID] = { - .name = "originator_id", - .type = EAF_TYPE_ROUTER_ID, + .name = "bgp_originator_id", + .type = T_QUAD, .flags = BAF_OPTIONAL, .export = bgp_export_originator_id, .encode = bgp_encode_u32, .decode = bgp_decode_originator_id, }, [BA_CLUSTER_LIST] = { - .name = "cluster_list", - .type = EAF_TYPE_INT_SET, + .name = "bgp_cluster_list", + .type = T_CLIST, .flags = BAF_OPTIONAL, .export = bgp_export_cluster_list, .encode = bgp_encode_u32s, @@ -1077,43 +1132,47 @@ static const struct bgp_attr_desc bgp_attr_table[] = { .format = bgp_format_cluster_list, }, [BA_MP_REACH_NLRI] = { - .name = "mp_reach_nlri", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_mp_reach_nlri", + .type = T_OPAQUE, + .hidden = 1, .flags = BAF_OPTIONAL, .decode = bgp_decode_mp_reach_nlri, }, [BA_MP_UNREACH_NLRI] = { - .name = "mp_unreach_nlri", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_mp_unreach_nlri", + .type = T_OPAQUE, + .hidden = 1, .flags = BAF_OPTIONAL, .decode = bgp_decode_mp_unreach_nlri, }, [BA_EXT_COMMUNITY] = { - .name = "ext_community", - .type = EAF_TYPE_EC_SET, + .name = "bgp_ext_community", + .type = T_ECLIST, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .export = bgp_export_ext_community, .encode = bgp_encode_u32s, .decode = bgp_decode_ext_community, }, [BA_AS4_PATH] = { - .name = "as4_path", - .type = EAF_TYPE_AS_PATH, + .name = "bgp_as4_path", + .type = T_PATH, + .hidden = 1, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .encode = bgp_encode_raw, .decode = bgp_decode_as4_path, }, [BA_AS4_AGGREGATOR] = { - .name = "as4_aggregator", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_as4_aggregator", + .type = T_OPAQUE, + .hidden = 1, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .encode = bgp_encode_raw, .decode = bgp_decode_as4_aggregator, .format = bgp_format_aggregator, }, [BA_AIGP] = { - .name = "aigp", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_aigp", + .type = T_OPAQUE, .flags = BAF_OPTIONAL | BAF_DECODE_FLAGS, .export = bgp_export_aigp, .encode = bgp_encode_raw, @@ -1121,8 +1180,8 @@ static const struct bgp_attr_desc bgp_attr_table[] = { .format = bgp_format_aigp, }, [BA_LARGE_COMMUNITY] = { - .name = "large_community", - .type = EAF_TYPE_LC_SET, + .name = "bgp_large_community", + .type = T_LCLIST, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .export = bgp_export_large_community, .encode = bgp_encode_u32s, @@ -1130,14 +1189,15 @@ static const struct bgp_attr_desc bgp_attr_table[] = { }, [BA_ONLY_TO_CUSTOMER] = { .name = "otc", - .type = EAF_TYPE_INT, + .type = T_INT, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .encode = bgp_encode_u32, .decode = bgp_decode_otc, }, [BA_MPLS_LABEL_STACK] = { - .name = "mpls_label_stack", - .type = EAF_TYPE_INT_SET, + .name = "bgp_mpls_label_stack", + .type = T_CLIST, + .readonly = 1, .export = bgp_export_mpls_label_stack, .encode = bgp_encode_mpls_label_stack, .decode = bgp_decode_mpls_label_stack, @@ -1145,12 +1205,32 @@ static const struct bgp_attr_desc bgp_attr_table[] = { }, }; -static inline int -bgp_attr_known(uint code) +eattr * +bgp_find_attr(ea_list *attrs, uint code) { - return (code < ARRAY_SIZE(bgp_attr_table)) && bgp_attr_table[code].name; + return ea_find(attrs, BGP_EA_ID(code)); } +void +bgp_register_attrs(void) +{ + for (uint i=0; i<ARRAY_SIZE(bgp_attr_table); i++) + { + if (!bgp_attr_table[i].name) + bgp_attr_table[i] = (union bgp_attr_desc) { + .name = mb_sprintf(&root_pool, "bgp_unknown_0x%02x", i), + .type = T_OPAQUE, + .flags = BAF_OPTIONAL, + .readonly = 1, + .export = bgp_export_unknown, + .encode = bgp_encode_raw, + .decode = bgp_decode_unknown, + .format = bgp_format_unknown, + }; + + ea_register_init(&bgp_attr_table[i].class); + } +} /* * Attribute export @@ -1159,38 +1239,24 @@ bgp_attr_known(uint code) static inline void bgp_export_attr(struct bgp_export_state *s, eattr *a, ea_list *to) { - if (EA_PROTO(a->id) != PROTOCOL_BGP) + const union bgp_attr_desc *desc = bgp_find_attr_desc(a); + if (!desc) return; - uint code = EA_ID(a->id); - - if (bgp_attr_known(code)) - { - const struct bgp_attr_desc *desc = &bgp_attr_table[code]; - - /* The flags might have been zero if the attr was added by filters */ - a->flags = (a->flags & BAF_PARTIAL) | desc->flags; - - /* Set partial bit if new opt-trans attribute is attached to non-local route */ - if ((s->src != NULL) && (a->originated) && - (a->flags & BAF_OPTIONAL) && (a->flags & BAF_TRANSITIVE)) - a->flags |= BAF_PARTIAL; + /* The flags might have been zero if the attr was added locally */ + a->flags = (a->flags & BAF_PARTIAL) | desc->flags; - /* Call specific hook */ - CALL(desc->export, s, a); + /* Set partial bit if new opt-trans attribute is attached to non-local route */ + if ((s->src != NULL) && (a->originated) && + (a->flags & BAF_OPTIONAL) && (a->flags & BAF_TRANSITIVE)) + a->flags |= BAF_PARTIAL; - /* Attribute might become undefined in hook */ - if (a->undef) - return; - } - else - { - /* Don't re-export unknown non-transitive attributes */ - if (!(a->flags & BAF_TRANSITIVE)) - return; + /* Call specific hook */ + CALL(desc->export, s, a); - a->flags |= BAF_PARTIAL; - } + /* Attribute might become undefined in hook */ + if (a->undef) + return; /* Append updated attribute */ to->attrs[to->count++] = *a; @@ -1210,12 +1276,11 @@ bgp_export_attr(struct bgp_export_state *s, eattr *a, ea_list *to) * Result: one sorted attribute list segment, or NULL if attributes are unsuitable. */ static inline ea_list * -bgp_export_attrs(struct bgp_export_state *s, ea_list *attrs) +bgp_export_attrs(struct bgp_export_state *s, ea_list *a) { /* Merge the attribute list */ - ea_list *new = lp_alloc(s->pool, ea_scan(attrs)); - ea_merge(attrs, new); - ea_sort(new); + ea_list *new = ea_normalize(a, 0); + ASSERT_DIE(new); uint i, count; count = new->count; @@ -1239,14 +1304,9 @@ bgp_export_attrs(struct bgp_export_state *s, ea_list *attrs) static inline int bgp_encode_attr(struct bgp_write_state *s, eattr *a, byte *buf, uint size) { - ASSERT(EA_PROTO(a->id) == PROTOCOL_BGP); - - uint code = EA_ID(a->id); - - if (bgp_attr_known(code)) - return bgp_attr_table[code].encode(s, a, buf, size); - else - return bgp_encode_raw(s, a, buf, size); + const union bgp_attr_desc *desc = bgp_find_attr_desc(a); + ASSERT_DIE(desc); + return desc->encode(s, a, buf, size); } /** @@ -1311,7 +1371,7 @@ bgp_cluster_list_loopy(struct bgp_proto *p, ea_list *attrs) } static inline void -bgp_decode_attr(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to) +bgp_decode_attr(struct bgp_parse_state *s, byte code, byte flags, byte *data, uint len, ea_list **to) { /* Handle duplicate attributes; RFC 7606 3 (g) */ if (BIT32_TEST(s->attrs_seen, code)) @@ -1323,24 +1383,15 @@ bgp_decode_attr(struct bgp_parse_state *s, uint code, uint flags, byte *data, ui } BIT32_SET(s->attrs_seen, code); - if (bgp_attr_known(code)) - { - const struct bgp_attr_desc *desc = &bgp_attr_table[code]; - - /* Handle conflicting flags; RFC 7606 3 (c) */ - if (((flags ^ desc->flags) & (BAF_OPTIONAL | BAF_TRANSITIVE)) && - !(desc->flags & BAF_DECODE_FLAGS)) - WITHDRAW("Malformed %s attribute - conflicting flags (%02x)", desc->name, flags); + ASSERT_DIE(bgp_attr_table[code].id); + const union bgp_attr_desc *desc = &bgp_attr_table[code]; - desc->decode(s, code, flags, data, len, to); - } - else /* Unknown attribute */ - { - if (!(flags & BAF_OPTIONAL)) - WITHDRAW("Unknown attribute (code %u) - conflicting flags (%02x)", code, flags); + /* Handle conflicting flags; RFC 7606 3 (c) */ + if (((flags ^ desc->flags) & (BAF_OPTIONAL | BAF_TRANSITIVE)) && + !(desc->flags & BAF_DECODE_FLAGS)) + WITHDRAW("Malformed %s attribute - conflicting flags (%02x, expected %02x)", desc->name, flags, desc->flags); - bgp_decode_unknown(s, code, flags, data, len, to); - } + desc->decode(s, code, flags, data, len, to); } /** @@ -1358,7 +1409,8 @@ bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len) { struct bgp_proto *p = s->proto; ea_list *attrs = NULL; - uint code, flags, alen; + uint alen; + byte code, flags; byte *pos = data; /* Parse the attributes */ @@ -1439,7 +1491,7 @@ bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len) /* If there is no local preference, define one */ if (!BIT32_TEST(s->attrs_seen, BA_LOCAL_PREF)) - bgp_set_attr_u32(&attrs, s->pool, BA_LOCAL_PREF, 0, p->cf->default_local_pref); + bgp_set_attr_u32(&attrs, BA_LOCAL_PREF, 0, p->cf->default_local_pref); return attrs; @@ -1463,20 +1515,20 @@ loop: } void -bgp_finish_attrs(struct bgp_parse_state *s, rta *a) +bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to) { /* AIGP test here instead of in bgp_decode_aigp() - we need to know channel */ if (BIT32_TEST(s->attrs_seen, BA_AIGP) && !s->channel->cf->aigp) { REPORT("Discarding AIGP attribute received on non-AIGP session"); - bgp_unset_attr(&a->eattrs, s->pool, BA_AIGP); + bgp_unset_attr(to, BA_AIGP); } /* Handle OTC ingress procedure, RFC 9234 */ if (bgp_channel_is_role_applicable(s->channel)) { struct bgp_proto *p = s->proto; - eattr *e = bgp_find_attr(a->eattrs, BA_ONLY_TO_CUSTOMER); + eattr *e = bgp_find_attr(*to, BA_ONLY_TO_CUSTOMER); /* Reject routes from downstream if they are leaked */ if (e && (p->cf->local_role == BGP_ROLE_PROVIDER || @@ -1492,7 +1544,7 @@ bgp_finish_attrs(struct bgp_parse_state *s, rta *a) if (!e && (p->cf->local_role == BGP_ROLE_CUSTOMER || p->cf->local_role == BGP_ROLE_PEER || p->cf->local_role == BGP_ROLE_RS_CLIENT)) - bgp_set_attr_u32(&a->eattrs, s->pool, BA_ONLY_TO_CUSTOMER, 0, p->cf->remote_as); + bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, 0, p->cf->remote_as); } } @@ -1512,8 +1564,8 @@ bgp_finish_attrs(struct bgp_parse_state *s, rta *a) HASH_DEFINE_REHASH_FN(RBH, struct bgp_bucket) -void -bgp_init_bucket_table(struct bgp_channel *c) +static void +bgp_init_bucket_table(struct bgp_pending_tx *c) { HASH_INIT(c->bucket_hash, c->pool, 8); @@ -1521,24 +1573,8 @@ bgp_init_bucket_table(struct bgp_channel *c) c->withdraw_bucket = NULL; } -void -bgp_free_bucket_table(struct bgp_channel *c) -{ - HASH_FREE(c->bucket_hash); - - struct bgp_bucket *b; - WALK_LIST_FIRST(b, c->bucket_queue) - { - rem_node(&b->send_node); - mb_free(b); - } - - mb_free(c->withdraw_bucket); - c->withdraw_bucket = NULL; -} - static struct bgp_bucket * -bgp_get_bucket(struct bgp_channel *c, ea_list *new) +bgp_get_bucket(struct bgp_pending_tx *c, ea_list *new) { /* Hash and lookup */ u32 hash = ea_hash(new); @@ -1547,55 +1583,27 @@ bgp_get_bucket(struct bgp_channel *c, ea_list *new) if (b) return b; - uint ea_size = sizeof(ea_list) + new->count * sizeof(eattr); - uint ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN); - uint size = sizeof(struct bgp_bucket) + ea_size_aligned; - uint i; - byte *dest; - - /* Gather total size of non-inline attributes */ - for (i = 0; i < new->count; i++) - { - eattr *a = &new->attrs[i]; - - if (!(a->type & EAF_EMBEDDED)) - size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN); - } + /* Scan the list for total size */ + uint ea_size = BIRD_CPU_ALIGN(ea_list_size(new)); + uint size = sizeof(struct bgp_bucket) + ea_size; - /* Create the bucket */ + /* Allocate the bucket */ b = mb_alloc(c->pool, size); *b = (struct bgp_bucket) { }; init_list(&b->prefixes); b->hash = hash; - /* Copy list of extended attributes */ - memcpy(b->eattrs, new, ea_size); - dest = ((byte *) b->eattrs) + ea_size_aligned; - - /* Copy values of non-inline attributes */ - for (i = 0; i < new->count; i++) - { - eattr *a = &b->eattrs->attrs[i]; - - if (!(a->type & EAF_EMBEDDED)) - { - const struct adata *oa = a->u.ptr; - struct adata *na = (struct adata *) dest; - memcpy(na, oa, sizeof(struct adata) + oa->length); - a->u.ptr = na; - dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN); - } - } + /* Copy the ea_list */ + ea_list_copy(b->eattrs, new, ea_size); - /* Insert the bucket to send queue and bucket hash */ - add_tail(&c->bucket_queue, &b->send_node); + /* Insert the bucket to bucket hash */ HASH_INSERT2(c->bucket_hash, RBH, c->pool, b); return b; } static struct bgp_bucket * -bgp_get_withdraw_bucket(struct bgp_channel *c) +bgp_get_withdraw_bucket(struct bgp_pending_tx *c) { if (!c->withdraw_bucket) { @@ -1606,25 +1614,45 @@ bgp_get_withdraw_bucket(struct bgp_channel *c) return c->withdraw_bucket; } -void -bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b) +static void +bgp_free_bucket(struct bgp_pending_tx *c, struct bgp_bucket *b) { - rem_node(&b->send_node); HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b); mb_free(b); } +int +bgp_done_bucket(struct bgp_channel *bc, struct bgp_bucket *b) +{ + struct bgp_pending_tx *c = bc->ptx; + + /* Won't free the withdraw bucket */ + if (b == c->withdraw_bucket) + return 0; + + if (EMPTY_LIST(b->prefixes)) + rem_node(&b->send_node); + + if (b->px_uc || !EMPTY_LIST(b->prefixes)) + return 0; + + bgp_free_bucket(c, b); + return 1; +} + void -bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_defer_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { + struct bgp_pending_tx *c = bc->ptx; rem_node(&b->send_node); add_tail(&c->bucket_queue, &b->send_node); } void -bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_withdraw_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { - struct bgp_proto *p = (void *) c->c.proto; + struct bgp_proto *p = (void *) bc->c.proto; + struct bgp_pending_tx *c = bc->ptx; struct bgp_bucket *wb = bgp_get_withdraw_bucket(c); log(L_ERR "%s: Attribute list too long", p->p.name); @@ -1633,8 +1661,8 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) struct bgp_prefix *px = HEAD(b->prefixes); log(L_ERR "%s: - withdrawing %N", p->p.name, &px->net); - rem_node(&px->buck_node); - add_tail(&wb->prefixes, &px->buck_node); + rem_node(&px->buck_node_xx); + add_tail(&wb->prefixes, &px->buck_node_xx); } } @@ -1645,7 +1673,7 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) #define PXH_KEY(px) px->net, px->path_id, px->hash #define PXH_NEXT(px) px->next -#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && i1 == i2 && net_equal(n1, n2) +#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2) #define PXH_FN(n,i,h) h #define PXH_REHASH bgp_pxh_rehash @@ -1654,34 +1682,34 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix) -void -bgp_init_prefix_table(struct bgp_channel *c) +static void +bgp_init_prefix_table(struct bgp_channel *bc) { + struct bgp_pending_tx *c = bc->ptx; HASH_INIT(c->prefix_hash, c->pool, 8); - uint alen = net_addr_length[c->c.net_type]; + uint alen = net_addr_length[bc->c.net_type]; c->prefix_slab = alen ? sl_new(c->pool, sizeof(struct bgp_prefix) + alen) : NULL; } -void -bgp_free_prefix_table(struct bgp_channel *c) -{ - HASH_FREE(c->prefix_hash); - - rfree(c->prefix_slab); - c->prefix_slab = NULL; -} - static struct bgp_prefix * -bgp_get_prefix(struct bgp_channel *c, net_addr *net, u32 path_id) +bgp_get_prefix(struct bgp_pending_tx *c, const net_addr *net, struct rte_src *src, int add_path_tx) { + u32 path_id = src->global_id; + u32 path_id_hash = add_path_tx ? path_id : 0; /* We must use a different hash function than the rtable */ - u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id)); - struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id, hash); + u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id_hash)); + struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id_hash, hash); if (px) { - rem_node(&px->buck_node); + if (!add_path_tx && (path_id != px->path_id)) + { + rt_unlock_source(rt_find_source_global(px->path_id)); + rt_lock_source(src); + px->path_id = path_id; + } + return px; } @@ -1694,24 +1722,306 @@ bgp_get_prefix(struct bgp_channel *c, net_addr *net, u32 path_id) px->hash = hash; px->path_id = path_id; net_copy(px->net, net); + rt_lock_source(src); HASH_INSERT2(c->prefix_hash, PXH, c->pool, px); return px; } -void -bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px) +static void bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px); + +static inline int +bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *b) +{ +#define IS_WITHDRAW_BUCKET(b) ((b) == c->ptx->withdraw_bucket) +#define BPX_TRACE(what) do { \ + if (c->c.debug & D_ROUTES) log(L_TRACE "%s.%s < %s %N %uG %s", \ + c->c.proto->name, c->c.name, what, \ + px->net, px->path_id, IS_WITHDRAW_BUCKET(b) ? "withdraw" : "update"); } while (0) + px->lastmod = current_time(); + + /* Already queued for the same bucket */ + if (px->cur == b) + { + BPX_TRACE("already queued"); + return 0; + } + + /* Unqueue from the old bucket */ + if (px->cur) + { + rem_node(&px->buck_node_xx); + bgp_done_bucket(c, px->cur); + } + + /* The new bucket is the same as we sent before */ + if ((px->last == b) || c->c.out_table && !px->last && IS_WITHDRAW_BUCKET(b)) + { + if (px->cur) + BPX_TRACE("reverted"); + else + BPX_TRACE("already sent"); + + /* Well, we haven't sent anything yet */ + if (!px->last) + bgp_free_prefix(c->ptx, px); + + px->cur = NULL; + return 0; + } + + /* Enqueue the bucket if it has been empty */ + if (!IS_WITHDRAW_BUCKET(b) && EMPTY_LIST(b->prefixes)) + add_tail(&c->ptx->bucket_queue, &b->send_node); + + /* Enqueue to the new bucket and indicate the change */ + add_tail(&b->prefixes, &px->buck_node_xx); + px->cur = b; + + BPX_TRACE("queued"); + return 1; + +#undef BPX_TRACE +} + +static void +bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px) { - rem_node(&px->buck_node); HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px); + rt_unlock_source(rt_find_source_global(px->path_id)); + if (c->prefix_slab) sl_free(px); else mb_free(px); } +void +bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck) +{ + /* Cleanup: We're called from bucket senders. */ + ASSERT_DIE(px->cur == buck); + rem_node(&px->buck_node_xx); + + /* We may want to store the updates */ + if (c->c.out_table) + { + /* Nothing to be sent right now */ + px->cur = NULL; + + /* Unref the previous sent version */ + if (px->last) + px->last->px_uc--; + + /* Ref the current sent version */ + if (!IS_WITHDRAW_BUCKET(buck)) + { + px->last = buck; + px->last->px_uc++; + return; + } + + /* Prefixes belonging to the withdraw bucket are freed always */ + } + + bgp_free_prefix(c->ptx, px); +} + +static void +bgp_pending_tx_rfree(resource *r) +{ + struct bgp_pending_tx *ptx = SKIP_BACK(struct bgp_pending_tx, r, r); + + HASH_WALK(ptx->prefix_hash, next, n) + rt_unlock_source(rt_find_source_global(n->path_id)); + HASH_WALK_END; +} + +static void bgp_pending_tx_dump(resource *r UNUSED, unsigned indent UNUSED) { debug("\n"); } + +static struct resclass bgp_pending_tx_class = { + .name = "BGP Pending TX", + .size = sizeof(struct bgp_pending_tx), + .free = bgp_pending_tx_rfree, + .dump = bgp_pending_tx_dump, +}; + +void +bgp_init_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(!c->ptx); + + pool *p = rp_new(c->pool, "BGP Pending TX"); + c->ptx = ralloc(p, &bgp_pending_tx_class); + c->ptx->pool = p; + + bgp_init_bucket_table(c->ptx); + bgp_init_prefix_table(c); +} + +void +bgp_free_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(c->ptx); + ASSERT_DIE(c->ptx->pool); + + rfree(c->ptx->pool); + c->ptx = NULL; +} + + +/* + * Prefix hash table exporter + */ + +struct bgp_out_export_hook { + struct rt_export_hook h; + u32 hash_iter; /* Iterator over hash */ +}; + +static void +bgp_out_table_feed(void *data) +{ + struct bgp_out_export_hook *hook = data; + struct bgp_channel *bc = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->h.table); + struct bgp_pending_tx *c = bc->ptx; + + int max = 512; + + const net_addr *neq = (hook->h.req->addr_mode == TE_ADDR_EQUAL) ? hook->h.req->addr : NULL; + const net_addr *cand = NULL; + + do { + HASH_WALK_ITER(c->prefix_hash, PXH, n, hook->hash_iter) + { + switch (hook->h.req->addr_mode) + { + case TE_ADDR_IN: + if (!net_in_netX(n->net, hook->h.req->addr)) + continue; + /* fall through */ + case TE_ADDR_NONE: + /* Splitting only for multi-net exports */ + if (--max <= 0) + HASH_WALK_ITER_PUT; + break; + + case TE_ADDR_FOR: + if (!neq) + { + if (net_in_netX(hook->h.req->addr, n->net) && (!cand || (n->net->length > cand->length))) + cand = n->net; + continue; + } + /* fall through */ + case TE_ADDR_EQUAL: + if (!net_equal(n->net, neq)) + continue; + break; + } + + struct bgp_bucket *buck = n->cur ?: n->last; + ea_list *ea = NULL; + if (buck == c->withdraw_bucket) + ea_set_dest(&ea, 0, RTD_UNREACHABLE); + else + { + ea = buck->eattrs; + eattr *eanh = bgp_find_attr(ea, BA_NEXT_HOP); + ASSERT_DIE(eanh); + const ip_addr *nh = (const void *) eanh->u.ptr->data; + + struct nexthop_adata nhad = { + .ad = { .length = sizeof (struct nexthop_adata) - sizeof (struct adata), }, + .nh = { .gw = nh[0], }, + }; + + ea_set_attr(&ea, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, tmp_copy_adata(&nhad.ad))); + } + + struct rte_storage es = { + .rte = { + .attrs = ea, + .net = n->net, + .src = rt_find_source_global(n->path_id), + .sender = NULL, + .lastmod = n->lastmod, + .flags = n->cur ? REF_PENDING : 0, + }, + }; + + struct rt_pending_export rpe = { + .new = &es, .new_best = &es, + }; + + if (hook->h.req->export_bulk) + { + const rte *feed = &es.rte; + hook->h.req->export_bulk(hook->h.req, n->net, &rpe, &rpe, &feed, 1); + } + else if (hook->h.req->export_one) + hook->h.req->export_one(hook->h.req, n->net, &rpe); + else + bug("No export method in export request"); + } + HASH_WALK_ITER_END; + + neq = cand; + cand = NULL; + } while (neq); + + if (hook->hash_iter) + ev_schedule_work(&hook->h.event); + else + rt_set_export_state(&hook->h, TES_READY); +} + +static void +bgp_out_table_export_start(struct rt_exporter *re, struct rt_export_request *req) +{ + req->hook = rt_alloc_export(re, sizeof(struct bgp_out_export_hook)); + req->hook->req = req; + + struct bgp_out_export_hook *hook = SKIP_BACK(struct bgp_out_export_hook, h, req->hook); + + hook->h.event.hook = bgp_out_table_feed; + rt_init_export(re, req->hook); +} + +static void +bgp_out_table_export_done(void *data) +{ + struct bgp_out_export_hook *hook = data; + struct rt_export_request *req = hook->h.req; + void (*stopped)(struct rt_export_request *) = hook->h.stopped; + + rt_export_stopped(&hook->h); + CALL(stopped, req); +} + +static const struct rt_exporter_class bgp_out_table_export_class = { + .start = bgp_out_table_export_start, + .done = bgp_out_table_export_done, +}; + +void +bgp_setup_out_table(struct bgp_channel *c) +{ + ASSERT_DIE(c->c.out_table == NULL); + + c->prefix_exporter = (struct rt_exporter) { + .class = &bgp_out_table_export_class, + .addr_type = c->c.table->addr_type, + .rp = c->c.proto->pool, + }; + + rt_exporter_init(&c->prefix_exporter); + + c->c.out_table = &c->prefix_exporter; +} + /* * BGP protocol glue @@ -1720,9 +2030,8 @@ bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px) int bgp_preexport(struct channel *C, rte *e) { - struct proto *SRC = e->src->proto; struct bgp_proto *p = (struct bgp_proto *) C->proto; - struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (struct bgp_proto *) SRC : NULL; + struct bgp_proto *src = bgp_rte_proto(e); struct bgp_channel *c = (struct bgp_channel *) C; /* Ignore non-BGP channels */ @@ -1738,8 +2047,20 @@ bgp_preexport(struct channel *C, rte *e) return 0; /* Reject flowspec that failed validation */ - if ((e->attrs->dest == RTD_UNREACHABLE) && net_is_flow(e->net->n.addr)) - return -1; + if (net_is_flow(e->net)) + switch (rt_get_flowspec_valid(e)) + { + case FLOWSPEC_VALID: + break; + case FLOWSPEC_INVALID: + return -1; + case FLOWSPEC_UNKNOWN: + ASSUME((rt_get_source_attr(e) != RTS_BGP) || + !((struct bgp_channel *) SKIP_BACK(struct channel, in_req, e->sender->req))->base_table); + break; + case FLOWSPEC__MAX: + bug("This never happens."); + } /* IBGP route reflection, RFC 4456 */ if (p->is_internal && src->is_internal && (p->local_as == src->local_as)) @@ -1750,14 +2071,14 @@ bgp_preexport(struct channel *C, rte *e) /* Generally, this should be handled when path is received, but we check it also here as rr_cluster_id may be undefined or different in src. */ - if (p->rr_cluster_id && bgp_cluster_list_loopy(p, e->attrs->eattrs)) + if (p->rr_cluster_id && bgp_cluster_list_loopy(p, e->attrs)) return -1; } /* Handle well-known communities, RFC 1997 */ struct eattr *a; if (p->cf->interpret_communities && - (a = bgp_find_attr(e->attrs->eattrs, BA_COMMUNITY))) + (a = bgp_find_attr(e->attrs, BA_COMMUNITY))) { const struct adata *d = a->u.ptr; @@ -1781,7 +2102,7 @@ bgp_preexport(struct channel *C, rte *e) /* Do not export routes marked with OTC to upstream, RFC 9234 */ if (bgp_channel_is_role_applicable(c)) { - a = bgp_find_attr(e->attrs->eattrs, BA_ONLY_TO_CUSTOMER); + a = bgp_find_attr(e->attrs, BA_ONLY_TO_CUSTOMER); if (a && (p->cf->local_role==BGP_ROLE_CUSTOMER || p->cf->local_role==BGP_ROLE_PEER || p->cf->local_role==BGP_ROLE_RS_CLIENT)) @@ -1794,8 +2115,7 @@ bgp_preexport(struct channel *C, rte *e) static ea_list * bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *attrs0, struct linpool *pool) { - struct proto *SRC = e->src->proto; - struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (void *) SRC : NULL; + struct bgp_proto *src = bgp_rte_proto(e); struct bgp_export_state s = { .proto = p, .channel = c, .pool = pool, .src = src, .route = e, .mpls = c->desc->mpls }; ea_list *attrs = attrs0; eattr *a; @@ -1803,7 +2123,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at /* ORIGIN attribute - mandatory, attach if missing */ if (! bgp_find_attr(attrs0, BA_ORIGIN)) - bgp_set_attr_u32(&attrs, pool, BA_ORIGIN, 0, src ? ORIGIN_INCOMPLETE : ORIGIN_IGP); + bgp_set_attr_u32(&attrs, BA_ORIGIN, 0, src ? ORIGIN_INCOMPLETE : ORIGIN_IGP); /* AS_PATH attribute - mandatory */ a = bgp_find_attr(attrs0, BA_AS_PATH); @@ -1818,24 +2138,24 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at { /* IBGP or route server -> just ensure there is one */ if (!a) - bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, &null_adata); + bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, &null_adata); } else if (p->is_interior) { /* Confederation -> prepend ASN as AS_CONFED_SEQUENCE */ ad = as_path_prepend2(pool, ad, AS_PATH_CONFED_SEQUENCE, p->public_as); - bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, ad); + bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, ad); } else /* Regular EBGP (no RS, no confederation) */ { /* Regular EBGP -> prepend ASN as regular sequence */ ad = as_path_prepend2(pool, ad, AS_PATH_SEQUENCE, p->public_as); - bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, ad); + bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, ad); /* MULTI_EXIT_DESC attribute - accept only if set in export filter */ a = bgp_find_attr(attrs0, BA_MULTI_EXIT_DISC); if (a && !(a->fresh)) - bgp_unset_attr(&attrs, pool, BA_MULTI_EXIT_DISC); + bgp_unset_attr(&attrs, BA_MULTI_EXIT_DISC); } /* NEXT_HOP attribute - delegated to AF-specific hook */ @@ -1844,7 +2164,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at /* LOCAL_PREF attribute - required for IBGP, attach if missing */ if (p->is_interior && ! bgp_find_attr(attrs0, BA_LOCAL_PREF)) - bgp_set_attr_u32(&attrs, pool, BA_LOCAL_PREF, 0, p->cf->default_local_pref); + bgp_set_attr_u32(&attrs, BA_LOCAL_PREF, 0, p->cf->default_local_pref); /* AIGP attribute - accumulate local metric or originate new one */ u64 metric; @@ -1853,7 +2173,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at (c->cf->aigp_originate && bgp_init_aigp_metric(e, &metric, &ad)))) { ad = bgp_aigp_set_metric(pool, ad, metric); - bgp_set_attr_ptr(&attrs, pool, BA_AIGP, 0, ad); + bgp_set_attr_ptr(&attrs, BA_AIGP, 0, ad); } /* IBGP route reflection, RFC 4456 */ @@ -1861,7 +2181,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at { /* ORIGINATOR_ID attribute - attach if not already set */ if (! bgp_find_attr(attrs0, BA_ORIGINATOR_ID)) - bgp_set_attr_u32(&attrs, pool, BA_ORIGINATOR_ID, 0, src->remote_id); + bgp_set_attr_u32(&attrs, BA_ORIGINATOR_ID, 0, src->remote_id); /* CLUSTER_LIST attribute - prepend cluster ID */ a = bgp_find_attr(attrs0, BA_CLUSTER_LIST); @@ -1876,7 +2196,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at ad = int_set_prepend(pool, ad, p->rr_cluster_id); /* Should be at least one prepended cluster ID */ - bgp_set_attr_ptr(&attrs, pool, BA_CLUSTER_LIST, 0, ad); + bgp_set_attr_ptr(&attrs, BA_CLUSTER_LIST, 0, ad); } /* AS4_* transition attributes, RFC 6793 4.2.2 */ @@ -1885,15 +2205,15 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at a = bgp_find_attr(attrs, BA_AS_PATH); if (a && as_path_contains_as4(a->u.ptr)) { - bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, as_path_to_old(pool, a->u.ptr)); - bgp_set_attr_ptr(&attrs, pool, BA_AS4_PATH, 0, as_path_strip_confed(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, as_path_to_old(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, BA_AS4_PATH, 0, as_path_strip_confed(pool, a->u.ptr)); } a = bgp_find_attr(attrs, BA_AGGREGATOR); if (a && aggregator_contains_as4(a->u.ptr)) { - bgp_set_attr_ptr(&attrs, pool, BA_AGGREGATOR, 0, aggregator_to_old(pool, a->u.ptr)); - bgp_set_attr_ptr(&attrs, pool, BA_AS4_AGGREGATOR, 0, a->u.ptr); + bgp_set_attr_ptr(&attrs, BA_AGGREGATOR, 0, aggregator_to_old(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, BA_AS4_AGGREGATOR, 0, a->u.ptr); } } @@ -1904,7 +2224,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at if (!a && (p->cf->local_role == BGP_ROLE_PROVIDER || p->cf->local_role == BGP_ROLE_PEER || p->cf->local_role == BGP_ROLE_RS_SERVER)) - bgp_set_attr_u32(&attrs, pool, BA_ONLY_TO_CUSTOMER, 0, p->public_as); + bgp_set_attr_u32(&attrs, BA_ONLY_TO_CUSTOMER, 0, p->public_as); } /* @@ -1918,13 +2238,12 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at } void -bgp_rt_notify(struct proto *P, struct channel *C, net *n, rte *new, rte *old) +bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, const rte *old) { struct bgp_proto *p = (void *) P; struct bgp_channel *c = (void *) C; struct bgp_bucket *buck; - struct bgp_prefix *px; - u32 path; + struct rte_src *path; /* Ignore non-BGP channels */ if (C->channel != &channel_bgp) @@ -1932,71 +2251,53 @@ bgp_rt_notify(struct proto *P, struct channel *C, net *n, rte *new, rte *old) if (new) { - struct ea_list *attrs = bgp_update_attrs(p, c, new, new->attrs->eattrs, tmp_linpool); + struct ea_list *attrs = bgp_update_attrs(p, c, new, new->attrs, tmp_linpool); /* Error during attribute processing */ if (!attrs) - log(L_ERR "%s: Invalid route %N withdrawn", p->p.name, n->n.addr); + log(L_ERR "%s: Invalid route %N withdrawn", p->p.name, n); /* If attributes are invalid, we fail back to withdraw */ - buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c); - path = new->src->global_id; + buck = attrs ? bgp_get_bucket(c->ptx, attrs) : bgp_get_withdraw_bucket(c->ptx); + path = new->src; } else { - buck = bgp_get_withdraw_bucket(c); - path = old->src->global_id; + buck = bgp_get_withdraw_bucket(c->ptx); + path = old->src; } - px = bgp_get_prefix(c, n->n.addr, c->add_path_tx ? path : 0); - add_tail(&buck->prefixes, &px->buck_node); - - bgp_schedule_packet(p->conn, c, PKT_UPDATE); + if (bgp_update_prefix(c, bgp_get_prefix(c->ptx, n, path, c->add_path_tx), buck)) + bgp_schedule_packet(p->conn, c, PKT_UPDATE); } static inline u32 -bgp_get_neighbor(rte *r) +bgp_get_neighbor(const rte *r) { - eattr *e = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + eattr *e = ea_find(r->attrs, BGP_EA_ID(BA_AS_PATH)); u32 as; if (e && as_path_get_first_regular(e->u.ptr, &as)) return as; /* If AS_PATH is not defined, we treat rte as locally originated */ - struct bgp_proto *p = (void *) r->src->proto; + struct bgp_proto *p = bgp_rte_proto(r); return p->cf->confederation ?: p->local_as; } static inline int -rte_stale(rte *r) +rte_stale(const rte *r) { - if (r->pflags & BGP_REF_STALE) - return 1; - - if (r->pflags & BGP_REF_NOT_STALE) - return 0; - - /* If staleness is unknown, compute and cache it */ - eattr *a = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)); - if (a && int_set_contains(a->u.ptr, BGP_COMM_LLGR_STALE)) - { - r->pflags |= BGP_REF_STALE; - return 1; - } - else - { - r->pflags |= BGP_REF_NOT_STALE; - return 0; - } + eattr *a = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY)); + return a && int_set_contains(a->u.ptr, BGP_COMM_LLGR_STALE); } int -bgp_rte_better(rte *new, rte *old) +bgp_rte_better(const rte *new, const rte *old) { - struct bgp_proto *new_bgp = (struct bgp_proto *) new->src->proto; - struct bgp_proto *old_bgp = (struct bgp_proto *) old->src->proto; + struct bgp_proto *new_bgp = bgp_rte_proto(new); + struct bgp_proto *old_bgp = bgp_rte_proto(old); eattr *x, *y; u32 n, o; @@ -2025,8 +2326,8 @@ bgp_rte_better(rte *new, rte *old) return 1; /* Start with local preferences */ - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); + x = ea_find(new->attrs, BGP_EA_ID(BA_LOCAL_PREF)); + y = ea_find(old->attrs, BGP_EA_ID(BA_LOCAL_PREF)); n = x ? x->u.data : new_bgp->cf->default_local_pref; o = y ? y->u.data : old_bgp->cf->default_local_pref; if (n > o) @@ -2045,8 +2346,8 @@ bgp_rte_better(rte *new, rte *old) /* RFC 4271 9.1.2.2. a) Use AS path lengths */ if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths) { - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + x = ea_find(new->attrs, BGP_EA_ID(BA_AS_PATH)); + y = ea_find(old->attrs, BGP_EA_ID(BA_AS_PATH)); n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; if (n < o) @@ -2056,8 +2357,8 @@ bgp_rte_better(rte *new, rte *old) } /* RFC 4271 9.1.2.2. b) Use origins */ - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); + x = ea_find(new->attrs, BGP_EA_ID(BA_ORIGIN)); + y = ea_find(old->attrs, BGP_EA_ID(BA_ORIGIN)); n = x ? x->u.data : ORIGIN_INCOMPLETE; o = y ? y->u.data : ORIGIN_INCOMPLETE; if (n < o) @@ -2079,8 +2380,8 @@ bgp_rte_better(rte *new, rte *old) if (new_bgp->cf->med_metric || old_bgp->cf->med_metric || (bgp_get_neighbor(new) == bgp_get_neighbor(old))) { - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); + x = ea_find(new->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC)); + y = ea_find(old->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC)); n = x ? x->u.data : new_bgp->cf->default_med; o = y ? y->u.data : old_bgp->cf->default_med; if (n < o) @@ -2096,8 +2397,8 @@ bgp_rte_better(rte *new, rte *old) return 1; /* RFC 4271 9.1.2.2. e) Compare IGP metrics */ - n = new_bgp->cf->igp_metric ? new->attrs->igp_metric : 0; - o = old_bgp->cf->igp_metric ? old->attrs->igp_metric : 0; + n = new_bgp->cf->igp_metric ? rt_get_igp_metric(new) : 0; + o = old_bgp->cf->igp_metric ? rt_get_igp_metric(old) : 0; if (n < o) return 1; if (n > o) @@ -2105,8 +2406,8 @@ bgp_rte_better(rte *new, rte *old) /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */ /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighbor ID */ - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID)); + x = ea_find(new->attrs, BGP_EA_ID(BA_ORIGINATOR_ID)); + y = ea_find(old->attrs, BGP_EA_ID(BA_ORIGINATOR_ID)); n = x ? x->u.data : new_bgp->remote_id; o = y ? y->u.data : old_bgp->remote_id; @@ -2123,8 +2424,8 @@ bgp_rte_better(rte *new, rte *old) return 0; /* RFC 4456 9. b) Compare cluster list lengths */ - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_CLUSTER_LIST)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_CLUSTER_LIST)); + x = ea_find(new->attrs, BGP_EA_ID(BA_CLUSTER_LIST)); + y = ea_find(old->attrs, BGP_EA_ID(BA_CLUSTER_LIST)); n = x ? int_set_get_size(x->u.ptr) : 0; o = y ? int_set_get_size(y->u.ptr) : 0; if (n < o) @@ -2138,10 +2439,10 @@ bgp_rte_better(rte *new, rte *old) int -bgp_rte_mergable(rte *pri, rte *sec) +bgp_rte_mergable(const rte *pri, const rte *sec) { - struct bgp_proto *pri_bgp = (struct bgp_proto *) pri->src->proto; - struct bgp_proto *sec_bgp = (struct bgp_proto *) sec->src->proto; + struct bgp_proto *pri_bgp = bgp_rte_proto(pri); + struct bgp_proto *sec_bgp = bgp_rte_proto(sec); eattr *x, *y; u32 p, s; @@ -2158,8 +2459,8 @@ bgp_rte_mergable(rte *pri, rte *sec) return 0; /* Start with local preferences */ - x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); - y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); + x = ea_find(pri->attrs, BGP_EA_ID(BA_LOCAL_PREF)); + y = ea_find(sec->attrs, BGP_EA_ID(BA_LOCAL_PREF)); p = x ? x->u.data : pri_bgp->cf->default_local_pref; s = y ? y->u.data : sec_bgp->cf->default_local_pref; if (p != s) @@ -2168,8 +2469,8 @@ bgp_rte_mergable(rte *pri, rte *sec) /* RFC 4271 9.1.2.2. a) Use AS path lengths */ if (pri_bgp->cf->compare_path_lengths || sec_bgp->cf->compare_path_lengths) { - x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); - y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + x = ea_find(pri->attrs, BGP_EA_ID(BA_AS_PATH)); + y = ea_find(sec->attrs, BGP_EA_ID(BA_AS_PATH)); p = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; s = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; @@ -2181,8 +2482,8 @@ bgp_rte_mergable(rte *pri, rte *sec) } /* RFC 4271 9.1.2.2. b) Use origins */ - x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); - y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); + x = ea_find(pri->attrs, BGP_EA_ID(BA_ORIGIN)); + y = ea_find(sec->attrs, BGP_EA_ID(BA_ORIGIN)); p = x ? x->u.data : ORIGIN_INCOMPLETE; s = y ? y->u.data : ORIGIN_INCOMPLETE; if (p != s) @@ -2192,8 +2493,8 @@ bgp_rte_mergable(rte *pri, rte *sec) if (pri_bgp->cf->med_metric || sec_bgp->cf->med_metric || (bgp_get_neighbor(pri) == bgp_get_neighbor(sec))) { - x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); - y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); + x = ea_find(pri->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC)); + y = ea_find(sec->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC)); p = x ? x->u.data : pri_bgp->cf->default_med; s = y ? y->u.data : sec_bgp->cf->default_med; if (p != s) @@ -2205,8 +2506,8 @@ bgp_rte_mergable(rte *pri, rte *sec) return 0; /* RFC 4271 9.1.2.2. e) Compare IGP metrics */ - p = pri_bgp->cf->igp_metric ? pri->attrs->igp_metric : 0; - s = sec_bgp->cf->igp_metric ? sec->attrs->igp_metric : 0; + p = pri_bgp->cf->igp_metric ? rt_get_igp_metric(pri) : 0; + s = sec_bgp->cf->igp_metric ? rt_get_igp_metric(sec) : 0; if (p != s) return 0; @@ -2219,22 +2520,21 @@ bgp_rte_mergable(rte *pri, rte *sec) static inline int same_group(rte *r, u32 lpref, u32 lasn) { - return (r->attrs->pref == lpref) && (bgp_get_neighbor(r) == lasn); + return (rt_get_preference(r) == lpref) && (bgp_get_neighbor(r) == lasn); } static inline int -use_deterministic_med(rte *r) +use_deterministic_med(struct rte_storage *r) { - struct proto *P = r->src->proto; - return (P->proto == &proto_bgp) && ((struct bgp_proto *) P)->cf->deterministic_med; + struct bgp_proto *p = bgp_rte_proto(&r->rte); + return p && p->cf->deterministic_med; } int -bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) +bgp_rte_recalculate(struct rtable_private *table, net *net, rte *new, rte *old, rte *old_best) { - rte *r, *s; rte *key = new ? new : old; - u32 lpref = key->attrs->pref; + u32 lpref = rt_get_preference(key); u32 lasn = bgp_get_neighbor(key); int old_suppressed = old ? !!(old->pflags & BGP_REF_SUPPRESSED) : 0; @@ -2300,13 +2600,13 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) } /* The default case - find a new best-in-group route */ - r = new; /* new may not be in the list */ - for (s=net->routes; rte_is_valid(s); s=s->next) - if (use_deterministic_med(s) && same_group(s, lpref, lasn)) + rte *r = new; /* new may not be in the list */ + for (struct rte_storage *s = net->routes; rte_is_valid(RTE_OR_NULL(s)); s = s->next) + if (use_deterministic_med(s) && same_group(&s->rte, lpref, lasn)) { - s->pflags |= BGP_REF_SUPPRESSED; - if (!r || bgp_rte_better(s, r)) - r = s; + s->rte.pflags |= BGP_REF_SUPPRESSED; + if (!r || bgp_rte_better(&s->rte, r)) + r = &s->rte; } /* Simple case - the last route in group disappears */ @@ -2318,10 +2618,10 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) new->pflags &= ~BGP_REF_SUPPRESSED; /* Found all existing routes mergable with best-in-group */ - for (s=net->routes; rte_is_valid(s); s=s->next) - if (use_deterministic_med(s) && same_group(s, lpref, lasn)) - if ((s != r) && bgp_rte_mergable(r, s)) - s->pflags &= ~BGP_REF_SUPPRESSED; + for (struct rte_storage *s = net->routes; rte_is_valid(RTE_OR_NULL(s)); s = s->next) + if (use_deterministic_med(s) && same_group(&s->rte, lpref, lasn)) + if ((&s->rte != r) && bgp_rte_mergable(r, &s->rte)) + s->rte.pflags &= ~BGP_REF_SUPPRESSED; /* Found best-in-group */ r->pflags &= ~BGP_REF_SUPPRESSED; @@ -2356,25 +2656,59 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) return !old_suppressed; } -struct rte * -bgp_rte_modify_stale(struct rte *r, struct linpool *pool) +void +bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) { - eattr *a = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)); - const struct adata *ad = a ? a->u.ptr : NULL; - uint flags = a ? a->flags : BAF_PARTIAL; + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + struct rt_import_hook *irh = c->c.in_req.hook; - if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR)) - return NULL; + /* Find our routes among others */ + for (uint i=0; i<count; i++) + { + const rte *r = feed[i]; + + if ( + !rte_is_valid(r) || /* Not a valid route */ + (r->sender != irh) || /* Not our route */ + (r->stale_cycle == irh->stale_set)) /* A new route, do not mark as stale */ + continue; + + eattr *ea = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY)); + const struct adata *ad = ea ? ea->u.ptr : NULL; + uint flags = ea ? ea->flags : BAF_PARTIAL; + + /* LLGR not allowed, withdraw the route */ + if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR)) + { + rte_import(&c->c.in_req, n, NULL, r->src); + continue; + } - if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) - return r; + /* Route already marked as LLGR, do nothing */ + if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) + continue; - r = rte_cow_rta(r, pool); - bgp_set_attr_ptr(&(r->attrs->eattrs), pool, BA_COMMUNITY, flags, - int_set_add(pool, ad, BGP_COMM_LLGR_STALE)); - r->pflags |= BGP_REF_STALE; + /* Store the tmp_linpool state to aggresively save memory */ + struct lp_state tmpp; + lp_save(tmp_linpool, &tmpp); - return r; + /* Mark the route as LLGR */ + rte e0 = *r; + bgp_set_attr_ptr(&e0.attrs, BA_COMMUNITY, flags, int_set_add(tmp_linpool, ad, BGP_COMM_LLGR_STALE)); + + /* We need to update the route but keep it stale. */ + ASSERT_DIE(irh->stale_set == irh->stale_valid + 1); + irh->stale_set--; + rte_import(&c->c.in_req, n, &e0, r->src); + irh->stale_set++; + + /* Restore the memory state */ + lp_restore(tmp_linpool, &tmpp); + } + + rpe_mark_seen_all(req->hook, first, last, NULL); } @@ -2390,8 +2724,8 @@ bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool) eattr *a4 = bgp_find_attr(*attrs, BA_AS4_AGGREGATOR); /* First, unset AS4_* attributes */ - if (p4) bgp_unset_attr(attrs, pool, BA_AS4_PATH); - if (a4) bgp_unset_attr(attrs, pool, BA_AS4_AGGREGATOR); + if (p4) bgp_unset_attr(attrs, BA_AS4_PATH); + if (a4) bgp_unset_attr(attrs, BA_AS4_AGGREGATOR); /* Handle AGGREGATOR attribute */ if (a2 && a4) @@ -2424,60 +2758,37 @@ bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool) } } -int -bgp_get_attr(const eattr *a, byte *buf, int buflen) -{ - uint i = EA_ID(a->id); - const struct bgp_attr_desc *d; - int len; - - if (bgp_attr_known(i)) - { - d = &bgp_attr_table[i]; - len = bsprintf(buf, "%s", d->name); - buf += len; - if (d->format) - { - *buf++ = ':'; - *buf++ = ' '; - d->format(a, buf, buflen - len - 2); - return GA_FULL; - } - return GA_NAME; - } - - bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? " [t]" : ""); - return GA_NAME; -} - void -bgp_get_route_info(rte *e, byte *buf) +bgp_get_route_info(const rte *e, byte *buf) { - eattr *p = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); - eattr *o = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); + eattr *p = ea_find(e->attrs, BGP_EA_ID(BA_AS_PATH)); + eattr *o = ea_find(e->attrs, BGP_EA_ID(BA_ORIGIN)); u32 origas; - buf += bsprintf(buf, " (%d", e->attrs->pref); + buf += bsprintf(buf, " (%d", rt_get_preference(e)); - if (e->pflags & BGP_REF_SUPPRESSED) - buf += bsprintf(buf, "-"); + if (!net_is_flow(e->net)) + { + if (e->pflags & BGP_REF_SUPPRESSED) + buf += bsprintf(buf, "-"); - if (rte_stale(e)) - buf += bsprintf(buf, "s"); + if (rte_stale(e)) + buf += bsprintf(buf, "s"); - u64 metric = bgp_total_aigp_metric(e); - if (metric < BGP_AIGP_MAX) - { - buf += bsprintf(buf, "/%lu", metric); - } - else if (e->attrs->igp_metric) - { - if (!rte_resolvable(e)) - buf += bsprintf(buf, "/-"); - else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN) - buf += bsprintf(buf, "/?"); - else - buf += bsprintf(buf, "/%d", e->attrs->igp_metric); + u64 metric = bgp_total_aigp_metric(e); + if (metric < BGP_AIGP_MAX) + { + buf += bsprintf(buf, "/%lu", metric); + } + else if (metric = rt_get_igp_metric(e)) + { + if (!rte_resolvable(e)) + buf += bsprintf(buf, "/-"); + else if (metric >= IGP_METRIC_UNKNOWN) + buf += bsprintf(buf, "/?"); + else + buf += bsprintf(buf, "/%d", metric); + } } buf += bsprintf(buf, ") ["); diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 2e442e16..fffa8cec 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -115,7 +115,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "nest/locks.h" #include "conf/conf.h" @@ -126,9 +126,13 @@ #include "bgp.h" +static void bgp_listen_create(void *); static list STATIC_LIST_INIT(bgp_sockets); /* Global list of listening sockets */ +static list STATIC_LIST_INIT(bgp_listen_pending); /* Global list of listening socket open requests */ +static event bgp_listen_event = { .hook = bgp_listen_create }; +DOMAIN(rtable) bgp_listen_domain; static void bgp_connect(struct bgp_proto *p); static void bgp_active(struct bgp_proto *p); @@ -139,73 +143,43 @@ static void bgp_update_bfd(struct bgp_proto *p, const struct bfd_options *bfd); static int bgp_incoming_connection(sock *sk, uint dummy UNUSED); static void bgp_listen_sock_err(sock *sk UNUSED, int err); +static void bgp_initiate_disable(struct bgp_proto *p, int err_val); -/** - * bgp_open - open a BGP instance - * @p: BGP instance - * - * This function allocates and configures shared BGP resources, mainly listening - * sockets. Should be called as the last step during initialization (when lock - * is acquired and neighbor is ready). When error, caller should change state to - * PS_DOWN and return immediately. - */ -static int -bgp_open(struct bgp_proto *p) +static void bgp_graceful_restart_feed(struct bgp_channel *c); + + +static inline int +bgp_setup_auth(struct bgp_proto *p, int enable) { - struct bgp_socket *bs = NULL; - struct iface *ifa = p->cf->strict_bind ? p->cf->iface : NULL; - ip_addr addr = p->cf->strict_bind ? p->cf->local_ip : - (p->ipv4 ? IPA_NONE4 : IPA_NONE6); - uint port = p->cf->local_port; - uint flags = p->cf->free_bind ? SKF_FREEBIND : 0; - uint flag_mask = SKF_FREEBIND; + /* Beware. This is done from main_birdloop and protocol birdloop is NOT ENTERED. + * Anyway, we are only accessing: + * - protocol config which can be changed only from main_birdloop (reconfig) + * - protocol listen socket which is always driven by main_birdloop + * - protocol name which is set on reconfig + */ - /* We assume that cf->iface is defined iff cf->local_ip is link-local */ + if (p->cf->password && p->listen.sock) + { + ip_addr prefix = p->cf->remote_ip; + int pxlen = -1; - WALK_LIST(bs, bgp_sockets) - if (ipa_equal(bs->sk->saddr, addr) && - (bs->sk->sport == port) && - (bs->sk->iface == ifa) && - (bs->sk->vrf == p->p.vrf) && - ((bs->sk->flags & flag_mask) == flags)) + if (p->cf->remote_range) { - bs->uc++; - p->sock = bs; - return 0; + prefix = net_prefix(p->cf->remote_range); + pxlen = net_pxlen(p->cf->remote_range); } - sock *sk = sk_new(proto_pool); - sk->type = SK_TCP_PASSIVE; - sk->ttl = 255; - sk->saddr = addr; - sk->sport = port; - sk->iface = ifa; - sk->vrf = p->p.vrf; - sk->flags = flags; - sk->tos = IP_PREC_INTERNET_CONTROL; - sk->rbsize = BGP_RX_BUFFER_SIZE; - sk->tbsize = BGP_TX_BUFFER_SIZE; - sk->rx_hook = bgp_incoming_connection; - sk->err_hook = bgp_listen_sock_err; - - if (sk_open(sk) < 0) - goto err; - - bs = mb_allocz(proto_pool, sizeof(struct bgp_socket)); - bs->sk = sk; - bs->uc = 1; - p->sock = bs; - sk->data = bs; - - add_tail(&bgp_sockets, &bs->n); + int rv = sk_set_md5_auth(p->listen.sock->sk, + p->cf->local_ip, prefix, pxlen, p->cf->iface, + enable ? p->cf->password : NULL, p->cf->setkey); - return 0; + if (rv < 0) + sk_log_error(p->listen.sock->sk, p->p.name); -err: - sk_log_error(sk, p->p.name); - log(L_ERR "%s: Cannot open listening socket", p->p.name); - rfree(sk); - return -1; + return rv; + } + else + return 0; } /** @@ -217,43 +191,155 @@ err: static void bgp_close(struct bgp_proto *p) { - struct bgp_socket *bs = p->sock; + LOCK_DOMAIN(rtable, bgp_listen_domain); - ASSERT(bs && bs->uc); + struct bgp_listen_request *req = &p->listen; + struct bgp_socket *bs = req->sock; - if (--bs->uc) - return; + if (bs) + { + req->sock = NULL; + rem_node(&req->n); - rfree(bs->sk); - rem_node(&bs->n); - mb_free(bs); + if (bs && EMPTY_LIST(bs->requests)) + ev_send(&global_event_list, &bgp_listen_event); + } + + UNLOCK_DOMAIN(rtable, bgp_listen_domain); } -static inline int -bgp_setup_auth(struct bgp_proto *p, int enable) +/** + * bgp_open - open a BGP instance + * @p: BGP instance + * + * This function allocates and configures shared BGP resources, mainly listening + * sockets. Should be called as the last step during initialization (when lock + * is acquired and neighbor is ready). When error, caller should change state to + * PS_DOWN and return immediately. + */ +static void +bgp_open(struct bgp_proto *p) { - if (p->cf->password) - { - ip_addr prefix = p->cf->remote_ip; - int pxlen = -1; + LOCK_DOMAIN(rtable, bgp_listen_domain); - if (p->cf->remote_range) + struct bgp_listen_request *req = &p->listen; + /* We assume that cf->iface is defined iff cf->local_ip is link-local */ + req->iface = p->cf->strict_bind ? p->cf->iface : NULL; + req->vrf = p->p.vrf; + req->addr = p->cf->strict_bind ? p->cf->local_ip : + (p->ipv4 ? IPA_NONE4 : IPA_NONE6); + req->port = p->cf->local_port; + req->flags = p->cf->free_bind ? SKF_FREEBIND : 0; + + BGP_TRACE(D_EVENTS, "Requesting listen socket at %I%J port %u", req->addr, req->iface, req->port); + + add_tail(&bgp_listen_pending, &req->n); + ev_send(&global_event_list, &bgp_listen_event); + + UNLOCK_DOMAIN(rtable, bgp_listen_domain); +} + +static void +bgp_listen_create(void *_ UNUSED) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); + uint flag_mask = SKF_FREEBIND; + + while (1) { + LOCK_DOMAIN(rtable, bgp_listen_domain); + + if (EMPTY_LIST(bgp_listen_pending)) { - prefix = net_prefix(p->cf->remote_range); - pxlen = net_pxlen(p->cf->remote_range); + UNLOCK_DOMAIN(rtable, bgp_listen_domain); + break; } - int rv = sk_set_md5_auth(p->sock->sk, - p->cf->local_ip, prefix, pxlen, p->cf->iface, - enable ? p->cf->password : NULL, p->cf->setkey); + /* Get the first request to match */ + struct bgp_listen_request *req = HEAD(bgp_listen_pending); + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, listen, req); + rem_node(&req->n); + + /* First try to find existing socket */ + struct bgp_socket *bs; + WALK_LIST(bs, bgp_sockets) + if (ipa_equal(bs->sk->saddr, req->addr) && + (bs->sk->sport == req->port) && + (bs->sk->iface == req->iface) && + (bs->sk->vrf == req->vrf) && + ((bs->sk->flags & flag_mask) == req->flags)) + break; - if (rv < 0) - sk_log_error(p->sock->sk, p->p.name); + /* Not found any */ + if (NODE_VALID(bs)) + BGP_TRACE(D_EVENTS, "Found a listening socket: %p", bs); + else + { + /* Allocating new socket from global protocol pool. + * We can do this in main_birdloop. */ + sock *sk = sk_new(proto_pool); + sk->type = SK_TCP_PASSIVE; + sk->ttl = 255; + sk->saddr = req->addr; + sk->sport = req->port; + sk->iface = req->iface; + sk->vrf = req->vrf; + sk->flags = req->flags; + sk->tos = IP_PREC_INTERNET_CONTROL; + sk->rbsize = BGP_RX_BUFFER_SIZE; + sk->tbsize = BGP_TX_BUFFER_SIZE; + sk->rx_hook = bgp_incoming_connection; + sk->err_hook = bgp_listen_sock_err; + + if (sk_open(sk, &main_birdloop) < 0) + { + sk_log_error(sk, p->p.name); + log(L_ERR "%s: Cannot open listening socket", p->p.name); + rfree(sk); + UNLOCK_DOMAIN(rtable, bgp_listen_domain); - return rv; + bgp_initiate_disable(p, BEM_NO_SOCKET); + continue; + } + + bs = mb_allocz(proto_pool, sizeof(struct bgp_socket)); + bs->sk = sk; + sk->data = bs; + + init_list(&bs->requests); + add_tail(&bgp_sockets, &bs->n); + + BGP_TRACE(D_EVENTS, "Created new listening socket: %p", bs); + } + + req->sock = bs; + add_tail(&bs->requests, &req->n); + + if (bgp_setup_auth(p, 1) < 0) + { + rem_node(&req->n); + req->sock = NULL; + + UNLOCK_DOMAIN(rtable, bgp_listen_domain); + + bgp_initiate_disable(p, BEM_INVALID_MD5); + continue; + } + + UNLOCK_DOMAIN(rtable, bgp_listen_domain); } - else - return 0; + + /* Cleanup leftover listening sockets */ + LOCK_DOMAIN(rtable, bgp_listen_domain); + struct bgp_socket *bs; + node *nxt; + WALK_LIST_DELSAFE(bs, nxt, bgp_sockets) + if (EMPTY_LIST(bs->requests)) + { + rfree(bs->sk); + rem_node(&bs->n); + mb_free(bs); + } + UNLOCK_DOMAIN(rtable, bgp_listen_domain); } static inline struct bgp_channel * @@ -279,6 +365,8 @@ bgp_startup(struct bgp_proto *p) if (p->postponed_sk) { /* Apply postponed incoming connection */ + sk_reloop(p->postponed_sk, p->p.loop); + bgp_setup_conn(p, &p->incoming_conn); bgp_setup_sk(&p->incoming_conn, p->postponed_sk); bgp_send_open(&p->incoming_conn); @@ -296,13 +384,7 @@ bgp_startup_timeout(timer *t) static void bgp_initiate(struct bgp_proto *p) { - int err_val; - - if (bgp_open(p) < 0) - { err_val = BEM_NO_SOCKET; goto err1; } - - if (bgp_setup_auth(p, 1) < 0) - { err_val = BEM_INVALID_MD5; goto err2; } + bgp_open(p); if (p->cf->bfd) bgp_update_bfd(p, p->cf->bfd); @@ -311,23 +393,28 @@ bgp_initiate(struct bgp_proto *p) { p->start_state = BSS_DELAY; BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay); - bgp_start_timer(p->startup_timer, p->startup_delay); + bgp_start_timer(p, p->startup_timer, p->startup_delay); } else bgp_startup(p); +} - return; - -err2: - bgp_close(p); -err1: - p->p.disabled = 1; - bgp_store_error(p, NULL, BE_MISC, err_val); - - p->neigh = NULL; - proto_notify_state(&p->p, PS_DOWN); - - return; +static void +bgp_initiate_disable(struct bgp_proto *p, int err_val) +{ + PROTO_LOCKED_FROM_MAIN(&p->p) + { + /* The protocol may be already down for another reason. + * Shutdown the protocol only if it isn't already shutting down. */ + switch (p->p.proto_state) + { + case PS_START: + case PS_UP: + p->p.disabled = 1; + bgp_store_error(p, NULL, BE_MISC, err_val); + bgp_stop(p, err_val, NULL, 0); + } + } } /** @@ -340,14 +427,14 @@ err1: * timers. */ void -bgp_start_timer(timer *t, uint value) +bgp_start_timer(struct bgp_proto *p, timer *t, uint value) { if (value) { /* The randomization procedure is specified in RFC 4271 section 10 */ btime time = value S; btime randomize = random() % ((time / 4) + 1); - tm_start(t, time - randomize); + tm_start_in(t, time - randomize, p->p.loop); } else tm_stop(t); @@ -374,8 +461,10 @@ bgp_close_conn(struct bgp_conn *conn) conn->keepalive_timer = NULL; rfree(conn->hold_timer); conn->hold_timer = NULL; + rfree(conn->tx_ev); conn->tx_ev = NULL; + rfree(conn->sk); conn->sk = NULL; @@ -512,9 +601,16 @@ void bgp_stop(struct bgp_proto *p, int subcode, byte *data, uint len) { proto_notify_state(&p->p, PS_STOP); + p->uncork_ev->data = NULL; bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len); bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len); - ev_schedule(p->event); + + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + if (c->ptx) + bgp_free_pending_tx(c); + + proto_send_event(&p->p, p->event); } static inline void @@ -607,7 +703,7 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) int active = loc->ready && rem->ready; c->c.disabled = !active; - c->c.reloadable = p->route_refresh || c->cf->import_table; + c->c.reloadable = p->route_refresh || ((c->c.in_keep & RIK_PREFILTER) == RIK_PREFILTER); c->index = active ? num++ : 0; @@ -705,7 +801,7 @@ bgp_conn_enter_close_state(struct bgp_conn *conn) conn->sk->rx_hook = NULL; /* Timeout for CLOSE state, if we cannot send notification soon then we just hangup */ - bgp_start_timer(conn->hold_timer, 10); + bgp_start_timer(p, conn->hold_timer, 10); if (os == BS_ESTABLISHED) bgp_conn_leave_established_state(p); @@ -719,7 +815,7 @@ bgp_conn_enter_idle_state(struct bgp_conn *conn) bgp_close_conn(conn); bgp_conn_set_state(conn, BS_IDLE); - ev_schedule(p->event); + proto_send_event(&p->p, p->event); if (os == BS_ESTABLISHED) bgp_conn_leave_established_state(p); @@ -761,32 +857,28 @@ bgp_handle_graceful_restart(struct bgp_proto *p) { case BGP_GRS_NONE: c->gr_active = BGP_GRS_ACTIVE; - rt_refresh_begin(c->c.table, &c->c); - break; + /* fall through */ case BGP_GRS_ACTIVE: - rt_refresh_end(c->c.table, &c->c); - rt_refresh_begin(c->c.table, &c->c); + rt_refresh_begin(&c->c.in_req); break; case BGP_GRS_LLGR: - rt_refresh_begin(c->c.table, &c->c); - rt_modify_stale(c->c.table, &c->c); + rt_refresh_begin(&c->c.in_req); + bgp_graceful_restart_feed(c); break; } } else { /* Just flush the routes */ - rt_refresh_begin(c->c.table, &c->c); - rt_refresh_end(c->c.table, &c->c); + rt_refresh_begin(&c->c.in_req); + rt_refresh_end(&c->c.in_req); } /* Reset bucket and prefix tables */ - bgp_free_bucket_table(c); - bgp_free_prefix_table(c); - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); + bgp_free_pending_tx(c); + bgp_init_pending_tx(c); c->packets_to_send = 0; } @@ -794,9 +886,56 @@ bgp_handle_graceful_restart(struct bgp_proto *p) ASSERT(p->gr_active_num > 0); proto_notify_state(&p->p, PS_START); - tm_start(p->gr_timer, p->conn->remote_caps->gr_time S); + tm_start_in(p->gr_timer, p->conn->remote_caps->gr_time S, p->p.loop); +} + +static void +bgp_graceful_restart_feed_done(struct rt_export_request *req) +{ + req->hook = NULL; +} + +static void +bgp_graceful_restart_feed_dump_req(struct rt_export_request *req) +{ + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + debug(" BGP-GR %s.%s export request %p\n", c->c.proto->name, c->c.name, req); +} + +static void +bgp_graceful_restart_feed_log_state_change(struct rt_export_request *req, u8 state) +{ + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + struct bgp_proto *p = (void *) c->c.proto; + BGP_TRACE(D_EVENTS, "Long-lived graceful restart export state changed to %s", rt_export_state_name(state)); + + if (state == TES_READY) + rt_stop_export(req, bgp_graceful_restart_feed_done); +} + +static void +bgp_graceful_restart_drop_export(struct rt_export_request *req UNUSED, const net_addr *n UNUSED, struct rt_pending_export *rpe UNUSED) +{ /* Nothing to do */ } + +static void +bgp_graceful_restart_feed(struct bgp_channel *c) +{ + c->stale_feed = (struct rt_export_request) { + .name = "BGP-GR", + .list = &global_work_list, + .trace_routes = c->c.debug | c->c.proto->debug, + .dump_req = bgp_graceful_restart_feed_dump_req, + .log_state_change = bgp_graceful_restart_feed_log_state_change, + .export_bulk = bgp_rte_modify_stale, + .export_one = bgp_graceful_restart_drop_export, + }; + + rt_request_export(c->c.table, &c->stale_feed); } + + + /** * bgp_graceful_restart_done - finish active BGP graceful restart * @c: BGP channel @@ -820,7 +959,7 @@ bgp_graceful_restart_done(struct bgp_channel *c) BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); tm_stop(c->stale_timer); - rt_refresh_end(c->c.table, &c->c); + rt_refresh_end(&c->c.in_req); } /** @@ -861,8 +1000,8 @@ bgp_graceful_restart_timeout(timer *t) /* Channel is in GR, and supports LLGR -> start LLGR */ c->gr_active = BGP_GRS_LLGR; - tm_start(c->stale_timer, c->stale_time S); - rt_modify_stale(c->c.table, &c->c); + tm_start_in(c->stale_timer, c->stale_time S, p->p.loop); + bgp_graceful_restart_feed(c); } } else @@ -900,10 +1039,7 @@ bgp_refresh_begin(struct bgp_channel *c) { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } c->load_state = BFS_REFRESHING; - rt_refresh_begin(c->c.table, &c->c); - - if (c->c.in_table) - rt_refresh_begin(c->c.in_table, &c->c); + rt_refresh_begin(&c->c.in_req); } /** @@ -924,10 +1060,7 @@ bgp_refresh_end(struct bgp_channel *c) { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } c->load_state = BFS_NONE; - rt_refresh_end(c->c.table, &c->c); - - if (c->c.in_table) - rt_prune_sync(c->c.in_table, 0); + rt_refresh_end(&c->c.in_req); } @@ -941,7 +1074,7 @@ bgp_send_open(struct bgp_conn *conn) bgp_prepare_capabilities(conn); bgp_schedule_packet(conn, NULL, PKT_OPEN); bgp_conn_set_state(conn, BS_OPENSENT); - bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time); + bgp_start_timer(conn->bgp, conn->hold_timer, conn->bgp->cf->initial_hold_time); } static void @@ -1018,7 +1151,7 @@ bgp_hold_timeout(timer *t) and perhaps just not processed BGP packets in time. */ if (sk_rx_ready(conn->sk) > 0) - bgp_start_timer(conn->hold_timer, 10); + bgp_start_timer(p, conn->hold_timer, 10); else if ((conn->state == BS_ESTABLISHED) && p->llgr_ready) { BGP_TRACE(D_EVENTS, "Hold timer expired"); @@ -1078,7 +1211,7 @@ bgp_active(struct bgp_proto *p) BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay); bgp_setup_conn(p, conn); bgp_conn_set_state(conn, BS_ACTIVE); - bgp_start_timer(conn->connect_timer, delay); + bgp_start_timer(p, conn->connect_timer, delay); } /** @@ -1109,6 +1242,7 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c s->tos = IP_PREC_INTERNET_CONTROL; s->password = p->cf->password; s->tx_hook = bgp_connected; + s->flags = p->cf->free_bind ? SKF_FREEBIND : 0; BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J", s->daddr, ipa_is_link_local(s->daddr) ? p->cf->iface : NULL, s->saddr, ipa_is_link_local(s->saddr) ? s->iface : NULL); @@ -1116,7 +1250,7 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c bgp_setup_sk(conn, s); bgp_conn_set_state(conn, BS_CONNECT); - if (sk_open(s) < 0) + if (sk_open(s, p->p.loop) < 0) goto err; /* Set minimal receive TTL if needed */ @@ -1125,7 +1259,7 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c goto err; DBG("BGP: Waiting for connect success\n"); - bgp_start_timer(conn->connect_timer, p->cf->connect_retry_time); + bgp_start_timer(p, conn->connect_timer, p->cf->connect_retry_time); return; err: @@ -1146,12 +1280,17 @@ static struct bgp_proto * bgp_find_proto(sock *sk) { struct bgp_proto *best = NULL; - struct bgp_proto *p; + struct bgp_socket *bs = sk->data; + struct bgp_listen_request *req; /* sk->iface is valid only if src or dst address is link-local */ int link = ipa_is_link_local(sk->saddr) || ipa_is_link_local(sk->daddr); - WALK_LIST(p, proto_list) + LOCK_DOMAIN(rtable, bgp_listen_domain); + + WALK_LIST(req, bs->requests) + { + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, listen, req); if ((p->p.proto == &proto_bgp) && (ipa_equal(p->remote_ip, sk->daddr) || bgp_is_dynamic(p)) && (!p->cf->remote_range || ipa_in_netX(sk->daddr, p->cf->remote_range)) && @@ -1165,7 +1304,9 @@ bgp_find_proto(sock *sk) if (!bgp_is_dynamic(p)) break; } + } + UNLOCK_DOMAIN(rtable, bgp_listen_domain); return best; } @@ -1184,6 +1325,8 @@ bgp_find_proto(sock *sk) static int bgp_incoming_connection(sock *sk, uint dummy UNUSED) { + ASSERT_DIE(birdloop_inside(&main_birdloop)); + struct bgp_proto *p; int acc, hops; @@ -1197,6 +1340,8 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) return 0; } + birdloop_enter(p->p.loop); + /* * BIRD should keep multiple incoming connections in OpenSent state (for * details RFC 4271 8.2.1 par 3), but it keeps just one. Duplicate incoming @@ -1226,7 +1371,7 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) if (!acc) { rfree(sk); - return 0; + goto leave; } hops = p->cf->multihop ? : 1; @@ -1251,19 +1396,24 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) p = bgp_spawn(p, sk->daddr); p->postponed_sk = sk; rmove(sk, p->p.pool); - return 0; + goto leave; } rmove(sk, p->p.pool); + sk_reloop(sk, p->p.loop); + bgp_setup_conn(p, &p->incoming_conn); bgp_setup_sk(&p->incoming_conn, sk); bgp_send_open(&p->incoming_conn); - return 0; + goto leave; err: sk_log_error(sk, p->p.name); log(L_ERR "%s: Incoming connection aborted", p->p.name); rfree(sk); + +leave: + birdloop_leave(p->p.loop); return 0; } @@ -1374,15 +1524,22 @@ static void bgp_update_bfd(struct bgp_proto *p, const struct bfd_options *bfd) { if (bfd && p->bfd_req) + { + BGP_TRACE(D_EVENTS, "Updating existing BFD request"); bfd_update_request(p->bfd_req, bfd); + } if (bfd && !p->bfd_req && !bgp_is_dynamic(p)) + { p->bfd_req = bfd_request_session(p->p.pool, p->remote_ip, p->local_ip, p->cf->multihop ? NULL : p->neigh->iface, - p->p.vrf, bgp_bfd_notify, p, bfd); + p->p.vrf, bgp_bfd_notify, p, p->p.loop, bfd); + BGP_TRACE(D_EVENTS, "Requesting a new BFD session"); + } if (!bfd && p->bfd_req) { + BGP_TRACE(D_EVENTS, "Retracting the BFD request"); rfree(p->bfd_req); p->bfd_req = NULL; } @@ -1398,12 +1555,8 @@ bgp_reload_routes(struct channel *C) if (C->channel != &channel_bgp) return; - ASSERT(p->conn && (p->route_refresh || c->c.in_table)); - - if (c->c.in_table) - channel_schedule_reload(C); - else - bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); + ASSERT(p->conn && p->route_refresh); + bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); } static void @@ -1423,6 +1576,12 @@ bgp_feed_begin(struct channel *C, int initial) if (initial && p->cf->gr_mode) c->feed_state = BFS_LOADING; + if (!initial && C->out_table) + { + c->feed_out_table = 1; + return; + } + /* It is refeed and both sides support enhanced route refresh */ if (!initial && p->enhanced_refresh) { @@ -1445,6 +1604,12 @@ bgp_feed_end(struct channel *C) if (C->channel != &channel_bgp) return; + if (c->feed_out_table) + { + c->feed_out_table = 0; + return; + } + /* This should not happen */ if (!p->conn) return; @@ -1467,9 +1632,9 @@ bgp_feed_end(struct channel *C) static void -bgp_start_locked(struct object_lock *lock) +bgp_start_locked(void *_p) { - struct bgp_proto *p = lock->data; + struct bgp_proto *p = _p; const struct bgp_config *cf = p->cf; if (p->p.proto_state != PS_START) @@ -1545,6 +1710,8 @@ bgp_start(struct proto *P) p->last_rx_update = 0; p->event = ev_new_init(p->p.pool, bgp_decision, p); + p->uncork_ev = ev_new_init(p->p.pool, bgp_uncork, p); + p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0); p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0); @@ -1574,8 +1741,11 @@ bgp_start(struct proto *P) lock->iface = p->cf->iface; lock->vrf = p->cf->iface ? NULL : p->p.vrf; lock->type = OBJLOCK_TCP; - lock->hook = bgp_start_locked; - lock->data = p; + lock->event = (event) { + .hook = bgp_start_locked, + .data = p, + }; + lock->target = proto_event_list(P); /* For dynamic BGP, we use inst 1 to avoid collisions with regular BGP */ if (bgp_is_dynamic(p)) @@ -1675,6 +1845,13 @@ done: return p->p.proto_state; } +struct rte_owner_class bgp_rte_owner_class = { + .get_route_info = bgp_get_route_info, + .rte_better = bgp_rte_better, + .rte_mergable = bgp_rte_mergable, + .rte_igp_metric = bgp_rte_igp_metric, +}; + static struct proto * bgp_init(struct proto_config *CF) { @@ -1684,15 +1861,13 @@ bgp_init(struct proto_config *CF) P->rt_notify = bgp_rt_notify; P->preexport = bgp_preexport; - P->neigh_notify = bgp_neigh_notify; + P->iface_sub.neigh_notify = bgp_neigh_notify; P->reload_routes = bgp_reload_routes; P->feed_begin = bgp_feed_begin; P->feed_end = bgp_feed_end; - P->rte_better = bgp_rte_better; - P->rte_mergable = bgp_rte_mergable; - P->rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL; - P->rte_modify = bgp_rte_modify_stale; - P->rte_igp_metric = bgp_rte_igp_metric; + + P->sources.class = &bgp_rte_owner_class; + P->sources.rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL; p->cf = cf; p->is_internal = (cf->local_as == cf->remote_as); @@ -1759,14 +1934,11 @@ bgp_channel_start(struct channel *C) } c->pool = p->p.pool; // XXXX - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); - - if (c->cf->import_table) - channel_setup_in_table(C); if (c->cf->export_table) - channel_setup_out_table(C); + bgp_setup_out_table(c); + + bgp_init_pending_tx(c); c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0); @@ -1889,7 +2061,7 @@ bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 return cc2->c.table; /* Last, try default table of given type */ - if (tab = cf->c.global->def_tables[type]) + if (tab = rt_get_default_table(cf->c.global, type)) return tab; cf_error("Undefined IGP table"); @@ -1908,7 +2080,7 @@ bgp_default_base_table(struct bgp_config *cf, struct bgp_channel_config *cc) return cc2->c.table; /* Last, try default table of given type */ - struct rtable_config *tab = cf->c.global->def_tables[type]; + struct rtable_config *tab = rt_get_default_table(cf->c.global, type); if (tab) return tab; @@ -2175,7 +2347,6 @@ bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *impor (new->llgr_time != old->llgr_time) || (new->ext_next_hop != old->ext_next_hop) || (new->add_path != old->add_path) || - (new->import_table != old->import_table) || (new->export_table != old->export_table) || (TABLE(new, igp_table_ip4) != TABLE(old, igp_table_ip4)) || (TABLE(new, igp_table_ip6) != TABLE(old, igp_table_ip6)) || @@ -2190,11 +2361,13 @@ bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *impor (new->aigp != old->aigp) || (new->cost != old->cost)) { - /* import_changed itself does not force ROUTE_REFRESH when import_table is active */ - if (c->c.in_table && (c->c.channel_state == CS_UP)) + /* If import table is active and route refresh is possible, we just ask for route refresh */ + if ((c->c.in_keep & RIK_PREFILTER) && (c->c.channel_state == CS_UP) && p->route_refresh) bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); - *import_changed = 1; + /* Otherwise we do complete reload */ + else + *import_changed = 1; } if (!ipa_equal(new->next_hop_addr, old->next_hop_addr) || @@ -2562,6 +2735,9 @@ bgp_show_proto_info(struct proto *P) tm_remains(p->conn->hold_timer), p->conn->hold_time); cli_msg(-1006, " Keepalive timer: %t/%u", tm_remains(p->conn->keepalive_timer), p->conn->keepalive_time); + cli_msg(-1006, " TX pending: %d bytes%s", + p->conn->sk->tpos - p->conn->sk->ttx, + ev_active(p->conn->tx_ev) ? " (refill scheduled)" : ""); } #if 0 @@ -2615,6 +2791,22 @@ bgp_show_proto_info(struct proto *P) if (c->base_table) cli_msg(-1006, " Base table: %s", c->base_table->name); + + uint bucket_cnt = 0; + uint prefix_cnt = 0; + struct bgp_bucket *buck; + struct bgp_prefix *px; + if (c->ptx) + WALK_LIST(buck, c->ptx->bucket_queue) + { + bucket_cnt++; + WALK_LIST(px, buck->prefixes) + if (px->cur) + prefix_cnt++; + } + + cli_msg(-1006, " Pending %u attribute sets with total %u prefixes to send", + bucket_cnt, prefix_cnt); } } } @@ -2632,7 +2824,6 @@ struct channel_class channel_bgp = { struct protocol proto_bgp = { .name = "BGP", .template = "bgp%d", - .class = PROTOCOL_BGP, .preference = DEF_PREF_BGP, .channel_mask = NB_IP | NB_VPN | NB_FLOW, .proto_size = sizeof(struct bgp_proto), @@ -2644,12 +2835,12 @@ struct protocol proto_bgp = { .reconfigure = bgp_reconfigure, .copy_config = bgp_copy_config, .get_status = bgp_get_status, - .get_attr = bgp_get_attr, - .get_route_info = bgp_get_route_info, .show_proto_info = bgp_show_proto_info }; void bgp_build(void) { proto_build(&proto_bgp); + bgp_register_attrs(); + bgp_listen_domain = DOMAIN_NEW(rtable, "BGP Listen Sockets"); } diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 2808d479..621e1bc5 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -14,7 +14,7 @@ #include <stdint.h> #include <setjmp.h> #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/bfd.h" //#include "lib/lists.h" #include "lib/hash.h" @@ -67,10 +67,10 @@ struct bgp_af_desc { u8 no_igp; const char *name; uint (*encode_nlri)(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size); - void (*decode_nlri)(struct bgp_parse_state *s, byte *pos, uint len, rta *a); + void (*decode_nlri)(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a); void (*update_next_hop)(struct bgp_export_state *s, eattr *nh, ea_list **to); uint (*encode_next_hop)(struct bgp_write_state *s, eattr *nh, byte *buf, uint size); - void (*decode_next_hop)(struct bgp_parse_state *s, byte *pos, uint len, rta *a); + void (*decode_next_hop)(struct bgp_parse_state *s, byte *pos, uint len, ea_list **to); }; @@ -162,7 +162,7 @@ struct bgp_channel_config { u8 aigp_originate; /* AIGP is originated automatically */ u32 cost; /* IGP cost for direct next hops */ u8 import_table; /* Use c.in_table as Adj-RIB-In */ - u8 export_table; /* Use c.out_table as Adj-RIB-Out */ + u8 export_table; /* Keep Adj-RIB-Out and export it */ struct rtable_config *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ struct rtable_config *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ @@ -220,8 +220,6 @@ struct bgp_channel_config { /* rte->pflags */ #define BGP_REF_SUPPRESSED 0x1 /* Used for deterministic MED comparison */ -#define BGP_REF_STALE 0x2 /* Route is LLGR_STATE */ -#define BGP_REF_NOT_STALE 0x4 /* Route is NOT LLGR_STATE */ struct bgp_af_caps { u32 afi; @@ -266,8 +264,8 @@ struct bgp_caps { struct bgp_socket { node n; /* Node in global bgp_sockets */ + list requests; /* Listen requests */ sock *sk; /* Real listening socket */ - u32 uc; /* Use count */ }; struct bgp_stats { @@ -302,6 +300,16 @@ struct bgp_conn { uint hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */ }; +struct bgp_listen_request { + node n; /* Node in bgp_socket / pending list */ + struct bgp_socket *sock; /* Assigned socket */ + ip_addr addr; + struct iface *iface; + struct iface *vrf; + uint port; + uint flags; +}; + struct bgp_proto { struct proto p; const struct bgp_config *cf; /* Shortcut to BGP configuration */ @@ -333,9 +341,10 @@ struct bgp_proto { struct bgp_conn incoming_conn; /* Incoming connection we have neither accepted nor rejected yet */ struct object_lock *lock; /* Lock for neighbor connection */ struct neighbor *neigh; /* Neighbor entry corresponding to remote ip, NULL if multihop */ - struct bgp_socket *sock; /* Shared listening socket */ + struct bgp_listen_request listen; /* Shared listening socket */ struct bfd_request *bfd_req; /* BFD request, if BFD is used */ struct birdsock *postponed_sk; /* Postponed incoming socket for dynamic BGP */ + event *uncork_ev; /* Uncork event in case of congestion */ struct bgp_stats stats; /* BGP statistics */ btime last_established; /* Last time of enter/leave of established state */ btime last_rx_update; /* Last time of RX update */ @@ -367,12 +376,8 @@ struct bgp_channel { /* Rest are zeroed when down */ pool *pool; - HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ - struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ - list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ - - HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ - slab *prefix_slab; /* Slab holding prefix nodes */ + struct bgp_pending_tx *ptx; /* Routes waiting to be sent */ + struct rt_exporter prefix_exporter; /* Table-like exporter for ptx */ ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */ ip_addr link_addr; /* Link-local version of next_hop_addr */ @@ -386,17 +391,23 @@ struct bgp_channel { timer *stale_timer; /* Long-lived stale timer for LLGR */ u32 stale_time; /* Stored LLGR stale time from last session */ + struct rt_export_request stale_feed; /* Feeder request for stale route modification */ u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */ u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */ u8 feed_state; /* Feed state (TX) for EoR, RR packets, see BFS_* */ u8 load_state; /* Load state (RX) for EoR, RR packets, see BFS_* */ + + u8 feed_out_table; /* Refeed into out_table */ }; struct bgp_prefix { - node buck_node; /* Node in per-bucket list */ + node buck_node_xx; /* Node in per-bucket list */ struct bgp_prefix *next; /* Node in prefix hash table */ + struct bgp_bucket *last; /* Last bucket sent with this prefix */ + struct bgp_bucket *cur; /* Current bucket (cur == last) if no update is required */ + btime lastmod; /* Last modification of this prefix */ u32 hash; u32 path_id; net_addr net[0]; @@ -405,11 +416,24 @@ struct bgp_prefix { struct bgp_bucket { node send_node; /* Node in send queue */ struct bgp_bucket *next; /* Node in bucket hash table */ - list prefixes; /* Prefixes in this bucket (struct bgp_prefix) */ + list prefixes; /* Prefixes to send in this bucket (struct bgp_prefix) */ u32 hash; /* Hash over extended attributes */ + u32 px_uc; /* How many prefixes are linking this bucket */ ea_list eattrs[0]; /* Per-bucket extended attributes */ }; +struct bgp_pending_tx { + resource r; + pool *pool; + + HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ + struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ + list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ + + HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ + slab *prefix_slab; /* Slab holding prefix nodes */ +}; + struct bgp_export_state { struct bgp_proto *proto; struct bgp_channel *channel; @@ -473,13 +497,12 @@ struct bgp_parse_state { uint err_subcode; jmp_buf err_jmpbuf; - struct hostentry *hostentry; adata *mpls_labels; /* Cached state for bgp_rte_update() */ u32 last_id; struct rte_src *last_src; - rta *cached_rta; + ea_list *cached_ea; }; #define BGP_PORT 179 @@ -524,7 +547,7 @@ bgp_parse_error(struct bgp_parse_state *s, uint subcode) } -void bgp_start_timer(timer *t, uint value); +void bgp_start_timer(struct bgp_proto *p, timer *t, uint value); void bgp_check_config(struct bgp_config *c); void bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int len); void bgp_close_conn(struct bgp_conn *c); @@ -542,11 +565,17 @@ void bgp_stop(struct bgp_proto *p, int subcode, byte *data, uint len); const char *bgp_format_role_name(u8 role); static inline int -rte_resolvable(rte *rt) +rte_resolvable(const rte *rt) { - return rt->attrs->dest != RTD_UNREACHABLE; + eattr *nhea = ea_find(rt->attrs, &ea_gen_nexthop); + if (!nhea) + return 0; + + struct nexthop_adata *nhad = (void *) nhea->u.ptr; + return NEXTHOP_IS_REACHABLE(nhad) || (nhad->dest != RTD_UNREACHABLE); } +extern struct rte_owner_class bgp_rte_owner_class; #ifdef LOCAL_DEBUG #define BGP_FORCE_DEBUG 1 @@ -562,73 +591,61 @@ rte_resolvable(rte *rt) /* attrs.c */ -static inline eattr * -bgp_find_attr(ea_list *attrs, uint code) -{ - return ea_find(attrs, EA_CODE(PROTOCOL_BGP, code)); -} - eattr * -bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintptr_t val); - -static inline void -bgp_set_attr_u32(ea_list **to, struct linpool *pool, uint code, uint flags, u32 val) -{ bgp_set_attr(to, pool, code, flags, (uintptr_t) val); } +bgp_find_attr(ea_list *attrs, uint code); -static inline void -bgp_set_attr_ptr(ea_list **to, struct linpool *pool, uint code, uint flags, const struct adata *val) -{ bgp_set_attr(to, pool, code, flags, (uintptr_t) val); } - -static inline void -bgp_set_attr_data(ea_list **to, struct linpool *pool, uint code, uint flags, void *data, uint len) -{ - struct adata *a = lp_alloc_adata(pool, len); - bmemcpy(a->data, data, len); - bgp_set_attr(to, pool, code, flags, (uintptr_t) a); -} - -#define bgp_unset_attr(to, pool, code) ea_unset_attr(to, pool, 0, code) +void bgp_set_attr_u32(ea_list **to, uint code, uint flags, u32 val); +void bgp_set_attr_ptr(ea_list **to, uint code, uint flags, const struct adata *ad); +void bgp_set_attr_data(ea_list **to, uint code, uint flags, void *data, uint len); +void bgp_unset_attr(ea_list **to, uint code); int bgp_encode_mp_reach_mrt(struct bgp_write_state *s, eattr *a, byte *buf, uint size); int bgp_encode_attrs(struct bgp_write_state *s, ea_list *attrs, byte *buf, byte *end); ea_list * bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len); -void bgp_finish_attrs(struct bgp_parse_state *s, rta *a); +void bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to); + +void bgp_setup_out_table(struct bgp_channel *c); + +void bgp_init_pending_tx(struct bgp_channel *c); +void bgp_free_pending_tx(struct bgp_channel *c); -void bgp_init_bucket_table(struct bgp_channel *c); -void bgp_free_bucket_table(struct bgp_channel *c); -void bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b); -void bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b); void bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b); +int bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b); -void bgp_init_prefix_table(struct bgp_channel *c); -void bgp_free_prefix_table(struct bgp_channel *c); -void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *bp); +void bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck); -int bgp_rte_better(struct rte *, struct rte *); -int bgp_rte_mergable(rte *pri, rte *sec); -int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best); -struct rte *bgp_rte_modify_stale(struct rte *r, struct linpool *pool); -u32 bgp_rte_igp_metric(struct rte *); -void bgp_rt_notify(struct proto *P, struct channel *C, net *n, rte *new, rte *old); +int bgp_rte_better(const rte *, const rte *); +int bgp_rte_mergable(const rte *pri, const rte *sec); +int bgp_rte_recalculate(struct rtable_private *table, net *net, rte *new, rte *old, rte *old_best); +void bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *first, struct rt_pending_export *last, const rte **feed, uint count); +u32 bgp_rte_igp_metric(const rte *); +void bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, const rte *old); int bgp_preexport(struct channel *, struct rte *); -int bgp_get_attr(const struct eattr *e, byte *buf, int buflen); -void bgp_get_route_info(struct rte *, byte *buf); -int bgp_total_aigp_metric_(rte *e, u64 *metric, const struct adata **ad); +void bgp_get_route_info(const rte *, byte *); +int bgp_total_aigp_metric_(const rte *e, u64 *metric, const struct adata **ad); + +static inline struct bgp_proto *bgp_rte_proto(const rte *rte) +{ + return (rte->src->owner->class == &bgp_rte_owner_class) ? + SKIP_BACK(struct bgp_proto, p.sources, rte->src->owner) : NULL; +} #define BGP_AIGP_METRIC 1 #define BGP_AIGP_MAX U64(0xffffffffffffffff) static inline u64 -bgp_total_aigp_metric(rte *r) +bgp_total_aigp_metric(const rte *e) { u64 metric = BGP_AIGP_MAX; const struct adata *ad; - bgp_total_aigp_metric_(r, &metric, &ad); + bgp_total_aigp_metric_(e, &metric, &ad); return metric; } +void bgp_register_attrs(void); + /* packets.c */ @@ -640,6 +657,7 @@ void bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) void bgp_kick_tx(void *vconn); void bgp_tx(struct birdsock *sk); int bgp_rx(struct birdsock *sk, uint size); +void bgp_uncork(void *vp); const char * bgp_error_dsc(unsigned code, unsigned subcode); void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len); @@ -665,27 +683,32 @@ void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to); #define BAF_DECODE_FLAGS 0x0100 /* Private flag - attribute flags are handled by the decode hook */ -#define BA_ORIGIN 0x01 /* RFC 4271 */ /* WM */ -#define BA_AS_PATH 0x02 /* WM */ -#define BA_NEXT_HOP 0x03 /* WM */ -#define BA_MULTI_EXIT_DISC 0x04 /* ON */ -#define BA_LOCAL_PREF 0x05 /* WD */ -#define BA_ATOMIC_AGGR 0x06 /* WD */ -#define BA_AGGREGATOR 0x07 /* OT */ -#define BA_COMMUNITY 0x08 /* RFC 1997 */ /* OT */ -#define BA_ORIGINATOR_ID 0x09 /* RFC 4456 */ /* ON */ -#define BA_CLUSTER_LIST 0x0a /* RFC 4456 */ /* ON */ -#define BA_MP_REACH_NLRI 0x0e /* RFC 4760 */ -#define BA_MP_UNREACH_NLRI 0x0f /* RFC 4760 */ -#define BA_EXT_COMMUNITY 0x10 /* RFC 4360 */ -#define BA_AS4_PATH 0x11 /* RFC 6793 */ -#define BA_AS4_AGGREGATOR 0x12 /* RFC 6793 */ -#define BA_AIGP 0x1a /* RFC 7311 */ -#define BA_LARGE_COMMUNITY 0x20 /* RFC 8092 */ +enum bgp_attr_id { + BA_ORIGIN = 0x01, /* RFC 4271 */ /* WM */ + BA_AS_PATH = 0x02, /* WM */ + BA_NEXT_HOP = 0x03, /* WM */ + BA_MULTI_EXIT_DISC = 0x04, /* ON */ + BA_LOCAL_PREF = 0x05, /* WD */ + BA_ATOMIC_AGGR = 0x06, /* WD */ + BA_AGGREGATOR = 0x07, /* OT */ + BA_COMMUNITY = 0x08, /* RFC 1997 */ /* OT */ + BA_ORIGINATOR_ID = 0x09, /* RFC 4456 */ /* ON */ + BA_CLUSTER_LIST = 0x0a, /* RFC 4456 */ /* ON */ + BA_MP_REACH_NLRI = 0x0e, /* RFC 4760 */ + BA_MP_UNREACH_NLRI = 0x0f, /* RFC 4760 */ + BA_EXT_COMMUNITY = 0x10, /* RFC 4360 */ + BA_AS4_PATH = 0x11, /* RFC 6793 */ + BA_AS4_AGGREGATOR = 0x12, /* RFC 6793 */ + BA_AIGP = 0x1a, /* RFC 7311 */ + BA_LARGE_COMMUNITY = 0x20, /* RFC 8092 */ #define BA_ONLY_TO_CUSTOMER 0x23 /* RFC 9234 */ /* Bird's private internal BGP attributes */ -#define BA_MPLS_LABEL_STACK 0xfe /* MPLS label stack transfer attribute */ + BA_MPLS_LABEL_STACK = 0x100, /* MPLS label stack transfer attribute */ + +/* Maximum */ + BGP_ATTR_MAX, +}; /* BGP connection states */ diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index a2dfa747..8b0d4356 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -19,18 +19,17 @@ CF_DECLS CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE, MULTIHOP, STARTUP, VIA, NEXT, HOP, SELF, DEFAULT, PATH, METRIC, ERROR, - START, DELAY, FORGET, WAIT, ENABLE, DISABLE, AFTER, BGP_PATH, - BGP_LOCAL_PREF, BGP_MED, BGP_ORIGIN, BGP_NEXT_HOP, BGP_ATOMIC_AGGR, - BGP_AGGREGATOR, BGP_COMMUNITY, BGP_EXT_COMMUNITY, BGP_LARGE_COMMUNITY, + START, DELAY, FORGET, WAIT, ENABLE, DISABLE, AFTER, + BGP_LOCAL_PREF, BGP_MED, SOURCE, ADDRESS, PASSWORD, RR, RS, CLIENT, CLUSTER, ID, AS4, ADVERTISE, IPV4, CAPABILITIES, LIMIT, PASSIVE, PREFER, OLDER, MISSING, LLADDR, - DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, - BGP_CLUSTER_LIST, IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, + DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, + IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC, SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX, GRACEFUL, RESTART, AWARE, CHECK, LINK, PORT, EXTENDED, MESSAGES, SETKEY, STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG, LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS, - DYNAMIC, RANGE, NAME, DIGITS, BGP_AIGP, AIGP, ORIGINATE, COST, ENFORCE, + DYNAMIC, RANGE, NAME, DIGITS, AIGP, ORIGINATE, COST, ENFORCE, FIRST, FREE, VALIDATE, BASE, ROLE, ROLES, PEER, PROVIDER, CUSTOMER, RS_SERVER, RS_CLIENT, REQUIRE, BGP_OTC, GLOBAL) @@ -46,12 +45,14 @@ CF_KEYWORDS(CEASE, PREFIX, LIMIT, HIT, ADMINISTRATIVE, SHUTDOWN, RESET, PEER, CF_GRAMMAR /* Workaround for collisions between keywords and symbols */ -symbol: ROLE | PEER | PROVIDER | CUSTOMER | RS_SERVER | RS_CLIENT ; +toksym: ROLE | PEER | PROVIDER | CUSTOMER | RS_SERVER | RS_CLIENT ; +toksym: BGP_MED | BGP_LOCAL_PREF | SOURCE ; proto: bgp_proto '}' ; bgp_proto_start: proto_start BGP { this_proto = proto_config_new(&proto_bgp, $1); + this_proto->loop_order = DOMAIN_ORDER(proto); BGP_CFG->local_port = BGP_PORT; BGP_CFG->remote_port = BGP_PORT; BGP_CFG->multihop = -1; /* undefined */ @@ -329,43 +330,14 @@ bgp_channel_end: if (!this_channel->table) cf_error("Routing table not specified"); + if (BGP_CC->import_table) + this_channel->in_keep |= RIK_PREFILTER; + this_channel = NULL; }; bgp_proto_channel: bgp_channel_start bgp_channel_opt_list bgp_channel_end; - -dynamic_attr: BGP_ORIGIN - { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_ENUM_BGP_ORIGIN, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); } ; -dynamic_attr: BGP_PATH - { $$ = f_new_dynamic_attr(EAF_TYPE_AS_PATH, T_PATH, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); } ; -dynamic_attr: BGP_NEXT_HOP - { $$ = f_new_dynamic_attr(EAF_TYPE_IP_ADDRESS, T_IP, EA_CODE(PROTOCOL_BGP, BA_NEXT_HOP)); } ; -dynamic_attr: BGP_MED - { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); } ; -dynamic_attr: BGP_LOCAL_PREF - { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); } ; -dynamic_attr: BGP_ATOMIC_AGGR - { $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_ATOMIC_AGGR)); } ; -dynamic_attr: BGP_AGGREGATOR - { $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_AGGREGATOR)); } ; -dynamic_attr: BGP_COMMUNITY - { $$ = f_new_dynamic_attr(EAF_TYPE_INT_SET, T_CLIST, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)); } ; -dynamic_attr: BGP_ORIGINATOR_ID - { $$ = f_new_dynamic_attr(EAF_TYPE_ROUTER_ID, T_QUAD, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID)); } ; -dynamic_attr: BGP_CLUSTER_LIST - { $$ = f_new_dynamic_attr(EAF_TYPE_INT_SET, T_CLIST, EA_CODE(PROTOCOL_BGP, BA_CLUSTER_LIST)); } ; -dynamic_attr: BGP_EXT_COMMUNITY - { $$ = f_new_dynamic_attr(EAF_TYPE_EC_SET, T_ECLIST, EA_CODE(PROTOCOL_BGP, BA_EXT_COMMUNITY)); } ; -dynamic_attr: BGP_AIGP - { $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_AIGP)); } ; -dynamic_attr: BGP_LARGE_COMMUNITY - { $$ = f_new_dynamic_attr(EAF_TYPE_LC_SET, T_LCLIST, EA_CODE(PROTOCOL_BGP, BA_LARGE_COMMUNITY)); } ; -dynamic_attr: BGP_OTC - { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_CODE(PROTOCOL_BGP, BA_ONLY_TO_CUSTOMER)); } ; - - - CF_ENUM(T_ENUM_BGP_ORIGIN, ORIGIN_, IGP, EGP, INCOMPLETE) CF_CODE diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index 16818cf3..546f0809 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -15,8 +15,8 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "proto/mrt/mrt.h" #include "conf/conf.h" #include "lib/unaligned.h" @@ -973,7 +973,7 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session); bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE); - bgp_start_timer(conn->hold_timer, conn->hold_time); + bgp_start_timer(p, conn->hold_timer, conn->hold_time); bgp_conn_enter_openconfirm_state(conn); } @@ -1002,7 +1002,7 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) #define MISMATCHED_AF " - mismatched address family (%I for %s)" static void -bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll) +bgp_apply_next_hop(struct bgp_parse_state *s, ea_list **to, ip_addr gw, ip_addr ll) { struct bgp_proto *p = s->proto; struct bgp_channel *c = s->channel; @@ -1025,10 +1025,18 @@ bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll) if (nbr->scope == SCOPE_HOST) WITHDRAW(BAD_NEXT_HOP " - address %I is local", nbr->addr); - a->dest = RTD_UNICAST; - a->nh.gw = nbr->addr; - a->nh.iface = nbr->iface; - a->igp_metric = c->cf->cost; + ea_set_attr_u32(to, &ea_gen_igp_metric, 0, c->cf->cost); + + struct nexthop_adata nhad = { + .nh = { + .gw = nbr->addr, + .iface = nbr->iface, + }, + .ad = { + .length = sizeof nhad - sizeof nhad.ad, + }, + }; + ea_set_attr_data(to, &ea_gen_nexthop, 0, nhad.ad.data, nhad.ad.length); } else /* GW_RECURSIVE */ { @@ -1037,59 +1045,52 @@ bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll) rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6; ip_addr lla = (c->cf->next_hop_prefer == NHP_LOCAL) ? ll : IPA_NONE; - s->hostentry = rt_get_hostentry(tab, gw, lla, c->c.table); - - if (!s->mpls) - rta_apply_hostentry(a, s->hostentry, NULL); - /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */ + if (s->mpls) + { + u32 labels[BGP_MPLS_MAX]; + ea_set_hostentry(to, c->c.table, tab, gw, lla, BGP_MPLS_MAX, labels); + } + else + ea_set_hostentry(to, c->c.table, tab, gw, lla, 0, NULL); } } static void -bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum) +bgp_apply_mpls_labels(struct bgp_parse_state *s, ea_list **to, u32 lnum, u32 labels[lnum]) { if (lnum > MPLS_MAX_LABEL_STACK) { REPORT("Too many MPLS labels ($u)", lnum); - a->dest = RTD_UNREACHABLE; - a->hostentry = NULL; - a->nh = (struct nexthop) { }; + ea_set_dest(to, 0, RTD_UNREACHABLE); return; } /* Handle implicit NULL as empty MPLS stack */ if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL)) - lnum = 0; + lnum = s->mpls_labels->length = 0; if (s->channel->cf->gw_mode == GW_DIRECT) { - a->nh.labels = lnum; - memcpy(a->nh.label, labels, 4*lnum); + eattr *e = ea_find(*to, &ea_gen_nexthop); + struct { + struct nexthop_adata nhad; + u32 labels[MPLS_MAX_LABEL_STACK]; + } nh; + + memcpy(&nh.nhad, e->u.ptr, sizeof(struct adata) + e->u.ptr->length); + nh.nhad.nh.labels = lnum; + memcpy(nh.labels, labels, lnum * sizeof(u32)); + nh.nhad.ad.length = sizeof nh.nhad + lnum * sizeof(u32); } else /* GW_RECURSIVE */ { - mpls_label_stack ms; - - ms.len = lnum; - memcpy(ms.stack, labels, 4*lnum); - rta_apply_hostentry(a, s->hostentry, &ms); - } -} - -static void -bgp_apply_flow_validation(struct bgp_parse_state *s, const net_addr *n, rta *a) -{ - struct bgp_channel *c = s->channel; - int valid = rt_flowspec_check(c->base_table, c->c.table, n, a, s->proto->is_interior); - a->dest = valid ? RTD_NONE : RTD_UNREACHABLE; - - /* Invalidate cached rta if dest changes */ - if (s->cached_rta && (s->cached_rta->dest != a->dest)) - { - rta_free(s->cached_rta); - s->cached_rta = NULL; + eattr *e = ea_find(*to, &ea_gen_hostentry); + ASSERT_DIE(e); + struct hostentry_adata *head = (void *) e->u.ptr; + memcpy(&head->labels, labels, lnum * sizeof(u32)); + head->ad.length = (void *)(&head->labels[lnum]) - (void *) head->ad.data; } } @@ -1130,7 +1131,7 @@ bgp_use_next_hop(struct bgp_export_state *s, eattr *a) return 0; /* Do not pass NEXT_HOP between different VRFs */ - if (p->p.vrf_set && s->src && s->src->p.vrf_set && (p->p.vrf != s->src->p.vrf)) + if (p->p.vrf && s->src && s->src->p.vrf && (p->p.vrf != s->src->p.vrf)) return 0; /* Keep it when exported to internal peers */ @@ -1143,35 +1144,45 @@ bgp_use_next_hop(struct bgp_export_state *s, eattr *a) return p->neigh && (p->neigh->iface == ifa); } -static inline int +static inline struct nexthop * bgp_use_gateway(struct bgp_export_state *s) { struct bgp_proto *p = s->proto; struct bgp_channel *c = s->channel; - rta *ra = s->route->attrs; + ea_list *ra = s->route->attrs; /* Handle next hop self option - also applies to gateway */ if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self)) - return 0; + return NULL; + + eattr *nhea = ea_find(ra, &ea_gen_nexthop); + if (!nhea) + return NULL; /* We need one valid global gateway */ - if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw)) - return 0; + struct nexthop_adata *nhad = (struct nexthop_adata *) nhea->u.ptr; + if (!NEXTHOP_IS_REACHABLE(nhad) || + !NEXTHOP_ONE(nhad) || ipa_zero(nhad->nh.gw) || + ipa_is_link_local(nhad->nh.gw)) + return NULL; /* Check for non-matching AF */ - if ((ipa_is_ip4(ra->nh.gw) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop) - return 0; + if ((ipa_is_ip4(nhad->nh.gw) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop) + return NULL; /* Do not use gateway from different VRF */ - if (p->p.vrf_set && ra->nh.iface && (p->p.vrf != ra->nh.iface->master)) + if (p->p.vrf && nhad->nh.iface && (p->p.vrf != nhad->nh.iface->master)) return 0; /* Use it when exported to internal peers */ if (p->is_interior) - return 1; + return &nhad->nh; /* Use it when forwarded to single-hop BGP peer on on the same iface */ - return p->neigh && (p->neigh->iface == ra->nh.iface); + if (p->neigh && (p->neigh->iface == nhad->nh.iface)) + return &nhad->nh; + + return NULL; } static void @@ -1179,36 +1190,36 @@ bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to) { if (!a || !bgp_use_next_hop(s, a)) { - if (bgp_use_gateway(s)) + struct nexthop *nhloc; + if (nhloc = bgp_use_gateway(s)) { - rta *ra = s->route->attrs; - ip_addr nh[1] = { ra->nh.gw }; - bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16); + ip_addr nh[1] = { nhloc->gw }; + bgp_set_attr_data(to, BA_NEXT_HOP, 0, nh, 16); if (s->mpls) { u32 implicit_null = BGP_MPLS_NULL; - u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null; - uint lnum = ra->nh.labels ? ra->nh.labels : 1; - bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4); + u32 *labels = nhloc->labels ? nhloc->label : &implicit_null; + uint lnum = nhloc->labels ? nhloc->labels : 1; + bgp_set_attr_data(to, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4); } else - bgp_unset_attr(to, s->pool, BA_MPLS_LABEL_STACK); + bgp_unset_attr(to, BA_MPLS_LABEL_STACK); } else { ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr }; - bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16); + bgp_set_attr_data(to, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16); s->local_next_hop = 1; /* TODO: Use local MPLS assigned label */ if (s->mpls) { u32 implicit_null = BGP_MPLS_NULL; - bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, &implicit_null, 4); + bgp_set_attr_data(to, BA_MPLS_LABEL_STACK, 0, &implicit_null, 4); } else - bgp_unset_attr(to, s->pool, BA_MPLS_LABEL_STACK); + bgp_unset_attr(to, BA_MPLS_LABEL_STACK); } } @@ -1270,7 +1281,7 @@ bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size } static void -bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a) +bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, ea_list **to) { struct bgp_channel *c = s->channel; struct adata *ad = lp_alloc_adata(s->pool, 32); @@ -1311,8 +1322,8 @@ bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a) // XXXX validate next hop - bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad); - bgp_apply_next_hop(s, a, nh[0], nh[1]); + bgp_set_attr_ptr(to, BA_NEXT_HOP, 0, ad); + bgp_apply_next_hop(s, to, nh[0], nh[1]); } static uint @@ -1350,7 +1361,7 @@ bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint siz } static void -bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a) +bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, ea_list **to) { struct bgp_channel *c = s->channel; struct adata *ad = lp_alloc_adata(s->pool, 32); @@ -1392,8 +1403,8 @@ bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a) // XXXX validate next hop - bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad); - bgp_apply_next_hop(s, a, nh[0], nh[1]); + bgp_set_attr_ptr(to, BA_NEXT_HOP, 0, ad); + bgp_apply_next_hop(s, to, nh[0], nh[1]); } @@ -1405,7 +1416,7 @@ bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte } static void -bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED) +bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, ea_list **to UNUSED) { /* * Although we expect no next hop and RFC 7606 7.11 states that attribute @@ -1417,11 +1428,11 @@ bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, ui } static void -bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to) +bgp_update_next_hop_none(struct bgp_export_state *s UNUSED, eattr *a, ea_list **to) { /* NEXT_HOP shall not pass */ if (a) - bgp_unset_attr(to, s->pool, BA_NEXT_HOP); + bgp_unset_attr(to, BA_NEXT_HOP); } @@ -1430,15 +1441,17 @@ bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to) */ static void -bgp_rte_update(struct bgp_parse_state *s, const net_addr *n, u32 path_id, rta *a0) +bgp_rte_update(struct bgp_parse_state *s, const net_addr *n, u32 path_id, ea_list *a0) { if (path_id != s->last_id) { + rt_unlock_source(s->last_src); + s->last_src = rt_get_source(&s->proto->p, path_id); s->last_id = path_id; - rta_free(s->cached_rta); - s->cached_rta = NULL; + ea_free(s->cached_ea); + s->cached_ea = NULL; } if (!a0) @@ -1448,23 +1461,20 @@ bgp_rte_update(struct bgp_parse_state *s, const net_addr *n, u32 path_id, rta *a REPORT("Invalid route %N withdrawn", n); /* Route withdraw */ - rte_update3(&s->channel->c, n, NULL, s->last_src); + rte_update(&s->channel->c, n, NULL, s->last_src); return; } /* Prepare cached route attributes */ - if (s->cached_rta == NULL) - { - /* Workaround for rta_lookup() breaking eattrs */ - ea_list *ea = a0->eattrs; - s->cached_rta = rta_lookup(a0); - a0->eattrs = ea; - } + if (s->cached_ea == NULL) + s->cached_ea = ea_lookup(a0, 0); - rta *a = rta_clone(s->cached_rta); - rte *e = rte_get_temp(a, s->last_src); + rte e0 = { + .attrs = s->cached_ea, + .src = s->last_src, + }; - rte_update3(&s->channel->c, n, e, s->last_src); + rte_update(&s->channel->c, n, &e0, s->last_src); } static void @@ -1487,9 +1497,10 @@ bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, const adata *mpls, byte } static void -bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a) +bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, ea_list **to) { - u32 labels[BGP_MPLS_MAX], label; + u32 labels[BGP_MPLS_MAX]; + u32 label; uint lnum = 0; do { @@ -1508,26 +1519,15 @@ bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *p } while (!(label & BGP_MPLS_BOS)); - if (!a) + if (!*to) return; - /* Attach MPLS attribute unless we already have one */ - if (!s->mpls_labels) - { - s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX); - bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels); - } - - /* Overwrite data in the attribute */ - s->mpls_labels->length = 4*lnum; - memcpy(s->mpls_labels->data, labels, 4*lnum); - /* Update next hop entry in rta */ - bgp_apply_mpls_labels(s, a, labels, lnum); + bgp_apply_mpls_labels(s, to, lnum, labels); /* Attributes were changed, invalidate cached entry */ - rta_free(s->cached_rta); - s->cached_rta = NULL; + rta_free(s->cached_ea); + s->cached_ea = NULL; return; } @@ -1563,14 +1563,14 @@ bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1596,7 +1596,7 @@ bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) /* Decode MPLS labels */ if (s->mpls) - bgp_decode_mpls_labels(s, &pos, &len, &l, a); + bgp_decode_mpls_labels(s, &pos, &len, &l, &a); if (l > IP4_MAX_PREFIX_LENGTH) bgp_parse_error(s, 10); @@ -1648,14 +1648,14 @@ bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1681,7 +1681,7 @@ bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) /* Decode MPLS labels */ if (s->mpls) - bgp_decode_mpls_labels(s, &pos, &len, &l, a); + bgp_decode_mpls_labels(s, &pos, &len, &l, &a); if (l > IP6_MAX_PREFIX_LENGTH) bgp_parse_error(s, 10); @@ -1736,14 +1736,14 @@ bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *b memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1769,7 +1769,7 @@ bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) /* Decode MPLS labels */ if (s->mpls) - bgp_decode_mpls_labels(s, &pos, &len, &l, a); + bgp_decode_mpls_labels(s, &pos, &len, &l, &a); /* Decode route distinguisher */ if (l < 64) @@ -1833,14 +1833,14 @@ bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *b memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1866,7 +1866,7 @@ bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) /* Decode MPLS labels */ if (s->mpls) - bgp_decode_mpls_labels(s, &pos, &len, &l, a); + bgp_decode_mpls_labels(s, &pos, &len, &l, &a); /* Decode route distinguisher */ if (l < 64) @@ -1920,14 +1920,14 @@ bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte * memcpy(pos, net->data, flen); ADVANCE(pos, size, flen); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1978,10 +1978,6 @@ bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) net_fill_flow4(n, px, pxlen, pos, flen); ADVANCE(pos, len, flen); - /* Apply validation procedure per RFC 8955 (6) */ - if (a && s->channel->cf->validate) - bgp_apply_flow_validation(s, n, a); - bgp_rte_update(s, n, path_id, a); } } @@ -2012,14 +2008,14 @@ bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte * memcpy(pos, net->data, flen); ADVANCE(pos, size, flen); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -2070,10 +2066,6 @@ bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) net_fill_flow6(n, px, pxlen, pos, flen); ADVANCE(pos, len, flen); - /* Apply validation procedure per RFC 8955 (6) */ - if (a && s->channel->cf->validate) - bgp_apply_flow_validation(s, n, a); - bgp_rte_update(s, n, path_id, a); } } @@ -2250,6 +2242,8 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu * var IPv4 Network Layer Reachability Information */ + ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck); + int lr, la; la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH); @@ -2271,6 +2265,8 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu static byte * bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { + ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck); + /* * 2 B IPv4 Withdrawn Routes Length (zero) * --- IPv4 Withdrawn Routes NLRI (unused) @@ -2409,7 +2405,7 @@ again: ; }; /* Try unreachable bucket */ - if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) + if ((buck = c->ptx->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) { res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ? bgp_create_ip_unreach(&s, buck, buf, end): @@ -2419,14 +2415,13 @@ again: ; } /* Try reachable buckets */ - if (!EMPTY_LIST(c->bucket_queue)) + if (!EMPTY_LIST(c->ptx->bucket_queue)) { - buck = HEAD(c->bucket_queue); + buck = HEAD(c->ptx->bucket_queue); /* Cleanup empty buckets */ - if (EMPTY_LIST(buck->prefixes)) + if (bgp_done_bucket(c, buck)) { - bgp_free_bucket(c, buck); lp_restore(tmp_linpool, &tmpp); goto again; } @@ -2435,10 +2430,7 @@ again: ; bgp_create_ip_reach(&s, buck, buf, end): bgp_create_mp_reach(&s, buck, buf, end); - if (EMPTY_LIST(buck->prefixes)) - bgp_free_bucket(c, buck); - else - bgp_defer_bucket(c, buck); + bgp_done_bucket(c, buck); if (!res) { @@ -2522,7 +2514,6 @@ static inline void bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len) { struct bgp_channel *c = bgp_get_channel(s->proto, afi); - rta *a = NULL; if (!c) DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi)); @@ -2533,6 +2524,7 @@ bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_lis s->last_id = 0; s->last_src = s->proto->p.main_source; + rt_lock_source(s->last_src); /* * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not @@ -2543,26 +2535,24 @@ bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_lis if (ea) { - a = allocz(RTA_MAX_SIZE); + ea_set_attr_data(&ea, &ea_gen_from, 0, &s->proto->remote_ip, sizeof(ip_addr)); + ea_set_attr_u32(&ea, &ea_gen_preference, 0, c->c.preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_BGP); - a->source = RTS_BGP; - a->scope = SCOPE_UNIVERSE; - a->from = s->proto->remote_ip; - a->eattrs = ea; - a->pref = c->c.preference; - - c->desc->decode_next_hop(s, nh, nh_len, a); - bgp_finish_attrs(s, a); + c->desc->decode_next_hop(s, nh, nh_len, &ea); + bgp_finish_attrs(s, &ea); /* Handle withdraw during next hop decoding */ if (s->err_withdraw) - a = NULL; + ea = NULL; } - c->desc->decode_nlri(s, nlri, len, a); + c->desc->decode_nlri(s, nlri, len, ea); + + rta_free(s->cached_ea); + s->cached_ea = NULL; - rta_free(s->cached_rta); - s->cached_rta = NULL; + rt_unlock_source(s->last_src); } static void @@ -2582,7 +2572,7 @@ bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) if (conn->state != BS_ESTABLISHED) { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - bgp_start_timer(conn->hold_timer, conn->hold_time); + bgp_start_timer(p, conn->hold_timer, conn->hold_time); struct lp_state tmpp; lp_save(tmp_linpool, &tmpp); @@ -2667,7 +2657,7 @@ bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) ea, s.mp_next_hop_data, s.mp_next_hop_len); done: - rta_free(s.cached_rta); + rta_free(s.cached_ea); lp_restore(tmp_linpool, &tmpp); return; } @@ -2923,7 +2913,7 @@ bgp_fire_tx(struct bgp_conn *conn) { conn->packets_to_send &= ~(1 << PKT_KEEPALIVE); BGP_TRACE(D_PACKETS, "Sending KEEPALIVE"); - bgp_start_timer(conn->keepalive_timer, conn->keepalive_time); + bgp_start_timer(p, conn->keepalive_timer, conn->keepalive_time); return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH); } else while (conn->channels_to_send) @@ -2991,7 +2981,11 @@ bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) { ASSERT(conn->sk); - DBG("BGP: Scheduling packet type %d\n", type); + struct bgp_proto *p = conn->bgp; + if (c) + BGP_TRACE(D_PACKETS, "Scheduling packet type %d for channel %s", type, c->c.name); + else + BGP_TRACE(D_PACKETS, "Scheduling packet type %d", type); if (c) { @@ -3008,7 +3002,7 @@ bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) conn->packets_to_send |= 1 << type; if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev)) - ev_schedule(conn->tx_ev); + proto_send_event(&p->p, conn->tx_ev); } void bgp_kick_tx(void *vconn) @@ -3021,7 +3015,7 @@ bgp_kick_tx(void *vconn) ; if (!max && !ev_active(conn->tx_ev)) - ev_schedule(conn->tx_ev); + proto_send_event(&conn->bgp->p, conn->tx_ev); } void @@ -3029,13 +3023,14 @@ bgp_tx(sock *sk) { struct bgp_conn *conn = sk->data; + ASSERT_DIE(birdloop_inside(conn->bgp->p.loop)); DBG("BGP: TX hook\n"); uint max = 1024; while (--max && (bgp_fire_tx(conn) > 0)) ; if (!max && !ev_active(conn->tx_ev)) - ev_schedule(conn->tx_ev); + proto_send_event(&conn->bgp->p, conn->tx_ev); } @@ -3224,7 +3219,7 @@ bgp_rx_keepalive(struct bgp_conn *conn) struct bgp_proto *p = conn->bgp; BGP_TRACE(D_PACKETS, "Got KEEPALIVE"); - bgp_start_timer(conn->hold_timer, conn->hold_time); + bgp_start_timer(p, conn->hold_timer, conn->hold_time); if (conn->state == BS_OPENCONFIRM) { bgp_conn_enter_established_state(conn); return; } @@ -3266,6 +3261,30 @@ bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len) } } +void +bgp_uncork(void *vp) +{ + /* The uncork event is run from &main_birdloop and there is no useful way how + * to assign the target loop to it, thus we have to lock it ourselves. */ + + struct bgp_proto *p = vp; + if (!p) + return; + + birdloop_enter(p->p.loop); + + if (p && p->conn && (p->conn->state == BS_ESTABLISHED) && !p->conn->sk->rx_hook) + { + struct birdsock *sk = p->conn->sk; + ASSERT_DIE(sk->rpos > sk->rbuf); + sk_resume_rx(p->p.loop, sk, bgp_rx); + bgp_rx(sk, sk->rpos - sk->rbuf); + BGP_TRACE(D_PACKETS, "Uncorked"); + } + + birdloop_leave(p->p.loop); +} + /** * bgp_rx - handle received data * @sk: socket @@ -3280,6 +3299,7 @@ int bgp_rx(sock *sk, uint size) { struct bgp_conn *conn = sk->data; + struct bgp_proto *p = conn->bgp; byte *pkt_start = sk->rbuf; byte *end = pkt_start + size; uint i, len; @@ -3289,6 +3309,12 @@ bgp_rx(sock *sk, uint size) { if ((conn->state == BS_CLOSE) || (conn->sk != sk)) return 0; + if ((conn->state == BS_ESTABLISHED) && rt_cork_check(conn->bgp->uncork_ev)) + { + sk_pause_rx(p->p.loop, sk); + BGP_TRACE(D_PACKETS, "Corked"); + return 0; + } for(i=0; i<16; i++) if (pkt_start[i] != 0xff) { diff --git a/proto/mrt/mrt.c b/proto/mrt/mrt.c index d1c334e1..82fd426a 100644 --- a/proto/mrt/mrt.c +++ b/proto/mrt/mrt.c @@ -228,7 +228,7 @@ mrt_next_table_(rtable *tab, rtable *tab_ptr, const char *pattern) NODE_VALID(tn); tn = tn->next) { - tab = SKIP_BACK(struct rtable, n, tn); + tab = SKIP_BACK(rtable, n, tn); if (patmatch(pattern, tab->name) && ((tab->addr_type == NET_IP4) || (tab->addr_type == NET_IP6))) return tab; @@ -243,13 +243,15 @@ mrt_next_table(struct mrt_table_dump_state *s) rtable *tab = mrt_next_table_(s->table, s->table_ptr, s->table_expr); if (s->table) - rt_unlock_table(s->table); + RT_LOCKED(s->table, tab) + rt_unlock_table(tab); s->table = tab; s->ipv4 = tab ? (tab->addr_type == NET_IP4) : 0; if (s->table) - rt_lock_table(s->table); + RT_LOCKED(s->table, tab) + rt_lock_table(tab); return s->table; } @@ -423,7 +425,7 @@ mrt_rib_table_header(struct mrt_table_dump_state *s, net_addr *n) static void mrt_rib_table_entry_bgp_attrs(struct mrt_table_dump_state *s, rte *r) { - struct ea_list *eattrs = r->attrs->eattrs; + struct ea_list *eattrs = r->attrs; buffer *b = &s->buf; if (!eattrs) @@ -431,7 +433,7 @@ mrt_rib_table_entry_bgp_attrs(struct mrt_table_dump_state *s, rte *r) /* Attribute list must be normalized for bgp_encode_attrs() */ if (!rta_is_cached(r->attrs)) - ea_normalize(eattrs); + eattrs = ea_normalize(eattrs, 0); mrt_buffer_need(b, MRT_ATTR_BUFFER_SIZE); byte *pos = b->pos; @@ -460,7 +462,7 @@ mrt_rib_table_entry_bgp_attrs(struct mrt_table_dump_state *s, rte *r) return; fail: - mrt_log(s, "Attribute list too long for %N", r->net->n.addr); + mrt_log(s, "Attribute list too long for %N", r->net); } #endif @@ -472,9 +474,9 @@ mrt_rib_table_entry(struct mrt_table_dump_state *s, rte *r) #ifdef CONFIG_BGP /* Find peer index */ - if (r->src->proto->proto == &proto_bgp) + struct bgp_proto *p = bgp_rte_proto(r); + if (p) { - struct bgp_proto *p = (void *) r->src->proto; struct mrt_peer_entry *n = HASH_FIND(s->peer_hash, PEER, p->remote_id, p->remote_as, p->remote_ip); @@ -512,24 +514,21 @@ mrt_rib_table_dump(struct mrt_table_dump_state *s, net *n, int add_path) mrt_init_message(&s->buf, MRT_TABLE_DUMP_V2, subtype); mrt_rib_table_header(s, n->n.addr); - rte *rt, *rt0; - for (rt0 = n->routes; rt = rt0; rt0 = rt0->next) + for (struct rte_storage *rt, *rt0 = n->routes; rt = rt0; rt0 = rt0->next) { - if (rte_is_filtered(rt)) + if (rte_is_filtered(&rt->rte)) continue; /* Skip routes that should be reported in the other phase */ - if (!s->always_add_path && (!rt->src->private_id != !s->add_path)) + if (!s->always_add_path && (!rt->rte.src->private_id != !s->add_path)) { s->want_add_path = 1; continue; } - if (f_run(s->filter, &rt, s->linpool, 0) <= F_ACCEPT) - mrt_rib_table_entry(s, rt); - - if (rt != rt0) - rte_free(rt); + rte e = rt->rte; + if (f_run(s->filter, &e, 0) <= F_ACCEPT) + mrt_rib_table_entry(s, &e); lp_flush(s->linpool); } @@ -576,14 +575,18 @@ mrt_table_dump_init(pool *pp) static void mrt_table_dump_free(struct mrt_table_dump_state *s) { - if (s->table_open) - FIB_ITERATE_UNLINK(&s->fit, &s->table->fib); - if (s->table) - rt_unlock_table(s->table); + RT_LOCKED(s->table, tab) + { + if (s->table_open) + FIB_ITERATE_UNLINK(&s->fit, &tab->fib); + + rt_unlock_table(tab); + } if (s->table_ptr) - rt_unlock_table(s->table_ptr); + RT_LOCKED(s->table_ptr, tab) + rt_unlock_table(tab); config_del_obstacle(s->config); @@ -609,16 +612,19 @@ mrt_table_dump_step(struct mrt_table_dump_state *s) mrt_peer_table_dump(s); - FIB_ITERATE_INIT(&s->fit, &s->table->fib); + RT_LOCKED(s->table, tab) + { + + FIB_ITERATE_INIT(&s->fit, &tab->fib); s->table_open = 1; step: - FIB_ITERATE_START(&s->table->fib, &s->fit, net, n) + FIB_ITERATE_START(&tab->fib, &s->fit, net, n) { if (s->max < 0) { FIB_ITERATE_PUT(&s->fit); - return 0; + RT_RETURN(tab, 0); } /* With Always ADD_PATH option, we jump directly to second phase */ @@ -633,6 +639,8 @@ mrt_table_dump_step(struct mrt_table_dump_state *s) FIB_ITERATE_END; s->table_open = 0; + } + mrt_close_file(s); mrt_peer_table_flush(s); } @@ -664,7 +672,8 @@ mrt_timer(timer *t) s->always_add_path = cf->always_add_path; if (s->table_ptr) - rt_lock_table(s->table_ptr); + RT_LOCKED(s->table_ptr, tab) + rt_lock_table(tab); p->table_dump = s; ev_schedule(p->event); @@ -706,14 +715,17 @@ mrt_dump_cont(struct cli *c) cli_printf(c, 0, ""); mrt_table_dump_free(c->rover); - c->cont = c->cleanup = c->rover = NULL; + c->cont = NULL; + c->cleanup = NULL; + c->rover = NULL; } -static void +static int mrt_dump_cleanup(struct cli *c) { mrt_table_dump_free(c->rover); c->rover = NULL; + return 0; } void @@ -737,7 +749,8 @@ mrt_dump_cmd(struct mrt_dump_data *d) s->filename = d->filename; if (s->table_ptr) - rt_lock_table(s->table_ptr); + RT_LOCKED(s->table_ptr, tab) + rt_lock_table(tab); this_cli->cont = mrt_dump_cont; this_cli->cleanup = mrt_dump_cleanup; @@ -907,7 +920,6 @@ mrt_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUSE struct protocol proto_mrt = { .name = "MRT", .template = "mrt%d", - .class = PROTOCOL_MRT, .proto_size = sizeof(struct mrt_proto), .config_size = sizeof(struct mrt_config), .init = mrt_init, diff --git a/proto/mrt/mrt.h b/proto/mrt/mrt.h index 4ff94c12..f535a391 100644 --- a/proto/mrt/mrt.h +++ b/proto/mrt/mrt.h @@ -13,7 +13,7 @@ #include "nest/bird.h" #include "nest/protocol.h" #include "lib/lists.h" -#include "nest/route.h" +#include "nest/rt.h" #include "lib/event.h" #include "lib/hash.h" @@ -40,7 +40,7 @@ struct mrt_proto { struct mrt_dump_data { const char *table_expr; - struct rtable *table_ptr; + rtable *table_ptr; const struct filter *filter; const char *filename; }; @@ -60,7 +60,7 @@ struct mrt_table_dump_state { /* Configuration information */ const char *table_expr; /* Wildcard for table name (or NULL) */ - struct rtable *table_ptr; /* Explicit table (or NULL) */ + rtable *table_ptr; /* Explicit table (or NULL) */ const struct filter *filter; /* Optional filter */ const char *filename; /* Filename pattern */ int always_add_path; /* Always use *_ADDPATH message subtypes */ @@ -73,7 +73,7 @@ struct mrt_table_dump_state { HASH(struct mrt_peer_entry) peer_hash; /* Hash for peers to find the index */ - struct rtable *table; /* Processed table, NULL initially */ + rtable *table; /* Processed table, NULL initially */ struct fib_iterator fit; /* Iterator in processed table */ int table_open; /* Whether iterator is linked */ diff --git a/proto/ospf/config.Y b/proto/ospf/config.Y index 4b7d5a36..bc3df8db 100644 --- a/proto/ospf/config.Y +++ b/proto/ospf/config.Y @@ -190,7 +190,7 @@ ospf_check_auth(void) CF_DECLS -CF_KEYWORDS(OSPF, V2, V3, OSPF_METRIC1, OSPF_METRIC2, OSPF_TAG, OSPF_ROUTER_ID) +CF_KEYWORDS(OSPF, V2, V3) CF_KEYWORDS(AREA, NEIGHBORS, RFC1583COMPAT, STUB, TICK, COST, COST2, RETRANSMIT) CF_KEYWORDS(HELLO, TRANSMIT, PRIORITY, DEAD, TYPE, BROADCAST, BCAST, DEFAULT) CF_KEYWORDS(NONBROADCAST, NBMA, POINTOPOINT, PTP, POINTOMULTIPOINT, PTMP) @@ -505,11 +505,6 @@ ospf_iface: ospf_iface_start ospf_iface_patt_list ospf_iface_opt_list { ospf_iface_finish(); } ; -dynamic_attr: OSPF_METRIC1 { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_OSPF_METRIC1); } ; -dynamic_attr: OSPF_METRIC2 { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_OSPF_METRIC2); } ; -dynamic_attr: OSPF_TAG { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_OSPF_TAG); } ; -dynamic_attr: OSPF_ROUTER_ID { $$ = f_new_dynamic_attr(EAF_TYPE_ROUTER_ID, T_QUAD, EA_OSPF_ROUTER_ID); } ; - CF_CLI_HELP(SHOW OSPF, ..., [[Show information about OSPF protocol]]); CF_CLI(SHOW OSPF, optproto, [<name>], [[Show information about OSPF protocol]]) { PROTO_WALK_CMD($3, &proto_ospf, p) ospf_sh(p); }; diff --git a/proto/ospf/iface.c b/proto/ospf/iface.c index 87e3d95e..0aa7fa00 100644 --- a/proto/ospf/iface.c +++ b/proto/ospf/iface.c @@ -136,7 +136,7 @@ ospf_sk_open(struct ospf_iface *ifa) sk->flags = SKF_LADDR_RX | (ifa->check_ttl ? SKF_TTL_RX : 0); sk->ttl = ifa->cf->ttl_security ? 255 : 1; - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; /* 12 is an offset of the checksum in an OSPFv3 packet */ @@ -220,7 +220,7 @@ ospf_open_vlink_sk(struct ospf_proto *p) sk->data = (void *) p; sk->flags = 0; - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; /* 12 is an offset of the checksum in an OSPFv3 packet */ @@ -484,9 +484,9 @@ ospf_iface_find(struct ospf_proto *p, struct iface *what) } static void -ospf_iface_add(struct object_lock *lock) +ospf_iface_add(void *_ifa) { - struct ospf_iface *ifa = lock->data; + struct ospf_iface *ifa = _ifa; struct ospf_proto *p = ifa->oa->po; /* Open socket if interface is not stub */ @@ -668,8 +668,11 @@ ospf_iface_new(struct ospf_area *oa, struct ifa *addr, struct ospf_iface_patt *i lock->port = OSPF_PROTO; lock->inst = ifa->instance_id; lock->iface = iface; - lock->data = ifa; - lock->hook = ospf_iface_add; + lock->event = (event) { + .hook = ospf_iface_add, + .data = ifa, + }; + lock->target = &global_event_list; olock_acquire(lock); } @@ -1222,12 +1225,11 @@ ospf_ifa_notify3(struct proto *P, uint flags, struct ifa *a) static void ospf_reconfigure_ifaces2(struct ospf_proto *p) { - struct iface *iface; struct ifa *a; - WALK_LIST(iface, iface_list) + IFACE_WALK(iface) { - if (p->p.vrf_set && p->p.vrf != iface->master) + if (p->p.vrf && p->p.vrf != iface->master) continue; if (! (iface->flags & IF_UP)) @@ -1271,12 +1273,11 @@ ospf_reconfigure_ifaces2(struct ospf_proto *p) static void ospf_reconfigure_ifaces3(struct ospf_proto *p) { - struct iface *iface; struct ifa *a; - WALK_LIST(iface, iface_list) + IFACE_WALK(iface) { - if (p->p.vrf_set && p->p.vrf != iface->master) + if (p->p.vrf && p->p.vrf != iface->master) continue; if (! (iface->flags & IF_UP)) diff --git a/proto/ospf/neighbor.c b/proto/ospf/neighbor.c index ca369819..b0fdc42f 100644 --- a/proto/ospf/neighbor.c +++ b/proto/ospf/neighbor.c @@ -777,7 +777,7 @@ ospf_neigh_update_bfd(struct ospf_neighbor *n, int use_bfd) if (use_bfd && !n->bfd_req) n->bfd_req = bfd_request_session(n->pool, n->ip, n->ifa->addr->ip, n->ifa->iface, p->p.vrf, - ospf_neigh_bfd_hook, n, NULL); + ospf_neigh_bfd_hook, n, p->p.loop, NULL); if (!use_bfd && n->bfd_req) { diff --git a/proto/ospf/ospf.c b/proto/ospf/ospf.c index ad4b2d14..896bf5a3 100644 --- a/proto/ospf/ospf.c +++ b/proto/ospf/ospf.c @@ -106,11 +106,12 @@ #include <stdlib.h> #include "ospf.h" +#include "lib/macro.h" -static int ospf_preexport(struct channel *P, rte *new); +static int ospf_preexport(struct channel *C, rte *new); static void ospf_reload_routes(struct channel *C); -static int ospf_rte_better(struct rte *new, struct rte *old); -static u32 ospf_rte_igp_metric(struct rte *rt); +static int ospf_rte_better(const rte *new, const rte *old); +static u32 ospf_rte_igp_metric(const rte *rt); static void ospf_disp(timer *timer); @@ -370,39 +371,42 @@ ospf_init(struct proto_config *CF) P->main_channel = proto_add_channel(P, proto_cf_main_channel(CF)); P->rt_notify = ospf_rt_notify; - P->if_notify = ospf_if_notify; - P->ifa_notify = cf->ospf2 ? ospf_ifa_notify2 : ospf_ifa_notify3; + P->iface_sub.if_notify = ospf_if_notify; + P->iface_sub.ifa_notify = cf->ospf2 ? ospf_ifa_notify2 : ospf_ifa_notify3; P->preexport = ospf_preexport; P->reload_routes = ospf_reload_routes; P->feed_begin = ospf_feed_begin; P->feed_end = ospf_feed_end; - P->rte_better = ospf_rte_better; - P->rte_igp_metric = ospf_rte_igp_metric; + + P->sources.class = &ospf_rte_owner_class; return P; } /* If new is better return 1 */ static int -ospf_rte_better(struct rte *new, struct rte *old) +ospf_rte_better(const rte *new, const rte *old) { - u32 new_metric1 = ea_get_int(new->attrs->eattrs, EA_OSPF_METRIC1, LSINFINITY); + u32 new_metric1 = ea_get_int(new->attrs, &ea_ospf_metric1, LSINFINITY); if (new_metric1 == LSINFINITY) return 0; - if(new->attrs->source < old->attrs->source) return 1; - if(new->attrs->source > old->attrs->source) return 0; + u32 ns = rt_get_source_attr(new); + u32 os = rt_get_source_attr(old); + + if (ns < os) return 1; + if (ns > os) return 0; - if(new->attrs->source == RTS_OSPF_EXT2) + if (ns == RTS_OSPF_EXT2) { - u32 old_metric2 = ea_get_int(old->attrs->eattrs, EA_OSPF_METRIC2, LSINFINITY); - u32 new_metric2 = ea_get_int(new->attrs->eattrs, EA_OSPF_METRIC2, LSINFINITY); - if(new_metric2 < old_metric2) return 1; - if(new_metric2 > old_metric2) return 0; + u32 old_metric2 = ea_get_int(old->attrs, &ea_ospf_metric2, LSINFINITY); + u32 new_metric2 = ea_get_int(new->attrs, &ea_ospf_metric2, LSINFINITY); + if (new_metric2 < old_metric2) return 1; + if (new_metric2 > old_metric2) return 0; } - u32 old_metric1 = ea_get_int(old->attrs->eattrs, EA_OSPF_METRIC1, LSINFINITY); + u32 old_metric1 = ea_get_int(old->attrs, &ea_ospf_metric1, LSINFINITY); if (new_metric1 < old_metric1) return 1; @@ -410,12 +414,12 @@ ospf_rte_better(struct rte *new, struct rte *old) } static u32 -ospf_rte_igp_metric(struct rte *rt) +ospf_rte_igp_metric(const rte *rt) { - if (rt->attrs->source == RTS_OSPF_EXT2) + if (rt_get_source_attr(rt) == RTS_OSPF_EXT2) return IGP_METRIC_UNKNOWN; - return ea_get_int(rt->attrs->eattrs, EA_OSPF_METRIC1, LSINFINITY); + return ea_get_int(rt->attrs, &ea_ospf_metric1, LSINFINITY); } void @@ -488,7 +492,7 @@ ospf_preexport(struct channel *C, rte *e) struct ospf_area *oa = ospf_main_area(p); /* Reject our own routes */ - if (e->src->proto == &p->p) + if (e->sender == C->in_req.hook) return -1; /* Do not export routes to stub areas */ @@ -531,7 +535,7 @@ ospf_shutdown(struct proto *P) /* Cleanup locked rta entries */ FIB_WALK(&p->rtf, ort, nf) { - rta_free(nf->old_rta); + ea_free(nf->old_ea); } FIB_WALK_END; @@ -566,11 +570,12 @@ ospf_get_status(struct proto *P, byte * buf) } static void -ospf_get_route_info(rte * rte, byte * buf) +ospf_get_route_info(const rte * rte, byte * buf) { char *type = "<bug>"; - switch (rte->attrs->source) + uint source = rt_get_source_attr(rte); + switch (source) { case RTS_OSPF: type = "I"; @@ -587,42 +592,26 @@ ospf_get_route_info(rte * rte, byte * buf) } buf += bsprintf(buf, " %s", type); - buf += bsprintf(buf, " (%d/%d", rte->attrs->pref, ea_get_int(rte->attrs->eattrs, EA_OSPF_METRIC1, LSINFINITY)); - if (rte->attrs->source == RTS_OSPF_EXT2) - buf += bsprintf(buf, "/%d", ea_get_int(rte->attrs->eattrs, EA_OSPF_METRIC2, LSINFINITY)); + buf += bsprintf(buf, " (%d/%d", rt_get_preference(rte), ea_get_int(rte->attrs, &ea_ospf_metric1, LSINFINITY)); + if (source == RTS_OSPF_EXT2) + buf += bsprintf(buf, "/%d", ea_get_int(rte->attrs, &ea_ospf_metric2, LSINFINITY)); buf += bsprintf(buf, ")"); - if (rte->attrs->source == RTS_OSPF_EXT1 || rte->attrs->source == RTS_OSPF_EXT2) + if (source == RTS_OSPF_EXT1 || source == RTS_OSPF_EXT2) { - eattr *ea = ea_find(rte->attrs->eattrs, EA_OSPF_TAG); + eattr *ea = ea_find(rte->attrs, &ea_ospf_tag); if (ea && (ea->u.data > 0)) buf += bsprintf(buf, " [%x]", ea->u.data); } - eattr *ea = ea_find(rte->attrs->eattrs, EA_OSPF_ROUTER_ID); + eattr *ea = ea_find(rte->attrs, &ea_ospf_router_id); if (ea) buf += bsprintf(buf, " [%R]", ea->u.data); } -static int -ospf_get_attr(const eattr * a, byte * buf, int buflen UNUSED) +static void +ospf_tag_format(const eattr * a, byte * buf, uint buflen) { - switch (a->id) - { - case EA_OSPF_METRIC1: - bsprintf(buf, "metric1"); - return GA_NAME; - case EA_OSPF_METRIC2: - bsprintf(buf, "metric2"); - return GA_NAME; - case EA_OSPF_TAG: - bsprintf(buf, "tag: 0x%08x", a->u.data); - return GA_FULL; - case EA_OSPF_ROUTER_ID: - bsprintf(buf, "router_id"); - return GA_NAME; - default: - return GA_UNKNOWN; - } + bsnprintf(buf, buflen, "0x%08x", a->u.data); } static void @@ -1517,10 +1506,15 @@ ospf_sh_lsadb(struct lsadb_show_data *ld) } +struct rte_owner_class ospf_rte_owner_class = { + .get_route_info = ospf_get_route_info, + .rte_better = ospf_rte_better, + .rte_igp_metric = ospf_rte_igp_metric, +}; + struct protocol proto_ospf = { .name = "OSPF", .template = "ospf%d", - .class = PROTOCOL_OSPF, .preference = DEF_PREF_OSPF, .channel_mask = NB_IP, .proto_size = sizeof(struct ospf_proto), @@ -1531,12 +1525,38 @@ struct protocol proto_ospf = { .shutdown = ospf_shutdown, .reconfigure = ospf_reconfigure, .get_status = ospf_get_status, - .get_attr = ospf_get_attr, - .get_route_info = ospf_get_route_info +}; + +struct ea_class ea_ospf_metric1 = { + .name = "ospf_metric1", + .type = T_INT, +}; + +struct ea_class ea_ospf_metric2 = { + .name = "ospf_metric2", + .type = T_INT, +}; + +struct ea_class ea_ospf_tag = { + .name = "ospf_tag", + .type = T_INT, + .format = ospf_tag_format, +}; + +struct ea_class ea_ospf_router_id = { + .name = "ospf_router_id", + .type = T_QUAD, }; void ospf_build(void) { proto_build(&proto_ospf); + + EA_REGISTER_ALL( + &ea_ospf_metric1, + &ea_ospf_metric2, + &ea_ospf_tag, + &ea_ospf_router_id + ); } diff --git a/proto/ospf/ospf.h b/proto/ospf/ospf.h index 3e704ae8..3477ba5a 100644 --- a/proto/ospf/ospf.h +++ b/proto/ospf/ospf.h @@ -22,7 +22,7 @@ #include "lib/resource.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "nest/locks.h" #include "nest/bfd.h" @@ -939,12 +939,7 @@ struct lsadb_show_data { u32 router; /* Advertising router, 0 -> all */ }; - -#define EA_OSPF_METRIC1 EA_CODE(PROTOCOL_OSPF, 0) -#define EA_OSPF_METRIC2 EA_CODE(PROTOCOL_OSPF, 1) -#define EA_OSPF_TAG EA_CODE(PROTOCOL_OSPF, 2) -#define EA_OSPF_ROUTER_ID EA_CODE(PROTOCOL_OSPF, 3) - +extern struct ea_class ea_ospf_metric1, ea_ospf_metric2, ea_ospf_tag, ea_ospf_router_id; /* * For regular networks, neighbor address must match network prefix. @@ -1007,6 +1002,8 @@ void ospf_sh_state(struct proto *P, int verbose, int reachable); void ospf_sh_lsadb(struct lsadb_show_data *ld); +extern struct rte_owner_class ospf_rte_owner_class; + /* iface.c */ void ospf_iface_chstate(struct ospf_iface *ifa, u8 state); void ospf_iface_sm(struct ospf_iface *ifa, int event); diff --git a/proto/ospf/rt.c b/proto/ospf/rt.c index 471bb586..69c2907d 100644 --- a/proto/ospf/rt.c +++ b/proto/ospf/rt.c @@ -28,24 +28,30 @@ nh_is_vlink(struct nexthop *nhs) static inline int unresolved_vlink(ort *ort) { - return ort->n.nhs && nh_is_vlink(ort->n.nhs); + return ort->n.nhs && nh_is_vlink(&ort->n.nhs->nh); } -static inline struct nexthop * +static inline struct nexthop_adata * new_nexthop(struct ospf_proto *p, ip_addr gw, struct iface *iface, byte weight) { - struct nexthop *nh = lp_allocz(p->nhpool, sizeof(struct nexthop)); - nh->gw = gw; - nh->iface = iface; - nh->weight = weight; - return nh; + struct nexthop_adata *nhad = lp_alloc(p->nhpool, sizeof(struct nexthop_adata)); + *nhad = (struct nexthop_adata) { + .ad = { .length = sizeof *nhad - sizeof nhad->ad, }, + .nh = { + .gw = gw, + .iface = iface, + .weight = weight, + }, + }; + + return nhad; } /* Returns true if there are device nexthops in n */ static inline int -has_device_nexthops(const struct nexthop *n) +has_device_nexthops(struct nexthop_adata *nhad) { - for (; n; n = n->next) + NEXTHOP_WALK(n, nhad) if (ipa_zero(n->gw)) return 1; @@ -53,38 +59,22 @@ has_device_nexthops(const struct nexthop *n) } /* Replace device nexthops with nexthops to gw */ -static struct nexthop * -fix_device_nexthops(struct ospf_proto *p, const struct nexthop *n, ip_addr gw) +static struct nexthop_adata * +fix_device_nexthops(struct ospf_proto *p, struct nexthop_adata *old, ip_addr gw) { - struct nexthop *root1 = NULL; - struct nexthop *root2 = NULL; - struct nexthop **nn1 = &root1; - struct nexthop **nn2 = &root2; - if (!p->ecmp) - return new_nexthop(p, gw, n->iface, n->weight); - - /* This is a bit tricky. We cannot just copy the list and update n->gw, - because the list should stay sorted, so we create two lists, one with new - gateways and one with old ones, and then merge them. */ - - for (; n; n = n->next) { - struct nexthop *nn = new_nexthop(p, ipa_zero(n->gw) ? gw : n->gw, n->iface, n->weight); + struct nexthop_adata *new = (struct nexthop_adata *) lp_store_adata(p->nhpool, old->ad.data, old->ad.length); + new->nh.gw = gw; + return new; + } + struct nexthop_adata *tmp = (struct nexthop_adata *) tmp_copy_adata(&old->ad); + NEXTHOP_WALK(n, tmp) if (ipa_zero(n->gw)) - { - *nn1 = nn; - nn1 = &(nn->next); - } - else - { - *nn2 = nn; - nn2 = &(nn->next); - } - } + n->gw = gw; - return nexthop_merge(root1, root2, 1, 1, p->ecmp, p->nhpool); + return nexthop_sort(tmp, p->nhpool); } @@ -169,9 +159,9 @@ orta_compare(const struct ospf_proto *p, const orta *new, const orta *old) return -1; if (!new->nhs) return 1; - if (nh_is_vlink(new->nhs)) + if (nh_is_vlink(&new->nhs->nh)) return -1; - if (nh_is_vlink(old->nhs)) + if (nh_is_vlink(&old->nhs->nh)) return 1; @@ -279,11 +269,7 @@ ort_merge(struct ospf_proto *p, ort *o, const orta *new) orta *old = &o->n; if (old->nhs != new->nhs) - { - old->nhs = nexthop_merge(old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse, - p->ecmp, p->nhpool); - old->nhs_reuse = 1; - } + old->nhs = nexthop_merge(old->nhs, new->nhs, p->ecmp, p->nhpool); if (old->rid < new->rid) old->rid = new->rid; @@ -295,11 +281,7 @@ ort_merge_ext(struct ospf_proto *p, ort *o, const orta *new) orta *old = &o->n; if (old->nhs != new->nhs) - { - old->nhs = nexthop_merge(old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse, - p->ecmp, p->nhpool); - old->nhs_reuse = 1; - } + old->nhs = nexthop_merge(old->nhs, new->nhs, p->ecmp, p->nhpool); if (old->tag != new->tag) old->tag = 0; @@ -1165,7 +1147,7 @@ ospf_check_vlinks(struct ospf_proto *p) if (tmp && (tmp->color == INSPF) && ipa_nonzero(tmp->lb) && tmp->nhs) { - struct ospf_iface *nhi = ospf_iface_find(p, tmp->nhs->iface); + struct ospf_iface *nhi = ospf_iface_find(p, tmp->nhs->nh.iface); if ((ifa->state != OSPF_IS_PTP) || (ifa->vifa != nhi) @@ -1579,10 +1561,7 @@ ospf_ext_spf(struct ospf_proto *p) /* Replace device nexthops with nexthops to forwarding address from LSA */ if (has_device_nexthops(nfa.nhs)) - { nfa.nhs = fix_device_nexthops(p, nfa.nhs, rt.fwaddr); - nfa.nhs_reuse = 1; - } } if (rt.ebit) @@ -1726,10 +1705,10 @@ ospf_rt_spf(struct ospf_proto *p) static inline int -inherit_nexthops(struct nexthop *pn) +inherit_nexthops(struct nexthop_adata *pn) { /* Proper nexthops (with defined GW) or dummy vlink nexthops (without iface) */ - return pn && (ipa_nonzero(pn->gw) || !pn->iface); + return pn && (ipa_nonzero(pn->nh.gw) || !pn->nh.iface); } static inline ip_addr @@ -1744,12 +1723,12 @@ link_lsa_lladdr(struct ospf_proto *p, struct top_hash_entry *en) return ospf_is_ip4(p) ? ipa_from_ip4(ospf3_6to4(ll)) : ipa_from_ip6(ll); } -static struct nexthop * +static struct nexthop_adata * calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry *par, int pos, uint data, uint lif, uint nif) { struct ospf_proto *p = oa->po; - struct nexthop *pn = par->nhs; + struct nexthop_adata *pn = par->nhs; struct top_hash_entry *link = NULL; struct ospf_iface *ifa = NULL; ip_addr nh = IPA_NONE; @@ -1827,10 +1806,10 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, return NULL; } - struct nexthop *nhs = new_nexthop(p, nh, ifa->iface, ifa->ecmp_weight); + struct nexthop_adata *nhs = new_nexthop(p, nh, ifa->iface, ifa->ecmp_weight); if (ifa->addr->flags & IA_HOST) - nhs->flags = RNF_ONLINK; + nhs->nh.flags = RNF_ONLINK; return nhs; } @@ -1851,7 +1830,7 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, if (ipa_zero(en->lb)) goto bad; - return new_nexthop(p, en->lb, pn->iface, pn->weight); + return new_nexthop(p, en->lb, pn->nh.iface, pn->nh.weight); } else /* OSPFv3 */ { @@ -1859,7 +1838,7 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, * Next-hop is taken from lladdr field of Link-LSA, en->lb_id * is computed in link_back(). */ - link = ospf_hash_find(p->gr, pn->iface->index, en->lb_id, rid, LSA_T_LINK); + link = ospf_hash_find(p->gr, pn->nh.iface->index, en->lb_id, rid, LSA_T_LINK); if (!link) return NULL; @@ -1867,7 +1846,7 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, if (ipa_zero(nh)) return NULL; - return new_nexthop(p, nh, pn->iface, pn->weight); + return new_nexthop(p, nh, pn->nh.iface, pn->nh.weight); } } @@ -1914,7 +1893,7 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry if (!link_back(oa, en, par, lif, nif)) return; - struct nexthop *nhs = calc_next_hop(oa, en, par, pos, data, lif, nif); + struct nexthop_adata *nhs = calc_next_hop(oa, en, par, pos, data, lif, nif); if (!nhs) { log(L_WARN "%s: Cannot find next hop for LSA (Type: %04x, Id: %R, Rt: %R)", @@ -1923,7 +1902,7 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry } /* If en->dist > 0, we know that en->color == CANDIDATE and en->nhs is defined. */ - if ((dist == en->dist) && !nh_is_vlink(en->nhs)) + if ((dist == en->dist) && !nh_is_vlink(&en->nhs->nh)) { /* * For multipath, we should merge nexthops. We merge regular nexthops only. @@ -1947,13 +1926,11 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry */ /* Keep old ones */ - if (!p->ecmp || nh_is_vlink(nhs) || (nhs == en->nhs)) + if (!p->ecmp || nh_is_vlink(&nhs->nh) || (nhs == en->nhs)) return; /* Merge old and new */ - int new_reuse = (par->nhs != nhs); - en->nhs = nexthop_merge(en->nhs, nhs, en->nhs_reuse, new_reuse, p->ecmp, p->nhpool); - en->nhs_reuse = 1; + en->nhs = nexthop_merge(en->nhs, nhs, p->ecmp, p->nhpool); return; } @@ -1967,7 +1944,6 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry en->nhs = nhs; en->dist = dist; en->color = CANDIDATE; - en->nhs_reuse = (par->nhs != nhs); prev = NULL; @@ -2001,14 +1977,34 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry } static inline int -ort_changed(ort *nf, rta *nr) +ort_changed(ort *nf, ea_list *nr) { - rta *or = nf->old_rta; - return !or || + ea_list *or = nf->old_ea; + + if (!or || (nf->n.metric1 != nf->old_metric1) || (nf->n.metric2 != nf->old_metric2) || - (nf->n.tag != nf->old_tag) || (nf->n.rid != nf->old_rid) || - (nr->source != or->source) || (nr->dest != or->dest) || - !nexthop_same(&(nr->nh), &(or->nh)); + (nf->n.tag != nf->old_tag) || (nf->n.rid != nf->old_rid)) + return 1; + + eattr *nhea_n = ea_find(nr, &ea_gen_nexthop); + eattr *nhea_o = ea_find(or, &ea_gen_nexthop); + if (!nhea_n != !nhea_o) + return 1; + + if (nhea_n && nhea_o) + { + struct nexthop_adata *nhad_n = (struct nexthop_adata *) nhea_n->u.ptr; + struct nexthop_adata *nhad_o = (struct nexthop_adata *) nhea_o->u.ptr; + + if (!nexthop_same(nhad_n, nhad_o)) + return 1; + } + + if ( ea_get_int(nr, &ea_gen_source, 0) + != ea_get_int(or, &ea_gen_source, 0)) + return 1; + + return 0; } static void @@ -2030,10 +2026,9 @@ again1: FIB_ITERATE_START(fib, &fit, ort, nf) { /* Sanity check of next-hop addresses, failure should not happen */ - if (nf->n.type) + if (nf->n.type && nf->n.nhs) { - struct nexthop *nh; - for (nh = nf->n.nhs; nh; nh = nh->next) + NEXTHOP_WALK(nh, nf->n.nhs) if (ipa_nonzero(nh->gw)) { neighbor *nbr = neigh_find(&p->p, nh->gw, nh->iface, @@ -2052,68 +2047,69 @@ again1: if (nf->n.type) /* Add the route */ { - rta a0 = { - .source = nf->n.type, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, - .nh = *(nf->n.nhs), - .pref = p->p.main_channel->preference, - }; - - if (reload || ort_changed(nf, &a0)) - { - a0.eattrs = alloca(sizeof(ea_list) + 4 * sizeof(eattr)); - memset(a0.eattrs, 0, sizeof(ea_list)); + struct { + ea_list l; + eattr a[7]; + } eattrs; + + eattrs.l = (ea_list) {}; + + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_gen_preference, 0, p->p.main_channel->preference); + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_gen_source, 0, nf->n.type); + + eattrs.a[eattrs.l.count++] = + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nf->n.nhs->ad); + + if (reload || ort_changed(nf, &eattrs.l)) + { nf->old_metric1 = nf->n.metric1; nf->old_metric2 = nf->n.metric2; nf->old_tag = nf->n.tag; nf->old_rid = nf->n.rid; - a0.eattrs->attrs[a0.eattrs->count++] = (eattr) { - .id = EA_OSPF_METRIC1, - .type = EAF_TYPE_INT, - .u.data = nf->n.metric1, - }; + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_ospf_metric1, 0, nf->n.metric1); if (nf->n.type == RTS_OSPF_EXT2) - a0.eattrs->attrs[a0.eattrs->count++] = (eattr) { - .id = EA_OSPF_METRIC2, - .type = EAF_TYPE_INT, - .u.data = nf->n.metric2, - }; + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_ospf_metric2, 0, nf->n.metric2); if ((nf->n.type == RTS_OSPF_EXT1) || (nf->n.type == RTS_OSPF_EXT2)) - a0.eattrs->attrs[a0.eattrs->count++] = (eattr) { - .id = EA_OSPF_TAG, - .type = EAF_TYPE_INT, - .u.data = nf->n.tag, - }; - - a0.eattrs->attrs[a0.eattrs->count++] = (eattr) { - .id = EA_OSPF_ROUTER_ID, - .type = EAF_TYPE_ROUTER_ID, - .u.data = nf->n.rid, - }; + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_ospf_tag, 0, nf->n.tag); + + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_ospf_router_id, 0, nf->n.rid); - rta *a = rta_lookup(&a0); - rte *e = rte_get_temp(a, p->p.main_source); + ASSERT_DIE(ARRAY_SIZE(eattrs.a) >= eattrs.l.count); - rta_free(nf->old_rta); - nf->old_rta = rta_clone(a); + ea_list *eal = ea_lookup(&eattrs.l, 0); + ea_free(nf->old_ea); + nf->old_ea = eal; + rte e0 = { + .attrs = eal, + .src = p->p.main_source, + }; + + /* DBG("Mod rte type %d - %N via %I on iface %s, met %d\n", a0.source, nf->fn.addr, a0.gw, a0.iface ? a0.iface->name : "(none)", nf->n.metric1); - rte_update(&p->p, nf->fn.addr, e); + */ + + rte_update(p->p.main_channel, nf->fn.addr, &e0, p->p.main_source); } } - else if (nf->old_rta) + else if (nf->old_ea) { /* Remove the route */ - rta_free(nf->old_rta); - nf->old_rta = NULL; + rta_free(nf->old_ea); + nf->old_ea = NULL; - rte_update(&p->p, nf->fn.addr, NULL); + rte_update(p->p.main_channel, nf->fn.addr, NULL, p->p.main_source); } /* Remove unused rt entry, some special entries are persistent */ @@ -2129,7 +2125,6 @@ again1: } FIB_ITERATE_END; - WALK_LIST(oa, p->area_list) { /* Cleanup ASBR hash tables */ diff --git a/proto/ospf/rt.h b/proto/ospf/rt.h index 094e125b..88eefef9 100644 --- a/proto/ospf/rt.h +++ b/proto/ospf/rt.h @@ -18,8 +18,6 @@ typedef struct orta { u8 type; /* RTS_OSPF_* */ - u8 nhs_reuse; /* Whether nhs nodes can be reused during merging. - See a note in rt.c:add_cand() */ u32 options; /* * For ORT_ROUTER routes, options field are router-LSA style @@ -53,7 +51,7 @@ typedef struct orta struct ospf_area *oa; struct ospf_area *voa; /* Used when route is replaced in ospf_rt_sum_tr(), NULL otherwise */ - struct nexthop *nhs; /* Next hops computed during SPF */ + struct nexthop_adata *nhs; /* Next hops computed during SPF */ struct top_hash_entry *en; /* LSA responsible for this orta */ } orta; @@ -80,7 +78,7 @@ typedef struct ort */ orta n; u32 old_metric1, old_metric2, old_tag, old_rid; - rta *old_rta; + ea_list *old_ea; u32 lsa_id; u8 external_rte; u8 area_net; diff --git a/proto/ospf/topology.c b/proto/ospf/topology.c index 9fe68264..85bce03d 100644 --- a/proto/ospf/topology.c +++ b/proto/ospf/topology.c @@ -1300,7 +1300,7 @@ find_surrogate_fwaddr(struct ospf_proto *p, struct ospf_area *oa) } void -ospf_rt_notify(struct proto *P, struct channel *ch UNUSED, net *n, rte *new, rte *old UNUSED) +ospf_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *n, rte *new, const rte *old UNUSED) { struct ospf_proto *p = (struct ospf_proto *) P; struct ospf_area *oa = NULL; /* non-NULL for NSSA-LSA */ @@ -1319,7 +1319,7 @@ ospf_rt_notify(struct proto *P, struct channel *ch UNUSED, net *n, rte *new, rte if (!new) { - nf = fib_find(&p->rtf, n->n.addr); + nf = fib_find(&p->rtf, n); if (!nf || !nf->external_rte) return; @@ -1337,23 +1337,23 @@ ospf_rt_notify(struct proto *P, struct channel *ch UNUSED, net *n, rte *new, rte ASSERT(p->asbr); /* Get route attributes */ - rta *a = new->attrs; - eattr *m1a = ea_find(a->eattrs, EA_OSPF_METRIC1); - eattr *m2a = ea_find(a->eattrs, EA_OSPF_METRIC2); + ea_list *a = new->attrs; + eattr *m1a = ea_find(a, &ea_ospf_metric1); + eattr *m2a = ea_find(a, &ea_ospf_metric2); uint m1 = m1a ? m1a->u.data : 0; uint m2 = m2a ? m2a->u.data : 10000; if (m1 > LSINFINITY) { log(L_WARN "%s: Invalid ospf_metric1 value %u for route %N", - p->p.name, m1, n->n.addr); + p->p.name, m1, n); m1 = LSINFINITY; } if (m2 > LSINFINITY) { log(L_WARN "%s: Invalid ospf_metric2 value %u for route %N", - p->p.name, m2, n->n.addr); + p->p.name, m2, n); m2 = LSINFINITY; } @@ -1363,11 +1363,14 @@ ospf_rt_notify(struct proto *P, struct channel *ch UNUSED, net *n, rte *new, rte uint ebit = m2a || !m1a; uint metric = ebit ? m2 : m1; - uint tag = ea_get_int(a->eattrs, EA_OSPF_TAG, 0); + uint tag = ea_get_int(a, &ea_ospf_tag, 0); ip_addr fwd = IPA_NONE; - if ((a->dest == RTD_UNICAST) && use_gw_for_fwaddr(p, a->nh.gw, a->nh.iface)) - fwd = a->nh.gw; + eattr *nhea = ea_find(a, &ea_gen_nexthop); + struct nexthop_adata *nhad = (struct nexthop_adata *) nhea->u.ptr; + if (NEXTHOP_IS_REACHABLE(nhad)) + if (use_gw_for_fwaddr(p, nhad->nh.gw, nhad->nh.iface)) + fwd = nhad->nh.gw; /* NSSA-LSA with P-bit set must have non-zero forwarding address */ if (oa && ipa_zero(fwd)) @@ -1377,12 +1380,12 @@ ospf_rt_notify(struct proto *P, struct channel *ch UNUSED, net *n, rte *new, rte if (ipa_zero(fwd)) { log(L_ERR "%s: Cannot find forwarding address for NSSA-LSA %N", - p->p.name, n->n.addr); + p->p.name, n); return; } } - nf = fib_get(&p->rtf, n->n.addr); + nf = fib_get(&p->rtf, n); ospf_originate_ext_lsa(p, oa, nf, LSA_M_EXPORT, metric, ebit, fwd, tag, 1, p->vpn_pe); nf->external_rte = 1; } diff --git a/proto/ospf/topology.h b/proto/ospf/topology.h index 535d1f1b..3c92b431 100644 --- a/proto/ospf/topology.h +++ b/proto/ospf/topology.h @@ -28,7 +28,7 @@ struct top_hash_entry u16 next_lsa_opts; /* For postponed LSA origination */ btime inst_time; /* Time of installation into DB */ struct ort *nf; /* Reference fibnode for sum and ext LSAs, NULL for otherwise */ - struct nexthop *nhs; /* Computed nexthops - valid only in ospf_rt_spf() */ + struct nexthop_adata *nhs; /* Computed nexthops - valid only in ospf_rt_spf() */ ip_addr lb; /* In OSPFv2, link back address. In OSPFv3, any global address in the area useful for vlinks */ u32 lb_id; /* Interface ID of link back iface (for bcast or NBMA networks) */ u32 dist; /* Distance from the root */ @@ -39,8 +39,6 @@ struct top_hash_entry #define CANDIDATE 1 #define INSPF 2 u8 mode; /* LSA generated during RT calculation (LSA_RTCALC or LSA_STALE)*/ - u8 nhs_reuse; /* Whether nhs nodes can be reused during merging. - See a note in rt.c:add_cand() */ }; @@ -200,7 +198,7 @@ void ospf_originate_sum_rt_lsa(struct ospf_proto *p, struct ospf_area *oa, u32 d void ospf_originate_ext_lsa(struct ospf_proto *p, struct ospf_area *oa, ort *nf, u8 mode, u32 metric, u32 ebit, ip_addr fwaddr, u32 tag, int pbit, int dn); void ospf_originate_gr_lsa(struct ospf_proto *p, struct ospf_iface *ifa); -void ospf_rt_notify(struct proto *P, struct channel *ch, net *n, rte *new, rte *old); +void ospf_rt_notify(struct proto *P, struct channel *ch, const net_addr *n, rte *new, const rte *old); void ospf_update_topology(struct ospf_proto *p); struct top_hash_entry *ospf_hash_find(struct top_graph *, u32 domain, u32 lsa, u32 rtr, u32 type); diff --git a/proto/perf/perf.c b/proto/perf/perf.c index 75e405f0..dc5bbf2f 100644 --- a/proto/perf/perf.c +++ b/proto/perf/perf.c @@ -18,7 +18,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "conf/conf.h" #include "filter/filter.h" @@ -85,7 +85,7 @@ random_net_ip4(void) } struct perf_random_routes { - struct rta *a; + ea_list *a; net_addr net; }; @@ -142,17 +142,21 @@ perf_loop(void *data) *((net_addr_ip4 *) &(p->data[i].net)) = random_net_ip4(); if (!p->attrs_per_rte || !(i % p->attrs_per_rte)) { - struct rta a0 = { - .source = RTS_PERF, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, - .pref = p->p.main_channel->preference, + ea_list *ea = NULL; + + ea_set_attr_u32(&ea, &ea_gen_preference, 0, p->p.main_channel->preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_PERF); + + struct nexthop_adata nhad = { .nh.iface = p->ifa->iface, .nh.gw = gw, .nh.weight = 1, }; - p->data[i].a = rta_lookup(&a0); + ea_set_attr_data(&ea, &ea_gen_nexthop, 0, + &nhad.ad.data, sizeof nhad - sizeof nhad.ad); + + p->data[i].a = rta_lookup(ea, 0); } else p->data[i].a = rta_clone(p->data[i-1].a); @@ -160,17 +164,17 @@ perf_loop(void *data) clock_gettime(CLOCK_MONOTONIC, &ts_generated); - for (uint i=0; i<N; i++) { - rte *e = rte_get_temp(p->data[i].a, p->p.main_source); - - rte_update(P, &(p->data[i].net), e); + for (uint i=0; i<N; i++) + { + rte e0 = { .attrs = p->data[i].a, .src = P->main_source, }; + rte_update(P->main_channel, &(p->data[i].net), &e0, P->main_source); } clock_gettime(CLOCK_MONOTONIC, &ts_update); if (!p->keep) for (uint i=0; i<N; i++) - rte_update(P, &(p->data[i].net), NULL); + rte_update(P->main_channel, &(p->data[i].net), NULL, P->main_source); clock_gettime(CLOCK_MONOTONIC, &ts_withdraw); @@ -198,12 +202,14 @@ perf_loop(void *data) p->exp++; } - rt_schedule_prune(P->main_channel->table); + RT_LOCKED(P->main_channel->table, tab) + rt_schedule_prune(tab); + ev_schedule(p->loop); } static void -perf_rt_notify(struct proto *P, struct channel *c UNUSED, struct network *net UNUSED, struct rte *new UNUSED, struct rte *old UNUSED) +perf_rt_notify(struct proto *P, struct channel *c UNUSED, const net_addr *net UNUSED, struct rte *new UNUSED, const struct rte *old UNUSED) { struct perf_proto *p = (struct perf_proto *) P; p->exp++; @@ -266,7 +272,7 @@ perf_init(struct proto_config *CF) switch (p->mode) { case PERF_MODE_IMPORT: - P->ifa_notify = perf_ifa_notify; + P->iface_sub.ifa_notify = perf_ifa_notify; break; case PERF_MODE_EXPORT: P->rt_notify = perf_rt_notify; @@ -305,7 +311,6 @@ perf_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUS struct protocol proto_perf = { .name = "Perf", .template = "perf%d", - .class = PROTOCOL_PERF, .channel_mask = NB_IP, .proto_size = sizeof(struct perf_proto), .config_size = sizeof(struct perf_config), diff --git a/proto/pipe/config.Y b/proto/pipe/config.Y index 1202c169..444de127 100644 --- a/proto/pipe/config.Y +++ b/proto/pipe/config.Y @@ -16,7 +16,7 @@ CF_DEFINES CF_DECLS -CF_KEYWORDS(PIPE, PEER, TABLE) +CF_KEYWORDS(PIPE, PEER, TABLE, MAX, GENERATION) CF_GRAMMAR @@ -25,6 +25,8 @@ proto: pipe_proto '}' { this_channel = NULL; } ; pipe_proto_start: proto_start PIPE { this_proto = proto_config_new(&proto_pipe, $1); + this_proto->loop_order = DOMAIN_ORDER(proto); + PIPE_CFG->max_generation = 16; } proto_name { @@ -40,7 +42,17 @@ pipe_proto: pipe_proto_start '{' | pipe_proto proto_item ';' | pipe_proto channel_item_ ';' + | pipe_proto IMPORT IN net_any imexport ';' { + if (this_channel->net_type && ($4->type != this_channel->net_type)) + cf_error("Incompatible export prefilter type"); + PIPE_CFG->in_subprefix = $4; + this_channel->in_filter = $5; + } | pipe_proto PEER TABLE rtable ';' { PIPE_CFG->peer = $4; } + | pipe_proto MAX GENERATION expr ';' { + if (($4 < 1) || ($4 > 254)) cf_error("Max generation must be in range 1..254, got %u", $4); + PIPE_CFG->max_generation = $4; + } ; CF_CODE diff --git a/proto/pipe/pipe.c b/proto/pipe/pipe.c index d48ce387..b2083010 100644 --- a/proto/pipe/pipe.c +++ b/proto/pipe/pipe.c @@ -35,7 +35,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "conf/conf.h" #include "filter/filter.h" @@ -44,54 +44,54 @@ #include "pipe.h" static void -pipe_rt_notify(struct proto *P, struct channel *src_ch, net *n, rte *new, rte *old) +pipe_rt_notify(struct proto *P, struct channel *src_ch, const net_addr *n, rte *new, const rte *old) { struct pipe_proto *p = (void *) P; struct channel *dst = (src_ch == p->pri) ? p->sec : p->pri; - struct rte_src *src; - - rte *e; - rta *a; + uint *flags = (src_ch == p->pri) ? &p->sec_flags : &p->pri_flags; if (!new && !old) return; - if (dst->table->pipe_busy) - { - log(L_ERR "Pipe loop detected when sending %N to table %s", - n->n.addr, dst->table->name); - return; - } + /* Start the route refresh if requested to */ + if (*flags & PIPE_FL_RR_BEGIN_PENDING) + { + *flags &= ~PIPE_FL_RR_BEGIN_PENDING; + rt_refresh_begin(&dst->in_req); + } if (new) { - src = new->src; + rte e0 = rte_init_from(new); - a = alloca(rta_size(new->attrs)); - memcpy(a, new->attrs, rta_size(new->attrs)); + e0.generation = new->generation + 1; + ea_unset_attr(&e0.attrs, 0, &ea_gen_hostentry); - a->cached = 0; - a->hostentry = NULL; - e = rte_get_temp(a, src); + rte_update(dst, n, &e0, new->src); } else - { - e = NULL; - src = old->src; - } - - src_ch->table->pipe_busy = 1; - rte_update2(dst, n->n.addr, e, src); - src_ch->table->pipe_busy = 0; + rte_update(dst, n, NULL, old->src); } static int pipe_preexport(struct channel *C, rte *e) { - struct proto *pp = e->sender->proto; + struct pipe_proto *p = (void *) C->proto; - if (pp == C->proto) - return -1; /* Avoid local loops automatically */ + /* Avoid direct loopbacks */ + if (e->sender == C->in_req.hook) + return -1; + + /* Indirection check */ + uint max_generation = ((struct pipe_config *) p->p.cf)->max_generation; + if (e->generation >= max_generation) + { + log_rl(&p->rl_gen, L_ERR "Route overpiped (%u hops of %u configured in %s) in table %s: %N %s/%u:%u", + e->generation, max_generation, C->proto->name, + C->table->name, e->net, e->src->owner->name, e->src->private_id, e->src->global_id); + + return -1; + } return 0; } @@ -105,6 +105,32 @@ pipe_reload_routes(struct channel *C) channel_request_feeding((C == p->pri) ? p->sec : p->pri); } +static void +pipe_feed_begin(struct channel *C, int initial UNUSED) +{ + struct pipe_proto *p = (void *) C->proto; + uint *flags = (C == p->pri) ? &p->sec_flags : &p->pri_flags; + + *flags |= PIPE_FL_RR_BEGIN_PENDING; +} + +static void +pipe_feed_end(struct channel *C) +{ + struct pipe_proto *p = (void *) C->proto; + struct channel *dst = (C == p->pri) ? p->sec : p->pri; + uint *flags = (C == p->pri) ? &p->sec_flags : &p->pri_flags; + + /* If not even started, start the RR now */ + if (*flags & PIPE_FL_RR_BEGIN_PENDING) + { + *flags &= ~PIPE_FL_RR_BEGIN_PENDING; + rt_refresh_begin(&dst->in_req); + } + + /* Finish RR always */ + rt_refresh_end(&dst->in_req); +} static void pipe_postconfig(struct proto_config *CF) @@ -124,10 +150,16 @@ pipe_postconfig(struct proto_config *CF) if (cc->table->addr_type != cf->peer->addr_type) cf_error("Primary table and peer table must have the same type"); + if (cc->out_subprefix && (cc->table->addr_type != cc->out_subprefix->type)) + cf_error("Export subprefix must match table type"); + + if (cf->in_subprefix && (cc->table->addr_type != cf->in_subprefix->type)) + cf_error("Import subprefix must match table type"); + if (cc->rx_limit.action) cf_error("Pipe protocol does not support receive limits"); - if (cc->in_keep_filtered) + if (cc->in_keep) cf_error("Pipe protocol prohibits keeping filtered routes"); cc->debug = cf->c.debug; @@ -143,6 +175,7 @@ pipe_configure_channels(struct pipe_proto *p, struct pipe_config *cf) .channel = cc->channel, .table = cc->table, .out_filter = cc->out_filter, + .out_subprefix = cc->out_subprefix, .in_limit = cc->in_limit, .ra_mode = RA_ANY, .debug = cc->debug, @@ -154,6 +187,7 @@ pipe_configure_channels(struct pipe_proto *p, struct pipe_config *cf) .channel = cc->channel, .table = cf->peer, .out_filter = cc->in_filter, + .out_subprefix = cf->in_subprefix, .in_limit = cc->out_limit, .ra_mode = RA_ANY, .debug = cc->debug, @@ -175,6 +209,10 @@ pipe_init(struct proto_config *CF) P->rt_notify = pipe_rt_notify; P->preexport = pipe_preexport; P->reload_routes = pipe_reload_routes; + P->feed_begin = pipe_feed_begin; + P->feed_end = pipe_feed_end; + + p->rl_gen = (struct tbf) TBF_DEFAULT_LOG_LIMITS; pipe_configure_channels(p, cf); @@ -207,8 +245,18 @@ pipe_get_status(struct proto *P, byte *buf) static void pipe_show_stats(struct pipe_proto *p) { - struct proto_stats *s1 = &p->pri->stats; - struct proto_stats *s2 = &p->sec->stats; + struct channel_import_stats *s1i = &p->pri->import_stats; + struct channel_export_stats *s1e = &p->pri->export_stats; + struct channel_import_stats *s2i = &p->sec->import_stats; + struct channel_export_stats *s2e = &p->sec->export_stats; + + struct rt_import_stats *rs1i = p->pri->in_req.hook ? &p->pri->in_req.hook->stats : NULL; + struct rt_export_stats *rs1e = p->pri->out_req.hook ? &p->pri->out_req.hook->stats : NULL; + struct rt_import_stats *rs2i = p->sec->in_req.hook ? &p->sec->in_req.hook->stats : NULL; + struct rt_export_stats *rs2e = p->sec->out_req.hook ? &p->sec->out_req.hook->stats : NULL; + + u32 pri_routes = p->pri->in_limit.count; + u32 sec_routes = p->sec->in_limit.count; /* * Pipe stats (as anything related to pipes) are a bit tricky. There @@ -232,24 +280,22 @@ pipe_show_stats(struct pipe_proto *p) */ cli_msg(-1006, " Routes: %u imported, %u exported", - s1->imp_routes, s2->imp_routes); + pri_routes, sec_routes); cli_msg(-1006, " Route change stats: received rejected filtered ignored accepted"); cli_msg(-1006, " Import updates: %10u %10u %10u %10u %10u", - s2->exp_updates_received, s2->exp_updates_rejected + s1->imp_updates_invalid, - s2->exp_updates_filtered, s1->imp_updates_ignored, s1->imp_updates_accepted); + rs2e->updates_received, s2e->updates_rejected + s1i->updates_invalid, + s2e->updates_filtered, rs1i->updates_ignored, rs1i->updates_accepted); cli_msg(-1006, " Import withdraws: %10u %10u --- %10u %10u", - s2->exp_withdraws_received, s1->imp_withdraws_invalid, - s1->imp_withdraws_ignored, s1->imp_withdraws_accepted); + rs2e->withdraws_received, s1i->withdraws_invalid, + rs1i->withdraws_ignored, rs1i->withdraws_accepted); cli_msg(-1006, " Export updates: %10u %10u %10u %10u %10u", - s1->exp_updates_received, s1->exp_updates_rejected + s2->imp_updates_invalid, - s1->exp_updates_filtered, s2->imp_updates_ignored, s2->imp_updates_accepted); + rs1e->updates_received, s1e->updates_rejected + s2i->updates_invalid, + s1e->updates_filtered, rs2i->updates_ignored, rs2i->updates_accepted); cli_msg(-1006, " Export withdraws: %10u %10u --- %10u %10u", - s1->exp_withdraws_received, s2->imp_withdraws_invalid, - s2->imp_withdraws_ignored, s2->imp_withdraws_accepted); + rs1e->withdraws_received, s2i->withdraws_invalid, + rs2i->withdraws_ignored, rs2i->withdraws_accepted); } -static const char *pipe_feed_state[] = { [ES_DOWN] = "down", [ES_FEEDING] = "feed", [ES_READY] = "up" }; - static void pipe_show_proto_info(struct proto *P) { @@ -258,13 +304,17 @@ pipe_show_proto_info(struct proto *P) cli_msg(-1006, " Channel %s", "main"); cli_msg(-1006, " Table: %s", p->pri->table->name); cli_msg(-1006, " Peer table: %s", p->sec->table->name); - cli_msg(-1006, " Import state: %s", pipe_feed_state[p->sec->export_state]); - cli_msg(-1006, " Export state: %s", pipe_feed_state[p->pri->export_state]); + cli_msg(-1006, " Import state: %s", rt_export_state_name(rt_export_get_state(p->sec->out_req.hook))); + cli_msg(-1006, " Export state: %s", rt_export_state_name(rt_export_get_state(p->pri->out_req.hook))); cli_msg(-1006, " Import filter: %s", filter_name(p->sec->out_filter)); cli_msg(-1006, " Export filter: %s", filter_name(p->pri->out_filter)); - channel_show_limit(&p->pri->in_limit, "Import limit:"); - channel_show_limit(&p->sec->in_limit, "Export limit:"); + + + channel_show_limit(&p->pri->in_limit, "Import limit:", + (p->pri->limit_active & (1 << PLD_IN)), p->pri->limit_actions[PLD_IN]); + channel_show_limit(&p->sec->in_limit, "Export limit:", + (p->sec->limit_active & (1 << PLD_IN)), p->sec->limit_actions[PLD_IN]); if (P->proto_state != PS_DOWN) pipe_show_stats(p); @@ -282,7 +332,6 @@ pipe_update_debug(struct proto *P) struct protocol proto_pipe = { .name = "Pipe", .template = "pipe%d", - .class = PROTOCOL_PIPE, .proto_size = sizeof(struct pipe_proto), .config_size = sizeof(struct pipe_config), .postconfig = pipe_postconfig, diff --git a/proto/pipe/pipe.h b/proto/pipe/pipe.h index 038c6666..501b8565 100644 --- a/proto/pipe/pipe.h +++ b/proto/pipe/pipe.h @@ -12,12 +12,19 @@ struct pipe_config { struct proto_config c; struct rtable_config *peer; /* Table we're connected to */ + const net_addr *in_subprefix; + u8 max_generation; }; struct pipe_proto { struct proto p; struct channel *pri; struct channel *sec; + uint pri_flags; + uint sec_flags; + struct tbf rl_gen; }; +#define PIPE_FL_RR_BEGIN_PENDING 1 /* Route refresh should start with the first route notified */ + #endif diff --git a/proto/radv/config.Y b/proto/radv/config.Y index 8d4a3ab9..fb68d2e5 100644 --- a/proto/radv/config.Y +++ b/proto/radv/config.Y @@ -33,7 +33,7 @@ CF_KEYWORDS(RADV, PREFIX, INTERFACE, MIN, MAX, RA, DELAY, INTERVAL, SOLICITED, RETRANS, TIMER, CURRENT, HOP, LIMIT, DEFAULT, VALID, PREFERRED, MULT, LIFETIME, SKIP, ONLINK, AUTONOMOUS, RDNSS, DNSSL, NS, DOMAIN, LOCAL, TRIGGER, SENSITIVE, PREFERENCE, LOW, MEDIUM, HIGH, PROPAGATE, ROUTE, - ROUTES, RA_PREFERENCE, RA_LIFETIME) + ROUTES) CF_ENUM(T_ENUM_RA_PREFERENCE, RA_PREF_, LOW, MEDIUM, HIGH) @@ -336,9 +336,6 @@ radv_sensitive: | SENSITIVE bool { $$ = $2; } ; -dynamic_attr: RA_PREFERENCE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_ENUM_RA_PREFERENCE, EA_RA_PREFERENCE); } ; -dynamic_attr: RA_LIFETIME { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_RA_LIFETIME); } ; - CF_CODE CF_END diff --git a/proto/radv/packets.c b/proto/radv/packets.c index 5cd8b2de..c6b565d2 100644 --- a/proto/radv/packets.c +++ b/proto/radv/packets.c @@ -493,7 +493,7 @@ radv_sk_open(struct radv_iface *ifa) sk->data = ifa; sk->flags = SKF_LADDR_RX; - if (sk_open(sk) < 0) + if (sk_open(sk, ifa->ra->p.loop) < 0) goto err; /* We want listen just to ICMPv6 messages of type RS and RA */ diff --git a/proto/radv/radv.c b/proto/radv/radv.c index 119a8dc4..434155dc 100644 --- a/proto/radv/radv.c +++ b/proto/radv/radv.c @@ -10,6 +10,7 @@ #include <stdlib.h> #include "radv.h" +#include "lib/macro.h" /** * DOC: Router Advertisements @@ -42,6 +43,8 @@ * RFC 6106 - DNS extensions (RDDNS, DNSSL) */ +static struct ea_class ea_radv_preference, ea_radv_lifetime; + static void radv_prune_prefixes(struct radv_iface *ifa); static void radv_prune_routes(struct radv_proto *p); @@ -263,9 +266,9 @@ radv_iface_find(struct radv_proto *p, struct iface *what) } static void -radv_iface_add(struct object_lock *lock) +radv_iface_add(void *_ifa) { - struct radv_iface *ifa = lock->data; + struct radv_iface *ifa = _ifa; struct radv_proto *p = ifa->ra; if (! radv_sk_open(ifa)) @@ -302,8 +305,11 @@ radv_iface_new(struct radv_proto *p, struct iface *iface, struct radv_iface_conf lock->type = OBJLOCK_IP; lock->port = ICMPV6_PROTO; lock->iface = iface; - lock->data = ifa; - lock->hook = radv_iface_add; + lock->event = (event) { + .hook = radv_iface_add, + .data = ifa, + }; + lock->target = &global_event_list; ifa->lock = lock; olock_acquire(lock); @@ -385,9 +391,9 @@ radv_trigger_valid(struct radv_config *cf) } static inline int -radv_net_match_trigger(struct radv_config *cf, net *n) +radv_net_match_trigger(struct radv_config *cf, const net_addr *n) { - return radv_trigger_valid(cf) && net_equal(n->n.addr, &cf->trigger); + return radv_trigger_valid(cf) && net_equal(n, &cf->trigger); } int @@ -406,7 +412,7 @@ radv_preexport(struct channel *C, rte *new) } static void -radv_rt_notify(struct proto *P, struct channel *ch UNUSED, net *n, rte *new, rte *old UNUSED) +radv_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *n, rte *new, const rte *old UNUSED) { struct radv_proto *p = (struct radv_proto *) P; struct radv_config *cf = (struct radv_config *) (P->cf); @@ -444,11 +450,11 @@ radv_rt_notify(struct proto *P, struct channel *ch UNUSED, net *n, rte *new, rte { /* Update */ - ea = ea_find(new->attrs->eattrs, EA_RA_PREFERENCE); + ea = ea_find(new->attrs, &ea_radv_preference); uint preference = ea ? ea->u.data : RA_PREF_MEDIUM; uint preference_set = !!ea; - ea = ea_find(new->attrs->eattrs, EA_RA_LIFETIME); + ea = ea_find(new->attrs, &ea_radv_lifetime); uint lifetime = ea ? ea->u.data : 0; uint lifetime_set = !!ea; @@ -457,14 +463,14 @@ radv_rt_notify(struct proto *P, struct channel *ch UNUSED, net *n, rte *new, rte (preference != RA_PREF_HIGH)) { log(L_WARN "%s: Invalid ra_preference value %u on route %N", - p->p.name, preference, n->n.addr); + p->p.name, preference, n); preference = RA_PREF_MEDIUM; preference_set = 1; lifetime = 0; lifetime_set = 1; } - rt = fib_get(&p->routes, n->n.addr); + rt = fib_get(&p->routes, n); /* Ignore update if nothing changed */ if (rt->valid && @@ -487,7 +493,7 @@ radv_rt_notify(struct proto *P, struct channel *ch UNUSED, net *n, rte *new, rte else { /* Withdraw */ - rt = fib_find(&p->routes, n->n.addr); + rt = fib_find(&p->routes, n); if (!rt || !rt->valid) return; @@ -577,8 +583,8 @@ radv_init(struct proto_config *CF) P->preexport = radv_preexport; P->rt_notify = radv_rt_notify; - P->if_notify = radv_if_notify; - P->ifa_notify = radv_ifa_notify; + P->iface_sub.if_notify = radv_if_notify; + P->iface_sub.ifa_notify = radv_ifa_notify; return P; } @@ -660,10 +666,9 @@ radv_reconfigure(struct proto *P, struct proto_config *CF) if (!old->propagate_routes && new->propagate_routes) channel_request_feeding(p->p.main_channel); - struct iface *iface; - WALK_LIST(iface, iface_list) + IFACE_WALK(iface) { - if (p->p.vrf_set && p->p.vrf != iface->master) + if (p->p.vrf && p->p.vrf != iface->master) continue; if (!(iface->flags & IF_UP)) @@ -741,27 +746,26 @@ radv_pref_str(u32 pref) } } -/* The buffer has some minimal size */ -static int -radv_get_attr(const eattr *a, byte *buf, int buflen UNUSED) +static void +radv_preference_format(const eattr *a, byte *buf, uint buflen) { - switch (a->id) - { - case EA_RA_PREFERENCE: - bsprintf(buf, "preference: %s", radv_pref_str(a->u.data)); - return GA_FULL; - case EA_RA_LIFETIME: - bsprintf(buf, "lifetime"); - return GA_NAME; - default: - return GA_UNKNOWN; - } + bsnprintf(buf, buflen, "%s", radv_pref_str(a->u.data)); } +static struct ea_class ea_radv_preference = { + .name = "radv_preference", + .type = T_ENUM_RA_PREFERENCE, + .format = radv_preference_format, +}; + +static struct ea_class ea_radv_lifetime = { + .name = "radv_lifetime", + .type = T_INT, +}; + struct protocol proto_radv = { .name = "RAdv", .template = "radv%d", - .class = PROTOCOL_RADV, .channel_mask = NB_IP6, .proto_size = sizeof(struct radv_proto), .config_size = sizeof(struct radv_config), @@ -772,11 +776,15 @@ struct protocol proto_radv = { .reconfigure = radv_reconfigure, .copy_config = radv_copy_config, .get_status = radv_get_status, - .get_attr = radv_get_attr }; void radv_build(void) { proto_build(&proto_radv); + + EA_REGISTER_ALL( + &ea_radv_preference, + &ea_radv_lifetime + ); } diff --git a/proto/radv/radv.h b/proto/radv/radv.h index 14d40f8a..c9219bda 100644 --- a/proto/radv/radv.h +++ b/proto/radv/radv.h @@ -19,7 +19,7 @@ #include "lib/resource.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "nest/locks.h" #include "conf/conf.h" @@ -195,10 +195,6 @@ struct radv_iface #define RA_PREF_HIGH 0x08 #define RA_PREF_MASK 0x18 -/* Attributes */ -#define EA_RA_PREFERENCE EA_CODE(PROTOCOL_RADV, 0) -#define EA_RA_LIFETIME EA_CODE(PROTOCOL_RADV, 1) - #ifdef LOCAL_DEBUG #define RADV_FORCE_DEBUG 1 #else diff --git a/proto/rip/config.Y b/proto/rip/config.Y index 28ee9609..3c0973b1 100644 --- a/proto/rip/config.Y +++ b/proto/rip/config.Y @@ -37,7 +37,7 @@ CF_KEYWORDS(RIP, NG, ECMP, LIMIT, WEIGHT, INFINITY, METRIC, UPDATE, TIMEOUT, PASSIVE, VERSION, SPLIT, HORIZON, POISON, REVERSE, CHECK, ZERO, TIME, BFD, AUTHENTICATION, NONE, PLAINTEXT, CRYPTOGRAPHIC, MD5, TTL, SECURITY, RX, TX, BUFFER, LENGTH, PRIORITY, ONLY, LINK, - DEMAND, CIRCUIT, RIP_METRIC, RIP_TAG) + DEMAND, CIRCUIT) %type <i> rip_variant rip_auth @@ -190,9 +190,6 @@ rip_iface: rip_iface_start iface_patt_list_nopx rip_iface_opt_list rip_iface_finish; -dynamic_attr: RIP_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_RIP_METRIC); } ; -dynamic_attr: RIP_TAG { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_RIP_TAG); } ; - CF_CLI_HELP(SHOW RIP, ..., [[Show information about RIP protocol]]); CF_CLI(SHOW RIP INTERFACES, optproto opttext, [<name>] [\"<interface>\"], [[Show information about RIP interfaces]]) diff --git a/proto/rip/packets.c b/proto/rip/packets.c index 9c3bd7a3..fecdf896 100644 --- a/proto/rip/packets.c +++ b/proto/rip/packets.c @@ -1012,7 +1012,7 @@ rip_open_socket(struct rip_iface *ifa) /* sk->rbsize and sk->tbsize are handled in rip_iface_update_buffers() */ - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; if (ifa->cf->mode == RIP_IM_MULTICAST) diff --git a/proto/rip/rip.c b/proto/rip/rip.c index 8c2d5aeb..d15177da 100644 --- a/proto/rip/rip.c +++ b/proto/rip/rip.c @@ -78,6 +78,7 @@ #include <stdlib.h> #include "rip.h" +#include "lib/macro.h" static inline void rip_lock_neighbor(struct rip_neighbor *n); @@ -88,6 +89,7 @@ static inline void rip_iface_kick_timer(struct rip_iface *ifa); static void rip_iface_timer(timer *timer); static void rip_trigger_update(struct rip_proto *p); +static struct ea_class ea_rip_metric, ea_rip_tag, ea_rip_from; /* * RIP routes @@ -149,91 +151,94 @@ rip_announce_rte(struct rip_proto *p, struct rip_entry *en) if (rt) { /* Update */ - rta a0 = { - .pref = p->p.main_channel->preference, - .source = RTS_RIP, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, + struct { + ea_list l; + eattr a[3]; + } ea_block = { + .l.count = ARRAY_SIZE(ea_block.a), + .a = { + EA_LITERAL_EMBEDDED(&ea_gen_preference, 0, p->p.main_channel->preference), + EA_LITERAL_EMBEDDED(&ea_gen_source, 0, RTS_RIP), + EA_LITERAL_EMBEDDED(&ea_rip_metric, 0, rt->metric), + }, }; - u8 rt_metric = rt->metric; + ea_list *ea = &ea_block.l; + u16 rt_tag = rt->tag; + struct iface *rt_from = NULL; if (p->ecmp) { /* ECMP route */ - struct nexthop *nhs = NULL; int num = 0; for (rt = en->routes; rt && (num < p->ecmp); rt = rt->next) + if (rip_valid_rte(rt)) + num++; + + struct nexthop_adata *nhad = (struct nexthop_adata *) tmp_alloc_adata((num+1) * sizeof(struct nexthop)); + struct nexthop *nh = &nhad->nh; + + for (rt = en->routes; rt && (num < p->ecmp); rt = rt->next) { if (!rip_valid_rte(rt)) - continue; + continue; - struct nexthop *nh = allocz(sizeof(struct nexthop)); + *nh = (struct nexthop) { + .gw = rt->next_hop, + .iface = rt->from->ifa->iface, + .weight = rt->from->ifa->cf->ecmp_weight, + }; - nh->gw = rt->next_hop; - nh->iface = rt->from->ifa->iface; - nh->weight = rt->from->ifa->cf->ecmp_weight; + if (!rt_from) + rt_from = rt->from->ifa->iface; - nexthop_insert(&nhs, nh); - num++; + nh = NEXTHOP_NEXT(nh); if (rt->tag != rt_tag) rt_tag = 0; } - a0.nh = *nhs; + nhad->ad.length = ((void *) nh - (void *) nhad->ad.data); + + ea_set_attr(&ea, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, + &(nexthop_sort(nhad, tmp_linpool)->ad))); } else { /* Unipath route */ - a0.from = rt->from->nbr->addr; - a0.nh.gw = rt->next_hop; - a0.nh.iface = rt->from->ifa->iface; + rt_from = rt->from->ifa->iface; + + struct nexthop_adata nhad = { + .nh.gw = rt->next_hop, + .nh.iface = rt->from->ifa->iface, + }; + + ea_set_attr_data(&ea, &ea_gen_nexthop, 0, + &nhad.ad.data, sizeof nhad - sizeof nhad.ad); + ea_set_attr_data(&ea, &ea_gen_from, 0, &rt->from->nbr->addr, sizeof(ip_addr)); } - struct { - ea_list l; - eattr e[3]; - struct rip_iface_adata riad; - } ea_block = { - .l = { .count = 3, }, - .e = { - { - .id = EA_RIP_METRIC, - .type = EAF_TYPE_INT, - .u.data = rt_metric, - }, - { - .id = EA_RIP_TAG, - .type = EAF_TYPE_INT, - .u.data = rt_tag, - }, - { - .id = EA_RIP_FROM, - .type = EAF_TYPE_IFACE, - .u.ptr = &ea_block.riad.ad, - } - }, - .riad = { - .ad = { .length = sizeof(struct rip_iface_adata) - sizeof(struct adata) }, - .iface = a0.nh.iface, - }, - }; + ea_set_attr_u32(&ea, &ea_rip_tag, 0, rt_tag); - a0.eattrs = &ea_block.l; + struct rip_iface_adata riad = { + .ad = { .length = sizeof(struct rip_iface_adata) - sizeof(struct adata) }, + .iface = rt_from, + }; + ea_set_attr(&ea, + EA_LITERAL_DIRECT_ADATA(&ea_rip_from, 0, &riad.ad)); - rta *a = rta_lookup(&a0); - rte *e = rte_get_temp(a, p->p.main_source); + rte e0 = { + .attrs = ea, + .src = p->p.main_source, + }; - rte_update(&p->p, en->n.addr, e); + rte_update(p->p.main_channel, en->n.addr, &e0, p->p.main_source); } else - { - /* Withdraw */ - rte_update(&p->p, en->n.addr, NULL); - } + rte_update(p->p.main_channel, en->n.addr, NULL, p->p.main_source); } /** @@ -328,8 +333,8 @@ rip_withdraw_rte(struct rip_proto *p, net_addr *n, struct rip_neighbor *from) * it into our data structures. */ static void -rip_rt_notify(struct proto *P, struct channel *ch UNUSED, struct network *net, struct rte *new, - struct rte *old UNUSED) +rip_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net, struct rte *new, + const struct rte *old UNUSED) { struct rip_proto *p = (struct rip_proto *) P; struct rip_entry *en; @@ -338,22 +343,22 @@ rip_rt_notify(struct proto *P, struct channel *ch UNUSED, struct network *net, s if (new) { /* Update */ - u32 rt_tag = ea_get_int(new->attrs->eattrs, EA_RIP_TAG, 0); - u32 rt_metric = ea_get_int(new->attrs->eattrs, EA_RIP_METRIC, 1); - const eattr *rie = ea_find(new->attrs->eattrs, EA_RIP_FROM); + u32 rt_tag = ea_get_int(new->attrs, &ea_rip_tag, 0); + u32 rt_metric = ea_get_int(new->attrs, &ea_rip_metric, 1); + const eattr *rie = ea_find(new->attrs, &ea_rip_from); struct iface *rt_from = rie ? ((struct rip_iface_adata *) rie->u.ptr)->iface : NULL; if (rt_metric > p->infinity) { log(L_WARN "%s: Invalid rip_metric value %u for route %N", - p->p.name, rt_metric, net->n.addr); + p->p.name, rt_metric, net); rt_metric = p->infinity; } if (rt_tag > 0xffff) { log(L_WARN "%s: Invalid rip_tag value %u for route %N", - p->p.name, rt_tag, net->n.addr); + p->p.name, rt_tag, net); rt_metric = p->infinity; rt_tag = 0; } @@ -365,21 +370,27 @@ rip_rt_notify(struct proto *P, struct channel *ch UNUSED, struct network *net, s * collection. */ - en = fib_get(&p->rtable, net->n.addr); + en = fib_get(&p->rtable, net); old_metric = en->valid ? en->metric : -1; en->valid = RIP_ENTRY_VALID; en->metric = rt_metric; en->tag = rt_tag; - en->from = (new->src->proto == P) ? rt_from : NULL; - en->iface = new->attrs->nh.iface; - en->next_hop = new->attrs->nh.gw; + en->from = (new->src->owner == &P->sources) ? rt_from : NULL; + + eattr *nhea = ea_find(new->attrs, &ea_gen_nexthop); + if (nhea) + { + struct nexthop_adata *nhad = (struct nexthop_adata *) nhea->u.ptr; + en->iface = nhad->nh.iface; + en->next_hop = nhad->nh.gw; + } } else { /* Withdraw */ - en = fib_find(&p->rtable, net->n.addr); + en = fib_find(&p->rtable, net); if (!en || en->valid != RIP_ENTRY_VALID) return; @@ -532,7 +543,7 @@ rip_update_bfd(struct rip_proto *p, struct rip_neighbor *n) ip_addr saddr = rip_is_v2(p) ? n->ifa->sk->saddr : n->nbr->ifa->ip; n->bfd_req = bfd_request_session(p->p.pool, n->nbr->addr, saddr, n->nbr->iface, p->p.vrf, - rip_bfd_notify, n, NULL); + rip_bfd_notify, n, p->p.loop, NULL); } if (!use_bfd && n->bfd_req) @@ -656,9 +667,9 @@ rip_iface_update_bfd(struct rip_iface *ifa) static void -rip_iface_locked(struct object_lock *lock) +rip_iface_locked(void *_ifa) { - struct rip_iface *ifa = lock->data; + struct rip_iface *ifa = _ifa; struct rip_proto *p = ifa->rip; if (!rip_open_socket(ifa)) @@ -720,8 +731,11 @@ rip_add_iface(struct rip_proto *p, struct iface *iface, struct rip_iface_config lock->type = OBJLOCK_UDP; lock->port = ic->port; lock->iface = iface; - lock->data = ifa; - lock->hook = rip_iface_locked; + lock->event = (event) { + .hook = rip_iface_locked, + .data = ifa, + }; + lock->target = &global_event_list; ifa->lock = lock; olock_acquire(lock); @@ -793,11 +807,9 @@ rip_reconfigure_iface(struct rip_proto *p, struct rip_iface *ifa, struct rip_ifa static void rip_reconfigure_ifaces(struct rip_proto *p, struct rip_config *cf) { - struct iface *iface; - - WALK_LIST(iface, iface_list) + IFACE_WALK(iface) { - if (p->p.vrf_set && p->p.vrf != iface->master) + if (p->p.vrf && p->p.vrf != iface->master) continue; if (!(iface->flags & IF_UP)) @@ -1104,14 +1116,23 @@ rip_reload_routes(struct channel *C) rip_kick_timer(p); } +static struct rte_owner_class rip_rte_owner_class; + +static inline struct rip_proto * +rip_rte_proto(const rte *rte) +{ + return (rte->src->owner->class == &rip_rte_owner_class) ? + SKIP_BACK(struct rip_proto, p.sources, rte->src->owner) : NULL; +} + static u32 -rip_rte_igp_metric(struct rte *rt) +rip_rte_igp_metric(const rte *rt) { - return ea_get_int(rt->attrs->eattrs, EA_RIP_METRIC, IGP_METRIC_UNKNOWN); + return ea_get_int(rt->attrs, &ea_rip_metric, IGP_METRIC_UNKNOWN); } static int -rip_rte_better(struct rte *new, struct rte *old) +rip_rte_better(const rte *new, const rte *old) { return rip_rte_igp_metric(new) < rip_rte_igp_metric(old); } @@ -1133,12 +1154,11 @@ rip_init(struct proto_config *CF) P->main_channel = proto_add_channel(P, proto_cf_main_channel(CF)); - P->if_notify = rip_if_notify; + P->iface_sub.if_notify = rip_if_notify; P->rt_notify = rip_rt_notify; - P->neigh_notify = rip_neigh_notify; + P->iface_sub.neigh_notify = rip_neigh_notify; P->reload_routes = rip_reload_routes; - P->rte_better = rip_rte_better; - P->rte_igp_metric = rip_rte_igp_metric; + P->sources.class = &rip_rte_owner_class; return P; } @@ -1211,38 +1231,41 @@ rip_reconfigure(struct proto *P, struct proto_config *CF) } static void -rip_get_route_info(rte *rte, byte *buf) +rip_get_route_info(const rte *rte, byte *buf) { - struct rip_proto *p = (struct rip_proto *) rte->src->proto; - u32 rt_metric = ea_get_int(rte->attrs->eattrs, EA_RIP_METRIC, p->infinity); - u32 rt_tag = ea_get_int(rte->attrs->eattrs, EA_RIP_TAG, 0); + struct rip_proto *p = rip_rte_proto(rte); + u32 rt_metric = ea_get_int(rte->attrs, &ea_rip_metric, p->infinity); + u32 rt_tag = ea_get_int(rte->attrs, &ea_rip_tag, 0); - buf += bsprintf(buf, " (%d/%d)", rte->attrs->pref, rt_metric); + buf += bsprintf(buf, " (%d/%d)", rt_get_preference(rte), rt_metric); if (rt_tag) bsprintf(buf, " [%04x]", rt_tag); } -static int -rip_get_attr(const eattr *a, byte *buf, int buflen UNUSED) +static void +rip_tag_format(const eattr *a, byte *buf, uint buflen) { - switch (a->id) - { - case EA_RIP_METRIC: - bsprintf(buf, "metric: %d", a->u.data); - return GA_FULL; + bsnprintf(buf, buflen, "%04x", a->u.data); +} - case EA_RIP_TAG: - bsprintf(buf, "tag: %04x", a->u.data); - return GA_FULL; +static struct ea_class ea_rip_metric = { + .name = "rip_metric", + .type = T_INT, +}; - case EA_RIP_FROM: - return GA_HIDDEN; +static struct ea_class ea_rip_tag = { + .name = "rip_tag", + .type = T_INT, + .format = rip_tag_format, +}; - default: - return GA_UNKNOWN; - } -} +static struct ea_class ea_rip_from = { + .name = "rip_from", + .type = T_IFACE, + .readonly = 1, + .hidden = 1, +}; void rip_show_interfaces(struct proto *P, const char *iff) @@ -1343,10 +1366,15 @@ rip_dump(struct proto *P) } +static struct rte_owner_class rip_rte_owner_class = { + .get_route_info = rip_get_route_info, + .rte_better = rip_rte_better, + .rte_igp_metric = rip_rte_igp_metric, +}; + struct protocol proto_rip = { .name = "RIP", .template = "rip%d", - .class = PROTOCOL_RIP, .preference = DEF_PREF_RIP, .channel_mask = NB_IP, .proto_size = sizeof(struct rip_proto), @@ -1357,12 +1385,16 @@ struct protocol proto_rip = { .start = rip_start, .shutdown = rip_shutdown, .reconfigure = rip_reconfigure, - .get_route_info = rip_get_route_info, - .get_attr = rip_get_attr }; void rip_build(void) { proto_build(&proto_rip); + + EA_REGISTER_ALL( + &ea_rip_metric, + &ea_rip_tag, + &ea_rip_from + ); } diff --git a/proto/rip/rip.h b/proto/rip/rip.h index f8713c4a..a01f8d3b 100644 --- a/proto/rip/rip.h +++ b/proto/rip/rip.h @@ -16,7 +16,7 @@ #include "nest/cli.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/password.h" #include "nest/locks.h" #include "nest/bfd.h" @@ -195,10 +195,6 @@ struct rip_rte #define RIP_ENTRY_VALID 1 /* Valid outgoing route */ #define RIP_ENTRY_STALE 2 /* Stale outgoing route, waiting for GC */ -#define EA_RIP_METRIC EA_CODE(PROTOCOL_RIP, 0) -#define EA_RIP_TAG EA_CODE(PROTOCOL_RIP, 1) -#define EA_RIP_FROM EA_CODE(PROTOCOL_RIP, 2) - static inline int rip_is_v2(struct rip_proto *p) { return p->rip2; } diff --git a/proto/rpki/config.Y b/proto/rpki/config.Y index d6d326b8..743b5b42 100644 --- a/proto/rpki/config.Y +++ b/proto/rpki/config.Y @@ -42,6 +42,7 @@ proto: rpki_proto ; rpki_proto_start: proto_start RPKI { this_proto = proto_config_new(&proto_rpki, $1); + this_proto->loop_order = DOMAIN_ORDER(proto); RPKI_CFG->retry_interval = RPKI_RETRY_INTERVAL; RPKI_CFG->refresh_interval = RPKI_REFRESH_INTERVAL; RPKI_CFG->expire_interval = RPKI_EXPIRE_INTERVAL; diff --git a/proto/rpki/packets.c b/proto/rpki/packets.c index d246dd50..d7895a22 100644 --- a/proto/rpki/packets.c +++ b/proto/rpki/packets.c @@ -233,7 +233,12 @@ static const size_t min_pdu_size[] = { [ERROR] = 16, }; -static int rpki_send_error_pdu(struct rpki_cache *cache, const enum pdu_error_type error_code, const u32 err_pdu_len, const struct pdu_header *erroneous_pdu, const char *fmt, ...); +static int rpki_send_error_pdu_(struct rpki_cache *cache, const enum pdu_error_type error_code, const u32 err_pdu_len, const struct pdu_header *erroneous_pdu, const char *fmt, ...); + +#define rpki_send_error_pdu(cache, error_code, err_pdu_len, erroneous_pdu, fmt...) ({ \ + rpki_send_error_pdu_(cache, error_code, err_pdu_len, erroneous_pdu, #fmt); \ + CACHE_TRACE(D_PACKETS, cache, #fmt); \ + }) static void rpki_pdu_to_network_byte_order(struct pdu_header *pdu) @@ -595,6 +600,7 @@ rpki_handle_error_pdu(struct rpki_cache *cache, const struct pdu_error *pdu) case INTERNAL_ERROR: case INVALID_REQUEST: case UNSUPPORTED_PDU_TYPE: + CACHE_TRACE(D_PACKETS, cache, "Got UNSUPPORTED_PDU_TYPE"); rpki_cache_change_state(cache, RPKI_CS_ERROR_FATAL); break; @@ -652,21 +658,7 @@ rpki_handle_cache_response_pdu(struct rpki_cache *cache, const struct pdu_cache_ { if (cache->request_session_id) { - if (cache->last_update) - { - /* - * This isn't the first sync and we already received records. This point - * is after Reset Query and before importing new records from cache - * server. We need to load new ones and kick out missing ones. So start - * a refresh cycle. - */ - if (cache->p->roa4_channel) - rt_refresh_begin(cache->p->roa4_channel->table, cache->p->roa4_channel); - if (cache->p->roa6_channel) - rt_refresh_begin(cache->p->roa6_channel->table, cache->p->roa6_channel); - - cache->p->refresh_channels = 1; - } + rpki_start_refresh(cache->p); cache->session_id = pdu->session_id; cache->request_session_id = 0; } @@ -842,14 +834,7 @@ rpki_handle_end_of_data_pdu(struct rpki_cache *cache, const struct pdu_end_of_da (cf->keep_expire_interval ? "keeps " : ""), cache->expire_interval); } - if (cache->p->refresh_channels) - { - cache->p->refresh_channels = 0; - if (cache->p->roa4_channel) - rt_refresh_end(cache->p->roa4_channel->table, cache->p->roa4_channel); - if (cache->p->roa6_channel) - rt_refresh_end(cache->p->roa6_channel->table, cache->p->roa6_channel); - } + rpki_stop_refresh(cache->p); cache->last_update = current_time(); cache->serial_num = pdu->serial_num; @@ -924,6 +909,9 @@ rpki_rx_hook(struct birdsock *sk, uint size) struct rpki_cache *cache = sk->data; struct rpki_proto *p = cache->p; + if ((p->p.proto_state == PS_DOWN) || (p->cache != cache)) + return 0; + byte *pkt_start = sk->rbuf; byte *end = pkt_start + size; @@ -980,6 +968,8 @@ rpki_err_hook(struct birdsock *sk, int error_num) CACHE_TRACE(D_EVENTS, cache, "The other side closed a connection"); } + if (cache->p->cache != cache) + return; rpki_cache_change_state(cache, RPKI_CS_ERROR_TRANSPORT); } @@ -999,6 +989,9 @@ rpki_tx_hook(sock *sk) { struct rpki_cache *cache = sk->data; + if (cache->p->cache != cache) + return; + while (rpki_fire_tx(cache) > 0) ; } @@ -1008,6 +1001,9 @@ rpki_connected_hook(sock *sk) { struct rpki_cache *cache = sk->data; + if (cache->p->cache != cache) + return; + CACHE_TRACE(D_EVENTS, cache, "Connected"); proto_notify_state(&cache->p->p, PS_UP); @@ -1029,7 +1025,7 @@ rpki_connected_hook(sock *sk) * This function prepares Error PDU and sends it to a cache server. */ static int -rpki_send_error_pdu(struct rpki_cache *cache, const enum pdu_error_type error_code, const u32 err_pdu_len, const struct pdu_header *erroneous_pdu, const char *fmt, ...) +rpki_send_error_pdu_(struct rpki_cache *cache, const enum pdu_error_type error_code, const u32 err_pdu_len, const struct pdu_header *erroneous_pdu, const char *fmt, ...) { va_list args; char msg[128]; diff --git a/proto/rpki/rpki.c b/proto/rpki/rpki.c index 3e321627..e5638aff 100644 --- a/proto/rpki/rpki.c +++ b/proto/rpki/rpki.c @@ -109,6 +109,7 @@ static void rpki_schedule_next_expire_check(struct rpki_cache *cache); static void rpki_stop_refresh_timer_event(struct rpki_cache *cache); static void rpki_stop_retry_timer_event(struct rpki_cache *cache); static void rpki_stop_expire_timer_event(struct rpki_cache *cache); +static void rpki_stop_all_timers(struct rpki_cache *cache); /* @@ -120,26 +121,46 @@ rpki_table_add_roa(struct rpki_cache *cache, struct channel *channel, const net_ { struct rpki_proto *p = cache->p; - rta a0 = { - .pref = channel->preference, - .source = RTS_RPKI, - .scope = SCOPE_UNIVERSE, - .dest = RTD_NONE, - }; + ea_list *ea = NULL; + ea_set_attr_u32(&ea, &ea_gen_preference, 0, channel->preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_RPKI); - rta *a = rta_lookup(&a0); - rte *e = rte_get_temp(a, p->p.main_source); + rte e0 = { .attrs = ea, .src = p->p.main_source, }; - rte_update2(channel, &pfxr->n, e, e->src); + rte_update(channel, &pfxr->n, &e0, p->p.main_source); } void rpki_table_remove_roa(struct rpki_cache *cache, struct channel *channel, const net_addr_union *pfxr) { struct rpki_proto *p = cache->p; - rte_update2(channel, &pfxr->n, NULL, p->p.main_source); + rte_update(channel, &pfxr->n, NULL, p->p.main_source); +} + +void +rpki_start_refresh(struct rpki_proto *p) +{ + if (p->roa4_channel) + rt_refresh_begin(&p->roa4_channel->in_req); + if (p->roa6_channel) + rt_refresh_begin(&p->roa6_channel->in_req); + + p->refresh_channels = 1; } +void +rpki_stop_refresh(struct rpki_proto *p) +{ + if (!p->refresh_channels) + return; + + p->refresh_channels = 0; + + if (p->roa4_channel) + rt_refresh_end(&p->roa4_channel->in_req); + if (p->roa6_channel) + rt_refresh_end(&p->roa6_channel->in_req); +} /* * RPKI Protocol Logic @@ -196,6 +217,8 @@ rpki_force_restart_proto(struct rpki_proto *p) { if (p->cache) { + rpki_tr_close(p->cache->tr_sock); + rpki_stop_all_timers(p->cache); CACHE_DBG(p->cache, "Connection object destroying"); } @@ -320,7 +343,7 @@ rpki_schedule_next_refresh(struct rpki_cache *cache) btime t = cache->refresh_interval S; CACHE_DBG(cache, "after %t s", t); - tm_start(cache->refresh_timer, t); + tm_start_in(cache->refresh_timer, t, cache->p->p.loop); } static void @@ -329,7 +352,7 @@ rpki_schedule_next_retry(struct rpki_cache *cache) btime t = cache->retry_interval S; CACHE_DBG(cache, "after %t s", t); - tm_start(cache->retry_timer, t); + tm_start_in(cache->retry_timer, t, cache->p->p.loop); } static void @@ -340,7 +363,7 @@ rpki_schedule_next_expire_check(struct rpki_cache *cache) t = MAX(t, 1 S); CACHE_DBG(cache, "after %t s", t); - tm_start(cache->expire_timer, t); + tm_start_in(cache->expire_timer, t, cache->p->p.loop); } static void @@ -357,13 +380,21 @@ rpki_stop_retry_timer_event(struct rpki_cache *cache) tm_stop(cache->retry_timer); } -static void UNUSED +static void rpki_stop_expire_timer_event(struct rpki_cache *cache) { CACHE_DBG(cache, "Stop"); tm_stop(cache->expire_timer); } +static void +rpki_stop_all_timers(struct rpki_cache *cache) +{ + rpki_stop_refresh_timer_event(cache); + rpki_stop_retry_timer_event(cache); + rpki_stop_expire_timer_event(cache); +} + static int rpki_do_we_recv_prefix_pdu_in_last_seconds(struct rpki_cache *cache) { @@ -386,6 +417,9 @@ rpki_refresh_hook(timer *tm) { struct rpki_cache *cache = tm->data; + if (cache->p->cache != cache) + return; + CACHE_DBG(cache, "%s", rpki_cache_state_to_str(cache->state)); switch (cache->state) @@ -432,6 +466,9 @@ rpki_retry_hook(timer *tm) { struct rpki_cache *cache = tm->data; + if (cache->p->cache != cache) + return; + CACHE_DBG(cache, "%s", rpki_cache_state_to_str(cache->state)); switch (cache->state) @@ -482,6 +519,9 @@ rpki_expire_hook(timer *tm) { struct rpki_cache *cache = tm->data; + if (cache->p->cache != cache) + return; + if (!cache->last_update) return; @@ -624,6 +664,7 @@ rpki_close_connection(struct rpki_cache *cache) { CACHE_TRACE(D_EVENTS, cache, "Closing a connection"); rpki_tr_close(cache->tr_sock); + rpki_stop_refresh(cache->p); proto_notify_state(&cache->p->p, PS_START); } @@ -950,7 +991,6 @@ rpki_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUS struct protocol proto_rpki = { .name = "RPKI", .template = "rpki%d", - .class = PROTOCOL_RPKI, .preference = DEF_PREF_RPKI, .proto_size = sizeof(struct rpki_proto), .config_size = sizeof(struct rpki_config), diff --git a/proto/rpki/rpki.h b/proto/rpki/rpki.h index 8a5c38fd..20253844 100644 --- a/proto/rpki/rpki.h +++ b/proto/rpki/rpki.h @@ -13,7 +13,7 @@ #define _BIRD_RPKI_H_ #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "lib/socket.h" #include "lib/ip.h" @@ -83,6 +83,8 @@ const char *rpki_cache_state_to_str(enum rpki_cache_state state); void rpki_table_add_roa(struct rpki_cache *cache, struct channel *channel, const net_addr_union *pfxr); void rpki_table_remove_roa(struct rpki_cache *cache, struct channel *channel, const net_addr_union *pfxr); +void rpki_start_refresh(struct rpki_proto *p); +void rpki_stop_refresh(struct rpki_proto *p); /* * RPKI Protocol Logic diff --git a/proto/rpki/ssh_transport.c b/proto/rpki/ssh_transport.c index 6333f367..425ad460 100644 --- a/proto/rpki/ssh_transport.c +++ b/proto/rpki/ssh_transport.c @@ -35,7 +35,7 @@ rpki_tr_ssh_open(struct rpki_tr_sock *tr) sk->ssh->subsystem = "rpki-rtr"; sk->ssh->state = SK_SSH_CONNECT; - if (sk_open(sk) != 0) + if (sk_open(sk, cache->p->p.loop) != 0) return RPKI_TR_ERROR; return RPKI_TR_SUCCESS; diff --git a/proto/rpki/tcp_transport.c b/proto/rpki/tcp_transport.c index 132f8e2d..ebb8030f 100644 --- a/proto/rpki/tcp_transport.c +++ b/proto/rpki/tcp_transport.c @@ -28,7 +28,7 @@ rpki_tr_tcp_open(struct rpki_tr_sock *tr) sk->type = SK_TCP_ACTIVE; - if (sk_open(sk) != 0) + if (sk_open(sk, tr->cache->p->p.loop) != 0) return RPKI_TR_ERROR; return RPKI_TR_SUCCESS; diff --git a/proto/static/static.c b/proto/static/static.c index bb93305e..6bae827b 100644 --- a/proto/static/static.c +++ b/proto/static/static.c @@ -38,7 +38,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "conf/conf.h" #include "filter/filter.h" @@ -47,91 +47,96 @@ #include "static.h" -static linpool *static_lp; - static inline struct rte_src * static_get_source(struct static_proto *p, uint i) { return i ? rt_get_source(&p->p, i) : p->p.main_source; } +static inline void static_free_source(struct rte_src *src, uint i) +{ if (i) rt_unlock_source(src); } + static void static_announce_rte(struct static_proto *p, struct static_route *r) { - rta *a = allocz(RTA_MAX_SIZE); - struct rte_src *src = static_get_source(p, r->index); - a->source = RTS_STATIC; - a->scope = SCOPE_UNIVERSE; - a->dest = r->dest; - a->pref = p->p.main_channel->preference; + struct rte_src *src; + ea_list *ea = NULL; + ea_set_attr_u32(&ea, &ea_gen_preference, 0, p->p.main_channel->preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_STATIC); if (r->dest == RTD_UNICAST) { - struct static_route *r2; - struct nexthop *nhs = NULL; + uint sz = 0; + for (struct static_route *r2 = r; r2; r2 = r2->mp_next) + if (r2->active) + sz += NEXTHOP_SIZE_CNT(r2->mls ? r2->mls->length / sizeof(u32) : 0); - for (r2 = r; r2; r2 = r2->mp_next) + if (!sz) + goto withdraw; + + struct nexthop_adata *nhad = allocz(sz + sizeof *nhad); + struct nexthop *nh = &nhad->nh; + + for (struct static_route *r2 = r; r2; r2 = r2->mp_next) { if (!r2->active) continue; - struct nexthop *nh = allocz(NEXTHOP_MAX_SIZE); - nh->gw = r2->via; - nh->iface = r2->neigh->iface; - nh->flags = r2->onlink ? RNF_ONLINK : 0; - nh->weight = r2->weight; + *nh = (struct nexthop) { + .gw = r2->via, + .iface = r2->neigh->iface, + .flags = r2->onlink ? RNF_ONLINK : 0, + .weight = r2->weight, + }; + if (r2->mls) { - nh->labels = r2->mls->len; - memcpy(nh->label, r2->mls->stack, r2->mls->len * sizeof(u32)); + nh->labels = r2->mls->length / sizeof(u32); + memcpy(nh->label, r2->mls->data, r2->mls->length); } - nexthop_insert(&nhs, nh); + nh = NEXTHOP_NEXT(nh); } - if (!nhs) - goto withdraw; - - nexthop_link(a, nhs); + ea_set_attr_data(&ea, &ea_gen_nexthop, 0, + nhad->ad.data, (void *) nh - (void *) nhad->ad.data); } - if (r->dest == RTDX_RECURSIVE) + else if (r->dest == RTDX_RECURSIVE) { rtable *tab = ipa_is_ip4(r->via) ? p->igp_table_ip4 : p->igp_table_ip6; - rta_set_recursive_next_hop(p->p.main_channel->table, a, tab, r->via, IPA_NONE, r->mls); + u32 *labels = r->mls ? (void *) r->mls->data : NULL; + u32 lnum = r->mls ? r->mls->length / sizeof(u32) : 0; + + ea_set_hostentry(&ea, p->p.main_channel->table, tab, + r->via, IPA_NONE, lnum, labels); } + else if (r->dest) + ea_set_dest(&ea, 0, r->dest); + /* Already announced */ if (r->state == SRS_CLEAN) return; /* We skip rta_lookup() here */ - rte *e = rte_get_temp(a, src); + src = static_get_source(p, r->index); + rte e0 = { .attrs = ea, .src = src, .net = r->net, }, *e = &e0; + /* Evaluate the filter */ if (r->cmds) - { - /* Create a temporary table node */ - e->net = alloca(sizeof(net) + r->net->length); - memset(e->net, 0, sizeof(net) + r->net->length); - net_copy(e->net->n.addr, r->net); + f_eval_rte(r->cmds, e); - /* Evaluate the filter */ - f_eval_rte(r->cmds, &e, static_lp); + rte_update(p->p.main_channel, r->net, e, src); + static_free_source(src, r->index); - /* Remove the temporary node */ - e->net = NULL; - } - - rte_update2(p->p.main_channel, r->net, e, src); r->state = SRS_CLEAN; - - if (r->cmds) - lp_flush(static_lp); - return; withdraw: if (r->state == SRS_DOWN) return; - rte_update2(p->p.main_channel, r->net, NULL, src); + src = static_get_source(p, r->index); + rte_update(p->p.main_channel, r->net, NULL, src); + static_free_source(src, r->index); r->state = SRS_DOWN; } @@ -208,7 +213,7 @@ static_update_bfd(struct static_proto *p, struct static_route *r) // ip_addr local = ipa_nonzero(r->local) ? r->local : nb->ifa->ip; r->bfd_req = bfd_request_session(p->p.pool, r->via, nb->ifa->ip, nb->iface, p->p.vrf, - static_bfd_notify, r, NULL); + static_bfd_notify, r, p->p.loop, NULL); } if (!bfd_up && r->bfd_req) @@ -297,7 +302,11 @@ static void static_remove_rte(struct static_proto *p, struct static_route *r) { if (r->state) - rte_update2(p->p.main_channel, r->net, NULL, static_get_source(p, r->index)); + { + struct rte_src *src = static_get_source(p, r->index); + rte_update(p->p.main_channel, r->net, NULL, src); + static_free_source(src, r->index); + } static_reset_rte(p, r); } @@ -320,31 +329,17 @@ static_same_dest(struct static_route *x, struct static_route *y) (x->weight != y->weight) || (x->use_bfd != y->use_bfd) || (!x->mls != !y->mls) || - ((x->mls) && (y->mls) && (x->mls->len != y->mls->len))) + ((x->mls) && (y->mls) && adata_same(x->mls, y->mls))) return 0; - - if (!x->mls) - continue; - - for (uint i = 0; i < x->mls->len; i++) - if (x->mls->stack[i] != y->mls->stack[i]) - return 0; } return !x && !y; case RTDX_RECURSIVE: if (!ipa_equal(x->via, y->via) || (!x->mls != !y->mls) || - ((x->mls) && (y->mls) && (x->mls->len != y->mls->len))) + ((x->mls) && (y->mls) && adata_same(x->mls, y->mls))) return 0; - if (!x->mls) - return 1; - - for (uint i = 0; i < x->mls->len; i++) - if (x->mls->stack[i] != y->mls->stack[i]) - return 0; - return 1; default: @@ -411,18 +406,18 @@ static_reload_routes(struct channel *C) } static int -static_rte_better(rte *new, rte *old) +static_rte_better(const rte *new, const rte *old) { - u32 n = ea_get_int(new->attrs->eattrs, EA_GEN_IGP_METRIC, IGP_METRIC_UNKNOWN); - u32 o = ea_get_int(old->attrs->eattrs, EA_GEN_IGP_METRIC, IGP_METRIC_UNKNOWN); + u32 n = ea_get_int(new->attrs, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN); + u32 o = ea_get_int(old->attrs, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN); return n < o; } static int -static_rte_mergable(rte *pri, rte *sec) +static_rte_mergable(const rte *pri, const rte *sec) { - u32 a = ea_get_int(pri->attrs->eattrs, EA_GEN_IGP_METRIC, IGP_METRIC_UNKNOWN); - u32 b = ea_get_int(sec->attrs->eattrs, EA_GEN_IGP_METRIC, IGP_METRIC_UNKNOWN); + u32 a = ea_get_int(pri->attrs, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN); + u32 b = ea_get_int(sec->attrs, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN); return a == b; } @@ -441,11 +436,11 @@ static_postconfig(struct proto_config *CF) if (!cf->igp_table_ip4) cf->igp_table_ip4 = (cc->table->addr_type == NET_IP4) ? - cc->table : cf->c.global->def_tables[NET_IP4]; + cc->table : rt_get_default_table(cf->c.global, NET_IP4); if (!cf->igp_table_ip6) cf->igp_table_ip6 = (cc->table->addr_type == NET_IP6) ? - cc->table : cf->c.global->def_tables[NET_IP6]; + cc->table : rt_get_default_table(cf->c.global, NET_IP6); WALK_LIST(r, cf->routes) if (r->net && (r->net->type != CF->net_type)) @@ -454,6 +449,8 @@ static_postconfig(struct proto_config *CF) static_index_routes(cf); } +static struct rte_owner_class static_rte_owner_class; + static struct proto * static_init(struct proto_config *CF) { @@ -463,10 +460,9 @@ static_init(struct proto_config *CF) P->main_channel = proto_add_channel(P, proto_cf_main_channel(CF)); - P->neigh_notify = static_neigh_notify; + P->iface_sub.neigh_notify = static_neigh_notify; P->reload_routes = static_reload_routes; - P->rte_better = static_rte_better; - P->rte_mergable = static_rte_mergable; + P->sources.class = &static_rte_owner_class; if (cf->igp_table_ip4) p->igp_table_ip4 = cf->igp_table_ip4->table; @@ -484,9 +480,6 @@ static_start(struct proto *P) struct static_config *cf = (void *) P->cf; struct static_route *r; - if (!static_lp) - static_lp = lp_new(&root_pool); - if (p->igp_table_ip4) rt_lock_table(p->igp_table_ip4); @@ -501,7 +494,12 @@ static_start(struct proto *P) proto_notify_state(P, PS_UP); WALK_LIST(r, cf->routes) + { + struct lp_state lps; + lp_save(tmp_linpool, &lps); static_add_rte(p, r); + lp_restore(tmp_linpool, &lps); + } return PS_UP; } @@ -517,19 +515,13 @@ static_shutdown(struct proto *P) WALK_LIST(r, cf->routes) static_reset_rte(p, r); - return PS_DOWN; -} - -static void -static_cleanup(struct proto *P) -{ - struct static_proto *p = (void *) P; - if (p->igp_table_ip4) rt_unlock_table(p->igp_table_ip4); if (p->igp_table_ip6) rt_unlock_table(p->igp_table_ip6); + + return PS_DOWN; } static void @@ -717,13 +709,14 @@ static_copy_config(struct proto_config *dest, struct proto_config *src) } static void -static_get_route_info(rte *rte, byte *buf) +static_get_route_info(const rte *rte, byte *buf) { - eattr *a = ea_find(rte->attrs->eattrs, EA_GEN_IGP_METRIC); - if (a) - buf += bsprintf(buf, " (%d/%u)", rte->attrs->pref, a->u.data); + eattr *a = ea_find(rte->attrs, &ea_gen_igp_metric); + u32 pref = rt_get_preference(rte); + if (a && (a->u.data < IGP_METRIC_UNKNOWN)) + buf += bsprintf(buf, " (%d/%u)", pref, a->u.data); else - buf += bsprintf(buf, " (%d)", rte->attrs->pref); + buf += bsprintf(buf, " (%d)", pref); } static void @@ -773,11 +766,15 @@ static_show(struct proto *P) static_show_rt(r); } +static struct rte_owner_class static_rte_owner_class = { + .get_route_info = static_get_route_info, + .rte_better = static_rte_better, + .rte_mergable = static_rte_mergable, +}; struct protocol proto_static = { .name = "Static", .template = "static%d", - .class = PROTOCOL_STATIC, .preference = DEF_PREF_STATIC, .channel_mask = NB_ANY, .proto_size = sizeof(struct static_proto), @@ -787,10 +784,8 @@ struct protocol proto_static = { .dump = static_dump, .start = static_start, .shutdown = static_shutdown, - .cleanup = static_cleanup, .reconfigure = static_reconfigure, .copy_config = static_copy_config, - .get_route_info = static_get_route_info, }; void diff --git a/proto/static/static.h b/proto/static/static.h index fc91f71c..ea7ca33b 100644 --- a/proto/static/static.h +++ b/proto/static/static.h @@ -9,7 +9,7 @@ #ifndef _BIRD_STATIC_H_ #define _BIRD_STATIC_H_ -#include "nest/route.h" +#include "nest/rt.h" #include "nest/bfd.h" #include "lib/buffer.h" @@ -49,7 +49,7 @@ struct static_route { byte weight; /* Multipath next hop weight */ byte use_bfd; /* Configured to use BFD */ struct bfd_request *bfd_req; /* BFD request, if BFD is used */ - mpls_label_stack *mls; /* MPLS label stack; may be NULL */ + struct adata *mls; /* MPLS label stack; may be NULL */ }; /* diff --git a/sysdep/bsd/krt-sock.c b/sysdep/bsd/krt-sock.c index 1f793293..094268b7 100644 --- a/sysdep/bsd/krt-sock.c +++ b/sysdep/bsd/krt-sock.c @@ -25,7 +25,7 @@ #include "nest/bird.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "sysdep/unix/unix.h" @@ -168,15 +168,18 @@ const uint krt_max_metric = KRT_MAX_METRIC; int krt_capable(rte *e) { - rta *a = e->attrs; + ea_list *eattrs = e->attrs; + eattr *nhea = ea_find(eattrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); return - ((a->dest == RTD_UNICAST && !a->nh.next) /* No multipath support */ + ((dest == RTD_UNICAST && !NEXTHOP_ONE(nh)) /* No multipath support */ #ifdef RTF_REJECT - || a->dest == RTD_UNREACHABLE + || dest == RTD_UNREACHABLE #endif #ifdef RTF_BLACKHOLE - || a->dest == RTD_BLACKHOLE + || dest == RTD_BLACKHOLE #endif ); } @@ -221,18 +224,22 @@ sockaddr_fill_dl(struct sockaddr_dl *sa, struct iface *ifa) } static int -krt_send_route(struct krt_proto *p, int cmd, rte *e) +krt_send_route(struct krt_proto *p, int cmd, const rte *e) { - net *net = e->net; - rta *a = e->attrs; + const net_addr *net = e->net; + ea_list *eattrs = e->attrs; + eattr *nhea = ea_find(eattrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); + static int msg_seq; - struct iface *j, *i = a->nh.iface; + struct iface *j, *i = (dest == RTD_UNICAST) ? nh->nh.iface : NULL; int l; struct ks_msg msg; char *body = (char *)msg.buf; sockaddr gate, mask, dst; - DBG("krt-sock: send %I/%d via %I\n", net->n.prefix, net->n.pxlen, a->gw); + DBG("krt-sock: send %N via %I\n", net, nh->nh.gw); bzero(&msg,sizeof (struct rt_msghdr)); msg.rtm.rtm_version = RTM_VERSION; @@ -242,7 +249,7 @@ krt_send_route(struct krt_proto *p, int cmd, rte *e) msg.rtm.rtm_flags = RTF_UP | RTF_PROTO1; /* XXXX */ - if (net_pxlen(net->n.addr) == net_max_prefix_length[net->n.addr->type]) + if (net_pxlen(e->net) == net_max_prefix_length[net->type]) msg.rtm.rtm_flags |= RTF_HOST; else msg.rtm.rtm_addrs |= RTA_NETMASK; @@ -256,11 +263,11 @@ krt_send_route(struct krt_proto *p, int cmd, rte *e) #endif #ifdef RTF_REJECT - if(a->dest == RTD_UNREACHABLE) + if(dest == RTD_UNREACHABLE) msg.rtm.rtm_flags |= RTF_REJECT; #endif #ifdef RTF_BLACKHOLE - if(a->dest == RTD_BLACKHOLE) + if(dest == RTD_BLACKHOLE) msg.rtm.rtm_flags |= RTF_BLACKHOLE; #endif @@ -270,14 +277,17 @@ krt_send_route(struct krt_proto *p, int cmd, rte *e) */ if (!i) { - WALK_LIST(j, iface_list) + j = if_walk_first(); + while (j) { if (j->flags & IF_LOOPBACK) { i = j; break; } + j = if_walk_next(j); } + if_walk_done(); if (!i) { @@ -288,7 +298,7 @@ krt_send_route(struct krt_proto *p, int cmd, rte *e) int af = AF_UNSPEC; - switch (net->n.addr->type) { + switch (net->type) { case NET_IP4: af = AF_INET; break; @@ -296,19 +306,19 @@ krt_send_route(struct krt_proto *p, int cmd, rte *e) af = AF_INET6; break; default: - log(L_ERR "KRT: Not sending route %N to kernel", net->n.addr); + log(L_ERR "KRT: Not sending route %N to kernel", net); return -1; } - sockaddr_fill(&dst, af, net_prefix(net->n.addr), NULL, 0); - sockaddr_fill(&mask, af, net_pxmask(net->n.addr), NULL, 0); + sockaddr_fill(&dst, af, net_prefix(net), NULL, 0); + sockaddr_fill(&mask, af, net_pxmask(net), NULL, 0); - switch (a->dest) + switch (dest) { case RTD_UNICAST: - if (ipa_nonzero(a->nh.gw)) + if (ipa_nonzero(nh->nh.gw)) { - ip_addr gw = a->nh.gw; + ip_addr gw = nh->nh.gw; /* Embed interface ID to link-local address */ if (ipa_is_link_local(gw)) @@ -320,6 +330,8 @@ krt_send_route(struct krt_proto *p, int cmd, rte *e) break; } + /* Fall through */ + #ifdef RTF_REJECT case RTD_UNREACHABLE: #endif @@ -331,7 +343,7 @@ krt_send_route(struct krt_proto *p, int cmd, rte *e) #if __OpenBSD__ /* Keeping temporarily old code for OpenBSD */ - struct ifa *addr = (net->n.addr->type == NET_IP4) ? i->addr4 : (i->addr6 ?: i->llv6); + struct ifa *addr = (net->type == NET_IP4) ? i->addr4 : (i->addr6 ?: i->llv6); if (!addr) { @@ -367,7 +379,7 @@ krt_send_route(struct krt_proto *p, int cmd, rte *e) msg.rtm.rtm_msglen = l; if ((l = write(p->sys.sk->fd, (char *)&msg, l)) < 0) { - log(L_ERR "KRT: Error sending route %N to kernel: %m", net->n.addr); + log(L_ERR "KRT: Error sending route %N to kernel: %m", net); return -1; } @@ -375,7 +387,7 @@ krt_send_route(struct krt_proto *p, int cmd, rte *e) } void -krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old) +krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const rte *old) { int err = 0; @@ -426,8 +438,6 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan) /* p is NULL iff KRT_SHARED_SOCKET and !scan */ int ipv6; - rte *e; - net *net; sockaddr dst, gate, mask; ip_addr idst, igate, imask; net_addr ndst; @@ -543,96 +553,92 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan) src = KRT_SRC_ALIEN; else src = KRT_SRC_KERNEL; + + union { + struct { + struct adata ad; + struct nexthop nh; + u32 labels[MPLS_MAX_LABEL_STACK]; + }; + struct nexthop_adata nhad; + } nhad = {}; - net = net_get(p->p.main_channel->table, &ndst); + ea_list *eattrs = NULL; - rta a = { - .source = RTS_INHERIT, - .scope = SCOPE_UNIVERSE, - }; + ea_set_attr_u32(&eattrs, &ea_gen_source, 0, RTS_INHERIT); /* reject/blackhole routes have also set RTF_GATEWAY, we wil check them first. */ #ifdef RTF_REJECT if(flags & RTF_REJECT) { - a.dest = RTD_UNREACHABLE; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_UNREACHABLE); goto done; } #endif #ifdef RTF_BLACKHOLE if(flags & RTF_BLACKHOLE) { - a.dest = RTD_BLACKHOLE; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_BLACKHOLE); goto done; } #endif - a.nh.iface = if_find_by_index(msg->rtm.rtm_index); - if (!a.nh.iface) + nhad.nh.iface = if_find_by_index(msg->rtm.rtm_index); + if (!nhad.nh.iface) { log(L_ERR "KRT: Received route %N with unknown ifindex %u", - net->n.addr, msg->rtm.rtm_index); + &ndst, msg->rtm.rtm_index); return; } - a.dest = RTD_UNICAST; if (flags & RTF_GATEWAY) { - a.nh.gw = igate; + nhad.nh.gw = igate; /* Clean up embedded interface ID returned in link-local address */ - if (ipa_is_link_local(a.nh.gw)) - _I0(a.nh.gw) = 0xfe800000; + if (ipa_is_link_local(nhad.nh.gw)) + _I0(nhad.nh.gw) = 0xfe800000; /* The BSD kernel does not support an onlink flag. We heuristically set the onlink flag, if the iface has only host addresses. */ - if (krt_assume_onlink(a.nh.iface, ipv6)) - a.nh.flags |= RNF_ONLINK; + if (krt_assume_onlink(nhad.nh.iface, ipv6)) + nhad.nh.flags |= RNF_ONLINK; neighbor *nbr; - nbr = neigh_find(&p->p, a.nh.gw, a.nh.iface, - (a.nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); + nbr = neigh_find(&p->p, nhad.nh.gw, nhad.nh.iface, + (nhad.nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); if (!nbr || (nbr->scope == SCOPE_HOST)) { /* Ignore routes with next-hop 127.0.0.1, host routes with such next-hop appear on OpenBSD for address aliases. */ - if (ipa_classify(a.nh.gw) == (IADDR_HOST | SCOPE_HOST)) + if (ipa_classify(nhad.nh.gw) == (IADDR_HOST | SCOPE_HOST)) return; log(L_ERR "KRT: Received route %N with strange next-hop %I", - net->n.addr, a.nh.gw); + &ndst, nhad.nh.gw); return; } } - done: - e = rte_get_temp(&a, p->p.main_source); - e->net = net; + nhad.ad.length = (void *) NEXTHOP_NEXT(&nhad.nh) - (void *) nhad.ad.data; - ea_list *ea = alloca(sizeof(ea_list) + 2 * sizeof(eattr)); - *ea = (ea_list) { .count = 1, .next = e->attrs->eattrs }; - e->attrs->eattrs = ea; + done: + ea_set_attr(&eattrs, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhad.ad)); + rte e0 = { .attrs = eattrs, .net = &ndst, }; - ea->attrs[0] = (eattr) { - .id = EA_KRT_SOURCE, - .type = EAF_TYPE_INT, - .u.data = src2, - }; + ea_set_attr(&e0.attrs, + EA_LITERAL_EMBEDDED(&ea_krt_source, 0, src2)); #ifdef KRT_USE_METRIC - ea->count++; - ea->attrs[1] = (eattr) { - .id = EA_KRT_METRIC, - .type = EAF_TYPE_INT, - .u.data = msg->rtm.rtm_priority, - }; + ea_set_attr(&e0.attrs, + EA_LITERAL_EMBEDDED(&ea_krt_metric, 0, msg->rtm.rtm_priority)); #endif if (scan) - krt_got_route(p, e, src); + krt_got_route(p, &e0, src); else - krt_got_route_async(p, e, new, src); + krt_got_route_async(p, &e0, new, src); } static void @@ -868,6 +874,7 @@ krt_read_msg(struct proto *p, struct ks_msg *msg, int scan) { case RTM_GET: if(!scan) return; + /* Fall through */ case RTM_ADD: case RTM_DELETE: case RTM_CHANGE: @@ -1081,7 +1088,7 @@ krt_sock_open(pool *pool, void *data, int table_id UNUSED) sk->fd = fd; sk->data = data; - if (sk_open(sk) < 0) + if (sk_open(sk, &main_birdloop) < 0) bug("krt-sock: sk_open failed"); return sk; @@ -1241,7 +1248,7 @@ kif_update_sysdep_addr(struct iface *i) return 0; ip4_addr old = i->sysdep; - i->sysdep = ipa_to_ip4(ipa_from_sa4((sockaddr *) &ifr.ifr_addr)); + i->sysdep = ipa_to_ip4(ipa_from_sa4(&ifr.ifr_addr)); return !ip4_equal(i->sysdep, old); } diff --git a/sysdep/linux/krt-sys.h b/sysdep/linux/krt-sys.h index 8897f889..aa90f6e4 100644 --- a/sysdep/linux/krt-sys.h +++ b/sysdep/linux/krt-sys.h @@ -34,38 +34,6 @@ static inline struct ifa * kif_get_primary_ip(struct iface *i UNUSED) { return N #define KRT_ALLOW_MERGE_PATHS 1 -#define EA_KRT_PREFSRC EA_CODE(PROTOCOL_KERNEL, 0x10) -#define EA_KRT_REALM EA_CODE(PROTOCOL_KERNEL, 0x11) -#define EA_KRT_SCOPE EA_CODE(PROTOCOL_KERNEL, 0x12) - - -#define KRT_METRICS_MAX 0x10 /* RTAX_QUICKACK+1 */ -#define KRT_METRICS_OFFSET 0x20 /* Offset of EA_KRT_* vs RTAX_* */ - -#define KRT_FEATURES_MAX 4 - -/* - * Following attributes are parts of RTA_METRICS kernel route attribute, their - * ids must be consistent with their RTAX_* constants (+ KRT_METRICS_OFFSET) - */ -#define EA_KRT_METRICS EA_CODE(PROTOCOL_KERNEL, 0x20) /* Dummy one */ -#define EA_KRT_LOCK EA_CODE(PROTOCOL_KERNEL, 0x21) -#define EA_KRT_MTU EA_CODE(PROTOCOL_KERNEL, 0x22) -#define EA_KRT_WINDOW EA_CODE(PROTOCOL_KERNEL, 0x23) -#define EA_KRT_RTT EA_CODE(PROTOCOL_KERNEL, 0x24) -#define EA_KRT_RTTVAR EA_CODE(PROTOCOL_KERNEL, 0x25) -#define EA_KRT_SSTRESH EA_CODE(PROTOCOL_KERNEL, 0x26) -#define EA_KRT_CWND EA_CODE(PROTOCOL_KERNEL, 0x27) -#define EA_KRT_ADVMSS EA_CODE(PROTOCOL_KERNEL, 0x28) -#define EA_KRT_REORDERING EA_CODE(PROTOCOL_KERNEL, 0x29) -#define EA_KRT_HOPLIMIT EA_CODE(PROTOCOL_KERNEL, 0x2a) -#define EA_KRT_INITCWND EA_CODE(PROTOCOL_KERNEL, 0x2b) -#define EA_KRT_FEATURES EA_CODE(PROTOCOL_KERNEL, 0x2c) -#define EA_KRT_RTO_MIN EA_CODE(PROTOCOL_KERNEL, 0x2d) -#define EA_KRT_INITRWND EA_CODE(PROTOCOL_KERNEL, 0x2e) -#define EA_KRT_QUICKACK EA_CODE(PROTOCOL_KERNEL, 0x2f) - - struct krt_params { u32 table_id; /* Kernel table ID we sync with */ u32 metric; /* Kernel metric used for all routes */ diff --git a/sysdep/linux/netlink.Y b/sysdep/linux/netlink.Y index 487ad1d8..9b07e9bb 100644 --- a/sysdep/linux/netlink.Y +++ b/sysdep/linux/netlink.Y @@ -11,13 +11,12 @@ CF_HDR CF_DECLS CF_KEYWORDS(KERNEL, TABLE, METRIC, NETLINK, RX, BUFFER, - KRT_PREFSRC, KRT_REALM, KRT_SCOPE, KRT_MTU, KRT_WINDOW, - KRT_RTT, KRT_RTTVAR, KRT_SSTRESH, KRT_CWND, KRT_ADVMSS, KRT_REORDERING, - KRT_HOPLIMIT, KRT_INITCWND, KRT_RTO_MIN, KRT_INITRWND, KRT_QUICKACK, KRT_LOCK_MTU, KRT_LOCK_WINDOW, KRT_LOCK_RTT, KRT_LOCK_RTTVAR, KRT_LOCK_SSTRESH, KRT_LOCK_CWND, KRT_LOCK_ADVMSS, KRT_LOCK_REORDERING, KRT_LOCK_HOPLIMIT, KRT_LOCK_RTO_MIN, KRT_FEATURE_ECN, KRT_FEATURE_ALLFRAG) +%type <fab> attr_bit + CF_GRAMMAR kern_proto: kern_proto kern_sys_item ';' ; @@ -28,40 +27,42 @@ kern_sys_item: | NETLINK RX BUFFER expr { THIS_KRT->sys.netlink_rx_buffer = $4; } ; -dynamic_attr: KRT_PREFSRC { $$ = f_new_dynamic_attr(EAF_TYPE_IP_ADDRESS, T_IP, EA_KRT_PREFSRC); } ; -dynamic_attr: KRT_REALM { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_REALM); } ; -dynamic_attr: KRT_SCOPE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SCOPE); } ; - -dynamic_attr: KRT_MTU { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_MTU); } ; -dynamic_attr: KRT_WINDOW { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_WINDOW); } ; -dynamic_attr: KRT_RTT { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTT); } ; -dynamic_attr: KRT_RTTVAR { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTTVAR); } ; -dynamic_attr: KRT_SSTRESH { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SSTRESH); } ; -dynamic_attr: KRT_CWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_CWND); } ; -dynamic_attr: KRT_ADVMSS { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_ADVMSS); } ; -dynamic_attr: KRT_REORDERING { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_REORDERING); } ; -dynamic_attr: KRT_HOPLIMIT { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_HOPLIMIT); } ; -dynamic_attr: KRT_INITCWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_INITCWND); } ; -dynamic_attr: KRT_RTO_MIN { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTO_MIN); } ; -dynamic_attr: KRT_INITRWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_INITRWND); } ; -dynamic_attr: KRT_QUICKACK { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_QUICKACK); } ; - /* Bits of EA_KRT_LOCK, based on RTAX_* constants */ -dynamic_attr: KRT_LOCK_MTU { $$ = f_new_dynamic_attr_bit(2, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_WINDOW { $$ = f_new_dynamic_attr_bit(3, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_RTT { $$ = f_new_dynamic_attr_bit(4, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_RTTVAR { $$ = f_new_dynamic_attr_bit(5, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_SSTRESH { $$ = f_new_dynamic_attr_bit(6, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_CWND { $$ = f_new_dynamic_attr_bit(7, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_ADVMSS { $$ = f_new_dynamic_attr_bit(8, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_REORDERING { $$ = f_new_dynamic_attr_bit(9, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_HOPLIMIT { $$ = f_new_dynamic_attr_bit(10, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_RTO_MIN { $$ = f_new_dynamic_attr_bit(13, T_BOOL, EA_KRT_LOCK); } ; +attr_bit: KRT_LOCK_MTU { $$ = f_new_dynamic_attr_bit(2, "krt_lock"); } ; +attr_bit: KRT_LOCK_WINDOW { $$ = f_new_dynamic_attr_bit(3, "krt_lock"); } ; +attr_bit: KRT_LOCK_RTT { $$ = f_new_dynamic_attr_bit(4, "krt_lock"); } ; +attr_bit: KRT_LOCK_RTTVAR { $$ = f_new_dynamic_attr_bit(5, "krt_lock"); } ; +attr_bit: KRT_LOCK_SSTRESH { $$ = f_new_dynamic_attr_bit(6, "krt_lock"); } ; +attr_bit: KRT_LOCK_CWND { $$ = f_new_dynamic_attr_bit(7, "krt_lock"); } ; +attr_bit: KRT_LOCK_ADVMSS { $$ = f_new_dynamic_attr_bit(8, "krt_lock"); } ; +attr_bit: KRT_LOCK_REORDERING { $$ = f_new_dynamic_attr_bit(9, "krt_lock"); } ; +attr_bit: KRT_LOCK_HOPLIMIT { $$ = f_new_dynamic_attr_bit(10, "krt_lock"); } ; +attr_bit: KRT_LOCK_RTO_MIN { $$ = f_new_dynamic_attr_bit(13, "krt_lock"); } ; -dynamic_attr: KRT_FEATURE_ECN { $$ = f_new_dynamic_attr_bit(0, T_BOOL, EA_KRT_FEATURES); } ; -dynamic_attr: KRT_FEATURE_ALLFRAG { $$ = f_new_dynamic_attr(3, T_BOOL, EA_KRT_FEATURES); } ; +/* Bits of EA_KRT_FEATURES */ +attr_bit: KRT_FEATURE_ECN { $$ = f_new_dynamic_attr_bit(0, "krt_features"); } ; +attr_bit: KRT_FEATURE_ALLFRAG { $$ = f_new_dynamic_attr_bit(3, "krt_features"); } ; +/* Getting attribute bits (moved here to not confuse Bison on *BSD) */ +term: + attr_bit { + struct f_inst *c = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = (1U << $1.bit)}); + $$ = f_new_inst(FI_EQ, c, f_new_inst(FI_BITAND, f_new_inst(FI_EA_GET, $1.class), c)); + } + ; + +/* Setting attribute bits (moved here to not confuse Bison on *BSD) */ +cmd: + attr_bit '=' term ';' { + $$ = f_new_inst(FI_CONDITION, $3, + f_generate_complex_default(FI_BITOR, $1.class, + f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = (1U << $1.bit)}), 0), + f_generate_complex_default(FI_BITAND, $1.class, + f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = ~(1U << $1.bit)}), 0) + ); + } + ; CF_CODE diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 7f0d4736..e8a86ce4 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -16,7 +16,7 @@ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "lib/alloca.h" @@ -25,6 +25,7 @@ #include "lib/socket.h" #include "lib/string.h" #include "lib/hash.h" +#include "lib/macro.h" #include "conf/conf.h" #include CONFIG_INCLUDE_NLSYS_H @@ -43,6 +44,101 @@ struct nl_parse_state }; /* + * Netlink eattr definitions + */ + +#define KRT_METRICS_MAX ARRAY_SIZE(ea_krt_metrics) +#define KRT_FEATURES_MAX 4 + +static void krt_bitfield_format(const eattr *e, byte *buf, uint buflen); + +static struct ea_class + ea_krt_prefsrc = { + .name = "krt_prefsrc", + .type = T_IP, + }, + ea_krt_realm = { + .name = "krt_realm", + .type = T_INT, + }, + ea_krt_scope = { + .name = "krt_scope", + .type = T_INT, + }; + +static struct ea_class ea_krt_metrics[] = { + [RTAX_LOCK] = { + .name = "krt_lock", + .type = T_INT, + .format = krt_bitfield_format, + }, + [RTAX_FEATURES] = { + .name = "krt_features", + .type = T_INT, + .format = krt_bitfield_format, + }, +#define KRT_METRIC_INT(_rtax, _name) [_rtax] = { .name = _name, .type = T_INT } + KRT_METRIC_INT(RTAX_MTU, "krt_mtu"), + KRT_METRIC_INT(RTAX_WINDOW, "krt_window"), + KRT_METRIC_INT(RTAX_RTT, "krt_rtt"), + KRT_METRIC_INT(RTAX_RTTVAR, "krt_rttvar"), + KRT_METRIC_INT(RTAX_SSTHRESH, "krt_sstresh"), + KRT_METRIC_INT(RTAX_CWND, "krt_cwnd"), + KRT_METRIC_INT(RTAX_ADVMSS, "krt_advmss"), + KRT_METRIC_INT(RTAX_REORDERING, "krt_reordering"), + KRT_METRIC_INT(RTAX_HOPLIMIT, "krt_hoplimit"), + KRT_METRIC_INT(RTAX_INITCWND, "krt_initcwnd"), + KRT_METRIC_INT(RTAX_RTO_MIN, "krt_rto_min"), + KRT_METRIC_INT(RTAX_INITRWND, "krt_initrwnd"), + KRT_METRIC_INT(RTAX_QUICKACK, "krt_quickack"), +#undef KRT_METRIC_INT +}; + +static const char *krt_metrics_names[KRT_METRICS_MAX] = { + NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss", + "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack" +}; + +static const char *krt_features_names[KRT_FEATURES_MAX] = { + "ecn", NULL, NULL, "allfrag" +}; + +static void +krt_bitfield_format(const eattr *a, byte *buf, uint buflen) +{ + if (a->id == ea_krt_metrics[RTAX_LOCK].id) + ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX); + else if (a->id == ea_krt_metrics[RTAX_FEATURES].id) + ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX); +} + +static void +nl_ea_register(void) +{ + EA_REGISTER_ALL( + &ea_krt_prefsrc, + &ea_krt_realm, + &ea_krt_scope + ); + + for (uint i = 0; i < KRT_METRICS_MAX; i++) + { + if (!ea_krt_metrics[i].name) + ea_krt_metrics[i] = (struct ea_class) { + .name = mb_sprintf(&root_pool, "krt_metric_%d", i), + .type = T_INT, + }; + + ea_register_init(&ea_krt_metrics[i]); + } + + for (uint i = 1; i < KRT_METRICS_MAX; i++) + ASSERT_DIE(ea_krt_metrics[i].id == ea_krt_metrics[0].id + i); +} + + + +/* * Synchronous Netlink interface */ @@ -668,12 +764,12 @@ nl_add_nexthop(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af UNUS } static void -nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, ea_list *eattrs) +nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop_adata *nhad, int af, ea_list *eattrs) { struct rtattr *a = nl_open_attr(h, bufsize, RTA_MULTIPATH); - eattr *flow = ea_find(eattrs, EA_KRT_REALM); + eattr *flow = ea_find(eattrs, &ea_krt_realm); - for (; nh; nh = nh->next) + NEXTHOP_WALK(nh, nhad) { struct rtnexthop *rtnh = nl_open_nexthop(h, bufsize); @@ -697,31 +793,44 @@ nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, e nl_close_attr(h, a); } -static struct nexthop * +static struct nexthop_adata * nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr *n, struct rtattr *ra, int af, int krt_src) { struct rtattr *a[BIRD_RTA_MAX]; - struct rtnexthop *nh = RTA_DATA(ra); - struct nexthop *rv, *first, **last; - unsigned len = RTA_PAYLOAD(ra); + struct rtnexthop *nh, *orig_nh = RTA_DATA(ra); + unsigned len, orig_len = RTA_PAYLOAD(ra); + uint cnt = 0; + + /* First count the nexthops */ + for (len = orig_len, nh = orig_nh; len; len -= NLMSG_ALIGN(nh->rtnh_len), nh = RTNH_NEXT(nh)) + { + /* Use RTNH_OK(nh,len) ?? */ + if ((len < sizeof(*nh)) || (len < nh->rtnh_len)) + goto err; + + if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) + ; + else + cnt++; + } - first = NULL; - last = &first; + struct nexthop_adata *nhad = lp_allocz(s->pool, cnt * NEXTHOP_MAX_SIZE + sizeof *nhad); + struct nexthop *rv = &nhad->nh; - while (len) + for (len = orig_len, nh = orig_nh; len; len -= NLMSG_ALIGN(nh->rtnh_len), nh = RTNH_NEXT(nh)) { /* Use RTNH_OK(nh,len) ?? */ if ((len < sizeof(*nh)) || (len < nh->rtnh_len)) goto err; if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) - goto next; + continue; - *last = rv = lp_allocz(s->pool, NEXTHOP_MAX_SIZE); - last = &(rv->next); + *rv = (struct nexthop) { + .weight = nh->rtnh_hops, + .iface = if_find_by_index(nh->rtnh_ifindex), + }; - rv->weight = nh->rtnh_hops; - rv->iface = if_find_by_index(nh->rtnh_ifindex); if (!rv->iface) { log(L_ERR "KRT: Received route %N with unknown ifindex %u", n, nh->rtnh_ifindex); @@ -800,16 +909,14 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr } #endif - next: - len -= NLMSG_ALIGN(nh->rtnh_len); - nh = RTNH_NEXT(nh); + rv = NEXTHOP_NEXT(rv); } - /* Ensure nexthops are sorted to satisfy nest invariant */ - if (!nexthop_is_sorted(first)) - first = nexthop_sort(first); + /* Store final length */ + nhad->ad.length = (void *) rv - (void *) nhad->ad.data; - return first; + /* Ensure nexthops are sorted to satisfy nest invariant */ + return nexthop_is_sorted(nhad) ? nhad : nexthop_sort(nhad, s->pool); err: log(L_ERR "KRT: Received strange multipath route %N", n); @@ -1176,8 +1283,7 @@ kif_do_scan(struct kif_proto *p UNUSED) log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type); /* Re-resolve master interface for slaves */ - struct iface *i; - WALK_LIST(i, iface_list) + IFACE_WALK(i) if (i->master_index) { struct iface f = { @@ -1185,13 +1291,13 @@ kif_do_scan(struct kif_proto *p UNUSED) .mtu = i->mtu, .index = i->index, .master_index = i->master_index, - .master = if_find_by_index(i->master_index) + .master = if_find_by_index_locked(i->master_index) }; if (f.master != i->master) { memcpy(f.name, i->name, sizeof(f.name)); - if_update(&f); + if_update_locked(&f); } } @@ -1237,11 +1343,16 @@ HASH_DEFINE_REHASH_FN(RTH, struct krt_proto) int krt_capable(rte *e) { - rta *a = e->attrs; + eattr *ea = ea_find(e->attrs, &ea_gen_nexthop); + if (!ea) + return 0; - switch (a->dest) + struct nexthop_adata *nhad = (void *) ea->u.ptr; + if (NEXTHOP_IS_REACHABLE(nhad)) + return 1; + + switch (nhad->dest) { - case RTD_UNICAST: case RTD_BLACKHOLE: case RTD_UNREACHABLE: case RTD_PROHIBIT: @@ -1253,22 +1364,24 @@ krt_capable(rte *e) } static inline int -nh_bufsize(struct nexthop *nh) +nh_bufsize(struct nexthop_adata *nhad) { int rv = 0; - for (; nh != NULL; nh = nh->next) + NEXTHOP_WALK(nh, nhad) rv += RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr))); return rv; } static int -nl_send_route(struct krt_proto *p, rte *e, int op) +nl_send_route(struct krt_proto *p, const rte *e, int op) { eattr *ea; - net *net = e->net; - rta *a = e->attrs; - ea_list *eattrs = a->eattrs; - int bufsize = 128 + KRT_METRICS_MAX*8 + nh_bufsize(&(a->nh)); + ea_list *eattrs = e->attrs; + eattr *nhea = ea_find(eattrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); + + int bufsize = 128 + KRT_METRICS_MAX*8 + (nh ? nh_bufsize(nh) : 0); u32 priority = 0; struct { @@ -1280,7 +1393,7 @@ nl_send_route(struct krt_proto *p, rte *e, int op) int rsize = sizeof(*r) + bufsize; r = alloca(rsize); - DBG("nl_send_route(%N,op=%x)\n", net->n.addr, op); + DBG("nl_send_route(%N,op=%x)\n", e->net, op); bzero(&r->h, sizeof(r->h)); bzero(&r->r, sizeof(r->r)); @@ -1289,7 +1402,7 @@ nl_send_route(struct krt_proto *p, rte *e, int op) r->h.nlmsg_flags = op | NLM_F_REQUEST | NLM_F_ACK; r->r.rtm_family = p->af; - r->r.rtm_dst_len = net_pxlen(net->n.addr); + r->r.rtm_dst_len = net_pxlen(e->net); r->r.rtm_protocol = RTPROT_BIRD; r->r.rtm_scope = RT_SCOPE_NOWHERE; #ifdef HAVE_MPLS_KERNEL @@ -1301,7 +1414,7 @@ nl_send_route(struct krt_proto *p, rte *e, int op) * 2) Never use RTA_PRIORITY */ - u32 label = net_mpls(net->n.addr); + u32 label = net_mpls(e->net); nl_add_attr_mpls(&r->h, rsize, RTA_DST, 1, &label); r->r.rtm_scope = RT_SCOPE_UNIVERSE; r->r.rtm_type = RTN_UNICAST; @@ -1309,12 +1422,12 @@ nl_send_route(struct krt_proto *p, rte *e, int op) else #endif { - nl_add_attr_ipa(&r->h, rsize, RTA_DST, net_prefix(net->n.addr)); + nl_add_attr_ipa(&r->h, rsize, RTA_DST, net_prefix(e->net)); /* Add source address for IPv6 SADR routes */ - if (net->n.addr->type == NET_IP6_SADR) + if (e->net->type == NET_IP6_SADR) { - net_addr_ip6_sadr *a = (void *) &net->n.addr; + net_addr_ip6_sadr *a = (void *) &e->net; nl_add_attr_ip6(&r->h, rsize, RTA_SRC, a->src_prefix); r->r.rtm_src_len = a->src_pxlen; } @@ -1336,7 +1449,7 @@ nl_send_route(struct krt_proto *p, rte *e, int op) priority = 0; else if (KRT_CF->sys.metric) priority = KRT_CF->sys.metric; - else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC))) + else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, &ea_krt_metric))) priority = ea->u.data; if (priority) @@ -1349,17 +1462,17 @@ nl_send_route(struct krt_proto *p, rte *e, int op) /* Default scope is LINK for device routes, UNIVERSE otherwise */ if (p->af == AF_MPLS) r->r.rtm_scope = RT_SCOPE_UNIVERSE; - else if (ea = ea_find(eattrs, EA_KRT_SCOPE)) + else if (ea = ea_find(eattrs, &ea_krt_scope)) r->r.rtm_scope = ea->u.data; - else if (a->dest == RTD_UNICAST && ipa_zero(a->nh.gw)) + else if (dest == RTD_UNICAST && ipa_zero(nh->nh.gw)) r->r.rtm_scope = RT_SCOPE_LINK; else r->r.rtm_scope = RT_SCOPE_UNIVERSE; - if (ea = ea_find(eattrs, EA_KRT_PREFSRC)) + if (ea = ea_find(eattrs, &ea_krt_prefsrc)) nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data); - if (ea = ea_find(eattrs, EA_KRT_REALM)) + if (ea = ea_find(eattrs, &ea_krt_realm)) nl_add_attr_u32(&r->h, rsize, RTA_FLOW, ea->u.data); @@ -1367,9 +1480,9 @@ nl_send_route(struct krt_proto *p, rte *e, int op) metrics[0] = 0; struct ea_walk_state ews = { .eattrs = eattrs }; - while (ea = ea_walk(&ews, EA_KRT_METRICS, KRT_METRICS_MAX)) + while (ea = ea_walk(&ews, ea_krt_metrics[0].id, KRT_METRICS_MAX)) { - int id = ea->id - EA_KRT_METRICS; + int id = ea->id - ea_krt_metrics[0].id; metrics[0] |= 1 << id; metrics[id] = ea->u.data; } @@ -1377,19 +1490,18 @@ nl_send_route(struct krt_proto *p, rte *e, int op) if (metrics[0]) nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX); - switch (a->dest) + switch (dest) { case RTD_UNICAST: r->r.rtm_type = RTN_UNICAST; - struct nexthop *nh = &(a->nh); - if (nh->next) + if (!NEXTHOP_ONE(nh)) nl_add_multipath(&r->h, rsize, nh, p->af, eattrs); else { - nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->iface->index); - nl_add_nexthop(&r->h, rsize, nh, p->af); + nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->nh.iface->index); + nl_add_nexthop(&r->h, rsize, &nh->nh, p->af); - if (nh->flags & RNF_ONLINK) + if (nh->nh.flags & RNF_ONLINK) r->r.rtm_flags |= RTNH_F_ONLINK; } break; @@ -1435,12 +1547,15 @@ nl_allow_replace(struct krt_proto *p, rte *new) if (krt_ipv4(p)) return 1; - rta *a = new->attrs; - return (a->dest == RTD_UNICAST) && ipa_nonzero(a->nh.gw); + eattr *nhea = ea_find(new->attrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); + + return (dest == RTD_UNICAST) && ipa_nonzero(nh->nh.gw); } void -krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old) +krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const rte *old) { int err = 0; @@ -1593,107 +1708,120 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) krt_src = KRT_SRC_ALIEN; } - net_addr *n = &dst; + net_addr *net = &dst; if (p->p.net_type == NET_IP6_SADR) { - n = alloca(sizeof(net_addr_ip6_sadr)); - net_fill_ip6_sadr(n, net6_prefix(&dst), net6_pxlen(&dst), + net = alloca(sizeof(net_addr_ip6_sadr)); + net_fill_ip6_sadr(net, net6_prefix(&dst), net6_pxlen(&dst), net6_prefix(&src), net6_pxlen(&src)); } - net *net = net_get(p->p.main_channel->table, n); - - rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE); - ra->source = RTS_INHERIT; - ra->scope = SCOPE_UNIVERSE; - - { - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + 2 * sizeof(eattr)); - *ea = (ea_list) { .flags = EALF_SORTED, .count = 2 }; - ea->next = ra->eattrs; - ra->eattrs = ea; - - ea->attrs[0] = (eattr) { - .id = EA_KRT_SOURCE, - .type = EAF_TYPE_INT, - .u.data = i->rtm_protocol - }; - - ea->attrs[1] = (eattr) { - .id = EA_KRT_METRIC, - .type = EAF_TYPE_INT, - .u.data = priority, - }; - } + ea_list *ra = NULL; + ea_set_attr_u32(&ra, &ea_gen_source, 0, RTS_INHERIT); + ea_set_attr_u32(&ra, &ea_krt_source, 0, i->rtm_protocol); + ea_set_attr_u32(&ra, &ea_krt_metric, 0, priority); if (a[RTA_FLOW]) s->rta_flow = rta_get_u32(a[RTA_FLOW]); else s->rta_flow = 0; + union { + struct { + struct adata ad; + struct nexthop nh; + u32 labels[MPLS_MAX_LABEL_STACK]; + }; + struct nexthop_adata nhad; + } nhad = {}; + switch (i->rtm_type) { case RTN_UNICAST: - ra->dest = RTD_UNICAST; - if (a[RTA_MULTIPATH]) { - struct nexthop *nh = nl_parse_multipath(s, p, n, a[RTA_MULTIPATH], i->rtm_family, krt_src); + struct nexthop_adata *nh = nl_parse_multipath(s, p, net, a[RTA_MULTIPATH], i->rtm_family, krt_src); if (!nh) SKIP("strange RTA_MULTIPATH\n"); - nexthop_link(ra, nh); + ea_set_attr(&ra, EA_LITERAL_DIRECT_ADATA( + &ea_gen_nexthop, 0, &nh->ad)); break; } if ((i->rtm_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) SKIP("ignore RTNH_F_DEAD\n"); - ra->nh.iface = if_find_by_index(oif); - if (!ra->nh.iface) + nhad.nh.iface = if_find_by_index(oif); + if (!nhad.nh.iface) { - log(L_ERR "KRT: Received route %N with unknown ifindex %u", net->n.addr, oif); + log(L_ERR "KRT: Received route %N with unknown ifindex %u", net, oif); return; } if (a[RTA_GATEWAY]) - ra->nh.gw = rta_get_ipa(a[RTA_GATEWAY]); + nhad.nh.gw = rta_get_ipa(a[RTA_GATEWAY]); #ifdef HAVE_MPLS_KERNEL if (a[RTA_VIA]) - ra->nh.gw = rta_get_via(a[RTA_VIA]); + nhad.nh.gw = rta_get_via(a[RTA_VIA]); #endif if (i->rtm_flags & RTNH_F_ONLINK) - ra->nh.flags |= RNF_ONLINK; + nhad.nh.flags |= RNF_ONLINK; - if (ipa_nonzero(ra->nh.gw)) + if (ipa_nonzero(nhad.nh.gw)) { /* Silently skip strange 6to4 routes */ const net_addr_ip6 sit = NET_ADDR_IP6(IP6_NONE, 96); - if ((i->rtm_family == AF_INET6) && ipa_in_netX(ra->nh.gw, (net_addr *) &sit)) + if ((i->rtm_family == AF_INET6) && ipa_in_netX(nhad.nh.gw, (net_addr *) &sit)) return; neighbor *nbr; - nbr = neigh_find(&p->p, ra->nh.gw, ra->nh.iface, - (ra->nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); + nbr = neigh_find(&p->p, nhad.nh.gw, nhad.nh.iface, + (nhad.nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); if (!nbr || (nbr->scope == SCOPE_HOST)) { - log(L_ERR "KRT: Received route %N with strange next-hop %I", net->n.addr, - ra->nh.gw); + log(L_ERR "KRT: Received route %N with strange next-hop %I", net, + nhad.nh.gw); return; } } +#ifdef HAVE_MPLS_KERNEL + if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !a[RTA_MULTIPATH]) + nhad.nh.labels = rta_get_mpls(a[RTA_NEWDST], nhad.nh.label); + + if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !a[RTA_MULTIPATH]) + { + switch (rta_get_u16(a[RTA_ENCAP_TYPE])) + { + case LWTUNNEL_ENCAP_MPLS: + { + struct rtattr *enca[BIRD_RTA_MAX]; + nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]); + nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca)); + nhad.nh.labels = rta_get_mpls(enca[RTA_DST], nhad.nh.label); + break; + } + default: + SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE])); + break; + } + } +#endif + + /* Finalize the nexthop */ + nhad.ad.length = (void *) NEXTHOP_NEXT(&nhad.nh) - (void *) nhad.ad.data; break; case RTN_BLACKHOLE: - ra->dest = RTD_BLACKHOLE; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_BLACKHOLE); break; case RTN_UNREACHABLE: - ra->dest = RTD_UNREACHABLE; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_UNREACHABLE); break; case RTN_PROHIBIT: - ra->dest = RTD_PROHIBIT; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_PROHIBIT); break; /* FIXME: What about RTN_THROW? */ default: @@ -1701,118 +1829,50 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) return; } -#ifdef HAVE_MPLS_KERNEL - if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !ra->nh.next) - ra->nh.labels = rta_get_mpls(a[RTA_NEWDST], ra->nh.label); - - if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !ra->nh.next) - { - switch (rta_get_u16(a[RTA_ENCAP_TYPE])) - { - case LWTUNNEL_ENCAP_MPLS: - { - struct rtattr *enca[BIRD_RTA_MAX]; - nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]); - nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca)); - ra->nh.labels = rta_get_mpls(enca[RTA_DST], ra->nh.label); - break; - } - default: - SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE])); - break; - } - } -#endif + if (nhad.ad.length) + ea_set_attr(&ra, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhad.ad)); if (i->rtm_scope != def_scope) - { - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = ra->eattrs; - ra->eattrs = ea; - ea->flags = EALF_SORTED; - ea->count = 1; - ea->attrs[0] = (eattr) { - .id = EA_KRT_SCOPE, - .flags = 0, - .type = EAF_TYPE_INT, - .u.data = i->rtm_scope, - }; - } + ea_set_attr(&ra, + EA_LITERAL_EMBEDDED(&ea_krt_scope, 0, i->rtm_scope)); if (a[RTA_PREFSRC]) - { - ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]); - - struct adata *ad = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps)); - ad->length = sizeof(ps); - memcpy(ad->data, &ps, sizeof(ps)); - - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = ra->eattrs; - ra->eattrs = ea; - ea->flags = EALF_SORTED; - ea->count = 1; - ea->attrs[0] = (eattr) { - .id = EA_KRT_PREFSRC, - .flags = 0, - .type = EAF_TYPE_IP_ADDRESS, - .u.ptr = ad, - }; - } + { + ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]); + + ea_set_attr(&ra, + EA_LITERAL_STORE_ADATA(&ea_krt_prefsrc, 0, &ps, sizeof(ps))); + } /* Can be set per-route or per-nexthop */ if (s->rta_flow) - { - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = ra->eattrs; - ra->eattrs = ea; - ea->flags = EALF_SORTED; - ea->count = 1; - ea->attrs[0] = (eattr) { - .id = EA_KRT_REALM, - .flags = 0, - .type = EAF_TYPE_INT, - .u.data = s->rta_flow, - }; - } + ea_set_attr(&ra, + EA_LITERAL_EMBEDDED(&ea_krt_realm, 0, s->rta_flow)); if (a[RTA_METRICS]) { u32 metrics[KRT_METRICS_MAX]; - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr)); - int t, n = 0; - if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0) { - log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net->n.addr); + log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net); return; } - for (t = 1; t < KRT_METRICS_MAX; t++) + for (uint t = 1; t < KRT_METRICS_MAX; t++) if (metrics[0] & (1 << t)) - ea->attrs[n++] = (eattr) { - .id = EA_CODE(PROTOCOL_KERNEL, KRT_METRICS_OFFSET + t), - .flags = 0, - .type = EAF_TYPE_INT, /* FIXME: Some are EAF_TYPE_BITFIELD */ - .u.data = metrics[t], - }; - - if (n > 0) - { - ea->next = ra->eattrs; - ea->flags = EALF_SORTED; - ea->count = n; - ra->eattrs = ea; - } + ea_set_attr(&ra, + EA_LITERAL_EMBEDDED(&ea_krt_metrics[t], 0, metrics[t])); } - rte *e = rte_get_temp(ra, p->p.main_source); - e->net = net; + rte e0 = { + .net = net, + .attrs = ra, + }; if (s->scan) - krt_got_route(p, e, krt_src); + krt_got_route(p, &e0, krt_src); else - krt_got_route_async(p, e, new, krt_src); + krt_got_route_async(p, &e0, new, krt_src); lp_flush(s->pool); } @@ -1983,7 +2043,7 @@ nl_open_async(void) sk->rx_hook = nl_async_hook; sk->err_hook = nl_async_err_hook; sk->fd = fd; - if (sk_open(sk) < 0) + if (sk_open(sk, &main_birdloop) < 0) bug("Netlink: sk_open failed"); } @@ -2023,6 +2083,8 @@ krt_sys_io_init(void) { nl_linpool = lp_new_default(krt_pool); HASH_INIT(nl_table_map, krt_pool, 6); + + nl_ea_register(); } int @@ -2076,56 +2138,6 @@ krt_sys_copy_config(struct krt_config *d, struct krt_config *s) d->sys.metric = s->sys.metric; } -static const char *krt_metrics_names[KRT_METRICS_MAX] = { - NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss", - "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack" -}; - -static const char *krt_features_names[KRT_FEATURES_MAX] = { - "ecn", NULL, NULL, "allfrag" -}; - -int -krt_sys_get_attr(const eattr *a, byte *buf, int buflen UNUSED) -{ - switch (a->id) - { - case EA_KRT_PREFSRC: - bsprintf(buf, "prefsrc"); - return GA_NAME; - - case EA_KRT_REALM: - bsprintf(buf, "realm"); - return GA_NAME; - - case EA_KRT_SCOPE: - bsprintf(buf, "scope"); - return GA_NAME; - - case EA_KRT_LOCK: - buf += bsprintf(buf, "lock:"); - ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX); - return GA_FULL; - - case EA_KRT_FEATURES: - buf += bsprintf(buf, "features:"); - ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX); - return GA_FULL; - - default:; - int id = (int)EA_ID(a->id) - KRT_METRICS_OFFSET; - if (id > 0 && id < KRT_METRICS_MAX) - { - bsprintf(buf, "%s", krt_metrics_names[id]); - return GA_NAME; - } - - return GA_UNKNOWN; - } -} - - - void kif_sys_start(struct kif_proto *p UNUSED) { diff --git a/sysdep/unix/Makefile b/sysdep/unix/Makefile index d0d36b5f..6f6b0d26 100644 --- a/sysdep/unix/Makefile +++ b/sysdep/unix/Makefile @@ -1,4 +1,4 @@ -src := alloc.c io.c krt.c log.c main.c random.c +src := alloc.c io.c io-loop.c krt.c log.c main.c random.c domain.c obj := $(src-o-files) $(all-daemon) $(cf-local) diff --git a/sysdep/unix/alloc.c b/sysdep/unix/alloc.c index c8f1c83f..cafcc8dd 100644 --- a/sysdep/unix/alloc.c +++ b/sysdep/unix/alloc.c @@ -10,6 +10,7 @@ #include "lib/resource.h" #include "lib/lists.h" #include "lib/event.h" +#include "lib/rcu.h" #include <errno.h> #include <stdlib.h> @@ -29,56 +30,55 @@ long page_size = 0; #ifdef HAVE_MMAP -#define KEEP_PAGES_MAIN_MAX 256 -#define KEEP_PAGES_MAIN_MIN 8 -#define CLEANUP_PAGES_BULK 256 +#define KEEP_PAGES_MAX 512 +#define KEEP_PAGES_MIN 32 +#define KEEP_PAGES_MAX_LOCAL 16 +#define ALLOC_PAGES_AT_ONCE 8 -STATIC_ASSERT(KEEP_PAGES_MAIN_MIN * 4 < KEEP_PAGES_MAIN_MAX); +STATIC_ASSERT(KEEP_PAGES_MIN * 4 < KEEP_PAGES_MAX); +STATIC_ASSERT(ALLOC_PAGES_AT_ONCE < KEEP_PAGES_MAX_LOCAL); static _Bool use_fake = 0; +static _Bool initialized = 0; #if DEBUGGING struct free_page { node unused[42]; - node n; + struct free_page * _Atomic next; }; #else struct free_page { - node n; + struct free_page * _Atomic next; }; #endif #define EP_POS_MAX ((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *)) struct empty_pages { - node n; + struct empty_pages *next; uint pos; void *pages[0]; }; -struct free_pages { - list pages; /* List of (struct free_page) keeping free pages without releasing them (hot) */ - list empty; /* List of (struct empty_pages) keeping invalidated pages mapped for us (cold) */ - u16 min, max; /* Minimal and maximal number of free pages kept */ - uint cnt; /* Number of free pages in list */ - event cleanup; -}; +DEFINE_DOMAIN(resource); +static DOMAIN(resource) empty_pages_domain; +static struct empty_pages *empty_pages = NULL; -static void global_free_pages_cleanup_event(void *); -static void *alloc_cold_page(void); +static struct free_page * _Atomic page_stack = NULL; +static _Thread_local struct free_page * local_page_stack = NULL; -static struct free_pages global_free_pages = { - .min = KEEP_PAGES_MAIN_MIN, - .max = KEEP_PAGES_MAIN_MAX, - .cleanup = { .hook = global_free_pages_cleanup_event }, -}; +static void page_cleanup(void *); +static event page_cleanup_event = { .hook = page_cleanup, }; +#define SCHEDULE_CLEANUP do if (initialized && !shutting_down) ev_send(&global_event_list, &page_cleanup_event); while (0) -uint *pages_kept = &global_free_pages.cnt; +_Atomic int pages_kept = 0; +_Atomic int pages_kept_locally = 0; +static _Thread_local int pages_kept_here = 0; static void * alloc_sys_page(void) { - void *ptr = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + void *ptr = mmap(NULL, page_size * ALLOC_PAGES_AT_ONCE, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr == MAP_FAILED) die("mmap(%ld) failed: %m", (s64) page_size); @@ -108,45 +108,57 @@ alloc_page(void) } #ifdef HAVE_MMAP - struct free_pages *fps = &global_free_pages; - - /* If there is any free page kept hot, we use it. */ - if (fps->cnt) + /* If there is any free page kept hot in this thread, we use it. */ + struct free_page *fp = local_page_stack; + if (fp) { - struct free_page *fp = SKIP_BACK(struct free_page, n, HEAD(fps->pages)); - rem_node(&fp->n); + local_page_stack = atomic_load_explicit(&fp->next, memory_order_acquire); + atomic_fetch_sub_explicit(&pages_kept_locally, 1, memory_order_relaxed); + pages_kept_here--; + return fp; + } - /* If the hot-free-page cache is getting short, request the cleanup routine to replenish the cache */ - if ((--fps->cnt < fps->min) && !shutting_down) - ev_schedule(&fps->cleanup); + ASSERT_DIE(pages_kept_here == 0); + /* If there is any free page kept hot in global storage, we use it. */ + rcu_read_lock(); + fp = atomic_load_explicit(&page_stack, memory_order_acquire); + while (fp && !atomic_compare_exchange_strong_explicit( + &page_stack, &fp, atomic_load_explicit(&fp->next, memory_order_acquire), + memory_order_acq_rel, memory_order_acquire)) + ; + rcu_read_unlock(); + + if (fp) + { + atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed); return fp; } - else - return alloc_cold_page(); -} - -static void * -alloc_cold_page(void) -{ - struct free_pages *fps = &global_free_pages; /* If there is any free page kept cold, we use that. */ - if (!EMPTY_LIST(fps->empty)) - { - struct empty_pages *ep = HEAD(fps->empty); + LOCK_DOMAIN(resource, empty_pages_domain); + if (empty_pages) { + if (empty_pages->pos) + /* Either the keeper page contains at least one cold page pointer, return that */ + fp = empty_pages->pages[--empty_pages->pos]; + else + { + /* Or the keeper page has no more cold page pointer, return the keeper page */ + fp = (struct free_page *) empty_pages; + empty_pages = empty_pages->next; + } + } + UNLOCK_DOMAIN(resource, empty_pages_domain); - /* Either the keeper page contains at least one cold page pointer, return that */ - if (ep->pos) - return ep->pages[--ep->pos]; + if (fp) + return fp; - /* Or the keeper page has no more cold page pointer, return the keeper page */ - rem_node(&ep->n); - return ep; - } + /* And in the worst case, allocate some new pages by mmap() */ + void *ptr = alloc_sys_page(); + for (int i=1; i<ALLOC_PAGES_AT_ONCE; i++) + free_page(ptr + page_size * i); - /* And in the worst case, allocate a new page by mmap() */ - return alloc_sys_page(); + return ptr; #endif } @@ -161,53 +173,108 @@ free_page(void *ptr) } #ifdef HAVE_MMAP - struct free_pages *fps = &global_free_pages; + /* We primarily try to keep the pages locally. */ struct free_page *fp = ptr; + if (shutting_down || (pages_kept_here < KEEP_PAGES_MAX_LOCAL)) + { + atomic_store_explicit(&fp->next, local_page_stack, memory_order_relaxed); + local_page_stack = fp; - /* Otherwise, we add the free page to the hot-free-page list */ - fp->n = (node) {}; - add_tail(&fps->pages, &fp->n); + atomic_fetch_add_explicit(&pages_kept_locally, 1, memory_order_relaxed); + pages_kept_here++; + return; + } + + /* If there are too many local pages, we add the free page to the global hot-free-page list */ + rcu_read_lock(); + struct free_page *next = atomic_load_explicit(&page_stack, memory_order_acquire); - /* And if there are too many hot free pages, we ask for page cleanup */ - if ((++fps->cnt > fps->max) && !shutting_down) - ev_schedule(&fps->cleanup); + do atomic_store_explicit(&fp->next, next, memory_order_release); + while (!atomic_compare_exchange_strong_explicit( + &page_stack, &next, fp, + memory_order_acq_rel, memory_order_acquire)); + rcu_read_unlock(); + + /* And if there are too many global hot free pages, we ask for page cleanup */ + if (atomic_fetch_add_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX) + SCHEDULE_CLEANUP; #endif } +/* When the routine is going to sleep for a long time, we flush the local + * hot page cache to not keep dirty pages for nothing. */ +void +flush_local_pages(void) +{ + if (use_fake || !local_page_stack || shutting_down) + return; + + /* We first count the pages to enable consistency checking. + * Also, we need to know the last page. */ + struct free_page *last = local_page_stack, *next; + int check_count = 1; + while (next = atomic_load_explicit(&last->next, memory_order_acquire)) + { + check_count++; + last = next; + } + + /* The actual number of pages must be equal to the counter value. */ + ASSERT_DIE(check_count == pages_kept_here); + + /* Repeatedly trying to insert the whole page list into global page stack at once. */ + rcu_read_lock(); + next = atomic_load_explicit(&page_stack, memory_order_acquire); + + /* First we set the outwards pointer (from our last), + * then we try to set the inwards pointer to our first page. */ + do atomic_store_explicit(&last->next, next, memory_order_release); + while (!atomic_compare_exchange_strong_explicit( + &page_stack, &next, local_page_stack, + memory_order_acq_rel, memory_order_acquire)); + rcu_read_unlock(); + + /* Finished. Now the local stack is empty. */ + local_page_stack = NULL; + pages_kept_here = 0; + + /* Check the state of global page cache and maybe schedule its cleanup. */ + atomic_fetch_sub_explicit(&pages_kept_locally, check_count, memory_order_relaxed); + if (atomic_fetch_add_explicit(&pages_kept, check_count, memory_order_relaxed) >= KEEP_PAGES_MAX) + SCHEDULE_CLEANUP; +} + #ifdef HAVE_MMAP static void -global_free_pages_cleanup_event(void *data UNUSED) +page_cleanup(void *_ UNUSED) { /* Cleanup on shutdown is ignored. All pages may be kept hot, OS will take care. */ if (shutting_down) return; - struct free_pages *fps = &global_free_pages; + struct free_page *stack = atomic_exchange_explicit(&page_stack, NULL, memory_order_acq_rel); + if (!stack) + return; - /* Cleanup may get called when hot free page cache is short of pages. Replenishing. */ - while (fps->cnt / 2 < fps->min) - free_page(alloc_cold_page()); - /* Or the hot free page cache is too big. Moving some pages to the cold free page cache. */ - for (int limit = CLEANUP_PAGES_BULK; limit && (fps->cnt > fps->max / 2); fps->cnt--, limit--) - { - struct free_page *fp = SKIP_BACK(struct free_page, n, TAIL(fps->pages)); - rem_node(&fp->n); + do { + synchronize_rcu(); + struct free_page *fp = stack; + stack = atomic_load_explicit(&fp->next, memory_order_acquire); + LOCK_DOMAIN(resource, empty_pages_domain); /* Empty pages are stored as pointers. To store them, we need a pointer block. */ - struct empty_pages *ep; - if (EMPTY_LIST(fps->empty) || ((ep = HEAD(fps->empty))->pos == EP_POS_MAX)) + if (!empty_pages || (empty_pages->pos == EP_POS_MAX)) { /* There is either no pointer block or the last block is full. We use this block as a pointer block. */ - ep = (struct empty_pages *) fp; - *ep = (struct empty_pages) {}; - add_head(&fps->empty, &ep->n); + empty_pages = (struct empty_pages *) fp; + *empty_pages = (struct empty_pages) {}; } else { /* We store this block as a pointer into the first free place * and tell the OS that the underlying memory is trash. */ - ep->pages[ep->pos++] = fp; + empty_pages->pages[empty_pages->pos++] = fp; if (madvise(fp, page_size, #ifdef CONFIG_MADV_DONTNEED_TO_FREE MADV_DONTNEED @@ -217,12 +284,18 @@ global_free_pages_cleanup_event(void *data UNUSED) ) < 0) bug("madvise(%p) failed: %m", fp); } + UNLOCK_DOMAIN(resource, empty_pages_domain); } + while ((atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX / 2) && stack); - /* If the hot free page cleanup hit the limit, re-schedule this routine - * to allow for other routines to run. */ - if (fps->cnt > fps->max) - ev_schedule(&fps->cleanup); + while (stack) + { + struct free_page *f = stack; + stack = atomic_load_explicit(&f->next, memory_order_acquire); + free_page(f); + + atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed); + } } #endif @@ -236,8 +309,6 @@ resource_sys_init(void) #endif #ifdef HAVE_MMAP - ASSERT_DIE(global_free_pages.cnt == 0); - /* Check what page size the system supports */ if (!(page_size = sysconf(_SC_PAGESIZE))) die("System page size must be non-zero"); @@ -247,11 +318,8 @@ resource_sys_init(void) /* We assume that page size has only one bit and is between 1K and 256K (incl.). * Otherwise, the assumptions in lib/slab.c (sl_head's num_full range) aren't met. */ - struct free_pages *fps = &global_free_pages; - - init_list(&fps->pages); - init_list(&fps->empty); - global_free_pages_cleanup_event(NULL); + empty_pages_domain = DOMAIN_NEW(resource, "Empty Pages"); + initialized = 1; return; } @@ -261,4 +329,5 @@ resource_sys_init(void) #endif page_size = 4096; + initialized = 1; } diff --git a/sysdep/unix/config.Y b/sysdep/unix/config.Y index 5c4b5bef..a50ec757 100644 --- a/sysdep/unix/config.Y +++ b/sysdep/unix/config.Y @@ -101,6 +101,12 @@ mrtdump_base: ; +conf: THREADS expr { + if ($2 < 1) cf_error("Number of threads must be at least one."); + new_config->thread_count = $2; +} + + conf: debug_unix ; debug_unix: @@ -145,6 +151,11 @@ CF_CLI_HELP(GRACEFUL, restart, [[Shut the daemon down for graceful restart]]) CF_CLI(GRACEFUL RESTART,,, [[Shut the daemon down for graceful restart]]) { cmd_graceful_restart(); } ; +CF_CLI(SHOW THREADS,,, [[Write out thread information]]) +{ cmd_show_threads(0); } ; + +CF_CLI(SHOW THREADS ALL,,, [[Write out thread and IO loop information]]) +{ cmd_show_threads(1); } ; cfg_name: /* empty */ { $$ = NULL; } diff --git a/sysdep/unix/domain.c b/sysdep/unix/domain.c new file mode 100644 index 00000000..f4ee595d --- /dev/null +++ b/sysdep/unix/domain.c @@ -0,0 +1,126 @@ +/* + * BIRD Locking + * + * (c) 2020 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#undef LOCAL_DEBUG + +#undef DEBUG_LOCKING + +#include "lib/birdlib.h" +#include "lib/locking.h" +#include "lib/resource.h" +#include "lib/timer.h" + +#include "conf/conf.h" + +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <pthread.h> +#include <semaphore.h> +#include <stdatomic.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +/* + * Locking subsystem + */ + +_Thread_local struct lock_order locking_stack = {}; +_Thread_local struct domain_generic **last_locked = NULL; + +#define ASSERT_NO_LOCK ASSERT_DIE(last_locked == NULL) + +struct domain_generic { + pthread_mutex_t mutex; + uint order; + struct domain_generic **prev; + struct lock_order *locked_by; + const char *name; +}; + +#define DOMAIN_INIT(_name, _order) { .mutex = PTHREAD_MUTEX_INITIALIZER, .name = _name, .order = _order } + +static struct domain_generic the_bird_domain_gen = DOMAIN_INIT("The BIRD", OFFSETOF(struct lock_order, the_bird)); + +DOMAIN(the_bird) the_bird_domain = { .the_bird = &the_bird_domain_gen }; + +struct domain_generic * +domain_new(const char *name, uint order) +{ + ASSERT_DIE(order < sizeof(struct lock_order)); + struct domain_generic *dg = xmalloc(sizeof(struct domain_generic)); + *dg = (struct domain_generic) DOMAIN_INIT(name, order); + return dg; +} + +void +domain_free(struct domain_generic *dg) +{ + pthread_mutex_destroy(&dg->mutex); + xfree(dg); +} + +const char * +domain_name(struct domain_generic *dg) +{ + return dg->name; +} + +uint dg_order(struct domain_generic *dg) +{ + return dg->order; +} + +void do_lock(struct domain_generic *dg, struct domain_generic **lsp) +{ + struct lock_order stack_copy; + memcpy(&stack_copy, &locking_stack, sizeof(stack_copy)); + struct domain_generic **lll = last_locked; + + if ((char *) lsp - (char *) &locking_stack != dg->order) + bug("Trying to lock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack); + + if (lsp <= last_locked) + bug("Trying to lock in a bad order: %p %p", &stack_copy, lll); + if (*lsp) + bug("Inconsistent locking stack state on lock"); + + btime lock_begin = current_time(); + pthread_mutex_lock(&dg->mutex); + btime duration = current_time() - lock_begin; + if (config && (duration > config->watchdog_warning)) + log(L_WARN "Locking of %s took %d ms", dg->name, (int) (duration TO_MS)); + + if (dg->prev || dg->locked_by) + bug("Previous unlock not finished correctly"); + dg->prev = last_locked; + *lsp = dg; + last_locked = lsp; + dg->locked_by = &locking_stack; +} + +void do_unlock(struct domain_generic *dg, struct domain_generic **lsp) +{ + if ((char *) lsp - (char *) &locking_stack != dg->order) + bug("Trying to unlock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack); + + if (dg->locked_by != &locking_stack) + bug("Inconsistent domain state on unlock"); + if ((last_locked != lsp) || (*lsp != dg)) + bug("Inconsistent locking stack state on unlock"); + dg->locked_by = NULL; + last_locked = dg->prev; + *lsp = NULL; + dg->prev = NULL; + pthread_mutex_unlock(&dg->mutex); +} diff --git a/sysdep/unix/io-loop.c b/sysdep/unix/io-loop.c new file mode 100644 index 00000000..2532fe33 --- /dev/null +++ b/sysdep/unix/io-loop.c @@ -0,0 +1,1296 @@ +/* + * BIRD -- I/O and event loop + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <pthread.h> +#include <time.h> +#include <sys/time.h> + +#include "nest/bird.h" + +#include "lib/buffer.h" +#include "lib/lists.h" +#include "lib/locking.h" +#include "lib/resource.h" +#include "lib/event.h" +#include "lib/timer.h" +#include "lib/socket.h" + +#include "lib/io-loop.h" +#include "sysdep/unix/io-loop.h" +#include "conf/conf.h" +#include "nest/cli.h" + +#define THREAD_STACK_SIZE 65536 /* To be lowered in near future */ + +static struct birdloop *birdloop_new_internal(pool *pp, uint order, const char *name, int request_pickup); + +/* + * Nanosecond time for accounting purposes + * + * A fixed point on startup is set as zero, all other values are relative to that. + * Caution: this overflows after like 500 years or so. If you plan to run + * BIRD for such a long time, please implement some means of overflow prevention. + */ + +static struct timespec ns_begin; + +static void ns_init(void) +{ + if (clock_gettime(CLOCK_MONOTONIC, &ns_begin)) + bug("clock_gettime: %m"); +} + +static u64 ns_now(void) +{ + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC, &ts)) + bug("clock_gettime: %m"); + + return (u64) (ts.tv_sec - ns_begin.tv_sec) * 1000000000 + ts.tv_nsec - ns_begin.tv_nsec; +} + + +/* + * Current thread context + */ + +_Thread_local struct birdloop *birdloop_current; +static _Thread_local struct birdloop *birdloop_wakeup_masked; +static _Thread_local uint birdloop_wakeup_masked_count; + +#define LOOP_NAME(loop) domain_name((loop)->time.domain) + +#define LOOP_TRACE(loop, fmt, args...) do { if (config && config->latency_debug) log(L_TRACE "%s (%p): " fmt, LOOP_NAME(loop), (loop), ##args); } while (0) +#define THREAD_TRACE(...) do { if (config && config->latency_debug) log(L_TRACE "Thread: " __VA_ARGS__); } while (0) + +#define LOOP_WARN(loop, fmt, args...) log(L_TRACE "%s (%p): " fmt, LOOP_NAME(loop), (loop), ##args) + + +event_list * +birdloop_event_list(struct birdloop *loop) +{ + return &loop->event_list; +} + +struct timeloop * +birdloop_time_loop(struct birdloop *loop) +{ + return &loop->time; +} + +_Bool +birdloop_inside(struct birdloop *loop) +{ + for (struct birdloop *c = birdloop_current; c; c = c->prev_loop) + if (loop == c) + return 1; + + return 0; +} + +_Bool +birdloop_in_this_thread(struct birdloop *loop) +{ + return pthread_equal(pthread_self(), loop->thread->thread_id); +} + +void +birdloop_flag(struct birdloop *loop, u32 flag) +{ + atomic_fetch_or_explicit(&loop->flags, flag, memory_order_acq_rel); + birdloop_ping(loop); +} + +void +birdloop_flag_set_handler(struct birdloop *loop, struct birdloop_flag_handler *fh) +{ + ASSERT_DIE(birdloop_inside(loop)); + loop->flag_handler = fh; +} + +static int +birdloop_process_flags(struct birdloop *loop) +{ + if (!loop->flag_handler) + return 0; + + u32 flags = atomic_exchange_explicit(&loop->flags, 0, memory_order_acq_rel); + if (!flags) + return 0; + + loop->flag_handler->hook(loop->flag_handler, flags); + return 1; +} + +/* + * Wakeup code for birdloop + */ + +void +pipe_new(struct pipe *p) +{ + int rv = pipe(p->fd); + if (rv < 0) + die("pipe: %m"); + + if (fcntl(p->fd[0], F_SETFL, O_NONBLOCK) < 0) + die("fcntl(O_NONBLOCK): %m"); + + if (fcntl(p->fd[1], F_SETFL, O_NONBLOCK) < 0) + die("fcntl(O_NONBLOCK): %m"); +} + +void +pipe_drain(struct pipe *p) +{ + while (1) { + char buf[64]; + int rv = read(p->fd[0], buf, sizeof(buf)); + if ((rv < 0) && (errno == EAGAIN)) + return; + + if (rv == 0) + bug("wakeup read eof"); + if ((rv < 0) && (errno != EINTR)) + bug("wakeup read: %m"); + } +} + +int +pipe_read_one(struct pipe *p) +{ + while (1) { + char v; + int rv = read(p->fd[0], &v, sizeof(v)); + if (rv == 1) + return 1; + if ((rv < 0) && (errno == EAGAIN)) + return 0; + if (rv > 1) + bug("wakeup read more bytes than expected: %d", rv); + if (rv == 0) + bug("wakeup read eof"); + if (errno != EINTR) + bug("wakeup read: %m"); + } +} + +void +pipe_kick(struct pipe *p) +{ + char v = 1; + int rv; + + while (1) { + rv = write(p->fd[1], &v, sizeof(v)); + if ((rv >= 0) || (errno == EAGAIN)) + return; + if (errno != EINTR) + bug("wakeup write: %m"); + } +} + +void +pipe_pollin(struct pipe *p, struct pfd *pfd) +{ + BUFFER_PUSH(pfd->pfd) = (struct pollfd) { + .fd = p->fd[0], + .events = POLLIN, + }; + BUFFER_PUSH(pfd->loop) = NULL; +} + +static inline void +wakeup_init(struct bird_thread *loop) +{ + pipe_new(&loop->wakeup); +} + +static inline void +wakeup_drain(struct bird_thread *loop) +{ + pipe_drain(&loop->wakeup); +} + +static inline void +wakeup_do_kick(struct bird_thread *loop) +{ + pipe_kick(&loop->wakeup); +} + +static inline _Bool +birdloop_try_ping(struct birdloop *loop, u32 ltt) +{ + /* Somebody else is already pinging, be idempotent */ + if (ltt & LTT_PING) + { + LOOP_TRACE(loop, "already being pinged"); + return 0; + } + + /* Thread moving is an implicit ping */ + if (ltt & LTT_MOVE) + { + LOOP_TRACE(loop, "ping while moving"); + return 1; + } + + /* No more flags allowed */ + ASSERT_DIE(!ltt); + + /* No ping when not picked up */ + if (!loop->thread) + { + LOOP_TRACE(loop, "not picked up yet, can't ping"); + return 1; + } + + /* No ping when masked */ + if (loop == birdloop_wakeup_masked) + { + LOOP_TRACE(loop, "wakeup masked, can't ping"); + birdloop_wakeup_masked_count++; + return 1; + } + + /* Send meta event to ping */ + if ((loop != loop->thread->meta) && (loop != &main_birdloop)) + { + LOOP_TRACE(loop, "Ping by meta event to %p", loop->thread->meta); + ev_send_loop(loop->thread->meta, &loop->event); + return 1; + } + + /* Do the real ping */ + LOOP_TRACE(loop, "sending pipe ping"); + wakeup_do_kick(loop->thread); + return 0; +} + +static inline void +birdloop_do_ping(struct birdloop *loop) +{ + /* Register our ping effort */ + u32 ltt = atomic_fetch_or_explicit(&loop->thread_transition, LTT_PING, memory_order_acq_rel); + + /* Try to ping in multiple ways */ + if (birdloop_try_ping(loop, ltt)) + atomic_fetch_and_explicit(&loop->thread_transition, ~LTT_PING, memory_order_acq_rel); +} + +void +birdloop_ping(struct birdloop *loop) +{ + if (!birdloop_inside(loop)) + { + LOOP_TRACE(loop, "ping from outside"); + birdloop_do_ping(loop); + } + else + { + LOOP_TRACE(loop, "ping from inside, pending=%d", loop->ping_pending); + if (!loop->ping_pending) + loop->ping_pending++; + } +} + + +/* + * Sockets + */ + +static void +sockets_init(struct birdloop *loop) +{ + init_list(&loop->sock_list); + loop->sock_num = 0; +} + +void +socket_changed(sock *s) +{ + struct birdloop *loop = s->loop; + ASSERT_DIE(birdloop_inside(loop)); + + loop->sock_changed++; + birdloop_ping(loop); +} + +void +birdloop_add_socket(struct birdloop *loop, sock *s) +{ + ASSERT_DIE(birdloop_inside(loop)); + ASSERT_DIE(!s->loop); + + LOOP_TRACE(loop, "adding socket %p (total=%d)", s, loop->sock_num); + add_tail(&loop->sock_list, &s->n); + loop->sock_num++; + + s->loop = loop; + s->index = -1; + + socket_changed(s); +} + +extern sock *stored_sock; /* mainloop hack */ + +void +birdloop_remove_socket(struct birdloop *loop, sock *s) +{ + ASSERT_DIE(!enlisted(&s->n) == !s->loop); + + if (!s->loop) + return; + + ASSERT_DIE(birdloop_inside(loop)); + ASSERT_DIE(s->loop == loop); + + /* Decouple the socket from the loop at all. */ + LOOP_TRACE(loop, "removing socket %p (total=%d)", s, loop->sock_num); + + if (loop->sock_active == s) + loop->sock_active = sk_next(s); + + if ((loop == &main_birdloop) && (s == stored_sock)) + stored_sock = sk_next(s); + + rem_node(&s->n); + loop->sock_num--; + + socket_changed(s); + + s->loop = NULL; + s->index = -1; +} + +void +sk_reloop(sock *s, struct birdloop *loop) +{ + ASSERT_DIE(birdloop_inside(loop)); + ASSERT_DIE(birdloop_inside(s->loop)); + + if (loop == s->loop) + return; + + birdloop_remove_socket(s->loop, s); + birdloop_add_socket(loop, s); +} + +void +sk_pause_rx(struct birdloop *loop, sock *s) +{ + ASSERT_DIE(birdloop_inside(loop)); + s->rx_hook = NULL; + socket_changed(s); +} + +void +sk_resume_rx(struct birdloop *loop, sock *s, int (*hook)(sock *, uint)) +{ + ASSERT_DIE(birdloop_inside(loop)); + ASSERT_DIE(hook); + s->rx_hook = hook; + socket_changed(s); +} + +static inline uint sk_want_events(sock *s) +{ return (s->rx_hook ? POLLIN : 0) | (sk_tx_pending(s) ? POLLOUT : 0); } + +void +sockets_prepare(struct birdloop *loop, struct pfd *pfd) +{ + node *n; + WALK_LIST(n, loop->sock_list) + { + sock *s = SKIP_BACK(sock, n, n); + uint w = sk_want_events(s); + + if (!w) + { + s->index = -1; + continue; + } + + s->index = pfd->pfd.used; + LOOP_TRACE(loop, "socket %p poll index is %d", s, s->index); + + BUFFER_PUSH(pfd->pfd) = (struct pollfd) { + .fd = s->fd, + .events = sk_want_events(s), + }; + BUFFER_PUSH(pfd->loop) = loop; + } +} + +int sk_read(sock *s, int revents); +int sk_write(sock *s); +void sk_err(sock *s, int revents); + +static int +sockets_fire(struct birdloop *loop) +{ + if (EMPTY_LIST(loop->sock_list)) + return 0; + + int repeat = 0; + + times_update(); + + struct pollfd *pfd = loop->thread->pfd->pfd.data; + loop->sock_active = SKIP_BACK(sock, n, HEAD(loop->sock_list)); + + while (loop->sock_active) + { + sock *s = loop->sock_active; + + int rev; + if ((s->index >= 0) && (rev = pfd[s->index].revents) && !(rev & POLLNVAL)) + { + int e = 1; + + if (rev & POLLOUT) + { + /* Write everything. */ + while ((s == loop->sock_active) && (e = sk_write(s))) + ; + + if (s != loop->sock_active) + continue; + + if (!sk_tx_pending(s)) + loop->thread->sock_changed++; + } + + if (rev & POLLIN) + /* Read just one packet and request repeat. */ + if ((s == loop->sock_active) && s->rx_hook) + if (sk_read(s, rev)) + repeat++; + + if (s != loop->sock_active) + continue; + + if (!(rev & (POLLOUT | POLLIN)) && (rev & POLLERR)) + sk_err(s, rev); + + if (s != loop->sock_active) + continue; + } + + loop->sock_active = sk_next(s); + } + + return repeat; +} + +/* + * Threads + */ + +DEFINE_DOMAIN(resource); +static DOMAIN(resource) birdloop_domain; +static list birdloop_pickup; +static list bird_thread_pickup; + +static _Thread_local struct bird_thread *this_thread; + +static void +birdloop_set_thread(struct birdloop *loop, struct bird_thread *thr) +{ + struct bird_thread *old = loop->thread; + ASSERT_DIE(!thr != !old); + + /* Signal our moving effort */ + u32 ltt = atomic_fetch_or_explicit(&loop->thread_transition, LTT_MOVE, memory_order_acq_rel); + ASSERT_DIE((ltt & LTT_MOVE) == 0); + + while (ltt & LTT_PING) + { + birdloop_yield(); + ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire); + ASSERT_DIE(ltt & LTT_MOVE); + } + /* Now we are free of running pings */ + + if (loop->thread = thr) + { + add_tail(&thr->loops, &loop->n); + thr->loop_count++; + } + else + { + old->loop_count--; + + LOCK_DOMAIN(resource, birdloop_domain); + add_tail(&birdloop_pickup, &loop->n); + UNLOCK_DOMAIN(resource, birdloop_domain); + } + + /* Finished */ + atomic_fetch_and_explicit(&loop->thread_transition, ~LTT_MOVE, memory_order_acq_rel); + + /* Request to run by force */ + ev_send_loop(loop->thread->meta, &loop->event); +} + +static struct birdloop * +birdloop_take(void) +{ + struct birdloop *loop = NULL; + + LOCK_DOMAIN(resource, birdloop_domain); + if (!EMPTY_LIST(birdloop_pickup)) + { + /* Take the first loop from the pickup list and unlock */ + loop = SKIP_BACK(struct birdloop, n, HEAD(birdloop_pickup)); + rem_node(&loop->n); + UNLOCK_DOMAIN(resource, birdloop_domain); + + birdloop_set_thread(loop, this_thread); + + /* This thread goes to the end of the pickup list */ + LOCK_DOMAIN(resource, birdloop_domain); + rem_node(&this_thread->n); + add_tail(&bird_thread_pickup, &this_thread->n); + + /* If there are more loops to be picked up, wakeup the next thread in order */ + if (!EMPTY_LIST(birdloop_pickup)) + wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(bird_thread_pickup))); + } + UNLOCK_DOMAIN(resource, birdloop_domain); + + return loop; +} + +static void +birdloop_drop(struct birdloop *loop) +{ + /* Remove loop from this thread's list */ + rem_node(&loop->n); + + /* Unset loop's thread */ + if (birdloop_inside(loop)) + birdloop_set_thread(loop, NULL); + else + { + birdloop_enter(loop); + birdloop_set_thread(loop, NULL); + birdloop_leave(loop); + } + + /* Put loop into pickup list */ + LOCK_DOMAIN(resource, birdloop_domain); + add_tail(&birdloop_pickup, &loop->n); + UNLOCK_DOMAIN(resource, birdloop_domain); +} + +static int +poll_timeout(struct birdloop *loop) +{ + timer *t = timers_first(&loop->time); + if (!t) + return -1; + + btime remains = tm_remains(t); + return remains TO_MS + ((remains TO_MS) MS < remains); +} + +static void * +bird_thread_main(void *arg) +{ + struct bird_thread *thr = this_thread = arg; + + rcu_thread_start(&thr->rcu); + synchronize_rcu(); + + tmp_init(thr->pool); + init_list(&thr->loops); + + thr->meta = birdloop_new_internal(thr->pool, DOMAIN_ORDER(meta), "Thread Meta", 0); + thr->meta->thread = thr; + birdloop_enter(thr->meta); + + thr->sock_changed = 1; + + struct pfd pfd; + BUFFER_INIT(pfd.pfd, thr->pool, 16); + BUFFER_INIT(pfd.loop, thr->pool, 16); + thr->pfd = &pfd; + + while (1) + { + u64 thr_loop_start = ns_now(); + int timeout; + + /* Pickup new loops */ + struct birdloop *loop = birdloop_take(); + if (loop) + { + birdloop_enter(loop); + if (!EMPTY_LIST(loop->sock_list)) + thr->sock_changed = 1; + birdloop_leave(loop); + } + + /* Schedule all loops with timed out timers */ + timers_fire(&thr->meta->time, 0); + + /* Compute maximal time per loop */ + u64 thr_before_run = ns_now(); + if (thr->loop_count > 0) + thr->max_loop_time_ns = (thr->max_latency_ns / 2 - (thr_before_run - thr_loop_start)) / (u64) thr->loop_count; + + /* Run all scheduled loops */ + int more_events = ev_run_list(&thr->meta->event_list); + if (more_events) + { + THREAD_TRACE("More events to run"); + timeout = 0; + } + else + { + timeout = poll_timeout(thr->meta); + if (timeout == -1) + THREAD_TRACE("No timers, no events"); + else + THREAD_TRACE("Next timer in %d ms", timeout); + } + + /* Run priority events before sleeping */ + ev_run_list(&thr->priority_events); + + /* Do we have to refresh sockets? */ + if (thr->sock_changed) + { + thr->sock_changed = 0; + + BUFFER_FLUSH(pfd.pfd); + BUFFER_FLUSH(pfd.loop); + + pipe_pollin(&thr->wakeup, &pfd); + + node *nn; + WALK_LIST2(loop, nn, thr->loops, n) + { + birdloop_enter(loop); + sockets_prepare(loop, &pfd); + birdloop_leave(loop); + } + + ASSERT_DIE(pfd.loop.used == pfd.pfd.used); + } + /* Nothing to do in at least 5 seconds, flush local hot page cache */ + else if (timeout > 5000) + flush_local_pages(); + +poll_retry:; + int rv = poll(pfd.pfd.data, pfd.pfd.used, timeout); + if (rv < 0) + { + if (errno == EINTR || errno == EAGAIN) + goto poll_retry; + bug("poll in %p: %m", thr); + } + + /* Drain wakeup fd */ + if (pfd.pfd.data[0].revents & POLLIN) + { + ASSERT_DIE(rv > 0); + rv--; + wakeup_drain(thr); + } + + atomic_fetch_and_explicit(&thr->meta->thread_transition, ~LTT_PING, memory_order_acq_rel); + + /* Schedule loops with active sockets */ + if (rv) + for (uint i = 1; i < pfd.pfd.used; i++) + if (pfd.pfd.data[i].revents) + { + LOOP_TRACE(pfd.loop.data[i], "socket id %d got revents=%d", i, pfd.pfd.data[i].revents); + ev_send_loop(thr->meta, &pfd.loop.data[i]->event); + } + } + + bug("An infinite loop has ended."); +} + +static void +bird_thread_cleanup(void *_thr) +{ + struct bird_thread *thr = _thr; + ASSERT_DIE(birdloop_inside(&main_birdloop)); + + /* Thread attributes no longer needed */ + pthread_attr_destroy(&thr->thread_attr); + + /* Free all remaining memory */ + rfree(thr->pool); +} + +static struct bird_thread * +bird_thread_start(btime max_latency) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); + + pool *p = rp_new(&root_pool, "Thread"); + + struct bird_thread *thr = mb_allocz(p, sizeof(*thr)); + thr->pool = p; + thr->cleanup_event = (event) { .hook = bird_thread_cleanup, .data = thr, }; + thr->max_latency_ns = max_latency TO_NS; + + wakeup_init(thr); + ev_init_list(&thr->priority_events, NULL, "Thread direct event list"); + + LOCK_DOMAIN(resource, birdloop_domain); + add_tail(&bird_thread_pickup, &thr->n); + UNLOCK_DOMAIN(resource, birdloop_domain); + + int e = 0; + + if (e = pthread_attr_init(&thr->thread_attr)) + die("pthread_attr_init() failed: %M", e); + + /* We don't have to worry about thread stack size so much. + if (e = pthread_attr_setstacksize(&thr->thread_attr, THREAD_STACK_SIZE)) + die("pthread_attr_setstacksize(%u) failed: %M", THREAD_STACK_SIZE, e); + */ + + if (e = pthread_attr_setdetachstate(&thr->thread_attr, PTHREAD_CREATE_DETACHED)) + die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e); + + if (e = pthread_create(&thr->thread_id, &thr->thread_attr, bird_thread_main, thr)) + die("pthread_create() failed: %M", e); + + return thr; +} + +static struct birdloop *thread_dropper; +static event *thread_dropper_event; +static uint thread_dropper_goal; + +static void +bird_thread_shutdown(void * _ UNUSED) +{ + LOCK_DOMAIN(resource, birdloop_domain); + int dif = list_length(&bird_thread_pickup) - thread_dropper_goal; + struct birdloop *tdl_stop = NULL; + + if (dif > 0) + ev_send_loop(thread_dropper, thread_dropper_event); + else + { + tdl_stop = thread_dropper; + thread_dropper = NULL; + } + + UNLOCK_DOMAIN(resource, birdloop_domain); + + DBG("Thread pickup size differs from dropper goal by %d%s\n", dif, tdl_stop ? ", stopping" : ""); + + if (tdl_stop) + { + birdloop_stop_self(tdl_stop, NULL, NULL); + return; + } + + struct bird_thread *thr = this_thread; + + /* Leave the thread-picker list to get no more loops */ + LOCK_DOMAIN(resource, birdloop_domain); + rem_node(&thr->n); + UNLOCK_DOMAIN(resource, birdloop_domain); + + /* Drop loops including the thread dropper itself */ + while (!EMPTY_LIST(thr->loops)) + birdloop_drop(HEAD(thr->loops)); + + /* Let others know about new loops */ + if (!EMPTY_LIST(birdloop_pickup)) + wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(bird_thread_pickup))); + UNLOCK_DOMAIN(resource, birdloop_domain); + + /* Leave the thread-dropper loop as we aren't going to return. */ + birdloop_leave(thread_dropper); + + /* Stop the meta loop */ + birdloop_leave(thr->meta); + domain_free(thr->meta->time.domain); + rfree(thr->meta->pool); + + /* Local pages not needed anymore */ + flush_local_pages(); + + /* Unregister from RCU */ + rcu_thread_stop(&thr->rcu); + + /* Request thread cleanup from main loop */ + ev_send_loop(&main_birdloop, &thr->cleanup_event); + + /* Exit! */ + pthread_exit(NULL); +} + + +void +bird_thread_commit(struct config *new, struct config *old UNUSED) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); + + if (new->shutdown) + return; + + if (!new->thread_count) + new->thread_count = 1; + + while (1) + { + LOCK_DOMAIN(resource, birdloop_domain); + int dif = list_length(&bird_thread_pickup) - (thread_dropper_goal = new->thread_count); + _Bool thread_dropper_running = !!thread_dropper; + UNLOCK_DOMAIN(resource, birdloop_domain); + + if (dif < 0) + { + bird_thread_start(5 S); + continue; + } + + if ((dif > 0) && !thread_dropper_running) + { + struct birdloop *tdl = birdloop_new(&root_pool, DOMAIN_ORDER(control), "Thread dropper"); + event *tde = ev_new_init(tdl->pool, bird_thread_shutdown, NULL); + + LOCK_DOMAIN(resource, birdloop_domain); + thread_dropper = tdl; + thread_dropper_event = tde; + UNLOCK_DOMAIN(resource, birdloop_domain); + + ev_send_loop(thread_dropper, thread_dropper_event); + } + + return; + } +} + + +DEFINE_DOMAIN(control); + +struct bird_thread_show_data { + cli *cli; + pool *pool; + DOMAIN(control) lock; + uint total; + uint done; + u8 show_loops; +}; + +static void +bird_thread_show_cli_cont(struct cli *c UNUSED) +{ + /* Explicitly do nothing to prevent CLI from trying to parse another command. */ +} + +static int +bird_thread_show_cli_cleanup(struct cli *c UNUSED) +{ + return 1; /* Defer the cleanup until the writeout is finished. */ +} + +static void +bird_thread_show(void *data) +{ + struct bird_thread_show_data *tsd = data; + + LOCK_DOMAIN(control, tsd->lock); + if (tsd->show_loops) + cli_printf(tsd->cli, -1026, "Thread %p", this_thread); + + u64 total_time_ns = 0; + struct birdloop *loop; + WALK_LIST(loop, this_thread->loops) + { + if (tsd->show_loops) + cli_printf(tsd->cli, -1026, " Loop %s time: %t", domain_name(loop->time.domain), loop->total_time_spent_ns NS); + total_time_ns += loop->total_time_spent_ns; + } + + tsd->done++; + int last = (tsd->done == tsd->total); + + if (last) + { + tsd->cli->cont = NULL; + tsd->cli->cleanup = NULL; + } + + if (tsd->show_loops) + cli_printf(tsd->cli, (last ? 1 : -1) * 1026, " Total time: %t", total_time_ns NS); + else + cli_printf(tsd->cli, (last ? 1 : -1) * 1026, "Thread %p time %t", this_thread, total_time_ns NS); + + UNLOCK_DOMAIN(control, tsd->lock); + + if (last) + { + the_bird_lock(); + + LOCK_DOMAIN(resource, birdloop_domain); + if (!EMPTY_LIST(birdloop_pickup)) + if (tsd->show_loops) + { + cli_printf(tsd->cli, -1026, "Unassigned loops"); + WALK_LIST(loop, birdloop_pickup) + cli_printf(tsd->cli, -1026, " Loop %s time: %t", domain_name(loop->time.domain), loop->total_time_spent_ns NS); + } + else + { + uint count = 0; + u64 total_time_ns = 0; + WALK_LIST(loop, birdloop_pickup) + { + count++; + total_time_ns += loop->total_time_spent_ns; + } + cli_printf(tsd->cli, -1026, "Unassigned loops: %d, total time %t", count, total_time_ns NS); + } + UNLOCK_DOMAIN(resource, birdloop_domain); + + cli_write_trigger(tsd->cli); + DOMAIN_FREE(control, tsd->lock); + rfree(tsd->pool); + + the_bird_unlock(); + } +} + + +void +cmd_show_threads(int show_loops) +{ + pool *p = rp_new(&root_pool, "Show Threads"); + + struct bird_thread_show_data *tsd = mb_allocz(p, sizeof(struct bird_thread_show_data)); + tsd->lock = DOMAIN_NEW(control, "Show Threads"); + tsd->cli = this_cli; + tsd->pool = p; + tsd->show_loops = show_loops; + + this_cli->cont = bird_thread_show_cli_cont; + this_cli->cleanup = bird_thread_show_cli_cleanup; + + LOCK_DOMAIN(control, tsd->lock); + LOCK_DOMAIN(resource, birdloop_domain); + + struct bird_thread *thr; + WALK_LIST(thr, bird_thread_pickup) + { + tsd->total++; + ev_send(&thr->priority_events, ev_new_init(p, bird_thread_show, tsd)); + wakeup_do_kick(thr); + } + + UNLOCK_DOMAIN(resource, birdloop_domain); + UNLOCK_DOMAIN(control, tsd->lock); +} + +/* + * Birdloop + */ + +static struct bird_thread main_thread; +struct birdloop main_birdloop = { .thread = &main_thread, }; + +static void birdloop_enter_locked(struct birdloop *loop); + +void +birdloop_init(void) +{ + ns_init(); + + birdloop_domain = DOMAIN_NEW(resource, "Loop Pickup"); + init_list(&birdloop_pickup); + init_list(&bird_thread_pickup); + + wakeup_init(main_birdloop.thread); + + main_birdloop.time.domain = the_bird_domain.the_bird; + main_birdloop.time.loop = &main_birdloop; + + times_update(); + timers_init(&main_birdloop.time, &root_pool); + + birdloop_enter_locked(&main_birdloop); +} + +static void +birdloop_stop_internal(struct birdloop *loop) +{ + LOOP_TRACE(loop, "Stopping"); + + /* Block incoming pings */ + u32 ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire); + while (!atomic_compare_exchange_strong_explicit( + &loop->thread_transition, <t, LTT_PING, + memory_order_acq_rel, memory_order_acquire)) + ; + + /* Flush remaining events */ + ASSERT_DIE(!ev_run_list(&loop->event_list)); + + /* Drop timers */ + timer *t; + while (t = timers_first(&loop->time)) + tm_stop(t); + + /* Drop sockets */ + sock *s; + WALK_LIST_FIRST2(s, n, loop->sock_list) + birdloop_remove_socket(loop, s); + + /* Unschedule from Meta */ + ev_postpone(&loop->event); + tm_stop(&loop->timer); + + /* Remove from thread loop list */ + rem_node(&loop->n); + loop->thread = NULL; + + /* Leave the loop context without causing any other fuss */ + ASSERT_DIE(!ev_active(&loop->event)); + loop->ping_pending = 0; + birdloop_leave(loop); + + /* Request local socket reload */ + this_thread->sock_changed++; + + /* Tail-call the stopped hook */ + loop->stopped(loop->stop_data); +} + +static void +birdloop_run(void *_loop) +{ + /* Run priority events before the loop is executed */ + ev_run_list(&this_thread->priority_events); + + u64 start_time = ns_now(); + u64 end_time = start_time + this_thread->max_loop_time_ns; + + struct birdloop *loop = _loop; + birdloop_enter(loop); + + u64 locked_time = ns_now(), task_done_time; + if (locked_time > end_time) + LOOP_WARN(loop, "locked %luns after its scheduled end time", locked_time - end_time); + + uint repeat, loop_runs = 0; + do { + repeat = 0; + LOOP_TRACE(loop, "Regular run"); + loop_runs++; + + if (loop->stopped) + /* Birdloop left inside the helper function */ + return birdloop_stop_internal(loop); + + /* Process sockets */ + repeat += sockets_fire(loop); + + /* Run timers */ + timers_fire(&loop->time, 0); + + /* Run flag handlers */ + repeat += birdloop_process_flags(loop); + + /* Run events */ + repeat += ev_run_list(&loop->event_list); + + /* Check end time */ + } while (((task_done_time = ns_now()) < end_time) && repeat); + + /* Request meta timer */ + timer *t = timers_first(&loop->time); + if (t) + tm_start_in(&loop->timer, tm_remains(t), this_thread->meta); + else + tm_stop(&loop->timer); + + /* Request re-run if needed */ + if (repeat) + ev_send_loop(this_thread->meta, &loop->event); + + /* Collect socket change requests */ + this_thread->sock_changed += loop->sock_changed; + loop->sock_changed = 0; + + birdloop_leave(loop); +} + +static void +birdloop_run_timer(timer *tm) +{ + struct birdloop *loop = tm->data; + LOOP_TRACE(loop, "Timer ready, requesting run"); + ev_send_loop(loop->thread->meta, &loop->event); +} + +static struct birdloop * +birdloop_new_internal(pool *pp, uint order, const char *name, int request_pickup) +{ + struct domain_generic *dg = domain_new(name, order); + + pool *p = rp_new(pp, name); + struct birdloop *loop = mb_allocz(p, sizeof(struct birdloop)); + loop->pool = p; + + loop->time.domain = dg; + loop->time.loop = loop; + + atomic_store_explicit(&loop->thread_transition, 0, memory_order_relaxed); + + birdloop_enter(loop); + + ev_init_list(&loop->event_list, loop, name); + timers_init(&loop->time, p); + sockets_init(loop); + + loop->event = (event) { .hook = birdloop_run, .data = loop, }; + loop->timer = (timer) { .hook = birdloop_run_timer, .data = loop, }; + + if (request_pickup) + { + LOCK_DOMAIN(resource, birdloop_domain); + add_tail(&birdloop_pickup, &loop->n); + wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(bird_thread_pickup))); + UNLOCK_DOMAIN(resource, birdloop_domain); + } + else + loop->n.next = loop->n.prev = &loop->n; + + birdloop_leave(loop); + + return loop; +} + +struct birdloop * +birdloop_new(pool *pp, uint order, const char *name) +{ + return birdloop_new_internal(pp, order, name, 1); +} + +static void +birdloop_do_stop(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + LOOP_TRACE(loop, "Stop requested"); + + loop->stopped = stopped; + loop->stop_data = data; + + birdloop_do_ping(loop); +} + +void +birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + DG_LOCK(loop->time.domain); + birdloop_do_stop(loop, stopped, data); + DG_UNLOCK(loop->time.domain); +} + +void +birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + ASSERT_DIE(loop == birdloop_current); + ASSERT_DIE(DG_IS_LOCKED(loop->time.domain)); + + birdloop_do_stop(loop, stopped, data); +} + +void +birdloop_free(struct birdloop *loop) +{ + ASSERT_DIE(loop->thread == NULL); + + domain_free(loop->time.domain); + rfree(loop->pool); +} + +static void +birdloop_enter_locked(struct birdloop *loop) +{ + ASSERT_DIE(DG_IS_LOCKED(loop->time.domain)); + ASSERT_DIE(!birdloop_inside(loop)); + + /* Store the old context */ + loop->prev_loop = birdloop_current; + + /* Put the new context */ + birdloop_current = loop; +} + +void +birdloop_enter(struct birdloop *loop) +{ + DG_LOCK(loop->time.domain); + return birdloop_enter_locked(loop); +} + +static void +birdloop_leave_locked(struct birdloop *loop) +{ + /* Check the current context */ + ASSERT_DIE(birdloop_current == loop); + + /* Send pending pings */ + if (loop->ping_pending) + { + LOOP_TRACE(loop, "sending pings on leave"); + loop->ping_pending = 0; + birdloop_do_ping(loop); + } + + /* Restore the old context */ + birdloop_current = loop->prev_loop; +} + +void +birdloop_leave(struct birdloop *loop) +{ + birdloop_leave_locked(loop); + DG_UNLOCK(loop->time.domain); +} + +void +birdloop_mask_wakeups(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_wakeup_masked == NULL); + birdloop_wakeup_masked = loop; +} + +void +birdloop_unmask_wakeups(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_wakeup_masked == loop); + birdloop_wakeup_masked = NULL; + if (birdloop_wakeup_masked_count) + wakeup_do_kick(loop->thread); + + birdloop_wakeup_masked_count = 0; +} + +void +birdloop_yield(void) +{ + usleep(100); +} diff --git a/sysdep/unix/io-loop.h b/sysdep/unix/io-loop.h new file mode 100644 index 00000000..c531d43e --- /dev/null +++ b/sysdep/unix/io-loop.h @@ -0,0 +1,93 @@ +/* + * BIRD -- I/O and event loop + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_SYSDEP_UNIX_IO_LOOP_H_ +#define _BIRD_SYSDEP_UNIX_IO_LOOP_H_ + +#include "lib/rcu.h" + +#include <pthread.h> + +struct pipe +{ + int fd[2]; +}; + +struct pfd { + BUFFER(struct pollfd) pfd; + BUFFER(struct birdloop *) loop; +}; + +void sockets_prepare(struct birdloop *, struct pfd *); +void socket_changed(struct birdsock *); + +void pipe_new(struct pipe *); +void pipe_pollin(struct pipe *, struct pfd *); +void pipe_drain(struct pipe *); +void pipe_kick(struct pipe *); + +struct birdloop +{ + node n; + + event event; + timer timer; + + pool *pool; + + struct timeloop time; + event_list event_list; + list sock_list; + struct birdsock *sock_active; + int sock_num; + uint sock_changed; + + uint ping_pending; + + _Atomic u32 thread_transition; +#define LTT_PING 1 +#define LTT_MOVE 2 + _Atomic u32 flags; + struct birdloop_flag_handler *flag_handler; + + void (*stopped)(void *data); + void *stop_data; + + struct birdloop *prev_loop; + + struct bird_thread *thread; + + u64 total_time_spent_ns; +}; + +struct bird_thread +{ + node n; + + struct pipe wakeup; + event_list priority_events; + + struct birdloop *meta; + + pthread_t thread_id; + pthread_attr_t thread_attr; + + struct rcu_thread rcu; + + list loops; + pool *pool; + struct pfd *pfd; + + event cleanup_event; + + int sock_changed; + uint loop_count; + + u64 max_latency_ns; + u64 max_loop_time_ns; +}; + +#endif diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index e131ca41..88d187a4 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -36,12 +36,14 @@ #include "lib/resource.h" #include "lib/socket.h" #include "lib/event.h" +#include "lib/locking.h" #include "lib/timer.h" #include "lib/string.h" #include "nest/iface.h" #include "conf/conf.h" #include "sysdep/unix/unix.h" +#include "sysdep/unix/io-loop.h" #include CONFIG_INCLUDE_SYSIO_H /* Maximum number of calls of tx handler for one socket in one @@ -74,7 +76,7 @@ rf_free(resource *r) } static void -rf_dump(resource *r) +rf_dump(resource *r, unsigned indent UNUSED) { struct rfile *a = (struct rfile *) r; @@ -122,12 +124,16 @@ rf_fileno(struct rfile *f) btime boot_time; + void -times_init(struct timeloop *loop) +times_update(void) { struct timespec ts; int rv; + btime old_time = current_time(); + btime old_real_time = current_real_time(); + rv = clock_gettime(CLOCK_MONOTONIC, &ts); if (rv < 0) die("Monotonic clock is missing"); @@ -135,42 +141,33 @@ times_init(struct timeloop *loop) if ((ts.tv_sec < 0) || (((u64) ts.tv_sec) > ((u64) 1 << 40))) log(L_WARN "Monotonic clock is crazy"); - loop->last_time = ts.tv_sec S + ts.tv_nsec NS; - loop->real_time = 0; -} - -void -times_update(struct timeloop *loop) -{ - struct timespec ts; - int rv; - - rv = clock_gettime(CLOCK_MONOTONIC, &ts); - if (rv < 0) - die("clock_gettime: %m"); - btime new_time = ts.tv_sec S + ts.tv_nsec NS; - if (new_time < loop->last_time) + if (new_time < old_time) log(L_ERR "Monotonic clock is broken"); - loop->last_time = new_time; - loop->real_time = 0; -} - -void -times_update_real_time(struct timeloop *loop) -{ - struct timespec ts; - int rv; - rv = clock_gettime(CLOCK_REALTIME, &ts); if (rv < 0) die("clock_gettime: %m"); - loop->real_time = ts.tv_sec S + ts.tv_nsec NS; -} + btime new_real_time = ts.tv_sec S + ts.tv_nsec NS; + + if (!atomic_compare_exchange_strong_explicit( + &last_time, + &old_time, + new_time, + memory_order_acq_rel, + memory_order_relaxed)) + DBG("Time update collision: last_time"); + if (!atomic_compare_exchange_strong_explicit( + &real_time, + &old_real_time, + new_real_time, + memory_order_acq_rel, + memory_order_relaxed)) + DBG("Time update collision: real_time"); +} /** * DOC: Sockets @@ -725,11 +722,7 @@ sk_log_error(sock *s, const char *p) * Actual struct birdsock code */ -static list sock_list; -static struct birdsock *current_sock; -static struct birdsock *stored_sock; - -static inline sock * +sock * sk_next(sock *s) { if (!s->n.next->next) @@ -777,8 +770,7 @@ sk_ssh_free(sock *s) if (ssh->channel) { - if (ssh_channel_is_open(ssh->channel)) - ssh_channel_close(ssh->channel); + ssh_channel_close(ssh->channel); ssh_channel_free(ssh->channel); ssh->channel = NULL; } @@ -792,6 +784,7 @@ sk_ssh_free(sock *s) } #endif + static void sk_free(resource *r) { @@ -804,20 +797,10 @@ sk_free(resource *r) sk_ssh_free(s); #endif - if (s->fd < 0) - return; - - /* FIXME: we should call sk_stop() for SKF_THREAD sockets */ - if (!(s->flags & SKF_THREAD)) - { - if (s == current_sock) - current_sock = sk_next(s); - if (s == stored_sock) - stored_sock = sk_next(s); - rem_node(&s->n); - } + if (s->loop) + birdloop_remove_socket(s->loop, s); - if (s->type != SK_SSH && s->type != SK_SSH_ACTIVE) + if (s->fd >= 0 && s->type != SK_SSH && s->type != SK_SSH_ACTIVE) close(s->fd); s->fd = -1; @@ -868,7 +851,7 @@ sk_reallocate(sock *s) } static void -sk_dump(resource *r) +sk_dump(resource *r, unsigned indent UNUSED) { sock *s = (sock *) r; static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "SSH>", "SSH", "DEL!" }; @@ -1030,12 +1013,6 @@ sk_setup(sock *s) } static void -sk_insert(sock *s) -{ - add_tail(&sock_list, &s->n); -} - -static void sk_tcp_connected(sock *s) { sockaddr sa; @@ -1108,7 +1085,8 @@ sk_passive_connected(sock *s, int type) return 1; } - sk_insert(t); + birdloop_add_socket(s->loop, t); + sk_alloc_bufs(t); s->rx_hook(t, 0); return 1; @@ -1153,34 +1131,45 @@ sk_ssh_connect(sock *s) { int server_identity_is_ok = 1; +#ifdef HAVE_SSH_OLD_SERVER_VALIDATION_API +#define ssh_session_is_known_server ssh_is_server_known +#define SSH_KNOWN_HOSTS_OK SSH_SERVER_KNOWN_OK +#define SSH_KNOWN_HOSTS_UNKNOWN SSH_SERVER_NOT_KNOWN +#define SSH_KNOWN_HOSTS_CHANGED SSH_SERVER_KNOWN_CHANGED +#define SSH_KNOWN_HOSTS_NOT_FOUND SSH_SERVER_FILE_NOT_FOUND +#define SSH_KNOWN_HOSTS_ERROR SSH_SERVER_ERROR +#define SSH_KNOWN_HOSTS_OTHER SSH_SERVER_FOUND_OTHER +#endif + /* Check server identity */ - switch (ssh_is_server_known(s->ssh->session)) + switch (ssh_session_is_known_server(s->ssh->session)) { #define LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s,msg,args...) log(L_WARN "SSH Identity %s@%s:%u: " msg, (s)->ssh->username, (s)->host, (s)->dport, ## args); - case SSH_SERVER_KNOWN_OK: + case SSH_KNOWN_HOSTS_OK: /* The server is known and has not changed. */ break; - case SSH_SERVER_NOT_KNOWN: + case SSH_KNOWN_HOSTS_UNKNOWN: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server is unknown, its public key was not found in the known host file %s", s->ssh->server_hostkey_path); + server_identity_is_ok = 0; break; - case SSH_SERVER_KNOWN_CHANGED: + case SSH_KNOWN_HOSTS_CHANGED: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server key has changed. Either you are under attack or the administrator changed the key."); server_identity_is_ok = 0; break; - case SSH_SERVER_FILE_NOT_FOUND: + case SSH_KNOWN_HOSTS_NOT_FOUND: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The known host file %s does not exist", s->ssh->server_hostkey_path); server_identity_is_ok = 0; break; - case SSH_SERVER_ERROR: + case SSH_KNOWN_HOSTS_ERROR: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "Some error happened"); server_identity_is_ok = 0; break; - case SSH_SERVER_FOUND_OTHER: + case SSH_KNOWN_HOSTS_OTHER: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server gave use a key of a type while we had an other type recorded. " \ "It is a possible attack."); server_identity_is_ok = 0; @@ -1311,6 +1300,7 @@ sk_open_ssh(sock *s) /** * sk_open - open a socket + * @loop: loop * @s: socket * * This function takes a socket resource created by sk_new() and @@ -1320,7 +1310,7 @@ sk_open_ssh(sock *s) * Result: 0 for success, -1 for an error. */ int -sk_open(sock *s) +sk_open(sock *s, struct birdloop *loop) { int af = AF_UNSPEC; int fd = -1; @@ -1473,9 +1463,7 @@ sk_open(sock *s) sk_alloc_bufs(s); } - if (!(s->flags & SKF_THREAD)) - sk_insert(s); - + birdloop_add_socket(loop, s); return 0; err: @@ -1485,7 +1473,7 @@ err: } int -sk_open_unix(sock *s, char *name) +sk_open_unix(sock *s, struct birdloop *loop, char *name) { struct sockaddr_un sa; int fd; @@ -1512,7 +1500,7 @@ sk_open_unix(sock *s, char *name) return -1; s->fd = fd; - sk_insert(s); + birdloop_add_socket(loop, s); return 0; } @@ -1638,6 +1626,13 @@ sk_recvmsg(sock *s) static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; } +_Bool +sk_tx_pending(sock *s) +{ + return s->ttx != s->tpos; +} + + static int sk_maybe_write(sock *s) { @@ -1648,7 +1643,7 @@ sk_maybe_write(sock *s) case SK_TCP: case SK_MAGIC: case SK_UNIX: - while (s->ttx != s->tpos) + while (sk_tx_pending(s)) { e = write(s->fd, s->ttx, s->tpos - s->ttx); @@ -1670,7 +1665,7 @@ sk_maybe_write(sock *s) #ifdef HAVE_LIBSSH case SK_SSH: - while (s->ttx != s->tpos) + while (sk_tx_pending(s)) { e = ssh_channel_write(s->ssh->channel, s->ttx, s->tpos - s->ttx); @@ -1753,7 +1748,12 @@ sk_send(sock *s, unsigned len) { s->ttx = s->tbuf; s->tpos = s->tbuf + len; - return sk_maybe_write(s); + + int e = sk_maybe_write(s); + if (e == 0) /* Trigger thread poll reload to poll this socket's write. */ + socket_changed(s); + + return e; } /** @@ -1800,7 +1800,7 @@ call_rx_hook(sock *s, int size) if (s->rx_hook(s, size)) { /* We need to be careful since the socket could have been deleted by the hook */ - if (current_sock == s) + if (s->loop->sock_active == s) s->rpos = s->rbuf; } } @@ -1964,7 +1964,7 @@ sk_write_noflush(sock *s) #endif default: - if (s->ttx != s->tpos && sk_maybe_write(s) > 0) + if (sk_tx_pending(s) && sk_maybe_write(s) > 0) { if (s->tx_hook) s->tx_hook(s); @@ -2010,11 +2010,11 @@ sk_dump_all(void) sock *s; debug("Open sockets:\n"); - WALK_LIST(n, sock_list) + WALK_LIST(n, main_birdloop.sock_list) { s = SKIP_BACK(sock, n, n); debug("%p ", s); - sk_dump(&s->r); + sk_dump(&s->r, 3); } debug("\n"); } @@ -2037,30 +2037,17 @@ struct event_log_entry static struct event_log_entry event_log[EVENT_LOG_LENGTH]; static struct event_log_entry *event_open; static int event_log_pos, event_log_num, watchdog_active; -static btime last_time; +static btime last_io_time; static btime loop_time; static void io_update_time(void) { - struct timespec ts; - int rv; - - /* - * This is third time-tracking procedure (after update_times() above and - * times_update() in BFD), dedicated to internal event log and latency - * tracking. Hopefully, we consolidate these sometimes. - */ - - rv = clock_gettime(CLOCK_MONOTONIC, &ts); - if (rv < 0) - die("clock_gettime: %m"); - - last_time = ts.tv_sec S + ts.tv_nsec NS; + last_io_time = current_time(); if (event_open) { - event_open->duration = last_time - event_open->timestamp; + event_open->duration = last_io_time - event_open->timestamp; if (event_open->duration > config->latency_limit) log(L_WARN "Event 0x%p 0x%p took %u.%03u ms", @@ -2089,7 +2076,7 @@ io_log_event(void *hook, void *data) en->hook = hook; en->data = data; - en->timestamp = last_time; + en->timestamp = last_io_time; en->duration = 0; event_log_num++; @@ -2117,14 +2104,14 @@ io_log_dump(void) struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH; if (en->hook) log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data, - (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS)); + (int) ((last_io_time - en->timestamp) TO_MS), (int) (en->duration TO_MS)); } } void watchdog_sigalrm(int sig UNUSED) { - /* Update last_time and duration, but skip latency check */ + /* Update last_io_time and duration, but skip latency check */ config->latency_limit = 0xffffffff; io_update_time(); @@ -2139,7 +2126,7 @@ watchdog_start1(void) { io_update_time(); - loop_time = last_time; + loop_time = last_io_time; } static inline void @@ -2147,7 +2134,7 @@ watchdog_start(void) { io_update_time(); - loop_time = last_time; + loop_time = last_io_time; event_log_num = 0; if (config->watchdog_timeout) @@ -2168,7 +2155,7 @@ watchdog_stop(void) watchdog_active = 0; } - btime duration = last_time - loop_time; + btime duration = last_io_time - loop_time; if (duration > config->watchdog_warning) log(L_WARN "I/O loop cycle took %u.%03u ms for %d events", (uint) (duration TO_MS), (uint) (duration % 1000), event_log_num); @@ -2182,9 +2169,10 @@ watchdog_stop(void) void io_init(void) { - init_list(&sock_list); - init_list(&global_event_list); - init_list(&global_work_list); + init_list(&main_birdloop.sock_list); + ev_init_list(&global_event_list, &main_birdloop, "Global event list"); + ev_init_list(&global_work_list, &main_birdloop, "Global work list"); + ev_init_list(&main_birdloop.event_list, &main_birdloop, "Global fast event list"); krt_io_init(); // XXX init_times(); // XXX update_times(); @@ -2198,64 +2186,42 @@ static int short_loops = 0; #define SHORT_LOOP_MAX 10 #define WORK_EVENTS_MAX 10 +sock *stored_sock; + void io_loop(void) { int poll_tout, timeout; - int nfds, events, pout; + int events, pout; timer *t; - sock *s; - node *n; - int fdmax = 256; - struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd)); + struct pfd pfd; + BUFFER_INIT(pfd.pfd, &root_pool, 16); + BUFFER_INIT(pfd.loop, &root_pool, 16); watchdog_start1(); for(;;) { - times_update(&main_timeloop); + times_update(); events = ev_run_list(&global_event_list); events = ev_run_list_limited(&global_work_list, WORK_EVENTS_MAX) || events; - timers_fire(&main_timeloop); + events = ev_run_list(&main_birdloop.event_list) || events; + timers_fire(&main_birdloop.time, 1); io_close_event(); // FIXME poll_tout = (events ? 0 : 3000); /* Time in milliseconds */ - if (t = timers_first(&main_timeloop)) + if (t = timers_first(&main_birdloop.time)) { - times_update(&main_timeloop); + times_update(); timeout = (tm_remains(t) TO_MS) + 1; poll_tout = MIN(poll_tout, timeout); } - nfds = 0; - WALK_LIST(n, sock_list) - { - pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */ - s = SKIP_BACK(sock, n, n); - if (s->rx_hook) - { - pfd[nfds].fd = s->fd; - pfd[nfds].events |= POLLIN; - } - if (s->tx_hook && s->ttx != s->tpos) - { - pfd[nfds].fd = s->fd; - pfd[nfds].events |= POLLOUT; - } - if (pfd[nfds].fd != -1) - { - s->index = nfds; - nfds++; - } - else - s->index = -1; + BUFFER_FLUSH(pfd.pfd); + BUFFER_FLUSH(pfd.loop); - if (nfds >= fdmax) - { - fdmax *= 2; - pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd)); - } - } + pipe_pollin(&main_birdloop.thread->wakeup, &pfd); + sockets_prepare(&main_birdloop, &pfd); /* * Yes, this is racy. But even if the signal comes before this test @@ -2286,103 +2252,108 @@ io_loop(void) /* And finally enter poll() to find active sockets */ watchdog_stop(); - pout = poll(pfd, nfds, poll_tout); + birdloop_leave(&main_birdloop); + pout = poll(pfd.pfd.data, pfd.pfd.used, poll_tout); + birdloop_enter(&main_birdloop); watchdog_start(); if (pout < 0) { if (errno == EINTR || errno == EAGAIN) continue; - die("poll: %m"); + bug("poll: %m"); } if (pout) { - times_update(&main_timeloop); + if (pfd.pfd.data[0].revents & POLLIN) + { + /* IO loop reload requested */ + pipe_drain(&main_birdloop.thread->wakeup); + atomic_fetch_and_explicit(&main_birdloop.thread_transition, ~LTT_PING, memory_order_acq_rel); + continue; + } + + times_update(); /* guaranteed to be non-empty */ - current_sock = SKIP_BACK(sock, n, HEAD(sock_list)); + main_birdloop.sock_active = SKIP_BACK(sock, n, HEAD(main_birdloop.sock_list)); - while (current_sock) + while (main_birdloop.sock_active) + { + sock *s = main_birdloop.sock_active; + if (s->index != -1) { - sock *s = current_sock; - if (s->index == -1) - { - current_sock = sk_next(s); - goto next; - } - int e; int steps; steps = MAX_STEPS; - if (s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook) + if (s->fast_rx && (pfd.pfd.data[s->index].revents & POLLIN) && s->rx_hook) do { steps--; io_log_event(s->rx_hook, s->data); - e = sk_read(s, pfd[s->index].revents); - if (s != current_sock) - goto next; + e = sk_read(s, pfd.pfd.data[s->index].revents); } - while (e && s->rx_hook && steps); + while (e && (main_birdloop.sock_active == s) && s->rx_hook && steps); + + if (s != main_birdloop.sock_active) + continue; steps = MAX_STEPS; - if (pfd[s->index].revents & POLLOUT) + if (pfd.pfd.data[s->index].revents & POLLOUT) do { steps--; io_log_event(s->tx_hook, s->data); e = sk_write(s); - if (s != current_sock) - goto next; } - while (e && steps); + while (e && (main_birdloop.sock_active == s) && steps); - current_sock = sk_next(s); - next: ; + if (s != main_birdloop.sock_active) + continue; } + main_birdloop.sock_active = sk_next(s); + } + short_loops++; if (events && (short_loops < SHORT_LOOP_MAX)) continue; short_loops = 0; int count = 0; - current_sock = stored_sock; - if (current_sock == NULL) - current_sock = SKIP_BACK(sock, n, HEAD(sock_list)); + main_birdloop.sock_active = stored_sock; + if (main_birdloop.sock_active == NULL) + main_birdloop.sock_active = SKIP_BACK(sock, n, HEAD(main_birdloop.sock_list)); - while (current_sock && count < MAX_RX_STEPS) + while (main_birdloop.sock_active && count < MAX_RX_STEPS) { - sock *s = current_sock; + sock *s = main_birdloop.sock_active; if (s->index == -1) - { - current_sock = sk_next(s); - goto next2; - } + goto next2; - if (!s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook) + if (!s->fast_rx && (pfd.pfd.data[s->index].revents & POLLIN) && s->rx_hook) { count++; io_log_event(s->rx_hook, s->data); - sk_read(s, pfd[s->index].revents); - if (s != current_sock) - goto next2; + sk_read(s, pfd.pfd.data[s->index].revents); + if (s != main_birdloop.sock_active) + continue; } - if (pfd[s->index].revents & (POLLHUP | POLLERR)) + if (pfd.pfd.data[s->index].revents & (POLLHUP | POLLERR)) { - sk_err(s, pfd[s->index].revents); - if (s != current_sock) - goto next2; + sk_err(s, pfd.pfd.data[s->index].revents); + if (s != main_birdloop.sock_active) + continue; } - current_sock = sk_next(s); next2: ; + main_birdloop.sock_active = sk_next(s); } - stored_sock = current_sock; + stored_sock = main_birdloop.sock_active; } } } diff --git a/sysdep/unix/krt.Y b/sysdep/unix/krt.Y index 95b54d65..4ce9a328 100644 --- a/sysdep/unix/krt.Y +++ b/sysdep/unix/krt.Y @@ -29,7 +29,7 @@ kif_set_preferred(ip_addr ip) CF_DECLS -CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, KRT_SOURCE, KRT_METRIC, MERGE, PATHS) +CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, MERGE, PATHS) CF_KEYWORDS(INTERFACE, PREFERRED) %type <i> kern_mp_limit @@ -122,9 +122,6 @@ kif_iface: kif_iface_start iface_patt_list_nopx kif_iface_opt_list; -dynamic_attr: KRT_SOURCE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SOURCE); } ; -dynamic_attr: KRT_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_METRIC); } ; - CF_CODE CF_END diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index 9f95247f..9e6ddb45 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -53,7 +53,7 @@ #include "nest/bird.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "filter/filter.h" #include "conf/conf.h" @@ -163,6 +163,15 @@ kif_shutdown(struct proto *P) return PS_DOWN; } +static void +kif_cleanup(struct proto *p) +{ + if (p->debug & D_EVENTS) + log(L_TRACE "%s: Flushing interfaces", p->name); + if_start_update(); + if_end_update(); +} + static int kif_reconfigure(struct proto *p, struct proto_config *new) { @@ -232,13 +241,13 @@ kif_copy_config(struct proto_config *dest, struct proto_config *src) struct protocol proto_unix_iface = { .name = "Device", .template = "device%d", - .class = PROTOCOL_DEVICE, .proto_size = sizeof(struct kif_proto), .config_size = sizeof(struct kif_config), .preconfig = kif_preconfig, .init = kif_init, .start = kif_start, .shutdown = kif_shutdown, + .cleanup = kif_cleanup, .reconfigure = kif_reconfigure, .copy_config = kif_copy_config }; @@ -258,14 +267,14 @@ static inline void krt_trace_in(struct krt_proto *p, rte *e, char *msg) { if (p->p.debug & D_PACKETS) - log(L_TRACE "%s: %N: %s", p->p.name, e->net->n.addr, msg); + log(L_TRACE "%s: %N: %s", p->p.name, e->net, msg); } static inline void krt_trace_in_rl(struct tbf *f, struct krt_proto *p, rte *e, char *msg) { if (p->p.debug & D_PACKETS) - log_rl(f, L_TRACE "%s: %N: %s", p->p.name, e->net->n.addr, msg); + log_rl(f, L_TRACE "%s: %N: %s", p->p.name, e->net, msg); } /* @@ -287,259 +296,40 @@ static struct tbf rl_alien = TBF_DEFAULT_LOG_LIMITS; static inline u32 krt_metric(rte *a) { - eattr *ea = ea_find(a->attrs->eattrs, EA_KRT_METRIC); + eattr *ea = ea_find(a->attrs, &ea_krt_metric); return ea ? ea->u.data : 0; } -static inline int -krt_same_key(rte *a, rte *b) -{ - return (krt_metric(a) == krt_metric(b)); -} - -static inline int -krt_uptodate(rte *a, rte *b) -{ - return (a->attrs == b->attrs); -} - -static void -krt_learn_announce_update(struct krt_proto *p, rte *e) -{ - net *n = e->net; - rta *aa = rta_clone(e->attrs); - rte *ee = rte_get_temp(aa, p->p.main_source); - rte_update(&p->p, n->n.addr, ee); -} - -static void -krt_learn_announce_delete(struct krt_proto *p, net *n) -{ - rte_update(&p->p, n->n.addr, NULL); -} - static void krt_learn_alien_attr(struct channel *c, rte *e) { - ASSERT(!e->attrs->cached); - e->attrs->pref = c->preference; - - e->attrs = rta_lookup(e->attrs); + ea_set_attr_u32(&e->attrs, &ea_gen_preference, 0, c->preference); } /* Called when alien route is discovered during scan */ static void krt_learn_scan(struct krt_proto *p, rte *e) { - net *n0 = e->net; - net *n = net_get(p->krt_table, n0->n.addr); - rte *m, **mm; + rte e0 = { + .attrs = e->attrs, + .src = rt_get_source(&p->p, krt_metric(e)), + }; - krt_learn_alien_attr(p->p.main_channel, e); + krt_learn_alien_attr(p->p.main_channel, &e0); - for(mm=&n->routes; m = *mm; mm=&m->next) - if (krt_same_key(m, e)) - break; - if (m) - { - if (krt_uptodate(m, e)) - { - krt_trace_in_rl(&rl_alien, p, e, "[alien] seen"); - rte_free(e); - m->pflags |= KRT_REF_SEEN; - } - else - { - krt_trace_in(p, e, "[alien] updated"); - *mm = m->next; - rte_free(m); - m = NULL; - } - } - else - krt_trace_in(p, e, "[alien] created"); - if (!m) - { - e->next = n->routes; - n->routes = e; - e->pflags |= KRT_REF_SEEN; - } -} - -static void -krt_learn_prune(struct krt_proto *p) -{ - struct fib *fib = &p->krt_table->fib; - struct fib_iterator fit; - - KRT_TRACE(p, D_EVENTS, "Pruning inherited routes"); - - FIB_ITERATE_INIT(&fit, fib); -again: - FIB_ITERATE_START(fib, &fit, net, n) - { - rte *e, **ee, *best, **pbest, *old_best; - - /* - * Note that old_best may be NULL even if there was an old best route in - * the previous step, because it might be replaced in krt_learn_scan(). - * But in that case there is a new valid best route. - */ - - old_best = NULL; - best = NULL; - pbest = NULL; - ee = &n->routes; - while (e = *ee) - { - if (e->pflags & KRT_REF_BEST) - old_best = e; - - if (!(e->pflags & KRT_REF_SEEN)) - { - *ee = e->next; - rte_free(e); - continue; - } - - if (!best || krt_metric(best) > krt_metric(e)) - { - best = e; - pbest = ee; - } - - e->pflags &= ~(KRT_REF_SEEN | KRT_REF_BEST); - ee = &e->next; - } - if (!n->routes) - { - DBG("%I/%d: deleting\n", n->n.prefix, n->n.pxlen); - if (old_best) - krt_learn_announce_delete(p, n); - - FIB_ITERATE_PUT(&fit); - fib_delete(fib, n); - goto again; - } - - best->pflags |= KRT_REF_BEST; - *pbest = best->next; - best->next = n->routes; - n->routes = best; - - if ((best != old_best) || p->reload) - { - DBG("%I/%d: announcing (metric=%d)\n", n->n.prefix, n->n.pxlen, krt_metric(best)); - krt_learn_announce_update(p, best); - } - else - DBG("%I/%d: uptodate (metric=%d)\n", n->n.prefix, n->n.pxlen, krt_metric(best)); - } - FIB_ITERATE_END; - - p->reload = 0; + rte_update(p->p.main_channel, e->net, &e0, e0.src); + rt_unlock_source(e0.src); } static void krt_learn_async(struct krt_proto *p, rte *e, int new) { - net *n0 = e->net; - net *n = net_get(p->krt_table, n0->n.addr); - rte *g, **gg, *best, **bestp, *old_best; - - krt_learn_alien_attr(p->p.main_channel, e); - - old_best = n->routes; - for(gg=&n->routes; g = *gg; gg = &g->next) - if (krt_same_key(g, e)) - break; if (new) - { - if (g) - { - if (krt_uptodate(g, e)) - { - krt_trace_in(p, e, "[alien async] same"); - rte_free(e); - return; - } - krt_trace_in(p, e, "[alien async] updated"); - *gg = g->next; - rte_free(g); - } - else - krt_trace_in(p, e, "[alien async] created"); + return krt_learn_scan(p, e); - e->next = n->routes; - n->routes = e; - } - else if (!g) - { - krt_trace_in(p, e, "[alien async] delete failed"); - rte_free(e); - return; - } - else - { - krt_trace_in(p, e, "[alien async] removed"); - *gg = g->next; - rte_free(e); - rte_free(g); - } - best = n->routes; - bestp = &n->routes; - for(gg=&n->routes; g=*gg; gg=&g->next) - { - if (krt_metric(best) > krt_metric(g)) - { - best = g; - bestp = gg; - } - - g->pflags &= ~KRT_REF_BEST; - } - - if (best) - { - best->pflags |= KRT_REF_BEST; - *bestp = best->next; - best->next = n->routes; - n->routes = best; - } - - if (best != old_best) - { - DBG("krt_learn_async: distributing change\n"); - if (best) - krt_learn_announce_update(p, best); - else - krt_learn_announce_delete(p, n); - } -} - -static void -krt_learn_init(struct krt_proto *p) -{ - if (KRT_CF->learn) - { - struct rtable_config *cf = mb_allocz(p->p.pool, sizeof(struct rtable_config)); - cf->name = "Inherited"; - cf->addr_type = p->p.net_type; - cf->internal = 1; - - p->krt_table = rt_setup(p->p.pool, cf); - } -} - -static void -krt_dump(struct proto *P) -{ - struct krt_proto *p = (struct krt_proto *) P; - - if (!KRT_CF->learn) - return; - debug("KRT: Table of inheritable routes\n"); - rt_dump(p->krt_table); + struct rte_src *src = rt_get_source(&p->p, krt_metric(e)); + rte_update(p->p.main_channel, e->net, NULL, src); + rt_unlock_source(src); } #endif @@ -551,40 +341,60 @@ krt_dump(struct proto *P) static inline int krt_is_installed(struct krt_proto *p, net *n) { - return n->routes && bmap_test(&p->p.main_channel->export_map, n->routes->id); + return n->routes && bmap_test(&p->p.main_channel->export_map, n->routes->rte.id); } -static void -krt_flush_routes(struct krt_proto *p) +static uint +rte_feed_count(net *n) { - struct rtable *t = p->p.main_channel->table; + uint count = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + if (rte_is_valid(RTE_OR_NULL(e))) + count++; + return count; +} - KRT_TRACE(p, D_EVENTS, "Flushing kernel routes"); - FIB_WALK(&t->fib, net, n) +static void +rte_feed_obtain(net *n, const rte **feed, uint count) +{ + uint i = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + if (rte_is_valid(RTE_OR_NULL(e))) { - if (krt_is_installed(p, n)) - { - /* FIXME: this does not work if gw is changed in export filter */ - krt_replace_rte(p, n, NULL, n->routes); - } + ASSERT_DIE(i < count); + feed[i++] = &e->rte; } - FIB_WALK_END; + ASSERT_DIE(i == count); } static struct rte * -krt_export_net(struct krt_proto *p, net *net, rte **rt_free) +krt_export_net(struct krt_proto *p, net *net) { + /* FIXME: Here we are calling filters in table-locked context when exporting + * to kernel. Here BIRD can crash if the user requested ROA check in kernel + * export filter. It doesn't make much sense to write the filters like this, + * therefore we may keep this unfinished piece of work here for later as it + * won't really affect anybody. */ + ASSERT_DIE(RT_IS_LOCKED(p->p.main_channel->table)); + struct channel *c = p->p.main_channel; const struct filter *filter = c->out_filter; - rte *rt; if (c->ra_mode == RA_MERGED) - return rt_export_merged(c, net, rt_free, krt_filter_lp, 1); + { + uint count = rte_feed_count(net); + if (!count) + return NULL; + + const rte **feed = alloca(count * sizeof(rte *)); + rte_feed_obtain(net, feed, count); + return rt_export_merged(c, feed, count, krt_filter_lp, 1); + } - rt = net->routes; - *rt_free = NULL; + static _Thread_local rte rt; + rt = net->routes->rte; - if (!rte_is_valid(rt)) + if (!rte_is_valid(&rt)) return NULL; if (filter == FILTER_REJECT) @@ -595,33 +405,26 @@ krt_export_net(struct krt_proto *p, net *net, rte **rt_free) if (filter == FILTER_ACCEPT) goto accept; - if (f_run(filter, &rt, krt_filter_lp, FF_SILENT) > F_ACCEPT) + if (f_run(filter, &rt, FF_SILENT) > F_ACCEPT) goto reject; accept: - if (rt != net->routes) - *rt_free = rt; - return rt; + return &rt; reject: - if (rt != net->routes) - rte_free(rt); return NULL; } static int krt_same_dest(rte *k, rte *e) { - rta *ka = k->attrs, *ea = e->attrs; + ea_list *ka = k->attrs, *ea = e->attrs; - if (ka->dest != ea->dest) - return 0; - - if (ka->dest == RTD_UNICAST) - return nexthop_same(&(ka->nh), &(ea->nh)); + eattr *nhea_k = ea_find(ka, &ea_gen_nexthop); + eattr *nhea_e = ea_find(ea, &ea_gen_nexthop); - return 1; + return (!nhea_k == !nhea_e) && adata_same(nhea_k->u.ptr, nhea_e->u.ptr); } /* @@ -632,41 +435,48 @@ krt_same_dest(rte *k, rte *e) void krt_got_route(struct krt_proto *p, rte *e, s8 src) { - rte *new = NULL, *rt_free = NULL; - net *n = e->net; + rte *new = NULL; e->pflags = 0; #ifdef KRT_ALLOW_LEARN switch (src) { case KRT_SRC_KERNEL: - goto ignore; + krt_trace_in(p, e, "ignored"); + return; case KRT_SRC_REDIRECT: - goto delete; + krt_trace_in(p, e, "deleting"); + krt_replace_rte(p, e->net, NULL, e); + return; case KRT_SRC_ALIEN: if (KRT_CF->learn) krt_learn_scan(p, e); else - { - krt_trace_in_rl(&rl_alien, p, e, "[alien] ignored"); - rte_free(e); - } + krt_trace_in_rl(&rl_alien, p, e, "[alien] ignored"); return; } #endif + /* The rest is for KRT_SRC_BIRD (or KRT_SRC_UNKNOWN) */ + RT_LOCKED(p->p.main_channel->table, tab) + { + + /* Deleting all routes if flush is requested */ + if (p->flush_routes) + goto delete; /* We wait for the initial feed to have correct installed state */ if (!p->ready) goto ignore; - if (!krt_is_installed(p, n)) + net *net = net_find(tab, e->net); + if (!net || !krt_is_installed(p, net)) goto delete; - new = krt_export_net(p, n, &rt_free); + new = krt_export_net(p, net); /* Rejected by filters */ if (!new) @@ -699,19 +509,16 @@ ignore: update: krt_trace_in(p, new, "updating"); - krt_replace_rte(p, n, new, e); + krt_replace_rte(p, e->net, new, e); goto done; delete: krt_trace_in(p, e, "deleting"); - krt_replace_rte(p, n, NULL, e); + krt_replace_rte(p, e->net, NULL, e); goto done; -done: - rte_free(e); - - if (rt_free) - rte_free(rt_free); +done:; + } lp_flush(krt_filter_lp); } @@ -725,43 +532,47 @@ krt_init_scan(struct krt_proto *p) static void krt_prune(struct krt_proto *p) { - struct rtable *t = p->p.main_channel->table; + RT_LOCKED(p->p.main_channel->table, t) + { KRT_TRACE(p, D_EVENTS, "Pruning table %s", t->name); FIB_WALK(&t->fib, net, n) { - if (p->ready && krt_is_installed(p, n) && !bmap_test(&p->seen_map, n->routes->id)) + if (p->ready && krt_is_installed(p, n) && !bmap_test(&p->seen_map, n->routes->rte.id)) { - rte *rt_free = NULL; - rte *new = krt_export_net(p, n, &rt_free); + rte *new = krt_export_net(p, n); if (new) { krt_trace_in(p, new, "installing"); - krt_replace_rte(p, n, new, NULL); + krt_replace_rte(p, n->n.addr, new, NULL); } - if (rt_free) - rte_free(rt_free); - lp_flush(krt_filter_lp); } } FIB_WALK_END; -#ifdef KRT_ALLOW_LEARN - if (KRT_CF->learn) - krt_learn_prune(p); -#endif - if (p->ready) p->initialized = 1; + + } +} + +static void +krt_flush_routes(struct krt_proto *p) +{ + KRT_TRACE(p, D_EVENTS, "Flushing kernel routes"); + p->flush_routes = 1; + krt_init_scan(p); + krt_do_scan(p); + /* No prune! */ + p->flush_routes = 0; } void krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src) { - net *net = e->net; e->pflags = 0; switch (src) @@ -774,7 +585,7 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src) if (new) { krt_trace_in(p, e, "[redirect] deleting"); - krt_replace_rte(p, net, NULL, e); + krt_replace_rte(p, e->net, NULL, e); } /* If !new, it is probably echo of our deletion */ break; @@ -788,7 +599,6 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src) } #endif } - rte_free(e); } @@ -910,9 +720,12 @@ krt_scan_timer_kick(struct krt_proto *p) static int krt_preexport(struct channel *C, rte *e) { - // struct krt_proto *p = (struct krt_proto *) P; - if (e->src->proto == C->proto) + if (e->src->owner == &C->proto->sources) +#ifdef CONFIG_SINGLE_ROUTE + return 1; +#else return -1; +#endif if (!krt_capable(e)) return -1; @@ -921,8 +734,8 @@ krt_preexport(struct channel *C, rte *e) } static void -krt_rt_notify(struct proto *P, struct channel *ch UNUSED, net *net, - rte *new, rte *old) +krt_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net, + rte *new, const rte *old) { struct krt_proto *p = (struct krt_proto *) P; @@ -930,14 +743,8 @@ krt_rt_notify(struct proto *P, struct channel *ch UNUSED, net *net, return; #ifdef CONFIG_SINGLE_ROUTE - /* - * Implicit withdraw - when the imported kernel route becomes the best one, - * we know that the previous one exported to the kernel was already removed, - * but if we processed the update as usual, we would send withdraw to the - * kernel, which would remove the new imported route instead. - */ - rte *best = net->routes; - if (!new && best && (best->src->proto == P)) + /* Got the same route as we imported. Keep it, do nothing. */ + if (new && new->src->owner == &P->sources) return; #endif @@ -985,6 +792,14 @@ krt_feed_end(struct channel *C) krt_scan_timer_kick(p); } +static int +krt_rte_better(const rte *new, const rte *old) +{ + u32 n = ea_get_int(new->attrs, &ea_krt_metric, IGP_METRIC_UNKNOWN); + u32 o = ea_get_int(old->attrs, &ea_krt_metric, IGP_METRIC_UNKNOWN); + + return (n < o); +} /* * Protocol glue @@ -1026,6 +841,10 @@ krt_postconfig(struct proto_config *CF) krt_sys_postconfig(cf); } +struct rte_owner_class krt_rte_owner_class = { + .rte_better = krt_rte_better, +}; + static struct proto * krt_init(struct proto_config *CF) { @@ -1036,10 +855,12 @@ krt_init(struct proto_config *CF) p->p.preexport = krt_preexport; p->p.rt_notify = krt_rt_notify; - p->p.if_notify = krt_if_notify; + p->p.iface_sub.if_notify = krt_if_notify; p->p.reload_routes = krt_reload_routes; p->p.feed_end = krt_feed_end; + p->p.sources.class = &krt_rte_owner_class; + krt_sys_init(p); return &p->p; } @@ -1064,10 +885,6 @@ krt_start(struct proto *P) bmap_init(&p->seen_map, p->p.pool, 1024); add_tail(&krt_proto_list, &p->krt_node); -#ifdef KRT_ALLOW_LEARN - krt_learn_init(p); -#endif - if (!krt_sys_start(p)) { rem_node(&p->krt_node); @@ -1147,24 +964,15 @@ krt_copy_config(struct proto_config *dest, struct proto_config *src) krt_sys_copy_config(d, s); } -static int -krt_get_attr(const eattr *a, byte *buf, int buflen) -{ - switch (a->id) - { - case EA_KRT_SOURCE: - bsprintf(buf, "source"); - return GA_NAME; - - case EA_KRT_METRIC: - bsprintf(buf, "metric"); - return GA_NAME; - - default: - return krt_sys_get_attr(a, buf, buflen); - } -} +struct ea_class ea_krt_source = { + .name = "krt_source", + .type = T_INT, +}; +struct ea_class ea_krt_metric = { + .name = "krt_metric", + .type = T_INT, +}; #ifdef CONFIG_IP6_SADR_KERNEL #define MAYBE_IP6_SADR NB_IP6_SADR @@ -1181,7 +989,6 @@ krt_get_attr(const eattr *a, byte *buf, int buflen) struct protocol proto_unix_kernel = { .name = "Kernel", .template = "kernel%d", - .class = PROTOCOL_KERNEL, .preference = DEF_PREF_INHERITED, .channel_mask = NB_IP | MAYBE_IP6_SADR | MAYBE_MPLS, .proto_size = sizeof(struct krt_proto), @@ -1193,14 +1000,15 @@ struct protocol proto_unix_kernel = { .shutdown = krt_shutdown, .reconfigure = krt_reconfigure, .copy_config = krt_copy_config, - .get_attr = krt_get_attr, -#ifdef KRT_ALLOW_LEARN - .dump = krt_dump, -#endif }; void krt_build(void) { proto_build(&proto_unix_kernel); + + EA_REGISTER_ALL( + &ea_krt_source, + &ea_krt_metric, + ); } diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index 18a206e6..9f7ebb4f 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -21,8 +21,7 @@ struct kif_proto; #define KRT_DEFAULT_ECMP_LIMIT 16 -#define EA_KRT_SOURCE EA_CODE(PROTOCOL_KERNEL, 0) -#define EA_KRT_METRIC EA_CODE(PROTOCOL_KERNEL, 1) +extern struct ea_class ea_krt_source, ea_krt_metric; #define KRT_REF_SEEN 0x1 /* Seen in table */ #define KRT_REF_BEST 0x2 /* Best in table */ @@ -51,10 +50,6 @@ struct krt_proto { struct proto p; struct krt_state sys; /* Sysdep state */ -#ifdef KRT_ALLOW_LEARN - struct rtable *krt_table; /* Internal table of inherited routes */ -#endif - timer *scan_timer; struct bmap sync_map; /* Keeps track which exported routes were successfully written to kernel */ struct bmap seen_map; /* Routes seen during last periodic scan */ @@ -63,6 +58,7 @@ struct krt_proto { byte ready; /* Initial feed has been finished */ byte initialized; /* First scan has been finished */ byte reload; /* Next scan is doing reload */ + byte flush_routes; /* Scanning to flush */ }; extern pool *krt_pool; @@ -141,7 +137,7 @@ void krt_sys_copy_config(struct krt_config *, struct krt_config *); int krt_capable(rte *e); void krt_do_scan(struct krt_proto *); -void krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old); +void krt_replace_rte(struct krt_proto *p, const net_addr *n, rte *new, const rte *old); int krt_sys_get_attr(const eattr *a, byte *buf, int buflen); diff --git a/sysdep/unix/log.c b/sysdep/unix/log.c index 53122aee..be7f8adf 100644 --- a/sysdep/unix/log.c +++ b/sysdep/unix/log.c @@ -15,6 +15,7 @@ * user's manual. */ +#include <stdatomic.h> #include <stdio.h> #include <stdlib.h> #include <stdarg.h> @@ -36,8 +37,10 @@ static FILE *dbgf; static list *current_log_list; static char *current_syslog_name; /* NULL -> syslog closed */ +static _Atomic uint max_thread_id = ATOMIC_VAR_INIT(1); +static _Thread_local uint this_thread_id; -#ifdef USE_PTHREADS +#define THIS_THREAD_ID (this_thread_id ?: (this_thread_id = atomic_fetch_add_explicit(&max_thread_id, 1, memory_order_acq_rel))) #include <pthread.h> @@ -49,15 +52,6 @@ static pthread_t main_thread; void main_thread_init(void) { main_thread = pthread_self(); } static int main_thread_self(void) { return pthread_equal(pthread_self(), main_thread); } -#else - -static inline void log_lock(void) { } -static inline void log_unlock(void) { } -void main_thread_init(void) { } -static int main_thread_self(void) { return 1; } - -#endif - #ifdef HAVE_SYSLOG_H #include <sys/syslog.h> @@ -190,7 +184,7 @@ log_commit(int class, buffer *buf) l->pos += msg_len; } - fprintf(l->fh, "%s <%s> ", tbuf, class_names[class]); + fprintf(l->fh, "%s [%04x] <%s> ", tbuf, THIS_THREAD_ID, class_names[class]); } fputs(buf->start, l->fh); fputc('\n', l->fh); @@ -300,6 +294,8 @@ die(const char *msg, ...) exit(1); } +static struct timespec dbg_time_start; + /** * debug - write to debug output * @msg: a printf-like message @@ -312,12 +308,34 @@ debug(const char *msg, ...) { #define MAX_DEBUG_BUFSIZE 16384 va_list args; - char buf[MAX_DEBUG_BUFSIZE]; + char buf[MAX_DEBUG_BUFSIZE], *pos = buf; + int max = MAX_DEBUG_BUFSIZE; va_start(args, msg); if (dbgf) { - if (bvsnprintf(buf, MAX_DEBUG_BUFSIZE, msg, args) < 0) +#if 0 + struct timespec dbg_time; + clock_gettime(CLOCK_MONOTONIC, &dbg_time); + uint nsec; + uint sec; + + if (dbg_time.tv_nsec > dbg_time_start.tv_nsec) + { + nsec = dbg_time.tv_nsec - dbg_time_start.tv_nsec; + sec = dbg_time.tv_sec - dbg_time_start.tv_sec; + } + else + { + nsec = 1000000000 + dbg_time.tv_nsec - dbg_time_start.tv_nsec; + sec = dbg_time.tv_sec - dbg_time_start.tv_sec - 1; + } + + int n = bsnprintf(pos, max, "%u.%09u: [%04x] ", sec, nsec, THIS_THREAD_ID); + pos += n; + max -= n; +#endif + if (bvsnprintf(pos, max, msg, args) < 0) bug("Extremely long debug output, split it."); fputs(buf, dbgf); @@ -438,6 +456,8 @@ done: void log_init_debug(char *f) { + clock_gettime(CLOCK_MONOTONIC, &dbg_time_start); + dbg_fd = -1; if (dbgf && dbgf != stderr) fclose(dbgf); diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index 627d7a4a..79db32d3 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -28,9 +28,10 @@ #include "lib/resource.h" #include "lib/socket.h" #include "lib/event.h" +#include "lib/locking.h" #include "lib/timer.h" #include "lib/string.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "nest/cli.h" @@ -51,12 +52,12 @@ async_dump(void) { debug("INTERNAL STATE DUMP\n\n"); - rdump(&root_pool); + rdump(&root_pool, 0); sk_dump_all(); // XXXX tm_dump_all(); if_dump_all(); neigh_dump_all(); - rta_dump_all(); + ea_dump_all(); rt_dump_all(); protos_dump_all(); @@ -199,9 +200,11 @@ sysdep_preconfig(struct config *c) } int -sysdep_commit(struct config *new, struct config *old UNUSED) +sysdep_commit(struct config *new, struct config *old) { log_switch(0, &new->logfiles, new->syslog_name); + bird_thread_commit(new, old); + return 0; } @@ -401,7 +404,8 @@ static char *path_control_socket = PATH_CONTROL_SOCKET; static void cli_write(cli *c) { - sock *s = c->priv; + sock *s = c->sock; + ASSERT_DIE(c->sock); while (c->tx_pos) { @@ -425,7 +429,9 @@ cli_write(cli *c) void cli_write_trigger(cli *c) { - sock *s = c->priv; + sock *s = c->sock; + if (!s) + return; if (s->tbuf == NULL) cli_write(c); @@ -440,7 +446,8 @@ cli_tx(sock *s) int cli_get_command(cli *c) { - sock *s = c->priv; + sock *s = c->sock; + ASSERT_DIE(c->sock); byte *t = s->rbuf; byte *tend = s->rpos; byte *d = c->rx_pos; @@ -489,6 +496,7 @@ cli_err(sock *s, int err) else log(L_INFO "CLI connection closed"); } + cli_free(s->data); } @@ -534,7 +542,7 @@ cli_init_unix(uid_t use_uid, gid_t use_gid) /* Return value intentionally ignored */ unlink(path_control_socket); - if (sk_open_unix(s, path_control_socket) < 0) + if (sk_open_unix(s, &main_birdloop, path_control_socket) < 0) die("Cannot create control socket %s: %m", path_control_socket); if (use_uid || use_gid) @@ -885,16 +893,19 @@ main(int argc, char **argv) dmalloc_debug(0x2f03d00); #endif + times_update(); parse_args(argc, argv); log_switch(1, NULL, NULL); + the_bird_lock(); + random_init(); net_init(); resource_init(); - timer_init(); + birdloop_init(); olock_init(); - io_init(); rt_init(); + io_init(); if_init(); // roa_init(); config_init(); @@ -924,6 +935,8 @@ main(int argc, char **argv) if (parse_and_exit) exit(0); + flush_local_pages(); + if (!run_in_foreground) { pid_t pid = fork(); @@ -939,6 +952,7 @@ main(int argc, char **argv) dup2(0, 2); } + main_thread_init(); write_pid_file(); diff --git a/sysdep/unix/unix.h b/sysdep/unix/unix.h index ad85d1ea..606b79cd 100644 --- a/sysdep/unix/unix.h +++ b/sysdep/unix/unix.h @@ -9,6 +9,9 @@ #ifndef _BIRD_UNIX_H_ #define _BIRD_UNIX_H_ +#include "nest/bird.h" +#include "lib/io-loop.h" + #include <sys/socket.h> #include <signal.h> @@ -16,6 +19,7 @@ struct pool; struct iface; struct birdsock; struct rfile; +struct config; /* main.c */ @@ -32,6 +36,8 @@ void cmd_reconfig_undo(void); void cmd_reconfig_status(void); void cmd_shutdown(void); void cmd_graceful_restart(void); +void cmd_show_threads(int); +void bird_thread_commit(struct config *new, struct config *old); #define UNIX_DEFAULT_CONFIGURE_TIMEOUT 300 @@ -107,7 +113,7 @@ extern volatile sig_atomic_t async_shutdown_flag; void io_init(void); void io_loop(void); void io_log_dump(void); -int sk_open_unix(struct birdsock *s, char *name); +int sk_open_unix(struct birdsock *s, struct birdloop *, char *name); struct rfile *rf_open(struct pool *, const char *name, const char *mode); void *rf_file(struct rfile *f); int rf_fileno(struct rfile *f); diff --git a/test/birdtest.c b/test/birdtest.c index 3bf1fa77..cdca3829 100644 --- a/test/birdtest.c +++ b/test/birdtest.c @@ -21,6 +21,7 @@ #include "test/birdtest.h" #include "lib/string.h" #include "lib/event.h" +#include "lib/io-loop.h" #ifdef HAVE_EXECINFO_H #include <execinfo.h> @@ -64,6 +65,9 @@ bt_init(int argc, char *argv[]) { int c; + /* We have no interest in stdin */ + close(0); + initstate(BT_RANDOM_SEED, (char *) bt_random_state, sizeof(bt_random_state)); bt_verbose = 0; @@ -120,9 +124,11 @@ bt_init(int argc, char *argv[]) clock_gettime(CLOCK_MONOTONIC, &bt_begin); bt_suite_case_begin = bt_suite_begin = bt_begin; + the_bird_lock(); resource_init(); - ev_init_list(&global_event_list); - + ev_init_list(&global_event_list, &main_birdloop, "Global event list in unit tests"); + ev_init_list(&global_work_list, &main_birdloop, "Global work list in unit tests"); + birdloop_init(); return; usage: @@ -246,7 +252,7 @@ bt_log_result(int result, u64 time, const char *fmt, va_list argptr) printf("%s\n", result_str); if (do_die && !result) - abort(); + exit(1); } static u64 @@ -551,7 +557,14 @@ void cmd_reconfig_undo_notify(void) {} #include "lib/net.h" #include "conf/conf.h" void sysdep_preconfig(struct config *c UNUSED) {} -int sysdep_commit(struct config *new UNUSED, struct config *old UNUSED) { return 0; } + +void bird_thread_commit(struct config *new, struct config *old); +int sysdep_commit(struct config *new, struct config *old) +{ + bird_thread_commit(new, old); + return 0; +} + void sysdep_shutdown_done(void) {} #include "nest/cli.h" diff --git a/test/birdtest.h b/test/birdtest.h index ad5f8f9c..b8978b3e 100644 --- a/test/birdtest.h +++ b/test/birdtest.h @@ -40,7 +40,7 @@ static inline u64 bt_random(void) void bt_log_suite_result(int result, const char *fmt, ...); void bt_log_suite_case_result(int result, const char *fmt, ...); -#define BT_TIMEOUT 5 /* Default timeout in seconds */ +#define BT_TIMEOUT 20 /* Default timeout in seconds */ #define BT_FORKING 1 /* Forking is enabled in default */ #define BT_RANDOM_SEED 0x5097d2bb diff --git a/test/bt-utils.c b/test/bt-utils.c index 8496e185..d497840f 100644 --- a/test/bt-utils.c +++ b/test/bt-utils.c @@ -14,7 +14,7 @@ #include "test/bt-utils.h" #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "sysdep/unix/unix.h" @@ -53,6 +53,8 @@ cf_file_read(byte *dest, uint max_len, int fd) return l; } +void resource_sys_init(void); + void bt_bird_init(void) { @@ -61,23 +63,14 @@ bt_bird_init(void) log_switch(bt_verbose != 0, NULL, NULL); olock_init(); - timer_init(); - io_init(); rt_init(); + io_init(); if_init(); config_init(); protos_build(); } -void bt_bird_cleanup(void) -{ - for (int i = 0; i < PROTOCOL__MAX; i++) - class_to_protocol[i] = NULL; - - config = new_config = NULL; -} - static char * bt_load_file(const char *filename, int quiet) { diff --git a/test/bt-utils.h b/test/bt-utils.h index 13d267cc..758dbf48 100644 --- a/test/bt-utils.h +++ b/test/bt-utils.h @@ -28,7 +28,6 @@ uint bt_naive_pow(uint base, uint power); void bt_bytes_to_hex(char *buf, const byte *in_data, size_t size); void bt_bird_init(void); -void bt_bird_cleanup(void); struct config *bt_config_parse(const char *cfg); struct config *bt_config_file_parse(const char *filepath); |