diff options
121 files changed, 6517 insertions, 2985 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7809fecd..0a758cff 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -134,21 +134,11 @@ docker_fedora-34-amd64: IMG_NAME: "fedora-34-amd64" <<: *docker_build -docker_centos-7-amd64: - variables: - IMG_NAME: "centos-7-amd64" - <<: *docker_build - docker_centos-8-amd64: variables: IMG_NAME: "centos-8-amd64" <<: *docker_build -docker_ubuntu-14_04-amd64: - variables: - IMG_NAME: "ubuntu-14.04-amd64" - <<: *docker_build - docker_ubuntu-16_04-amd64: variables: IMG_NAME: "ubuntu-16.04-amd64" @@ -312,18 +302,10 @@ build-fedora-34-amd64: <<: *build-linux image: registry.nic.cz/labs/bird:fedora-33-amd64 -build-centos-7-amd64: - <<: *build-linux - image: registry.nic.cz/labs/bird:centos-7-amd64 - build-centos-8-amd64: <<: *build-linux image: registry.nic.cz/labs/bird:centos-8-amd64 -build-ubuntu-14_04-amd64: - <<: *build-linux - image: registry.nic.cz/labs/bird:ubuntu-14.04-amd64 - build-ubuntu-16_04-amd64: <<: *build-linux image: registry.nic.cz/labs/bird:ubuntu-16.04-amd64 @@ -466,14 +448,7 @@ pkg-fedora-33-amd64: pkg-fedora-34-amd64: <<: *pkg-rpm needs: [build-fedora-34-amd64] - image: registry.nic.cz/labs/bird:fedora-34-amd64 - -pkg-centos-7-amd64: - <<: *pkg-rpm-wa - variables: - LC_ALL: en_US.UTF-8 - needs: [build-centos-7-amd64] - image: registry.nic.cz/labs/bird:centos-7-amd64 + image: registry.labs.nic.cz/labs/bird:fedora-34-amd64 pkg-centos-8-amd64: <<: *pkg-rpm-wa @@ -27,9 +27,9 @@ Requirements For compiling BIRD you need these programs and libraries: - - GNU C Compiler (or LLVM Clang) + - GNU C Compiler (or LLVM Clang) capable of compiling C11 code - GNU Make - - GNU Bison + - GNU Bison (at least 3.0) - GNU M4 - Flex diff --git a/Makefile.in b/Makefile.in index 0d55807b..fa534872 100644 --- a/Makefile.in +++ b/Makefile.in @@ -26,6 +26,7 @@ INSTALL_DATA=@INSTALL_DATA@ client=$(addprefix $(exedir)/,@CLIENT@) daemon=$(exedir)/bird protocols=@protocols@ +PROTO_BUILD := $(protocols) dev kif krt prefix=@prefix@ exec_prefix=@exec_prefix@ @@ -82,9 +83,6 @@ conf-lex-targets := $(addprefix $(objdir)/conf/,cf-lex.o) conf-y-targets := $(addprefix $(objdir)/conf/,cf-parse.y keywords.h commands.h) cf-local = $(conf-y-targets): $(s)config.Y -# nest/Makefile declarations needed for all other modules -proto-build-c := $(addprefix $(objdir)/nest/,proto-build.c) - src-o-files = $(patsubst %.c,$(o)%.o,$(src)) tests-target-files = $(patsubst %.c,$(o)%,$(tests_src)) @@ -98,13 +96,6 @@ else o = $(patsubst $(srcdir)%,$(objdir)%,$(s)) endif -define proto-build_in = -PROTO_BUILD += $(1) -$(proto-build-c): $(lastword $(MAKEFILE_LIST)) -endef - -proto-build = $(eval $(call proto-build_in,$(1))) - define clean_in = clean:: rm -f $(addprefix $(o),$(1)) @@ -1,3 +1,11 @@ +Version 2.0.10 (2022-06-16) + o BGP performance improvements + o BFD: New 'strict bind' option + o RPKI: VRF support + o Allow use of 240.0.0.0/4 as a private range + o BIRD client uses exit status to report errors + o Important bugfixes + Version 2.0.9 (2022-02-09) o BGP: Flowspec validation procedure o Babel: MAC authentication support @@ -1,5 +1,6 @@ dnl ** Additional Autoconf tests for BIRD configure script dnl ** (c) 1999 Martin Mares <mj@ucw.cz> +dnl ** (c) 2021 Maria Matejka <mq@jmq.cz> AC_DEFUN([BIRD_CHECK_POINTER_ALIGNMENT], [ @@ -35,14 +36,23 @@ AC_DEFUN([BIRD_CHECK_THREAD_LOCAL], AC_COMPILE_IFELSE([ AC_LANG_PROGRAM( [ - _Thread_local static int x = 42; + static _Thread_local int x = 42; ], [] ) ], [bird_cv_thread_local=yes], + [AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM( + [ + static __thread int x = 42; + ], + [] + ) + ], + [bird_cv_thread_local=__thread], [bird_cv_thread_local=no] - ) + )]) ) ]) diff --git a/bird-gdb.py b/bird-gdb.py index 62c0ec87..077703f9 100644 --- a/bird-gdb.py +++ b/bird-gdb.py @@ -122,7 +122,7 @@ class BIRDFLinePrinter(BIRDPrinter): "n": n, "code": str(self.val['items'][n]['fi_code']), } if n % 8 == 0 else str(self.val['items'][n]['fi_code']) for n in range(cnt)])) - + class BIRDFExecStackPrinter(BIRDPrinter): "Print BIRD's struct f_exec_stack" @@ -140,6 +140,118 @@ class BIRDFExecStackPrinter(BIRDPrinter): "n": n } for n in range(cnt-1, -1, -1) ]) + +class BIRD: + def skip_back(t, i, v): + if isinstance(t, str): + t = gdb.lookup_type(t) + elif isinstance(t, gdb.Value): + t = gdb.lookup_type(t.string()) + elif not isinstance(t, gdb.Type): + raise Exception(f"First argument of skip_back(t, i, v) must be a type, got {type(t)}") + + t = t.strip_typedefs() + nullptr = gdb.Value(0).cast(t.pointer()) + + if isinstance(i, gdb.Value): + i = i.string() + elif not isinstance(i, str): + raise Exception(f"Second argument of skip_back(t, i, v) must be a item name, got {type(i)}") + + if not isinstance(v, gdb.Value): + raise Exception(f"Third argument of skip_back(t, i, v) must be a value, got {type(v)}") + if v.type.code != gdb.TYPE_CODE_PTR and v.type.code != gdb.TYPE_CODE_REF: + raise Exception(f"Third argument of skip_back(t, i, v) must be a pointer, is {v.type} ({v.type.code})") + if v.type.target().strip_typedefs() != nullptr[i].type: + raise Exception(f"Third argument of skip_back(t, i, v) points to type {v.type.target().strip_typedefs()}, should be {nullptr[i].type}") + + uintptr_t = gdb.lookup_type("uintptr_t") + taddr = v.dereference().address.cast(uintptr_t) - nullptr[i].address.cast(uintptr_t) + return gdb.Value(taddr).cast(t.pointer()) + + class skip_back_gdb(gdb.Function): + "Given address of a structure item, returns address of the structure, as the SKIP_BACK macro does" + def __init__(self): + gdb.Function.__init__(self, "SKIP_BACK") + + def invoke(self, t, i, v): + return BIRD.skip_back(t, i, v) + + +BIRD.skip_back_gdb() + + +class BIRDList: + def __init__(self, val): + ltype = val.type.strip_typedefs() + if ltype.code != gdb.TYPE_CODE_UNION or ltype.tag != "list": + raise Exception(f"Not a list, is type {ltype}") + + self.head = val["head"] + self.tail_node = val["tail_node"] + + if str(self.head.address) == '0x0': + raise Exception("List head is NULL") + + if str(self.tail_node["prev"].address) == '0x0': + raise Exception("List tail is NULL") + + def walk(self, do): + cur = self.head + while cur.dereference() != self.tail_node: + do(cur) + cur = cur.dereference()["next"] + + +class BIRDListLength(gdb.Function): + """Returns length of the list, as in + print $list_length(routing_tables)""" + def __init__(self): + super(BIRDListLength, self).__init__("list_length") + + def count(self, _): + self.cnt += 1 + + def invoke(self, l): + self.cnt = 0 + BIRDList(l).walk(self.count) + return self.cnt + +BIRDListLength() + +class BIRDListItem(gdb.Function): + """Returns n-th item of the list.""" + def __init__(self): + super(BIRDListItem, self).__init__("list_item") + + class BLException(Exception): + def __init__(self, node, msg): + Exception.__init__(self, msg) + self.node = node + + def count(self, node): + if self.cnt == self.pos: + raise self.BLException(node, "Node found") + + self.cnt += 1 + + def invoke(self, l, n, t=None, item="n"): + self.cnt = 0 + self.pos = n + bl = BIRDList(l) + try: + bl.walk(self.count) + except self.BLException as e: + if t is None: + return e.node + else: + return BIRD.skip_back(t, item, e.node) + + raise Exception("List too short") + +BIRDListItem() + + def register_printers(objfile): objfile.pretty_printers.append(BIRDFInstPrinter.lookup) objfile.pretty_printers.append(BIRDFValPrinter.lookup) diff --git a/conf/cf-lex.l b/conf/cf-lex.l index 11bcdb18..04e0b3a5 100644 --- a/conf/cf-lex.l +++ b/conf/cf-lex.l @@ -577,6 +577,8 @@ check_eof(void) return 0; } +static inline void cf_swap_soft_scope(void); + static struct symbol * cf_new_symbol(const byte *c) { @@ -586,6 +588,8 @@ cf_new_symbol(const byte *c) if (l > SYM_MAX_LEN) cf_error("Symbol too long"); + cf_swap_soft_scope(); + s = cfg_allocz(sizeof(struct symbol) + l + 1); *s = (struct symbol) { .scope = conf_this_scope, .class = SYM_VOID, }; strcpy(s->name, c); @@ -676,8 +680,8 @@ cf_localize_symbol(struct symbol *sym) return sym; /* If the scope is the current, it is already defined in this scope. */ - if (sym->scope == conf_this_scope) - cf_error("Symbol already defined"); + if (cf_symbol_is_local(sym)) + cf_error("Symbol '%s' already defined", sym->name); /* Not allocated here yet, doing it now. */ return cf_new_symbol(sym->name); @@ -839,6 +843,8 @@ cf_push_scope(struct symbol *sym) void cf_pop_scope(void) { + ASSERT(!conf_this_scope->soft_scopes); + conf_this_scope->active = 0; conf_this_scope = conf_this_scope->next; @@ -846,6 +852,52 @@ cf_pop_scope(void) } /** + * cf_push_soft_scope - enter new soft scope + * + * If we want to enter a new anonymous scope that most likely will not contain + * any symbols, we can use cf_push_soft_scope() insteas of cf_push_scope(). + * Such scope will be converted to a regular scope on first use. + */ +void +cf_push_soft_scope(void) +{ + if (conf_this_scope->soft_scopes < 0xfe) + conf_this_scope->soft_scopes++; + else + cf_push_scope(NULL); +} + +/** + * cf_pop_soft_scope - leave a soft scope + * + * Leave a soft scope entered by cf_push_soft_scope(). + */ +void +cf_pop_soft_scope(void) +{ + if (conf_this_scope->soft_scopes) + conf_this_scope->soft_scopes--; + else + cf_pop_scope(); +} + +/** + * cf_swap_soft_scope - convert soft scope to regular scope + * + * Soft scopes cannot hold symbols, so they must be converted to regular scopes + * on first use. It is done automatically by cf_new_symbol(). + */ +static inline void +cf_swap_soft_scope(void) +{ + if (conf_this_scope->soft_scopes) + { + conf_this_scope->soft_scopes--; + cf_push_scope(NULL); + } +} + +/** * cf_symbol_class_name - get name of a symbol class * @sym: symbol * diff --git a/conf/conf.c b/conf/conf.c index 580a6472..17424402 100644 --- a/conf/conf.c +++ b/conf/conf.c @@ -140,6 +140,7 @@ config_parse(struct config *c) protos_preconfig(c); rt_preconfig(c); cf_parse(); + rt_postconfig(c); if (EMPTY_LIST(c->protos)) cf_error("No protocol is specified in the config file"); diff --git a/conf/conf.h b/conf/conf.h index 4ccaa54e..ffefa519 100644 --- a/conf/conf.h +++ b/conf/conf.h @@ -27,7 +27,7 @@ struct config { int mrtdump_file; /* Configured MRTDump file (sysdep, fd in unix) */ const char *syslog_name; /* Name used for syslog (NULL -> no syslog) */ - struct rtable_config *def_tables[NET_MAX]; /* Default routing tables for each network */ + struct symbol *def_tables[NET_MAX]; /* Default routing tables for each network */ struct iface_patt *router_id_from; /* Configured list of router ID iface patterns */ u32 router_id; /* Our Router ID */ @@ -44,7 +44,7 @@ struct config { int cli_debug; /* Tracing of CLI connections and commands */ int latency_debug; /* I/O loop tracks duration of each event */ - int pipe_debug; /* Track route propagation through pipes */ + int table_debug; /* Track route propagation through tables */ u32 latency_limit; /* Events with longer duration are logged (us) */ u32 watchdog_warning; /* I/O loop watchdog limit for warning (us) */ u32 watchdog_timeout; /* Watchdog timeout (in seconds, 0 = disabled) */ @@ -136,7 +136,8 @@ struct sym_scope { HASH(struct symbol) hash; /* Local symbol hash */ uint slots; /* Variable slots */ - int active; /* Currently entered */ + byte active; /* Currently entered */ + byte soft_scopes; /* Number of soft scopes above */ }; extern struct sym_scope *global_root_scope; @@ -202,6 +203,9 @@ struct symbol *cf_get_symbol(const byte *c); struct symbol *cf_default_name(char *template, int *counter); struct symbol *cf_localize_symbol(struct symbol *sym); +static inline int cf_symbol_is_local(struct symbol *sym) +{ return (sym->scope == conf_this_scope) && !conf_this_scope->soft_scopes; } + /** * cf_define_symbol - define meaning of a symbol * @sym: symbol to be defined @@ -225,6 +229,9 @@ struct symbol *cf_localize_symbol(struct symbol *sym); void cf_push_scope(struct symbol *); void cf_pop_scope(void); +void cf_push_soft_scope(void); +void cf_pop_soft_scope(void); + char *cf_symbol_class_name(struct symbol *sym); /* Parser */ diff --git a/conf/confbase.Y b/conf/confbase.Y index 58890cd6..8e5da9e3 100644 --- a/conf/confbase.Y +++ b/conf/confbase.Y @@ -14,6 +14,7 @@ CF_HDR #include "conf/conf.h" #include "lib/resource.h" #include "lib/socket.h" +#include "lib/settle.h" #include "lib/timer.h" #include "lib/string.h" #include "nest/protocol.h" @@ -76,6 +77,7 @@ CF_DECLS struct f_attr_bit fab; struct f_lval flv; struct f_line *fl; + struct f_arg *fa; const struct filter *f; struct f_tree *e; struct f_trie *trie; @@ -92,6 +94,7 @@ CF_DECLS struct proto_spec ps; struct channel_limit cl; struct timeformat *tf; + struct settle_config settle; struct adata *ad; struct bytestring *bs; } @@ -110,8 +113,9 @@ CF_DECLS %type <i> expr bool pxlen4 %type <time> expr_us time +%type <settle> settle %type <a> ipa -%type <net> net_ip4_ net_ip6_ net_ip6 net_ip_ net_ip net_or_ipa +%type <net> net_ip4_ net_ip4 net_ip6_ net_ip6 net_ip_ net_ip net_or_ipa %type <net_ptr> net_ net_any net_vpn4_ net_vpn6_ net_vpn_ net_roa4_ net_roa6_ net_roa_ net_ip6_sadr_ net_mpls_ %type <ad> label_stack_start label_stack @@ -154,14 +158,14 @@ conf: definition ; definition: DEFINE symbol '=' term ';' { struct f_val val; - if (f_eval(f_linearize($4), &val) > F_RETURN) cf_error("Runtime error"); + if (f_eval(f_linearize($4, 1), &val) > F_RETURN) cf_error("Runtime error"); cf_define_symbol($2, SYM_CONSTANT | val.type, val, lp_val_copy(cfg_mem, &val)); } ; expr: NUM - | '(' term ')' { $$ = f_eval_int(f_linearize($2)); } + | '(' term ')' { $$ = f_eval_int(f_linearize($2, 1)); } | symbol_known { if ($1->class != (SYM_CONSTANT | T_INT)) cf_error("Number constant expected"); $$ = SYM_VAL($1).i; } @@ -307,6 +311,15 @@ net_: /* Networks - regular */ +net_ip4: + net_ip4_ + | CF_SYM_KNOWN { + if (($1->class != (SYM_CONSTANT | T_NET)) || (SYM_VAL($1).net->type != NET_IP4)) + cf_error("IPv4 network constant expected"); + $$ = * SYM_VAL($1).net; + } + ; + net_ip6: net_ip6_ | CF_SYM_KNOWN { @@ -376,6 +389,13 @@ time: } ; +/* Settle timer configuration */ +settle: expr_us expr_us { + if ($1 > $2) cf_error("Minimum settle time %t is bigger than maximum settle time %t", $1, $2); + $$.min = $1; + $$.max = $2; +}; + text: TEXT | CF_SYM_KNOWN { diff --git a/conf/flowspec.Y b/conf/flowspec.Y index 56a7c5dc..dbdbdda5 100644 --- a/conf/flowspec.Y +++ b/conf/flowspec.Y @@ -142,7 +142,7 @@ flow_frag_opts: ; flow4_item: - flow_srcdst net_ip { + flow_srcdst net_ip4 { flow_builder_set_type(this_flow, $1); flow_builder4_add_pfx(this_flow, (net_addr_ip4 *) &($2)); } diff --git a/configure.ac b/configure.ac index 321bed95..330add87 100644 --- a/configure.ac +++ b/configure.ac @@ -36,12 +36,6 @@ AC_ARG_ENABLE([memcheck], [enable_memcheck=yes] ) -AC_ARG_ENABLE([pthreads], - [AS_HELP_STRING([--enable-pthreads], [enable POSIX threads support @<:@try@:>@])], - [], - [enable_pthreads=try] -) - AC_ARG_ENABLE([libssh], [AS_HELP_STRING([--enable-libssh], [enable LibSSH support in RPKI @<:@try@:>@])], [], @@ -125,25 +119,19 @@ if test -z "$GCC" ; then fi BIRD_CHECK_THREAD_LOCAL -if test "$bird_cv_thread_local" = yes ; then - AC_DEFINE([HAVE_THREAD_LOCAL], [1], [Define to 1 if _Thread_local is available]) +if test "$bird_cv_thread_local" = no ; then + AC_MSG_ERROR([This program requires thread local storage.]) +elif test "$bird_cv_thread_local" != yes ; then + AC_DEFINE_UNQUOTED([_Thread_local], [$bird_cv_thread_local], [Legacy _Thread_local]) fi -if test "$enable_pthreads" != no ; then - BIRD_CHECK_PTHREADS +BIRD_CHECK_PTHREADS - if test "$bird_cv_lib_pthreads" = yes ; then - AC_DEFINE([USE_PTHREADS], [1], [Define to 1 if pthreads are enabled]) - CFLAGS="$CFLAGS -pthread" - LDFLAGS="$LDFLAGS -pthread" - proto_bfd=bfd - elif test "$enable_pthreads" = yes ; then - AC_MSG_ERROR([POSIX threads not available.]) - fi - - if test "$enable_pthreads" = try ; then - enable_pthreads="$bird_cv_lib_pthreads" - fi +if test "$bird_cv_lib_pthreads" = yes ; then + CFLAGS="$CFLAGS -pthread" + LDFLAGS="$LDFLAGS -pthread" +else + AC_MSG_ERROR([POSIX threads not available.]) fi # This is assumed to be necessary for proper BIRD build @@ -304,8 +292,7 @@ if test "$enable_mpls_kernel" != no ; then fi fi -all_protocols="$proto_bfd babel bgp mrt ospf perf pipe radv rip rpki static" - +all_protocols="bfd babel bgp mrt ospf perf pipe radv rip rpki static" all_protocols=`echo $all_protocols | sed 's/ /,/g'` if test "$with_protocols" = all ; then @@ -350,10 +337,16 @@ case $sysdesc in ;; esac -AC_CHECK_HEADERS_ONCE([alloca.h syslog.h]) -AC_CHECK_HEADER([sys/mman.h], [AC_DEFINE([HAVE_MMAP], [1], [Define to 1 if mmap() is available.])]) +AC_CHECK_HEADERS_ONCE([alloca.h syslog.h stdatomic.h]) +AC_CHECK_HEADER([sys/mman.h], [AC_DEFINE([HAVE_MMAP], [1], [Define to 1 if mmap() is available.])], have_mman=no) +AC_CHECK_FUNC([aligned_alloc], [AC_DEFINE([HAVE_ALIGNED_ALLOC], [1], [Define to 1 if aligned_alloc() is available.])], have_aligned_alloc=no) AC_CHECK_MEMBERS([struct sockaddr.sa_len], [], [], [#include <sys/socket.h>]) +if test "$have_aligned_alloc" = "no" && test "$have_mman" = "no" ; then + AC_MSG_ERROR([No means of aligned alloc found. Need mmap() or aligned_alloc().]) +fi + + AC_C_BIGENDIAN( [AC_DEFINE([CPU_BIG_ENDIAN], [1], [Define to 1 if cpu is big endian])], [AC_DEFINE([CPU_LITTLE_ENDIAN], [1], [Define to 1 if cpu is little endian])], @@ -409,7 +402,7 @@ if test "$enable_debug" = yes ; then fi fi - if test "enable_debug_expensive" = yes ; then + if test "$enable_debug_expensive" = yes ; then AC_DEFINE([ENABLE_EXPENSIVE_CHECKS], [1], [Define to 1 if you want to run expensive consistency checks.]) fi fi @@ -474,7 +467,6 @@ AC_MSG_RESULT([ Object directory: $objdir]) AC_MSG_RESULT([ Iproute2 directory: $iproutedir]) AC_MSG_RESULT([ System configuration: $sysdesc]) AC_MSG_RESULT([ Debugging: $enable_debug]) -AC_MSG_RESULT([ POSIX threads: $enable_pthreads]) AC_MSG_RESULT([ Routing protocols: $protocols]) AC_MSG_RESULT([ LibSSH support in RPKI: $enable_libssh]) AC_MSG_RESULT([ Kernel MPLS support: $enable_mpls_kernel]) diff --git a/doc/bird.sgml b/doc/bird.sgml index c9ce670b..0cfe19c4 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -505,6 +505,11 @@ include "tablename.conf";; See <ref id="channel-debug" name="debug"> in the channel section. Default: off. + <tag><label id="opt-debug-tables">debug tables all|off|{ states|routes|filters|events [, <m/.../] }</tag> + Set global defaults of table debugging options. + See <ref id="rtable-debug" name="debug"> in the table section. + Default: off. + <tag><label id="opt-debug-commands">debug commands <m/number/</tag> Control logging of client connections (0 for no logging, 1 for logging of connects and disconnects, 2 and higher for logging of all client @@ -670,20 +675,54 @@ to set options. disadvantage is that trie-enabled routing tables require more memory, which may be an issue especially in multi-table setups. Default: off. - <tag><label id="rtable-min-settle-time">min settle time <m/time/</tag> - Specify a minimum value of the settle time. When a ROA table changes, - automatic <ref id="proto-rpki-reload" name="RPKI reload"> may be - triggered, after a short settle time. Minimum settle time is a delay - from the last ROA table change to wait for more updates. Default: 1 s. - + <tag><label id="rtable-gc-threshold">gc threshold <m/number/</tag> + Specify a minimum amount of removed networks that triggers a garbage + collection (GC) cycle. Default: 1000. + + <tag><label id="rtable-gc-period">gc period <m/time/</tag> + Specify a period of time between consecutive GC cycles. When there is a + significant amount of route withdraws, GC cycles are executed repeatedly + with given period time (with some random factor). When there is just + small amount of changes, GC cycles are not executed. In extensive route + server setups, running GC on hundreds of full BGP routing tables can + take significant amount of time, therefore they should use higher GC + periods. Default: adaptive, based on number of routing tables in the + configuration. From 10 s (with <= 25 routing tables) up to 600 s (with + >= 1500 routing tables). + + <tag><label id="rtable-cork-threshold">cork threshold <m/number/ <m/number/</tag> + Too many pending exports may lead to memory bloating. In such cases, + BIRD tries to relieve the memory pressure by pausing some routines until + the queue sizes get low enough. This option allows the user to set the + thresholds; first value is the low threshold (when to resume), the + second one is the high threshold (when to pause). The higher is the + threshold, the more memory can get used. In most cases, the defaults + should work for you. Default: 128, 512. + + <tag><label id="rtable-export-settle-time">export settle time <m/time/ <m/time/</tag> + Minimum and maximum settle times, respectively, for export announcements. + When multiple routes are changing, this mechanism waits for the changes + to settle before waking up sleeping export threads but if the changes are coming + steadily, BIRD isn't waiting forever; at most the maximum time. + Default values: <cf/1 ms 100 ms/. You have to always provide both values. + + <tag><label id="rtable-route-refresh-export-settle-time">route refresh export settle time <m/time/ <m/time/</tag> + Minimum and maximum settle times, respectively, for export announcements + (the same as above), valid when any channel is currently doing a route refresh. + This serves a purpose of even more aggresive change bundling, knowing that there + is some active process generating changes in a fast pace. If you don't want + this feature, set this to the same values as <ref id="rtable-export-settle-time" name="export settle time">. + Default values: <cf/100 ms 3 s/. + + <tag><label id="rtable-debug">debug all|off|{ states|routes|events [, <m/.../] }</tag> + Set table debugging options. Each table can write some trace messages + into log with category <cf/trace/. You can request <cf/all/ trace messages + or select some types: <cf/states/ for table state changes and auxiliary + processes, <cf/routes/ for auxiliary route notifications (next hop update, + flowspec revalidation) and <cf/events/ for more detailed auxiliary routine + debug. See also <ref id="channel-debug" name="channel debugging option">. + Default: off. - <tag><label id="rtable-max-settle-time">max settle time <m/time/</tag> - Specify a maximum value of the settle time. When a ROA table changes, - automatic <ref id="proto-rpki-reload" name="RPKI reload"> may be - triggered, after a short settle time. Maximum settle time is an upper - limit to the settle time from the initial ROA table change even if - there are consecutive updates gradually renewing the settle time. - Default: 20 s. </descrip> @@ -941,6 +980,16 @@ inherited from templates can be updated by new definitions. <ref id="bgp-export-table" name="export table"> (for respective direction). Default: on. + <tag><label id="rtable-min-settle-time">roa settle time <m/time/ <m/time/</tag> + Minimum and maximum settle times, respectively, for ROA table changes. + The automatic reload is triggered after the minimum time after the last + ROA table change has been received but not later than the maximum time after + first unprocessed ROA table change. Therefore with default values, the + automatic reload happens 1 second after the ROA table stops updating, yet if it + were to be later than 20 seconds after the ROA table starts updating, + the automatic reload is triggered anyway. Default values: <cf/1 s 20 s/. + You have to always provide both values. + <tag><label id="proto-import-limit">import limit [<m/number/ | off ] [action warn | block | restart | disable]</tag> Specify an import route limit (a maximum number of routes imported from the protocol) and optionally the action to be taken when the limit is @@ -1247,8 +1296,8 @@ this: <code> filter not_too_far -int var; { + int var; if defined( rip_metric ) then var = rip_metric; else { @@ -1277,9 +1326,9 @@ local variables. Recursion is not allowed. Function definitions look like this: <code> function name () -int local_variable; { - local_variable = 5; + int local_variable; + int another_variable = 5; } function with_parameters (int parameter) @@ -1288,16 +1337,19 @@ function with_parameters (int parameter) } </code> -<p>Unlike in C, variables are declared after the <cf/function/ line, but before -the first <cf/{/. You can't declare variables in nested blocks. Functions are -called like in C: <cf>name(); with_parameters(5);</cf>. Function may return -values using the <cf>return <m/[expr]/</cf> command. Returning a value exits -from current function (this is similar to C). +<p>Like in C programming language, variables are declared inside function body, +either at the beginning, or mixed with other statements. Declarations may +contain initialization. You can also declare variables in nested blocks, such +variables have scope restricted to such block. There is a deprecated syntax to +declare variables after the <cf/function/ line, but before the first <cf/{/. +Functions are called like in C: <cf>name(); with_parameters(5);</cf>. Function +may return values using the <cf>return <m/[expr]/</cf> command. Returning a +value exits from current function (this is similar to C). -<p>Filters are defined in a way similar to functions except they can't have +<p>Filters are defined in a way similar to functions except they cannot have explicit parameters. They get a route table entry as an implicit parameter, it is also passed automatically to any functions called. The filter must terminate -with either <cf/accept/ or <cf/reject/ statement. If there's a runtime error in +with either <cf/accept/ or <cf/reject/ statement. If there is a runtime error in filter, the route is rejected. <p>A nice trick to debug filters is to use <cf>show route filter <m/name/</cf> @@ -1667,7 +1719,8 @@ prefix and an ASN as arguments. <sect>Control structures <label id="control-structures"> -<p>Filters support two control structures: conditions and case switches. +<p>Filters support several control structures: conditions, for loops and case +switches. <p>Syntax of a condition is: <cf>if <M>boolean expression</M> then <m/commandT/; else <m/commandF/;</cf> and you can use <cf>{ <m/command1/; <m/command2/; @@ -1675,6 +1728,14 @@ else <m/commandF/;</cf> and you can use <cf>{ <m/command1/; <m/command2/; omitted. If the <cf><m>boolean expression</m></cf> is true, <m/commandT/ is executed, otherwise <m/commandF/ is executed. +<p>For loops allow to iterate over elements in compound data like BGP paths or +community lists. The syntax is: <cf>for [ <m/type/ ] <m/variable/ in <m/expr/ +do <m/command/;</cf> and you can also use compound command like in conditions. +The expression is evaluated to a compound data, then for each element from such +data the command is executed with the item assigned to the variable. A variable +may be an existing one (when just name is used) or a locally defined (when type +and name is used). In both cases, it must have the same type as elements. + <p>The <cf>case</cf> is similar to case from Pascal. Syntax is <cf>case <m/expr/ { else: | <m/num_or_prefix [ .. num_or_prefix]/: <m/statement/ ; [ ... ] }</cf>. The expression after <cf>case</cf> can be of any type which can be @@ -1687,16 +1748,21 @@ neither of the <cf/:/ clauses, the statements after <cf/else:/ are executed. <p>Here is example that uses <cf/if/ and <cf/case/ structures: <code> +if 1234 = i then printn "."; else { + print "not 1234"; + print "You need {} around multiple commands"; +} + +for int asn in bgp_path do { + printn "ASN: ", asn; + if asn < 65536 then print " (2B)"; else print " (4B)"; +} + case arg1 { 2: print "two"; print "I can do more commands without {}"; 3 .. 5: print "three to five"; else: print "something else"; } - -if 1234 = i then printn "."; else { - print "not 1234"; - print "You need {} around multiple commands"; -} </code> @@ -2338,6 +2404,7 @@ avoid routing loops. <item> <rfc id="8203"> - BGP Administrative Shutdown Communication <item> <rfc id="8212"> - Default EBGP Route Propagation Behavior without Policies <item> <rfc id="9117"> - Revised Validation Procedure for BGP Flow Specifications +<item> <rfc id="9234"> - Route Leak Prevention and Detection Using Roles </itemize> <sect1>Route selection rules @@ -2778,6 +2845,29 @@ using the following configuration parameters: protocol itself (for example, if a route is received through eBGP and therefore does not have such attribute). Default: 100 (0 in pre-1.2.0 versions of BIRD). + + <tag><label id="bgp-local-role">local role <m/role-name/</tag> + BGP roles are a mechanism for route leak prevention and automatic route + filtering based on common BGP topology relationships. They are defined + in <rfc id="9234">. Instead of manually configuring filters and + communities, automatic filtering is done with the help of the OTC + attribute - a flag for routes that should be sent only to customers. + The same attribute is also used to automatically detect and filter route + leaks created by third parties. + + This option is valid for EBGP sessions, but it is not recommended to be + used within AS confederations (which would require manual filtering of + <cf/bgp_otc/ attribute on confederation boundaries). + + Possible <cf><m/role-name/</cf> values are: <cf/provider/, + <cf/rs_server/, <cf/rs_client/, <cf/customer/ and <cf/peer/. + Default: No local role assigned. + + <tag><label id="bgp-require-roles">require roles <m/switch/</tag> + If this option is set, the BGP roles must be defined on both sides, + otherwise the session will not be established. This behavior is defined + in <rfc id="9234"> as "strict mode" and is used to enforce corresponding + configuration at your conterpart side. Default: disabled. </descrip> <sect1>Channel configuration @@ -3085,6 +3175,11 @@ some of them (marked with `<tt/O/') are optional. This attribute contains accumulated IGP metric, which is a total distance to the destination through multiple autonomous systems. Currently, the attribute is not accessible from filters. + + <tag><label id="bgp-otc">int bgp_otc [O]</tag> + This attribute is defined in <rfc id="9234">. OTC is a flag that marks + routes that should be sent only to customers. If <ref id="bgp-role" + name="local Role"> is configured it set automatically. </descrip> <sect1>Example diff --git a/filter/config.Y b/filter/config.Y index 8cecf936..5ba4f7e6 100644 --- a/filter/config.Y +++ b/filter/config.Y @@ -32,6 +32,36 @@ static inline u32 pair_b(u32 p) { return p & 0xFFFF; } f_new_inst(FI_EA_SET, f_new_inst(fi_code, f_new_inst(FI_DEFAULT, f_new_inst(FI_EA_GET, da), f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = def })), arg), da) +static int +f_new_var(struct sym_scope *s) +{ + /* + * - A variable is an offset on vstack from vbase. + * - Vbase is set on filter start / function call. + * - Scopes contain anonymous scopes (blocks) inside filter/function scope + * - Each scope knows number of vars in that scope + * - Offset is therefore a sum of 'slots' up to named scope + * - New variables are added on top of vstk, so intermediate values cannot + * be there during FI_VAR_INIT. I.e. no 'var' inside 'term'. + * - Also, each f_line must always have its scope, otherwise a variable may + * be defined but not initialized if relevant f_line is not executed. + */ + + int offset = s->slots++; + + while (!s->name) + { + s = s->next; + ASSERT(s); + offset += s->slots; + } + + if (offset >= 0xff) + cf_error("Too many variables, at most 255 allowed"); + + return offset; +} + /* * Sets and their items are during parsing handled as lists, linked * through left ptr. The first item in a list also contains a pointer @@ -276,7 +306,7 @@ assert_assign(struct f_lval *lval, struct f_inst *expr, const char *start, const checker = f_new_inst(FI_EQ, expr, getter); setter->next = checker; - + return assert_done(setter, start, end); } @@ -287,6 +317,7 @@ CF_KEYWORDS(FUNCTION, PRINT, PRINTN, UNSET, RETURN, INT, BOOL, IP, TYPE, PREFIX, RD, PAIR, QUAD, EC, LC, SET, STRING, BGPMASK, BGPPATH, CLIST, ECLIST, LCLIST, IF, THEN, ELSE, CASE, + FOR, IN, DO, TRUE, FALSE, RT, RO, UNKNOWN, GENERIC, FROM, GW, NET, MASK, PROTO, SCOPE, DEST, IFNAME, IFINDEX, WEIGHT, GW_MPLS, ROA_CHECK, ASN, SRC, DST, @@ -305,21 +336,23 @@ CF_KEYWORDS(FUNCTION, PRINT, PRINTN, UNSET, RETURN, %nonassoc ELSE %type <xp> cmds_int cmd_prep -%type <x> term block cmd cmds constant constructor print_list var_list function_call symbol_value bgp_path_expr bgp_path bgp_path_tail +%type <x> term cmd cmd_var cmds cmds_scoped constant constructor print_list var var_init var_list function_call symbol_value bgp_path_expr bgp_path bgp_path_tail %type <fsa> static_attr %type <fab> attr_bit %type <f> filter where_filter %type <fl> filter_body function_body %type <flv> lvalue -%type <i> type function_args function_vars +%type <i> type function_vars +%type <fa> function_argsn function_args %type <ecs> ec_kind -%type <fret> break_command +%type <fret> break_command %type <i32> cnum %type <e> pair_item ec_item lc_item set_item switch_item set_items switch_items switch_body %type <trie> fprefix_set %type <v> set_atom switch_atom fipa %type <px> fprefix %type <t> get_cf_position +%type <s> for_var CF_GRAMMAR @@ -343,7 +376,7 @@ filter_def: conf: filter_eval ; filter_eval: - EVAL term { f_eval_int(f_linearize($2)); } + EVAL term { f_eval_int(f_linearize($2, 1)); } ; conf: custom_attr ; @@ -425,25 +458,28 @@ type: ; function_argsn: - /* EMPTY */ + /* EMPTY */ { $$ = NULL; } | function_argsn type symbol ';' { if ($3->scope->slots >= 0xfe) cf_error("Too many declarations, at most 255 allowed"); - cf_define_symbol($3, SYM_VARIABLE | $2, offset, $3->scope->slots++); + $$ = cfg_alloc(sizeof(struct f_arg)); + $$->arg = cf_define_symbol($3, SYM_VARIABLE | $2, offset, sym_->scope->slots++); + $$->next = $1; } ; function_args: - '(' ')' { $$ = 0; } + '(' ')' { $$ = NULL; } | '(' function_argsn type symbol ')' { - cf_define_symbol($4, SYM_VARIABLE | $3, offset, $4->scope->slots++); - $$ = $4->scope->slots; + $$ = cfg_alloc(sizeof(struct f_arg)); + $$->arg = cf_define_symbol($4, SYM_VARIABLE | $3, offset, sym_->scope->slots++); + $$->next = $2; } ; function_vars: /* EMPTY */ { $$ = 0; } | function_vars type symbol ';' { - cf_define_symbol($3, SYM_VARIABLE | $2, offset, $3->scope->slots++); + cf_define_symbol($3, SYM_VARIABLE | $2, offset, f_new_var(sym_->scope)); $$ = $1 + 1; } ; @@ -471,20 +507,35 @@ where_filter: function_body: function_vars '{' cmds '}' { - $$ = f_linearize($3); + $$ = f_linearize($3, 0); $$->vars = $1; } ; conf: function_def ; function_def: - FUNCTION symbol { DBG( "Beginning of function %s\n", $2->name ); + FUNCTION symbol { + DBG( "Beginning of function %s\n", $2->name ); $2 = cf_define_symbol($2, SYM_FUNCTION, function, NULL); cf_push_scope($2); - } function_args function_body { - DBG("Definition of function %s with %u args and %u local vars.\n", $2->name, $4, $5->vars); - $5->args = $4; - $2->function = $5; + } function_args { + /* Make dummy f_line for storing function prototype */ + struct f_line *dummy = cfg_allocz(sizeof(struct f_line)); + $2->function = dummy; + + /* Revert the args */ + while ($4) { + struct f_arg *tmp = $4; + $4 = $4->next; + + tmp->next = dummy->arg_list; + dummy->arg_list = tmp; + dummy->args++; + } + } function_body { + $6->args = $2->function->args; + $6->arg_list = $2->function->arg_list; + $2->function = $6; cf_pop_scope(); } ; @@ -495,7 +546,11 @@ cmds: /* EMPTY */ { $$ = NULL; } | cmds_int { $$ = $1.begin; } ; -cmd_prep: cmd { +cmds_scoped: { cf_push_soft_scope(); } cmds { cf_pop_soft_scope(); $$ = $2; } ; + +cmd_var: var | cmd ; + +cmd_prep: cmd_var { $$.begin = $$.end = $1; if ($1) while ($$.end->next) @@ -517,15 +572,6 @@ cmds_int: cmd_prep } ; -block: - cmd { - $$=$1; - } - | '{' cmds '}' { - $$=$2; - } - ; - /* * Complex types, their bison value is struct f_val */ @@ -549,7 +595,7 @@ set_atom: | VPN_RD { $$.type = T_RD; $$.val.ec = $1; } | ENUM { $$.type = pair_a($1); $$.val.i = pair_b($1); } | '(' term ')' { - if (f_eval(f_linearize($2), &($$)) > F_RETURN) cf_error("Runtime error"); + if (f_eval(f_linearize($2, 1), &($$)) > F_RETURN) cf_error("Runtime error"); if (!f_valid_set_type($$.type)) cf_error("Set-incompatible type"); } | symbol_known { @@ -561,13 +607,13 @@ set_atom: switch_atom: NUM { $$.type = T_INT; $$.val.i = $1; } - | '(' term ')' { $$.type = T_INT; $$.val.i = f_eval_int(f_linearize($2)); } + | '(' term ')' { $$.type = T_INT; $$.val.i = f_eval_int(f_linearize($2, 1)); } | fipa { $$ = $1; } | ENUM { $$.type = pair_a($1); $$.val.i = pair_b($1); } ; cnum: - term { $$ = f_eval_int(f_linearize($1)); } + term { $$ = f_eval_int(f_linearize($1, 1)); } pair_item: '(' cnum ',' cnum ')' { $$ = f_new_pair_item($2, $2, $4, $4); } @@ -651,19 +697,19 @@ fprefix_set: ; switch_body: /* EMPTY */ { $$ = NULL; } - | switch_body switch_items ':' cmds { + | switch_body switch_items ':' cmds_scoped { /* Fill data fields */ struct f_tree *t; - struct f_line *line = f_linearize($4); + struct f_line *line = f_linearize($4, 0); for (t = $2; t; t = t->left) t->data = line; $$ = f_merge_items($1, $2); } - | switch_body ELSECOL cmds { + | switch_body ELSECOL cmds_scoped { struct f_tree *t = f_new_tree(); t->from.type = t->to.type = T_VOID; t->right = t; - t->data = f_linearize($3); + t->data = f_linearize($3, 0); $$ = f_merge_items($1, t); } ; @@ -680,6 +726,7 @@ bgp_path: bgp_path_tail: NUM bgp_path_tail { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_PATH_MASK_ITEM, .val.pmi = { .asn = $1, .kind = PM_ASN, }, }); $$->next = $2; } | NUM DDOT NUM bgp_path_tail { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_PATH_MASK_ITEM, .val.pmi = { .from = $1, .to = $3, .kind = PM_ASN_RANGE }, }); $$->next = $4; } + | '[' ']' bgp_path_tail { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_PATH_MASK_ITEM, .val.pmi = { .set = NULL, .kind = PM_ASN_SET }, }); $$->next = $3; } | '[' set_items ']' bgp_path_tail { if ($2->from.type != T_INT) cf_error("Only integer sets allowed in path mask"); $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_PATH_MASK_ITEM, .val.pmi = { .set = build_tree($2), .kind = PM_ASN_SET }, }); $$->next = $4; @@ -699,6 +746,7 @@ constant: | fipa { $$ = f_new_inst(FI_CONSTANT, $1); } | VPN_RD { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_RD, .val.ec = $1, }); } | net_ { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_NET, .val.net = $1, }); } + | '[' ']' { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_SET, .val.t = NULL, }); } | '[' set_items ']' { DBG( "We've got a set here..." ); $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_SET, .val.t = build_tree($2), }); @@ -722,27 +770,22 @@ var_list: /* EMPTY */ { $$ = NULL; } | var_list ',' term { $$ = $3; $$->next = $1; } function_call: - symbol_known '(' var_list ')' { + symbol_known '(' var_list ')' + { if ($1->class != SYM_FUNCTION) cf_error("You can't call something which is not a function. Really."); - struct f_inst *fc = f_new_inst(FI_CALL, $1); - uint args = 0; + /* Revert the var_list */ + struct f_inst *args = NULL; while ($3) { - args++; - struct f_inst *tmp = $3->next; - $3->next = fc; + struct f_inst *tmp = $3; + $3 = $3->next; - fc = $3; - $3 = tmp; + tmp->next = args; + args = tmp; } - if (args != $1->function->args) - cf_error("Function call '%s' got %u arguments, need %u arguments.", - $1->name, args, $1->function->args); - - $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_VOID }); - $$->next = fc; + $$ = f_new_inst(FI_CALL, args, $1); } ; @@ -866,13 +909,44 @@ print_list: /* EMPTY */ { $$ = NULL; } } ; +var_init: + /* empty */ { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { }); } + | '=' term { $$ = $2; } + ; + +var: + type symbol var_init ';' { + struct symbol *sym = cf_define_symbol($2, SYM_VARIABLE | $1, offset, f_new_var(sym_->scope)); + $$ = f_new_inst(FI_VAR_INIT, $3, sym); + } + +for_var: + type symbol { $$ = cf_define_symbol($2, SYM_VARIABLE | $1, offset, f_new_var(sym_->scope)); } + | CF_SYM_KNOWN { $$ = $1; cf_assert_symbol($1, SYM_VARIABLE); } + ; + cmd: - IF term THEN block { + '{' cmds_scoped '}' { + $$ = $2; + } + | IF term THEN cmd { $$ = f_new_inst(FI_CONDITION, $2, $4, NULL); } - | IF term THEN block ELSE block { + | IF term THEN cmd ELSE cmd { $$ = f_new_inst(FI_CONDITION, $2, $4, $6); } + | FOR { + /* Reserve space for walk data on stack */ + cf_push_scope(NULL); + conf_this_scope->slots += 2; + } for_var IN + /* Parse term in the parent scope */ + { conf_this_scope->active = 0; } term { conf_this_scope->active = 1; } + DO cmd { + cf_pop_scope(); + $$ = f_new_inst(FI_FOR_INIT, $6, $3); + $$->next = f_new_inst(FI_FOR_NEXT, $3, $9); + } | symbol_known '=' term ';' { switch ($1->class) { case SYM_VARIABLE_RANGE: @@ -929,7 +1003,7 @@ cmd: | PRINTN print_list ';' { $$ = f_new_inst(FI_PRINT, $2); } - | function_call ';' { $$ = f_new_inst(FI_DROP_RESULT, $1); } + | function_call ';' { $$ = f_new_inst(FI_DROP_RESULT, $1); } | CASE term '{' switch_body '}' { $$ = f_new_inst(FI_SWITCH, $2, build_tree($4)); } diff --git a/filter/data.c b/filter/data.c index 9dab1915..d26b07f5 100644 --- a/filter/data.c +++ b/filter/data.c @@ -71,6 +71,7 @@ btype f_type_element_type(btype t) { switch(t) { + case T_PATH: return T_INT; case T_CLIST: return T_PAIR; case T_ECLIST: return T_EC; case T_LCLIST: return T_LC; @@ -78,6 +79,8 @@ f_type_element_type(btype t) }; } +const struct f_trie f_const_empty_trie = { .ipv4 = -1, }; + const struct f_val f_const_empty_path = { .type = T_PATH, .val.ad = &null_adata, @@ -90,6 +93,9 @@ const struct f_val f_const_empty_path = { }, f_const_empty_lclist = { .type = T_LCLIST, .val.ad = &null_adata, +}, f_const_empty_prefix_set = { + .type = T_PREFIX_SET, + .val.ti = &f_const_empty_trie, }; static void @@ -178,7 +184,7 @@ val_compare(const struct f_val *v1, const struct f_val *v2) if (val_is_ip4(v1) && (v2->type == T_QUAD)) return uint_cmp(ipa_to_u32(v1->val.ip), v2->val.i); - debug( "Types do not match in val_compare\n" ); + DBG( "Types do not match in val_compare\n" ); return F_CMP_ERROR; } @@ -292,6 +298,12 @@ val_same(const struct f_val *v1, const struct f_val *v2) int clist_set_type(const struct f_tree *set, struct f_val *v) { + if (!set) + { + v->type = T_VOID; + return 1; + } + switch (set->from.type) { case T_PAIR: @@ -528,6 +540,9 @@ val_in_range(const struct f_val *v1, const struct f_val *v2) if (v2->type != T_SET) return F_CMP_ERROR; + if (!v2->val.t) + return 0; + /* With integrated Quad<->IP implicit conversion */ if ((v1->type == v2->val.t->from.type) || ((v1->type == T_QUAD) && val_is_ip4(&(v2->val.t->from)) && val_is_ip4(&(v2->val.t->to)))) diff --git a/filter/data.h b/filter/data.h index 23db4a85..c1e7c736 100644 --- a/filter/data.h +++ b/filter/data.h @@ -209,9 +209,11 @@ int val_in_range(const struct f_val *v1, const struct f_val *v2); int clist_set_type(const struct f_tree *set, struct f_val *v); static inline int eclist_set_type(const struct f_tree *set) -{ return set->from.type == T_EC; } +{ return !set || set->from.type == T_EC; } static inline int lclist_set_type(const struct f_tree *set) -{ return set->from.type == T_LC; } +{ return !set || set->from.type == T_LC; } +static inline int path_set_type(const struct f_tree *set) +{ return !set || set->from.type == T_INT; } const struct adata *clist_filter(struct linpool *pool, const struct adata *list, const struct f_val *set, int pos); const struct adata *eclist_filter(struct linpool *pool, const struct adata *list, const struct f_val *set, int pos); @@ -227,7 +229,7 @@ undef_value(struct f_val v) (v.val.ad == &null_adata); } -extern const struct f_val f_const_empty_path, f_const_empty_clist, f_const_empty_eclist, f_const_empty_lclist; +extern const struct f_val f_const_empty_path, f_const_empty_clist, f_const_empty_eclist, f_const_empty_lclist, f_const_empty_prefix_set; static inline const struct f_val *f_get_empty(btype t) { switch (t) { diff --git a/filter/decl.m4 b/filter/decl.m4 index c59cd7f3..e2472127 100644 --- a/filter/decl.m4 +++ b/filter/decl.m4 @@ -191,6 +191,12 @@ if (f$1->type && f$2->type && (f$1->type != f$2->type) && cf_error("Arguments $1 and $2 of %s must be of the same type", f_instruction_name(what->fi_code)); FID_INTERPRET_BODY()') +m4_define(ARG_PREFER_SAME_TYPE, ` +FID_NEW_BODY()m4_dnl +if (f$1->type && f$2->type && (f$1->type != f$2->type)) + (void) (f_const_promotion(f$2, f$1->type) || f_const_promotion(f$1, f$2->type)); +FID_INTERPRET_BODY()') + # Executing another filter line. This replaces the recursion # that was needed in the former implementation. m4_define(LINEX, `FID_INTERPRET_EXEC()LINEX_($1)FID_INTERPRET_NEW()return $1 FID_INTERPRET_BODY()') @@ -216,7 +222,7 @@ whati->f$1 = f$1; FID_DUMP_BODY()m4_dnl f_dump_line(item->fl$1, indent + 1); FID_LINEARIZE_BODY()m4_dnl -item->fl$1 = f_linearize(whati->f$1); +item->fl$1 = f_linearize(whati->f$1, $2); FID_SAME_BODY()m4_dnl if (!f_same(f1->fl$1, f2->fl$1)) return 0; FID_ITERATE_BODY()m4_dnl @@ -244,9 +250,13 @@ m4_define(ERROR, # This macro specifies result type and makes there are no conflicting definitions m4_define(RESULT_TYPE, `m4_ifdef([[INST_RESULT_TYPE]], - [[m4_ifelse(INST_RESULT_TYPE,$1,,[[ERROR([[Multiple type definitons]])]])]], + [[m4_ifelse(INST_RESULT_TYPE,$1,,[[ERROR([[Multiple type definitions in]] INST_NAME)]])]], [[m4_define(INST_RESULT_TYPE,$1) RESULT_TYPE_($1)]])') +m4_define(RESULT_TYPE_CHECK, + `m4_ifelse(INST_OUTVAL,0,, + [[m4_ifdef([[INST_RESULT_TYPE]],,[[ERROR([[Missing type definition in]] INST_NAME)]])]])') + m4_define(RESULT_TYPE_, ` FID_NEW_BODY()m4_dnl what->type = $1; @@ -300,6 +310,7 @@ m4_define(FID_ITERATE, `FID_ZONE(10, Iteration)') # This macro does all the code wrapping. See inline comments. m4_define(INST_FLUSH, `m4_ifdef([[INST_NAME]], [[ +RESULT_TYPE_CHECK()m4_dnl Check for defined RESULT_TYPE() FID_ENUM()m4_dnl Contents of enum fi_code { ... } INST_NAME(), FID_ENUM_STR()m4_dnl Contents of const char * indexed by enum fi_code @@ -375,6 +386,7 @@ case INST_NAME(): { #undef whati #undef item dest->items[pos].fi_code = what->fi_code; + dest->items[pos].flags = what->flags; dest->items[pos].lineno = what->lineno; break; } @@ -402,6 +414,7 @@ m4_define(INST, `m4_dnl This macro is called on beginning of each instruction INST_FLUSH()m4_dnl First, old data is flushed m4_define([[INST_NAME]], [[$1]])m4_dnl Then we store instruction name, m4_define([[INST_INVAL]], [[$2]])m4_dnl instruction input value count, +m4_define([[INST_OUTVAL]], [[$3]])m4_dnl instruction output value count, m4_undefine([[INST_NEVER_CONSTANT]])m4_dnl reset NEVER_CONSTANT trigger, m4_undefine([[INST_RESULT_TYPE]])m4_dnl and reset RESULT_TYPE value. FID_INTERPRET_BODY()m4_dnl By default, every code is interpreter code. @@ -505,6 +518,11 @@ f_const_promotion(struct f_inst *arg, btype want) return 1; } + else if ((c->type == T_SET) && (!c->val.t) && (want == T_PREFIX_SET)) { + *c = f_const_empty_prefix_set; + return 1; + } + return 0; } @@ -560,7 +578,7 @@ FID_WR_PUT(8) } struct f_line * -f_linearize_concat(const struct f_inst * const inst[], uint count) +f_linearize_concat(const struct f_inst * const inst[], uint count, uint results) { uint len = 0; for (uint i=0; i<count; i++) @@ -572,6 +590,8 @@ f_linearize_concat(const struct f_inst * const inst[], uint count) for (uint i=0; i<count; i++) out->len = linearize(out, inst[i], out->len); + out->results = results; + #ifdef LOCAL_DEBUG f_dump_line(out, 0); #endif @@ -640,6 +660,7 @@ FID_WR_PUT(4)m4_dnl struct f_inst { struct f_inst *next; /* Next instruction */ enum f_instruction_code fi_code; /* Instruction code */ + enum f_instruction_flags flags; /* Flags, instruction-specific */ btype type; /* Type of returned value, if known */ int size; /* How many instructions are underneath */ int lineno; /* Line number */ diff --git a/filter/f-inst.c b/filter/f-inst.c index e7b642ab..801eceec 100644 --- a/filter/f-inst.c +++ b/filter/f-inst.c @@ -62,14 +62,14 @@ * m4_dnl INST(FI_NOP, in, out) { enum value, input args, output args * m4_dnl ARG(num, type); argument, its id (in data fields) and type accessible by v1, v2, v3 * m4_dnl ARG_ANY(num); argument with no type check accessible by v1, v2, v3 + * m4_dnl ARG_TYPE(num, type); just declare the type of argument * m4_dnl VARARG; variable-length argument list; accessible by vv(i) and whati->varcount - * m4_dnl LINE(num, unused); this argument has to be converted to its own f_line + * m4_dnl LINE(num, out); this argument has to be converted to its own f_line * m4_dnl SYMBOL; symbol handed from config * m4_dnl STATIC_ATTR; static attribute definition * m4_dnl DYNAMIC_ATTR; dynamic attribute definition * m4_dnl RTC; route table config * m4_dnl ACCESS_RTE; this instruction needs route - * m4_dnl ACCESS_EATTRS; this instruction needs extended attributes * * m4_dnl FID_MEMBER( custom instruction member * m4_dnl C type, for storage in structs @@ -80,10 +80,17 @@ * m4_dnl ) * * m4_dnl RESULT(type, union-field, value); putting this on value stack + * m4_dnl RESULT_(type, union-field, value); like RESULT(), but do not declare the type * m4_dnl RESULT_VAL(value-struct); pass the struct f_val directly + * m4_dnl RESULT_TYPE(type); just declare the type of result value * m4_dnl RESULT_VOID; return undef * m4_dnl } * + * Note that runtime arguments m4_dnl (ARG*, VARARG) must be defined before + * parse-time arguments m4_dnl (LINE, SYMBOL, ...). During linearization, + * first ones move position in f_line by linearizing arguments first, while + * second ones store data to the current position. + * * Also note that the { ... } blocks are not respected by M4 at all. * If you get weird unmatched-brace-pair errors, check what it generated and why. * What is really considered as one instruction is not the { ... } block @@ -91,6 +98,24 @@ * * Other code is just copied into the interpreter part. * + * The filter language uses a simple type system, where values have types + * (constants T_*) and also terms (instructions) are statically typed. Our + * static typing is partial (some terms do not declare types of arguments + * or results), therefore it can detect most but not all type errors and + * therefore we still have runtime type checks. + * + * m4_dnl Types of arguments are declared by macros ARG() and ARG_TYPE(), + * m4_dnl types of results are declared by RESULT() and RESULT_TYPE(). + * m4_dnl Macros ARG_ANY(), RESULT_() and RESULT_VAL() do not declare types + * m4_dnl themselves, but can be combined with ARG_TYPE() / RESULT_TYPE(). + * + * m4_dnl Note that types should be declared only once. If there are + * m4_dnl multiple RESULT() macros in an instruction definition, they must + * m4_dnl use the exact same expression for type, or they should be replaced + * m4_dnl by multiple RESULT_() macros and a common RESULT_TYPE() macro. + * m4_dnl See e.g. FI_EA_GET or FI_MIN instructions. + * + * * If you are satisfied with this, you don't need to read the following * detailed description of what is really done with the instruction definitions. * @@ -211,10 +236,40 @@ * m4_dnl fpool -> the current linpool * m4_dnl NEVER_CONSTANT-> don't generate pre-interpretation code at all * m4_dnl ACCESS_RTE -> check that route is available, also NEVER_CONSTANT - * m4_dnl ACCESS_EATTRS -> pre-cache the eattrs; use only with ACCESS_RTE * * m4_dnl If you are stymied, see FI_CALL or FI_CONSTANT or just search for * m4_dnl the mentioned macros in this file to see what is happening there in wild. + * + * + * A note about soundness of the type system: + * + * A type system is sound when types of expressions are consistent with + * types of values resulting from evaluation of such expressions. Untyped + * expressions are ok, but badly typed expressions are not sound. So is + * the type system of BIRD filtering code sound? There are some points: + * + * All cases of (one) m4_dnl RESULT() macro are obviously ok, as the macro + * both declares a type and returns a value. One have to check instructions + * that use m4_dnl RESULT_TYPE() macro. There are two issues: + * + * FI_AND, FI_OR - second argument is statically checked to be T_BOOL and + * passed as result without dynamic typecheck, declared to be T_BOOL. If + * an untyped non-bool expression is used as a second argument, then + * the mismatched type is returned. + * + * FI_VAR_GET - soundness depends on consistency of declared symbol types + * and stored values. This is maintained when values are stored by + * FI_VAR_SET, but when they are stored by FI_CALL, only static checking is + * used, so when an untyped expression returning mismatched value is used + * as a function argument, then inconsistent value is stored and subsequent + * FI_VAR_GET would be unsound. + * + * Both of these issues are inconsequential, as mismatched values from + * unsound expressions will be caught by dynamic typechecks like mismatched + * values from untyped expressions. + * + * Also note that FI_CALL is the only expression without properly declared + * result type. */ /* Binary operators */ @@ -255,7 +310,7 @@ RESULT_TYPE(T_BOOL); if (v1.val.i) - LINE(2,0); + LINE(2,1); else RESULT_VAL(v1); } @@ -265,7 +320,7 @@ RESULT_TYPE(T_BOOL); if (!v1.val.i) - LINE(2,0); + LINE(2,1); else RESULT_VAL(v1); } @@ -358,7 +413,7 @@ break; case T_SET: - if (vv(i).val.t->from.type != T_INT) + if (!path_set_type(vv(i).val.t)) runtime("Only integer sets allowed in path mask"); pm->item[i] = (struct f_path_mask_item) { @@ -380,12 +435,14 @@ INST(FI_NEQ, 2, 1) { ARG_ANY(1); ARG_ANY(2); + ARG_PREFER_SAME_TYPE(1, 2); RESULT(T_BOOL, i, !val_same(&v1, &v2)); } INST(FI_EQ, 2, 1) { ARG_ANY(1); ARG_ANY(2); + ARG_PREFER_SAME_TYPE(1, 2); RESULT(T_BOOL, i, val_same(&v1, &v2)); } @@ -456,6 +513,18 @@ RESULT(T_BOOL, i, ipa_is_ip4(v1.val.ip)); } + INST(FI_VAR_INIT, 1, 0) { + NEVER_CONSTANT; + ARG_ANY(1); + SYMBOL; + ARG_TYPE(1, sym->class & 0xff); + + /* New variable is always the last on stack */ + uint pos = curline.vbase + sym->offset; + fstk->vstk[pos] = v1; + fstk->vcnt = pos + 1; + } + /* Set to indirect value prepared in v1 */ INST(FI_VAR_SET, 1, 0) { NEVER_CONSTANT; @@ -486,12 +555,100 @@ RESULT_VAL(val); } + INST(FI_FOR_INIT, 1, 0) { + NEVER_CONSTANT; + ARG_ANY(1); + SYMBOL; + + FID_NEW_BODY() + ASSERT((sym->class & ~0xff) == SYM_VARIABLE); + + /* Static type check */ + if (f1->type) + { + enum btype t_var = (sym->class & 0xff); + enum btype t_arg = f_type_element_type(f1->type); + if (!t_arg) + cf_error("Value of expression in FOR must be iterable, got %s", + f_type_name(f1->type)); + if (t_var != t_arg) + cf_error("Loop variable '%s' in FOR must be %s, is %s", + sym->name, f_type_name(t_arg), f_type_name(t_var)); + } + + FID_INTERPRET_BODY() + + /* Dynamic type check */ + if ((sym->class & 0xff) != f_type_element_type(v1.type)) + runtime("Mismatched argument and variable type"); + + /* Setup the index */ + v2 = (struct f_val) { .type = T_INT, .val.i = 0 }; + + /* Keep v1 and v2 on the stack */ + fstk->vcnt += 2; + } + + INST(FI_FOR_NEXT, 2, 0) { + NEVER_CONSTANT; + SYMBOL; + + /* Type checks are done in FI_FOR_INIT */ + + /* Loop variable */ + struct f_val *var = &fstk->vstk[curline.vbase + sym->offset]; + int step = 0; + + switch(v1.type) + { + case T_PATH: + var->type = T_INT; + step = as_path_walk(v1.val.ad, &v2.val.i, &var->val.i); + break; + + case T_CLIST: + var->type = T_PAIR; + step = int_set_walk(v1.val.ad, &v2.val.i, &var->val.i); + break; + + case T_ECLIST: + var->type = T_EC; + step = ec_set_walk(v1.val.ad, &v2.val.i, &var->val.ec); + break; + + case T_LCLIST: + var->type = T_LC; + step = lc_set_walk(v1.val.ad, &v2.val.i, &var->val.lc); + break; + + default: + runtime( "Clist or lclist expected" ); + } + + if (step) + { + /* Keep v1 and v2 on the stack */ + fstk->vcnt += 2; + + /* Repeat this instruction */ + curline.pos--; + + /* Execute the loop body */ + LINE(1, 0); + + /* Space for loop variable, may be unused */ + fstk->vcnt += 1; + } + else + var->type = T_VOID; + } + INST(FI_CONDITION, 1, 0) { ARG(1, T_BOOL); if (v1.val.i) LINE(2,0); else - LINE(3,1); + LINE(3,0); } INST(FI_PRINT, 0, 0) { @@ -528,15 +685,14 @@ { STATIC_ATTR; ACCESS_RTE; - ACCESS_EATTRS; switch (sa.sa_code) { case SA_NET: RESULT(sa.type, net, fs->rte->net); break; - case SA_PROTO: RESULT(sa.type, s, fs->rte->src->proto->name); break; + case SA_PROTO: RESULT(sa.type, s, fs->rte->src->owner->name); break; default: { - struct eattr *nhea = ea_find(*fs->eattrs, &ea_gen_nexthop); + struct eattr *nhea = ea_find(fs->rte->attrs, &ea_gen_nexthop); struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; struct nexthop *nh = nhad ? &nhad->nh : NULL; @@ -572,7 +728,6 @@ INST(FI_RTA_SET, 1, 0) { ACCESS_RTE; - ACCESS_EATTRS; ARG_ANY(1); STATIC_ATTR; ARG_TYPE(1, sa.type); @@ -606,13 +761,14 @@ } case SA_GW: { - struct eattr *nh_ea = ea_find(*fs->eattrs, &ea_gen_nexthop); + struct eattr *nh_ea = ea_find(fs->rte->attrs, &ea_gen_nexthop); ip_addr ip = v1.val.ip; struct iface *ifa = (ipa_is_link_local(ip) && nh_ea) ? ((struct nexthop_adata *) nh_ea->u.ptr)->nh.iface : NULL; - neighbor *n = neigh_find(fs->rte->src->proto, ip, ifa, 0); + /* XXX this code supposes that every owner is a protocol XXX */ + neighbor *n = neigh_find(SKIP_BACK(struct proto, sources, fs->rte->src->owner), ip, ifa, 0); if (!n || (n->scope == SCOPE_HOST)) runtime( "Invalid gw address" ); @@ -640,7 +796,7 @@ if (v1.val.i >= 0x100000) runtime( "Invalid MPLS label" ); - struct eattr *nh_ea = ea_find(*fs->eattrs, &ea_gen_nexthop); + struct eattr *nh_ea = ea_find(fs->rte->attrs, &ea_gen_nexthop); if (!nh_ea) runtime( "No nexthop to add a MPLS label to" ); @@ -663,7 +819,7 @@ if (i < 1 || i > 256) runtime( "Setting weight value out of bounds" ); - struct eattr *nh_ea = ea_find(*fs->eattrs, &ea_gen_nexthop); + struct eattr *nh_ea = ea_find(fs->rte->attrs, &ea_gen_nexthop); if (!nh_ea) runtime( "No nexthop to set weight on" ); @@ -677,7 +833,7 @@ NEXTHOP_WALK(nh, nhax) nh->weight = i - 1; - a = ea_set_attr(fs->eattrs, + a = ea_set_attr(&fs->rte->attrs, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhax->ad)); } break; @@ -687,7 +843,7 @@ } if (!a) - a = ea_set_attr(fs->eattrs, + a = ea_set_attr(&fs->rte->attrs, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, tmp_copy_adata(&nha.ad))); a->originated = 1; @@ -698,11 +854,10 @@ INST(FI_EA_GET, 0, 1) { /* Access to extended attributes */ DYNAMIC_ATTR; ACCESS_RTE; - ACCESS_EATTRS; RESULT_TYPE(da->type); { const struct f_val *empty; - const eattr *e = ea_find(*fs->eattrs, da->id); + const eattr *e = ea_find(fs->rte->attrs, da->id); if (e) { @@ -728,7 +883,6 @@ INST(FI_EA_SET, 1, 0) { ACCESS_RTE; - ACCESS_EATTRS; ARG_ANY(1); DYNAMIC_ATTR; ARG_TYPE(1, da->type); @@ -745,12 +899,12 @@ break; case T_IP: - a = ea_set_attr(fs->eattrs, + a = ea_set_attr(&fs->rte->attrs, EA_LITERAL_STORE_ADATA(da, 0, &v1.val.ip, sizeof(ip_addr))); break; default: - a = ea_set_attr(fs->eattrs, + a = ea_set_attr(&fs->rte->attrs, EA_LITERAL_GENERIC(da->id, da->type, 0, .u = v1.val.bval)); break; } @@ -763,14 +917,14 @@ INST(FI_EA_UNSET, 0, 0) { DYNAMIC_ATTR; ACCESS_RTE; - ACCESS_EATTRS; - ea_unset_attr(fs->eattrs, 1, da); + ea_unset_attr(&fs->rte->attrs, 1, da); } INST(FI_DEFAULT, 2, 1) { ARG_ANY(1); ARG_ANY(2); + RESULT_TYPE(f_type_element_type(v2.type)); log(L_INFO "Type of arg 1 is: %d", v1.type); @@ -945,7 +1099,7 @@ RESULT(T_INT, i, v1.val.lc.ldp2); } - INST(FI_MIN, 1, 1) { /* Get minimum element from set */ + INST(FI_MIN, 1, 1) { /* Get minimum element from list */ ARG_ANY(1); RESULT_TYPE(f_type_element_type(v1.type)); switch(v1.type) @@ -979,7 +1133,7 @@ } } - INST(FI_MAX, 1, 1) { /* Get maximum element from set */ + INST(FI_MAX, 1, 1) { /* Get maximum element from list */ ARG_ANY(1); RESULT_TYPE(f_type_element_type(v1.type)); switch(v1.type) @@ -1013,7 +1167,7 @@ } } - INST(FI_RETURN, 1, 1) { + INST(FI_RETURN, 1, 0) { NEVER_CONSTANT; /* Acquire the return value */ ARG_ANY(1); @@ -1041,28 +1195,59 @@ INST(FI_CALL, 0, 1) { NEVER_CONSTANT; + VARARG; SYMBOL; + /* Fake result type declaration */ + RESULT_TYPE(T_VOID); + + FID_NEW_BODY() + ASSERT(sym->class == SYM_FUNCTION); + + if (whati->varcount != sym->function->args) + cf_error("Function '%s' expects %u arguments, got %u arguments", + sym->name, sym->function->args, whati->varcount); + + /* Typecheck individual arguments */ + struct f_inst *a = fvar; + struct f_arg *b = sym->function->arg_list; + for (uint i = 1; a && b; a = a->next, b = b->next, i++) + { + enum btype b_type = b->arg->class & 0xff; + + if (a->type && (a->type != b_type) && !f_const_promotion(a, b_type)) + cf_error("Argument %u of '%s' must be %s, got %s", + i, sym->name, f_type_name(b_type), f_type_name(a->type)); + } + ASSERT(!a && !b); + + /* Add implicit void slot for the return value */ + struct f_inst *tmp = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_VOID }); + tmp->next = whati->fvar; + whati->fvar = tmp; + what->size += tmp->size; + + /* Mark recursive calls, they have dummy f_line */ + if (!sym->function->len) + what->flags |= FIF_RECURSIVE; + FID_SAME_BODY() - if (!(f1->sym->flags & SYM_FLAG_SAME)) - return 0; + if (!(f1->sym->flags & SYM_FLAG_SAME) && !(f1_->flags & FIF_RECURSIVE)) + return 0; FID_ITERATE_BODY() + if (!(what->flags & FIF_RECURSIVE)) BUFFER_PUSH(fit->lines) = whati->sym->function; FID_INTERPRET_BODY() /* Push the body on stack */ LINEX(sym->function); + curline.vbase = curline.ventry; curline.emask |= FE_RETURN; - /* Before this instruction was called, there was the T_VOID - * automatic return value pushed on value stack and also - * sym->function->args function arguments. Setting the - * vbase to point to first argument. */ - ASSERT(curline.ventry >= sym->function->args); - curline.ventry -= sym->function->args; - curline.vbase = curline.ventry; + /* Arguments on stack */ + fstk->vcnt += sym->function->args; /* Storage for local variables */ f_vcnt_check_overflow(sym->function->vars); @@ -1176,17 +1361,10 @@ if (v1.type == T_PATH) { - const struct f_tree *set = NULL; - u32 key = 0; - - if (v2.type == T_INT) - key = v2.val.i; - else if ((v2.type == T_SET) && (v2.val.t->from.type == T_INT)) - set = v2.val.t; + if ((v2.type == T_SET) && path_set_type(v2.val.t) || (v2.type == T_INT)) + RESULT_(T_PATH, ad, [[ as_path_filter(fpool, v1.val.ad, &v2, 0) ]]); else runtime("Can't delete non-integer (set)"); - - RESULT_(T_PATH, ad, [[ as_path_filter(fpool, v1.val.ad, set, key, 0) ]]); } else if (v1.type == T_CLIST) @@ -1238,10 +1416,8 @@ if (v1.type == T_PATH) { - u32 key = 0; - - if ((v2.type == T_SET) && (v2.val.t->from.type == T_INT)) - RESULT_(T_PATH, ad, [[ as_path_filter(fpool, v1.val.ad, v2.val.t, key, 1) ]]); + if ((v2.type == T_SET) && path_set_type(v2.val.t)) + RESULT_(T_PATH, ad, [[ as_path_filter(fpool, v1.val.ad, &v2, 1) ]]); else runtime("Can't filter integer"); } @@ -1284,7 +1460,7 @@ ARG(1, T_NET); ARG(2, T_INT); RTC(3); - struct rtable *table = rtc->table; + rtable *table = rtc->table; u32 as = v2.val.i; @@ -1301,7 +1477,7 @@ } - INST(FI_FORMAT, 1, 0) { /* Format */ + INST(FI_FORMAT, 1, 1) { /* Format */ ARG_ANY(1); RESULT(T_STRING, s, val_format_str(fpool, &v1)); } diff --git a/filter/f-inst.h b/filter/f-inst.h index 047a66c9..fbc59de7 100644 --- a/filter/f-inst.h +++ b/filter/f-inst.h @@ -22,7 +22,7 @@ /* Flags for instructions */ enum f_instruction_flags { - FIF_PRINTED = 1, /* FI_PRINT_AND_DIE: message put in buffer */ + FIF_RECURSIVE = 1, /* FI_CALL: function is directly recursive */ } PACKED; /* Include generated filter instruction declarations */ @@ -35,19 +35,26 @@ const char *f_instruction_name_(enum f_instruction_code fi); static inline const char *f_instruction_name(enum f_instruction_code fi) { return f_instruction_name_(fi) + 3; } +struct f_arg { + struct symbol *arg; + struct f_arg *next; +}; + /* Filter structures for execution */ /* Line of instructions to be unconditionally executed one after another */ struct f_line { uint len; /* Line length */ u8 args; /* Function: Args required */ u8 vars; + u8 results; /* Results left on stack: cmd -> 0, term -> 1 */ + struct f_arg *arg_list; struct f_line_item items[0]; /* The items themselves */ }; /* Convert the f_inst infix tree to the f_line structures */ -struct f_line *f_linearize_concat(const struct f_inst * const inst[], uint count); -static inline struct f_line *f_linearize(const struct f_inst *root) -{ return f_linearize_concat(&root, 1); } +struct f_line *f_linearize_concat(const struct f_inst * const inst[], uint count, uint results); +static inline struct f_line *f_linearize(const struct f_inst *root, uint results) +{ return f_linearize_concat(&root, 1, results); } void f_dump_line(const struct f_line *, uint indent); diff --git a/filter/f-util.c b/filter/f-util.c index fb93ee80..82a06bdd 100644 --- a/filter/f-util.c +++ b/filter/f-util.c @@ -37,6 +37,6 @@ struct filter *f_new_where(struct f_inst *where) f_new_inst(FI_DIE, F_REJECT)); struct filter *f = cfg_allocz(sizeof(struct filter)); - f->root = f_linearize(cond); + f->root = f_linearize(cond, 0); return f; } diff --git a/filter/filter.c b/filter/filter.c index ad2fafe2..0aff4d30 100644 --- a/filter/filter.c +++ b/filter/filter.c @@ -76,9 +76,6 @@ struct filter_state { /* The route we are processing. This may be NULL to indicate no route available. */ struct rte *rte; - /* Cached pointer to ea_list */ - struct ea_list **eattrs; - /* Buffer for log output */ struct buffer buf; @@ -94,11 +91,6 @@ void (*bt_assert_hook)(int result, const struct f_line_item *assert); #define f_stack_init(fs) ( _f_stack_init(fs, v, 128), _f_stack_init(fs, e, 128) ) -static inline void f_cache_eattrs(struct filter_state *fs) -{ - fs->eattrs = &(fs->rte->attrs); -} - static struct tbf rl_runtime_err = TBF_DEFAULT_LOG_LIMITS; /** @@ -164,8 +156,6 @@ interpret(struct filter_state *fs, const struct f_line *line, struct f_val *val) #define falloc(size) tmp_alloc(size) #define fpool tmp_linpool -#define ACCESS_EATTRS do { if (!fs->eattrs) f_cache_eattrs(fs); } while (0) - #include "filter/inst-interpret.c" #undef res #undef v1 @@ -174,13 +164,11 @@ interpret(struct filter_state *fs, const struct f_line *line, struct f_val *val) #undef runtime #undef falloc #undef fpool -#undef ACCESS_EATTRS } } /* End of current line. Drop local variables before exiting. */ - fstk->vcnt -= curline.line->vars; - fstk->vcnt -= curline.line->args; + fstk->vcnt = curline.ventry + curline.line->results; fstk->ecnt--; } diff --git a/filter/filter_test.c b/filter/filter_test.c index 63764964..671dba94 100644 --- a/filter/filter_test.c +++ b/filter/filter_test.c @@ -70,6 +70,7 @@ int main(int argc, char *argv[]) { bt_init(argc, argv); + bt_bird_init(); bt_assert_hook = bt_assert_filter; @@ -78,11 +79,11 @@ main(int argc, char *argv[]) if (!bt_config_file_parse(BT_CONFIG_FILE)) abort(); - bt_test_suite(t_reconfig, "Testing reconfiguration"); + bt_test_suite_extra(t_reconfig, 0, BT_TIMEOUT, "Testing reconfiguration"); struct f_bt_test_suite *t; WALK_LIST(t, config->tests) - bt_test_suite_base(run_function, t->fn_name, t, BT_FORKING, BT_TIMEOUT, "%s", t->dsc); + bt_test_suite_base(run_function, t->fn_name, t, 0, BT_TIMEOUT, "%s", t->dsc); bt_bird_cleanup(); return bt_exit_value(); diff --git a/filter/test.conf b/filter/test.conf index cd2fa8db..22f5dea3 100644 --- a/filter/test.conf +++ b/filter/test.conf @@ -146,9 +146,8 @@ bt_test_same(onef, twof, 0); */ function t_bool() -bool b; { - b = true; + bool b = true; bt_assert(b); bt_assert(!!b); @@ -184,12 +183,11 @@ define xyzzy = (120+10); define '1a-a1' = (xyzzy-100); function t_int() -int i; { bt_assert(xyzzy = 130); bt_assert('1a-a1' = 30); - i = four; + int i = four; i = 12*100 + 60/2 + i; i = (i + 0); bt_assert(i = 1234); @@ -238,14 +236,19 @@ define is2 = [(17+2), 17, 15, 11, 8, 5, 3, 2]; define is3 = [5, 17, 2, 11, 8, 15, 3, 19]; function t_int_set() -int set is; { + int set is = []; + bt_assert(is = []); + bt_assert(0 !~ is); + bt_assert(1 ~ [1,2,3]); bt_assert(5 ~ [1..20]); bt_assert(2 ~ [ 1, 2, 3 ]); bt_assert(5 ~ [ 4 .. 7 ]); bt_assert(1 !~ [ 2, 3, 4 ]); bt_assert(999 !~ [ 666, 333 ]); + bt_assert(1 !~ []); + bt_assert(1 !~ is); is = [ 2, 3, 4, 7..11 ]; bt_assert(10 ~ is); @@ -280,6 +283,7 @@ int set is; bt_assert([1,4..10,20] = [1,4..10,20]); bt_assert(format([ 1, 2, 1, 1, 1, 3, 4, 1, 1, 1, 5 ]) = "[1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5]"); + bt_assert(format([]) = "[]"); } bt_test_suite(t_int_set, "Testing sets of integers"); @@ -293,9 +297,8 @@ bt_test_suite(t_int_set, "Testing sets of integers"); */ function t_string() -string st; { - st = "Hello"; + string st = "Hello"; bt_assert(format(st) = "Hello"); bt_assert(st ~ "Hell*"); bt_assert(st ~ "?ello"); @@ -320,9 +323,8 @@ function 'mkpair-a'(int a) } function t_pair() -pair pp; { - pp = (1, 2); + pair pp = (1, 2); bt_assert(format(pp) = "(1,2)"); bt_assert((1,2) = pp); bt_assert((1,1+1) = pp); @@ -343,10 +345,11 @@ bt_test_suite(t_pair, "Testing pairs"); */ function t_pair_set() -pair pp; -pair set ps; { - pp = (1, 2); + pair pp = (1, 2); + pair set ps = []; + bt_assert(pp !~ ps); + ps = [(1,(one+one)), (3,4)..(4,8), (5,*), (6,3..6)]; bt_assert(format(ps) = "[(1,2), (3,4)..(4,8), (5,0)..(5,65535), (6,3)..(6,6)]"); bt_assert(pp ~ ps); @@ -363,6 +366,7 @@ pair set ps; bt_assert((6,6+one) !~ ps); bt_assert(((one+6),2) !~ ps); bt_assert((1,1) !~ ps); + bt_assert(pp !~ []); ps = [(20..150, 200..300), (50100..50200, 1000..50000), (*, 5+5)]; bt_assert((100,200) ~ ps); @@ -414,6 +418,7 @@ quad qq; qq = 1.2.3.4; bt_assert(qq ~ [1.2.3.4, 5.6.7.8]); bt_assert(qq !~ [1.2.1.1, 1.2.3.5]); + bt_assert(qq !~ []); } bt_test_suite(t_quad_set, "Testing sets of quads"); @@ -494,6 +499,7 @@ ip set ips; bt_assert(1.2.3.4 !~ [ 1.2.3.3, 1.2.3.5 ]); bt_assert(1.2.3.4 ~ [ 1.2.3.3..1.2.3.5 ]); + bt_assert(1.2.3.4 !~ []); } bt_test_suite(t_ip_set, "Testing sets of ip address"); @@ -582,13 +588,34 @@ function test_pxset(prefix set pxs) bt_assert(1.0.0.0/8 ~ [ 1.0.0.0/8+ ]); bt_assert(1.0.0.0/9 !~ [ 1.0.0.0/8- ]); bt_assert(1.2.0.0/17 !~ [ 1.0.0.0/8{ 15 , 16 } ]); + bt_assert(net10 !~ []); bt_assert([ 10.0.0.0/8{ 15 , 17 } ] = [ 10.0.0.0/8{ 15 , 17 } ]); } +function test_empty_pxset(prefix set pxs) +int set s0; +prefix set s1; +{ + s0 = []; + s1 = []; + bt_assert(pxs != s0); + bt_assert(pxs = s1); + bt_assert(pxs = []); +} + function t_prefix_set() prefix set pxs; { + pxs = []; + bt_assert(format(pxs) = "[]"); + bt_assert(pxs = []); + bt_assert(1.2.0.0/16 !~ []); + bt_assert(1.2.0.0/16 !~ pxs); + + test_empty_pxset([]); + test_empty_pxset(pxs); + pxs = [ 1.2.0.0/16, 1.4.0.0/16+, 44.66.88.64/30{24,28}, 12.34.56.0/24{8,16} ]; bt_assert(format(pxs) = "[1.2.0.0/16{0.1.0.0}, 1.4.0.0/16{0.1.255.255}, 12.34.0.0/16{1.255.0.0}, 44.66.88.64/28{0.0.1.240}]"); @@ -673,6 +700,12 @@ bt_test_suite(t_prefix6, "Testing prefix IPv6"); function t_prefix6_set() prefix set pxs; { + pxs = []; + bt_assert(format(pxs) = "[]"); + bt_assert(pxs = []); + bt_assert(12::34/128 !~ []); + bt_assert(12::34/128 !~ pxs); + bt_assert(1180::/16 ~ [ 1100::/8{15, 17} ]); bt_assert(12::34 = 12::34); bt_assert(12::34 ~ [ 12::33..12::35 ]); @@ -790,6 +823,7 @@ int set set12; bt_assert(3 ~ p2); bt_assert(p2 ~ [2, 10..20]); bt_assert(p2 ~ [4, 10..20]); + bt_assert(p2 !~ []); p2 = prepend(p2, 5); bt_assert(p2 !~ pm1); @@ -800,6 +834,8 @@ int set set12; bt_assert(p2 ~ [= 5 [2, 4, 6] 3 [1..2] 1 =]); bt_assert(p2 ~ [= 5 set35 3 set12 set12 =]); bt_assert(p2 ~ mkpath(5, 4)); + bt_assert(p2 ~ [= * [3] * =]); + bt_assert(p2 !~ [= * [] * =]); bt_assert(p2.len = 5); bt_assert(p2.first = 5); @@ -808,6 +844,10 @@ int set set12; bt_assert(p2.len = 5); bt_assert(delete(p2, 3) = prepend(prepend(prepend(prepend(+empty+, 1), 2), 4), 5)); bt_assert(filter(p2, [1..3]) = prepend(prepend(prepend(+empty+, 1), 2), 3)); + bt_assert(delete(p2, []) = p2); + bt_assert(filter(p2, []) = +empty+); + bt_assert(delete(prepend(prepend(+empty+, 0), 1), []) = prepend(prepend(+empty+, 0), 1)); + bt_assert(filter(prepend(prepend(+empty+, 0), 1), []) = +empty+); p2 = prepend( + empty +, 5 ); p2 = prepend( p2, 4 ); @@ -827,6 +867,15 @@ int set set12; bt_assert(delete(p2, [4..5]) = prepend(prepend(prepend(prepend(+empty+, 3), 3), 2), 1)); bt_assert(format([= 1 2+ 3 =]) = "[= 1 2 + 3 =]"); + + # iteration over path + int x = 0; + int y = 0; + for int i in p2 do { + x = x + i; + y = y + x; + } + bt_assert(x = 18 && y = 50); } bt_test_suite(t_path, "Testing paths"); @@ -868,6 +917,7 @@ clist r; bt_assert(l ~ [(2,2..3)]); bt_assert(l ~ [(1,1..2)]); bt_assert(l ~ [(1,1)..(1,2)]); + bt_assert(l !~ []); l = add(l, (2,5)); l = add(l, (5,one)); @@ -905,6 +955,9 @@ clist r; bt_assert(l !~ [(*,(one+6))]); bt_assert(l !~ [(*, (one+one+one))]); + bt_assert(delete(l, []) = l); + bt_assert(filter(l, []) = -empty-); + l = delete(l, [(*,(one+onef(3)))]); l = delete(l, [(*,(4+one))]); bt_assert(l = add(-empty-, (3,1))); @@ -949,6 +1002,12 @@ clist r; bt_assert(format(r) = "(clist (2,1) (1,3) (2,2) (3,1) (2,3))"); bt_assert(r.min = (1,3)); bt_assert(r.max = (3,1)); + + # iteration over clist + int x = 0; + for pair c in r do + x = x + c.asn * c.asn * c.data; + bt_assert(x = 36); } bt_test_suite(t_clist, "Testing lists of communities"); @@ -1022,11 +1081,15 @@ eclist r; bt_assert((ro, 10.20.30.40, 100) !~ el); bt_assert(el !~ [(rt, 10, 35..40)]); bt_assert(el !~ [(ro, 10, *)]); + bt_assert(el !~ []); el = add(el, (rt, 10, 40)); el2 = filter(el, [(rt, 10, 20..40)] ); el2 = add(el2, (rt, 10, 50)); + bt_assert(delete(el, []) = el); + bt_assert(filter(el, []) = --empty--); + # eclist A (1,30,40) bt_assert(el = add(add(add(--empty--, (rt, 10, 1)), (rt, 10, 30)), (rt, 10, 40))); bt_assert(format(el) = "(eclist (rt, 10, 1) (rt, 10, 30) (rt, 10, 40))"); @@ -1060,6 +1123,13 @@ eclist r; bt_assert(format(r) = "(eclist (rt, 2, 1) (rt, 1, 3) (rt, 2, 2) (rt, 3, 1) (rt, 2, 3))"); bt_assert(r.min = (rt, 1, 3)); bt_assert(r.max = (rt, 3, 1)); + + # iteration over eclist + int x = 0; + for ec c in r do + if c > (rt, 2, 0) && c < (rt, 3, 0) then + x = x + 1; + bt_assert(x = 3); } bt_test_suite(t_eclist, "Testing lists of extended communities"); @@ -1144,6 +1214,9 @@ lclist r; ll2 = add(ll2, (30, 30, 30)); ll2 = add(ll2, (40, 40, 40)); + bt_assert(delete(ll, []) = ll); + bt_assert(filter(ll, []) = ---empty---); + # lclist A (10, 20, 30) bt_assert(format(ll) = "(lclist (10, 10, 10) (20, 20, 20) (30, 30, 30))"); @@ -1175,6 +1248,19 @@ lclist r; bt_assert(format(r) = "(lclist (2, 3, 3) (1, 2, 3) (2, 3, 1) (3, 1, 2) (2, 1, 3))"); bt_assert(r.min = (1, 2, 3)); bt_assert(r.max = (3, 1, 2)); + + # iteration over lclist + int x = 0; + int y = 0; + lc mx = (0, 0, 0); + for lc c in r do { + int asn2 = c.asn * c.asn; + x = x + asn2 * c.data1; + y = y + asn2 * c.data2; + if c > mx then mx = c; + } + bt_assert(x = 39 && y = 49); + bt_assert(mx = r.max); } bt_test_suite(t_lclist, "Testing lists of large communities"); @@ -1203,6 +1289,7 @@ lc set lls; bt_assert(ll !~ [(5,10,15), (10,21,30)]); bt_assert(ll !~ [(10,21..25,*)]); bt_assert(ll !~ [(11, *, *)]); + bt_assert(ll !~ []); lls = [(10, 10, 10), (20, 20, 15..25), (30, 30, *), (40, 35..45, *), (50, *, *), (55..65, *, *)]; bt_assert(format(lls) = "[(10, 10, 10), (20, 20, 15)..(20, 20, 25), (30, 30, 0)..(30, 30, 4294967295), (40, 35, 0)..(40, 45, 4294967295), (50, 0, 0)..(50, 4294967295, 4294967295), (55, 0, 0)..(65, 4294967295, 4294967295)]"); @@ -1259,6 +1346,10 @@ bt_test_suite(t_rd, "Testing route distinguishers"); function t_rd_set() rd set rds; { + rds = []; + bt_assert(rds = []); + bt_assert(10:20 !~ rds); + rds = [10:20, 100000:100..100000:200]; bt_assert(format(rds) = "[10:20, 100000:100..100000:200]"); @@ -1269,6 +1360,7 @@ rd set rds; bt_assert(100000:128 ~ rds); bt_assert(100000:200 ~ rds); bt_assert(100010:150 !~ rds); + bt_assert(100010:150 !~ []); } bt_test_suite(t_rd_set, "Testing sets of route distinguishers"); @@ -1335,7 +1427,85 @@ function fifteen() return 15; } +function local_vars(int j) +{ + int k = 10; + bt_assert(j = 5 && k = 10); + { + int j = 15; + k = 20; + bt_assert(j = 15 && k = 20); + } + bt_assert(j = 5 && k = 20); + + if j < 10 then + { + int j = 25; + string k = "hello"; + bt_assert(j = 25 && k = "hello"); + } + bt_assert(j = 5 && k = 20); + + int m = 100; + { + j = 35; + int k = 40; + bt_assert(j = 35 && k = 40 && m = 100); + } + bt_assert(j = 35 && k = 20 && m = 100); +} + +function factorial(int x) +{ + if x = 0 then return 0; + if x = 1 then return 1; + else return x * factorial(x - 1); +} + +function fibonacci(int x) +{ + if x = 0 then return 0; + if x = 1 then return 1; + else return fibonacci(x - 1) + fibonacci(x - 2); +} + +function hanoi_init(int a; int b) +{ + if b = 0 + then return +empty+; + else return prepend(hanoi_init(a + 1, b - 1), a); +} + +function hanoi_solve(int n; bgppath h_src; bgppath h_dst; bgppath h_aux; bool x; bool y) +{ + # x -> return src or dst + # y -> print state + + if n = 0 then { if x then return h_src; else return h_dst; } + + bgppath tmp1 = hanoi_solve(n - 1, h_src, h_aux, h_dst, true, y); + bgppath tmp2 = hanoi_solve(n - 1, h_src, h_aux, h_dst, false, false); + h_src = tmp1; + h_aux = tmp2; + + int v = h_src.first; + # bt_assert(h_dst = +empty+ || v < h_dst.first); + h_src = delete(h_src, v); + h_dst = prepend(h_dst, v); + + if y then + print "move: ", v, " src: ", h_src, " dst:", h_dst, " aux:", h_aux; + + tmp1 = hanoi_solve(n - 1, h_aux, h_dst, h_src, true, y); + tmp2 = hanoi_solve(n - 1, h_aux, h_dst, h_src, false, false); + h_aux = tmp1; + h_dst = tmp2; + + if x then return h_src; else return h_dst; +} + function t_call_function() +bgppath h_src; { bt_assert(fifteen() = 15); @@ -1347,6 +1517,17 @@ function t_call_function() bt_assert(callme(4, 4) = 16); bt_assert(callme(7, 2) = 14); bt_assert(callmeagain(1, 2, 3) = 6); + local_vars(5); + + bt_assert(factorial(5) = 120); + bt_assert(factorial(10) = 3628800); + + bt_assert(fibonacci(10) = 55); + bt_assert(fibonacci(20) = 6765); + + h_src = hanoi_init(1, 6); + bt_assert(format(h_src) = "(path 1 2 3 4 5 6)"); + bt_assert(hanoi_solve(6, h_src, +empty+, +empty+, false, false) = h_src); } bt_test_suite(t_call_function, "Testing calling functions"); @@ -1592,13 +1773,16 @@ filter vpn_filter bt_assert(net.type != NET_IP6); bt_assert(net.rd = 0:1:2); + bool b = false; case (net.type) { NET_IP4: print "IPV4"; NET_IP6: print "IPV6"; + else: b = true; } + bt_assert(b); bt_check_assign(from, 10.20.30.40); - bt_check_assign(gw, 55.55.55.44); + # bt_check_assign(gw, 55.55.55.44); bgp_community.add((3,5)); bgp_ext_community.add((ro, 135, 999)); diff --git a/filter/tree_test.c b/filter/tree_test.c index 05702f81..d180efbc 100644 --- a/filter/tree_test.c +++ b/filter/tree_test.c @@ -170,6 +170,8 @@ t_balancing(void) show_tree(balanced_tree_from_simple); bt_assert(same_tree(balanced_tree_from_simple, expected_balanced_tree)); + + tmp_flush(); } return 1; @@ -191,6 +193,9 @@ t_balancing_random(void) uint i; for(i = 0; i < 10; i++) { + struct lp_state lps; + lp_save(tmp_linpool, &lps); + struct f_tree *random_degenerated_tree = get_random_degenerated_left_tree(nodes_count); show_tree(random_degenerated_tree); @@ -200,7 +205,11 @@ t_balancing_random(void) show_tree(balanced_tree_from_random); bt_assert(same_tree(balanced_tree_from_random, expected_balanced_tree)); + + lp_restore(tmp_linpool, &lps); } + + tmp_flush(); } return 1; @@ -227,6 +236,8 @@ t_find(void) const struct f_tree *found_tree = find_tree(tree, &looking_up_value); bt_assert((val_compare(&looking_up_value, &(found_tree->from)) == 0) && (val_compare(&looking_up_value, &(found_tree->to)) == 0)); } + + tmp_flush(); } return 1; @@ -283,6 +294,8 @@ t_find_ranges(void) ((val_compare(&needle, &(found_tree->from)) == 1) && (val_compare(&needle, &(found_tree->to)) == -1)) ); } + + tmp_flush(); } return 1; diff --git a/filter/trie_test.c b/filter/trie_test.c index dc791280..ddce2daa 100644 --- a/filter/trie_test.c +++ b/filter/trie_test.c @@ -251,12 +251,12 @@ get_outer_net(net_addr *net, const struct f_prefix *src) static list * make_random_prefix_list(int num, int v6, int tight) { - list *prefixes = lp_allocz(tmp_linpool, sizeof(struct f_prefix_node)); + list *prefixes = tmp_allocz(sizeof(struct f_prefix_node)); init_list(prefixes); for (int i = 0; i < num; i++) { - struct f_prefix_node *px = lp_allocz(tmp_linpool, sizeof(struct f_prefix_node)); + struct f_prefix_node *px = tmp_allocz(sizeof(struct f_prefix_node)); get_random_prefix(&px->prefix, v6, tight); add_tail(prefixes, &px->n); @@ -294,7 +294,7 @@ read_prefix_list(FILE *f, int v6, int plus) char s[32]; int n; - list *pxlist = lp_allocz(tmp_linpool, sizeof(struct f_prefix_node)); + list *pxlist = tmp_allocz(sizeof(struct f_prefix_node)); init_list(pxlist); errno = 0; @@ -308,7 +308,7 @@ read_prefix_list(FILE *f, int v6, int plus) if (n != 5) bt_abort_msg("Invalid content of trie_data"); - struct f_prefix_node *px = lp_allocz(tmp_linpool, sizeof(struct f_prefix_node)); + struct f_prefix_node *px = tmp_allocz(sizeof(struct f_prefix_node)); net_fill_ip4(&px->prefix.net, ip4_build(a0, a1, a2, a3), pl); px->prefix.lo = pl; px->prefix.hi = plus ? IP4_MAX_PREFIX_LENGTH : pl; diff --git a/lib/Makefile b/lib/Makefile index 15f757d9..f4ade9a6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -1,4 +1,4 @@ -src := a-path.c a-set.c bitmap.c bitops.c blake2s.c blake2b.c checksum.c event.c flowspec.c idm.c ip.c lists.c mac.c md5.c mempool.c net.c patmatch.c printf.c resource.c sha1.c sha256.c sha512.c slab.c slists.c strtoul.c tbf.c timer.c xmalloc.c +src := a-path.c a-set.c bitmap.c bitops.c blake2s.c blake2b.c checksum.c event.c flowspec.c idm.c ip.c lists.c mac.c md5.c mempool.c net.c patmatch.c printf.c rcu.c resource.c sha1.c sha256.c sha512.c slab.c slists.c strtoul.c tbf.c timer.c xmalloc.c obj := $(src-o-files) $(all-daemon) diff --git a/lib/a-path.c b/lib/a-path.c index 0eca8475..a7a22e40 100644 --- a/lib/a-path.c +++ b/lib/a-path.c @@ -602,8 +602,10 @@ as_path_match_set(const struct adata *path, const struct f_tree *set) } const struct adata * -as_path_filter(struct linpool *pool, const struct adata *path, const struct f_tree *set, u32 key, int pos) +as_path_filter(struct linpool *pool, const struct adata *path, const struct f_val *set, int pos) { + ASSERT((set->type == T_SET) || (set->type == T_INT)); + if (!path) return NULL; @@ -629,13 +631,13 @@ as_path_filter(struct linpool *pool, const struct adata *path, const struct f_tr u32 as = get_as(p); int match; - if (set) + if (set->type == T_SET) { struct f_val v = { .type = T_INT, .val.i = as}; - match = !!find_tree(set, &v); + match = !!find_tree(set->val.t, &v); } - else - match = (as == key); + else /* T_INT */ + match = (as == set->val.i); if (match == pos) { @@ -667,6 +669,35 @@ as_path_filter(struct linpool *pool, const struct adata *path, const struct f_tr return res; } +int +as_path_walk(const struct adata *path, uint *pos, uint *val) +{ + if (!path) + return 0; + + const u8 *p = path->data; + const u8 *q = p + path->length; + uint n, x = *pos; + + while (p < q) + { + n = p[1]; + p += 2; + + if (x < n) + { + *val = get_as(p + x * BS); + *pos += 1; + return 1; + } + + p += n * BS; + x -= n; + } + + return 0; +} + struct pm_pos { diff --git a/lib/a-path_test.c b/lib/a-path_test.c index 38f77642..c6f8ce8b 100644 --- a/lib/a-path_test.c +++ b/lib/a-path_test.c @@ -12,6 +12,7 @@ #include "nest/rt.h" #include "lib/attrs.h" #include "lib/resource.h" +#include "filter/data.h" #define TESTS_NUM 30 #define AS_PATH_LENGTH 1000 @@ -127,8 +128,9 @@ t_path_include(void) int counts_of_contains = count_asn_in_array(as_nums, as_nums[i]); bt_assert_msg(as_path_contains(as_path, as_nums[i], counts_of_contains), "AS Path should contains %d-times number %d", counts_of_contains, as_nums[i]); - bt_assert(as_path_filter(tmp_linpool, as_path, NULL, as_nums[i], 0) != NULL); - bt_assert(as_path_filter(tmp_linpool, as_path, NULL, as_nums[i], 1) != NULL); + struct f_val v = { .type = T_INT, .val.i = as_nums[i] }; + bt_assert(as_path_filter(tmp_linpool, as_path, &v, 0) != NULL); + bt_assert(as_path_filter(tmp_linpool, as_path, &v, 1) != NULL); } for (i = 0; i < 10000; i++) diff --git a/lib/a-set.c b/lib/a-set.c index 8ede9b83..dcb86058 100644 --- a/lib/a-set.c +++ b/lib/a-set.c @@ -693,3 +693,51 @@ lc_set_max(const struct adata *list, lcomm *val) *val = (lcomm) { res[0], res[1], res[2] }; return 1; } + +int +int_set_walk(const struct adata *list, uint *pos, uint *val) +{ + if (!list) + return 0; + + if (*pos >= (uint) int_set_get_size(list)) + return 0; + + u32 *res = int_set_get_data(list) + *pos; + *val = *res; + *pos += 1; + + return 1; +} + +int +ec_set_walk(const struct adata *list, uint *pos, u64 *val) +{ + if (!list) + return 0; + + if (*pos >= (uint) int_set_get_size(list)) + return 0; + + u32 *res = int_set_get_data(list) + *pos; + *val = ec_generic(res[0], res[1]); + *pos += 2; + + return 1; +} + +int +lc_set_walk(const struct adata *list, uint *pos, lcomm *val) +{ + if (!list) + return 0; + + if (*pos >= (uint) int_set_get_size(list)) + return 0; + + u32 *res = int_set_get_data(list) + *pos; + *val = (lcomm) { res[0], res[1], res[2] }; + *pos += 3; + + return 1; +} diff --git a/lib/a-set_test.c b/lib/a-set_test.c index 3280031f..693b8f08 100644 --- a/lib/a-set_test.c +++ b/lib/a-set_test.c @@ -221,6 +221,7 @@ t_set_ec_delete(void) return 1; } + int main(int argc, char *argv[]) { diff --git a/lib/attrs.h b/lib/attrs.h index a75abcd3..af2f1036 100644 --- a/lib/attrs.h +++ b/lib/attrs.h @@ -60,6 +60,7 @@ static inline int adata_same(const struct adata *a, const struct adata *b) * to 16bit slot (like in 16bit AS_PATH). See RFC 4893 for details */ +struct f_val; struct f_tree; int as_path_valid(byte *data, uint len, int bs, int sets, int confed, char *err, uint elen); @@ -81,7 +82,8 @@ int as_path_get_last(const struct adata *path, u32 *last_as); u32 as_path_get_last_nonaggregated(const struct adata *path); int as_path_contains(const struct adata *path, u32 as, int min); int as_path_match_set(const struct adata *path, const struct f_tree *set); -const struct adata *as_path_filter(struct linpool *pool, const struct adata *path, const struct f_tree *set, u32 key, int pos); +const struct adata *as_path_filter(struct linpool *pool, const struct adata *path, const struct f_val *set, int pos); +int as_path_walk(const struct adata *path, uint *pos, uint *val); static inline struct adata *as_path_prepend(struct linpool *pool, const struct adata *path, u32 as) { return as_path_prepend2(pool, path, AS_PATH_SEQUENCE, as); } @@ -256,6 +258,9 @@ int lc_set_min(const struct adata *list, lcomm *val); int int_set_max(const struct adata *list, u32 *val); int ec_set_max(const struct adata *list, u64 *val); int lc_set_max(const struct adata *list, lcomm *val); +int int_set_walk(const struct adata *list, uint *pos, u32 *val); +int ec_set_walk(const struct adata *list, uint *pos, u64 *val); +int lc_set_walk(const struct adata *list, uint *pos, lcomm *val); void ec_set_sort_x(struct adata *set); /* Sort in place */ diff --git a/lib/birdlib.h b/lib/birdlib.h index 9b6e4a16..d743ecdf 100644 --- a/lib/birdlib.h +++ b/lib/birdlib.h @@ -14,8 +14,9 @@ /* Ugly structure offset handling macros */ +#define SAME_TYPE(a, b) ({ int _ = ((a) != (b)); !_; }) #define OFFSETOF(s, i) ((size_t) &((s *)0)->i) -#define SKIP_BACK(s, i, p) ((s *)((char *)p - OFFSETOF(s, i))) +#define SKIP_BACK(s, i, p) ({ s *_ptr = ((s *)((char *)p - OFFSETOF(s, i))); SAME_TYPE(&_ptr->i, p); _ptr; }) #define BIRD_ALIGN(s, a) (((s)+a-1)&~(a-1)) #define CPU_STRUCT_ALIGN (MAX_(_Alignof(void*), _Alignof(u64))) #define BIRD_CPU_ALIGN(s) BIRD_ALIGN((s), CPU_STRUCT_ALIGN) @@ -86,6 +87,7 @@ static inline int u64_cmp(u64 i1, u64 i2) /* Macros for gcc attributes */ #define NORET __attribute__((noreturn)) +#define USE_RESULT __atribute__((warn_unused_result)) #define UNUSED __attribute__((unused)) #define PACKED __attribute__((packed)) #define NONNULL(...) __attribute__((nonnull((__VA_ARGS__)))) @@ -93,10 +95,6 @@ static inline int u64_cmp(u64 i1, u64 i2) #define STATIC_ASSERT(EXP) _Static_assert(EXP, #EXP) #define STATIC_ASSERT_MSG(EXP,MSG) _Static_assert(EXP, MSG) -#ifndef HAVE_THREAD_LOCAL -#define _Thread_local -#endif - /* Microsecond time */ typedef s64 btime; @@ -178,8 +176,13 @@ void debug(const char *msg, ...); /* Printf to debug output */ #if defined(LOCAL_DEBUG) || defined(GLOBAL_DEBUG) #define DBG(x, y...) debug(x, ##y) +#define DBGL(x, y...) debug(x "\n", ##y) +#elif defined(DEBUG_TO_LOG) +#define DBG(...) do { } while (0) +#define DBGL(...) log(L_DEBUG __VA_ARGS__) #else -#define DBG(x, y...) do { } while(0) +#define DBG(...) do { } while(0) +#define DBGL(...) do { } while (0) #endif #define ASSERT_DIE(x) do { if (!(x)) bug("Assertion '%s' failed at %s:%d", #x, __FILE__, __LINE__); } while(0) diff --git a/lib/event.c b/lib/event.c index 33dc00b0..68ee4c06 100644 --- a/lib/event.c +++ b/lib/event.c @@ -19,20 +19,152 @@ * events in them and explicitly ask to run them. */ +#undef LOCAL_DEBUG + #include "nest/bird.h" #include "lib/event.h" +#include "lib/io-loop.h" event_list global_event_list; event_list global_work_list; -inline void -ev_postpone(event *e) +//#ifdef DEBUGGING +#if 0 +#define EDL_MAX 16384 +enum edl_caller { + EDL_REMOVE_FROM = 1, + EDL_POSTPONE = 2, + EDL_RUN = 3, + EDL_SEND = 4, + EDL_RUN_LIST = 5, +} caller; +static struct event_debug_log { + event_list *target_list; + event *event; + event *receiver; + uint pos; + uint prev_edl_pos; + uint thread; + enum edl_caller caller; +} edl[EDL_MAX]; +static _Atomic uint edl_cnt; +_Thread_local static uint edl_thread; +_Thread_local static uint prev_edl_pos = ~0; +static inline void edlog(event_list *list, event *e, event *receiver, uint pos, enum edl_caller caller) +{ + uint edl_pos = atomic_fetch_add_explicit(&edl_cnt, 1, memory_order_acq_rel); + if (!edl_thread) + edl_thread = edl_pos; + + edl[edl_pos % EDL_MAX] = (struct event_debug_log) { + .target_list = list, + .event = e, + .receiver = receiver, + .pos = pos, + .prev_edl_pos = prev_edl_pos, + .thread = edl_thread, + .caller = caller, + }; + + prev_edl_pos = edl_pos; +} +#else +#define edlog(...) +#endif + + +void +ev_init_list(event_list *el, struct birdloop *loop, const char *name) { - if (ev_active(e)) + el->name = name; + el->loop = loop; + + atomic_store_explicit(&el->receiver, NULL, memory_order_release); + atomic_store_explicit(&el->_executor, NULL, memory_order_release); +} + +/* + * The event list should work as a message passing point. Sending a message + * must be a fairly fast process with no locks and low waiting times. OTOH, + * processing messages always involves running the assigned code and the + * receiver is always a single one thread with no concurrency at all. There is + * also a postponing requirement to synchronously remove an event from a queue, + * yet we allow this only when the caller has its receiver event loop locked. + * It still means that the event may get postponed from other event in the same + * list, therefore we have to be careful. + */ + +static inline int +ev_remove_from(event *e, event * _Atomic * head) +{ + /* The head pointer stores where cur is pointed to from */ + event * _Atomic *prev = head; + + /* The current event in queue to check */ + event *cur = atomic_load_explicit(prev, memory_order_acquire); + + /* This part of queue is empty! */ + if (!cur) + return 0; + + edlog(NULL, e, cur, 1, EDL_REMOVE_FROM); + while (cur) + { + /* Pre-loaded next pointer */ + event *next = atomic_load_explicit(&cur->next, memory_order_acquire); + + if (e == cur) { - rem_node(&e->n); - e->n.next = NULL; + edlog(NULL, e, next, 3, EDL_REMOVE_FROM); + + /* Check whether we have collided with somebody else + * adding an item to the queue. */ + if (!atomic_compare_exchange_strong_explicit( + prev, &cur, next, + memory_order_acq_rel, memory_order_acquire)) + { + /* This may happen only on list head */ + ASSERT_DIE(prev == head); + + /* Restart. The collision should never happen again. */ + return ev_remove_from(e, head); + } + + /* Successfully removed from the list; inactivate this event. */ + atomic_store_explicit(&cur->next, NULL, memory_order_release); + return 1; } + + edlog(NULL, e, next, 2, EDL_REMOVE_FROM); + + /* Go to the next event. */ + prev = &cur->next; + cur = next; + } + + edlog(NULL, e, cur, 4, EDL_REMOVE_FROM); + + return 0; +} + +inline void +ev_postpone(event *e) +{ + /* Find the list to remove the event from */ + event_list *sl = ev_get_list(e); + edlog(sl, e, NULL, 1, EDL_POSTPONE); + if (!sl) + return; + + /* Postponing allowed only from the target loop */ + ASSERT_DIE(birdloop_inside(sl->loop)); + + /* Remove from one of these lists. */ + ASSERT(ev_remove_from(e, &sl->_executor) || ev_remove_from(e, &sl->receiver)); + + /* Mark as inactive */ + ASSERT_DIE(sl == atomic_exchange_explicit(&e->list, NULL, memory_order_acq_rel)); + edlog(sl, e, NULL, 2, EDL_POSTPONE); } static void @@ -43,7 +175,7 @@ ev_dump(resource *r) debug("(code %p, data %p, %s)\n", e->hook, e->data, - e->n.next ? "scheduled" : "inactive"); + atomic_load_explicit(&e->next, memory_order_relaxed) ? "scheduled" : "inactive"); } static struct resclass ev_class = { @@ -82,8 +214,10 @@ ev_new(pool *p) inline void ev_run(event *e) { + edlog(NULL, e, NULL, 1, EDL_RUN); ev_postpone(e); e->hook(e->data); + edlog(NULL, e, NULL, 2, EDL_RUN); } /** @@ -95,40 +229,39 @@ ev_run(event *e) * list @l which can be run by calling ev_run_list(). */ inline void -ev_enqueue(event_list *l, event *e) +ev_send(event_list *l, event *e) { - ev_postpone(e); - add_tail(l, &e->n); -} + edlog(l, e, NULL, 1, EDL_SEND); + /* Set the target list */ + event_list *ol = NULL; + if (!atomic_compare_exchange_strong_explicit( + &e->list, &ol, l, + memory_order_acq_rel, memory_order_acquire)) + if (ol == l) + return; + else + bug("Queuing an already queued event to another queue is not supported."); -/** - * ev_schedule - schedule an event - * @e: an event - * - * This function schedules an event by enqueueing it to a system-wide - * event list which is run by the platform dependent code whenever - * appropriate. - */ -void -ev_schedule(event *e) -{ - ev_enqueue(&global_event_list, e); -} + /* Here should be no concurrent senders */ + event *next = atomic_load_explicit(&l->receiver, memory_order_acquire); + edlog(l, e, next, 2, EDL_SEND); + event *old_next = NULL; + do + if (!atomic_compare_exchange_strong_explicit( + &e->next, &old_next, next, + memory_order_acq_rel, memory_order_acquire)) + bug("Event %p in inconsistent state"); + else + { + old_next = next; + edlog(l, old_next, next, 3, EDL_SEND); + } + while (!atomic_compare_exchange_strong_explicit( + &l->receiver, &next, e, + memory_order_acq_rel, memory_order_acquire)); -/** - * ev_schedule_work - schedule a work-event. - * @e: an event - * - * This function schedules an event by enqueueing it to a system-wide work-event - * list which is run by the platform dependent code whenever appropriate. This - * is designated for work-events instead of regular events. They are executed - * less often in order to not clog I/O loop. - */ -void -ev_schedule_work(event *e) -{ - if (!ev_active(e)) - add_tail(&global_work_list, &e->n); + edlog(l, e, next, 4, EDL_SEND); + birdloop_ping(l->loop); } void io_log_event(void *hook, void *data); @@ -140,62 +273,66 @@ void io_log_event(void *hook, void *data); * This function calls ev_run() for all events enqueued in the list @l. */ int -ev_run_list(event_list *l) +ev_run_list_limited(event_list *l, uint limit) { - node *n; - list tmp_list; - - init_list(&tmp_list); - add_tail_list(&tmp_list, l); - init_list(l); - WALK_LIST_FIRST(n, tmp_list) - { - event *e = SKIP_BACK(event, n, n); + event * _Atomic *ep = &l->_executor; + edlog(l, NULL, NULL, 1, EDL_RUN_LIST); - /* This is ugly hack, we want to log just events executed from the main I/O loop */ - if ((l == &global_event_list) || (l == &global_work_list)) - io_log_event(e->hook, e->data); + /* No pending events, refill the queue. */ + if (!atomic_load_explicit(ep, memory_order_acquire)) + { + /* Move the current event list aside and create a new one. */ + event *received = atomic_exchange_explicit(&l->receiver, NULL, memory_order_acq_rel); + edlog(l, NULL, received, 2, EDL_RUN_LIST); - ev_run(e); - tmp_flush(); - } + /* No event to run. */ + if (!received) + return 0; - return !EMPTY_LIST(*l); -} + /* Setup the executor queue */ + event *head = NULL; -int -ev_run_list_limited(event_list *l, uint limit) -{ - node *n; - list tmp_list; + /* Flip the order of the events by relinking them one by one (push-pop) */ + while (received) + { + event *cur = received; + received = atomic_exchange_explicit(&cur->next, head, memory_order_acq_rel); + edlog(l, head, received, 3, EDL_RUN_LIST); + head = cur; + } - init_list(&tmp_list); - add_tail_list(&tmp_list, l); - init_list(l); + /* Store the executor queue to its designated place */ + ASSERT_DIE(atomic_exchange_explicit(ep, head, memory_order_acq_rel) == NULL); + edlog(l, NULL, head, 4, EDL_RUN_LIST); + } - WALK_LIST_FIRST(n, tmp_list) + /* Run the events in order. */ + event *e; + while (e = atomic_load_explicit(ep, memory_order_acquire)) { - event *e = SKIP_BACK(event, n, n); - - if (!limit) - break; + edlog(l, e, NULL, 5, EDL_RUN_LIST); + /* Check limit */ + if (!--limit) + return 1; /* This is ugly hack, we want to log just events executed from the main I/O loop */ if ((l == &global_event_list) || (l == &global_work_list)) io_log_event(e->hook, e->data); - ev_run(e); + edlog(l, e, NULL, 6, EDL_RUN_LIST); + /* Inactivate the event */ + event *next = atomic_load_explicit(&e->next, memory_order_relaxed); + ASSERT_DIE(e == atomic_exchange_explicit(ep, next, memory_order_acq_rel)); + ASSERT_DIE(next == atomic_exchange_explicit(&e->next, NULL, memory_order_acq_rel)); + ASSERT_DIE(l == atomic_exchange_explicit(&e->list, NULL, memory_order_acq_rel)); + edlog(l, e, next, 7, EDL_RUN_LIST); + + /* Run the event */ + e->hook(e->data); tmp_flush(); - limit--; - } - if (!EMPTY_LIST(tmp_list)) - { - /* Attach new items after the unprocessed old items */ - add_tail_list(&tmp_list, l); - init_list(l); - add_tail_list(l, &tmp_list); - } + edlog(l, e, next, 8, EDL_RUN_LIST); + } - return !EMPTY_LIST(*l); + return !!atomic_load_explicit(&l->receiver, memory_order_acquire); } diff --git a/lib/event.h b/lib/event.h index 5f3b78d8..0bef737a 100644 --- a/lib/event.h +++ b/lib/event.h @@ -10,33 +10,57 @@ #define _BIRD_EVENT_H_ #include "lib/resource.h" +#include "lib/locking.h" +#include "lib/rcu.h" + +#include <stdatomic.h> + +struct birdloop; typedef struct event { resource r; void (*hook)(void *); void *data; - node n; /* Internal link */ + struct event * _Atomic next; + struct event_list * _Atomic list; } event; -typedef list event_list; +typedef struct event_list { + event * _Atomic receiver; /* Event receive list */ + event * _Atomic _executor; /* Event execute list */ + const char *name; + struct birdloop *loop; /* The executor loop */ +} event_list; extern event_list global_event_list; extern event_list global_work_list; event *ev_new(pool *); void ev_run(event *); -#define ev_init_list(el) init_list(el) +void ev_init_list(event_list *, struct birdloop *loop, const char *name); void ev_enqueue(event_list *, event *); -void ev_schedule(event *); -void ev_schedule_work(event *); +#define ev_send ev_enqueue +#define ev_send_loop(l, e) ev_send(birdloop_event_list((l)), (e)) + +#define ev_schedule(e) ({ ASSERT_THE_BIRD_LOCKED; if (!ev_active((e))) ev_send(&global_event_list, (e)); }) +#define ev_schedule_work(e) ({ ASSERT_THE_BIRD_LOCKED; if (!ev_active((e))) ev_send(&global_work_list, (e)); }) + void ev_postpone(event *); -int ev_run_list(event_list *); int ev_run_list_limited(event_list *, uint); +#define ev_run_list(l) ev_run_list_limited((l), ~0) + +#define LEGACY_EVENT_LIST(l) (((l) == &global_event_list) || ((l) == &global_work_list)) static inline int ev_active(event *e) { - return e->n.next != NULL; + return atomic_load_explicit(&e->list, memory_order_acquire) != NULL; +} + +static inline event_list * +ev_get_list(event *e) +{ + return atomic_load_explicit(&e->list, memory_order_acquire); } static inline event* diff --git a/lib/event_test.c b/lib/event_test.c index 3070327d..612deb25 100644 --- a/lib/event_test.c +++ b/lib/event_test.c @@ -54,7 +54,6 @@ t_ev_run_list(void) int i; olock_init(); - timer_init(); rt_init(); io_init(); if_init(); diff --git a/lib/io-loop.h b/lib/io-loop.h new file mode 100644 index 00000000..ae58bbee --- /dev/null +++ b/lib/io-loop.h @@ -0,0 +1,66 @@ +/* + * BIRD -- I/O and event loop + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_IO_LOOP_H_ +#define _BIRD_IO_LOOP_H_ + +#include "nest/bird.h" +#include "lib/lists.h" +#include "lib/locking.h" +#include "lib/resource.h" +#include "lib/event.h" +#include "lib/socket.h" + +extern struct birdloop main_birdloop; + +void sk_start(sock *s); +void sk_stop(sock *s); +void sk_reloop(sock *s, struct birdloop *loop); + +/* Start a new birdloop owned by given pool and domain */ +struct birdloop *birdloop_new(pool *p, uint order, const char *name); + +/* Stop the loop. At the end, the @stopped callback is called unlocked in tail + * position to finish cleanup. Run birdloop_free() from that callback to free + * the loop itself. */ +void birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data); +void birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data); +void birdloop_free(struct birdloop *loop); + +/* Get birdloop's event list */ +event_list *birdloop_event_list(struct birdloop *loop); + +/* Get birdloop's time heap */ +struct timeloop *birdloop_time_loop(struct birdloop *loop); + +/* Enter and exit the birdloop */ +void birdloop_enter(struct birdloop *loop); +void birdloop_leave(struct birdloop *loop); + +_Bool birdloop_inside(struct birdloop *loop); + +void birdloop_mask_wakeups(struct birdloop *loop); +void birdloop_unmask_wakeups(struct birdloop *loop); + +void birdloop_link(struct birdloop *loop); +void birdloop_unlink(struct birdloop *loop); + +void birdloop_ping(struct birdloop *loop); + +struct birdloop_flag_handler { + void (*hook)(struct birdloop_flag_handler *, u32 flags); + void *data; +}; + +void birdloop_flag(struct birdloop *loop, u32 flag); +void birdloop_flag_set_handler(struct birdloop *, struct birdloop_flag_handler *); + +void birdloop_init(void); + +/* Yield for a little while. Use only in special cases. */ +void birdloop_yield(void); + +#endif /* _BIRD_IO_LOOP_H_ */ diff --git a/lib/lists.c b/lib/lists.c index 200576cf..8f95c7c2 100644 --- a/lib/lists.c +++ b/lib/lists.c @@ -26,7 +26,7 @@ #define _BIRD_LISTS_C_ -#include "nest/bird.h" +#include "lib/birdlib.h" #include "lib/lists.h" LIST_INLINE int @@ -35,11 +35,12 @@ check_list(list *l, node *n) if (!l) { ASSERT_DIE(n); - ASSERT_DIE(n->prev); - do { n = n->prev; } while (n->prev); + node *nn = n; + while (nn->prev) + nn = nn->prev; - l = SKIP_BACK(list, head_node, n); + l = SKIP_BACK(list, head_node, nn); } int seen = 0; @@ -60,7 +61,7 @@ check_list(list *l, node *n) } ASSERT_DIE(cur == &(l->tail_node)); - ASSERT_DIE(!n || (seen == 1)); + ASSERT_DIE(!n || (seen == 1) || (n == &l->head_node) || (n == &l->tail_node)); return 1; } @@ -120,7 +121,7 @@ add_head(list *l, node *n) LIST_INLINE void insert_node(node *n, node *after) { - EXPENSIVE_CHECK(check_list(l, after)); + EXPENSIVE_CHECK(check_list(NULL, after)); ASSUME(n->prev == NULL); ASSUME(n->next == NULL); @@ -141,7 +142,7 @@ insert_node(node *n, node *after) LIST_INLINE void rem_node(node *n) { - EXPENSIVE_CHECK(check_list(NULL, n)); + EXPENSIVE_CHECK((n == n->prev) && (n == n->next) || check_list(NULL, n)); node *z = n->prev; node *x = n->next; diff --git a/lib/lists.h b/lib/lists.h index 7e6d5467..86ff59c9 100644 --- a/lib/lists.h +++ b/lib/lists.h @@ -69,6 +69,18 @@ typedef union list { /* In fact two overlayed nodes */ #define EMPTY_LIST(list) (!(list).head->next) +static inline _Bool +enlisted(node *n) +{ + switch ((!!n->next) + (!!n->prev)) + { + case 0: return 0; + case 2: return 1; + case 1: bug("Garbled event list node"); + } + + bug("Maths is broken. And you should see a new heaven and a new earth: for the first heaven and the first earth had been passed away."); +} #ifndef _BIRD_LISTS_C_ #define LIST_INLINE static inline diff --git a/lib/locking.h b/lib/locking.h new file mode 100644 index 00000000..498afdc8 --- /dev/null +++ b/lib/locking.h @@ -0,0 +1,65 @@ +/* + * BIRD Library -- Locking + * + * (c) 2020--2021 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LOCKING_H_ +#define _BIRD_LOCKING_H_ + +struct domain_generic; + +/* Here define the global lock order; first to last. */ +struct lock_order { + struct domain_generic *the_bird; + struct domain_generic *proto; + struct domain_generic *service; + struct domain_generic *rtable; + struct domain_generic *attrs; + struct domain_generic *resource; +}; + +extern _Thread_local struct lock_order locking_stack; +extern _Thread_local struct domain_generic **last_locked; + +#define DOMAIN(type) struct domain__##type +#define DEFINE_DOMAIN(type) DOMAIN(type) { struct domain_generic *type; } +#define DOMAIN_ORDER(type) OFFSETOF(struct lock_order, type) + +#define DOMAIN_NEW(type, name) (DOMAIN(type)) { .type = domain_new(name, DOMAIN_ORDER(type)) } +struct domain_generic *domain_new(const char *name, uint order); + +#define DOMAIN_FREE(type, d) domain_free((d).type) +void domain_free(struct domain_generic *); + +#define DOMAIN_NULL(type) (DOMAIN(type)) {} + +#define LOCK_DOMAIN(type, d) do_lock(((d).type), &(locking_stack.type)) +#define UNLOCK_DOMAIN(type, d) do_unlock(((d).type), &(locking_stack.type)) + +#define DOMAIN_IS_LOCKED(type, d) (((d).type) == (locking_stack.type)) +#define DG_IS_LOCKED(d) ((d) == *(DG_LSP(d))) + +/* Internal for locking */ +void do_lock(struct domain_generic *dg, struct domain_generic **lsp); +void do_unlock(struct domain_generic *dg, struct domain_generic **lsp); + +uint dg_order(struct domain_generic *dg); + +#define DG_LSP(d) ((struct domain_generic **) (((void *) &locking_stack) + dg_order(d))) +#define DG_LOCK(d) do_lock(d, DG_LSP(d)) +#define DG_UNLOCK(d) do_unlock(d, DG_LSP(d)) + +/* Use with care. To be removed in near future. */ +DEFINE_DOMAIN(the_bird); +extern DOMAIN(the_bird) the_bird_domain; + +#define the_bird_lock() LOCK_DOMAIN(the_bird, the_bird_domain) +#define the_bird_unlock() UNLOCK_DOMAIN(the_bird, the_bird_domain) +#define the_bird_locked() DOMAIN_IS_LOCKED(the_bird, the_bird_domain) + +#define ASSERT_THE_BIRD_LOCKED ({ if (!the_bird_locked()) bug("The BIRD lock must be locked here: %s:%d", __FILE__, __LINE__); }) + +#endif diff --git a/lib/rcu.c b/lib/rcu.c new file mode 100644 index 00000000..83fdd022 --- /dev/null +++ b/lib/rcu.c @@ -0,0 +1,79 @@ +/* + * BIRD Library -- Read-Copy-Update Basic Operations + * + * (c) 2021 Maria Matejka <mq@jmq.cz> + * (c) 2021 CZ.NIC z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. + * Note: all the relevant patents shall be expired. + * + * Using the Supplementary Material for User-Level Implementations of Read-Copy-Update + * by Matthieu Desnoyers, Paul E. McKenney, Alan S. Stern, Michel R. Dagenais and Jonathan Walpole + * obtained from https://www.efficios.com/pub/rcu/urcu-supp-accepted.pdf + */ + +#include "lib/rcu.h" +#include "lib/io-loop.h" +#include "lib/locking.h" + +_Atomic uint rcu_gp_ctl = RCU_NEST_CNT; +_Thread_local struct rcu_birdloop *this_rcu_birdloop = NULL; + +static list rcu_birdloop_list; + +static struct rcu_birdloop main_rcu_birdloop; + +DEFINE_DOMAIN(resource); +static DOMAIN(resource) rcu_domain; + +static int +rcu_gp_ongoing(_Atomic uint *ctl) +{ + uint val = atomic_load(ctl); + return (val & RCU_NEST_CNT) && ((val ^ rcu_gp_ctl) & RCU_GP_PHASE); +} + +static void +update_counter_and_wait(void) +{ + atomic_fetch_xor(&rcu_gp_ctl, RCU_GP_PHASE); + struct rcu_birdloop *rc; + WALK_LIST(rc, rcu_birdloop_list) + while (rcu_gp_ongoing(&rc->ctl)) + birdloop_yield(); +} + +void +synchronize_rcu(void) +{ + LOCK_DOMAIN(resource, rcu_domain); + update_counter_and_wait(); + update_counter_and_wait(); + UNLOCK_DOMAIN(resource, rcu_domain); +} + +void +rcu_birdloop_start(struct rcu_birdloop *rc) +{ + LOCK_DOMAIN(resource, rcu_domain); + add_tail(&rcu_birdloop_list, &rc->n); + this_rcu_birdloop = rc; + UNLOCK_DOMAIN(resource, rcu_domain); +} + +void +rcu_birdloop_stop(struct rcu_birdloop *rc) +{ + LOCK_DOMAIN(resource, rcu_domain); + this_rcu_birdloop = NULL; + rem_node(&rc->n); + UNLOCK_DOMAIN(resource, rcu_domain); +} + +void +rcu_init(void) +{ + rcu_domain = DOMAIN_NEW(resource, "Read-Copy-Update"); + init_list(&rcu_birdloop_list); + rcu_birdloop_start(&main_rcu_birdloop); +} diff --git a/lib/rcu.h b/lib/rcu.h new file mode 100644 index 00000000..c537a1ef --- /dev/null +++ b/lib/rcu.h @@ -0,0 +1,55 @@ +/* + * BIRD Library -- Read-Copy-Update Basic Operations + * + * (c) 2021 Maria Matejka <mq@jmq.cz> + * (c) 2021 CZ.NIC z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. + * Note: all the relevant patents shall be expired. + */ + +#ifndef _BIRD_RCU_H_ +#define _BIRD_RCU_H_ + +#include "lib/birdlib.h" +#include "lib/lists.h" +#include <stdatomic.h> + +#define RCU_GP_PHASE 0x100000 +#define RCU_NEST_MASK 0x0fffff +#define RCU_NEST_CNT 0x000001 + +extern _Atomic uint rcu_gp_ctl; + +struct rcu_birdloop { + node n; + _Atomic uint ctl; +}; + +extern _Thread_local struct rcu_birdloop *this_rcu_birdloop; + +static inline void rcu_read_lock(void) +{ + uint cmp = atomic_load_explicit(&this_rcu_birdloop->ctl, memory_order_acquire); + + if (cmp & RCU_NEST_MASK) + atomic_store_explicit(&this_rcu_birdloop->ctl, cmp + RCU_NEST_CNT, memory_order_relaxed); + else + atomic_store(&this_rcu_birdloop->ctl, atomic_load_explicit(&rcu_gp_ctl, memory_order_acquire)); +} + +static inline void rcu_read_unlock(void) +{ + atomic_fetch_sub(&this_rcu_birdloop->ctl, RCU_NEST_CNT); +} + +void synchronize_rcu(void); + +/* Registering and unregistering a birdloop. To be called from birdloop implementation */ +void rcu_birdloop_start(struct rcu_birdloop *); +void rcu_birdloop_stop(struct rcu_birdloop *); + +/* Run this from resource init */ +void rcu_init(void); + +#endif diff --git a/lib/resource.c b/lib/resource.c index c31d9889..2e367132 100644 --- a/lib/resource.c +++ b/lib/resource.c @@ -14,6 +14,7 @@ #include "nest/bird.h" #include "lib/resource.h" #include "lib/string.h" +#include "lib/rcu.h" /** * DOC: Resource pools @@ -29,12 +30,6 @@ * is freed upon shutdown of the module. */ -struct pool { - resource r; - list inside; - const char *name; -}; - static void pool_dump(resource *); static void pool_free(resource *); static resource *pool_lookup(resource *, unsigned long); @@ -284,6 +279,7 @@ rlookup(unsigned long a) void resource_init(void) { + rcu_init(); resource_sys_init(); root_pool.r.class = &pool_class; diff --git a/lib/resource.h b/lib/resource.h index 4cedbf00..5d9e2165 100644 --- a/lib/resource.h +++ b/lib/resource.h @@ -40,7 +40,12 @@ struct resclass { /* Generic resource manipulation */ -typedef struct pool pool; +typedef struct pool { + resource r; + list inside; + const char *name; +} pool; + void resource_init(void); pool *rp_new(pool *, const char *); /* Create new pool */ @@ -115,9 +120,13 @@ void sl_free(void *); void buffer_realloc(void **buf, unsigned *size, unsigned need, unsigned item_size); /* Allocator of whole pages; for use in slabs and other high-level allocators. */ +#define PAGE_HEAD(x) ((void *) (((uintptr_t) (x)) & ~(page_size-1))) extern long page_size; +extern _Atomic int pages_kept; +extern _Atomic int pages_kept_locally; void *alloc_page(void); void free_page(void *); +void flush_local_pages(void); void resource_sys_init(void); diff --git a/lib/route.h b/lib/route.h index 68596316..eae251e7 100644 --- a/lib/route.h +++ b/lib/route.h @@ -10,12 +10,17 @@ #ifndef _BIRD_LIB_ROUTE_H_ #define _BIRD_LIB_ROUTE_H_ +#undef RT_SOURCE_DEBUG + #include "lib/type.h" +#include "lib/rcu.h" +#include "lib/hash.h" +#include "lib/event.h" struct network; struct proto; struct cli; - +struct rtable_private; typedef struct rte { struct ea_list *attrs; /* Attributes of this route */ @@ -29,12 +34,10 @@ typedef struct rte { u8 generation; /* If this route import is based on other previously exported route, this value should be 1 + MAX(generation of the parent routes). Otherwise the route is independent and this value is zero. */ + u8 stale_cycle; /* Auxiliary value for route refresh */ } rte; #define REF_FILTERED 2 /* Route is rejected by import filter */ -#define REF_STALE 4 /* Route is stale in a refresh cycle */ -#define REF_DISCARD 8 /* Route is scheduled for discard */ -#define REF_MODIFY 16 /* Route is scheduled for modify */ #define REF_PENDING 32 /* Route has not propagated completely yet */ /* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ @@ -45,19 +48,114 @@ static inline int rte_is_filtered(rte *r) { return !!(r->flags & REF_FILTERED); struct rte_src { struct rte_src *next; /* Hash chain */ - struct proto *proto; /* Protocol the source is based on */ + struct rte_owner *owner; /* Route source owner */ u32 private_id; /* Private ID, assigned by the protocol */ u32 global_id; /* Globally unique ID of the source */ - unsigned uc; /* Use count */ + _Atomic u64 uc; /* Use count */ +}; + +struct rte_owner_class { + void (*get_route_info)(struct rte *, byte *buf); /* Get route information (for `show route' command) */ + int (*rte_better)(struct rte *, struct rte *); + int (*rte_mergable)(struct rte *, struct rte *); + u32 (*rte_igp_metric)(const rte *); }; +struct rte_owner { + struct rte_owner_class *class; + int (*rte_recalculate)(struct rtable_private *, struct network *, struct rte *, struct rte *, struct rte *); + HASH(struct rte_src) hash; + const char *name; + u32 hash_key; + u32 uc; + event_list *list; + event *prune; + event *stop; +}; + +DEFINE_DOMAIN(attrs); +extern DOMAIN(attrs) attrs_domain; + +#define RTA_LOCK LOCK_DOMAIN(attrs, attrs_domain) +#define RTA_UNLOCK UNLOCK_DOMAIN(attrs, attrs_domain) + +#define RTE_SRC_PU_SHIFT 44 +#define RTE_SRC_IN_PROGRESS (1ULL << RTE_SRC_PU_SHIFT) + +/* Get a route source. This also locks the source, therefore the caller has to + * unlock the source after the route has been propagated. */ +struct rte_src *rt_get_source_o(struct rte_owner *o, u32 id); +#define rt_get_source(p, id) rt_get_source_o(&(p)->sources, (id)) -struct rte_src *rt_find_source(struct proto *p, u32 id); -struct rte_src *rt_get_source(struct proto *p, u32 id); struct rte_src *rt_find_source_global(u32 id); -static inline void rt_lock_source(struct rte_src *src) { src->uc++; } -static inline void rt_unlock_source(struct rte_src *src) { src->uc--; } -void rt_prune_sources(void); + +#ifdef RT_SOURCE_DEBUG +#define rt_lock_source _rt_lock_source_internal +#define rt_unlock_source _rt_unlock_source_internal +#endif + +static inline void rt_lock_source(struct rte_src *src) +{ + /* Locking a source is trivial; somebody already holds it so we just increase + * the use count. Nothing can be freed underneath our hands. */ + u64 uc = atomic_fetch_add_explicit(&src->uc, 1, memory_order_acq_rel); + ASSERT_DIE(uc > 0); +} + +static inline void rt_unlock_source(struct rte_src *src) +{ + /* Unlocking is tricky. We do it lockless so at the same time, the prune + * event may be running, therefore if the unlock gets us to zero, it must be + * the last thing in this routine, otherwise the prune routine may find the + * source's usecount zeroed, freeing it prematurely. + * + * The usecount is split into two parts: + * the top 20 bits are an in-progress indicator + * the bottom 44 bits keep the actual usecount. + * + * Therefore at most 1 million of writers can simultaneously unlock the same + * source, while at most ~17T different routes can reference it. Both limits + * are insanely high from the 2022 point of view. Let's suppose that when 17T + * routes or 1M writers get real, we get also 128bit atomic variables in the + * C norm. */ + + /* First, we push the in-progress indicator */ + u64 uc = atomic_fetch_add_explicit(&src->uc, RTE_SRC_IN_PROGRESS, memory_order_acq_rel); + + /* Then we split the indicator to its parts. Remember, we got the value before the operation happened. */ + u64 pending = (uc >> RTE_SRC_PU_SHIFT) + 1; + uc &= RTE_SRC_IN_PROGRESS - 1; + + /* We per-use the RCU critical section indicator to make the prune event wait + * until we finish here in the rare case we get preempted. */ + rcu_read_lock(); + + /* Obviously, there can't be more pending unlocks than the usecount itself */ + if (uc == pending) + /* If we're the last unlocker, schedule the owner's prune event */ + ev_send(src->owner->list, src->owner->prune); + else + ASSERT_DIE(uc > pending); + + /* And now, finally, simultaneously pop the in-progress indicator and the + * usecount, possibly allowing the source pruning routine to free this structure */ + atomic_fetch_sub_explicit(&src->uc, RTE_SRC_IN_PROGRESS + 1, memory_order_acq_rel); + + /* ... and to reduce the load a bit, the source pruning routine will better wait for + * RCU synchronization instead of a busy loop. */ + rcu_read_unlock(); +} + +#ifdef RT_SOURCE_DEBUG +#undef rt_lock_source +#undef rt_unlock_source + +#define rt_lock_source(x) ( log(L_INFO "Lock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_lock_source_internal(x) ) +#define rt_unlock_source(x) ( log(L_INFO "Unlock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_unlock_source_internal(x) ) +#endif + +void rt_init_sources(struct rte_owner *, const char *name, event_list *list); +void rt_destroy_sources(struct rte_owner *, event *); /* * Route Attributes @@ -159,7 +257,7 @@ typedef struct ea_list { struct ea_storage { struct ea_storage *next_hash; /* Next in hash chain */ struct ea_storage **pprev_hash; /* Previous in hash chain */ - u32 uc; /* Use count */ + _Atomic u32 uc; /* Use count */ u32 hash_key; /* List hash */ ea_list l[0]; /* The list itself */ }; @@ -425,15 +523,18 @@ static inline int ea_is_cached(const ea_list *r) { return r->flags & EALF_CACHED static inline struct ea_storage *ea_get_storage(ea_list *r) { ASSERT_DIE(ea_is_cached(r)); - return SKIP_BACK(struct ea_storage, l, r); + return SKIP_BACK(struct ea_storage, l[0], r); } -static inline ea_list *ea_clone(ea_list *r) { ea_get_storage(r)->uc++; return r; } +static inline ea_list *ea_clone(ea_list *r) { + ASSERT_DIE(0 < atomic_fetch_add_explicit(&ea_get_storage(r)->uc, 1, memory_order_acq_rel)); + return r; +} void ea__free(struct ea_storage *r); static inline void ea_free(ea_list *l) { if (!l) return; struct ea_storage *r = ea_get_storage(l); - if (!--r->uc) ea__free(r); + if (1 == atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel)) ea__free(r); } void ea_dump(ea_list *); diff --git a/lib/settle.h b/lib/settle.h new file mode 100644 index 00000000..d274599d --- /dev/null +++ b/lib/settle.h @@ -0,0 +1,64 @@ +/* + * BIRD -- Settle timer + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * (c) 2022 CZ.NIC z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_SETTLE_H_ +#define _BIRD_SETTLE_H_ + +#include "lib/birdlib.h" +#include "lib/timer.h" + +struct settle_config { + btime min, max; +}; + +struct settle { + union { + /* Timer hook polymorphism. */ + struct { + resource _r; + void (*hook)(struct settle *); + }; + timer tm; + }; + struct settle_config cf; + btime started; +}; + +STATIC_ASSERT(OFFSETOF(struct settle, hook) == OFFSETOF(struct settle, tm) + OFFSETOF(timer, hook)); + +#define SETTLE_INIT(_cfp, _hook, _data) (struct settle) { .tm = { .data = (_data), }, .hook = (_hook), .cf = ({ASSERT_DIE((_cfp)->min <= (_cfp)->max); *(_cfp); }), } + + +static inline void settle_init(struct settle *s, struct settle_config *cf, void (*hook)(struct settle *), void *data) +{ + *s = SETTLE_INIT(cf, hook, data); +} + +#define settle_active(s) tm_active(&(s)->tm) + +static inline void settle_kick(struct settle *s, struct birdloop *loop) +{ + if (!tm_active(&s->tm)) + { + s->started = current_time(); + tm_set_in(&s->tm, s->started + s->cf.min, loop); + } + else + { + btime now = current_time(); + tm_set_in(&s->tm, MIN_(now + s->cf.min, s->started + s->cf.max), loop); + } +} + +static inline void settle_cancel(struct settle *s) +{ + tm_stop(&s->tm); +} + +#endif @@ -197,7 +197,7 @@ static struct resclass sl_class = { slab_memsize }; -#define SL_GET_HEAD(x) ((struct sl_head *) (((uintptr_t) (x)) & ~(page_size-1))) +#define SL_GET_HEAD(x) PAGE_HEAD(x) #define SL_HEAD_CHANGE_STATE(_s, _h, _from, _to) ({ \ ASSERT_DIE(_h->state == slh_##_from); \ diff --git a/lib/socket.h b/lib/socket.h index 0b6ac589..5c69482e 100644 --- a/lib/socket.h +++ b/lib/socket.h @@ -12,6 +12,7 @@ #include <errno.h> #include "lib/resource.h" +#include "lib/event.h" #ifdef HAVE_LIBSSH #define LIBSSH_LEGACY_0_4 #include <libssh/libssh.h> @@ -79,6 +80,7 @@ typedef struct birdsock { const char *password; /* Password for MD5 authentication */ const char *err; /* Error message */ struct ssh_sock *ssh; /* Used in SK_SSH */ + struct event reloop; /* Reloop event */ } sock; sock *sock_new(pool *); /* Allocate new socket */ @@ -129,6 +131,7 @@ extern int sk_priority_control; /* Suggested priority for control traffic, shou #define SKF_TRUNCATED 0x200 /* Received packet was truncated, set by IO layer */ #define SKF_HDRINCL 0x400 /* Used internally */ #define SKF_PKTINFO 0x800 /* Used internally */ +#define SKF_PASSIVE_THREAD 0x1000 /* Child sockets used in thread, do not add to main loop */ /* * Socket types SA SP DA DP IF TTL SendTo (?=may, -=must not, *=must) diff --git a/lib/timer.c b/lib/timer.c index c47e0bbc..ff6975a4 100644 --- a/lib/timer.c +++ b/lib/timer.c @@ -36,57 +36,13 @@ #include "lib/resource.h" #include "lib/timer.h" - -struct timeloop main_timeloop; - - -#ifdef USE_PTHREADS - #include <pthread.h> -/* Data accessed and modified from proto/bfd/io.c */ -pthread_key_t current_time_key; - -static inline struct timeloop * -timeloop_current(void) -{ - return pthread_getspecific(current_time_key); -} - -static inline void -timeloop_init_current(void) -{ - pthread_key_create(¤t_time_key, NULL); - pthread_setspecific(current_time_key, &main_timeloop); -} +_Atomic btime last_time; +_Atomic btime real_time; void wakeup_kick_current(void); -#else - -/* Just use main timelooop */ -static inline struct timeloop * timeloop_current(void) { return &main_timeloop; } -static inline void timeloop_init_current(void) { } - -#endif - -btime -current_time(void) -{ - return timeloop_current()->last_time; -} - -btime -current_real_time(void) -{ - struct timeloop *loop = timeloop_current(); - - if (!loop->real_time) - times_update_real_time(loop); - - return loop->real_time; -} - #define TIMER_LESS(a,b) ((a)->expires < (b)->expires) #define TIMER_SWAP(heap,a,b,t) (t = heap[a], heap[a] = heap[b], heap[b] = t, \ @@ -112,7 +68,7 @@ tm_dump(resource *r) if (t->recurrent) debug("recur %d, ", t->recurrent); if (t->expires) - debug("expires in %d ms)\n", (t->expires - current_time()) TO_MS); + debug("in loop %p expires in %d ms)\n", t->loop, (t->expires - current_time()) TO_MS); else debug("inactive)\n"); } @@ -135,41 +91,40 @@ tm_new(pool *p) return t; } -void -tm_set(timer *t, btime when) +static void +tm_set_in_tl(timer *t, btime when, struct timeloop *local_timeloop) { - struct timeloop *loop = timeloop_current(); - uint tc = timers_count(loop); + uint tc = timers_count(local_timeloop); if (!t->expires) { t->index = ++tc; t->expires = when; - BUFFER_PUSH(loop->timers) = t; - HEAP_INSERT(loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP); + BUFFER_PUSH(local_timeloop->timers) = t; + HEAP_INSERT(local_timeloop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP); } else if (t->expires < when) { t->expires = when; - HEAP_INCREASE(loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); + HEAP_INCREASE(local_timeloop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); } else if (t->expires > when) { t->expires = when; - HEAP_DECREASE(loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); + HEAP_DECREASE(local_timeloop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); } -#ifdef CONFIG_BFD - /* Hack to notify BFD loops */ - if ((loop != &main_timeloop) && (t->index == 1)) - wakeup_kick_current(); -#endif + t->loop = local_timeloop; + + if (t->index == 1) + birdloop_ping(local_timeloop->loop); } void -tm_start(timer *t, btime after) +tm_set_in(timer *t, btime when, struct birdloop *loop) { - tm_set(t, current_time() + MAX(after, 0)); + ASSERT_DIE(birdloop_inside(loop)); + tm_set_in_tl(t, when, birdloop_time_loop(loop)); } void @@ -178,20 +133,22 @@ tm_stop(timer *t) if (!t->expires) return; - struct timeloop *loop = timeloop_current(); - uint tc = timers_count(loop); + TLOCK_TIMER_ASSERT(t->loop); - HEAP_DELETE(loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); - BUFFER_POP(loop->timers); + uint tc = timers_count(t->loop); + + HEAP_DELETE(t->loop->timers.data, tc, timer *, TIMER_LESS, TIMER_SWAP, t->index); + BUFFER_POP(t->loop->timers); t->index = -1; t->expires = 0; + t->loop = NULL; } void timers_init(struct timeloop *loop, pool *p) { - times_init(loop); + TLOCK_TIMER_ASSERT(loop); BUFFER_INIT(loop->timers, p, 4); BUFFER_PUSH(loop->timers) = NULL; @@ -200,13 +157,15 @@ timers_init(struct timeloop *loop, pool *p) void io_log_event(void *hook, void *data); void -timers_fire(struct timeloop *loop) +timers_fire(struct timeloop *loop, int io_log) { + TLOCK_TIMER_ASSERT(loop); + btime base_time; timer *t; - times_update(loop); - base_time = loop->last_time; + times_update(); + base_time = current_time(); while (t = timers_first(loop)) { @@ -217,19 +176,19 @@ timers_fire(struct timeloop *loop) { btime when = t->expires + t->recurrent; - if (when <= loop->last_time) - when = loop->last_time + t->recurrent; + if (when <= base_time) + when = base_time + t->recurrent; if (t->randomize) when += random() % (t->randomize + 1); - tm_set(t, when); + tm_set_in_tl(t, when, loop); } else tm_stop(t); /* This is ugly hack, we want to log just timers executed from the main I/O loop */ - if (loop == &main_timeloop) + if (io_log) io_log_event(t->hook, t->data); t->hook(t); @@ -237,13 +196,6 @@ timers_fire(struct timeloop *loop) } } -void -timer_init(void) -{ - timers_init(&main_timeloop, &root_pool); - timeloop_init_current(); -} - /** * tm_parse_time - parse a date and time diff --git a/lib/timer.h b/lib/timer.h index c5ea430c..555fc96f 100644 --- a/lib/timer.h +++ b/lib/timer.h @@ -12,8 +12,14 @@ #include "nest/bird.h" #include "lib/buffer.h" +#include "lib/io-loop.h" +#include "lib/locking.h" #include "lib/resource.h" +#include <stdatomic.h> + +extern _Atomic btime last_time; +extern _Atomic btime real_time; typedef struct timer { @@ -25,36 +31,42 @@ typedef struct timer uint randomize; /* Amount of randomization */ uint recurrent; /* Timer recurrence */ + struct timeloop *loop; /* Loop where the timer is active */ + int index; } timer; struct timeloop { BUFFER_(timer *) timers; - btime last_time; - btime real_time; + struct domain_generic *domain; + struct birdloop *loop; }; +#define TLOCK_TIMER_ASSERT(loop) ASSERT_DIE((loop)->domain && DG_IS_LOCKED((loop)->domain)) +#define TLOCK_LOCAL_ASSERT(loop) ASSERT_DIE(!(loop)->domain || DG_IS_LOCKED((loop)->domain)) + static inline uint timers_count(struct timeloop *loop) -{ return loop->timers.used - 1; } +{ TLOCK_TIMER_ASSERT(loop); return loop->timers.used - 1; } static inline timer *timers_first(struct timeloop *loop) -{ return (loop->timers.used > 1) ? loop->timers.data[1] : NULL; } +{ TLOCK_TIMER_ASSERT(loop); return (loop->timers.used > 1) ? loop->timers.data[1] : NULL; } -extern struct timeloop main_timeloop; - -btime current_time(void); -btime current_real_time(void); +#define current_time() atomic_load_explicit(&last_time, memory_order_acquire) +#define current_real_time() atomic_load_explicit(&real_time, memory_order_acquire) //#define now (current_time() TO_S) //#define now_real (current_real_time() TO_S) extern btime boot_time; timer *tm_new(pool *p); -void tm_set(timer *t, btime when); -void tm_start(timer *t, btime after); +#define tm_set(t, when) tm_set_in((t), (when), &main_birdloop) +#define tm_start(t, after) tm_start_in((t), (after), &main_birdloop) void tm_stop(timer *t); +void tm_set_in(timer *t, btime when, struct birdloop *loop); +#define tm_start_in(t, after, loop) tm_set_in((t), (current_time() + MAX_((after), 0)), loop) + static inline int tm_active(timer *t) { @@ -94,15 +106,11 @@ tm_start_max(timer *t, btime after) } /* In sysdep code */ -void times_init(struct timeloop *loop); -void times_update(struct timeloop *loop); -void times_update_real_time(struct timeloop *loop); +void times_update(void); /* For I/O loop */ void timers_init(struct timeloop *loop, pool *p); -void timers_fire(struct timeloop *loop); - -void timer_init(void); +void timers_fire(struct timeloop *loop, int io_log); struct timeformat { diff --git a/misc/bird.spec b/misc/bird.spec index 425a6340..c3fdd7d6 100644 --- a/misc/bird.spec +++ b/misc/bird.spec @@ -1,6 +1,6 @@ Summary: BIRD Internet Routing Daemon Name: bird -Version: 2.0.9 +Version: 2.0.10 Release: 1 Copyright: GPL Group: Networking/Daemons diff --git a/nest/Makefile b/nest/Makefile index 0350c3b6..5b27da0c 100644 --- a/nest/Makefile +++ b/nest/Makefile @@ -2,11 +2,14 @@ src := cli.c cmds.c iface.c locks.c neighbor.c password.c proto.c proto-build.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,dev_build) -$(proto-build-c): $(lastword $(MAKEFILE_LIST)) +$(objdir)/nest/proto-build.c: $(lastword $(MAKEFILE_LIST)) $(E)echo GEN $@ - $(Q)echo "#include \"lib/birdlib.h\"\n$(patsubst %,void %(void);\n,$(PROTO_BUILD)) void protos_build_gen(void) { $(patsubst %, %();\n,$(PROTO_BUILD))}" > $@ + $(Q)echo "#include \"lib/birdlib.h\"" > $@ + $(Q)$(patsubst %,echo 'void %_build(void);' >> $@;,$(PROTO_BUILD)) + $(Q)echo "void protos_build_gen(void) {" >> $@ + $(Q)$(patsubst %,echo ' %_build();'>>$@;,$(PROTO_BUILD)) + $(Q)echo "}" >> $@ tests_src := tests_targets := $(tests_targets) $(tests-target-files) diff --git a/nest/cmds.c b/nest/cmds.c index 092be48a..8a5bbdd4 100644 --- a/nest/cmds.c +++ b/nest/cmds.c @@ -109,7 +109,6 @@ print_size(char *dsc, struct resmem vals) extern pool *rt_table_pool; extern pool *rta_pool; -extern uint *pages_kept; void cmd_show_memory(void) @@ -121,8 +120,10 @@ cmd_show_memory(void) print_size("Protocols:", rmemsize(proto_pool)); struct resmem total = rmemsize(&root_pool); #ifdef HAVE_MMAP - print_size("Standby memory:", (struct resmem) { .overhead = page_size * *pages_kept }); - total.overhead += page_size * *pages_kept; + int pk = atomic_load_explicit(&pages_kept, memory_order_relaxed) + + atomic_load_explicit(&pages_kept_locally, memory_order_relaxed); + print_size("Standby memory:", (struct resmem) { .overhead = page_size * pk }); + total.overhead += page_size * pk; #endif print_size("Total:", total); cli_msg(0, ""); diff --git a/nest/config.Y b/nest/config.Y index edddfc2b..f2904882 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -125,7 +125,7 @@ CF_KEYWORDS(TIMEFORMAT, ISO, SHORT, LONG, ROUTE, PROTOCOL, BASE, LOG, S, MS, US) CF_KEYWORDS(GRACEFUL, RESTART, WAIT, MAX, AS) CF_KEYWORDS(MIN, IDLE, RX, TX, INTERVAL, MULTIPLIER, PASSIVE) CF_KEYWORDS(CHECK, LINK) -CF_KEYWORDS(SORTED, TRIE, MIN, MAX, SETTLE, TIME) +CF_KEYWORDS(CORK, SORTED, TRIE, MIN, MAX, ROA, ROUTE, REFRESH, SETTLE, TIME, GC, THRESHOLD, PERIOD) /* For r_args_channel */ CF_KEYWORDS(IPV4, IPV4_MC, IPV4_MPLS, IPV6, IPV6_MC, IPV6_MPLS, IPV6_SADR, VPN4, VPN4_MC, VPN4_MPLS, VPN6, VPN6_MC, VPN6_MPLS, ROA4, ROA6, FLOW4, FLOW6, MPLS, PRI, SEC) @@ -165,7 +165,7 @@ rtrid: idval: NUM { $$ = $1; } - | '(' term ')' { $$ = f_eval_int(f_linearize($2)); } + | '(' term ')' { $$ = f_eval_int(f_linearize($2, 1)); } | IP4 { $$ = ip4_to_u32($1); } | CF_SYM_KNOWN { if ($1->class == (SYM_CONSTANT | T_INT) || $1->class == (SYM_CONSTANT | T_QUAD)) @@ -227,8 +227,15 @@ table_opt: cf_error("Trie option not supported for %s table", net_label[this_table->addr_type]); this_table->trie_used = $2; } - | MIN SETTLE TIME expr_us { this_table->min_settle_time = $4; } - | MAX SETTLE TIME expr_us { this_table->max_settle_time = $4; } + | GC THRESHOLD expr { this_table->gc_threshold = $3; } + | GC PERIOD expr_us { this_table->gc_period = (uint) $3; if ($3 > 3600 S_) cf_error("GC period must be at most 3600 s"); } + | CORK THRESHOLD expr expr { + if ($3 > $4) cf_error("Cork low threshold must be lower than the high threshold."); + this_table->cork_threshold.low = $3; + this_table->cork_threshold.high = $4; } + | EXPORT SETTLE TIME settle { this_table->export_settle = $4; } + | ROUTE REFRESH EXPORT SETTLE TIME settle { this_table->export_rr_settle = $6; } + | DEBUG bool { this_table->debug = $2; } ; table_opts: @@ -289,8 +296,8 @@ proto_item: | MRTDUMP mrtdump_mask { this_proto->mrtdump = $2; } | ROUTER ID idval { this_proto->router_id = $3; } | DESCRIPTION text { this_proto->dsc = $2; } - | VRF text { this_proto->vrf = if_get_by_name($2); this_proto->vrf_set = 1; } - | VRF DEFAULT { this_proto->vrf = NULL; this_proto->vrf_set = 1; } + | VRF text { this_proto->vrf = if_get_by_name($2); } + | VRF DEFAULT { this_proto->vrf = &default_vrf; } ; @@ -316,6 +323,7 @@ channel_item_: | RECEIVE LIMIT limit_spec { this_channel->rx_limit = $3; } | IMPORT LIMIT limit_spec { this_channel->in_limit = $3; } | EXPORT LIMIT limit_spec { this_channel->out_limit = $3; } + | ROA SETTLE TIME settle { this_channel->roa_settle = $4; } | PREFERENCE expr { this_channel->preference = $2; check_u16($2); } | IMPORT KEEP FILTERED bool { if ($4) @@ -355,7 +363,11 @@ channel_end: proto_channel: channel_start channel_opt_list channel_end; -rtable: CF_SYM_KNOWN { cf_assert_symbol($1, SYM_TABLE); $$ = $1->table; } ; +rtable: CF_SYM_KNOWN { + cf_assert_symbol($1, SYM_TABLE); + if (!$1->table) rt_new_default_table($1); + $$ = $1->table; +} ; imexport: FILTER filter { $$ = $2; } @@ -384,7 +396,7 @@ debug_default: DEBUG PROTOCOLS debug_mask { new_config->proto_default_debug = $3; } | DEBUG CHANNELS debug_mask { new_config->channel_default_debug = $3; } | DEBUG COMMANDS expr { new_config->cli_debug = $3; } - | DEBUG PIPE bool { new_config->pipe_debug = $3; } + | DEBUG TABLES debug_mask { new_config->table_debug = $3; } ; /* MRTDUMP PROTOCOLS is in systep/unix/config.Y */ @@ -413,7 +425,6 @@ timeformat_base: TIMEFORMAT timeformat_spec ';' ; - /* Interface patterns */ iface_patt_node_init: @@ -470,6 +481,7 @@ proto: dev_proto '}' ; dev_proto_start: proto_start DIRECT { this_proto = proto_config_new(&proto_device, $1); init_list(&DIRECT_CFG->iface_list); + this_proto->late_if_feed = 1; } ; @@ -676,6 +688,7 @@ r_args: } | r_args TABLE symbol_known { cf_assert_symbol($3, SYM_TABLE); + if (!$3->table) cf_error("Table %s not configured", $3->name); $$ = $1; rt_show_add_table($$, $3->table->table); $$->tables_defined_by = RSD_TDB_DIRECT; @@ -689,7 +702,8 @@ r_args: } | r_args IMPORT TABLE channel_arg { if (!($4->in_keep & RIK_PREFILTER)) cf_error("No import table in channel %s.%s", $4->proto->name, $4->name); - rt_show_add_exporter($$, &$4->table->exporter, "import")->prefilter = $4; + RT_LOCKED($4->table, tab) + rt_show_add_exporter($$, &tab->exporter.e, "import")->prefilter = $4; $$->tables_defined_by = RSD_TDB_DIRECT; } | r_args EXPORT TABLE channel_arg { @@ -875,7 +889,7 @@ CF_CLI(DUMP FILTER ALL,,, [[Dump all filters in linearized form]]) { filters_dump_all(); cli_msg(0, ""); } ; CF_CLI(EVAL, term, <expr>, [[Evaluate an expression]]) -{ cmd_eval(f_linearize($2)); } ; +{ cmd_eval(f_linearize($2, 1)); } ; CF_CLI_HELP(ECHO, ..., [[Control echoing of log messages]]) CF_CLI(ECHO, echo_mask echo_size, (all | off | { debug|trace|info|remote|warning|error|auth [, ...] }) [<buffer-size>], [[Control echoing of log messages]]) { diff --git a/nest/iface.c b/nest/iface.c index 682340c5..fc896e26 100644 --- a/nest/iface.c +++ b/nest/iface.c @@ -37,6 +37,7 @@ static pool *if_pool; list iface_list; +struct iface default_vrf; static void if_recalc_preferred(struct iface *i); @@ -147,7 +148,7 @@ ifa_send_notify(struct proto *p, unsigned c, struct ifa *a) { if (p->ifa_notify && (p->proto_state != PS_DOWN) && - (!p->vrf_set || p->vrf == a->iface->master)) + (!p->vrf || p->vrf == a->iface->master)) { if (p->debug & D_IFACES) log(L_TRACE "%s < address %N on interface %s %s", @@ -185,7 +186,7 @@ if_send_notify(struct proto *p, unsigned c, struct iface *i) { if (p->if_notify && (p->proto_state != PS_DOWN) && - (!p->vrf_set || p->vrf == i->master)) + (!p->vrf || p->vrf == i->master)) { if (p->debug & D_IFACES) log(L_TRACE "%s < interface %s %s", p->name, i->name, @@ -243,7 +244,7 @@ if_recalc_flags(struct iface *i UNUSED, uint flags) { if ((flags & IF_ADMIN_UP) && !(flags & (IF_SHUTDOWN | IF_TMP_DOWN)) && - !(i->master_index && !i->master)) + !(i->master_index && i->master == &default_vrf)) flags |= IF_UP; else flags &= ~IF_UP; @@ -301,6 +302,9 @@ if_update(struct iface *new) struct iface *i; unsigned c; + if (!new->master) + new->master = &default_vrf; + WALK_LIST(i, iface_list) if (!strcmp(new->name, i->name)) { @@ -711,6 +715,7 @@ if_init(void) { if_pool = rp_new(&root_pool, "Interfaces"); init_list(&iface_list); + strcpy(default_vrf.name, "default"); neigh_init(if_pool); } @@ -843,7 +848,7 @@ if_show(void) continue; char mbuf[16 + sizeof(i->name)] = {}; - if (i->master) + if (i->master != &default_vrf) bsprintf(mbuf, " master=%s", i->master->name); else if (i->master_index) bsprintf(mbuf, " master=#%u", i->master_index); diff --git a/nest/iface.h b/nest/iface.h index 1189cdd4..13f3bd12 100644 --- a/nest/iface.h +++ b/nest/iface.h @@ -28,6 +28,8 @@ struct ifa { /* Interface address */ unsigned flags; /* Analogous to iface->flags */ }; +extern struct iface default_vrf; + struct iface { node n; char name[16]; diff --git a/nest/neighbor.c b/nest/neighbor.c index 7cf9c85d..81da24d5 100644 --- a/nest/neighbor.c +++ b/nest/neighbor.c @@ -142,7 +142,7 @@ if_connected(ip_addr a, struct iface *i, struct ifa **ap, uint flags) } static inline int -if_connected_any(ip_addr a, struct iface *vrf, uint vrf_set, struct iface **iface, struct ifa **addr, uint flags) +if_connected_any(ip_addr a, struct iface *vrf, struct iface **iface, struct ifa **addr, uint flags) { struct iface *i; struct ifa *b; @@ -153,7 +153,7 @@ if_connected_any(ip_addr a, struct iface *vrf, uint vrf_set, struct iface **ifac /* Prefer SCOPE_HOST or longer prefix */ WALK_LIST(i, iface_list) - if ((!vrf_set || vrf == i->master) && ((s = if_connected(a, i, &b, flags)) >= 0)) + if ((!vrf || vrf == i->master) && ((s = if_connected(a, i, &b, flags)) >= 0)) if (scope_better(s, scope) || (scope_remote(s, scope) && ifa_better(b, *addr))) { *iface = i; @@ -245,7 +245,7 @@ neigh_find(struct proto *p, ip_addr a, struct iface *iface, uint flags) iface = (scope < 0) ? NULL : iface; } else - scope = if_connected_any(a, p->vrf, p->vrf_set, &iface, &addr, flags); + scope = if_connected_any(a, p->vrf, &iface, &addr, flags); /* scope < 0 means i don't know neighbor */ /* scope >= 0 <=> iface != NULL */ @@ -369,7 +369,7 @@ neigh_update(neighbor *n, struct iface *iface) return; /* VRF-bound neighbors ignore changes in other VRFs */ - if (p->vrf_set && (p->vrf != iface->master)) + if (p->vrf && (p->vrf != iface->master)) return; scope = if_connected(n->addr, iface, &ifa, n->flags); @@ -379,7 +379,7 @@ neigh_update(neighbor *n, struct iface *iface) { /* When neighbor is going down, try to respawn it on other ifaces */ if ((scope < 0) && (n->scope >= 0) && !n->ifreq && (n->flags & NEF_STICKY)) - scope = if_connected_any(n->addr, p->vrf, p->vrf_set, &iface, &ifa, n->flags); + scope = if_connected_any(n->addr, p->vrf, &iface, &ifa, n->flags); } else { diff --git a/nest/proto.c b/nest/proto.c index 061205c1..319b35dd 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -55,9 +55,31 @@ static void channel_update_limit(struct channel *c, struct limit *l, int dir, st static void channel_reset_limit(struct channel *c, struct limit *l, int dir); static void channel_feed_end(struct channel *c); static void channel_export_stopped(struct rt_export_request *req); +static void channel_check_stopped(struct channel *c); static inline int proto_is_done(struct proto *p) -{ return (p->proto_state == PS_DOWN) && (p->active_channels == 0); } +{ return (p->proto_state == PS_DOWN) && proto_is_inactive(p); } + +static inline event_list *proto_event_list(struct proto *p) +{ return p->loop == &main_birdloop ? &global_event_list : birdloop_event_list(p->loop); } + +static inline event_list *proto_work_list(struct proto *p) +{ return p->loop == &main_birdloop ? &global_work_list : birdloop_event_list(p->loop); } + +static inline void proto_send_event(struct proto *p) +{ ev_send(proto_event_list(p), p->event); } + +#define PROTO_ENTER_FROM_MAIN(p) ({ \ + ASSERT_DIE(birdloop_inside(&main_birdloop)); \ + struct birdloop *_loop = (p)->loop; \ + if (_loop != &main_birdloop) birdloop_enter(_loop); \ + _loop; \ + }) + +#define PROTO_LEAVE_FROM_MAIN(loop) ({ if (loop != &main_birdloop) birdloop_leave(loop); }) + +#define PROTO_LOCKED_FROM_MAIN(p) for (struct birdloop *_proto_loop = PROTO_ENTER_FROM_MAIN(p); _proto_loop; PROTO_LEAVE_FROM_MAIN(_proto_loop), (_proto_loop = NULL)) + static inline int channel_is_active(struct channel *c) { return (c->channel_state != CS_DOWN); } @@ -147,7 +169,7 @@ proto_cf_find_channel(struct proto_config *pc, uint net_type) * Returns pointer to channel or NULL */ struct channel * -proto_find_channel_by_table(struct proto *p, struct rtable *t) +proto_find_channel_by_table(struct proto *p, rtable *t) { struct channel *c; @@ -291,10 +313,18 @@ proto_remove_channels(struct proto *p) proto_remove_channel(p, c); } +struct roa_subscription { + node roa_node; + struct settle settle; + struct channel *c; + struct rt_export_request req; +}; + static void -channel_roa_in_changed(struct rt_subscription *s) +channel_roa_in_changed(struct settle *se) { - struct channel *c = s->data; + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, settle, se); + struct channel *c = s->c; int active = !!c->reload_req.hook; CD(c, "Reload triggered by RPKI change%s", active ? " - already active" : ""); @@ -306,9 +336,11 @@ channel_roa_in_changed(struct rt_subscription *s) } static void -channel_roa_out_changed(struct rt_subscription *s) +channel_roa_out_changed(struct settle *se) { - struct channel *c = s->data; + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, settle, se); + struct channel *c = s->c; + CD(c, "Feeding triggered by RPKI change"); c->refeed_pending = 1; @@ -317,29 +349,47 @@ channel_roa_out_changed(struct rt_subscription *s) rt_stop_export(&c->out_req, channel_export_stopped); } -/* Temporary code, subscriptions should be changed to resources */ -struct roa_subscription { - struct rt_subscription s; - node roa_node; -}; +static void +channel_export_one_roa(struct rt_export_request *req, const net_addr *net UNUSED, struct rt_pending_export *first) +{ + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, req, req); + + /* TODO: use the information about what roa has changed */ + settle_kick(&s->settle, &main_birdloop); + + rpe_mark_seen_all(req->hook, first, NULL); +} + +static void +channel_dump_roa_req(struct rt_export_request *req) +{ + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, req, req); + struct channel *c = s->c; + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter.e, req->hook->table); + + debug(" Channel %s.%s ROA %s change notifier from table %s request %p\n", + c->proto->name, c->name, + (s->settle.hook == channel_roa_in_changed) ? "import" : "export", + tab->name, req); +} static int channel_roa_is_subscribed(struct channel *c, rtable *tab, int dir) { - void (*hook)(struct rt_subscription *) = + void (*hook)(struct settle *) = dir ? channel_roa_in_changed : channel_roa_out_changed; struct roa_subscription *s; node *n; WALK_LIST2(s, n, c->roa_subscriptions, roa_node) - if ((s->s.tab == tab) && (s->s.hook == hook)) + if ((tab == SKIP_BACK(rtable, priv.exporter.e, s->req.hook->table)) + && (s->settle.hook == hook)) return 1; return 0; } - static void channel_roa_subscribe(struct channel *c, rtable *tab, int dir) { @@ -348,26 +398,46 @@ channel_roa_subscribe(struct channel *c, rtable *tab, int dir) struct roa_subscription *s = mb_allocz(c->proto->pool, sizeof(struct roa_subscription)); - s->s.hook = dir ? channel_roa_in_changed : channel_roa_out_changed; - s->s.data = c; - rt_subscribe(tab, &s->s); + *s = (struct roa_subscription) { + .settle = SETTLE_INIT(&c->roa_settle, dir ? channel_roa_in_changed : channel_roa_out_changed, NULL), + .c = c, + .req = { + .name = mb_sprintf(c->proto->pool, "%s.%s.roa-%s.%s", + c->proto->name, c->name, dir ? "in" : "out", tab->name), + .list = proto_work_list(c->proto), + .trace_routes = c->debug | c->proto->debug, + .dump_req = channel_dump_roa_req, + .export_one = channel_export_one_roa, + }, + }; add_tail(&c->roa_subscriptions, &s->roa_node); + rt_request_export(tab, &s->req); } static void -channel_roa_unsubscribe(struct roa_subscription *s) +channel_roa_unsubscribed(struct rt_export_request *req) { - rt_unsubscribe(&s->s); + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, req, req); + struct channel *c = s->c; + rem_node(&s->roa_node); mb_free(s); + + channel_check_stopped(c); +} + +static void +channel_roa_unsubscribe(struct roa_subscription *s) +{ + rt_stop_export(&s->req, channel_roa_unsubscribed); } static void channel_roa_subscribe_filter(struct channel *c, int dir) { const struct filter *f = dir ? c->in_filter : c->out_filter; - struct rtable *tab; + rtable *tab; int valid = 1, found = 0; if ((f == FILTER_ACCEPT) || (f == FILTER_REJECT)) @@ -428,17 +498,13 @@ channel_start_import(struct channel *c) return; } - int nlen = strlen(c->name) + strlen(c->proto->name) + 2; - char *rn = mb_allocz(c->proto->pool, nlen); - bsprintf(rn, "%s.%s", c->proto->name, c->name); - c->in_req = (struct rt_import_request) { - .name = rn, + .name = mb_sprintf(c->proto->pool, "%s.%s", c->proto->name, c->name), .trace_routes = c->debug | c->proto->debug, + .list = proto_work_list(c->proto), .dump_req = channel_dump_import_req, .log_state_change = channel_import_log_state_change, .preimport = channel_preimport, - .rte_modify = c->proto->rte_modify, }; ASSERT(c->channel_state == CS_UP); @@ -462,12 +528,10 @@ channel_start_export(struct channel *c) } ASSERT(c->channel_state == CS_UP); - int nlen = strlen(c->name) + strlen(c->proto->name) + 2; - char *rn = mb_allocz(c->proto->pool, nlen); - bsprintf(rn, "%s.%s", c->proto->name, c->name); c->out_req = (struct rt_export_request) { - .name = rn, + .name = mb_sprintf(c->proto->pool, "%s.%s", c->proto->name, c->name), + .list = proto_work_list(c->proto), .addr = c->out_subprefix, .addr_mode = c->out_subprefix ? TE_ADDR_IN : TE_ADDR_NONE, .trace_routes = c->debug | c->proto->debug, @@ -501,7 +565,7 @@ channel_start_export(struct channel *c) } DBG("%s.%s: Channel start export req=%p\n", c->proto->name, c->name, &c->out_req); - rt_request_export(&c->table->exporter, &c->out_req); + rt_request_export(c->table, &c->out_req); } static void @@ -510,15 +574,15 @@ channel_check_stopped(struct channel *c) switch (c->channel_state) { case CS_STOP: - if (c->out_req.hook || c->in_req.hook) + if (!EMPTY_LIST(c->roa_subscriptions) || c->out_req.hook || c->in_req.hook) return; channel_set_state(c, CS_DOWN); - ev_schedule(c->proto->event); + proto_send_event(c->proto); break; case CS_PAUSE: - if (c->out_req.hook) + if (!EMPTY_LIST(c->roa_subscriptions) || c->out_req.hook) return; channel_set_state(c, CS_START); @@ -535,8 +599,6 @@ channel_import_stopped(struct rt_import_request *req) { struct channel *c = SKIP_BACK(struct channel, in_req, req); - req->hook = NULL; - mb_free(c->in_req.name); c->in_req.name = NULL; @@ -555,7 +617,7 @@ channel_export_stopped(struct rt_export_request *req) { c->refeeding = 1; c->refeed_pending = 0; - rt_request_export(&c->table->exporter, req); + rt_request_export(c->table, req); return; } @@ -599,7 +661,7 @@ channel_schedule_reload(struct channel *c) { ASSERT(c->in_req.hook); - rt_request_export(&c->table->exporter, &c->reload_req); + rt_request_export(c->table, &c->reload_req); } static void @@ -634,6 +696,7 @@ channel_setup_in_table(struct channel *c) { c->reload_req = (struct rt_export_request) { .name = mb_sprintf(c->proto->pool, "%s.%s.import", c->proto->name, c->name), + .list = proto_work_list(c->proto), .trace_routes = c->debug | c->proto->debug, .export_bulk = channel_reload_export_bulk, .dump_req = channel_reload_dump_req, @@ -698,8 +761,6 @@ channel_do_stop(struct channel *c) CALL(c->channel->shutdown, c); - /* This have to be done in here, as channel pool is freed before channel_do_down() */ - c->out_table = NULL; } static void @@ -721,7 +782,7 @@ channel_do_down(struct channel *c) /* Schedule protocol shutddown */ if (proto_is_done(c->proto)) - ev_schedule(c->proto->event); + proto_send_event(c->proto); } void @@ -843,7 +904,7 @@ channel_config_new(const struct channel_class *cc, const char *name, uint net_ty if (proto->net_type && (net_type != proto->net_type)) cf_error("Different channel type"); - tab = new_config->def_tables[net_type]; + tab = rt_get_default_table(new_config, net_type); } if (!cc) @@ -862,6 +923,11 @@ channel_config_new(const struct channel_class *cc, const char *name, uint net_ty cf->debug = new_config->channel_default_debug; cf->rpki_reload = 1; + cf->roa_settle = (struct settle_config) { + .min = 1 S, + .max = 20 S, + }; + add_tail(&proto->channels, &cf->n); return cf; @@ -942,6 +1008,22 @@ channel_reconfigure(struct channel *c, struct channel_config *cf) c->in_req.trace_routes = c->out_req.trace_routes = c->debug | c->proto->debug; c->rpki_reload = cf->rpki_reload; + if ( (c->roa_settle.min != cf->roa_settle.min) + || (c->roa_settle.max != cf->roa_settle.max)) + { + c->roa_settle = cf->roa_settle; + + struct roa_subscription *s; + node *n; + + WALK_LIST2(s, n, c->roa_subscriptions, roa_node) + { + s->settle.cf = cf->roa_settle; + if (settle_active(&s->settle)) + settle_kick(&s->settle, &main_birdloop); + } + } + /* Execute channel-specific reconfigure hook */ if (c->channel->reconfigure && !c->channel->reconfigure(c, cf, &import_changed, &export_changed)) return 0; @@ -1032,34 +1114,48 @@ proto_configure_channel(struct proto *p, struct channel **pc, struct channel_con return 1; } +static void +proto_cleanup(struct proto *p) +{ + rfree(p->pool); + p->pool = NULL; + + p->active = 0; + proto_log_state_change(p); + proto_rethink_goal(p); +} static void -proto_event(void *ptr) +proto_loop_stopped(void *ptr) { struct proto *p = ptr; - if (p->do_start) - { - if_feed_baby(p); - p->do_start = 0; - } + birdloop_enter(&main_birdloop); + + p->loop = &main_birdloop; + proto_cleanup(p); + + birdloop_leave(&main_birdloop); +} + +static void +proto_event(void *ptr) +{ + struct proto *p = ptr; if (p->do_stop) { if (p->proto == &proto_unix_iface) if_flush_ifaces(p); + p->do_stop = 0; } if (proto_is_done(p)) - { - rfree(p->pool); - p->pool = NULL; - - p->active = 0; - proto_log_state_change(p); - proto_rethink_goal(p); - } + if (p->loop != &main_birdloop) + birdloop_stop_self(p->loop, proto_loop_stopped, p); + else + proto_cleanup(p); } @@ -1100,10 +1196,10 @@ proto_init(struct proto_config *c, node *n) struct protocol *pr = c->protocol; struct proto *p = pr->init(c); + p->loop = &main_birdloop; p->proto_state = PS_DOWN; p->last_state_change = current_time(); p->vrf = c->vrf; - p->vrf_set = c->vrf_set; insert_node(&p->n, n); p->event = ev_new_init(proto_pool, proto_event, p); @@ -1116,11 +1212,19 @@ proto_init(struct proto_config *c, node *n) static void proto_start(struct proto *p) { - /* Here we cannot use p->cf->name since it won't survive reconfiguration */ - p->pool = rp_new(proto_pool, p->proto->name); + DBG("Kicking %s up\n", p->name); + PD(p, "Starting"); + + p->pool = rp_newf(proto_pool, "Protocol %s", p->cf->name); if (graceful_restart_state == GRS_INIT) p->gr_recovery = 1; + + if (p->cf->loop_order != DOMAIN_ORDER(the_bird)) + p->loop = birdloop_new(p->pool, p->cf->loop_order, p->pool->name); + + PROTO_LOCKED_FROM_MAIN(p) + proto_notify_state(p, (p->proto->start ? p->proto->start(p) : PS_UP)); } @@ -1156,6 +1260,7 @@ proto_config_new(struct protocol *pr, int class) cf->class = class; cf->debug = new_config->proto_default_debug; cf->mrtdump = new_config->proto_default_mrtdump; + cf->loop_order = DOMAIN_ORDER(the_bird); init_list(&cf->channels); @@ -1271,8 +1376,7 @@ proto_reconfigure(struct proto *p, struct proto_config *oc, struct proto_config if ((nc->protocol != oc->protocol) || (nc->net_type != oc->net_type) || (nc->disabled != p->disabled) || - (nc->vrf != oc->vrf) || - (nc->vrf_set != oc->vrf_set)) + (nc->vrf != oc->vrf)) return 0; p->name = nc->name; @@ -1445,11 +1549,20 @@ protos_commit(struct config *new, struct config *old, int force_reconfig, int ty } static void -proto_rethink_goal(struct proto *p) +proto_shutdown(struct proto *p) { - struct protocol *q; - byte goal; + if (p->proto_state == PS_START || p->proto_state == PS_UP) + { + /* Going down */ + DBG("Kicking %s down\n", p->name); + PD(p, "Shutting down"); + proto_notify_state(p, (p->proto->shutdown ? p->proto->shutdown(p) : PS_DOWN)); + } +} +static void +proto_rethink_goal(struct proto *p) +{ if (p->reconfiguring && !p->active) { struct proto_config *nc = p->cf_new; @@ -1469,32 +1582,12 @@ proto_rethink_goal(struct proto *p) /* Determine what state we want to reach */ if (p->disabled || p->reconfiguring) - goal = PS_DOWN; - else - goal = PS_UP; - - q = p->proto; - if (goal == PS_UP) { - if (!p->active) - { - /* Going up */ - DBG("Kicking %s up\n", p->name); - PD(p, "Starting"); - proto_start(p); - proto_notify_state(p, (q->start ? q->start(p) : PS_UP)); - } - } - else - { - if (p->proto_state == PS_START || p->proto_state == PS_UP) - { - /* Going down */ - DBG("Kicking %s down\n", p->name); - PD(p, "Shutting down"); - proto_notify_state(p, (q->shutdown ? q->shutdown(p) : PS_DOWN)); - } + PROTO_LOCKED_FROM_MAIN(p) + proto_shutdown(p); } + else if (!p->active) + proto_start(p); } struct proto * @@ -1699,7 +1792,7 @@ protos_dump_all(void) #define DPF(x) (p->x ? " " #x : "") debug(" protocol %s (%p) state %s with %d active channels flags: %s%s%s%s%s\n", p->name, p, p_states[p->proto_state], p->active_channels, - DPF(disabled), DPF(active), DPF(do_start), DPF(do_stop), DPF(reconfiguring)); + DPF(disabled), DPF(active), DPF(do_stop), DPF(reconfiguring)); #undef DPF struct channel *c; @@ -1931,24 +2024,32 @@ channel_reset_limit(struct channel *c, struct limit *l, int dir) c->limit_active &= ~(1 << dir); } +static struct rte_owner_class default_rte_owner_class; + static inline void proto_do_start(struct proto *p) { p->active = 1; - p->do_start = 1; - ev_schedule(p->event); + + rt_init_sources(&p->sources, p->name, proto_event_list(p)); + if (!p->sources.class) + p->sources.class = &default_rte_owner_class; + + if (!p->cf->late_if_feed) + if_feed_baby(p); } static void proto_do_up(struct proto *p) { if (!p->main_source) - { p->main_source = rt_get_source(p, 0); - rt_lock_source(p->main_source); - } + // Locked automaticaly proto_start_channels(p); + + if (p->cf->late_if_feed) + if_feed_baby(p); } static inline void @@ -1963,9 +2064,6 @@ proto_do_stop(struct proto *p) p->down_sched = 0; p->gr_recovery = 0; - p->do_stop = 1; - ev_schedule(p->event); - if (p->main_source) { rt_unlock_source(p->main_source); @@ -1973,6 +2071,10 @@ proto_do_stop(struct proto *p) } proto_stop_channels(p); + rt_destroy_sources(&p->sources, p->event); + + p->do_stop = 1; + proto_send_event(p); } static void @@ -1983,7 +2085,7 @@ proto_do_down(struct proto *p) /* Shutdown is finished in the protocol event */ if (proto_is_done(p)) - ev_schedule(p->event); + proto_send_event(p); } @@ -2193,8 +2295,8 @@ proto_cmd_show(struct proto *p, uintptr_t verbose, int cnt) cli_msg(-1006, " Message: %s", p->message); if (p->cf->router_id) cli_msg(-1006, " Router ID: %R", p->cf->router_id); - if (p->vrf_set) - cli_msg(-1006, " VRF: %s", p->vrf ? p->vrf->name : "default"); + if (p->vrf) + cli_msg(-1006, " VRF: %s", p->vrf->name); if (p->proto->show_proto_info) p->proto->show_proto_info(p); @@ -2222,7 +2324,7 @@ proto_cmd_disable(struct proto *p, uintptr_t arg, int cnt UNUSED) p->disabled = 1; p->down_code = PDC_CMD_DISABLE; proto_set_message(p, (char *) arg, -1); - proto_rethink_goal(p); + proto_shutdown(p); cli_msg(-9, "%s: disabled", p->name); } @@ -2255,9 +2357,9 @@ proto_cmd_restart(struct proto *p, uintptr_t arg, int cnt UNUSED) p->disabled = 1; p->down_code = PDC_CMD_RESTART; proto_set_message(p, (char *) arg, -1); - proto_rethink_goal(p); + proto_shutdown(p); p->disabled = 0; - proto_rethink_goal(p); + /* After the protocol shuts down, proto_rethink_goal() is run from proto_event. */ cli_msg(-12, "%s: restarted", p->name); } @@ -2332,7 +2434,9 @@ proto_apply_cmd_symbol(const struct symbol *s, void (* cmd)(struct proto *, uint if (s->proto->proto) { - cmd(s->proto->proto, arg, 0); + struct proto *p = s->proto->proto; + PROTO_LOCKED_FROM_MAIN(p) + cmd(p, arg, 0); cli_msg(0, ""); } else @@ -2347,7 +2451,8 @@ proto_apply_cmd_patt(const char *patt, void (* cmd)(struct proto *, uintptr_t, i WALK_LIST(p, proto_list) if (!patt || patmatch(patt, p->name)) - cmd(p, arg, cnt++); + PROTO_LOCKED_FROM_MAIN(p) + cmd(p, arg, cnt++); if (!cnt) cli_msg(8003, "No protocols match"); diff --git a/nest/protocol.h b/nest/protocol.h index 3ccd364a..88bff092 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -12,13 +12,13 @@ #include "lib/lists.h" #include "lib/resource.h" #include "lib/event.h" +#include "lib/settle.h" #include "nest/rt.h" #include "nest/limit.h" #include "conf/conf.h" struct iface; struct ifa; -struct rtable; struct rte; struct neighbor; struct rta; @@ -60,7 +60,6 @@ struct protocol { int (*start)(struct proto *); /* Start the instance */ int (*shutdown)(struct proto *); /* Stop the instance */ void (*get_status)(struct proto *, byte *buf); /* Get instance status (for `show protocols' command) */ - void (*get_route_info)(struct rte *, byte *buf); /* Get route information (for `show route' command) */ // int (*get_attr)(const struct eattr *, byte *buf, int buflen); /* ASCIIfy dynamic attribute (returns GA_*) */ void (*show_proto_info)(struct proto *); /* Show protocol info (for `show protocols all' command) */ void (*copy_config)(struct proto_config *, struct proto_config *); /* Copy config from given protocol instance */ @@ -101,9 +100,10 @@ struct proto_config { int class; /* SYM_PROTO or SYM_TEMPLATE */ u8 net_type; /* Protocol network type (NET_*), 0 for undefined */ u8 disabled; /* Protocol enabled/disabled by default */ - u8 vrf_set; /* Related VRF instance (below) is defined */ + u8 late_if_feed; /* Delay interface feed after channels are up */ u32 debug, mrtdump; /* Debugging bitfields, both use D_* constants */ u32 router_id; /* Protocol specific router ID */ + uint loop_order; /* Launch a birdloop on this locking level; use DOMAIN_ORDER(the_bird) for mainloop */ list channels; /* List of channel configs (struct channel_config) */ struct iface *vrf; /* Related VRF instance, NULL if global */ @@ -121,22 +121,23 @@ struct proto { struct proto_config *cf_new; /* Configuration we want to switch to after shutdown (NULL=delete) */ pool *pool; /* Pool containing local objects */ event *event; /* Protocol event */ + struct birdloop *loop; /* BIRDloop running this protocol */ list channels; /* List of channels to rtables (struct channel) */ struct channel *main_channel; /* Primary channel */ struct rte_src *main_source; /* Primary route source */ + struct rte_owner sources; /* Route source owner structure */ struct iface *vrf; /* Related VRF instance, NULL if global */ const char *name; /* Name of this instance (== cf->name) */ u32 debug; /* Debugging flags */ u32 mrtdump; /* MRTDump flags */ uint active_channels; /* Number of active channels */ + uint active_loops; /* Number of active IO loops */ byte net_type; /* Protocol network type (NET_*), 0 for undefined */ byte disabled; /* Manually disabled */ - byte vrf_set; /* Related VRF instance (above) is defined */ byte proto_state; /* Protocol state machine (PS_*, see below) */ byte active; /* From PS_START to cleanup after PS_STOP */ - byte do_start; /* Start actions are scheduled */ byte do_stop; /* Stop actions are scheduled */ byte reconfiguring; /* We're shutting down due to reconfiguration */ byte gr_recovery; /* Protocol should participate in graceful restart recovery */ @@ -186,10 +187,9 @@ struct proto { * rte_remove Called whenever a rte is removed from the routing table. */ - int (*rte_recalculate)(struct rtable *, struct network *, struct rte *, struct rte *, struct rte *); + int (*rte_recalculate)(struct rtable_private *, struct network *, struct rte *, struct rte *, struct rte *); int (*rte_better)(struct rte *, struct rte *); int (*rte_mergable)(struct rte *, struct rte *); - struct rte *(*rte_modify)(struct rte *, struct linpool *); void (*rte_insert)(struct network *, struct rte *); void (*rte_remove)(struct network *, struct rte *); u32 (*rte_igp_metric)(const struct rte *); @@ -339,6 +339,8 @@ void proto_notify_state(struct proto *p, unsigned state); * as a result of received ROUTE-REFRESH request). */ +static inline int proto_is_inactive(struct proto *p) +{ return (p->active_channels == 0) && (p->active_loops == 0) && (p->sources.uc == 0); } /* @@ -459,6 +461,8 @@ struct channel_config { struct channel_limit in_limit; /* Limit for importing routes from protocol */ struct channel_limit out_limit; /* Limit for exporting routes to protocol */ + struct settle_config roa_settle; /* Settle times for ROA-induced reload */ + u8 net_type; /* Routing table network type (NET_*), 0 for undefined */ u8 ra_mode; /* Mode of received route advertisements (RA_*) */ u16 preference; /* Default route preference */ @@ -475,7 +479,7 @@ struct channel { const struct channel_class *channel; struct proto *proto; - struct rtable *table; + rtable *table; const struct filter *in_filter; /* Input filter */ const struct filter *out_filter; /* Output filter */ const net_addr *out_subprefix; /* Export only subprefixes of this net */ @@ -486,6 +490,8 @@ struct channel { struct limit in_limit; /* Input limit */ struct limit out_limit; /* Output limit */ + struct settle_config roa_settle; /* Settle times for ROA-induced reload */ + u8 limit_actions[PLD_MAX]; /* Limit actions enum */ u8 limit_active; /* Flags for active limits */ @@ -539,7 +545,7 @@ struct channel { struct rt_exporter *out_table; /* Internal table for exported routes */ - list roa_subscriptions; /* List of active ROA table subscriptions based on filters roa_check() */ + list roa_subscriptions; /* List of active ROA table subscriptions based on filters' roa_check() calls */ }; #define RIK_REJECTED 1 /* Routes rejected in import filter are kept */ @@ -603,7 +609,7 @@ struct channel_config *proto_cf_find_channel(struct proto_config *p, uint net_ty static inline struct channel_config *proto_cf_main_channel(struct proto_config *pc) { return proto_cf_find_channel(pc, pc->net_type); } -struct channel *proto_find_channel_by_table(struct proto *p, struct rtable *t); +struct channel *proto_find_channel_by_table(struct proto *p, rtable *t); struct channel *proto_find_channel_by_name(struct proto *p, const char *n); struct channel *proto_add_channel(struct proto *p, struct channel_config *cf); int proto_configure_channel(struct proto *p, struct channel **c, struct channel_config *cf); diff --git a/nest/rt-attr.c b/nest/rt-attr.c index b31bc5cc..471209ee 100644 --- a/nest/rt-attr.c +++ b/nest/rt-attr.c @@ -178,6 +178,8 @@ const char * flowspec_valid_names[FLOWSPEC__MAX] = { [FLOWSPEC_INVALID] = "invalid", }; +DOMAIN(attrs) attrs_domain; + pool *rta_pool; static slab *rte_src_slab; @@ -187,16 +189,14 @@ static struct idm src_ids; /* rte source hash */ -#define RSH_KEY(n) n->proto, n->private_id +#define RSH_KEY(n) n->private_id #define RSH_NEXT(n) n->next -#define RSH_EQ(p1,n1,p2,n2) p1 == p2 && n1 == n2 -#define RSH_FN(p,n) p->hash_key ^ u32_hash(n) +#define RSH_EQ(n1,n2) n1 == n2 +#define RSH_FN(n) u32_hash(n) #define RSH_REHASH rte_src_rehash #define RSH_PARAMS /2, *2, 1, 1, 8, 20 -#define RSH_INIT_ORDER 6 - -static HASH(struct rte_src) src_hash; +#define RSH_INIT_ORDER 2 static struct rte_src **rte_src_global; static uint rte_src_global_max = SRC_ID_INIT_SIZE; @@ -207,34 +207,44 @@ rte_src_init(void) rte_src_global = mb_allocz(rta_pool, sizeof(struct rte_src *) * rte_src_global_max); idm_init(&src_ids, rta_pool, SRC_ID_INIT_SIZE); - - HASH_INIT(src_hash, rta_pool, RSH_INIT_ORDER); } - HASH_DEFINE_REHASH_FN(RSH, struct rte_src) -struct rte_src * -rt_find_source(struct proto *p, u32 id) +static struct rte_src * +rt_find_source(struct rte_owner *p, u32 id) { - return HASH_FIND(src_hash, RSH, p, id); + return HASH_FIND(p->hash, RSH, id); } struct rte_src * -rt_get_source(struct proto *p, u32 id) +rt_get_source_o(struct rte_owner *p, u32 id) { + if (p->stop) + bug("Stopping route owner asked for another source."); + struct rte_src *src = rt_find_source(p, id); if (src) + { + UNUSED u64 uc = atomic_fetch_add_explicit(&src->uc, 1, memory_order_acq_rel); return src; + } + RTA_LOCK; src = sl_allocz(rte_src_slab); - src->proto = p; + src->owner = p; src->private_id = id; src->global_id = idm_alloc(&src_ids); - src->uc = 0; - HASH_INSERT2(src_hash, RSH, rta_pool, src); + atomic_store_explicit(&src->uc, 1, memory_order_release); + p->uc++; + + HASH_INSERT2(p->hash, RSH, rta_pool, src); + if (config->table_debug) + log(L_TRACE "Allocated new rte_src for %s, ID %uL %uG, have %u sources now", + p->name, src->private_id, src->global_id, p->uc); + if (src->global_id >= rte_src_global_max) { rte_src_global = mb_realloc(rte_src_global, sizeof(struct rte_src *) * (rte_src_global_max *= 2)); @@ -243,6 +253,7 @@ rt_get_source(struct proto *p, u32 id) } rte_src_global[src->global_id] = src; + RTA_UNLOCK; return src; } @@ -256,23 +267,89 @@ rt_find_source_global(u32 id) return rte_src_global[id]; } +static inline void +rt_done_sources(struct rte_owner *o) +{ + ev_send(o->list, o->stop); +} + void -rt_prune_sources(void) +rt_prune_sources(void *data) { - HASH_WALK_FILTER(src_hash, next, src, sp) + struct rte_owner *o = data; + + HASH_WALK_FILTER(o->hash, next, src, sp) { - if (src->uc == 0) + u64 uc; + while ((uc = atomic_load_explicit(&src->uc, memory_order_acquire)) >> RTE_SRC_PU_SHIFT) + synchronize_rcu(); + + if (uc == 0) { - HASH_DO_REMOVE(src_hash, RSH, sp); + o->uc--; + + HASH_DO_REMOVE(o->hash, RSH, sp); + + RTA_LOCK; + rte_src_global[src->global_id] = NULL; idm_free(&src_ids, src->global_id); sl_free(src); + RTA_UNLOCK; } } HASH_WALK_FILTER_END; - HASH_MAY_RESIZE_DOWN(src_hash, RSH, rta_pool); + RTA_LOCK; + HASH_MAY_RESIZE_DOWN(o->hash, RSH, rta_pool); + + if (o->stop && !o->uc) + { + rfree(o->prune); + RTA_UNLOCK; + + if (config->table_debug) + log(L_TRACE "All rte_src's for %s pruned, scheduling stop event", o->name); + + rt_done_sources(o); + } + else + RTA_UNLOCK; } +void +rt_init_sources(struct rte_owner *o, const char *name, event_list *list) +{ + RTA_LOCK; + HASH_INIT(o->hash, rta_pool, RSH_INIT_ORDER); + o->hash_key = random_u32(); + o->uc = 0; + o->name = name; + o->prune = ev_new_init(rta_pool, rt_prune_sources, o); + o->stop = NULL; + o->list = list; + RTA_UNLOCK; +} + +void +rt_destroy_sources(struct rte_owner *o, event *done) +{ + o->stop = done; + + if (!o->uc) + { + if (config->table_debug) + log(L_TRACE "Source owner %s destroy requested. All rte_src's already pruned, scheduling stop event", o->name); + + RTA_LOCK; + rfree(o->prune); + RTA_UNLOCK; + + rt_done_sources(o); + } + else + if (config->table_debug) + log(L_TRACE "Source owner %s destroy requested. Remaining %u rte_src's to prune.", o->name, o->uc); +} /* * Multipath Next Hop @@ -966,6 +1043,8 @@ ea_list_ref(ea_list *l) } } +static void ea_free_nested(ea_list *l); + static void ea_list_unref(ea_list *l) { @@ -986,7 +1065,7 @@ ea_list_unref(ea_list *l) } if (l->next) - ea_free(l->next); + ea_free_nested(l->next); } void @@ -1395,9 +1474,15 @@ ea_lookup(ea_list *o, int overlay) o = ea_normalize(o, overlay); h = ea_hash(o); + RTA_LOCK; + for(r=rta_hash_table[h & rta_cache_mask]; r; r=r->next_hash) if (r->hash_key == h && ea_same(r->l, o)) - return ea_clone(r->l); + { + atomic_fetch_add_explicit(&r->uc, 1, memory_order_acq_rel); + RTA_UNLOCK; + return r->l; + } uint elen = ea_list_size(o); r = mb_alloc(rta_pool, elen + sizeof(struct ea_storage)); @@ -1413,12 +1498,17 @@ ea_lookup(ea_list *o, int overlay) if (++rta_cache_count > rta_cache_limit) rta_rehash(); + RTA_UNLOCK; return r->l; } -void -ea__free(struct ea_storage *a) +static void +ea_free_locked(struct ea_storage *a) { + /* Somebody has cloned this rta inbetween. This sometimes happens. */ + if (atomic_load_explicit(&a->uc, memory_order_acquire)) + return; + ASSERT(rta_cache_count); rta_cache_count--; *a->pprev_hash = a->next_hash; @@ -1429,6 +1519,22 @@ ea__free(struct ea_storage *a) mb_free(a); } +static void +ea_free_nested(struct ea_list *l) +{ + struct ea_storage *r = ea_get_storage(l); + if (1 == atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel)) + ea_free_locked(r); +} + +void +ea__free(struct ea_storage *a) +{ + RTA_LOCK; + ea_free_locked(a); + RTA_UNLOCK; +} + /** * rta_dump_all - dump attribute cache * @@ -1438,6 +1544,8 @@ ea__free(struct ea_storage *a) void ea_dump_all(void) { + RTA_LOCK; + debug("Route attribute cache (%d entries, rehash at %d):\n", rta_cache_count, rta_cache_limit); for (uint h=0; h < rta_cache_size; h++) for (struct ea_storage *a = rta_hash_table[h]; a; a = a->next_hash) @@ -1447,6 +1555,8 @@ ea_dump_all(void) debug("\n"); } debug("\n"); + + RTA_UNLOCK; } void @@ -1466,6 +1576,8 @@ ea_show_list(struct cli *c, ea_list *eal) void rta_init(void) { + attrs_domain = DOMAIN_NEW(attrs, "Attributes"); + rta_pool = rp_new(&root_pool, "Attributes"); rta_alloc_hash(); diff --git a/nest/rt-dev.c b/nest/rt-dev.c index 7f45985f..4199e17c 100644 --- a/nest/rt-dev.c +++ b/nest/rt-dev.c @@ -68,6 +68,7 @@ dev_ifa_notify(struct proto *P, uint flags, struct ifa *ad) /* Use iface ID as local source ID */ struct rte_src *src = rt_get_source(P, ad->iface->index); rte_update(c, net, NULL, src); + rt_unlock_source(src); } else if (flags & IF_CHANGE_UP) { @@ -95,6 +96,7 @@ dev_ifa_notify(struct proto *P, uint flags, struct ifa *ad) }; rte_update(c, net, &e0, src); + rt_unlock_source(src); } } diff --git a/nest/rt-show.c b/nest/rt-show.c index c3294518..dc88047a 100644 --- a/nest/rt-show.c +++ b/nest/rt-show.c @@ -58,7 +58,7 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, int primary if (d->verbose && !rta_is_cached(a) && a) a = ea_normalize(a, 0); - get_route_info = e->src->proto->proto->get_route_info; + get_route_info = e->src->owner->class ? e->src->owner->class->get_route_info : NULL; if (get_route_info) get_route_info(e, info); else @@ -74,10 +74,14 @@ rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, int primary cli_printf(c, -1007, "%-20s %s [%s %s%s]%s%s", ia, net_is_flow(e->net) ? flowspec_valid_name(flowspec_valid) : had ? "recursive" : rta_dest_name(dest), - e->src->proto->name, tm, from, primary ? (sync_error ? " !" : " *") : "", info); + e->src->owner->name, tm, from, primary ? (sync_error ? " !" : " *") : "", info); if (d->verbose) + { ea_show_list(c, a); + cli_printf(c, -1008, "\tInternal route handling values: %uL %uG %uS id %u", + e->src->private_id, e->src->global_id, e->stale_cycle, e->id); + } else if (dest == RTD_UNICAST) ea_show_nexthop_list(c, nhad); else if (had) @@ -174,7 +178,7 @@ rt_show_net(struct rt_show_data *d, const net_addr *n, rte **feed, uint count) } } - if (d->show_protocol && (d->show_protocol != e.src->proto)) + if (d->show_protocol && (&d->show_protocol->sources != e.src->owner)) goto skip; if (f_run(d->filter, &e, 0) > F_ACCEPT) @@ -256,6 +260,18 @@ rt_show_dump_req(struct rt_export_request *req) } static void +rt_show_done(struct rt_show_data *d) +{ + /* No more action */ + d->cli->cleanup = NULL; + d->cli->cont = NULL; + d->cli->rover = NULL; + + /* Write pending messages */ + cli_write_trigger(d->cli); +} + +static void rt_show_cont(struct rt_show_data *d) { struct cli *c = d->cli; @@ -263,18 +279,13 @@ rt_show_cont(struct rt_show_data *d) if (d->running_on_config && (d->running_on_config != config)) { cli_printf(c, 8004, "Stopped due to reconfiguration"); - - /* No more action */ - c->cleanup = NULL; - c->cont = NULL; - c->rover = NULL; - cli_write_trigger(c); - return; + return rt_show_done(d); } d->req = (struct rt_export_request) { .addr = d->addr, .name = "CLI Show Route", + .list = &global_work_list, .export_bulk = rt_show_net_export_bulk, .dump_req = rt_show_dump_req, .log_state_change = rt_show_log_state_change, @@ -290,7 +301,7 @@ rt_show_cont(struct rt_show_data *d) if (d->tables_defined_by & RSD_TDB_SET) rt_show_table(d); - rt_request_export(d->tab->table, &d->req); + rt_request_export_other(d->tab->table, &d->req); } static void @@ -316,7 +327,6 @@ rt_show_export_stopped(struct rt_export_request *req) if (NODE_VALID(d->tab)) return rt_show_cont(d); - /* Printout total stats */ if (d->stats && (d->table_counter > 1)) { @@ -329,7 +339,8 @@ rt_show_export_stopped(struct rt_export_request *req) else cli_printf(d->cli, 0, ""); - cli_write_trigger(d->cli); + /* No more route showing */ + rt_show_done(d); } struct rt_show_data_rtable * @@ -343,9 +354,11 @@ rt_show_add_exporter(struct rt_show_data *d, struct rt_exporter *t, const char * } struct rt_show_data_rtable * -rt_show_add_table(struct rt_show_data *d, struct rtable *t) +rt_show_add_table(struct rt_show_data *d, rtable *t) { - struct rt_show_data_rtable *rsdr = rt_show_add_exporter(d, &t->exporter, t->name); + struct rt_show_data_rtable *rsdr; + RT_LOCKED(t, tp) + rsdr = rt_show_add_exporter(d, &tp->exporter.e, t->name); struct proto_config *krt = t->config->krt_attached; if (krt) @@ -389,8 +402,8 @@ rt_show_get_default_tables(struct rt_show_data *d) } for (int i=1; i<NET_MAX; i++) - if (config->def_tables[i] && config->def_tables[i]->table) - rt_show_add_table(d, config->def_tables[i]->table); + if (config->def_tables[i] && config->def_tables[i]->table && config->def_tables[i]->table->table) + rt_show_add_table(d, config->def_tables[i]->table->table); } static inline void @@ -407,12 +420,13 @@ rt_show_prepare_tables(struct rt_show_data *d) /* Ensure there is defined export_channel for each table */ if (d->export_mode) { + rtable *rt = SKIP_BACK(rtable, priv.exporter.e, tab->table); if (!tab->export_channel && d->export_channel && - (tab->table == &d->export_channel->table->exporter)) + (rt == d->export_channel->table)) tab->export_channel = d->export_channel; if (!tab->export_channel && d->export_protocol) - tab->export_channel = proto_find_channel_by_table(d->export_protocol, SKIP_BACK(rtable, exporter, tab->table)); + tab->export_channel = proto_find_channel_by_table(d->export_protocol, rt); if (!tab->export_channel) { diff --git a/nest/rt-table.c b/nest/rt-table.c index 29690414..36d69d92 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -43,10 +43,10 @@ * all prefixes that may influence resolving of tracked next hops. * * When a best route changes in the src table, the hostcache is notified using - * rt_notify_hostcache(), which immediately checks using the trie whether the + * an auxiliary export request, which checks using the trie whether the * change is relevant and if it is, then it schedules asynchronous hostcache * recomputation. The recomputation is done by rt_update_hostcache() (called - * from rt_event() of src table), it walks through all hostentries and resolves + * as an event of src table), it walks through all hostentries and resolves * them (by rt_update_hostentry()). It also updates the trie. If a change in * hostentry resolution was found, then it schedules asynchronous nexthop * recomputation of associated dst table. That is done by rt_next_hop_update() @@ -60,15 +60,14 @@ * routes depends of resolving their network prefixes in IP routing tables. This * is similar to the recursive next hop mechanism, but simpler as there are no * intermediate hostcache and hostentries (because flows are less likely to - * share common net prefix than routes sharing a common next hop). In src table, - * there is a list of dst tables (list flowspec_links), this list is updated by - * flowpsec channels (by rt_flowspec_link() and rt_flowspec_unlink() during - * channel start/stop). Each dst table has its own trie of prefixes that may - * influence validation of flowspec routes in it (flowspec_trie). + * share common net prefix than routes sharing a common next hop). Every dst + * table has its own export request in every src table. Each dst table has its + * own trie of prefixes that may influence validation of flowspec routes in it + * (flowspec_trie). * - * When a best route changes in the src table, rt_flowspec_notify() immediately - * checks all dst tables from the list using their tries to see whether the - * change is relevant for them. If it is, then an asynchronous re-validation of + * When a best route changes in the src table, the notification mechanism is + * invoked by the export request which checks its dst table's trie to see + * whether the change is relevant, and if so, an asynchronous re-validation of * flowspec routes in the dst table is scheduled. That is also done by function * rt_next_hop_update(), like nexthop recomputation above. It iterates over all * flowspec routes and re-validates them. It also recalculates the trie. @@ -83,9 +82,8 @@ * will be re-validated later in this round anyway. * * The third mechanism is used for RPKI re-validation of IP routes and it is the - * simplest. It is just a list of subscribers in src table, who are notified - * when any change happened, but only after a settle time. Also, in RPKI case - * the dst is not a table, but a channel, who refeeds routes through a filter. + * simplest. It is also an auxiliary export request belonging to the + * appropriate channel, triggering its reload/refeed timer after a settle time. */ #undef LOCAL_DEBUG @@ -105,32 +103,53 @@ #include "lib/string.h" #include "lib/alloca.h" #include "lib/flowspec.h" +#include "lib/idm.h" #ifdef CONFIG_BGP #include "proto/bgp/bgp.h" #endif -pool *rt_table_pool; +#include <stdatomic.h> -static linpool *rte_update_pool; +pool *rt_table_pool; list routing_tables; list deleted_routing_tables; -static void rt_free_hostcache(rtable *tab); -static void rt_notify_hostcache(rtable *tab, net *net); -static void rt_update_hostcache(rtable *tab); -static void rt_next_hop_update(rtable *tab); +struct rt_cork rt_cork; + +/* Data structures for export journal */ +#define RT_PENDING_EXPORT_ITEMS (page_size - sizeof(struct rt_export_block)) / sizeof(struct rt_pending_export) + +struct rt_export_block { + node n; + _Atomic u32 end; + _Atomic _Bool not_last; + struct rt_pending_export export[]; +}; + +static void rt_free_hostcache(struct rtable_private *tab); +static void rt_update_hostcache(void *tab); +static void rt_next_hop_update(struct rtable_private *tab); +static void rt_nhu_uncork(void *_tab); static inline void rt_next_hop_resolve_rte(rte *r); static inline void rt_flowspec_resolve_rte(rte *r, struct channel *c); -static inline void rt_prune_table(rtable *tab); -static inline void rt_schedule_notify(rtable *tab); -static void rt_flowspec_notify(rtable *tab, net *net); +static inline void rt_prune_table(struct rtable_private *tab); +static void rt_kick_prune_timer(struct rtable_private *tab); static void rt_feed_by_fib(void *); static void rt_feed_by_trie(void *); static void rt_feed_equal(void *); static void rt_feed_for(void *); -static uint rt_feed_net(struct rt_export_hook *c, net *n); +static void rt_check_cork_low(struct rtable_private *tab); +static void rt_check_cork_high(struct rtable_private *tab); +static void rt_cork_release_hook(void *); +static void rt_shutdown(void *); +static void rt_delete(void *); + +static void rt_export_used(struct rt_table_exporter *, const char *, const char *); +static void rt_export_cleanup(struct rtable_private *tab); + +static int rte_same(rte *x, rte *y); const char *rt_import_state_name_array[TIS_MAX] = { [TIS_DOWN] = "DOWN", @@ -143,6 +162,7 @@ const char *rt_import_state_name_array[TIS_MAX] = { const char *rt_export_state_name_array[TES_MAX] = { [TES_DOWN] = "DOWN", + [TES_HUNGRY] = "HUNGRY", [TES_FEEDING] = "FEEDING", [TES_READY] = "READY", [TES_STOP] = "STOP" @@ -164,13 +184,22 @@ const char *rt_export_state_name(u8 state) return rt_export_state_name_array[state]; } -static inline struct rte_storage *rt_next_hop_update_rte(rtable *tab, net *n, rte *old); -static struct hostentry *rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep); +static struct hostentry *rt_get_hostentry(struct rtable_private *tab, ip_addr a, ip_addr ll, rtable *dep); + +static inline rtable *rt_priv_to_pub(struct rtable_private *tab) { return RT_PUB(tab); } +static inline rtable *rt_pub_to_pub(rtable *tab) { return tab; } +#define RT_ANY_TO_PUB(tab) _Generic((tab),rtable*:rt_pub_to_pub,struct rtable_private*:rt_priv_to_pub)((tab)) + +#define rt_trace(tab, level, fmt, args...) do {\ + rtable *t = RT_ANY_TO_PUB((tab)); \ + if (t->config->debug & (level)) \ + log(L_TRACE "%s: " fmt, t->name, ##args); \ +} while (0) static void net_init_with_trie(struct fib *f, void *N) { - rtable *tab = SKIP_BACK(rtable, fib, f); + struct rtable_private *tab = SKIP_BACK(struct rtable_private, fib, f); net *n = N; if (tab->trie) @@ -181,7 +210,7 @@ net_init_with_trie(struct fib *f, void *N) } static inline net * -net_route_ip4_trie(rtable *t, const net_addr_ip4 *n0) +net_route_ip4_trie(struct rtable_private *t, const net_addr_ip4 *n0) { TRIE_WALK_TO_ROOT_IP4(t->trie, n0, n) { @@ -195,7 +224,7 @@ net_route_ip4_trie(rtable *t, const net_addr_ip4 *n0) } static inline net * -net_route_vpn4_trie(rtable *t, const net_addr_vpn4 *n0) +net_route_vpn4_trie(struct rtable_private *t, const net_addr_vpn4 *n0) { TRIE_WALK_TO_ROOT_IP4(t->trie, (const net_addr_ip4 *) n0, px) { @@ -211,7 +240,7 @@ net_route_vpn4_trie(rtable *t, const net_addr_vpn4 *n0) } static inline net * -net_route_ip6_trie(rtable *t, const net_addr_ip6 *n0) +net_route_ip6_trie(struct rtable_private *t, const net_addr_ip6 *n0) { TRIE_WALK_TO_ROOT_IP6(t->trie, n0, n) { @@ -225,7 +254,7 @@ net_route_ip6_trie(rtable *t, const net_addr_ip6 *n0) } static inline net * -net_route_vpn6_trie(rtable *t, const net_addr_vpn6 *n0) +net_route_vpn6_trie(struct rtable_private *t, const net_addr_vpn6 *n0) { TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px) { @@ -241,7 +270,7 @@ net_route_vpn6_trie(rtable *t, const net_addr_vpn6 *n0) } static inline void * -net_route_ip6_sadr_trie(rtable *t, const net_addr_ip6_sadr *n0) +net_route_ip6_sadr_trie(struct rtable_private *t, const net_addr_ip6_sadr *n0) { TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px) { @@ -274,7 +303,7 @@ net_route_ip6_sadr_trie(rtable *t, const net_addr_ip6_sadr *n0) } static inline net * -net_route_ip4_fib(rtable *t, const net_addr_ip4 *n0) +net_route_ip4_fib(struct rtable_private *t, const net_addr_ip4 *n0) { net_addr_ip4 n; net_copy_ip4(&n, n0); @@ -290,7 +319,7 @@ net_route_ip4_fib(rtable *t, const net_addr_ip4 *n0) } static inline net * -net_route_vpn4_fib(rtable *t, const net_addr_vpn4 *n0) +net_route_vpn4_fib(struct rtable_private *t, const net_addr_vpn4 *n0) { net_addr_vpn4 n; net_copy_vpn4(&n, n0); @@ -306,7 +335,7 @@ net_route_vpn4_fib(rtable *t, const net_addr_vpn4 *n0) } static inline net * -net_route_ip6_fib(rtable *t, const net_addr_ip6 *n0) +net_route_ip6_fib(struct rtable_private *t, const net_addr_ip6 *n0) { net_addr_ip6 n; net_copy_ip6(&n, n0); @@ -322,7 +351,7 @@ net_route_ip6_fib(rtable *t, const net_addr_ip6 *n0) } static inline net * -net_route_vpn6_fib(rtable *t, const net_addr_vpn6 *n0) +net_route_vpn6_fib(struct rtable_private *t, const net_addr_vpn6 *n0) { net_addr_vpn6 n; net_copy_vpn6(&n, n0); @@ -338,7 +367,7 @@ net_route_vpn6_fib(rtable *t, const net_addr_vpn6 *n0) } static inline void * -net_route_ip6_sadr_fib(rtable *t, const net_addr_ip6_sadr *n0) +net_route_ip6_sadr_fib(struct rtable_private *t, const net_addr_ip6_sadr *n0) { net_addr_ip6_sadr n; net_copy_ip6_sadr(&n, n0); @@ -378,7 +407,7 @@ net_route_ip6_sadr_fib(rtable *t, const net_addr_ip6_sadr *n0) } net * -net_route(rtable *tab, const net_addr *n) +net_route(struct rtable_private *tab, const net_addr *n) { ASSERT(tab->addr_type == n->type); @@ -421,7 +450,7 @@ net_route(rtable *tab, const net_addr *n) static int -net_roa_check_ip4_trie(rtable *tab, const net_addr_ip4 *px, u32 asn) +net_roa_check_ip4_trie(struct rtable_private *tab, const net_addr_ip4 *px, u32 asn) { int anything = 0; @@ -449,7 +478,7 @@ net_roa_check_ip4_trie(rtable *tab, const net_addr_ip4 *px, u32 asn) } static int -net_roa_check_ip4_fib(rtable *tab, const net_addr_ip4 *px, u32 asn) +net_roa_check_ip4_fib(struct rtable_private *tab, const net_addr_ip4 *px, u32 asn) { struct net_addr_roa4 n = NET_ADDR_ROA4(px->prefix, px->pxlen, 0, 0); struct fib_node *fn; @@ -481,7 +510,7 @@ net_roa_check_ip4_fib(rtable *tab, const net_addr_ip4 *px, u32 asn) } static int -net_roa_check_ip6_trie(rtable *tab, const net_addr_ip6 *px, u32 asn) +net_roa_check_ip6_trie(struct rtable_private *tab, const net_addr_ip6 *px, u32 asn) { int anything = 0; @@ -509,7 +538,7 @@ net_roa_check_ip6_trie(rtable *tab, const net_addr_ip6 *px, u32 asn) } static int -net_roa_check_ip6_fib(rtable *tab, const net_addr_ip6 *px, u32 asn) +net_roa_check_ip6_fib(struct rtable_private *tab, const net_addr_ip6 *px, u32 asn) { struct net_addr_roa6 n = NET_ADDR_ROA6(px->prefix, px->pxlen, 0, 0); struct fib_node *fn; @@ -556,24 +585,30 @@ net_roa_check_ip6_fib(rtable *tab, const net_addr_ip6 *px, u32 asn) * must have type NET_IP4 or NET_IP6, respectively. */ int -net_roa_check(rtable *tab, const net_addr *n, u32 asn) +net_roa_check(rtable *tp, const net_addr *n, u32 asn) { - if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4)) - { - if (tab->trie) - return net_roa_check_ip4_trie(tab, (const net_addr_ip4 *) n, asn); - else - return net_roa_check_ip4_fib (tab, (const net_addr_ip4 *) n, asn); - } - else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6)) + int out = ROA_UNKNOWN; + + RT_LOCKED(tp, tab) { - if (tab->trie) - return net_roa_check_ip6_trie(tab, (const net_addr_ip6 *) n, asn); + if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4)) + { + if (tab->trie) + out = net_roa_check_ip4_trie(tab, (const net_addr_ip4 *) n, asn); + else + out = net_roa_check_ip4_fib (tab, (const net_addr_ip4 *) n, asn); + } + else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6)) + { + if (tab->trie) + out = net_roa_check_ip6_trie(tab, (const net_addr_ip6 *) n, asn); + else + out = net_roa_check_ip6_fib (tab, (const net_addr_ip6 *) n, asn); + } else - return net_roa_check_ip6_fib (tab, (const net_addr_ip6 *) n, asn); + out = ROA_UNKNOWN; /* Should not happen */ } - else - return ROA_UNKNOWN; /* Should not happen */ + return out; } /** @@ -597,7 +632,7 @@ rte_find(net *net, struct rte_src *src) struct rte_storage * -rte_store(const rte *r, net *net, rtable *tab) +rte_store(const rte *r, net *net, struct rtable_private *tab) { struct rte_storage *e = sl_alloc(tab->rte_slab); @@ -647,16 +682,16 @@ rte_better(rte *new, rte *old) return 1; if (np < op) return 0; - if (new->src->proto->proto != old->src->proto->proto) + if (new->src->owner->class != old->src->owner->class) { /* * If the user has configured protocol preferences, so that two different protocols * have the same preference, try to break the tie by comparing addresses. Not too * useful, but keeps the ordering of routes unambiguous. */ - return new->src->proto->proto > old->src->proto->proto; + return new->src->owner->class > old->src->owner->class; } - if (better = new->src->proto->rte_better) + if (better = new->src->owner->class->rte_better) return better(new, old); return 0; } @@ -672,10 +707,10 @@ rte_mergable(rte *pri, rte *sec) if (rt_get_preference(pri) != rt_get_preference(sec)) return 0; - if (pri->src->proto->proto != sec->src->proto->proto) + if (pri->src->owner->class != sec->src->owner->class) return 0; - if (mergable = pri->src->proto->rte_mergable) + if (mergable = pri->src->owner->class->rte_mergable) return mergable(pri, sec); return 0; @@ -684,8 +719,9 @@ rte_mergable(rte *pri, rte *sec) static void rte_trace(const char *name, const rte *e, int dir, const char *msg) { - log(L_TRACE "%s %c %s %N %uL %uG %s", - name, dir, msg, e->net, e->src->private_id, e->src->global_id, + log(L_TRACE "%s %c %s %N src %uL %uG %uS id %u %s", + name, dir, msg, e->net, + e->src->private_id, e->src->global_id, e->stale_cycle, e->id, rta_dest_name(rte_dest(e))); } @@ -725,8 +761,8 @@ rte_feed_count(net *n) { uint count = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTE_OR_NULL(e))) - count++; + count++; + return count; } @@ -735,11 +771,11 @@ rte_feed_obtain(net *n, struct rte **feed, uint count) { uint i = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTE_OR_NULL(e))) { ASSERT_DIE(i < count); feed[i++] = &e->rte; } + ASSERT_DIE(i == count); } @@ -847,6 +883,16 @@ do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) static void rt_notify_basic(struct channel *c, const net_addr *net, rte *new, rte *old) { + if (new && old && rte_same(new, old)) + { + if ((new->id != old->id) && bmap_test(&c->export_map, old->id)) + { + bmap_set(&c->export_map, new->id); + bmap_clear(&c->export_map, old->id); + } + return; + } + if (new) new = export_filter(c, new, 0); @@ -859,8 +905,18 @@ rt_notify_basic(struct channel *c, const net_addr *net, rte *new, rte *old) do_rt_notify(c, net, new, old); } +static void +channel_rpe_mark_seen(struct rt_export_request *req, struct rt_pending_export *rpe) +{ + struct channel *c = SKIP_BACK(struct channel, out_req, req); + + rpe_mark_seen(req->hook, rpe); + if (rpe->old) + bmap_clear(&c->export_reject_map, rpe->old->rte.id); +} + void -rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe, +rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *first, struct rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, out_req, req); @@ -902,38 +958,26 @@ rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_p } } +done: /* Check obsolete routes for previously exported */ - if (!old_best) - if (rpe && rpe->old && bmap_test(&c->export_map, rpe->old->rte.id)) - old_best = &rpe->old->rte; - -/* for (; rpe; rpe = atomic_load_explicit(&rpe->next, memory_order_relaxed)) + RPE_WALK(first, rpe, NULL) + { + channel_rpe_mark_seen(req, rpe); + if (rpe->old) { - if (rpe->old && bmap_test(&hook->accept_map, rpe->old->id)) + if (bmap_test(&c->export_map, rpe->old->rte.id)) { - old_best = &rpe->old.rte; - break; + ASSERT_DIE(old_best == NULL); + old_best = &rpe->old->rte; } - - if (rpe == rpe_last) - break; } - */ + } /* Nothing to export */ - if (!new_best && !old_best) - { + if (new_best || old_best) + do_rt_notify(c, n, new_best, old_best); + else DBG("rt_notify_accepted: nothing to export\n"); - goto done; - } - - do_rt_notify(c, n, new_best, old_best); - -done: - /* Drop the old stored rejection if applicable. - * new->id == old->id happens when updating hostentries. */ - if (rpe && rpe->old && (!rpe->new || (rpe->new->rte.id != rpe->old->rte.id))) - bmap_clear(&c->export_reject_map, rpe->old->rte.id); } rte * @@ -999,7 +1043,7 @@ rt_export_merged(struct channel *c, struct rte **feed, uint count, linpool *pool } void -rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe, +rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *first, struct rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, out_req, req); @@ -1025,63 +1069,70 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen } /* Check obsolete routes for previously exported */ - if (!old_best) - if (rpe && rpe->old && bmap_test(&c->export_map, rpe->old->rte.id)) - old_best = &rpe->old->rte; - -/* for (; rpe; rpe = atomic_load_explicit(&rpe->next, memory_order_relaxed)) + RPE_WALK(first, rpe, NULL) + { + channel_rpe_mark_seen(req, rpe); + if (rpe->old) { - if (rpe->old && bmap_test(&hook->accept_map, rpe->old->id)) + if (bmap_test(&c->export_map, rpe->old->rte.id)) { - old_best = &rpe->old.rte; - break; + ASSERT_DIE(old_best == NULL); + old_best = &rpe->old->rte; } - - if (rpe == rpe_last) - break; } - */ + } /* Prepare new merged route */ - rte *new_merged = count ? rt_export_merged(c, feed, count, rte_update_pool, 0) : NULL; + rte *new_merged = count ? rt_export_merged(c, feed, count, tmp_linpool, 0) : NULL; if (new_merged || old_best) do_rt_notify(c, n, new_merged, old_best); - - /* Drop the old stored rejection if applicable. - * new->id == old->id happens when updating hostentries. */ - if (rpe && rpe->old && (!rpe->new || (rpe->new->rte.id != rpe->old->rte.id))) - bmap_clear(&c->export_reject_map, rpe->old->rte.id); } void -rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte n0; + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; - if (rpe->new_best != rpe->old_best) - rt_notify_basic(c, net, RTE_COPY(rpe->new_best, &n0), RTE_OR_NULL(rpe->old_best)); + RPE_WALK(first, rpe, NULL) + { + channel_rpe_mark_seen(req, rpe); + new_best = rpe->new_best; + } - /* Drop the old stored rejection if applicable. - * new->id == old->id happens when updating hostentries. */ - if (rpe->old && (!rpe->new || (rpe->new->rte.id != rpe->old->rte.id))) - bmap_clear(&c->export_reject_map, rpe->old->rte.id); + rte n0 = RTE_COPY_VALID(new_best); + if (n0.src || o) + rt_notify_basic(c, net, n0.src ? &n0 : NULL, o); } void -rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte n0; - if (rpe->new != rpe->old) - rt_notify_basic(c, net, RTE_COPY(rpe->new, &n0), RTE_OR_NULL(rpe->old)); + rte *n = RTE_VALID_OR_NULL(first->new); + rte *o = RTE_VALID_OR_NULL(first->old); - /* Drop the old stored rejection if applicable. - * new->id == old->id happens when updating hostentries. */ - if (rpe->old && (!rpe->new || (rpe->new->rte.id != rpe->old->rte.id))) - bmap_clear(&c->export_reject_map, rpe->old->rte.id); + if (!n && !o) + { + channel_rpe_mark_seen(req, first); + return; + } + + struct rte_src *src = n ? n->src : o->src; + struct rte_storage *new_latest = first->new; + + RPE_WALK(first, rpe, src) + { + channel_rpe_mark_seen(req, rpe); + new_latest = rpe->new; + } + + rte n0 = RTE_COPY_VALID(new_latest); + if (n0.src || o) + rt_notify_basic(c, net, n0.src ? &n0 : NULL, o); } void @@ -1090,10 +1141,108 @@ rt_feed_any(struct rt_export_request *req, const net_addr *net, struct rt_pendin struct channel *c = SKIP_BACK(struct channel, out_req, req); for (uint i=0; i<count; i++) + if (rte_is_valid(feed[i])) + { + rte n0 = *feed[i]; + rt_notify_basic(c, net, &n0, NULL); + } +} + +void +rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe) +{ + bmap_set(&hook->seq_map, rpe->seq); +} + +struct rt_pending_export * +rpe_next(struct rt_pending_export *rpe, struct rte_src *src) +{ + struct rt_pending_export *next = atomic_load_explicit(&rpe->next, memory_order_acquire); + + if (!next) + return NULL; + + if (!src) + return next; + + while (rpe = next) + if (src == (rpe->new ? rpe->new->rte.src : rpe->old->rte.src)) + return rpe; + else + next = atomic_load_explicit(&rpe->next, memory_order_acquire); + + return NULL; +} + +static struct rt_pending_export * rt_next_export_fast(struct rt_pending_export *last); +static int +rte_export(struct rt_table_export_hook *th, struct rt_pending_export *rpe) +{ + rtable *tab = RT_PUB(SKIP_BACK(struct rtable_private, exporter, th->table)); + struct rt_export_hook *hook = &th->h; + if (bmap_test(&hook->seq_map, rpe->seq)) + goto ignore; /* Seen already */ + + const net_addr *n = rpe->new_best ? rpe->new_best->rte.net : rpe->old_best->rte.net; + + switch (hook->req->addr_mode) + { + case TE_ADDR_NONE: + break; + + case TE_ADDR_IN: + if (!net_in_netX(n, hook->req->addr)) + goto ignore; + break; + + case TE_ADDR_EQUAL: + if (!net_equal(n, hook->req->addr)) + goto ignore; + break; + + case TE_ADDR_FOR: + bug("Continuos export of best prefix match not implemented yet."); + + default: + bug("Strange table export address mode: %d", hook->req->addr_mode); + } + + if (rpe->new) + hook->stats.updates_received++; + else + hook->stats.withdraws_received++; + + if (hook->req->export_one) + hook->req->export_one(hook->req, n, rpe); + else if (hook->req->export_bulk) { - rte n0 = *feed[i]; - rt_notify_basic(c, net, &n0, NULL); + net *net = SKIP_BACK(struct network, n.addr, (net_addr (*)[0]) n); + RT_LOCK(tab); + uint count = rte_feed_count(net); + rte **feed = NULL; + if (count) + { + feed = alloca(count * sizeof(rte *)); + rte_feed_obtain(net, feed, count); + } + RT_UNLOCK(tab); + hook->req->export_bulk(hook->req, n, rpe, feed, count); } + else + bug("Export request must always provide an export method"); + +ignore: + /* Get the next export if exists */ + th->rpe_next = rt_next_export_fast(rpe); + + /* The last block may be available to free */ + int used = (PAGE_HEAD(th->rpe_next) != PAGE_HEAD(rpe)); + + /* Releasing this export for cleanup routine */ + DBG("store hook=%p last_export=%p seq=%lu\n", hook, rpe, rpe->seq); + atomic_store_explicit(&th->last_export, rpe, memory_order_release); + + return used; } /** @@ -1128,91 +1277,262 @@ rt_feed_any(struct rt_export_request *req, const net_addr *net, struct rt_pendin * done outside of scope of rte_announce(). */ static void -rte_announce(rtable *tab, net *net, struct rte_storage *new, struct rte_storage *old, +rte_announce(struct rtable_private *tab, net *net, struct rte_storage *new, struct rte_storage *old, struct rte_storage *new_best, struct rte_storage *old_best) { - if (!rte_is_valid(RTE_OR_NULL(new))) - new = NULL; + int new_best_valid = rte_is_valid(RTE_OR_NULL(new_best)); + int old_best_valid = rte_is_valid(RTE_OR_NULL(old_best)); - if (!rte_is_valid(RTE_OR_NULL(old))) - old = NULL; + if ((new == old) && (new_best == old_best)) + return; - if (!rte_is_valid(RTE_OR_NULL(new_best))) - new_best = NULL; + if (new_best_valid) + new_best->rte.sender->stats.pref++; + if (old_best_valid) + old_best->rte.sender->stats.pref--; - if (!rte_is_valid(RTE_OR_NULL(old_best))) - old_best = NULL; + if (EMPTY_LIST(tab->exporter.e.hooks) && EMPTY_LIST(tab->exporter.pending)) + { + /* No export hook and no pending exports to cleanup. We may free the route immediately. */ + if (!old) + return; - if (!new && !old && !new_best && !old_best) + hmap_clear(&tab->id_map, old->rte.id); + rte_free(old); return; + } + + /* Get the pending export structure */ + struct rt_export_block *rpeb = NULL, *rpebsnl = NULL; + u32 end = 0; - if (new_best != old_best) + if (!EMPTY_LIST(tab->exporter.pending)) { - if (new_best) - new_best->rte.sender->stats.pref++; - if (old_best) - old_best->rte.sender->stats.pref--; + rpeb = TAIL(tab->exporter.pending); + end = atomic_load_explicit(&rpeb->end, memory_order_relaxed); + if (end >= RT_PENDING_EXPORT_ITEMS) + { + ASSERT_DIE(end == RT_PENDING_EXPORT_ITEMS); + rpebsnl = rpeb; - if (tab->hostcache) - rt_notify_hostcache(tab, net); + rpeb = NULL; + end = 0; + } + } - if (!EMPTY_LIST(tab->flowspec_links)) - rt_flowspec_notify(tab, net); + if (!rpeb) + { + rpeb = alloc_page(); + *rpeb = (struct rt_export_block) {}; + add_tail(&tab->exporter.pending, &rpeb->n); } - rt_schedule_notify(tab); + /* Fill the pending export */ + struct rt_pending_export *rpe = &rpeb->export[rpeb->end]; + *rpe = (struct rt_pending_export) { + .new = new, + .new_best = new_best, + .old = old, + .old_best = old_best, + .seq = tab->exporter.next_seq++, + }; + + DBGL("rte_announce: table=%s net=%N new=%p id %u from %s old=%p id %u from %s new_best=%p id %u old_best=%p id %u seq=%lu", + tab->name, net->n.addr, + new, new ? new->rte.id : 0, new ? new->rte.sender->req->name : NULL, + old, old ? old->rte.id : 0, old ? old->rte.sender->req->name : NULL, + new_best, old_best, rpe->seq); + + ASSERT_DIE(atomic_fetch_add_explicit(&rpeb->end, 1, memory_order_release) == end); - struct rt_pending_export rpe = { .new = new, .old = old, .new_best = new_best, .old_best = old_best }; - uint count = rte_feed_count(net); - rte **feed = NULL; - if (count) + if (rpebsnl) { - feed = alloca(count * sizeof(rte *)); - rte_feed_obtain(net, feed, count); + _Bool f = 0; + ASSERT_DIE(atomic_compare_exchange_strong_explicit(&rpebsnl->not_last, &f, 1, + memory_order_release, memory_order_relaxed)); } - struct rt_export_hook *eh; - WALK_LIST(eh, tab->exporter.hooks) + /* Append to the same-network squasher list */ + if (net->last) { - if (eh->export_state == TES_STOP) - continue; + struct rt_pending_export *rpenull = NULL; + ASSERT_DIE(atomic_compare_exchange_strong_explicit( + &net->last->next, &rpenull, rpe, + memory_order_relaxed, + memory_order_relaxed)); - switch (eh->req->addr_mode) - { - case TE_ADDR_NONE: - break; + } - case TE_ADDR_IN: - if (!net_in_netX(net->n.addr, eh->req->addr)) - continue; - break; + net->last = rpe; - case TE_ADDR_EQUAL: - if (!net_equal(net->n.addr, eh->req->addr)) + if (!net->first) + net->first = rpe; + + if (tab->exporter.first == NULL) + tab->exporter.first = rpe; + + rt_check_cork_high(tab); +} + +static struct rt_pending_export * +rt_next_export_fast(struct rt_pending_export *last) +{ + /* Get the whole export block and find our position in there. */ + struct rt_export_block *rpeb = PAGE_HEAD(last); + u32 pos = (last - &rpeb->export[0]); + u32 end = atomic_load_explicit(&rpeb->end, memory_order_acquire); + ASSERT_DIE(pos < end); + + /* Next is in the same block. */ + if (++pos < end) + return &rpeb->export[pos]; + + /* There is another block. */ + if (atomic_load_explicit(&rpeb->not_last, memory_order_acquire)) + { + /* This is OK to do non-atomically because of the not_last flag. */ + rpeb = NODE_NEXT(rpeb); + return &rpeb->export[0]; + } + + /* There is nothing more. */ + return NULL; +} + +static struct rt_pending_export * +rt_next_export(struct rt_table_export_hook *hook, struct rt_table_exporter *tab) +{ + ASSERT_DIE(RT_IS_LOCKED(SKIP_BACK(struct rtable_private, exporter, tab))); + + /* As the table is locked, it is safe to reload the last export pointer */ + struct rt_pending_export *last = atomic_load_explicit(&hook->last_export, memory_order_acquire); + + /* It is still valid, let's reuse it */ + if (last) + return rt_next_export_fast(last); + + /* No, therefore we must process the table's first pending export */ + else + return tab->first; +} + +static inline void +rt_send_export_event(struct rt_export_hook *hook) +{ + ev_send(hook->req->list, &hook->event); +} + +static void +rt_announce_exports(struct settle *s) +{ + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, export_settle, s)), tab) + if (!EMPTY_LIST(tab->exporter.pending)) + { + struct rt_export_hook *c; node *n; + WALK_LIST2(c, n, tab->exporter.e.hooks, n) + { + if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_READY) continue; - break; - case TE_ADDR_FOR: - bug("Continuos export of best prefix match not implemented yet."); + rt_send_export_event(c); + } + } +} - default: - bug("Strange table export address mode: %d", eh->req->addr_mode); +static void +rt_kick_export_settle(struct rtable_private *tab) +{ + tab->export_settle.cf = tab->rr_counter ? tab->config->export_rr_settle : tab->config->export_settle; + settle_kick(&tab->export_settle, tab->loop); +} + +static void +rt_import_announce_exports(void *_hook) +{ + struct rt_import_hook *hook = _hook; + if (hook->import_state == TIS_CLEARED) + { + void (*stopped)(struct rt_import_request *) = hook->stopped; + struct rt_import_request *req = hook->req; + + RT_LOCKED(hook->table, tab) + { + req->hook = NULL; + + rt_trace(tab, D_EVENTS, "Hook %s stopped", req->name); + rem_node(&hook->n); + mb_free(hook); + rt_unlock_table(tab); } - if (new) - eh->stats.updates_received++; - else - eh->stats.withdraws_received++; + stopped(req); + return; + } - if (eh->req->export_one) - eh->req->export_one(eh->req, net->n.addr, &rpe); - else if (eh->req->export_bulk) - eh->req->export_bulk(eh->req, net->n.addr, &rpe, feed, count); - else - bug("Export request must always provide an export method"); + rt_trace(hook->table, D_EVENTS, "Announcing exports after imports from %s", hook->req->name); + birdloop_flag(hook->table->loop, RTF_EXPORT); +} + +static struct rt_pending_export * +rt_last_export(struct rt_table_exporter *tab) +{ + struct rt_pending_export *rpe = NULL; + + if (!EMPTY_LIST(tab->pending)) + { + /* We'll continue processing exports from this export on */ + struct rt_export_block *reb = TAIL(tab->pending); + ASSERT_DIE(reb->end); + rpe = &reb->export[reb->end - 1]; + } + + return rpe; +} + +#define RT_EXPORT_BULK 1024 + +static void +rt_export_hook(void *_data) +{ + struct rt_table_export_hook *c = _data; + rtable *tab = SKIP_BACK(rtable, priv.exporter, c->table); + + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_READY); + + if (!c->rpe_next) + { + RT_LOCK(tab); + c->rpe_next = rt_next_export(c, c->table); + + if (!c->rpe_next) + { + rt_export_used(c->table, c->h.req->name, "done exporting"); + RT_UNLOCK(tab); + return; + } + + RT_UNLOCK(tab); + } + + int used = 0; + + /* Process the export */ + for (uint i=0; i<RT_EXPORT_BULK; i++) + { + used += rte_export(c, c->rpe_next); + + if (!c->rpe_next) + break; } + + if (used) + RT_LOCKED(tab, _) + rt_export_used(c->table, c->h.req->name, "finished export bulk"); + + rt_send_export_event(&c->h); } + static inline int rte_validate(struct channel *ch, rte *e) { @@ -1279,11 +1599,10 @@ rte_same(rte *x, rte *y) static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); } -static void -rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *src) +static int +rte_recalculate(struct rtable_private *table, struct rt_import_hook *c, net *net, rte *new, struct rte_src *src) { struct rt_import_request *req = c->req; - struct rtable *table = c->table; struct rt_import_stats *stats = &c->stats; struct rte_storage *old_best_stored = net->routes, *old_stored = NULL; rte *old_best = old_best_stored ? &old_best_stored->rte : NULL; @@ -1313,17 +1632,16 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr { if (!old->generation && !new->generation) bug("Two protocols claim to author a route with the same rte_src in table %s: %N %s/%u:%u", - c->table->name, net->n.addr, old->src->proto->name, old->src->private_id, old->src->global_id); + c->table->name, net->n.addr, old->src->owner->name, old->src->private_id, old->src->global_id); log_rl(&table->rl_pipe, L_ERR "Route source collision in table %s: %N %s/%u:%u", - c->table->name, net->n.addr, old->src->proto->name, old->src->private_id, old->src->global_id); + c->table->name, net->n.addr, old->src->owner->name, old->src->private_id, old->src->global_id); } if (new && rte_same(old, &new_stored->rte)) { /* No changes, ignore the new route and refresh the old one */ - - old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY); + old->stale_cycle = new->stale_cycle; if (!rte_is_filtered(new)) { @@ -1333,7 +1651,7 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr /* We need to free the already stored route here before returning */ rte_free(new_stored); - return; + return 0; } *before_old = (*before_old)->next; @@ -1343,7 +1661,7 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr if (!old && !new) { stats->withdraws_ignored++; - return; + return 0; } /* If rejected by import limit, we need to pretend there is no route */ @@ -1393,8 +1711,8 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr /* If routes are not sorted, find the best route and move it on the first position. There are several optimized cases. */ - if (src->proto->rte_recalculate && - src->proto->rte_recalculate(table, net, new_stored ? &new_stored->rte : NULL, old, old_best)) + if (src->owner->rte_recalculate && + src->owner->rte_recalculate(table, net, new_stored ? &new_stored->rte : NULL, old, old_best)) goto do_recalculate; if (new_stored && rte_better(&new_stored->rte, old_best)) @@ -1458,14 +1776,8 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr if (new_stored) { new_stored->rte.lastmod = current_time(); - - if (!old) - { - new_stored->rte.id = hmap_first_zero(&table->id_map); - hmap_set(&table->id_map, new_stored->rte.id); - } - else - new_stored->rte.id = old->id; + new_stored->rte.id = hmap_first_zero(&table->id_map); + hmap_set(&table->id_map, new_stored->rte.id); } /* Log the route change */ @@ -1485,41 +1797,7 @@ rte_recalculate(struct rt_import_hook *c, net *net, rte *new, struct rte_src *sr rte_announce(table, net, new_stored, old_stored, net->routes, old_best_stored); - if (!net->routes && - (table->gc_counter++ >= table->config->gc_max_ops) && - (table->gc_time + table->config->gc_min_time <= current_time())) - rt_schedule_prune(table); - -#if 0 - /* Enable and reimplement these callbacks if anybody wants to use them */ - if (old_ok && p->rte_remove) - p->rte_remove(net, old); - if (new_ok && p->rte_insert) - p->rte_insert(net, &new_stored->rte); -#endif - - if (old) - { - if (!new_stored) - hmap_clear(&table->id_map, old->id); - - rte_free(old_stored); - } -} - -static int rte_update_nest_cnt; /* Nesting counter to allow recursive updates */ - -static inline void -rte_update_lock(void) -{ - rte_update_nest_cnt++; -} - -static inline void -rte_update_unlock(void) -{ - if (!--rte_update_nest_cnt) - lp_flush(rte_update_pool); + return 1; } int @@ -1571,7 +1849,6 @@ rte_update(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) const struct filter *filter = c->in_filter; struct channel_import_stats *stats = &c->import_stats; - rte_update_lock(); if (new) { new->net = n; @@ -1622,7 +1899,6 @@ rte_update(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) ea_free(a); } - rte_update_unlock(); } void @@ -1632,110 +1908,90 @@ rte_import(struct rt_import_request *req, const net_addr *n, rte *new, struct rt if (!hook) return; - net *nn; - if (new) + RT_LOCKED(hook->table, tab) + { + net *nn; + if (new) { /* Use the actual struct network, not the dummy one */ - nn = net_get(hook->table, n); + nn = net_get(tab, n); new->net = nn->n.addr; new->sender = hook; + + /* Set the stale cycle */ + new->stale_cycle = hook->stale_set; } - else if (!(nn = net_find(hook->table, n))) + else if (!(nn = net_find(tab, n))) { req->hook->stats.withdraws_ignored++; - return; + RT_RETURN(tab); } - /* And recalculate the best route */ - rte_recalculate(hook, nn, new, src); -} - -/* Independent call to rte_announce(), used from next hop - recalculation, outside of rte_update(). new must be non-NULL */ -static inline void -rte_announce_i(rtable *tab, net *net, struct rte_storage *new, struct rte_storage *old, - struct rte_storage *new_best, struct rte_storage *old_best) -{ - rte_update_lock(); - rte_announce(tab, net, new, old, new_best, old_best); - rte_update_unlock(); -} - -static inline void -rte_discard(net *net, rte *old) /* Non-filtered route deletion, used during garbage collection */ -{ - rte_update_lock(); - rte_recalculate(old->sender, net, NULL, old->src); - rte_update_unlock(); -} - -/* Modify existing route by protocol hook, used for long-lived graceful restart */ -static inline void -rte_modify(net *net, rte *old) -{ - rte_update_lock(); - - rte *new = old->sender->req->rte_modify(old, rte_update_pool); - if (new != old) - { - if (new) - new->flags = old->flags & ~REF_MODIFY; - - rte_recalculate(old->sender, net, new, old->src); + /* Recalculate the best route */ + if (rte_recalculate(tab, hook, nn, new, src)) + ev_send(req->list, &hook->announce_event); } - - rte_update_unlock(); } /* Check rtable for best route to given net whether it would be exported do p */ int -rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter) +rt_examine(rtable *tp, net_addr *a, struct channel *c, const struct filter *filter) { - net *n = net_find(t, a); + rte rt = {}; - if (!n || !rte_is_valid(RTE_OR_NULL(n->routes))) - return 0; - - rte rt = n->routes->rte; + RT_LOCKED(tp, t) + { + net *n = net_find(t, a); + if (n) + rt = RTE_COPY_VALID(n->routes); + } - rte_update_lock(); + if (!rt.src) + return 0; - /* Rest is stripped down export_filter() */ int v = c->proto->preexport ? c->proto->preexport(c, &rt) : 0; if (v == RIC_PROCESS) v = (f_run(filter, &rt, FF_SILENT) <= F_ACCEPT); - rte_update_unlock(); - return v > 0; } static void -rt_table_export_done(struct rt_export_hook *hook) +rt_table_export_done(void *hh) { - struct rt_exporter *re = hook->table; - struct rtable *tab = SKIP_BACK(struct rtable, exporter, re); + struct rt_table_export_hook *hook = hh; + struct rt_export_request *req = hook->h.req; + void (*stopped)(struct rt_export_request *) = hook->h.stopped; + rtable *t = SKIP_BACK(rtable, priv.exporter, hook->table); + + RT_LOCKED(t, tab) + { + DBG("Export hook %p in table %s finished uc=%u\n", hook, tab->name, tab->use_count); + + /* Drop pending exports */ + rt_export_used(&tab->exporter, hook->h.req->name, "stopped"); + + /* Do the common code; this frees the hook */ + rt_export_stopped(&hook->h); + } - rt_unlock_table(tab); - DBG("Export hook %p in table %s finished uc=%u\n", hook, tab->name, tab->use_count); + /* Report the channel as stopped. */ + CALL(stopped, req); + + /* Unlock the table; this may free it */ + rt_unlock_table(t); } -static void -rt_export_stopped(void *data) +void +rt_export_stopped(struct rt_export_hook *hook) { - struct rt_export_hook *hook = data; - struct rt_exporter *tab = hook->table; + /* Unlink from the request */ + hook->req->hook = NULL; /* Unlist */ rem_node(&hook->n); - /* Reporting the channel as stopped. */ - hook->stopped(hook->req); - - /* Reporting the hook as finished. */ - CALL(tab->done, hook); - - /* Freeing the hook together with its coroutine. */ + /* Free the hook itself together with its pool */ rfree(hook->pool); } @@ -1745,36 +2001,38 @@ rt_set_import_state(struct rt_import_hook *hook, u8 state) hook->last_state_change = current_time(); hook->import_state = state; - if (hook->req->log_state_change) - hook->req->log_state_change(hook->req, state); + CALL(hook->req->log_state_change, hook->req, state); } void rt_set_export_state(struct rt_export_hook *hook, u8 state) { hook->last_state_change = current_time(); - hook->export_state = state; + u8 old = atomic_exchange_explicit(&hook->export_state, state, memory_order_release); - if (hook->req->log_state_change) - hook->req->log_state_change(hook->req, state); + if (old != state) + CALL(hook->req->log_state_change, hook->req, state); } void -rt_request_import(rtable *tab, struct rt_import_request *req) +rt_request_import(rtable *t, struct rt_import_request *req) { - rt_lock_table(tab); + RT_LOCKED(t, tab) + { + rt_lock_table(tab); - struct rt_import_hook *hook = req->hook = mb_allocz(tab->rp, sizeof(struct rt_import_hook)); + struct rt_import_hook *hook = req->hook = mb_allocz(tab->rp, sizeof(struct rt_import_hook)); - DBG("Lock table %s for import %p req=%p uc=%u\n", tab->name, hook, req, tab->use_count); + hook->announce_event = (event) { .hook = rt_import_announce_exports, .data = hook }; - hook->req = req; - hook->table = tab; + DBG("Lock table %s for import %p req=%p uc=%u\n", tab->name, hook, req, tab->use_count); - rt_set_import_state(hook, TIS_UP); + hook->req = req; + hook->table = t; - hook->n = (node) {}; - add_tail(&tab->imports, &hook->n); + rt_set_import_state(hook, TIS_UP); + add_tail(&tab->imports, &hook->n); + } } void @@ -1783,23 +2041,77 @@ rt_stop_import(struct rt_import_request *req, void (*stopped)(struct rt_import_r ASSERT_DIE(req->hook); struct rt_import_hook *hook = req->hook; - rt_schedule_prune(hook->table); + RT_LOCKED(hook->table, tab) + { + rt_schedule_prune(tab); + rt_set_import_state(hook, TIS_STOP); + hook->stopped = stopped; - rt_set_import_state(hook, TIS_STOP); + if (hook->stale_set != hook->stale_pruned) + tab->rr_counter -= (hook->stale_set - hook->stale_pruned - 1); + else + tab->rr_counter++; - hook->stopped = stopped; + hook->stale_set = hook->stale_pruned = hook->stale_pruning = hook->stale_valid = 0; + } } -static struct rt_export_hook * -rt_table_export_start(struct rt_exporter *re, struct rt_export_request *req) +static void rt_table_export_start_feed(struct rtable_private *tab, struct rt_table_export_hook *hook); +static void +rt_table_export_uncork(void *_hook) { - rtable *tab = SKIP_BACK(rtable, exporter, re); + ASSERT_DIE(birdloop_inside(&main_birdloop)); + + struct rt_table_export_hook *hook = _hook; + struct birdloop *loop = hook->h.req->list->loop; + + if (loop != &main_birdloop) + birdloop_enter(loop); + + u8 state; + switch (state = atomic_load_explicit(&hook->h.export_state, memory_order_relaxed)) + { + case TES_HUNGRY: + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, hook->table)), tab) + rt_table_export_start_feed(tab, hook); + break; + case TES_STOP: + rt_stop_export_common(&hook->h); + break; + default: + bug("Uncorking a table export in a strange state: %u", state); + } + + if (loop != &main_birdloop) + birdloop_leave(loop); +} + +static void +rt_table_export_start_locked(struct rtable_private *tab, struct rt_export_request *req) +{ + struct rt_exporter *re = &tab->exporter.e; rt_lock_table(tab); - pool *p = rp_new(tab->rp, "Export hook"); - struct rt_export_hook *hook = mb_allocz(p, sizeof(struct rt_export_hook)); - hook->pool = p; - hook->lp = lp_new_default(p); + req->hook = rt_alloc_export(re, sizeof(struct rt_table_export_hook)); + req->hook->req = req; + + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, req->hook); + hook->h.event = (event) { + .hook = rt_table_export_uncork, + .data = hook, + }; + + if (rt_cork_check(&hook->h.event)) + rt_set_export_state(&hook->h, TES_HUNGRY); + else + rt_table_export_start_feed(tab, hook); +} + +static void +rt_table_export_start_feed(struct rtable_private *tab, struct rt_table_export_hook *hook) +{ + struct rt_exporter *re = &tab->exporter.e; + struct rt_export_request *req = hook->h.req; /* stats zeroed by mb_allocz */ switch (req->addr_mode) @@ -1807,24 +2119,25 @@ rt_table_export_start(struct rt_exporter *re, struct rt_export_request *req) case TE_ADDR_IN: if (tab->trie && net_val_match(tab->addr_type, NB_IP)) { - hook->walk_state = mb_allocz(p, sizeof (struct f_trie_walk_state)); + hook->walk_state = mb_allocz(hook->h.pool, sizeof (struct f_trie_walk_state)); hook->walk_lock = rt_lock_trie(tab); trie_walk_init(hook->walk_state, tab->trie, req->addr); - hook->event = ev_new_init(p, rt_feed_by_trie, hook); + hook->h.event.hook = rt_feed_by_trie; + hook->walk_last.type = 0; break; } /* fall through */ case TE_ADDR_NONE: FIB_ITERATE_INIT(&hook->feed_fit, &tab->fib); - hook->event = ev_new_init(p, rt_feed_by_fib, hook); + hook->h.event.hook = rt_feed_by_fib; break; case TE_ADDR_EQUAL: - hook->event = ev_new_init(p, rt_feed_equal, hook); + hook->h.event.hook = rt_feed_equal; break; case TE_ADDR_FOR: - hook->event = ev_new_init(p, rt_feed_for, hook); + hook->h.event.hook = rt_feed_for; break; default: @@ -1833,72 +2146,137 @@ rt_table_export_start(struct rt_exporter *re, struct rt_export_request *req) DBG("New export hook %p req %p in table %s uc=%u\n", hook, req, tab->name, tab->use_count); - return hook; + struct rt_pending_export *rpe = rt_last_export(hook->table); + DBG("store hook=%p last_export=%p seq=%lu\n", hook, rpe, rpe ? rpe->seq : 0); + atomic_store_explicit(&hook->last_export, rpe, memory_order_relaxed); + + rt_init_export(re, req->hook); +} + +static void +rt_table_export_start(struct rt_exporter *re, struct rt_export_request *req) +{ + RT_LOCKED(SKIP_BACK(rtable, priv.exporter.e, re), tab) + rt_table_export_start_locked(tab, req); +} + +void rt_request_export(rtable *t, struct rt_export_request *req) +{ + RT_LOCKED(t, tab) + rt_table_export_start_locked(tab, req); /* Is locked inside */ } void -rt_request_export(struct rt_exporter *re, struct rt_export_request *req) +rt_request_export_other(struct rt_exporter *re, struct rt_export_request *req) { - struct rt_export_hook *hook = req->hook = re->start(re, req); + return re->class->start(re, req); +} - hook->req = req; +struct rt_export_hook * +rt_alloc_export(struct rt_exporter *re, uint size) +{ + pool *p = rp_new(re->rp, "Export hook"); + struct rt_export_hook *hook = mb_allocz(p, size); + + hook->pool = p; hook->table = re; + return hook; +} + +void +rt_init_export(struct rt_exporter *re, struct rt_export_hook *hook) +{ + hook->event.data = hook; + + bmap_init(&hook->seq_map, hook->pool, 1024); + hook->n = (node) {}; add_tail(&re->hooks, &hook->n); /* Regular export */ rt_set_export_state(hook, TES_FEEDING); - ev_schedule_work(hook->event); + rt_send_export_event(hook); } -static void -rt_table_export_stop(struct rt_export_hook *hook) +static int +rt_table_export_stop_locked(struct rt_export_hook *hh) { - rtable *tab = SKIP_BACK(rtable, exporter, hook->table); + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, hh); + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter, hook->table); - if (hook->export_state != TES_FEEDING) - return; - - switch (hook->req->addr_mode) + switch (atomic_load_explicit(&hh->export_state, memory_order_relaxed)) { - case TE_ADDR_IN: - if (hook->walk_lock) + case TES_HUNGRY: + return 0; + case TES_FEEDING: + switch (hh->req->addr_mode) { - rt_unlock_trie(tab, hook->walk_lock); - hook->walk_lock = NULL; - mb_free(hook->walk_state); - hook->walk_state = NULL; - break; + case TE_ADDR_IN: + if (hook->walk_lock) + { + rt_unlock_trie(tab, hook->walk_lock); + hook->walk_lock = NULL; + mb_free(hook->walk_state); + hook->walk_state = NULL; + break; + } + /* fall through */ + case TE_ADDR_NONE: + fit_get(&tab->fib, &hook->feed_fit); + break; } - /* fall through */ - case TE_ADDR_NONE: - fit_get(&tab->fib, &hook->feed_fit); - break; + } + return 1; +} + +static void +rt_table_export_stop(struct rt_export_hook *hh) +{ + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, hh); + int ok = 0; + rtable *t = SKIP_BACK(rtable, priv.exporter, hook->table); + if (RT_IS_LOCKED(t)) + ok = rt_table_export_stop_locked(hh); + else + RT_LOCKED(t, tab) + ok = rt_table_export_stop_locked(hh); + + if (ok) + rt_stop_export_common(hh); + else + rt_set_export_state(&hook->h, TES_STOP); } void rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_request *)) { + ASSERT_DIE(birdloop_inside(req->list->loop)); ASSERT_DIE(req->hook); struct rt_export_hook *hook = req->hook; - /* Cancel the feeder event */ - ev_postpone(hook->event); - - /* Stop feeding from the exporter */ - CALL(hook->table->stop, hook); - - /* Reset the event as the stopped event */ - hook->event->hook = rt_export_stopped; + /* Set the stopped callback */ hook->stopped = stopped; + /* Run the stop code */ + if (hook->table->class->stop) + hook->table->class->stop(hook); + else + rt_stop_export_common(hook); +} + +void +rt_stop_export_common(struct rt_export_hook *hook) +{ /* Update export state */ rt_set_export_state(hook, TES_STOP); + /* Reset the event as the stopped event */ + hook->event.hook = hook->table->class->done; + /* Run the stopped event */ - ev_schedule(hook->event); + rt_send_export_event(hook); } /** @@ -1916,15 +2294,45 @@ rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_r * flag in rt_refresh_end() and then removing such routes in the prune loop. */ void -rt_refresh_begin(rtable *t, struct rt_import_request *req) +rt_refresh_begin(struct rt_import_request *req) { - FIB_WALK(&t->fib, net, n) - { - for (struct rte_storage *e = n->routes; e; e = e->next) - if (e->rte.sender == req->hook) - e->rte.flags |= REF_STALE; - } - FIB_WALK_END; + struct rt_import_hook *hook = req->hook; + ASSERT_DIE(hook); + ASSERT_DIE(hook->stale_set == hook->stale_valid); + + RT_LOCKED(hook->table, tab) + { + + /* If the pruning routine is too slow */ + if ((hook->stale_pruned < hook->stale_valid) && (hook->stale_pruned + 128 < hook->stale_valid) + || (hook->stale_pruned > hook->stale_valid) && (hook->stale_pruned > hook->stale_valid + 128)) + { + log(L_WARN "Route refresh flood in table %s", hook->table->name); + FIB_WALK(&tab->fib, net, n) + { + for (struct rte_storage *e = n->routes; e; e = e->next) + if (e->rte.sender == req->hook) + e->rte.stale_cycle = 0; + } + FIB_WALK_END; + tab->rr_counter -= (hook->stale_set - hook->stale_pruned - 1); + hook->stale_set = 1; + hook->stale_valid = 0; + hook->stale_pruned = 0; + } + /* Setting a new value of the stale modifier */ + else if (!++hook->stale_set) + { + /* Let's reserve the stale_cycle zero value for always-invalid routes */ + hook->stale_set = 1; + hook->stale_valid = 0; + tab->rr_counter++; + } + + if (req->trace_routes & D_STATES) + log(L_TRACE "%s: route refresh begin [%u]", req->name, hook->stale_set); + + } } /** @@ -1936,43 +2344,21 @@ rt_refresh_begin(rtable *t, struct rt_import_request *req) * hook. See rt_refresh_begin() for description of refresh cycles. */ void -rt_refresh_end(rtable *t, struct rt_import_request *req) +rt_refresh_end(struct rt_import_request *req) { - int prune = 0; - - FIB_WALK(&t->fib, net, n) - { - for (struct rte_storage *e = n->routes; e; e = e->next) - if ((e->rte.sender == req->hook) && (e->rte.flags & REF_STALE)) - { - e->rte.flags |= REF_DISCARD; - prune = 1; - } - } - FIB_WALK_END; - - if (prune) - rt_schedule_prune(t); -} + struct rt_import_hook *hook = req->hook; + ASSERT_DIE(hook); -void -rt_modify_stale(rtable *t, struct rt_import_request *req) -{ - int prune = 0; + RT_LOCKED(hook->table, tab) + { + hook->stale_valid++; + ASSERT_DIE(hook->stale_set == hook->stale_valid); - FIB_WALK(&t->fib, net, n) - { - for (struct rte_storage *e = n->routes; e; e = e->next) - if ((e->rte.sender == req->hook) && (e->rte.flags & REF_STALE) && !(e->rte.flags & REF_FILTERED)) - { - e->rte.flags |= REF_MODIFY; - prune = 1; - } - } - FIB_WALK_END; + rt_schedule_prune(tab); - if (prune) - rt_schedule_prune(t); + if (req->trace_routes & D_STATES) + log(L_TRACE "%s: route refresh end [%u]", req->name, hook->stale_valid); + } } /** @@ -1997,8 +2383,11 @@ rte_dump(struct rte_storage *e) * This function dumps contents of a given routing table to debug output. */ void -rt_dump(rtable *t) +rt_dump(rtable *tp) { + RT_LOCKED(tp, t) + { + debug("Dump of routing table <%s>%s\n", t->name, t->deleted ? " (deleted)" : ""); #ifdef DEBUGGING fib_check(&t->fib); @@ -2010,6 +2399,8 @@ rt_dump(rtable *t) } FIB_WALK_END; debug("\n"); + + } } /** @@ -2031,11 +2422,14 @@ rt_dump_all(void) } void -rt_dump_hooks(rtable *tab) +rt_dump_hooks(rtable *tp) { + RT_LOCKED(tp, tab) + { + debug("Dump of hooks in routing table <%s>%s\n", tab->name, tab->deleted ? " (deleted)" : ""); - debug(" nhu_state=%u hcu_scheduled=%u use_count=%d rt_count=%u\n", - tab->nhu_state, tab->hcu_scheduled, tab->use_count, tab->rt_count); + debug(" nhu_state=%u use_count=%d rt_count=%u\n", + tab->nhu_state, tab->use_count, tab->rt_count); debug(" last_rt_change=%t gc_time=%t gc_counter=%d prune_state=%u\n", tab->last_rt_change, tab->gc_time, tab->gc_counter, tab->prune_state); @@ -2049,15 +2443,18 @@ rt_dump_hooks(rtable *tab) ih->last_state_change, ih->import_state, ih->stopped); } - struct rt_export_hook *eh; - WALK_LIST(eh, tab->exporter.hooks) + struct rt_table_export_hook *eh; + WALK_LIST(eh, tab->exporter.e.hooks) { - eh->req->dump_req(eh->req); + eh->h.req->dump_req(eh->h.req); debug(" Export hook %p requested by %p:" - " refeed_pending=%u last_state_change=%t export_state=%u stopped=%p\n", - eh, eh->req, eh->refeed_pending, eh->last_state_change, eh->export_state, eh->stopped); + " refeed_pending=%u last_state_change=%t export_state=%u\n", + eh, eh->h.req, eh->refeed_pending, eh->h.last_state_change, + atomic_load_explicit(&eh->h.export_state, memory_order_relaxed)); } debug("\n"); + + } } void @@ -2076,206 +2473,236 @@ rt_dump_hooks_all(void) } static inline void -rt_schedule_hcu(rtable *tab) -{ - if (tab->hcu_scheduled) - return; - - tab->hcu_scheduled = 1; - ev_schedule(tab->rt_event); -} - -static inline void -rt_schedule_nhu(rtable *tab) +rt_schedule_nhu(struct rtable_private *tab) { - if (tab->nhu_state == NHU_CLEAN) - ev_schedule(tab->rt_event); - - /* state change: - * NHU_CLEAN -> NHU_SCHEDULED - * NHU_RUNNING -> NHU_DIRTY - */ - tab->nhu_state |= NHU_SCHEDULED; + if (tab->nhu_corked) + { + if (!(tab->nhu_corked & NHU_SCHEDULED)) + tab->nhu_corked |= NHU_SCHEDULED; + } + else if (!(tab->nhu_state & NHU_SCHEDULED)) + { + rt_trace(tab, D_EVENTS, "Scheduling NHU"); + + /* state change: + * NHU_CLEAN -> NHU_SCHEDULED + * NHU_RUNNING -> NHU_DIRTY + */ + if ((tab->nhu_state |= NHU_SCHEDULED) == NHU_SCHEDULED) + birdloop_flag(tab->loop, RTF_NHU); + } } void -rt_schedule_prune(rtable *tab) +rt_schedule_prune(struct rtable_private *tab) { if (tab->prune_state == 0) - ev_schedule(tab->rt_event); + birdloop_flag(tab->loop, RTF_CLEANUP); /* state change 0->1, 2->3 */ tab->prune_state |= 1; } - static void -rt_event(void *ptr) +rt_export_used(struct rt_table_exporter *e, const char *who, const char *why) { - rtable *tab = ptr; + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter, e); + ASSERT_DIE(RT_IS_LOCKED(tab)); - rt_lock_table(tab); + rt_trace(tab, D_EVENTS, "Export cleanup requested by %s %s", who, why); - if (tab->hcu_scheduled) - rt_update_hostcache(tab); + if (tab->export_used) + return; - if (tab->nhu_state) - rt_next_hop_update(tab); + tab->export_used = 1; + birdloop_flag(tab->loop, RTF_CLEANUP); +} - if (tab->prune_state) - rt_prune_table(tab); +static void +rt_flag_handler(struct birdloop_flag_handler *fh, u32 flags) +{ + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, fh, fh)), tab) + { + ASSERT_DIE(birdloop_inside(tab->loop)); + rt_lock_table(tab); - rt_unlock_table(tab); -} + if (flags & RTF_NHU) + rt_next_hop_update(tab); + if (flags & RTF_EXPORT) + rt_kick_export_settle(tab); -static inline btime -rt_settled_time(rtable *tab) -{ - ASSUME(tab->base_settle_time != 0); + if (flags & RTF_CLEANUP) + { + if (tab->export_used) + rt_export_cleanup(tab); + + if (tab->prune_state) + rt_prune_table(tab); + } - return MIN(tab->last_rt_change + tab->config->min_settle_time, - tab->base_settle_time + tab->config->max_settle_time); + rt_unlock_table(tab); + } } static void -rt_settle_timer(timer *t) +rt_prune_timer(timer *t) { - rtable *tab = t->data; - - if (!tab->base_settle_time) - return; + RT_LOCKED((rtable *) t->data, tab) + if (tab->gc_counter >= tab->config->gc_threshold) + rt_schedule_prune(tab); +} - btime settled_time = rt_settled_time(tab); - if (current_time() < settled_time) - { - tm_set(tab->settle_timer, settled_time); +static void +rt_kick_prune_timer(struct rtable_private *tab) +{ + /* Return if prune is already scheduled */ + if (tm_active(tab->prune_timer) || (tab->prune_state & 1)) return; - } - - /* Settled */ - tab->base_settle_time = 0; - struct rt_subscription *s; - WALK_LIST(s, tab->subscribers) - s->hook(s); + /* Randomize GC period to +/- 50% */ + btime gc_period = tab->config->gc_period; + gc_period = (gc_period / 2) + (random_u32() % (uint) gc_period); + tm_start_in(tab->prune_timer, gc_period, tab->loop); } + static void -rt_kick_settle_timer(rtable *tab) +rt_flowspec_export_one(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) { - tab->base_settle_time = current_time(); + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rtable *dst_pub = ln->dst; + ASSUME(rt_is_flow(dst_pub)); + struct rtable_private *dst = RT_LOCK(dst_pub); - if (!tab->settle_timer) - tab->settle_timer = tm_new_init(tab->rp, rt_settle_timer, tab, 0, 0); + /* No need to inspect it further if recalculation is already scheduled */ + if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY) + || !trie_match_net(dst->flowspec_trie, net)) + { + RT_UNLOCK(dst_pub); + rpe_mark_seen_all(req->hook, first, NULL); + return; + } - if (!tm_active(tab->settle_timer)) - tm_set(tab->settle_timer, rt_settled_time(tab)); -} + /* This net may affect some flowspecs, check the actual change */ + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; -static inline void -rt_schedule_notify(rtable *tab) -{ - if (EMPTY_LIST(tab->subscribers)) - return; + RPE_WALK(first, rpe, NULL) + { + rpe_mark_seen(req->hook, rpe); + new_best = rpe->new_best; + } - if (tab->base_settle_time) - return; + /* Yes, something has actually changed. Schedule the update. */ + if (o != RTE_VALID_OR_NULL(new_best)) + rt_schedule_nhu(dst); - rt_kick_settle_timer(tab); + RT_UNLOCK(dst_pub); } -void -rt_subscribe(rtable *tab, struct rt_subscription *s) +static void +rt_flowspec_dump_req(struct rt_export_request *req) { - s->tab = tab; - rt_lock_table(tab); - DBG("rt_subscribe(%s)\n", tab->name); - add_tail(&tab->subscribers, &s->n); + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + debug(" Flowspec link for table %s (%p)\n", ln->dst->name, req); } -void -rt_unsubscribe(struct rt_subscription *s) +static void +rt_flowspec_log_state_change(struct rt_export_request *req, u8 state) { - rem_node(&s->n); - rt_unlock_table(s->tab); + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rt_trace(ln->dst, D_STATES, "Flowspec link from %s export state changed to %s", + ln->src->name, rt_export_state_name(state)); } static struct rt_flowspec_link * -rt_flowspec_find_link(rtable *src, rtable *dst) +rt_flowspec_find_link(struct rtable_private *src, rtable *dst) { - struct rt_flowspec_link *ln; - WALK_LIST(ln, src->flowspec_links) - if ((ln->src == src) && (ln->dst == dst)) - return ln; + struct rt_table_export_hook *hook; node *n; + WALK_LIST2(hook, n, src->exporter.e.hooks, h.n) + switch (atomic_load_explicit(&hook->h.export_state, memory_order_acquire)) + { + case TES_HUNGRY: + case TES_FEEDING: + case TES_READY: + if (hook->h.req->export_one == rt_flowspec_export_one) + { + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, hook->h.req); + if (ln->dst == dst) + return ln; + } + } return NULL; } void -rt_flowspec_link(rtable *src, rtable *dst) +rt_flowspec_link(rtable *src_pub, rtable *dst_pub) { - ASSERT(rt_is_ip(src)); - ASSERT(rt_is_flow(dst)); + ASSERT(rt_is_ip(src_pub)); + ASSERT(rt_is_flow(dst_pub)); - struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst); + int lock_dst = 0; - if (!ln) + RT_LOCKED(src_pub, src) { - rt_lock_table(src); - rt_lock_table(dst); + struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst_pub); - ln = mb_allocz(src->rp, sizeof(struct rt_flowspec_link)); - ln->src = src; - ln->dst = dst; - add_tail(&src->flowspec_links, &ln->n); + if (!ln) + { + pool *p = src->rp; + ln = mb_allocz(p, sizeof(struct rt_flowspec_link)); + ln->src = src_pub; + ln->dst = dst_pub; + ln->req = (struct rt_export_request) { + .name = mb_sprintf(p, "%s.flowspec.notifier", dst_pub->name), + .list = &global_work_list, + .trace_routes = src->config->debug, + .dump_req = rt_flowspec_dump_req, + .log_state_change = rt_flowspec_log_state_change, + .export_one = rt_flowspec_export_one, + }; + + rt_table_export_start_locked(src, &ln->req); + + lock_dst = 1; + } + + ln->uc++; } - ln->uc++; + if (lock_dst) + rt_lock_table(dst_pub); } -void -rt_flowspec_unlink(rtable *src, rtable *dst) +static void +rt_flowspec_link_stopped(struct rt_export_request *req) { - struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst); - - ASSERT(ln && (ln->uc > 0)); + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rtable *dst = ln->dst; - ln->uc--; - - if (!ln->uc) - { - rem_node(&ln->n); - mb_free(ln); - - rt_unlock_table(src); - rt_unlock_table(dst); - } + mb_free(ln); + rt_unlock_table(dst); } -static void -rt_flowspec_notify(rtable *src, net *net) +void +rt_flowspec_unlink(rtable *src, rtable *dst) { - /* Only IP tables are src links */ - ASSERT(rt_is_ip(src)); - struct rt_flowspec_link *ln; - WALK_LIST(ln, src->flowspec_links) + RT_LOCKED(src, t) { - rtable *dst = ln->dst; - ASSERT(rt_is_flow(dst)); + ln = rt_flowspec_find_link(t, dst); - /* No need to inspect it further if recalculation is already active */ - if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY)) - continue; + ASSERT(ln && (ln->uc > 0)); - if (trie_match_net(dst->flowspec_trie, net->n.addr)) - rt_schedule_nhu(dst); + if (!--ln->uc) + rt_stop_export(&ln->req, rt_flowspec_link_stopped); } } static void -rt_flowspec_reset_trie(rtable *tab) +rt_flowspec_reset_trie(struct rtable_private *tab) { linpool *lp = tab->flowspec_trie->lp; int ipv4 = tab->flowspec_trie->ipv4; @@ -2288,14 +2715,13 @@ rt_flowspec_reset_trie(rtable *tab) static void rt_free(resource *_r) { - rtable *r = (rtable *) _r; + struct rtable_private *r = SKIP_BACK(struct rtable_private, r, _r); + + DOMAIN_FREE(rtable, r->lock); DBG("Deleting routing table %s\n", r->name); ASSERT_DIE(r->use_count == 0); - if (r->internal) - return; - r->config->table = NULL; rem_node(&r->n); @@ -2306,7 +2732,6 @@ rt_free(resource *_r) fib_free(&r->fib); hmap_free(&r->id_map); rfree(r->rt_event); - rfree(r->settle_timer); mb_free(r); */ } @@ -2314,26 +2739,44 @@ rt_free(resource *_r) static void rt_res_dump(resource *_r) { - rtable *r = (rtable *) _r; + struct rtable_private *r = SKIP_BACK(struct rtable_private, r, _r); + debug("name \"%s\", addr_type=%s, rt_count=%u, use_count=%d\n", r->name, net_label[r->addr_type], r->rt_count, r->use_count); } static struct resclass rt_class = { .name = "Routing table", - .size = sizeof(struct rtable), + .size = sizeof(rtable), .free = rt_free, .dump = rt_res_dump, .lookup = NULL, .memsize = NULL, }; +static const struct rt_exporter_class rt_table_exporter_class = { + .start = rt_table_export_start, + .stop = rt_table_export_stop, + .done = rt_table_export_done, +}; + +void +rt_exporter_init(struct rt_exporter *e) +{ + init_list(&e->hooks); +} + +static struct idm rtable_idm; +uint rtable_max_id = 0; + rtable * rt_setup(pool *pp, struct rtable_config *cf) { + ASSERT_DIE(birdloop_inside(&main_birdloop)); + pool *p = rp_newf(pp, "Routing table %s", cf->name); - rtable *t = ralloc(p, &rt_class); + struct rtable_private *t = ralloc(p, &rt_class); t->rp = p; t->rte_slab = sl_new(p, sizeof(struct rte_storage)); @@ -2341,6 +2784,11 @@ rt_setup(pool *pp, struct rtable_config *cf) t->name = cf->name; t->config = cf; t->addr_type = cf->addr_type; + t->id = idm_alloc(&rtable_idm); + if (t->id >= rtable_max_id) + rtable_max_id = t->id + 1; + + t->lock = DOMAIN_NEW(rtable, t->name); fib_init(&t->fib, p, t->addr_type, sizeof(net), OFFSETOF(net, n), 0, NULL); @@ -2352,38 +2800,48 @@ rt_setup(pool *pp, struct rtable_config *cf) t->fib.init = net_init_with_trie; } - init_list(&t->flowspec_links); + init_list(&t->imports); - t->exporter = (struct rt_exporter) { - .addr_type = t->addr_type, - .start = rt_table_export_start, - .stop = rt_table_export_stop, - .done = rt_table_export_done, - }; - init_list(&t->exporter.hooks); + hmap_init(&t->id_map, p, 1024); + hmap_set(&t->id_map, 0); - if (!(t->internal = cf->internal)) - { - init_list(&t->imports); + t->fh = (struct birdloop_flag_handler) { .hook = rt_flag_handler, }; + t->nhu_uncork_event = ev_new_init(p, rt_nhu_uncork, t); + t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0); + t->last_rt_change = t->gc_time = current_time(); - hmap_init(&t->id_map, p, 1024); - hmap_set(&t->id_map, 0); + t->export_settle = SETTLE_INIT(&cf->export_settle, rt_announce_exports, NULL); - init_list(&t->subscribers); + t->exporter = (struct rt_table_exporter) { + .e = { + .class = &rt_table_exporter_class, + .addr_type = t->addr_type, + .rp = t->rp, + }, + .next_seq = 1, + }; - t->rt_event = ev_new_init(p, rt_event, t); - t->last_rt_change = t->gc_time = current_time(); + rt_exporter_init(&t->exporter.e); - t->rl_pipe = (struct tbf) TBF_DEFAULT_LOG_LIMITS; + init_list(&t->exporter.pending); - if (rt_is_flow(t)) - { - t->flowspec_trie = f_new_trie(lp_new_default(p), 0); - t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4); - } + t->cork_threshold = cf->cork_threshold; + + t->rl_pipe = (struct tbf) TBF_DEFAULT_LOG_LIMITS; + + if (rt_is_flow(RT_PUB(t))) + { + t->flowspec_trie = f_new_trie(lp_new_default(p), 0); + t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4); } - return t; + /* Start the service thread */ + t->loop = birdloop_new(p, DOMAIN_ORDER(service), mb_sprintf(p, "Routing tahle %s", t->name)); + birdloop_enter(t->loop); + birdloop_flag_set_handler(t->loop, &t->fh); + birdloop_leave(t->loop); + + return RT_PUB(t); } /** @@ -2397,9 +2855,11 @@ rt_init(void) { rta_init(); rt_table_pool = rp_new(&root_pool, "Routing tables"); - rte_update_pool = lp_new_default(rt_table_pool); init_list(&routing_tables); init_list(&deleted_routing_tables); + ev_init_list(&rt_cork.queue, &main_birdloop, "Route cork release"); + rt_cork.run = (event) { .hook = rt_cork_release_hook }; + idm_init(&rtable_idm, rt_table_pool, 256); } @@ -2418,7 +2878,7 @@ rt_init(void) * iteration. */ static void -rt_prune_table(rtable *tab) +rt_prune_table(struct rtable_private *tab) { struct fib_iterator *fit = &tab->prune_fit; int limit = 2000; @@ -2426,7 +2886,7 @@ rt_prune_table(rtable *tab) struct rt_import_hook *ih; node *n, *x; - DBG("Pruning route table %s\n", tab->name); + rt_trace(tab, D_STATES, "Pruning"); #ifdef DEBUGGING fib_check(&tab->fib); #endif @@ -2440,10 +2900,20 @@ rt_prune_table(rtable *tab) WALK_LIST2(ih, n, tab->imports, n) if (ih->import_state == TIS_STOP) rt_set_import_state(ih, TIS_FLUSHING); + else if ((ih->stale_valid != ih->stale_pruning) && (ih->stale_pruning == ih->stale_pruned)) + { + ih->stale_pruning = ih->stale_valid; + + if (ih->req->trace_routes & D_STATES) + log(L_TRACE "%s: table prune after refresh begin [%u]", ih->req->name, ih->stale_pruning); + } FIB_ITERATE_INIT(fit, &tab->fib); tab->prune_state = 2; + tab->gc_counter = 0; + tab->gc_time = current_time(); + if (tab->prune_trie) { /* Init prefix trie pruning */ @@ -2459,30 +2929,25 @@ again: if (limit <= 0) { FIB_ITERATE_PUT(fit); - ev_schedule(tab->rt_event); + birdloop_flag(tab->loop, RTF_CLEANUP); return; } for (struct rte_storage *e=n->routes; e; e=e->next) { - if ((e->rte.sender->import_state == TIS_FLUSHING) || (e->rte.flags & REF_DISCARD)) - { - rte_discard(n, &e->rte); - limit--; - - goto rescan; - } - - if (e->rte.flags & REF_MODIFY) + struct rt_import_hook *s = e->rte.sender; + if ((s->import_state == TIS_FLUSHING) || + (e->rte.stale_cycle < s->stale_valid) || + (e->rte.stale_cycle > s->stale_set)) { - rte_modify(n, &e->rte); + rte_recalculate(tab, e->rte.sender, n, NULL, e->rte.src); limit--; goto rescan; } } - if (!n->routes) /* Orphaned FIB entry */ + if (!n->routes && !n->first) /* Orphaned FIB entry */ { FIB_ITERATE_PUT(fit); fib_delete(&tab->fib, n); @@ -2497,15 +2962,16 @@ again: } FIB_ITERATE_END; + rt_trace(tab, D_EVENTS, "Prune done, scheduling export timer"); + rt_kick_export_settle(tab); + #ifdef DEBUGGING fib_check(&tab->fib); #endif - tab->gc_counter = 0; - tab->gc_time = current_time(); - /* state change 2->0, 3->1 */ - tab->prune_state &= 1; + if (tab->prune_state &= 1) + birdloop_flag(tab->loop, RTF_CLEANUP); if (tab->trie_new) { @@ -2538,18 +3004,205 @@ again: } } - rt_prune_sources(); + uint flushed_channels = 0; /* Close flushed channels */ WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) if (ih->import_state == TIS_FLUSHING) { - rt_set_import_state(ih, TIS_CLEARED); - ih->stopped(ih->req); - rem_node(&ih->n); - mb_free(ih); - rt_unlock_table(tab); + ih->flush_seq = tab->exporter.next_seq; + rt_set_import_state(ih, TIS_WAITING); + flushed_channels++; + tab->rr_counter--; } + else if (ih->stale_pruning != ih->stale_pruned) + { + tab->rr_counter -= (ih->stale_pruned - ih->stale_pruning); + ih->stale_pruned = ih->stale_pruning; + if (ih->req->trace_routes & D_STATES) + log(L_TRACE "%s: table prune after refresh end [%u]", ih->req->name, ih->stale_pruned); + } + + /* In some cases, we may want to directly proceed to export cleanup */ + if (EMPTY_LIST(tab->exporter.e.hooks) && flushed_channels) + rt_export_cleanup(tab); +} + +static void +rt_export_cleanup(struct rtable_private *tab) +{ + tab->export_used = 0; + + u64 min_seq = ~((u64) 0); + struct rt_pending_export *last_export_to_free = NULL; + struct rt_pending_export *first = tab->exporter.first; + int want_prune = 0; + + struct rt_table_export_hook *eh; + node *n; + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) + { + switch (atomic_load_explicit(&eh->h.export_state, memory_order_acquire)) + { + case TES_DOWN: + case TES_HUNGRY: + continue; + + case TES_READY: + { + struct rt_pending_export *last = atomic_load_explicit(&eh->last_export, memory_order_acquire); + if (!last) + /* No last export means that the channel has exported nothing since last cleanup */ + goto done; + + else if (min_seq > last->seq) + { + min_seq = last->seq; + last_export_to_free = last; + } + continue; + } + + default: + /* It's only safe to cleanup when the export state is idle or regular. No feeding or stopping allowed. */ + goto done; + } + } + + tab->exporter.first = last_export_to_free ? rt_next_export_fast(last_export_to_free) : NULL; + + rt_trace(tab, D_STATES, "Export cleanup, old exporter.first seq %lu, new %lu, min_seq %ld", + first ? first->seq : 0, + tab->exporter.first ? tab->exporter.first->seq : 0, + min_seq); + + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) + { + if (atomic_load_explicit(&eh->h.export_state, memory_order_acquire) != TES_READY) + continue; + + struct rt_pending_export *last = atomic_load_explicit(&eh->last_export, memory_order_acquire); + if (last == last_export_to_free) + { + /* This may fail when the channel managed to export more inbetween. This is OK. */ + atomic_compare_exchange_strong_explicit( + &eh->last_export, &last, NULL, + memory_order_release, + memory_order_relaxed); + + DBG("store hook=%p last_export=NULL\n", eh); + } + } + + while (first && (first->seq <= min_seq)) + { + ASSERT_DIE(first->new || first->old); + + const net_addr *n = first->new ? + first->new->rte.net : + first->old->rte.net; + net *net = SKIP_BACK(struct network, n.addr, (net_addr (*)[0]) n); + + ASSERT_DIE(net->first == first); + + if (first == net->last) + /* The only export here */ + net->last = net->first = NULL; + else + /* First is now the next one */ + net->first = atomic_load_explicit(&first->next, memory_order_relaxed); + + want_prune += !net->routes && !net->first; + + /* For now, the old route may be finally freed */ + if (first->old) + { + rt_rte_trace_in(D_ROUTES, first->old->rte.sender->req, &first->old->rte, "freed"); + hmap_clear(&tab->id_map, first->old->rte.id); + rte_free(first->old); + } + +#ifdef LOCAL_DEBUG + memset(first, 0xbd, sizeof(struct rt_pending_export)); +#endif + + struct rt_export_block *reb = HEAD(tab->exporter.pending); + ASSERT_DIE(reb == PAGE_HEAD(first)); + + u32 pos = (first - &reb->export[0]); + u32 end = atomic_load_explicit(&reb->end, memory_order_relaxed); + ASSERT_DIE(pos < end); + + struct rt_pending_export *next = NULL; + + if (++pos < end) + next = &reb->export[pos]; + else + { + rem_node(&reb->n); + +#ifdef LOCAL_DEBUG + memset(reb, 0xbe, page_size); +#endif + + free_page(reb); + + if (EMPTY_LIST(tab->exporter.pending)) + { + rt_trace(tab, D_EVENTS, "Resetting export seq"); + + node *n; + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) + { + if (atomic_load_explicit(&eh->h.export_state, memory_order_acquire) != TES_READY) + continue; + + ASSERT_DIE(atomic_load_explicit(&eh->last_export, memory_order_acquire) == NULL); + bmap_reset(&eh->h.seq_map, 1024); + } + + tab->exporter.next_seq = 1; + } + else + { + reb = HEAD(tab->exporter.pending); + next = &reb->export[0]; + } + } + + first = next; + } + + rt_check_cork_low(tab); + +done:; + struct rt_import_hook *ih; node *x; + WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) + if (ih->import_state == TIS_WAITING) + if (!first || (first->seq >= ih->flush_seq)) + { + ih->import_state = TIS_CLEARED; + ev_send(ih->req->list, &ih->announce_event); + } + + if ((tab->gc_counter += want_prune) >= tab->config->gc_threshold) + rt_kick_prune_timer(tab); + + if (tab->export_used) + birdloop_flag(tab->loop, RTF_CLEANUP); + + if (EMPTY_LIST(tab->exporter.pending)) + settle_cancel(&tab->export_settle); +} + +static void +rt_cork_release_hook(void *data UNUSED) +{ + do synchronize_rcu(); + while ( + !atomic_load_explicit(&rt_cork.active, memory_order_acquire) && + ev_run_list(&rt_cork.queue) + ); } /** @@ -2566,7 +3219,7 @@ again: * */ struct f_trie * -rt_lock_trie(rtable *tab) +rt_lock_trie(struct rtable_private *tab) { ASSERT(tab->trie); @@ -2583,7 +3236,7 @@ rt_lock_trie(rtable *tab) * It may free the trie and schedule next trie pruning. */ void -rt_unlock_trie(rtable *tab, struct f_trie *trie) +rt_unlock_trie(struct rtable_private *tab, struct f_trie *trie) { ASSERT(trie); @@ -2623,8 +3276,29 @@ rt_preconfig(struct config *c) { init_list(&c->tables); - rt_new_table(cf_get_symbol("master4"), NET_IP4); - rt_new_table(cf_get_symbol("master6"), NET_IP6); + c->def_tables[NET_IP4] = cf_define_symbol(cf_get_symbol("master4"), SYM_TABLE, table, NULL); + c->def_tables[NET_IP6] = cf_define_symbol(cf_get_symbol("master6"), SYM_TABLE, table, NULL); +} + +void +rt_postconfig(struct config *c) +{ + uint num_tables = list_length(&c->tables); + btime def_gc_period = 400 MS * num_tables; + def_gc_period = MAX(def_gc_period, 10 S); + def_gc_period = MIN(def_gc_period, 600 S); + + struct rtable_config *rc; + WALK_LIST(rc, c->tables) + if (rc->gc_period == (uint) -1) + rc->gc_period = (uint) def_gc_period; + + for (uint net_type = 0; net_type < NET_MAX; net_type++) + if (c->def_tables[net_type] && !c->def_tables[net_type]->table) + { + c->def_tables[net_type]->class = SYM_VOID; + c->def_tables[net_type] = NULL; + } } @@ -2634,7 +3308,7 @@ rt_preconfig(struct config *c) */ void -ea_set_hostentry(ea_list **to, struct rtable *dep, struct rtable *tab, ip_addr gw, ip_addr ll, u32 lnum, u32 labels[lnum]) +ea_set_hostentry(ea_list **to, rtable *dep, rtable *src, ip_addr gw, ip_addr ll, u32 lnum, u32 labels[lnum]) { struct { struct adata ad; @@ -2642,7 +3316,8 @@ ea_set_hostentry(ea_list **to, struct rtable *dep, struct rtable *tab, ip_addr g u32 labels[lnum]; } *head = (void *) tmp_alloc_adata(sizeof *head - sizeof(struct adata)); - head->he = rt_get_hostentry(tab, gw, ll, dep); + RT_LOCKED(src, tab) + head->he = rt_get_hostentry(tab, gw, ll, dep); memcpy(head->labels, labels, lnum * sizeof(u32)); ea_set_attr(to, EA_LITERAL_DIRECT_ADATA( @@ -2771,17 +3446,16 @@ rta_next_hop_outdated(ea_list *a) ? head : NULL; } -static inline struct rte_storage * -rt_next_hop_update_rte(rtable *tab, net *n, rte *old) +static inline int +rt_next_hop_update_rte(rte *old, rte *new) { struct hostentry_adata *head = rta_next_hop_outdated(old->attrs); if (!head) - return NULL; - - rte e0 = *old; - rta_apply_hostentry(&e0.attrs, head); + return 0; - return rte_store(&e0, n, tab); + *new = *old; + rta_apply_hostentry(&new->attrs, head); + return 1; } static inline void @@ -2839,7 +3513,6 @@ rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, ea_list * { ASSERT(rt_is_ip(tab_ip)); ASSERT(rt_is_flow(tab_flow)); - ASSERT(tab_ip->trie); /* RFC 8955 6. a) Flowspec has defined dst prefix */ if (!net_flow_has_dst_prefix(n)) @@ -2859,32 +3532,45 @@ rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, ea_list * else net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n)); - /* Find best-match BGP unicast route for flowspec dst prefix */ - net *nb = net_route(tab_ip, &dst); - const rte *rb = nb ? &nb->routes->rte : NULL; + rte rb = {}; + net_addr_union nau; + RT_LOCKED(tab_ip, tip) + { + ASSERT(tip->trie); + /* Find best-match BGP unicast route for flowspec dst prefix */ + net *nb = net_route(tip, &dst); + if (nb) + { + rb = RTE_COPY_VALID(nb->routes); + rta_clone(rb.attrs); + net_copy(&nau.n, nb->n.addr); + rb.net = &nau.n; + } + } /* Register prefix to trie for tracking further changes */ int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH; - trie_add_prefix(tab_flow->flowspec_trie, &dst, (nb ? nb->n.addr->pxlen : 0), max_pxlen); + RT_LOCKED(tab_flow, tfl) + trie_add_prefix(tfl->flowspec_trie, &dst, (rb.net ? rb.net->pxlen : 0), max_pxlen); /* No best-match BGP route -> no flowspec */ - if (!rb || (rt_get_source_attr(rb) != RTS_BGP)) + if (!rb.attrs || (rt_get_source_attr(&rb) != RTS_BGP)) return FLOWSPEC_INVALID; /* Find ORIGINATOR_ID values */ u32 orig_a = ea_get_int(a, "bgp_originator_id", 0); - u32 orig_b = ea_get_int(rb->attrs, "bgp_originator_id", 0); + u32 orig_b = ea_get_int(rb.attrs, "bgp_originator_id", 0); /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */ if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal( ea_get_ip(a, &ea_gen_from, IPA_NONE), - ea_get_ip(rb->attrs, &ea_gen_from, IPA_NONE) + ea_get_ip(rb.attrs, &ea_gen_from, IPA_NONE) ))) return FLOWSPEC_INVALID; /* Find ASN of the best-match route, for use in next checks */ - u32 asn_b = rta_get_first_asn(rb->attrs); + u32 asn_b = rta_get_first_asn(rb.attrs); if (!asn_b) return FLOWSPEC_INVALID; @@ -2893,51 +3579,53 @@ rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, ea_list * return FLOWSPEC_INVALID; /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */ - TRIE_WALK(tab_ip->trie, subnet, &dst) + RT_LOCKED(tab_ip, tip) { - net *nc = net_find_valid(tab_ip, &subnet); - if (!nc) - continue; + TRIE_WALK(tip->trie, subnet, &dst) + { + net *nc = net_find_valid(tip, &subnet); + if (!nc) + continue; - const rte *rc = &nc->routes->rte; - if (rt_get_source_attr(rc) != RTS_BGP) - return FLOWSPEC_INVALID; + const rte *rc = &nc->routes->rte; + if (rt_get_source_attr(rc) != RTS_BGP) + RT_RETURN(tip, FLOWSPEC_INVALID); - if (rta_get_first_asn(rc->attrs) != asn_b) - return FLOWSPEC_INVALID; + if (rta_get_first_asn(rc->attrs) != asn_b) + RT_RETURN(tip, FLOWSPEC_INVALID); + } + TRIE_WALK_END; } - TRIE_WALK_END; return FLOWSPEC_VALID; } #endif /* CONFIG_BGP */ -static struct rte_storage * -rt_flowspec_update_rte(rtable *tab, net *n, rte *r) +static int +rt_flowspec_update_rte(rtable *tab, rte *r, rte *new) { #ifdef CONFIG_BGP if (r->generation || (rt_get_source_attr(r) != RTS_BGP)) - return NULL; + return 0; struct bgp_channel *bc = (struct bgp_channel *) SKIP_BACK(struct channel, in_req, r->sender->req); if (!bc->base_table) - return NULL; + return 0; struct bgp_proto *p = SKIP_BACK(struct bgp_proto, p, bc->c.proto); enum flowspec_valid old = rt_get_flowspec_valid(r), - valid = rt_flowspec_check(bc->base_table, tab, n->n.addr, r->attrs, p->is_interior); + valid = rt_flowspec_check(bc->base_table, tab, r->net, r->attrs, p->is_interior); if (old == valid) - return NULL; - - rte new = *r; - ea_set_attr_u32(&new.attrs, &ea_gen_flowspec_valid, 0, valid); + return 0; - return rte_store(&new, n, tab); + *new = *r; + ea_set_attr_u32(&new->attrs, &ea_gen_flowspec_valid, 0, valid); + return 1; #else - return NULL; + return 0; #endif } @@ -2972,10 +3660,9 @@ rt_flowspec_resolve_rte(rte *r, struct channel *c) } static inline int -rt_next_hop_update_net(rtable *tab, net *n) +rt_next_hop_update_net(struct rtable_private *tab, net *n) { - struct rte_storage *new; - int count = 0; + uint count = 0; int is_flow = net_is_flow(n->n.addr); struct rte_storage *old_best = n->routes; @@ -2983,44 +3670,90 @@ rt_next_hop_update_net(rtable *tab, net *n) return 0; for (struct rte_storage *e, **k = &n->routes; e = *k; k = &e->next) - if (is_flow || rta_next_hop_outdated(e->rte.attrs)) - count++; + count++; if (!count) return 0; struct rte_multiupdate { - struct rte_storage *old, *new; - } *updates = alloca(sizeof(struct rte_multiupdate) * count); + struct rte_storage *old, *new_stored; + rte new; + } *updates = tmp_allocz(sizeof(struct rte_multiupdate) * (count+1)); + + struct rt_pending_export *last_pending = n->last; - int pos = 0; + uint pos = 0; for (struct rte_storage *e, **k = &n->routes; e = *k; k = &e->next) - if (is_flow || rta_next_hop_outdated(e->rte.attrs)) - { - struct rte_storage *new = is_flow - ? rt_flowspec_update_rte(tab, n, &e->rte) - : rt_next_hop_update_rte(tab, n, &e->rte); + updates[pos++].old = e; + + /* This is an exceptional place where table can be unlocked while keeping its data: + * the reason why this is safe is that NHU must be always run from the same + * thread as cleanup routines, therefore the only real problem may arise when + * some importer does a change on this particular net (destination) while NHU + * is being computed. Statistically, this should almost never happen. In such + * case, we just drop all the computed changes and do it once again. + * */ + RT_UNLOCK(tab); + + uint mod = 0; + if (is_flow) + for (uint i = 0; i < pos; i++) + mod += rt_flowspec_update_rte(RT_PUB(tab), &updates[i].old->rte, &updates[i].new); - if (!new) - continue; + else + for (uint i = 0; i < pos; i++) + mod += rt_next_hop_update_rte(&updates[i].old->rte, &updates[i].new); - /* Call a pre-comparison hook */ - /* Not really an efficient way to compute this */ - if (e->rte.src->proto->rte_recalculate) - e->rte.src->proto->rte_recalculate(tab, n, &new->rte, &e->rte, &old_best->rte); + RT_LOCK(RT_PUB(tab)); - updates[pos++] = (struct rte_multiupdate) { - .old = e, - .new = new, - }; + if (!mod) + return 0; + + /* Something has changed inbetween, retry NHU. */ + if (last_pending != n->last) + return rt_next_hop_update_net(tab, n); + + /* Now we reconstruct the original linked list */ + struct rte_storage **nptr = &n->routes; + for (uint i = 0; i < pos; i++) + { + updates[i].old->next = NULL; - /* Replace the route in the list */ - new->next = e->next; - *k = e = new; + struct rte_storage *put; + if (updates[i].new.attrs) + put = updates[i].new_stored = rte_store(&updates[i].new, n, tab); + else + put = updates[i].old; + + *nptr = put; + nptr = &put->next; + } + *nptr = NULL; + + /* Call the pre-comparison hooks */ + for (uint i = 0; i < pos; i++) + if (updates[i].new_stored) + { + /* Get a new ID for the route */ + updates[i].new_stored->rte.lastmod = current_time(); + updates[i].new_stored->rte.id = hmap_first_zero(&tab->id_map); + hmap_set(&tab->id_map, updates[i].new_stored->rte.id); + + /* Call a pre-comparison hook */ + /* Not really an efficient way to compute this */ + if (updates[i].old->rte.src->owner->rte_recalculate) + updates[i].old->rte.src->owner->rte_recalculate(tab, n, &updates[i].new_stored->rte, &updates[i].old->rte, &old_best->rte); } - ASSERT_DIE(pos <= count); - count = pos; +#if DEBUGGING + { + uint t = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + t++; + ASSERT_DIE(t == pos); + ASSERT_DIE(pos == count); + } +#endif /* Find the new best route */ struct rte_storage **new_best = NULL; @@ -3031,7 +3764,7 @@ rt_next_hop_update_net(rtable *tab, net *n) } /* Relink the new best route to the first position */ - new = *new_best; + struct rte_storage *new = *new_best; if (new != n->routes) { *new_best = new->next; @@ -3039,86 +3772,166 @@ rt_next_hop_update_net(rtable *tab, net *n) n->routes = new; } + uint total = 0; /* Announce the changes */ - for (int i=0; i<count; i++) + for (uint i=0; i<count; i++) { - _Bool nb = (new == updates[i].new), ob = (old_best == updates[i].old); - const char *best_indicator[2][2] = { { "updated", "updated [-best]" }, { "updated [+best]", "updated [best]" } }; - rt_rte_trace_in(D_ROUTES, updates[i].new->rte.sender->req, &updates[i].new->rte, best_indicator[nb][ob]); - rte_announce_i(tab, n, updates[i].new, updates[i].old, new, old_best); + if (!updates[i].new_stored) + continue; + + _Bool nb = (new->rte.src == updates[i].new.src), ob = (i == 0); + const char *best_indicator[2][2] = { + { "autoupdated", "autoupdated [-best]" }, + { "autoupdated [+best]", "autoupdated [best]" } + }; + rt_rte_trace_in(D_ROUTES, updates[i].new.sender->req, &updates[i].new, best_indicator[nb][ob]); + rte_announce(tab, n, updates[i].new_stored, updates[i].old, new, old_best); + + total++; } - for (int i=0; i<count; i++) - rte_free(updates[i].old); + return total; +} - return count; +static void +rt_nhu_uncork(void *_tab) +{ + RT_LOCKED((rtable *) _tab, tab) + { + ASSERT_DIE(tab->nhu_corked); + ASSERT_DIE(tab->nhu_state == 0); + + /* Reset the state */ + tab->nhu_state = tab->nhu_corked; + tab->nhu_corked = 0; + rt_trace(tab, D_STATES, "Next hop updater uncorked"); + + birdloop_flag(tab->loop, RTF_NHU); + } } static void -rt_next_hop_update(rtable *tab) +rt_next_hop_update(struct rtable_private *tab) { - struct fib_iterator *fit = &tab->nhu_fit; - int max_feed = 32; + ASSERT_DIE(birdloop_inside(tab->loop)); + + if (tab->nhu_corked) + return; + + if (!tab->nhu_state) + return; - if (tab->nhu_state == NHU_CLEAN) + /* Check corkedness */ + if (rt_cork_check(tab->nhu_uncork_event)) + { + rt_trace(tab, D_STATES, "Next hop updater corked"); + if ((tab->nhu_state & NHU_RUNNING) + && !EMPTY_LIST(tab->exporter.pending)) + rt_kick_export_settle(tab); + + tab->nhu_corked = tab->nhu_state; + tab->nhu_state = 0; return; + } + struct fib_iterator *fit = &tab->nhu_fit; + int max_feed = 32; + + /* Initialize a new run */ if (tab->nhu_state == NHU_SCHEDULED) - { - FIB_ITERATE_INIT(fit, &tab->fib); - tab->nhu_state = NHU_RUNNING; + { + FIB_ITERATE_INIT(fit, &tab->fib); + tab->nhu_state = NHU_RUNNING; - if (tab->flowspec_trie) - rt_flowspec_reset_trie(tab); - } + if (tab->flowspec_trie) + rt_flowspec_reset_trie(tab); + } + /* Walk the fib one net after another */ FIB_ITERATE_START(&tab->fib, fit, net, n) { if (max_feed <= 0) { FIB_ITERATE_PUT(fit); - ev_schedule(tab->rt_event); + birdloop_flag(tab->loop, RTF_NHU); return; } + lp_state lps; + lp_save(tmp_linpool, &lps); max_feed -= rt_next_hop_update_net(tab, n); + lp_restore(tmp_linpool, &lps); } FIB_ITERATE_END; + /* Finished NHU, cleanup */ + rt_trace(tab, D_EVENTS, "NHU done, scheduling export timer"); + rt_kick_export_settle(tab); + /* State change: * NHU_DIRTY -> NHU_SCHEDULED * NHU_RUNNING -> NHU_CLEAN */ - tab->nhu_state &= 1; + if ((tab->nhu_state &= NHU_SCHEDULED) == NHU_SCHEDULED) + birdloop_flag(tab->loop, RTF_NHU); +} + +void +rt_new_default_table(struct symbol *s) +{ + for (uint addr_type = 0; addr_type < NET_MAX; addr_type++) + if (s == new_config->def_tables[addr_type]) + { + s->table = rt_new_table(s, addr_type); + return; + } - if (tab->nhu_state != NHU_CLEAN) - ev_schedule(tab->rt_event); + bug("Requested an unknown new default table: %s", s->name); } +struct rtable_config * +rt_get_default_table(struct config *cf, uint addr_type) +{ + struct symbol *ts = cf->def_tables[addr_type]; + if (!ts) + return NULL; + + if (!ts->table) + rt_new_default_table(ts); + + return ts->table; +} struct rtable_config * rt_new_table(struct symbol *s, uint addr_type) { - /* Hack that allows to 'redefine' the master table */ - if ((s->class == SYM_TABLE) && - (s->table == new_config->def_tables[addr_type]) && - ((addr_type == NET_IP4) || (addr_type == NET_IP6))) - return s->table; - struct rtable_config *c = cfg_allocz(sizeof(struct rtable_config)); - cf_define_symbol(s, SYM_TABLE, table, c); + if (s == new_config->def_tables[addr_type]) + s->table = c; + else + cf_define_symbol(s, SYM_TABLE, table, c); + c->name = s->name; c->addr_type = addr_type; - c->gc_max_ops = 1000; - c->gc_min_time = 5; - c->min_settle_time = 1 S; - c->max_settle_time = 20 S; + c->gc_threshold = 1000; + c->gc_period = (uint) -1; /* set in rt_postconfig() */ + c->cork_threshold.low = 128; + c->cork_threshold.high = 512; + c->export_settle = (struct settle_config) { + .min = 1 MS, + .max = 100 MS, + }; + c->export_rr_settle = (struct settle_config) { + .min = 100 MS, + .max = 3 S, + }; + c->debug = new_config->table_debug; add_tail(&new_config->tables, &c->n); /* First table of each type is kept as default */ if (! new_config->def_tables[addr_type]) - new_config->def_tables[addr_type] = c; + new_config->def_tables[addr_type] = s; return c; } @@ -3132,8 +3945,9 @@ rt_new_table(struct symbol *s, uint addr_type) * configuration. */ void -rt_lock_table(rtable *r) +rt_lock_table_priv(struct rtable_private *r, const char *file, uint line) { + rt_trace(r, D_STATES, "Locked at %s:%d", file, line); r->use_count++; } @@ -3146,20 +3960,71 @@ rt_lock_table(rtable *r) * for deletion by configuration changes. */ void -rt_unlock_table(rtable *r) +rt_unlock_table_priv(struct rtable_private *r, const char *file, uint line) { + rt_trace(r, D_STATES, "Unlocked at %s:%d", file, line); if (!--r->use_count && r->deleted) - { - struct config *conf = r->deleted; + /* Stop the service thread to finish this up */ + ev_send(&global_event_list, ev_new_init(r->rp, rt_shutdown, r)); +} - /* Delete the routing table by freeing its pool */ - rt_shutdown(r); - config_del_obstacle(conf); - } +static void +rt_shutdown(void *tab_) +{ + struct rtable_private *r = tab_; + birdloop_stop(r->loop, rt_delete, r); } +static void +rt_delete(void *tab_) +{ + birdloop_enter(&main_birdloop); + + /* We assume that nobody holds the table reference now as use_count is zero. + * Anyway the last holder may still hold the lock. Therefore we lock and + * unlock it the last time to be sure that nobody is there. */ + struct rtable_private *tab = RT_LOCK((rtable *) tab_); + struct config *conf = tab->deleted; + + RT_UNLOCK(RT_PUB(tab)); + + rfree(tab->rp); + config_del_obstacle(conf); + + birdloop_leave(&main_birdloop); +} + + +static void +rt_check_cork_low(struct rtable_private *tab) +{ + if (!tab->cork_active) + return; + + if (!tab->exporter.first || (tab->exporter.first->seq + tab->cork_threshold.low > tab->exporter.next_seq)) + { + tab->cork_active = 0; + rt_cork_release(); + + rt_trace(tab, D_STATES, "Uncorked"); + } +} + +static void +rt_check_cork_high(struct rtable_private *tab) +{ + if (!tab->cork_active && tab->exporter.first && (tab->exporter.first->seq + tab->cork_threshold.high <= tab->exporter.next_seq)) + { + tab->cork_active = 1; + rt_cork_acquire(); + + rt_trace(tab, D_STATES, "Corked"); + } +} + + static int -rt_reconfigure(rtable *tab, struct rtable_config *new, struct rtable_config *old) +rt_reconfigure(struct rtable_private *tab, struct rtable_config *new, struct rtable_config *old) { if ((new->addr_type != old->addr_type) || (new->sorted != old->sorted) || @@ -3167,10 +4032,26 @@ rt_reconfigure(rtable *tab, struct rtable_config *new, struct rtable_config *old return 0; DBG("\t%s: same\n", new->name); - new->table = tab; + new->table = RT_PUB(tab); tab->name = new->name; tab->config = new; + if (tab->hostcache) + tab->hostcache->req.trace_routes = new->debug; + + struct rt_table_export_hook *hook; node *n; + WALK_LIST2(hook, n, tab->exporter.e.hooks, h.n) + if (hook->h.req->export_one == rt_flowspec_export_one) + hook->h.req->trace_routes = new->debug; + + tab->cork_threshold = new->cork_threshold; + + if (new->cork_threshold.high != old->cork_threshold.high) + rt_check_cork_high(tab); + + if (new->cork_threshold.low != old->cork_threshold.low) + rt_check_cork_low(tab); + return 1; } @@ -3203,19 +4084,36 @@ rt_commit(struct config *new, struct config *old) { WALK_LIST(o, old->tables) { - rtable *tab = o->table; + struct rtable_private *tab = RT_LOCK(o->table); + if (tab->deleted) + { + RT_UNLOCK(tab); continue; + } r = rt_find_table_config(new, o->name); if (r && !new->shutdown && rt_reconfigure(tab, r, o)) + { + RT_UNLOCK(tab); continue; + } DBG("\t%s: deleted\n", o->name); tab->deleted = old; config_add_obstacle(old); rt_lock_table(tab); + + if (tab->hostcache) + { + rt_stop_export(&tab->hostcache->req, NULL); + if (ev_get_list(&tab->hostcache->update) == &rt_cork.queue) + ev_postpone(&tab->hostcache->update); + } + rt_unlock_table(tab); + + RT_UNLOCK(tab); } } @@ -3229,6 +4127,84 @@ rt_commit(struct config *new, struct config *old) DBG("\tdone\n"); } +static void +rt_feed_done(struct rt_export_hook *c) +{ + c->event.hook = rt_export_hook; + + rt_set_export_state(c, TES_READY); + + rt_send_export_event(c); +} + +#define MAX_FEED_BLOCK 1024 +typedef struct { + uint cnt, pos; + union { + struct rt_pending_export *rpe; + struct { + rte **feed; + uint *start; + }; + }; +} rt_feed_block; + +static int +rt_prepare_feed(struct rt_table_export_hook *c, net *n, rt_feed_block *b) +{ + if (n->routes) + { + if (c->h.req->export_bulk) + { + uint cnt = rte_feed_count(n); + if (b->cnt && (b->cnt + cnt > MAX_FEED_BLOCK)) + return 0; + + if (!b->cnt) + { + b->feed = tmp_alloc(sizeof(rte *) * MAX(MAX_FEED_BLOCK, cnt)); + b->start = tmp_alloc(sizeof(uint) * ((cnt >= MAX_FEED_BLOCK) ? 2 : (MAX_FEED_BLOCK + 2 - cnt))); + } + + rte_feed_obtain(n, &b->feed[b->cnt], cnt); + b->start[b->pos++] = b->cnt; + b->cnt += cnt; + } + else if (b->pos == MAX_FEED_BLOCK) + return 0; + else + { + if (!b->pos) + b->rpe = tmp_alloc(sizeof(struct rt_pending_export) * MAX_FEED_BLOCK); + + b->rpe[b->pos++] = (struct rt_pending_export) { .new = n->routes, .new_best = n->routes }; + } + } + + rpe_mark_seen_all(&c->h, n->first, NULL); + return 1; +} + +static void +rt_process_feed(struct rt_table_export_hook *c, rt_feed_block *b) +{ + if (!b->pos) + return; + + if (c->h.req->export_bulk) + { + b->start[b->pos] = b->cnt; + for (uint p = 0; p < b->pos; p++) + { + rte **feed = &b->feed[b->start[p]]; + c->h.req->export_bulk(c->h.req, feed[0]->net, NULL, feed, b->start[p+1] - b->start[p]); + } + } + else + for (uint p = 0; p < b->pos; p++) + c->h.req->export_one(c->h.req, b->rpe[p].new->rte.net, &b->rpe[p]); +} + /** * rt_feed_by_fib - advertise all routes to a channel by walking a fib * @c: channel to be fed @@ -3241,59 +4217,67 @@ rt_commit(struct config *new, struct config *old) static void rt_feed_by_fib(void *data) { - struct rt_export_hook *c = data; - + struct rt_table_export_hook *c = data; struct fib_iterator *fit = &c->feed_fit; - int max_feed = 256; + rt_feed_block block = {}; - ASSERT(c->export_state == TES_FEEDING); + ASSERT(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); - rtable *tab = SKIP_BACK(rtable, exporter, c->table); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { FIB_ITERATE_START(&tab->fib, fit, net, n) { - if (max_feed <= 0) + if ((c->h.req->addr_mode == TE_ADDR_NONE) || net_in_netX(n->n.addr, c->h.req->addr)) + { + if (!rt_prepare_feed(c, n, &block)) { FIB_ITERATE_PUT(fit); - ev_schedule_work(c->event); + RT_UNLOCK(tab); + rt_process_feed(c, &block); + rt_send_export_event(&c->h); return; } - - ASSERT(c->export_state == TES_FEEDING); - - if ((c->req->addr_mode == TE_ADDR_NONE) || net_in_netX(n->n.addr, c->req->addr)) - max_feed -= rt_feed_net(c, n); + } } FIB_ITERATE_END; + } - rt_set_export_state(c, TES_READY); + rt_process_feed(c, &block); + rt_feed_done(&c->h); } static void rt_feed_by_trie(void *data) { - struct rt_export_hook *c = data; - rtable *tab = SKIP_BACK(rtable, exporter, c->table); + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; + + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { ASSERT_DIE(c->walk_state); struct f_trie_walk_state *ws = c->walk_state; - int max_feed = 256; + ASSERT(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); - ASSERT_DIE(c->export_state == TES_FEEDING); + do { + if (!c->walk_last.type) + continue; - net_addr addr; - while (trie_walk_next(ws, &addr)) - { - net *n = net_find(tab, &addr); + net *n = net_find(tab, &c->walk_last); if (!n) continue; - if ((max_feed -= rt_feed_net(c, n)) <= 0) + if (!rt_prepare_feed(c, n, &block)) + { + RT_UNLOCK(tab); + rt_process_feed(c, &block); + rt_send_export_event(&c->h); return; - - ASSERT_DIE(c->export_state == TES_FEEDING); + } } + while (trie_walk_next(ws, &c->walk_last)); rt_unlock_trie(tab, c->walk_lock); c->walk_lock = NULL; @@ -3301,69 +4285,56 @@ rt_feed_by_trie(void *data) mb_free(c->walk_state); c->walk_state = NULL; - rt_set_export_state(c, TES_READY); + c->walk_last.type = 0; + + } + + rt_process_feed(c, &block); + rt_feed_done(&c->h); } static void rt_feed_equal(void *data) { - struct rt_export_hook *c = data; - rtable *tab = SKIP_BACK(rtable, exporter, c->table); + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; + net *n; - ASSERT_DIE(c->export_state == TES_FEEDING); - ASSERT_DIE(c->req->addr_mode == TE_ADDR_EQUAL); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); + ASSERT_DIE(c->h.req->addr_mode == TE_ADDR_EQUAL); + + if (n = net_find(tab, c->h.req->addr)) + ASSERT_DIE(rt_prepare_feed(c, n, &block)); + } - net *n = net_find(tab, c->req->addr); if (n) - rt_feed_net(c, n); + rt_process_feed(c, &block); - rt_set_export_state(c, TES_READY); + rt_feed_done(&c->h); } static void rt_feed_for(void *data) { - struct rt_export_hook *c = data; - rtable *tab = SKIP_BACK(rtable, exporter, c->table); + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; + net *n; - ASSERT_DIE(c->export_state == TES_FEEDING); - ASSERT_DIE(c->req->addr_mode == TE_ADDR_FOR); - - net *n = net_route(tab, c->req->addr); - if (n) - rt_feed_net(c, n); - - rt_set_export_state(c, TES_READY); -} + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); + ASSERT_DIE(c->h.req->addr_mode == TE_ADDR_FOR); -static uint -rt_feed_net(struct rt_export_hook *c, net *n) -{ - if (c->req->export_bulk) - { - uint count = rte_feed_count(n); - if (count) - { - rte_update_lock(); - rte **feed = alloca(count * sizeof(rte *)); - rte_feed_obtain(n, feed, count); - struct rt_pending_export rpe = { .new_best = n->routes }; - c->req->export_bulk(c->req, n->n.addr, &rpe, feed, count); - rte_update_unlock(); - } - return count; - } + if (n = net_route(tab, c->h.req->addr)) + ASSERT_DIE(rt_prepare_feed(c, n, &block)); + } - if (n->routes && rte_is_valid(&n->routes->rte)) - { - rte_update_lock(); - struct rt_pending_export rpe = { .new = n->routes, .new_best = n->routes }; - c->req->export_one(c->req, n->n.addr, &rpe); - rte_update_unlock(); - return 1; - } + if (n) + rt_process_feed(c, &block); - return 0; + rt_feed_done(&c->h); } @@ -3371,7 +4342,6 @@ rt_feed_net(struct rt_export_hook *c, net *n) * Import table */ - void channel_reload_export_bulk(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, reload_req, req); @@ -3493,7 +4463,56 @@ hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he) } static void -rt_init_hostcache(rtable *tab) +hc_notify_dump_req(struct rt_export_request *req) +{ + debug(" Table %s (%p)\n", req->name, req); +} + +static void +hc_notify_log_state_change(struct rt_export_request *req, u8 state) +{ + struct hostcache *hc = SKIP_BACK(struct hostcache, req, req); + rt_trace((rtable *) hc->update.data, D_STATES, "HCU Export state changed to %s", rt_export_state_name(state)); +} + +static void +hc_notify_export_one(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) +{ + struct hostcache *hc = SKIP_BACK(struct hostcache, req, req); + + /* No interest in this update, mark seen only */ + int interested = 1; + RT_LOCKED((rtable *) hc->update.data, tab) + if (ev_active(&hc->update) || !trie_match_net(hc->trie, net)) + { + rpe_mark_seen_all(req->hook, first, NULL); + interested = 0; + } + + if (!interested) + return; + + /* This net may affect some hostentries, check the actual change */ + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; + + RPE_WALK(first, rpe, NULL) + { + rpe_mark_seen(req->hook, rpe); + new_best = rpe->new_best; + } + + /* Yes, something has actually changed. Do the hostcache update. */ + if (o != RTE_VALID_OR_NULL(new_best)) + RT_LOCKED((rtable *) hc->update.data, tab) + if ((atomic_load_explicit(&req->hook->export_state, memory_order_acquire) == TES_READY) + && !ev_active(&hc->update)) + ev_send_loop(tab->loop, &hc->update); +} + + +static void +rt_init_hostcache(struct rtable_private *tab) { struct hostcache *hc = mb_allocz(tab->rp, sizeof(struct hostcache)); init_list(&hc->hostentries); @@ -3505,11 +4524,27 @@ rt_init_hostcache(rtable *tab) hc->lp = lp_new(tab->rp); hc->trie = f_new_trie(hc->lp, 0); + hc->update = (event) { + .hook = rt_update_hostcache, + .data = tab, + }; + + hc->req = (struct rt_export_request) { + .name = mb_sprintf(tab->rp, "%s.hcu.notifier", tab->name), + .list = &global_work_list, + .trace_routes = tab->config->debug, + .dump_req = hc_notify_dump_req, + .log_state_change = hc_notify_log_state_change, + .export_one = hc_notify_export_one, + }; + + rt_table_export_start_locked(tab, &hc->req); + tab->hostcache = hc; } static void -rt_free_hostcache(rtable *tab) +rt_free_hostcache(struct rtable_private *tab) { struct hostcache *hc = tab->hostcache; @@ -3531,16 +4566,6 @@ rt_free_hostcache(rtable *tab) */ } -static void -rt_notify_hostcache(rtable *tab, net *net) -{ - if (tab->hcu_scheduled) - return; - - if (trie_match_net(tab->hostcache->trie, net->n.addr)) - rt_schedule_hcu(tab); -} - static int if_local_addr(ip_addr a, struct iface *i) { @@ -3564,14 +4589,14 @@ rt_get_igp_metric(const rte *rt) if (rt_get_source_attr(rt) == RTS_DEVICE) return 0; - if (rt->src->proto->rte_igp_metric) - return rt->src->proto->rte_igp_metric(rt); + if (rt->src->owner->class->rte_igp_metric) + return rt->src->owner->class->rte_igp_metric(rt); return IGP_METRIC_UNKNOWN; } static int -rt_update_hostentry(rtable *tab, struct hostentry *he) +rt_update_hostentry(struct rtable_private *tab, struct hostentry *he) { ea_list *old_src = he->src; int direct = 0; @@ -3589,9 +4614,12 @@ rt_update_hostentry(rtable *tab, struct hostentry *he) { struct rte_storage *e = n->routes; ea_list *a = e->rte.attrs; - pxlen = n->n.addr->pxlen; + u32 pref = rt_get_preference(&e->rte); - if (ea_find(a, &ea_gen_hostentry)) + for (struct rte_storage *ee = n->routes; ee; ee = ee->next) + if (rte_is_valid(&ee->rte) && + (rt_get_preference(&ee->rte) >= pref) && + ea_find(ee->rte.attrs, &ea_gen_hostentry)) { /* Recursive route should not depend on another recursive route */ log(L_WARN "Next hop address %I resolvable through recursive route for %N", @@ -3599,6 +4627,8 @@ rt_update_hostentry(rtable *tab, struct hostentry *he) goto done; } + pxlen = n->n.addr->pxlen; + eattr *nhea = ea_find(a, &ea_gen_nexthop); ASSERT_DIE(nhea); struct nexthop_adata *nhad = (void *) nhea->u.ptr; @@ -3632,9 +4662,28 @@ done: } static void -rt_update_hostcache(rtable *tab) +rt_update_hostcache(void *data) { + rtable **nhu_pending; + + RT_LOCKED((rtable *) data, tab) + { + struct hostcache *hc = tab->hostcache; + + /* Shutdown shortcut */ + if (!hc->req.hook) + RT_RETURN(tab); + + if (rt_cork_check(&hc->update)) + { + rt_trace(tab, D_STATES, "Hostcache update corked"); + RT_RETURN(tab); + } + + /* Destination schedule map */ + nhu_pending = tmp_allocz(sizeof(rtable *) * rtable_max_id); + struct hostentry *he; node *n, *x; @@ -3652,14 +4701,18 @@ rt_update_hostcache(rtable *tab) } if (rt_update_hostentry(tab, he)) - rt_schedule_nhu(he->tab); + nhu_pending[he->tab->id] = he->tab; } + } - tab->hcu_scheduled = 0; + for (uint i=0; i<rtable_max_id; i++) + if (nhu_pending[i]) + RT_LOCKED(nhu_pending[i], dst) + rt_schedule_nhu(dst); } static struct hostentry * -rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep) +rt_get_hostentry(struct rtable_private *tab, ip_addr a, ip_addr ll, rtable *dep) { struct hostentry *he; @@ -17,12 +17,19 @@ #include "lib/type.h" #include "lib/fib.h" #include "lib/route.h" +#include "lib/event.h" +#include "lib/rcu.h" +#include "lib/io-loop.h" +#include "lib/settle.h" + +#include <stdatomic.h> struct ea_list; struct protocol; struct proto; struct channel; struct rte_src; +struct hostcache; struct symbol; struct timer; struct filter; @@ -30,6 +37,10 @@ struct f_trie; struct f_trie_walk_state; struct cli; +struct rt_cork_threshold { + u64 low, high; +}; + /* * Master Routing Tables. Generally speaking, each of them contains a FIB * with each entry pointing to a list of route entries representing routes @@ -44,61 +55,99 @@ struct cli; struct rtable_config { node n; char *name; - struct rtable *table; + union rtable *table; struct proto_config *krt_attached; /* Kernel syncer attached to this table */ uint addr_type; /* Type of address data stored in table (NET_*) */ - int gc_max_ops; /* Maximum number of operations before GC is run */ - int gc_min_time; /* Minimum time between two consecutive GC runs */ + uint gc_threshold; /* Maximum number of operations before GC is run */ + uint gc_period; /* Approximate time between two consecutive GC runs */ byte sorted; /* Routes of network are sorted according to rte_better() */ - byte internal; /* Internal table of a protocol */ byte trie_used; /* Rtable has attached trie */ - btime min_settle_time; /* Minimum settle time for notifications */ - btime max_settle_time; /* Maximum settle time for notifications */ + byte debug; /* Whether to log */ + struct rt_cork_threshold cork_threshold; /* Cork threshold values */ + struct settle_config export_settle; /* Export announcement settler */ + struct settle_config export_rr_settle;/* Export announcement settler config valid when any + route refresh is running */ }; struct rt_export_hook; struct rt_export_request; +struct rt_exporter; + +struct rt_exporter_class { + void (*start)(struct rt_exporter *, struct rt_export_request *); + void (*stop)(struct rt_export_hook *); + void (*done)(void *_rt_export_hook); +}; struct rt_exporter { + const struct rt_exporter_class *class; + pool *rp; list hooks; /* Registered route export hooks */ uint addr_type; /* Type of address data exported (NET_*) */ - struct rt_export_hook *(*start)(struct rt_exporter *, struct rt_export_request *); - void (*stop)(struct rt_export_hook *); - void (*done)(struct rt_export_hook *); }; -typedef struct rtable { - resource r; - node n; /* Node in list of all tables */ +struct rt_table_exporter { + struct rt_exporter e; + list pending; /* List of packed struct rt_pending_export */ + + struct rt_pending_export *first; /* First export to announce */ + u64 next_seq; /* The next export will have this ID */ +}; + +extern uint rtable_max_id; + +DEFINE_DOMAIN(rtable); + +/* The public part of rtable structure */ +#define RTABLE_PUBLIC \ + resource r; \ + node n; /* Node in list of all tables */ \ + char *name; /* Name of this table */ \ + uint addr_type; /* Type of address data stored in table (NET_*) */ \ + uint id; /* Integer table ID for fast lookup */ \ + DOMAIN(rtable) lock; /* Lock to take to access the private parts */ \ + struct rtable_config *config; /* Configuration of this table */ \ + struct birdloop *loop; /* Service thread */ \ + +/* The complete rtable structure */ +struct rtable_private { + /* Once more the public part */ + RTABLE_PUBLIC; + + /* Here the private items not to be accessed without locking */ pool *rp; /* Resource pool to allocate everything from, including itself */ struct slab *rte_slab; /* Slab to allocate route objects */ struct fib fib; struct f_trie *trie; /* Trie of prefixes defined in fib */ - char *name; /* Name of this table */ - uint addr_type; /* Type of address data stored in table (NET_*) */ int use_count; /* Number of protocols using this table */ u32 rt_count; /* Number of routes in the table */ list imports; /* Registered route importers */ - struct rt_exporter exporter; /* Exporter API structure */ + struct rt_table_exporter exporter; /* Exporter API structure */ struct hmap id_map; struct hostcache *hostcache; - struct rtable_config *config; /* Configuration of this table */ struct config *deleted; /* Table doesn't exist in current configuration, * delete as soon as use_count becomes 0 and remove * obstacle from this routing table. */ - struct event *rt_event; /* Routing table event */ + struct event *nhu_uncork_event; /* Helper event to schedule NHU on uncork */ + struct settle export_settle; /* Export batching settle timer */ + struct timer *prune_timer; /* Timer for periodic pruning / GC */ + struct birdloop_flag_handler fh; /* Handler for simple events */ btime last_rt_change; /* Last time when route changed */ - btime base_settle_time; /* Start time of rtable settling interval */ btime gc_time; /* Time of last GC */ - int gc_counter; /* Number of operations since last GC */ + uint gc_counter; /* Number of operations since last GC */ + uint rr_counter; /* Number of currently running route refreshes, + in fact sum of (stale_set - stale_pruned) over all importers + + one for each TIS_FLUSHING importer */ byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ byte prune_trie; /* Prune prefix trie during next table prune */ - byte hcu_scheduled; /* Hostcache update is scheduled */ byte nhu_state; /* Next Hop Update state */ - byte internal; /* This table is internal for some other object */ + byte nhu_corked; /* Next Hop Update is corked with this state */ + byte export_used; /* Pending Export pruning is scheduled */ + byte cork_active; /* Cork has been activated */ + struct rt_cork_threshold cork_threshold; /* Threshold for table cork */ struct fib_iterator prune_fit; /* Rtable prune FIB iterator */ struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ struct f_trie *trie_new; /* New prefix trie defined during pruning */ @@ -107,69 +156,83 @@ typedef struct rtable { u32 trie_old_lock_count; /* Old prefix trie locked by walks */ struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */ - list subscribers; /* Subscribers for notifications */ - struct timer *settle_timer; /* Settle time for notifications */ - list flowspec_links; /* List of flowspec links, src for NET_IPx and dst for NET_FLOWx */ struct f_trie *flowspec_trie; /* Trie for evaluation of flowspec notifications */ +}; + +/* The final union private-public rtable structure */ +typedef union rtable { + struct { + RTABLE_PUBLIC; + }; + struct rtable_private priv; } rtable; -struct rt_subscription { - node n; - rtable *tab; - void (*hook)(struct rt_subscription *b); - void *data; -}; +#define RT_IS_LOCKED(tab) DOMAIN_IS_LOCKED(rtable, (tab)->lock) -struct rt_flowspec_link { - node n; - rtable *src; - rtable *dst; - u32 uc; -}; +#define RT_LOCK(tab) ({ LOCK_DOMAIN(rtable, (tab)->lock); &(tab)->priv; }) +#define RT_UNLOCK(tab) UNLOCK_DOMAIN(rtable, (tab)->lock) +#define RT_PRIV(tab) ({ ASSERT_DIE(RT_IS_LOCKED((tab))); &(tab)->priv; }) +#define RT_PUB(tab) SKIP_BACK(rtable, priv, tab) + +#define RT_LOCKED(tpub, tpriv) for (struct rtable_private *tpriv = RT_LOCK(tpub); tpriv; RT_UNLOCK(tpriv), (tpriv = NULL)) +#define RT_RETURN(tpriv, ...) do { RT_UNLOCK(tpriv); return __VA_ARGS__; } while (0) + +#define RT_PRIV_SAME(tpriv, tpub) (&(tpub)->priv == (tpriv)) + +/* Flags for birdloop_flag() */ +#define RTF_CLEANUP 1 +#define RTF_NHU 2 +#define RTF_EXPORT 4 + +extern struct rt_cork { + _Atomic uint active; + event_list queue; + event run; +} rt_cork; + +static inline void rt_cork_acquire(void) +{ + atomic_fetch_add_explicit(&rt_cork.active, 1, memory_order_acq_rel); +} + +static inline void rt_cork_release(void) +{ + if (atomic_fetch_sub_explicit(&rt_cork.active, 1, memory_order_acq_rel) == 1) + { + synchronize_rcu(); + ev_send(&global_work_list, &rt_cork.run); + } +} + +static inline int rt_cork_check(event *e) +{ + rcu_read_lock(); + + int corked = (atomic_load_explicit(&rt_cork.active, memory_order_acquire) > 0); + if (corked) + ev_send(&rt_cork.queue, e); + + rcu_read_unlock(); + + return corked; +} -#define NHU_CLEAN 0 -#define NHU_SCHEDULED 1 -#define NHU_RUNNING 2 -#define NHU_DIRTY 3 typedef struct network { - struct rte_storage *routes; /* Available routes for this network */ + struct rte_storage *routes; /* Available routes for this network */ + struct rt_pending_export *first, *last; struct fib_node n; /* FIB flags reserved for kernel syncer */ } net; -struct hostcache { - slab *slab; /* Slab holding all hostentries */ - struct hostentry **hash_table; /* Hash table for hostentries */ - unsigned hash_order, hash_shift; - unsigned hash_max, hash_min; - unsigned hash_items; - linpool *lp; /* Linpool for trie */ - struct f_trie *trie; /* Trie of prefixes that might affect hostentries */ - list hostentries; /* List of all hostentries */ - byte update_hostcache; -}; - -struct hostentry { - node ln; - ip_addr addr; /* IP address of host, part of key */ - ip_addr link; /* (link-local) IP address of host, used as gw - if host is directly attached */ - struct rtable *tab; /* Dependent table, part of key */ - struct hostentry *next; /* Next in hash chain */ - unsigned hash_key; /* Hash key */ - unsigned uc; /* Use count */ - ea_list *src; /* Source attributes */ - byte nexthop_linkable; /* Nexthop list is completely non-device */ - u32 igp_metric; /* Chosen route IGP metric */ -}; - struct rte_storage { struct rte_storage *next; /* Next in chain */ struct rte rte; /* Route data */ }; -#define RTE_COPY(r, l) ((r) ? (((*(l)) = (r)->rte), (l)) : NULL) -#define RTE_OR_NULL(r) ((r) ? &((r)->rte) : NULL) +#define RTE_COPY(r) ((r) ? (r)->rte : (rte) {}) +#define RTE_COPY_VALID(r) (((r) && (rte_is_valid(&(r)->rte))) ? (r)->rte : (rte) {}) +#define RTE_OR_NULL(r) ((r) ? &((r)->rte) : NULL) +#define RTE_VALID_OR_NULL(r) (((r) && (rte_is_valid(&(r)->rte))) ? &((r)->rte) : NULL) /* Table-channel connections */ @@ -178,12 +241,13 @@ struct rt_import_request { char *name; u8 trace_routes; + event_list *list; /* Where to schedule announce events */ + void (*dump_req)(struct rt_import_request *req); void (*log_state_change)(struct rt_import_request *req, u8 state); /* Preimport is called when the @new route is just-to-be inserted, replacing @old. * Return a route (may be different or modified in-place) to continue or NULL to withdraw. */ int (*preimport)(struct rt_import_request *req, struct rte *new, struct rte *old); - struct rte *(*rte_modify)(struct rte *, struct linpool *); }; struct rt_import_hook { @@ -200,15 +264,23 @@ struct rt_import_hook { u32 withdraws_accepted; /* Number of route withdraws accepted and processed */ } stats; + u64 flush_seq; /* Table export seq when the channel announced flushing */ btime last_state_change; /* Time of last state transition */ u8 import_state; /* IS_* */ + u8 stale_set; /* Set this stale_cycle to imported routes */ + u8 stale_valid; /* Routes with this stale_cycle and bigger are considered valid */ + u8 stale_pruned; /* Last prune finished when this value was set at stale_valid */ + u8 stale_pruning; /* Last prune started when this value was set at stale_valid */ void (*stopped)(struct rt_import_request *); /* Stored callback when import is stopped */ + event announce_event; /* This event announces table updates */ }; struct rt_pending_export { + struct rt_pending_export * _Atomic next; /* Next export for the same destination */ struct rte_storage *new, *new_best, *old, *old_best; + u64 seq; /* Sequential ID (table-local) of the pending export */ }; struct rt_export_request { @@ -218,6 +290,8 @@ struct rt_export_request { u8 trace_routes; u8 addr_mode; /* Network prefilter mode (TE_ADDR_*) */ + event_list *list; /* Where to schedule export events */ + /* There are two methods of export. You can either request feeding every single change * or feeding the whole route feed. In case of regular export, &export_one is preferred. * Anyway, when feeding, &export_bulk is preferred, falling back to &export_one. @@ -237,7 +311,6 @@ struct rt_export_hook { struct rt_exporter *table; /* The connected table */ pool *pool; - linpool *lp; struct rt_export_request *req; /* The requestor */ @@ -247,24 +320,44 @@ struct rt_export_hook { u32 withdraws_received; /* Number of route withdraws received */ } stats; + btime last_state_change; /* Time of last state transition */ + + _Atomic u8 export_state; /* Route export state (TES_*, see below) */ + struct event event; /* Event running all the export operations */ + + struct bmap seq_map; /* Keep track which exports were already procesed */ + + void (*stopped)(struct rt_export_request *); /* Stored callback when export is stopped */ +}; + +struct rt_table_export_hook { + union { + struct rt_export_hook h; + struct { /* Overriding the parent structure beginning */ + node _n; + struct rt_table_exporter *table; + }; + }; + union { struct fib_iterator feed_fit; /* Routing table iterator used during feeding */ struct { struct f_trie_walk_state *walk_state; /* Iterator over networks in trie */ struct f_trie *walk_lock; /* Locked trie for walking */ + union { /* Last net visited but not processed */ + net_addr walk_last; + net_addr_ip4 walk_last_ip4; + net_addr_ip6 walk_last_ip6; + }; }; - u32 hash_iter; /* Iterator over hash */ }; - btime last_state_change; /* Time of last state transition */ + struct rt_pending_export *_Atomic last_export;/* Last export processed */ + struct rt_pending_export *rpe_next; /* Next pending export to process */ u8 refeed_pending; /* Refeeding and another refeed is scheduled */ - u8 export_state; /* Route export state (TES_*, see below) */ u8 feed_type; /* Which feeding method is used (TFT_*, see below) */ - struct event *event; /* Event running all the export operations */ - - void (*stopped)(struct rt_export_request *); /* Stored callback when export is stopped */ }; #define TIS_DOWN 0 @@ -276,6 +369,7 @@ struct rt_export_hook { #define TIS_MAX 6 #define TES_DOWN 0 +#define TES_HUNGRY 1 #define TES_FEEDING 2 #define TES_READY 3 #define TES_STOP 4 @@ -293,7 +387,8 @@ struct rt_export_hook { #define TFT_HASH 3 void rt_request_import(rtable *tab, struct rt_import_request *req); -void rt_request_export(struct rt_exporter *tab, struct rt_export_request *req); +void rt_request_export(rtable *tab, struct rt_export_request *req); +void rt_request_export_other(struct rt_exporter *tab, struct rt_export_request *req); void rt_export_once(struct rt_exporter *tab, struct rt_export_request *req); @@ -310,6 +405,36 @@ void rt_set_export_state(struct rt_export_hook *hook, u8 state); void rte_import(struct rt_import_request *req, const net_addr *net, rte *new, struct rte_src *src); +/* + * For table export processing + */ + +/* Get next rpe. If src is given, it must match. */ +struct rt_pending_export *rpe_next(struct rt_pending_export *rpe, struct rte_src *src); + +/* Walk all rpe's */ +#define RPE_WALK(first, it, src) \ + for (struct rt_pending_export *it = (first); it; it = rpe_next(it, (src))) + +/* Mark the pending export processed */ +void rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); + +#define rpe_mark_seen_all(hook, first, src) \ + RPE_WALK((first), _rpe, (src)) rpe_mark_seen((hook), _rpe) + +/* Get pending export seen status */ +int rpe_get_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); + +/* + * For rt_export_hook and rt_exporter inheritance + */ + +void rt_init_export(struct rt_exporter *re, struct rt_export_hook *hook); +struct rt_export_hook *rt_alloc_export(struct rt_exporter *re, uint size); +void rt_stop_export_common(struct rt_export_hook *hook); +void rt_export_stopped(struct rt_export_hook *hook); +void rt_exporter_init(struct rt_exporter *re); + /* Types of route announcement, also used as flags */ #define RA_UNDEF 0 /* Undefined RA type */ #define RA_OPTIMAL 1 /* Announcement of optimal route change */ @@ -323,6 +448,49 @@ void rte_import(struct rt_import_request *req, const net_addr *net, rte *new, st #define RIC_REJECT -1 /* Rejected by protocol */ #define RIC_DROP -2 /* Silently dropped by protocol */ +/* + * Next hop update data structures + */ + +#define NHU_CLEAN 0 +#define NHU_SCHEDULED 1 +#define NHU_RUNNING 2 +#define NHU_DIRTY 3 + +struct hostentry { + node ln; + ip_addr addr; /* IP address of host, part of key */ + ip_addr link; /* (link-local) IP address of host, used as gw + if host is directly attached */ + rtable *tab; /* Dependent table, part of key */ + struct hostentry *next; /* Next in hash chain */ + unsigned hash_key; /* Hash key */ + unsigned uc; /* Use count */ + ea_list *src; /* Source attributes */ + byte nexthop_linkable; /* Nexthop list is completely non-device */ + u32 igp_metric; /* Chosen route IGP metric */ +}; + +struct hostcache { + slab *slab; /* Slab holding all hostentries */ + struct hostentry **hash_table; /* Hash table for hostentries */ + unsigned hash_order, hash_shift; + unsigned hash_max, hash_min; + unsigned hash_items; + linpool *lp; /* Linpool for trie */ + struct f_trie *trie; /* Trie of prefixes that might affect hostentries */ + list hostentries; /* List of all hostentries */ + event update; + struct rt_export_request req; /* Notifier */ +}; + +struct rt_flowspec_link { + rtable *src; + rtable *dst; + u32 uc; + struct rt_export_request req; +}; + #define rte_update channel_rte_import /** * rte_update - enter a new update to a routing table @@ -363,33 +531,38 @@ struct config; void rt_init(void); void rt_preconfig(struct config *); +void rt_postconfig(struct config *); void rt_commit(struct config *new, struct config *old); -void rt_lock_table(rtable *); -void rt_unlock_table(rtable *); -struct f_trie * rt_lock_trie(rtable *tab); -void rt_unlock_trie(rtable *tab, struct f_trie *trie); -void rt_subscribe(rtable *tab, struct rt_subscription *s); -void rt_unsubscribe(struct rt_subscription *s); +void rt_lock_table_priv(struct rtable_private *, const char *file, uint line); +void rt_unlock_table_priv(struct rtable_private *, const char *file, uint line); +static inline void rt_lock_table_pub(rtable *t, const char *file, uint line) +{ RT_LOCKED(t, tt) rt_lock_table_priv(tt, file, line); } +static inline void rt_unlock_table_pub(rtable *t, const char *file, uint line) +{ RT_LOCKED(t, tt) rt_unlock_table_priv(tt, file, line); } + +#define rt_lock_table(t) _Generic((t), rtable *: rt_lock_table_pub, \ + struct rtable_private *: rt_lock_table_priv)((t), __FILE__, __LINE__) +#define rt_unlock_table(t) _Generic((t), rtable *: rt_unlock_table_pub, \ + struct rtable_private *: rt_unlock_table_priv)((t), __FILE__, __LINE__) + +struct f_trie * rt_lock_trie(struct rtable_private *tab); +void rt_unlock_trie(struct rtable_private *tab, struct f_trie *trie); void rt_flowspec_link(rtable *src, rtable *dst); void rt_flowspec_unlink(rtable *src, rtable *dst); rtable *rt_setup(pool *, struct rtable_config *); -static inline void rt_shutdown(rtable *r) { rfree(r->rp); } -static inline net *net_find(rtable *tab, const net_addr *addr) { return (net *) fib_find(&tab->fib, addr); } -static inline net *net_find_valid(rtable *tab, const net_addr *addr) +static inline net *net_find(struct rtable_private *tab, const net_addr *addr) { return (net *) fib_find(&tab->fib, addr); } +static inline net *net_find_valid(struct rtable_private *tab, const net_addr *addr) { net *n = net_find(tab, addr); return (n && n->routes && rte_is_valid(&n->routes->rte)) ? n : NULL; } -static inline net *net_get(rtable *tab, const net_addr *addr) { return (net *) fib_get(&tab->fib, addr); } -net *net_get(rtable *tab, const net_addr *addr); -net *net_route(rtable *tab, const net_addr *n); +static inline net *net_get(struct rtable_private *tab, const net_addr *addr) { return (net *) fib_get(&tab->fib, addr); } +net *net_route(struct rtable_private *tab, const net_addr *n); int rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter); rte *rt_export_merged(struct channel *c, rte ** feed, uint count, linpool *pool, int silent); -void rt_refresh_begin(rtable *t, struct rt_import_request *); -void rt_refresh_end(rtable *t, struct rt_import_request *); +void rt_refresh_begin(struct rt_import_request *); +void rt_refresh_end(struct rt_import_request *); void rt_modify_stale(rtable *t, struct rt_import_request *); -void rt_schedule_prune(rtable *t); +void rt_schedule_prune(struct rtable_private *t); void rte_dump(struct rte_storage *); -void rte_free(struct rte_storage *); -struct rte_storage *rte_store(const rte *, net *net, rtable *); void rt_dump(rtable *); void rt_dump_all(void); void rt_dump_hooks(rtable *); @@ -399,6 +572,8 @@ void rt_reload_channel_abort(struct channel *c); void rt_refeed_channel(struct channel *c); void rt_prune_sync(rtable *t, int all); struct rtable_config *rt_new_table(struct symbol *s, uint addr_type); +void rt_new_default_table(struct symbol *s); +struct rtable_config *rt_get_default_table(struct config *cf, uint addr_type); static inline int rt_is_ip(rtable *tab) { return (tab->addr_type == NET_IP4) || (tab->addr_type == NET_IP6); } @@ -448,7 +623,7 @@ struct rt_show_data { void rt_show(struct rt_show_data *); struct rt_show_data_rtable * rt_show_add_exporter(struct rt_show_data *d, struct rt_exporter *t, const char *name); -struct rt_show_data_rtable * rt_show_add_table(struct rt_show_data *d, struct rtable *t); +struct rt_show_data_rtable * rt_show_add_table(struct rt_show_data *d, rtable *t); /* Value of table definition mode in struct rt_show_data */ #define RSD_TDB_DEFAULT 0 /* no table specified */ @@ -475,7 +650,7 @@ struct hostentry_adata { }; void -ea_set_hostentry(ea_list **to, struct rtable *dep, struct rtable *tab, ip_addr gw, ip_addr ll, u32 lnum, u32 labels[lnum]); +ea_set_hostentry(ea_list **to, rtable *dep, rtable *tab, ip_addr gw, ip_addr ll, u32 lnum, u32 labels[lnum]); void ea_show_hostentry(const struct adata *ad, byte *buf, uint size); void ea_show_nexthop_list(struct cli *c, struct nexthop_adata *nhad); diff --git a/proto/babel/Makefile b/proto/babel/Makefile index ae6aeaf2..06b58e95 100644 --- a/proto/babel/Makefile +++ b/proto/babel/Makefile @@ -2,6 +2,5 @@ src := babel.c packets.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,babel_build) tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 4939619f..4d024e3a 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -315,7 +315,9 @@ babel_add_seqno_request(struct babel_proto *p, struct babel_entry *e, /* Found older */ rem_node(NODE sr); - rem_node(&sr->nbr_node); + + if (sr->nbr) + rem_node(&sr->nbr_node); goto found; } @@ -455,10 +457,7 @@ babel_flush_neighbor(struct babel_proto *p, struct babel_neighbor *nbr) struct babel_seqno_request *sr; WALK_LIST_FIRST2(sr, nbr_node, nbr->requests) - { - sr->nbr = NULL; - rem_node(&sr->nbr_node); - } + babel_remove_seqno_request(p, sr); nbr->ifa = NULL; rem_node(NODE nbr); @@ -2258,9 +2257,9 @@ babel_kick_timer(struct babel_proto *p) static int -babel_preexport(struct channel *c, struct rte *new) +babel_preexport(struct channel *C, struct rte *new) { - if (new->src->proto != c->proto) + if (new->src->owner != &C->proto->sources) return 0; /* Reject our own unreachable routes */ @@ -2290,7 +2289,7 @@ babel_rt_notify(struct proto *P, struct channel *c UNUSED, const net_addr *net, uint rt_metric = ea_get_int(new->attrs, &ea_babel_metric, 0); u64 rt_router_id = 0; - if (new->src->proto == P) + if (new->src->owner == &P->sources) { rt_seqno = ea_get_int(new->attrs, &ea_babel_seqno, 0); eattr *e = ea_find(new->attrs, &ea_babel_router_id); @@ -2374,6 +2373,12 @@ babel_postconfig(struct proto_config *CF) cf->ip6_channel = ip6 ?: ip6_sadr; } +static struct rte_owner_class babel_rte_owner_class = { + .get_route_info = babel_get_route_info, + .rte_better = babel_rte_better, + .rte_igp_metric = babel_rte_igp_metric, +}; + static struct proto * babel_init(struct proto_config *CF) { @@ -2387,8 +2392,8 @@ babel_init(struct proto_config *CF) P->if_notify = babel_if_notify; P->rt_notify = babel_rt_notify; P->preexport = babel_preexport; - P->rte_better = babel_rte_better; - P->rte_igp_metric = babel_rte_igp_metric; + + P->sources.class = &babel_rte_owner_class; return P; } @@ -2499,7 +2504,6 @@ struct protocol proto_babel = { .start = babel_start, .shutdown = babel_shutdown, .reconfigure = babel_reconfigure, - .get_route_info = babel_get_route_info, }; void diff --git a/proto/bfd/Makefile b/proto/bfd/Makefile index dbdc0a09..267dff98 100644 --- a/proto/bfd/Makefile +++ b/proto/bfd/Makefile @@ -1,7 +1,6 @@ -src := bfd.c io.c packets.c +src := bfd.c packets.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,bfd_build) tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/bfd/bfd.c b/proto/bfd/bfd.c index 871ecf69..25ff19ac 100644 --- a/proto/bfd/bfd.c +++ b/proto/bfd/bfd.c @@ -82,7 +82,7 @@ * BFD thread to the main thread. This is done in an asynchronous way, sesions * with pending notifications are linked (in the BFD thread) to @notify_list in * &bfd_proto, and then bfd_notify_hook() in the main thread is activated using - * bfd_notify_kick() and a pipe. The hook then processes scheduled sessions and + * a standard event sending code. The hook then processes scheduled sessions and * calls hooks from associated BFD requests. This @notify_list (and state fields * in structure &bfd_session) is protected by a spinlock in &bfd_proto and * functions bfd_lock_sessions() / bfd_unlock_sessions(). @@ -113,15 +113,21 @@ #define HASH_IP_EQ(a1,n1,a2,n2) ipa_equal(a1, a2) && n1 == n2 #define HASH_IP_FN(a,n) ipa_hash(a) ^ u32_hash(n) -static list STATIC_LIST_INIT(bfd_proto_list); -static list STATIC_LIST_INIT(bfd_wait_list); +#define BFD_LOCK LOCK_DOMAIN(rtable, bfd_global.lock) +#define BFD_UNLOCK UNLOCK_DOMAIN(rtable, bfd_global.lock) + +static struct { + DOMAIN(rtable) lock; + list wait_list; + list pickup_list; + list proto_list; +} bfd_global; const char *bfd_state_names[] = { "AdminDown", "Down", "Init", "Up" }; static void bfd_session_set_min_tx(struct bfd_session *s, u32 val); static struct bfd_iface *bfd_get_iface(struct bfd_proto *p, ip_addr local, struct iface *iface); static void bfd_free_iface(struct bfd_iface *ifa); -static inline void bfd_notify_kick(struct bfd_proto *p); /* @@ -170,7 +176,7 @@ bfd_session_update_state(struct bfd_session *s, uint state, uint diag) bfd_session_set_min_tx(s, s->cf.idle_tx_int); if (notify) - bfd_notify_kick(p); + ev_send(&global_event_list, &p->notify_event); } static void @@ -188,7 +194,7 @@ bfd_session_update_tx_interval(struct bfd_session *s) return; /* Set timer relative to last tx_timer event */ - tm_set(s->tx_timer, s->last_tx + tx_int_l); + tm_set_in(s->tx_timer, s->last_tx + tx_int_l, s->ifa->bfd->p.loop); } static void @@ -202,7 +208,7 @@ bfd_session_update_detection_time(struct bfd_session *s, int kick) if (!s->last_rx) return; - tm_set(s->hold_timer, s->last_rx + timeout); + tm_set_in(s->hold_timer, s->last_rx + timeout, s->ifa->bfd->p.loop); } static void @@ -226,7 +232,7 @@ bfd_session_control_tx_timer(struct bfd_session *s, int reset) if (reset || !tm_active(s->tx_timer)) { s->last_tx = 0; - tm_start(s->tx_timer, 0); + tm_start_in(s->tx_timer, 0, s->ifa->bfd->p.loop); } return; @@ -419,7 +425,7 @@ bfd_get_free_id(struct bfd_proto *p) static struct bfd_session * bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface *iface, struct bfd_options *opts) { - birdloop_enter(p->loop); + ASSERT_DIE(birdloop_inside(p->p.loop)); struct bfd_iface *ifa = bfd_get_iface(p, local, iface); @@ -454,8 +460,6 @@ bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface * TRACE(D_EVENTS, "Session to %I added", s->addr); - birdloop_leave(p->loop); - return s; } @@ -463,38 +467,34 @@ bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface * static void bfd_open_session(struct bfd_proto *p, struct bfd_session *s, ip_addr local, struct iface *ifa) { - birdloop_enter(p->loop); + birdloop_enter(p->p.loop); s->opened = 1; bfd_session_control_tx_timer(s); - birdloop_leave(p->loop); + birdloop_leave(p->p.loop); } static void bfd_close_session(struct bfd_proto *p, struct bfd_session *s) { - birdloop_enter(p->loop); + birdloop_enter(p->p.loop); s->opened = 0; bfd_session_update_state(s, BFD_STATE_DOWN, BFD_DIAG_PATH_DOWN); bfd_session_control_tx_timer(s); - birdloop_leave(p->loop); + birdloop_leave(p->p.loop); } */ static void -bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) +bfd_remove_session_locked(struct bfd_proto *p, struct bfd_session *s) { - ip_addr ip = s->addr; - /* Caller should ensure that request list is empty */ - birdloop_enter(p->loop); - /* Remove session from notify list if scheduled for notification */ /* No need for bfd_lock_sessions(), we are already protected by birdloop_enter() */ if (NODE_VALID(&s->n)) @@ -508,11 +508,17 @@ bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) HASH_REMOVE(p->session_hash_id, HASH_ID, s); HASH_REMOVE(p->session_hash_ip, HASH_IP, s); - sl_free(s); + TRACE(D_EVENTS, "Session to %I removed", s->addr); - TRACE(D_EVENTS, "Session to %I removed", ip); + sl_free(s); +} - birdloop_leave(p->loop); +static void +bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) +{ + birdloop_enter(p->p.loop); + bfd_remove_session_locked(p, s); + birdloop_leave(p->p.loop); } static void @@ -521,7 +527,7 @@ bfd_reconfigure_session(struct bfd_proto *p, struct bfd_session *s) if (EMPTY_LIST(s->request_list)) return; - birdloop_enter(p->loop); + birdloop_enter(p->p.loop); struct bfd_request *req = SKIP_BACK(struct bfd_request, n, HEAD(s->request_list)); s->cf = bfd_merge_options(s->ifa->cf, &req->opts); @@ -534,7 +540,7 @@ bfd_reconfigure_session(struct bfd_proto *p, struct bfd_session *s) bfd_session_control_tx_timer(s, 0); - birdloop_leave(p->loop); + birdloop_leave(p->p.loop); TRACE(D_EVENTS, "Session to %I reconfigured", s->addr); } @@ -627,9 +633,9 @@ bfd_reconfigure_iface(struct bfd_proto *p, struct bfd_iface *ifa, struct bfd_con (new->passive != old->passive); /* This should be probably changed to not access ifa->cf from the BFD thread */ - birdloop_enter(p->loop); + birdloop_enter(p->p.loop); ifa->cf = new; - birdloop_leave(p->loop); + birdloop_leave(p->p.loop); } @@ -659,7 +665,7 @@ bfd_add_request(struct bfd_proto *p, struct bfd_request *req) { struct bfd_config *cf = (struct bfd_config *) (p->p.cf); - if (p->p.vrf_set && (p->p.vrf != req->vrf)) + if (p->p.vrf && (p->p.vrf != req->vrf)) return 0; if (ipa_is_ip4(req->addr) ? !cf->accept_ipv4 : !cf->accept_ipv6) @@ -690,41 +696,68 @@ bfd_add_request(struct bfd_proto *p, struct bfd_request *req) } static void -bfd_submit_request(struct bfd_request *req) +bfd_pickup_requests(void *_data UNUSED) { node *n; + WALK_LIST(n, bfd_global.proto_list) + { + struct bfd_proto *p = SKIP_BACK(struct bfd_proto, bfd_node, n); + birdloop_enter(p->p.loop); + BFD_LOCK; - WALK_LIST(n, bfd_proto_list) - if (bfd_add_request(SKIP_BACK(struct bfd_proto, bfd_node, n), req)) - return; + node *rn, *rnxt; + WALK_LIST_DELSAFE(rn, rnxt, bfd_global.pickup_list) + bfd_add_request(p, SKIP_BACK(struct bfd_request, n, rn)); - rem_node(&req->n); - add_tail(&bfd_wait_list, &req->n); - req->session = NULL; - bfd_request_notify(req, BFD_STATE_ADMIN_DOWN, 0); + BFD_UNLOCK; + birdloop_leave(p->p.loop); + } + + BFD_LOCK; + node *rn, *rnxt; + WALK_LIST_DELSAFE(rn, rnxt, bfd_global.pickup_list) + { + rem_node(rn); + add_tail(&bfd_global.wait_list, rn); + bfd_request_notify(SKIP_BACK(struct bfd_request, n, rn), BFD_STATE_ADMIN_DOWN, 0); + } + BFD_UNLOCK; } +static event bfd_pickup_event = { .hook = bfd_pickup_requests }; + static void bfd_take_requests(struct bfd_proto *p) { node *n, *nn; - - WALK_LIST_DELSAFE(n, nn, bfd_wait_list) + BFD_LOCK; + WALK_LIST_DELSAFE(n, nn, bfd_global.wait_list) bfd_add_request(p, SKIP_BACK(struct bfd_request, n, n)); + BFD_UNLOCK; } static void bfd_drop_requests(struct bfd_proto *p) { node *n; - - HASH_WALK(p->session_hash_id, next_id, s) + BFD_LOCK; + HASH_WALK_DELSAFE(p->session_hash_id, next_id, s) { - /* We assume that p is not in bfd_proto_list */ WALK_LIST_FIRST(n, s->request_list) - bfd_submit_request(SKIP_BACK(struct bfd_request, n, n)); + { + struct bfd_request *req = SKIP_BACK(struct bfd_request, n, n); + rem_node(&req->n); + add_tail(&bfd_global.pickup_list, &req->n); + req->session = NULL; + bfd_request_notify(req, BFD_STATE_ADMIN_DOWN, 0); + } + + ev_send(&global_event_list, &bfd_pickup_event); + + bfd_remove_session_locked(p, s); } HASH_WALK_END; + BFD_UNLOCK; } static struct resclass bfd_request_class; @@ -737,9 +770,6 @@ bfd_request_session(pool *p, ip_addr addr, ip_addr local, { struct bfd_request *req = ralloc(p, &bfd_request_class); - /* Hack: self-link req->n, we will call rem_node() on it */ - req->n.prev = req->n.next = &req->n; - req->addr = addr; req->local = local; req->iface = iface; @@ -748,11 +778,16 @@ bfd_request_session(pool *p, ip_addr addr, ip_addr local, if (opts) req->opts = *opts; - bfd_submit_request(req); - req->hook = hook; req->data = data; + req->session = NULL; + + BFD_LOCK; + add_tail(&bfd_global.pickup_list, &req->n); + ev_send(&global_event_list, &bfd_pickup_event); + BFD_UNLOCK; + return req; } @@ -916,21 +951,15 @@ bfd_reconfigure_neighbors(struct bfd_proto *p, struct bfd_config *new) /* This core notify code should be replaced after main loop transition to birdloop */ -int pipe(int pipefd[2]); -void pipe_drain(int fd); -void pipe_kick(int fd); - -static int -bfd_notify_hook(sock *sk, uint len UNUSED) +static void +bfd_notify_hook(void *data) { - struct bfd_proto *p = sk->data; + struct bfd_proto *p = data; struct bfd_session *s; list tmp_list; u8 state, diag; node *n, *nn; - pipe_drain(sk->fd); - bfd_lock_sessions(p); init_list(&tmp_list); add_tail_list(&tmp_list, &p->notify_list); @@ -954,55 +983,8 @@ bfd_notify_hook(sock *sk, uint len UNUSED) if (EMPTY_LIST(s->request_list)) bfd_remove_session(p, s); } - - return 0; -} - -static inline void -bfd_notify_kick(struct bfd_proto *p) -{ - pipe_kick(p->notify_ws->fd); -} - -static void -bfd_noterr_hook(sock *sk, int err) -{ - struct bfd_proto *p = sk->data; - log(L_ERR "%s: Notify socket error: %m", p->p.name, err); -} - -static void -bfd_notify_init(struct bfd_proto *p) -{ - int pfds[2]; - sock *sk; - - int rv = pipe(pfds); - if (rv < 0) - die("pipe: %m"); - - sk = sk_new(p->p.pool); - sk->type = SK_MAGIC; - sk->rx_hook = bfd_notify_hook; - sk->err_hook = bfd_noterr_hook; - sk->fd = pfds[0]; - sk->data = p; - if (sk_open(sk) < 0) - die("bfd: sk_open failed"); - p->notify_rs = sk; - - /* The write sock is not added to any event loop */ - sk = sk_new(p->p.pool); - sk->type = SK_MAGIC; - sk->fd = pfds[1]; - sk->data = p; - sk->flags = SKF_THREAD; - if (sk_open(sk) < 0) - die("bfd: sk_open failed"); - p->notify_ws = sk; } - /* * BFD protocol glue */ @@ -1023,10 +1005,10 @@ bfd_start(struct proto *P) struct bfd_proto *p = (struct bfd_proto *) P; struct bfd_config *cf = (struct bfd_config *) (P->cf); - p->loop = birdloop_new(); - p->tpool = rp_new(NULL, "BFD thread root"); pthread_spin_init(&p->lock, PTHREAD_PROCESS_PRIVATE); + p->tpool = rp_new(P->pool, "BFD loop pool"); + p->session_slab = sl_new(P->pool, sizeof(struct bfd_session)); HASH_INIT(p->session_hash_id, P->pool, 8); HASH_INIT(p->session_hash_ip, P->pool, 8); @@ -1034,11 +1016,12 @@ bfd_start(struct proto *P) init_list(&p->iface_list); init_list(&p->notify_list); - bfd_notify_init(p); - - add_tail(&bfd_proto_list, &p->bfd_node); + p->notify_event = (event) { + .hook = bfd_notify_hook, + .data = p, + }; - birdloop_enter(p->loop); + add_tail(&bfd_global.proto_list, &p->bfd_node); if (!cf->strict_bind) { @@ -1055,42 +1038,33 @@ bfd_start(struct proto *P) p->rx6_m = bfd_open_rx_sk(p, 1, SK_IPV6); } - birdloop_leave(p->loop); - bfd_take_requests(p); struct bfd_neighbor *n; WALK_LIST(n, cf->neigh_list) bfd_start_neighbor(p, n); - birdloop_start(p->loop); - return PS_UP; } - static int bfd_shutdown(struct proto *P) { struct bfd_proto *p = (struct bfd_proto *) P; - struct bfd_config *cf = (struct bfd_config *) (P->cf); + struct bfd_config *cf = (struct bfd_config *) (p->p.cf); rem_node(&p->bfd_node); - birdloop_stop(p->loop); - - struct bfd_neighbor *n; - WALK_LIST(n, cf->neigh_list) - bfd_stop_neighbor(p, n); + struct bfd_neighbor *bn; + WALK_LIST(bn, cf->neigh_list) + bfd_stop_neighbor(p, bn); bfd_drop_requests(p); - /* FIXME: This is hack */ - birdloop_enter(p->loop); - rfree(p->tpool); - birdloop_leave(p->loop); - - birdloop_free(p->loop); + if (p->rx4_1) sk_stop(p->rx4_1); + if (p->rx4_m) sk_stop(p->rx4_m); + if (p->rx6_1) sk_stop(p->rx6_1); + if (p->rx6_m) sk_stop(p->rx6_m); return PS_DOWN; } @@ -1111,7 +1085,7 @@ bfd_reconfigure(struct proto *P, struct proto_config *c) (new->strict_bind != old->strict_bind)) return 0; - birdloop_mask_wakeups(p->loop); + birdloop_mask_wakeups(p->p.loop); WALK_LIST(ifa, p->iface_list) bfd_reconfigure_iface(p, ifa, new); @@ -1125,7 +1099,7 @@ bfd_reconfigure(struct proto *P, struct proto_config *c) bfd_reconfigure_neighbors(p, new); - birdloop_unmask_wakeups(p->loop); + birdloop_unmask_wakeups(p->p.loop); return 1; } @@ -1196,4 +1170,9 @@ void bfd_build(void) { proto_build(&proto_bfd); + + bfd_global.lock = DOMAIN_NEW(rtable, "BFD Global"); + init_list(&bfd_global.wait_list); + init_list(&bfd_global.pickup_list); + init_list(&bfd_global.proto_list); } diff --git a/proto/bfd/bfd.h b/proto/bfd/bfd.h index 60b7916c..9a8e20c6 100644 --- a/proto/bfd/bfd.h +++ b/proto/bfd/bfd.h @@ -17,12 +17,12 @@ #include "nest/password.h" #include "conf/conf.h" #include "lib/hash.h" +#include "lib/io-loop.h" #include "lib/resource.h" #include "lib/socket.h" #include "lib/string.h" #include "nest/bfd.h" -#include "io.h" #define BFD_CONTROL_PORT 3784 @@ -88,17 +88,18 @@ struct bfd_neighbor struct bfd_proto { struct proto p; - struct birdloop *loop; - pool *tpool; + pthread_spinlock_t lock; + + pool *tpool; + node bfd_node; slab *session_slab; HASH(struct bfd_session) session_hash_id; HASH(struct bfd_session) session_hash_ip; - sock *notify_rs; - sock *notify_ws; + event notify_event; list notify_list; sock *rx4_1; diff --git a/proto/bfd/config.Y b/proto/bfd/config.Y index 70461872..0d6e33fa 100644 --- a/proto/bfd/config.Y +++ b/proto/bfd/config.Y @@ -37,6 +37,7 @@ proto: bfd_proto ; bfd_proto_start: proto_start BFD { this_proto = proto_config_new(&proto_bfd, $1); + this_proto->loop_order = DOMAIN_ORDER(proto); init_list(&BFD_CFG->patt_list); init_list(&BFD_CFG->neigh_list); BFD_CFG->accept_ipv4 = BFD_CFG->accept_ipv6 = 1; diff --git a/proto/bfd/io.c b/proto/bfd/io.c deleted file mode 100644 index e696cc89..00000000 --- a/proto/bfd/io.c +++ /dev/null @@ -1,537 +0,0 @@ -/* - * BIRD -- I/O and event loop - * - * Can be freely distributed and used under the terms of the GNU GPL. - */ - -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <errno.h> -#include <fcntl.h> -#include <poll.h> -#include <pthread.h> -#include <time.h> -#include <sys/time.h> - -#include "nest/bird.h" -#include "proto/bfd/io.h" - -#include "lib/buffer.h" -#include "lib/lists.h" -#include "lib/resource.h" -#include "lib/event.h" -#include "lib/timer.h" -#include "lib/socket.h" - - -struct birdloop -{ - pool *pool; - pthread_t thread; - pthread_mutex_t mutex; - - u8 stop_called; - u8 poll_active; - u8 wakeup_masked; - int wakeup_fds[2]; - - struct timeloop time; - list event_list; - list sock_list; - uint sock_num; - - BUFFER(sock *) poll_sk; - BUFFER(struct pollfd) poll_fd; - u8 poll_changed; - u8 close_scheduled; -}; - - -/* - * Current thread context - */ - -static pthread_key_t current_loop_key; -extern pthread_key_t current_time_key; - -static inline struct birdloop * -birdloop_current(void) -{ - return pthread_getspecific(current_loop_key); -} - -static inline void -birdloop_set_current(struct birdloop *loop) -{ - pthread_setspecific(current_loop_key, loop); - pthread_setspecific(current_time_key, loop ? &loop->time : &main_timeloop); -} - -static inline void -birdloop_init_current(void) -{ - pthread_key_create(¤t_loop_key, NULL); -} - - -/* - * Wakeup code for birdloop - */ - -static void -pipe_new(int *pfds) -{ - int rv = pipe(pfds); - if (rv < 0) - die("pipe: %m"); - - if (fcntl(pfds[0], F_SETFL, O_NONBLOCK) < 0) - die("fcntl(O_NONBLOCK): %m"); - - if (fcntl(pfds[1], F_SETFL, O_NONBLOCK) < 0) - die("fcntl(O_NONBLOCK): %m"); -} - -void -pipe_drain(int fd) -{ - char buf[64]; - int rv; - - try: - rv = read(fd, buf, 64); - if (rv < 0) - { - if (errno == EINTR) - goto try; - if (errno == EAGAIN) - return; - die("wakeup read: %m"); - } - if (rv == 64) - goto try; -} - -void -pipe_kick(int fd) -{ - u64 v = 1; - int rv; - - try: - rv = write(fd, &v, sizeof(u64)); - if (rv < 0) - { - if (errno == EINTR) - goto try; - if (errno == EAGAIN) - return; - die("wakeup write: %m"); - } -} - -static inline void -wakeup_init(struct birdloop *loop) -{ - pipe_new(loop->wakeup_fds); -} - -static inline void -wakeup_drain(struct birdloop *loop) -{ - pipe_drain(loop->wakeup_fds[0]); -} - -static inline void -wakeup_do_kick(struct birdloop *loop) -{ - pipe_kick(loop->wakeup_fds[1]); -} - -static inline void -wakeup_kick(struct birdloop *loop) -{ - if (!loop->wakeup_masked) - wakeup_do_kick(loop); - else - loop->wakeup_masked = 2; -} - -/* For notifications from outside */ -void -wakeup_kick_current(void) -{ - struct birdloop *loop = birdloop_current(); - - if (loop && loop->poll_active) - wakeup_kick(loop); -} - - -/* - * Events - */ - -static inline uint -events_waiting(struct birdloop *loop) -{ - return !EMPTY_LIST(loop->event_list); -} - -static inline void -events_init(struct birdloop *loop) -{ - init_list(&loop->event_list); -} - -static void -events_fire(struct birdloop *loop) -{ - times_update(&loop->time); - ev_run_list(&loop->event_list); -} - -void -ev2_schedule(event *e) -{ - struct birdloop *loop = birdloop_current(); - - if (loop->poll_active && EMPTY_LIST(loop->event_list)) - wakeup_kick(loop); - - if (e->n.next) - rem_node(&e->n); - - add_tail(&loop->event_list, &e->n); -} - - -/* - * Sockets - */ - -static void -sockets_init(struct birdloop *loop) -{ - init_list(&loop->sock_list); - loop->sock_num = 0; - - BUFFER_INIT(loop->poll_sk, loop->pool, 4); - BUFFER_INIT(loop->poll_fd, loop->pool, 4); - loop->poll_changed = 1; /* add wakeup fd */ -} - -static void -sockets_add(struct birdloop *loop, sock *s) -{ - add_tail(&loop->sock_list, &s->n); - loop->sock_num++; - - s->index = -1; - loop->poll_changed = 1; - - if (loop->poll_active) - wakeup_kick(loop); -} - -void -sk_start(sock *s) -{ - struct birdloop *loop = birdloop_current(); - - sockets_add(loop, s); -} - -static void -sockets_remove(struct birdloop *loop, sock *s) -{ - rem_node(&s->n); - loop->sock_num--; - - if (s->index >= 0) - loop->poll_sk.data[s->index] = NULL; - - s->index = -1; - loop->poll_changed = 1; - - /* Wakeup moved to sk_stop() */ -} - -void -sk_stop(sock *s) -{ - struct birdloop *loop = birdloop_current(); - - sockets_remove(loop, s); - - if (loop->poll_active) - { - loop->close_scheduled = 1; - wakeup_kick(loop); - } - else - close(s->fd); - - s->fd = -1; -} - -static inline uint sk_want_events(sock *s) -{ return (s->rx_hook ? POLLIN : 0) | ((s->ttx != s->tpos) ? POLLOUT : 0); } - -/* -FIXME: this should be called from sock code - -static void -sockets_update(struct birdloop *loop, sock *s) -{ - if (s->index >= 0) - loop->poll_fd.data[s->index].events = sk_want_events(s); -} -*/ - -static void -sockets_prepare(struct birdloop *loop) -{ - BUFFER_SET(loop->poll_sk, loop->sock_num + 1); - BUFFER_SET(loop->poll_fd, loop->sock_num + 1); - - struct pollfd *pfd = loop->poll_fd.data; - sock **psk = loop->poll_sk.data; - uint i = 0; - node *n; - - WALK_LIST(n, loop->sock_list) - { - sock *s = SKIP_BACK(sock, n, n); - - ASSERT(i < loop->sock_num); - - s->index = i; - *psk = s; - pfd->fd = s->fd; - pfd->events = sk_want_events(s); - pfd->revents = 0; - - pfd++; - psk++; - i++; - } - - ASSERT(i == loop->sock_num); - - /* Add internal wakeup fd */ - *psk = NULL; - pfd->fd = loop->wakeup_fds[0]; - pfd->events = POLLIN; - pfd->revents = 0; - - loop->poll_changed = 0; -} - -static void -sockets_close_fds(struct birdloop *loop) -{ - struct pollfd *pfd = loop->poll_fd.data; - sock **psk = loop->poll_sk.data; - int poll_num = loop->poll_fd.used - 1; - - int i; - for (i = 0; i < poll_num; i++) - if (psk[i] == NULL) - close(pfd[i].fd); - - loop->close_scheduled = 0; -} - -int sk_read(sock *s, int revents); -int sk_write(sock *s); - -static void -sockets_fire(struct birdloop *loop) -{ - struct pollfd *pfd = loop->poll_fd.data; - sock **psk = loop->poll_sk.data; - int poll_num = loop->poll_fd.used - 1; - - times_update(&loop->time); - - /* Last fd is internal wakeup fd */ - if (pfd[poll_num].revents & POLLIN) - wakeup_drain(loop); - - int i; - for (i = 0; i < poll_num; pfd++, psk++, i++) - { - int e = 1; - - if (! pfd->revents) - continue; - - if (pfd->revents & POLLNVAL) - die("poll: invalid fd %d", pfd->fd); - - if (pfd->revents & POLLIN) - while (e && *psk && (*psk)->rx_hook) - e = sk_read(*psk, 0); - - e = 1; - if (pfd->revents & POLLOUT) - while (e && *psk) - e = sk_write(*psk); - } -} - - -/* - * Birdloop - */ - -static void * birdloop_main(void *arg); - -struct birdloop * -birdloop_new(void) -{ - /* FIXME: this init should be elsewhere and thread-safe */ - static int init = 0; - if (!init) - { birdloop_init_current(); init = 1; } - - pool *p = rp_new(NULL, "Birdloop root"); - struct birdloop *loop = mb_allocz(p, sizeof(struct birdloop)); - loop->pool = p; - pthread_mutex_init(&loop->mutex, NULL); - - wakeup_init(loop); - - events_init(loop); - timers_init(&loop->time, p); - sockets_init(loop); - - return loop; -} - -void -birdloop_start(struct birdloop *loop) -{ - int rv = pthread_create(&loop->thread, NULL, birdloop_main, loop); - if (rv) - die("pthread_create(): %M", rv); -} - -void -birdloop_stop(struct birdloop *loop) -{ - pthread_mutex_lock(&loop->mutex); - loop->stop_called = 1; - wakeup_do_kick(loop); - pthread_mutex_unlock(&loop->mutex); - - int rv = pthread_join(loop->thread, NULL); - if (rv) - die("pthread_join(): %M", rv); -} - -void -birdloop_free(struct birdloop *loop) -{ - rfree(loop->pool); -} - - -void -birdloop_enter(struct birdloop *loop) -{ - /* TODO: these functions could save and restore old context */ - pthread_mutex_lock(&loop->mutex); - birdloop_set_current(loop); -} - -void -birdloop_leave(struct birdloop *loop) -{ - /* TODO: these functions could save and restore old context */ - birdloop_set_current(NULL); - pthread_mutex_unlock(&loop->mutex); -} - -void -birdloop_mask_wakeups(struct birdloop *loop) -{ - pthread_mutex_lock(&loop->mutex); - loop->wakeup_masked = 1; - pthread_mutex_unlock(&loop->mutex); -} - -void -birdloop_unmask_wakeups(struct birdloop *loop) -{ - pthread_mutex_lock(&loop->mutex); - if (loop->wakeup_masked == 2) - wakeup_do_kick(loop); - loop->wakeup_masked = 0; - pthread_mutex_unlock(&loop->mutex); -} - -static void * -birdloop_main(void *arg) -{ - struct birdloop *loop = arg; - timer *t; - int rv, timeout; - - birdloop_set_current(loop); - - tmp_init(loop->pool); - - pthread_mutex_lock(&loop->mutex); - while (1) - { - events_fire(loop); - timers_fire(&loop->time); - - times_update(&loop->time); - if (events_waiting(loop)) - timeout = 0; - else if (t = timers_first(&loop->time)) - timeout = (tm_remains(t) TO_MS) + 1; - else - timeout = -1; - - if (loop->poll_changed) - sockets_prepare(loop); - - loop->poll_active = 1; - pthread_mutex_unlock(&loop->mutex); - - try: - rv = poll(loop->poll_fd.data, loop->poll_fd.used, timeout); - if (rv < 0) - { - if (errno == EINTR || errno == EAGAIN) - goto try; - die("poll: %m"); - } - - pthread_mutex_lock(&loop->mutex); - loop->poll_active = 0; - - if (loop->close_scheduled) - sockets_close_fds(loop); - - if (loop->stop_called) - break; - - if (rv) - sockets_fire(loop); - - timers_fire(&loop->time); - } - - loop->stop_called = 0; - pthread_mutex_unlock(&loop->mutex); - - return NULL; -} - - diff --git a/proto/bfd/io.h b/proto/bfd/io.h deleted file mode 100644 index ec706e9a..00000000 --- a/proto/bfd/io.h +++ /dev/null @@ -1,34 +0,0 @@ -/* - * BIRD -- I/O and event loop - * - * Can be freely distributed and used under the terms of the GNU GPL. - */ - -#ifndef _BIRD_BFD_IO_H_ -#define _BIRD_BFD_IO_H_ - -#include "nest/bird.h" -#include "lib/lists.h" -#include "lib/resource.h" -#include "lib/event.h" -#include "lib/timer.h" -#include "lib/socket.h" - - -void ev2_schedule(event *e); - -void sk_start(sock *s); -void sk_stop(sock *s); - -struct birdloop *birdloop_new(void); -void birdloop_start(struct birdloop *loop); -void birdloop_stop(struct birdloop *loop); -void birdloop_free(struct birdloop *loop); - -void birdloop_enter(struct birdloop *loop); -void birdloop_leave(struct birdloop *loop); -void birdloop_mask_wakeups(struct birdloop *loop); -void birdloop_unmask_wakeups(struct birdloop *loop); - - -#endif /* _BIRD_BFD_IO_H_ */ diff --git a/proto/bfd/packets.c b/proto/bfd/packets.c index 5f10734c..6f0b4eaf 100644 --- a/proto/bfd/packets.c +++ b/proto/bfd/packets.c @@ -412,7 +412,7 @@ bfd_err_hook(sock *sk, int err) sock * bfd_open_rx_sk(struct bfd_proto *p, int multihop, int af) { - sock *sk = sk_new(p->tpool); + sock *sk = sk_new(p->p.pool); sk->type = SK_UDP; sk->subtype = af; sk->sport = !multihop ? BFD_CONTROL_PORT : BFD_MULTI_CTL_PORT; @@ -475,7 +475,7 @@ bfd_open_rx_sk_bound(struct bfd_proto *p, ip_addr local, struct iface *ifa) sock * bfd_open_tx_sk(struct bfd_proto *p, ip_addr local, struct iface *ifa) { - sock *sk = sk_new(p->tpool); + sock *sk = sk_new(p->p.pool); sk->type = SK_UDP; sk->saddr = local; sk->dport = ifa ? BFD_CONTROL_PORT : BFD_MULTI_CTL_PORT; diff --git a/proto/bgp/Makefile b/proto/bgp/Makefile index 2a4cc99c..f6a38678 100644 --- a/proto/bgp/Makefile +++ b/proto/bgp/Makefile @@ -2,6 +2,5 @@ src := attrs.c bgp.c packets.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,bgp_build) tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 084c9b63..bfdd9ac5 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -939,6 +939,18 @@ bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint fla bgp_set_attr_ptr(to, BA_LARGE_COMMUNITY, flags, ad); } + +static void +bgp_decode_otc(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to) +{ + if (len != 4) + WITHDRAW(BAD_LENGTH, "OTC", len); + + u32 val = get_u32(data); + bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, flags, val); +} + + static void bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a) { @@ -1175,6 +1187,13 @@ static union bgp_attr_desc bgp_attr_table[BGP_ATTR_MAX] = { .encode = bgp_encode_u32s, .decode = bgp_decode_large_community, }, + [BA_ONLY_TO_CUSTOMER] = { + .name = "otc", + .type = T_INT, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_u32, + .decode = bgp_decode_otc, + }, [BA_MPLS_LABEL_STACK] = { .name = "bgp_mpls_label_stack", .type = T_CLIST, @@ -1504,6 +1523,29 @@ bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to) REPORT("Discarding AIGP attribute received on non-AIGP session"); bgp_unset_attr(to, BA_AIGP); } + + /* Handle OTC ingress procedure, RFC 9234 */ + if (bgp_channel_is_role_applicable(s->channel)) + { + struct bgp_proto *p = s->proto; + eattr *e = bgp_find_attr(*to, BA_ONLY_TO_CUSTOMER); + + /* Reject routes from downstream if they are leaked */ + if (e && (p->cf->local_role == BGP_ROLE_PROVIDER || + p->cf->local_role == BGP_ROLE_RS_SERVER)) + WITHDRAW("Route leak detected - OTC attribute from downstream"); + + /* Reject routes from peers if they are leaked */ + if (e && (p->cf->local_role == BGP_ROLE_PEER) && (e->u.data != p->cf->remote_as)) + WITHDRAW("Route leak detected - OTC attribute with mismatched ASN (%u)", + (uint) e->u.data); + + /* Mark routes from upstream if it did not happened before */ + if (!e && (p->cf->local_role == BGP_ROLE_CUSTOMER || + p->cf->local_role == BGP_ROLE_PEER || + p->cf->local_role == BGP_ROLE_RS_CLIENT)) + bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, 0, p->cf->remote_as); + } } @@ -1522,8 +1564,8 @@ bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to) HASH_DEFINE_REHASH_FN(RBH, struct bgp_bucket) -void -bgp_init_bucket_table(struct bgp_channel *c) +static void +bgp_init_bucket_table(struct bgp_pending_tx *c) { HASH_INIT(c->bucket_hash, c->pool, 8); @@ -1531,24 +1573,8 @@ bgp_init_bucket_table(struct bgp_channel *c) c->withdraw_bucket = NULL; } -void -bgp_free_bucket_table(struct bgp_channel *c) -{ - HASH_FREE(c->bucket_hash); - - struct bgp_bucket *b; - WALK_LIST_FIRST(b, c->bucket_queue) - { - rem_node(&b->send_node); - mb_free(b); - } - - mb_free(c->withdraw_bucket); - c->withdraw_bucket = NULL; -} - static struct bgp_bucket * -bgp_get_bucket(struct bgp_channel *c, ea_list *new) +bgp_get_bucket(struct bgp_pending_tx *c, ea_list *new) { /* Hash and lookup */ u32 hash = ea_hash(new); @@ -1577,7 +1603,7 @@ bgp_get_bucket(struct bgp_channel *c, ea_list *new) } static struct bgp_bucket * -bgp_get_withdraw_bucket(struct bgp_channel *c) +bgp_get_withdraw_bucket(struct bgp_pending_tx *c) { if (!c->withdraw_bucket) { @@ -1589,15 +1615,17 @@ bgp_get_withdraw_bucket(struct bgp_channel *c) } static void -bgp_free_bucket_xx(struct bgp_channel *c, struct bgp_bucket *b) +bgp_free_bucket(struct bgp_pending_tx *c, struct bgp_bucket *b) { HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b); mb_free(b); } int -bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_done_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { + struct bgp_pending_tx *c = bc->ptx; + /* Won't free the withdraw bucket */ if (b == c->withdraw_bucket) return 0; @@ -1608,21 +1636,23 @@ bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b) if (b->px_uc || !EMPTY_LIST(b->prefixes)) return 0; - bgp_free_bucket_xx(c, b); + bgp_free_bucket(c, b); return 1; } void -bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_defer_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { + struct bgp_pending_tx *c = bc->ptx; rem_node(&b->send_node); add_tail(&c->bucket_queue, &b->send_node); } void -bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_withdraw_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { - struct bgp_proto *p = (void *) c->c.proto; + struct bgp_proto *p = (void *) bc->c.proto; + struct bgp_pending_tx *c = bc->ptx; struct bgp_bucket *wb = bgp_get_withdraw_bucket(c); log(L_ERR "%s: Attribute list too long", p->p.name); @@ -1643,7 +1673,7 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) #define PXH_KEY(px) px->net, px->path_id, px->hash #define PXH_NEXT(px) px->next -#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (c->add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2) +#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2) #define PXH_FN(n,i,h) h #define PXH_REHASH bgp_pxh_rehash @@ -1652,28 +1682,21 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix) -void -bgp_init_prefix_table(struct bgp_channel *c) +static void +bgp_init_prefix_table(struct bgp_channel *bc) { + struct bgp_pending_tx *c = bc->ptx; HASH_INIT(c->prefix_hash, c->pool, 8); - uint alen = net_addr_length[c->c.net_type]; + uint alen = net_addr_length[bc->c.net_type]; c->prefix_slab = alen ? sl_new(c->pool, sizeof(struct bgp_prefix) + alen) : NULL; } -void -bgp_free_prefix_table(struct bgp_channel *c) -{ - HASH_FREE(c->prefix_hash); - - rfree(c->prefix_slab); - c->prefix_slab = NULL; -} - static struct bgp_prefix * -bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id) +bgp_get_prefix(struct bgp_pending_tx *c, const net_addr *net, struct rte_src *src, int add_path_tx) { - u32 path_id_hash = c->add_path_tx ? path_id : 0; + u32 path_id = src->global_id; + u32 path_id_hash = add_path_tx ? path_id : 0; /* We must use a different hash function than the rtable */ u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id_hash)); struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id_hash, hash); @@ -1690,21 +1713,23 @@ bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id) px->hash = hash; px->path_id = path_id; net_copy(px->net, net); + rt_lock_source(src); HASH_INSERT2(c->prefix_hash, PXH, c->pool, px); return px; } -static void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px); +static void bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px); static inline int bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *b) { +#define IS_WITHDRAW_BUCKET(b) ((b) == c->ptx->withdraw_bucket) #define BPX_TRACE(what) do { \ if (c->c.debug & D_ROUTES) log(L_TRACE "%s.%s < %s %N %uG %s", \ c->c.proto->name, c->c.name, what, \ - px->net, px->path_id, (b == c->withdraw_bucket) ? "withdraw" : "update"); } while (0) + px->net, px->path_id, IS_WITHDRAW_BUCKET(b) ? "withdraw" : "update"); } while (0) px->lastmod = current_time(); /* Already queued for the same bucket */ @@ -1722,7 +1747,7 @@ bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucke } /* The new bucket is the same as we sent before */ - if ((px->last == b) || c->c.out_table && !px->last && (b == c->withdraw_bucket)) + if ((px->last == b) || c->c.out_table && !px->last && IS_WITHDRAW_BUCKET(b)) { if (px->cur) BPX_TRACE("reverted"); @@ -1731,15 +1756,15 @@ bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucke /* Well, we haven't sent anything yet */ if (!px->last) - bgp_free_prefix(c, px); + bgp_free_prefix(c->ptx, px); px->cur = NULL; return 0; } /* Enqueue the bucket if it has been empty */ - if ((b != c->withdraw_bucket) && EMPTY_LIST(b->prefixes)) - add_tail(&c->bucket_queue, &b->send_node); + if (!IS_WITHDRAW_BUCKET(b) && EMPTY_LIST(b->prefixes)) + add_tail(&c->ptx->bucket_queue, &b->send_node); /* Enqueue to the new bucket and indicate the change */ add_tail(&b->prefixes, &px->buck_node_xx); @@ -1752,10 +1777,12 @@ bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucke } static void -bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px) +bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px) { HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px); + rt_unlock_source(rt_find_source_global(px->path_id)); + if (c->prefix_slab) sl_free(px); else @@ -1780,7 +1807,7 @@ bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket px->last->px_uc--; /* Ref the current sent version */ - if (buck != c->withdraw_bucket) + if (!IS_WITHDRAW_BUCKET(buck)) { px->last = buck; px->last->px_uc++; @@ -1790,7 +1817,49 @@ bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket /* Prefixes belonging to the withdraw bucket are freed always */ } - bgp_free_prefix(c, px); + bgp_free_prefix(c->ptx, px); +} + +static void +bgp_pending_tx_rfree(resource *r) +{ + struct bgp_pending_tx *ptx = SKIP_BACK(struct bgp_pending_tx, r, r); + + HASH_WALK(ptx->prefix_hash, next, n) + rt_unlock_source(rt_find_source_global(n->path_id)); + HASH_WALK_END; +} + +static void bgp_pending_tx_dump(resource *r UNUSED) { debug("\n"); } + +static struct resclass bgp_pending_tx_class = { + .name = "BGP Pending TX", + .size = sizeof(struct bgp_pending_tx), + .free = bgp_pending_tx_rfree, + .dump = bgp_pending_tx_dump, +}; + +void +bgp_init_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(!c->ptx); + + pool *p = rp_new(c->pool, "BGP Pending TX"); + c->ptx = ralloc(p, &bgp_pending_tx_class); + c->ptx->pool = p; + + bgp_init_bucket_table(c->ptx); + bgp_init_prefix_table(c); +} + +void +bgp_free_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(c->ptx); + ASSERT_DIE(c->ptx->pool); + + rfree(c->ptx->pool); + c->ptx = NULL; } @@ -1798,24 +1867,30 @@ bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket * Prefix hash table exporter */ +struct bgp_out_export_hook { + struct rt_export_hook h; + u32 hash_iter; /* Iterator over hash */ +}; + static void bgp_out_table_feed(void *data) { - struct rt_export_hook *hook = data; - struct bgp_channel *c = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->table); + struct bgp_out_export_hook *hook = data; + struct bgp_channel *bc = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->h.table); + struct bgp_pending_tx *c = bc->ptx; int max = 512; - const net_addr *neq = (hook->req->addr_mode == TE_ADDR_EQUAL) ? hook->req->addr : NULL; + const net_addr *neq = (hook->h.req->addr_mode == TE_ADDR_EQUAL) ? hook->h.req->addr : NULL; const net_addr *cand = NULL; do { HASH_WALK_ITER(c->prefix_hash, PXH, n, hook->hash_iter) { - switch (hook->req->addr_mode) + switch (hook->h.req->addr_mode) { case TE_ADDR_IN: - if (!net_in_netX(n->net, hook->req->addr)) + if (!net_in_netX(n->net, hook->h.req->addr)) continue; /* fall through */ case TE_ADDR_NONE: @@ -1827,7 +1902,7 @@ bgp_out_table_feed(void *data) case TE_ADDR_FOR: if (!neq) { - if (net_in_netX(hook->req->addr, n->net) && (!cand || (n->net->length > cand->length))) + if (net_in_netX(hook->h.req->addr, n->net) && (!cand || (n->net->length > cand->length))) cand = n->net; continue; } @@ -1872,13 +1947,13 @@ bgp_out_table_feed(void *data) .new = &es, .new_best = &es, }; - if (hook->req->export_bulk) + if (hook->h.req->export_bulk) { rte *feed = &es.rte; - hook->req->export_bulk(hook->req, n->net, &rpe, &feed, 1); + hook->h.req->export_bulk(hook->h.req, n->net, &rpe, &feed, 1); } - else if (hook->req->export_one) - hook->req->export_one(hook->req, n->net, &rpe); + else if (hook->h.req->export_one) + hook->h.req->export_one(hook->h.req, n->net, &rpe); else bug("No export method in export request"); } @@ -1889,36 +1964,51 @@ bgp_out_table_feed(void *data) } while (neq); if (hook->hash_iter) - ev_schedule_work(hook->event); + ev_schedule_work(&hook->h.event); else - rt_set_export_state(hook, TES_READY); + rt_set_export_state(&hook->h, TES_READY); +} + +static void +bgp_out_table_export_start(struct rt_exporter *re, struct rt_export_request *req) +{ + req->hook = rt_alloc_export(re, sizeof(struct bgp_out_export_hook)); + req->hook->req = req; + + struct bgp_out_export_hook *hook = SKIP_BACK(struct bgp_out_export_hook, h, req->hook); + + hook->h.event.hook = bgp_out_table_feed; + rt_init_export(re, req->hook); } -static struct rt_export_hook * -bgp_out_table_export_start(struct rt_exporter *re, struct rt_export_request *req UNUSED) +static void +bgp_out_table_export_done(void *data) { - struct bgp_channel *c = SKIP_BACK(struct bgp_channel, prefix_exporter, re); - pool *p = rp_new(c->c.proto->pool, "Export hook"); - struct rt_export_hook *hook = mb_allocz(p, sizeof(struct rt_export_hook)); - hook->pool = p; - hook->lp = lp_new_default(p); - hook->event = ev_new_init(p, bgp_out_table_feed, hook); - hook->feed_type = TFT_HASH; + struct bgp_out_export_hook *hook = data; + struct rt_export_request *req = hook->h.req; + void (*stopped)(struct rt_export_request *) = hook->h.stopped; - return hook; + rt_export_stopped(&hook->h); + CALL(stopped, req); } +static const struct rt_exporter_class bgp_out_table_export_class = { + .start = bgp_out_table_export_start, + .done = bgp_out_table_export_done, +}; + void bgp_setup_out_table(struct bgp_channel *c) { ASSERT_DIE(c->c.out_table == NULL); c->prefix_exporter = (struct rt_exporter) { + .class = &bgp_out_table_export_class, .addr_type = c->c.table->addr_type, - .start = bgp_out_table_export_start, + .rp = c->c.proto->pool, }; - init_list(&c->prefix_exporter.hooks); + rt_exporter_init(&c->prefix_exporter); c->c.out_table = &c->prefix_exporter; } @@ -1929,11 +2019,11 @@ bgp_setup_out_table(struct bgp_channel *c) */ int -bgp_preexport(struct channel *c, rte *e) +bgp_preexport(struct channel *C, rte *e) { - struct proto *SRC = e->src->proto; - struct bgp_proto *p = (struct bgp_proto *) (c->proto); - struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (struct bgp_proto *) SRC : NULL; + struct bgp_proto *p = (struct bgp_proto *) C->proto; + struct bgp_proto *src = bgp_rte_proto(e); + struct bgp_channel *c = (struct bgp_channel *) C; /* Reject our routes */ if (src == p) @@ -1973,11 +2063,11 @@ bgp_preexport(struct channel *c, rte *e) } /* Handle well-known communities, RFC 1997 */ - struct eattr *com; + struct eattr *a; if (p->cf->interpret_communities && - (com = ea_find(e->attrs, BGP_EA_ID(BA_COMMUNITY)))) + (a = bgp_find_attr(e->attrs, BA_COMMUNITY))) { - const struct adata *d = com->u.ptr; + const struct adata *d = a->u.ptr; /* Do not export anywhere */ if (int_set_contains(d, BGP_COMM_NO_ADVERTISE)) @@ -1996,14 +2086,23 @@ bgp_preexport(struct channel *c, rte *e) return -1; } + /* Do not export routes marked with OTC to upstream, RFC 9234 */ + if (bgp_channel_is_role_applicable(c)) + { + a = bgp_find_attr(e->attrs, BA_ONLY_TO_CUSTOMER); + if (a && (p->cf->local_role==BGP_ROLE_CUSTOMER || + p->cf->local_role==BGP_ROLE_PEER || + p->cf->local_role==BGP_ROLE_RS_CLIENT)) + return -1; + } + return 0; } static ea_list * bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *attrs0, struct linpool *pool) { - struct proto *SRC = e->src->proto; - struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (void *) SRC : NULL; + struct bgp_proto *src = bgp_rte_proto(e); struct bgp_export_state s = { .proto = p, .channel = c, .pool = pool, .src = src, .route = e, .mpls = c->desc->mpls }; ea_list *attrs = attrs0; eattr *a; @@ -2105,6 +2204,16 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at } } + /* Mark routes for downstream with OTC, RFC 9234 */ + if (bgp_channel_is_role_applicable(c)) + { + a = bgp_find_attr(attrs, BA_ONLY_TO_CUSTOMER); + if (!a && (p->cf->local_role == BGP_ROLE_PROVIDER || + p->cf->local_role == BGP_ROLE_PEER || + p->cf->local_role == BGP_ROLE_RS_SERVER)) + bgp_set_attr_u32(&attrs, BA_ONLY_TO_CUSTOMER, 0, p->public_as); + } + /* * Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above * conditions. Presence and validity of quasi-mandatory NEXT_HOP attribute @@ -2121,7 +2230,7 @@ bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, c struct bgp_proto *p = (void *) P; struct bgp_channel *c = (void *) C; struct bgp_bucket *buck; - u32 path; + struct rte_src *path; if (new) { @@ -2132,16 +2241,16 @@ bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, c log(L_ERR "%s: Invalid route %N withdrawn", p->p.name, n); /* If attributes are invalid, we fail back to withdraw */ - buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c); - path = new->src->global_id; + buck = attrs ? bgp_get_bucket(c->ptx, attrs) : bgp_get_withdraw_bucket(c->ptx); + path = new->src; } else { - buck = bgp_get_withdraw_bucket(c); - path = old->src->global_id; + buck = bgp_get_withdraw_bucket(c->ptx); + path = old->src; } - if (bgp_update_prefix(c, bgp_get_prefix(c, n, path), buck)) + if (bgp_update_prefix(c, bgp_get_prefix(c->ptx, n, path, c->add_path_tx), buck)) bgp_schedule_packet(p->conn, c, PKT_UPDATE); } @@ -2156,7 +2265,7 @@ bgp_get_neighbor(rte *r) return as; /* If AS_PATH is not defined, we treat rte as locally originated */ - struct bgp_proto *p = (void *) r->src->proto; + struct bgp_proto *p = bgp_rte_proto(r); return p->cf->confederation ?: p->local_as; } @@ -2186,8 +2295,8 @@ rte_stale(rte *r) int bgp_rte_better(rte *new, rte *old) { - struct bgp_proto *new_bgp = (struct bgp_proto *) new->src->proto; - struct bgp_proto *old_bgp = (struct bgp_proto *) old->src->proto; + struct bgp_proto *new_bgp = bgp_rte_proto(new); + struct bgp_proto *old_bgp = bgp_rte_proto(old); eattr *x, *y; u32 n, o; @@ -2331,8 +2440,8 @@ bgp_rte_better(rte *new, rte *old) int bgp_rte_mergable(rte *pri, rte *sec) { - struct bgp_proto *pri_bgp = (struct bgp_proto *) pri->src->proto; - struct bgp_proto *sec_bgp = (struct bgp_proto *) sec->src->proto; + struct bgp_proto *pri_bgp = bgp_rte_proto(pri); + struct bgp_proto *sec_bgp = bgp_rte_proto(sec); eattr *x, *y; u32 p, s; @@ -2416,12 +2525,12 @@ same_group(rte *r, u32 lpref, u32 lasn) static inline int use_deterministic_med(struct rte_storage *r) { - struct proto *P = r->rte.src->proto; - return (P->proto == &proto_bgp) && ((struct bgp_proto *) P)->cf->deterministic_med; + struct bgp_proto *p = bgp_rte_proto(&r->rte); + return p && p->cf->deterministic_med; } int -bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) +bgp_rte_recalculate(struct rtable_private *table, net *net, rte *new, rte *old, rte *old_best) { rte *key = new ? new : old; u32 lpref = rt_get_preference(key); @@ -2546,27 +2655,57 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) return !old_suppressed; } -rte * -bgp_rte_modify_stale(struct rte *r, struct linpool *pool) +void +bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) { - eattr *ea = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY)); - const struct adata *ad = ea ? ea->u.ptr : NULL; - uint flags = ea ? ea->flags : BAF_PARTIAL; + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + struct rt_import_hook *irh = c->c.in_req.hook; - if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR)) - return NULL; + /* Find our routes among others */ + for (uint i=0; i<count; i++) + { + rte *r = feed[i]; + + if ( + !rte_is_valid(r) || /* Not a valid route */ + (r->sender != irh) || /* Not our route */ + (r->stale_cycle == irh->stale_set)) /* A new route, do not mark as stale */ + continue; + + eattr *ea = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY)); + const struct adata *ad = ea ? ea->u.ptr : NULL; + uint flags = ea ? ea->flags : BAF_PARTIAL; - if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) - return r; + /* LLGR not allowed, withdraw the route */ + if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR)) + { + rte_import(&c->c.in_req, n, NULL, r->src); + continue; + } + + /* Route already marked as LLGR, do nothing */ + if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) + continue; - _Thread_local static rte e0; - e0 = *r; + /* Store the tmp_linpool state to aggresively save memory */ + struct lp_state tmpp; + lp_save(tmp_linpool, &tmpp); - bgp_set_attr_ptr(&e0.attrs, BA_COMMUNITY, flags, - int_set_add(pool, ad, BGP_COMM_LLGR_STALE)); - e0.pflags |= BGP_REF_STALE; + /* Mark the route as LLGR */ + rte e0 = *r; + bgp_set_attr_ptr(&e0.attrs, BA_COMMUNITY, flags, int_set_add(tmp_linpool, ad, BGP_COMM_LLGR_STALE)); + e0.pflags &= ~BGP_REF_NOT_STALE; + e0.pflags |= BGP_REF_STALE; - return &e0; + /* We need to update the route but keep it stale. */ + ASSERT_DIE(irh->stale_set == irh->stale_valid + 1); + irh->stale_set--; + rte_import(&c->c.in_req, n, &e0, r->src); + irh->stale_set++; + + /* Restore the memory state */ + lp_restore(tmp_linpool, &tmpp); + } } diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 6ffe8824..573e3d25 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -102,6 +102,7 @@ * RFC 8212 - Default EBGP Route Propagation Behavior without Policies * RFC 8654 - Extended Message Support for BGP * RFC 9117 - Revised Validation Procedure for BGP Flow Specifications + * RFC 9234 - Route Leak Prevention and Detection Using Roles * draft-ietf-idr-ext-opt-param-07 * draft-uttaro-idr-bgp-persistence-04 * draft-walton-bgp-hostname-capability-02 @@ -139,6 +140,9 @@ static void bgp_update_bfd(struct bgp_proto *p, const struct bfd_options *bfd); static int bgp_incoming_connection(sock *sk, uint dummy UNUSED); static void bgp_listen_sock_err(sock *sk UNUSED, int err); +static void bgp_graceful_restart_feed(struct bgp_channel *c); + + /** * bgp_open - open a BGP instance * @p: BGP instance @@ -373,6 +377,7 @@ bgp_close_conn(struct bgp_conn *conn) conn->keepalive_timer = NULL; rfree(conn->hold_timer); conn->hold_timer = NULL; + rfree(conn->tx_ev); conn->tx_ev = NULL; rfree(conn->sk); @@ -511,8 +516,15 @@ void bgp_stop(struct bgp_proto *p, int subcode, byte *data, uint len) { proto_notify_state(&p->p, PS_STOP); + p->uncork_ev->data = NULL; bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len); bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len); + + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + if (c->ptx) + bgp_free_pending_tx(c); + ev_schedule(p->event); } @@ -760,32 +772,30 @@ bgp_handle_graceful_restart(struct bgp_proto *p) { case BGP_GRS_NONE: c->gr_active = BGP_GRS_ACTIVE; - rt_refresh_begin(c->c.table, &c->c.in_req); + rt_refresh_begin(&c->c.in_req); break; case BGP_GRS_ACTIVE: - rt_refresh_end(c->c.table, &c->c.in_req); - rt_refresh_begin(c->c.table, &c->c.in_req); + rt_refresh_end(&c->c.in_req); + rt_refresh_begin(&c->c.in_req); break; case BGP_GRS_LLGR: - rt_refresh_begin(c->c.table, &c->c.in_req); - rt_modify_stale(c->c.table, &c->c.in_req); + rt_refresh_begin(&c->c.in_req); + bgp_graceful_restart_feed(c); break; } } else { /* Just flush the routes */ - rt_refresh_begin(c->c.table, &c->c.in_req); - rt_refresh_end(c->c.table, &c->c.in_req); + rt_refresh_begin(&c->c.in_req); + rt_refresh_end(&c->c.in_req); } /* Reset bucket and prefix tables */ - bgp_free_bucket_table(c); - bgp_free_prefix_table(c); - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); + bgp_free_pending_tx(c); + bgp_init_pending_tx(c); c->packets_to_send = 0; } @@ -796,6 +806,53 @@ bgp_handle_graceful_restart(struct bgp_proto *p) tm_start(p->gr_timer, p->conn->remote_caps->gr_time S); } +static void +bgp_graceful_restart_feed_done(struct rt_export_request *req) +{ + req->hook = NULL; +} + +static void +bgp_graceful_restart_feed_dump_req(struct rt_export_request *req) +{ + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + debug(" BGP-GR %s.%s export request %p\n", c->c.proto->name, c->c.name, req); +} + +static void +bgp_graceful_restart_feed_log_state_change(struct rt_export_request *req, u8 state) +{ + struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + struct bgp_proto *p = (void *) c->c.proto; + BGP_TRACE(D_EVENTS, "Long-lived graceful restart export state changed to %s", rt_export_state_name(state)); + + if (state == TES_READY) + rt_stop_export(req, bgp_graceful_restart_feed_done); +} + +static void +bgp_graceful_restart_drop_export(struct rt_export_request *req UNUSED, const net_addr *n UNUSED, struct rt_pending_export *rpe UNUSED) +{ /* Nothing to do */ } + +static void +bgp_graceful_restart_feed(struct bgp_channel *c) +{ + c->stale_feed = (struct rt_export_request) { + .name = "BGP-GR", + .list = &global_work_list, + .trace_routes = c->c.debug | c->c.proto->debug, + .dump_req = bgp_graceful_restart_feed_dump_req, + .log_state_change = bgp_graceful_restart_feed_log_state_change, + .export_bulk = bgp_rte_modify_stale, + .export_one = bgp_graceful_restart_drop_export, + }; + + rt_request_export(c->c.table, &c->stale_feed); +} + + + + /** * bgp_graceful_restart_done - finish active BGP graceful restart * @c: BGP channel @@ -819,7 +876,7 @@ bgp_graceful_restart_done(struct bgp_channel *c) BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); tm_stop(c->stale_timer); - rt_refresh_end(c->c.table, &c->c.in_req); + rt_refresh_end(&c->c.in_req); } /** @@ -861,7 +918,7 @@ bgp_graceful_restart_timeout(timer *t) /* Channel is in GR, and supports LLGR -> start LLGR */ c->gr_active = BGP_GRS_LLGR; tm_start(c->stale_timer, c->stale_time S); - rt_modify_stale(c->c.table, &c->c.in_req); + bgp_graceful_restart_feed(c); } } else @@ -899,7 +956,7 @@ bgp_refresh_begin(struct bgp_channel *c) { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } c->load_state = BFS_REFRESHING; - rt_refresh_begin(c->c.table, &c->c.in_req); + rt_refresh_begin(&c->c.in_req); } /** @@ -920,7 +977,7 @@ bgp_refresh_end(struct bgp_channel *c) { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } c->load_state = BFS_NONE; - rt_refresh_end(c->c.table, &c->c.in_req); + rt_refresh_end(&c->c.in_req); } @@ -1526,6 +1583,8 @@ bgp_start(struct proto *P) p->last_rx_update = 0; p->event = ev_new_init(p->p.pool, bgp_decision, p); + p->uncork_ev = ev_new_init(p->p.pool, bgp_uncork, p); + p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0); p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0); @@ -1656,6 +1715,13 @@ done: return p->p.proto_state; } +struct rte_owner_class bgp_rte_owner_class = { + .get_route_info = bgp_get_route_info, + .rte_better = bgp_rte_better, + .rte_mergable = bgp_rte_mergable, + .rte_igp_metric = bgp_rte_igp_metric, +}; + static struct proto * bgp_init(struct proto_config *CF) { @@ -1669,11 +1735,9 @@ bgp_init(struct proto_config *CF) P->reload_routes = bgp_reload_routes; P->feed_begin = bgp_feed_begin; P->feed_end = bgp_feed_end; - P->rte_better = bgp_rte_better; - P->rte_mergable = bgp_rte_mergable; - P->rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL; - P->rte_modify = bgp_rte_modify_stale; - P->rte_igp_metric = bgp_rte_igp_metric; + + P->sources.class = &bgp_rte_owner_class; + P->sources.rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL; p->cf = cf; p->is_internal = (cf->local_as == cf->remote_as); @@ -1747,8 +1811,7 @@ bgp_channel_start(struct channel *C) if (c->cf->export_table) bgp_setup_out_table(c); - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); + bgp_init_pending_tx(c); c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0); @@ -1871,7 +1934,7 @@ bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 return cc2->c.table; /* Last, try default table of given type */ - if (tab = cf->c.global->def_tables[type]) + if (tab = rt_get_default_table(cf->c.global, type)) return tab; cf_error("Undefined IGP table"); @@ -1890,7 +1953,7 @@ bgp_default_base_table(struct bgp_config *cf, struct bgp_channel_config *cc) return cc2->c.table; /* Last, try default table of given type */ - struct rtable_config *tab = cf->c.global->def_tables[type]; + struct rtable_config *tab = rt_get_default_table(cf->c.global, type); if (tab) return tab; @@ -1958,6 +2021,15 @@ bgp_postconfig(struct proto_config *CF) if (internal && cf->rs_client) cf_error("Only external neighbor can be RS client"); + if (internal && (cf->local_role != BGP_ROLE_UNDEFINED)) + cf_error("Local role cannot be set on IBGP sessions"); + + if (interior && (cf->local_role != BGP_ROLE_UNDEFINED)) + log(L_WARN "BGP roles are not recommended to be used within AS confederations"); + + if (cf->require_roles && (cf->local_role == BGP_ROLE_UNDEFINED)) + cf_error("Local role must be set if roles are required"); + if (!cf->confederation && cf->confederation_member) cf_error("Confederation ID must be set for member sessions"); @@ -2320,6 +2392,15 @@ bgp_show_afis(int code, char *s, u32 *afis, uint count) cli_msg(code, b.start); } +static const char * +bgp_format_role_name(u8 role) +{ + static const char *bgp_role_names[] = { "provider", "rs_server", "rs_client", "customer", "peer" }; + if (role == BGP_ROLE_UNDEFINED) return "undefined"; + if (role < ARRAY_SIZE(bgp_role_names)) return bgp_role_names[role]; + return "?"; +} + static void bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) { @@ -2448,6 +2529,9 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) if (caps->hostname) cli_msg(-1006, " Hostname: %s", caps->hostname); + + if (caps->role != BGP_ROLE_UNDEFINED) + cli_msg(-1006, " Role: %s", bgp_format_role_name(caps->role)); } static void @@ -2585,7 +2669,6 @@ struct protocol proto_bgp = { .reconfigure = bgp_reconfigure, .copy_config = bgp_copy_config, .get_status = bgp_get_status, - .get_route_info = bgp_get_route_info, .show_proto_info = bgp_show_proto_info }; diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 003893e0..1bcfb915 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -113,6 +113,8 @@ struct bgp_config { int gr_mode; /* Graceful restart mode (BGP_GR_*) */ int llgr_mode; /* Long-lived graceful restart mode (BGP_LLGR_*) */ int setkey; /* Set MD5 password to system SA/SP database */ + u8 local_role; /* Set peering role with neighbor [RFC 9234] */ + int require_roles; /* Require configured roles on both sides */ /* Times below are in seconds */ unsigned gr_time; /* Graceful restart timeout */ unsigned llgr_time; /* Long-lived graceful restart stale time */ @@ -166,6 +168,13 @@ struct bgp_channel_config { #define BGP_PT_INTERNAL 1 #define BGP_PT_EXTERNAL 2 +#define BGP_ROLE_UNDEFINED 255 +#define BGP_ROLE_PROVIDER 0 +#define BGP_ROLE_RS_SERVER 1 +#define BGP_ROLE_RS_CLIENT 2 +#define BGP_ROLE_CUSTOMER 3 +#define BGP_ROLE_PEER 4 + #define NH_NO 0 #define NH_ALL 1 #define NH_IBGP 2 @@ -226,6 +235,7 @@ struct bgp_caps { u8 ext_messages; /* Extended message length, RFC draft */ u8 route_refresh; /* Route refresh capability, RFC 2918 */ u8 enhanced_refresh; /* Enhanced route refresh, RFC 7313 */ + u8 role; /* BGP role capability, RFC 9234 */ u8 gr_aware; /* Graceful restart capability, RFC 4724 */ u8 gr_flags; /* Graceful restart flags */ @@ -319,6 +329,7 @@ struct bgp_proto { struct bgp_socket *sock; /* Shared listening socket */ struct bfd_request *bfd_req; /* BFD request, if BFD is used */ struct birdsock *postponed_sk; /* Postponed incoming socket for dynamic BGP */ + event *uncork_ev; /* Uncork event in case of congestion */ struct bgp_stats stats; /* BGP statistics */ btime last_established; /* Last time of enter/leave of established state */ btime last_rx_update; /* Last time of RX update */ @@ -350,14 +361,8 @@ struct bgp_channel { /* Rest are zeroed when down */ pool *pool; - HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ - struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ - list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ - - HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ - slab *prefix_slab; /* Slab holding prefix nodes */ - - struct rt_exporter prefix_exporter; /* Table-like exporter for prefix_hash */ + struct bgp_pending_tx *ptx; /* Routes waiting to be sent */ + struct rt_exporter prefix_exporter; /* Table-like exporter for ptx */ ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */ ip_addr link_addr; /* Link-local version of next_hop_addr */ @@ -371,6 +376,7 @@ struct bgp_channel { timer *stale_timer; /* Long-lived stale timer for LLGR */ u32 stale_time; /* Stored LLGR stale time from last session */ + struct rt_export_request stale_feed; /* Feeder request for stale route modification */ u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */ u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */ @@ -399,6 +405,18 @@ struct bgp_bucket { ea_list eattrs[0]; /* Per-bucket extended attributes */ }; +struct bgp_pending_tx { + resource r; + pool *pool; + + HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ + struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ + list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ + + HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ + slab *prefix_slab; /* Slab holding prefix nodes */ +}; + struct bgp_export_state { struct bgp_proto *proto; struct bgp_channel *channel; @@ -492,6 +510,12 @@ static inline int bgp_cc_is_ipv4(struct bgp_channel_config *c) static inline int bgp_cc_is_ipv6(struct bgp_channel_config *c) { return BGP_AFI(c->afi) == BGP_AFI_IPV6; } +static inline int bgp_channel_is_role_applicable(struct bgp_channel *c) +{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); } + +static inline int bgp_cc_is_role_applicable(struct bgp_channel_config *c) +{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); } + static inline uint bgp_max_packet_length(struct bgp_conn *conn) { return conn->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; } @@ -533,6 +557,7 @@ rte_resolvable(const rte *rt) return NEXTHOP_IS_REACHABLE(nhad) || (nhad->dest != RTD_UNREACHABLE); } +extern struct rte_owner_class bgp_rte_owner_class; #ifdef LOCAL_DEBUG #define BGP_FORCE_DEBUG 1 @@ -564,25 +589,30 @@ void bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to); void bgp_setup_out_table(struct bgp_channel *c); -void bgp_init_bucket_table(struct bgp_channel *c); -void bgp_free_bucket_table(struct bgp_channel *c); +void bgp_init_pending_tx(struct bgp_channel *c); +void bgp_free_pending_tx(struct bgp_channel *c); + void bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b); int bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b); -void bgp_init_prefix_table(struct bgp_channel *c); -void bgp_free_prefix_table(struct bgp_channel *c); void bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck); int bgp_rte_better(struct rte *, struct rte *); int bgp_rte_mergable(rte *pri, rte *sec); -int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best); -struct rte *bgp_rte_modify_stale(struct rte *r, struct linpool *pool); +int bgp_rte_recalculate(struct rtable_private *table, net *net, rte *new, rte *old, rte *old_best); +void bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe UNUSED, rte **feed, uint count); u32 bgp_rte_igp_metric(const rte *); void bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, const rte *old); int bgp_preexport(struct channel *, struct rte *); void bgp_get_route_info(struct rte *, byte *); int bgp_total_aigp_metric_(const rte *e, u64 *metric, const struct adata **ad); +static inline struct bgp_proto *bgp_rte_proto(struct rte *rte) +{ + return (rte->src->owner->class == &bgp_rte_owner_class) ? + SKIP_BACK(struct bgp_proto, p.sources, rte->src->owner) : NULL; +} + #define BGP_AIGP_METRIC 1 #define BGP_AIGP_MAX U64(0xffffffffffffffff) @@ -609,6 +639,7 @@ void bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) void bgp_kick_tx(void *vconn); void bgp_tx(struct birdsock *sk); int bgp_rx(struct birdsock *sk, uint size); +void bgp_uncork(void *vp); const char * bgp_error_dsc(unsigned code, unsigned subcode); void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len); @@ -652,6 +683,7 @@ enum bgp_attr_id { BA_AS4_AGGREGATOR = 0x12, /* RFC 6793 */ BA_AIGP = 0x1a, /* RFC 7311 */ BA_LARGE_COMMUNITY = 0x20, /* RFC 8092 */ +#define BA_ONLY_TO_CUSTOMER 0x23 /* RFC 9234 */ /* Bird's private internal BGP attributes */ BA_MPLS_LABEL_STACK = 0x100, /* MPLS label stack transfer attribute */ diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 24f3ec8f..9f0d2306 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -30,7 +30,8 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE, STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG, LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS, DYNAMIC, RANGE, NAME, DIGITS, AIGP, ORIGINATE, COST, ENFORCE, - FIRST, FREE, VALIDATE, BASE) + FIRST, FREE, VALIDATE, BASE, ROLE, ROLES, PEER, PROVIDER, CUSTOMER, + RS_SERVER, RS_CLIENT, REQUIRE, BGP_OTC) %type <i> bgp_nh %type <i32> bgp_afi @@ -39,7 +40,7 @@ CF_KEYWORDS(CEASE, PREFIX, LIMIT, HIT, ADMINISTRATIVE, SHUTDOWN, RESET, PEER, CONFIGURATION, CHANGE, DECONFIGURED, CONNECTION, REJECTED, COLLISION, OUT, OF, RESOURCES) -%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag +%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag bgp_role_name CF_GRAMMAR @@ -73,6 +74,7 @@ bgp_proto_start: proto_start BGP { BGP_CFG->llgr_mode = -1; BGP_CFG->llgr_time = 3600; BGP_CFG->setkey = 1; + BGP_CFG->local_role = BGP_ROLE_UNDEFINED; BGP_CFG->dynamic_name = "dynbgp"; BGP_CFG->check_link = -1; } @@ -115,6 +117,14 @@ bgp_cease_flag: | OUT OF RESOURCES { $$ = 1 << 8; } ; +bgp_role_name: + PEER { $$ = BGP_ROLE_PEER; } + | PROVIDER { $$ = BGP_ROLE_PROVIDER; } + | CUSTOMER { $$ = BGP_ROLE_CUSTOMER; } + | RS_SERVER { $$ = BGP_ROLE_RS_SERVER; } + | RS_CLIENT { $$ = BGP_ROLE_RS_CLIENT; } + ; + bgp_proto: bgp_proto_start proto_name '{' | bgp_proto proto_item ';' @@ -198,6 +208,8 @@ bgp_proto: | bgp_proto BFD GRACEFUL ';' { init_bfd_opts(&BGP_CFG->bfd); BGP_CFG->bfd->mode = BGP_BFD_GRACEFUL; } | bgp_proto BFD { open_bfd_opts(&BGP_CFG->bfd); } bfd_opts { close_bfd_opts(); } ';' | bgp_proto ENFORCE FIRST AS bool ';' { BGP_CFG->enforce_first_as = $5; } + | bgp_proto LOCAL ROLE bgp_role_name ';' { BGP_CFG->local_role = $4; } + | bgp_proto REQUIRE ROLES bool ';' { BGP_CFG->require_roles = $4; } ; bgp_afi: diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index 4d4ae3eb..d4d2d0b0 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -238,6 +238,7 @@ bgp_prepare_capabilities(struct bgp_conn *conn) caps->ext_messages = p->cf->enable_extended_messages; caps->route_refresh = p->cf->enable_refresh; caps->enhanced_refresh = p->cf->enable_refresh; + caps->role = p->cf->local_role; if (caps->as4_support) caps->as4_number = p->public_as; @@ -350,6 +351,13 @@ bgp_write_capabilities(struct bgp_conn *conn, byte *buf) *buf++ = 0; /* Capability data length */ } + if (caps->role != BGP_ROLE_UNDEFINED) + { + *buf++ = 9; /* Capability 9: Announce chosen BGP role */ + *buf++ = 1; /* Capability data length */ + *buf++ = caps->role; + } + if (caps->gr_aware) { *buf++ = 64; /* Capability 64: Support for graceful restart */ @@ -449,11 +457,15 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) struct bgp_proto *p = conn->bgp; struct bgp_caps *caps; struct bgp_af_caps *ac; + uint err_subcode = 0; int i, cl; u32 af; if (!conn->remote_caps) + { caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + sizeof(struct bgp_af_caps)); + caps->role = BGP_ROLE_UNDEFINED; + } else { caps = conn->remote_caps; @@ -513,6 +525,21 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) caps->ext_messages = 1; break; + case 9: /* BGP role capability, RFC 9234 */ + if (cl != 1) + goto err; + + /* Reserved value */ + if (pos[2] == BGP_ROLE_UNDEFINED) + { err_subcode = 11; goto err; } + + /* Multiple inconsistent values */ + if ((caps->role != BGP_ROLE_UNDEFINED) && (caps->role != pos[2])) + { err_subcode = 11; goto err; } + + caps->role = pos[2]; + break; + case 64: /* Graceful restart capability, RFC 4724 */ if (cl % 4 != 2) goto err; @@ -638,7 +665,7 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) err: mb_free(caps); - bgp_error(conn, 2, 0, NULL, 0); + bgp_error(conn, 2, err_subcode, NULL, 0); return -1; } @@ -854,6 +881,22 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) conn->received_as = asn; } + /* RFC 9234 4.2 - check role agreement */ + u8 local_role = p->cf->local_role; + u8 neigh_role = caps->role; + + if ((local_role != BGP_ROLE_UNDEFINED) && + (neigh_role != BGP_ROLE_UNDEFINED) && + !((local_role == BGP_ROLE_PEER && neigh_role == BGP_ROLE_PEER) || + (local_role == BGP_ROLE_CUSTOMER && neigh_role == BGP_ROLE_PROVIDER) || + (local_role == BGP_ROLE_PROVIDER && neigh_role == BGP_ROLE_CUSTOMER) || + (local_role == BGP_ROLE_RS_CLIENT && neigh_role == BGP_ROLE_RS_SERVER) || + (local_role == BGP_ROLE_RS_SERVER && neigh_role == BGP_ROLE_RS_CLIENT))) + { bgp_error(conn, 2, 11, NULL, 0); return; } + + if ((p->cf->require_roles) && (neigh_role == BGP_ROLE_UNDEFINED)) + { bgp_error(conn, 2, 11, NULL, 0); return; } + /* Check the other connection */ other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; switch (other->state) @@ -1372,6 +1415,8 @@ bgp_rte_update(struct bgp_parse_state *s, const net_addr *n, u32 path_id, ea_lis { if (path_id != s->last_id) { + rt_unlock_source(s->last_src); + s->last_src = rt_get_source(&s->proto->p, path_id); s->last_id = path_id; @@ -2167,7 +2212,7 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu * var IPv4 Network Layer Reachability Information */ - ASSERT_DIE(s->channel->withdraw_bucket != buck); + ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck); int lr, la; @@ -2190,7 +2235,7 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu static byte * bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - ASSERT_DIE(s->channel->withdraw_bucket != buck); + ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck); /* * 2 B IPv4 Withdrawn Routes Length (zero) @@ -2330,7 +2375,7 @@ again: ; }; /* Try unreachable bucket */ - if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) + if ((buck = c->ptx->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) { res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ? bgp_create_ip_unreach(&s, buck, buf, end): @@ -2340,9 +2385,9 @@ again: ; } /* Try reachable buckets */ - if (!EMPTY_LIST(c->bucket_queue)) + if (!EMPTY_LIST(c->ptx->bucket_queue)) { - buck = HEAD(c->bucket_queue); + buck = HEAD(c->ptx->bucket_queue); /* Cleanup empty buckets */ if (bgp_done_bucket(c, buck)) @@ -2449,6 +2494,7 @@ bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_lis s->last_id = 0; s->last_src = s->proto->p.main_source; + rt_lock_source(s->last_src); /* * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not @@ -2475,6 +2521,8 @@ bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_lis rta_free(s->cached_ea); s->cached_ea = NULL; + + rt_unlock_source(s->last_src); } static void @@ -2972,6 +3020,7 @@ static struct { { 2, 6, "Unacceptable hold time" }, { 2, 7, "Required capability missing" }, /* [RFC5492] */ { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */ + { 2,11, "Role mismatch" }, /* From Open Policy, RFC 9234 */ { 3, 0, "Invalid UPDATE message" }, { 3, 1, "Malformed attribute list" }, { 3, 2, "Unrecognized well-known attribute" }, @@ -3175,6 +3224,21 @@ bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len) } } +void +bgp_uncork(void *vp) +{ + struct bgp_proto *p = vp; + + if (p && p->conn && (p->conn->state == BS_ESTABLISHED) && !p->conn->sk->rx_hook) + { + struct birdsock *sk = p->conn->sk; + ASSERT_DIE(sk->rpos > sk->rbuf); + sk->rx_hook = bgp_rx; + bgp_rx(sk, sk->rpos - sk->rbuf); + BGP_TRACE(D_PACKETS, "Uncorked"); + } +} + /** * bgp_rx - handle received data * @sk: socket @@ -3189,6 +3253,7 @@ int bgp_rx(sock *sk, uint size) { struct bgp_conn *conn = sk->data; + struct bgp_proto *p = conn->bgp; byte *pkt_start = sk->rbuf; byte *end = pkt_start + size; uint i, len; @@ -3198,6 +3263,12 @@ bgp_rx(sock *sk, uint size) { if ((conn->state == BS_CLOSE) || (conn->sk != sk)) return 0; + if ((conn->state == BS_ESTABLISHED) && rt_cork_check(conn->bgp->uncork_ev)) + { + sk->rx_hook = NULL; + BGP_TRACE(D_PACKETS, "Corked"); + return 0; + } for(i=0; i<16; i++) if (pkt_start[i] != 0xff) { diff --git a/proto/mrt/Makefile b/proto/mrt/Makefile index 000e1c1c..8cd44ac1 100644 --- a/proto/mrt/Makefile +++ b/proto/mrt/Makefile @@ -2,6 +2,5 @@ src := mrt.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,mrt_build) tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/mrt/mrt.c b/proto/mrt/mrt.c index 5ef4cd44..f07f9ca2 100644 --- a/proto/mrt/mrt.c +++ b/proto/mrt/mrt.c @@ -228,7 +228,7 @@ mrt_next_table_(rtable *tab, rtable *tab_ptr, const char *pattern) NODE_VALID(tn); tn = tn->next) { - tab = SKIP_BACK(struct rtable, n, tn); + tab = SKIP_BACK(rtable, n, tn); if (patmatch(pattern, tab->name) && ((tab->addr_type == NET_IP4) || (tab->addr_type == NET_IP6))) return tab; @@ -243,13 +243,15 @@ mrt_next_table(struct mrt_table_dump_state *s) rtable *tab = mrt_next_table_(s->table, s->table_ptr, s->table_expr); if (s->table) - rt_unlock_table(s->table); + RT_LOCKED(s->table, tab) + rt_unlock_table(tab); s->table = tab; s->ipv4 = tab ? (tab->addr_type == NET_IP4) : 0; if (s->table) - rt_lock_table(s->table); + RT_LOCKED(s->table, tab) + rt_lock_table(tab); return s->table; } @@ -472,9 +474,9 @@ mrt_rib_table_entry(struct mrt_table_dump_state *s, rte *r) #ifdef CONFIG_BGP /* Find peer index */ - if (r->src->proto->proto == &proto_bgp) + struct bgp_proto *p = bgp_rte_proto(r); + if (p) { - struct bgp_proto *p = (void *) r->src->proto; struct mrt_peer_entry *n = HASH_FIND(s->peer_hash, PEER, p->remote_id, p->remote_as, p->remote_ip); @@ -573,14 +575,18 @@ mrt_table_dump_init(pool *pp) static void mrt_table_dump_free(struct mrt_table_dump_state *s) { - if (s->table_open) - FIB_ITERATE_UNLINK(&s->fit, &s->table->fib); - if (s->table) - rt_unlock_table(s->table); + RT_LOCKED(s->table, tab) + { + if (s->table_open) + FIB_ITERATE_UNLINK(&s->fit, &tab->fib); + + rt_unlock_table(tab); + } if (s->table_ptr) - rt_unlock_table(s->table_ptr); + RT_LOCKED(s->table_ptr, tab) + rt_unlock_table(tab); config_del_obstacle(s->config); @@ -606,16 +612,19 @@ mrt_table_dump_step(struct mrt_table_dump_state *s) mrt_peer_table_dump(s); - FIB_ITERATE_INIT(&s->fit, &s->table->fib); + RT_LOCKED(s->table, tab) + { + + FIB_ITERATE_INIT(&s->fit, &tab->fib); s->table_open = 1; step: - FIB_ITERATE_START(&s->table->fib, &s->fit, net, n) + FIB_ITERATE_START(&tab->fib, &s->fit, net, n) { if (s->max < 0) { FIB_ITERATE_PUT(&s->fit); - return 0; + RT_RETURN(tab, 0); } /* With Always ADD_PATH option, we jump directly to second phase */ @@ -630,6 +639,8 @@ mrt_table_dump_step(struct mrt_table_dump_state *s) FIB_ITERATE_END; s->table_open = 0; + } + mrt_close_file(s); mrt_peer_table_flush(s); } @@ -661,7 +672,8 @@ mrt_timer(timer *t) s->always_add_path = cf->always_add_path; if (s->table_ptr) - rt_lock_table(s->table_ptr); + RT_LOCKED(s->table_ptr, tab) + rt_lock_table(tab); p->table_dump = s; ev_schedule(p->event); @@ -737,7 +749,8 @@ mrt_dump_cmd(struct mrt_dump_data *d) s->filename = d->filename; if (s->table_ptr) - rt_lock_table(s->table_ptr); + RT_LOCKED(s->table_ptr, tab) + rt_lock_table(tab); this_cli->cont = mrt_dump_cont; this_cli->cleanup = mrt_dump_cleanup; diff --git a/proto/mrt/mrt.h b/proto/mrt/mrt.h index 3b83aa39..f535a391 100644 --- a/proto/mrt/mrt.h +++ b/proto/mrt/mrt.h @@ -40,7 +40,7 @@ struct mrt_proto { struct mrt_dump_data { const char *table_expr; - struct rtable *table_ptr; + rtable *table_ptr; const struct filter *filter; const char *filename; }; @@ -60,7 +60,7 @@ struct mrt_table_dump_state { /* Configuration information */ const char *table_expr; /* Wildcard for table name (or NULL) */ - struct rtable *table_ptr; /* Explicit table (or NULL) */ + rtable *table_ptr; /* Explicit table (or NULL) */ const struct filter *filter; /* Optional filter */ const char *filename; /* Filename pattern */ int always_add_path; /* Always use *_ADDPATH message subtypes */ @@ -73,7 +73,7 @@ struct mrt_table_dump_state { HASH(struct mrt_peer_entry) peer_hash; /* Hash for peers to find the index */ - struct rtable *table; /* Processed table, NULL initially */ + rtable *table; /* Processed table, NULL initially */ struct fib_iterator fit; /* Iterator in processed table */ int table_open; /* Whether iterator is linked */ diff --git a/proto/ospf/Makefile b/proto/ospf/Makefile index 85664543..015f394a 100644 --- a/proto/ospf/Makefile +++ b/proto/ospf/Makefile @@ -2,6 +2,5 @@ src := dbdes.c hello.c iface.c lsack.c lsalib.c lsreq.c lsupd.c neighbor.c ospf. obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,ospf_build) tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/ospf/ospf.c b/proto/ospf/ospf.c index 6a4ee8ab..4e29f960 100644 --- a/proto/ospf/ospf.c +++ b/proto/ospf/ospf.c @@ -377,8 +377,8 @@ ospf_init(struct proto_config *CF) P->reload_routes = ospf_reload_routes; P->feed_begin = ospf_feed_begin; P->feed_end = ospf_feed_end; - P->rte_better = ospf_rte_better; - P->rte_igp_metric = ospf_rte_igp_metric; + + P->sources.class = &ospf_rte_owner_class; return P; } @@ -486,13 +486,13 @@ ospf_disp(timer * timer) * import to the filters. */ static int -ospf_preexport(struct channel *c, rte *e) +ospf_preexport(struct channel *C, rte *e) { - struct ospf_proto *p = (struct ospf_proto *) c->proto; + struct ospf_proto *p = (struct ospf_proto *) C->proto; struct ospf_area *oa = ospf_main_area(p); /* Reject our own routes */ - if (e->src->proto == c->proto) + if (e->sender == C->in_req.hook) return -1; /* Do not export routes to stub areas */ @@ -1506,6 +1506,12 @@ ospf_sh_lsadb(struct lsadb_show_data *ld) } +struct rte_owner_class ospf_rte_owner_class = { + .get_route_info = ospf_get_route_info, + .rte_better = ospf_rte_better, + .rte_igp_metric = ospf_rte_igp_metric, +}; + struct protocol proto_ospf = { .name = "OSPF", .template = "ospf%d", @@ -1519,7 +1525,6 @@ struct protocol proto_ospf = { .shutdown = ospf_shutdown, .reconfigure = ospf_reconfigure, .get_status = ospf_get_status, - .get_route_info = ospf_get_route_info }; struct ea_class ea_ospf_metric1 = { diff --git a/proto/ospf/ospf.h b/proto/ospf/ospf.h index 7bed5c85..3477ba5a 100644 --- a/proto/ospf/ospf.h +++ b/proto/ospf/ospf.h @@ -1002,6 +1002,8 @@ void ospf_sh_state(struct proto *P, int verbose, int reachable); void ospf_sh_lsadb(struct lsadb_show_data *ld); +extern struct rte_owner_class ospf_rte_owner_class; + /* iface.c */ void ospf_iface_chstate(struct ospf_iface *ifa, u8 state); void ospf_iface_sm(struct ospf_iface *ifa, int event); diff --git a/proto/perf/Makefile b/proto/perf/Makefile index 42051f43..7877fb19 100644 --- a/proto/perf/Makefile +++ b/proto/perf/Makefile @@ -2,6 +2,5 @@ src := perf.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,perf_build) tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/perf/perf.c b/proto/perf/perf.c index d82ac8aa..9adafe5a 100644 --- a/proto/perf/perf.c +++ b/proto/perf/perf.c @@ -202,7 +202,9 @@ perf_loop(void *data) p->exp++; } - rt_schedule_prune(P->main_channel->table); + RT_LOCKED(P->main_channel->table, tab) + rt_schedule_prune(tab); + ev_schedule(p->loop); } diff --git a/proto/pipe/Makefile b/proto/pipe/Makefile index ba66027f..0d68db4c 100644 --- a/proto/pipe/Makefile +++ b/proto/pipe/Makefile @@ -2,6 +2,5 @@ src := pipe.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,pipe_build) tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/pipe/config.Y b/proto/pipe/config.Y index 0990168e..444de127 100644 --- a/proto/pipe/config.Y +++ b/proto/pipe/config.Y @@ -25,6 +25,7 @@ proto: pipe_proto '}' { this_channel = NULL; } ; pipe_proto_start: proto_start PIPE { this_proto = proto_config_new(&proto_pipe, $1); + this_proto->loop_order = DOMAIN_ORDER(proto); PIPE_CFG->max_generation = 16; } proto_name diff --git a/proto/pipe/pipe.c b/proto/pipe/pipe.c index 351db36b..b3b50a0d 100644 --- a/proto/pipe/pipe.c +++ b/proto/pipe/pipe.c @@ -73,12 +73,12 @@ pipe_rt_notify(struct proto *P, struct channel *src_ch, const net_addr *n, rte * } static int -pipe_preexport(struct channel *c, rte *e) +pipe_preexport(struct channel *C, rte *e) { - struct pipe_proto *p = (void *) c->proto; + struct pipe_proto *p = (void *) C->proto; /* Avoid direct loopbacks */ - if (e->sender == c->in_req.hook) + if (e->sender == C->in_req.hook) return -1; /* Indirection check */ @@ -86,8 +86,8 @@ pipe_preexport(struct channel *c, rte *e) if (e->generation >= max_generation) { log_rl(&p->rl_gen, L_ERR "Route overpiped (%u hops of %u configured in %s) in table %s: %N %s/%u:%u", - e->generation, max_generation, c->proto->name, - c->table->name, e->net, e->src->proto->name, e->src->private_id, e->src->global_id); + e->generation, max_generation, C->proto->name, + C->table->name, e->net, e->src->owner->name, e->src->private_id, e->src->global_id); return -1; } diff --git a/proto/radv/Makefile b/proto/radv/Makefile index 4780bee3..5c56fbf3 100644 --- a/proto/radv/Makefile +++ b/proto/radv/Makefile @@ -2,6 +2,5 @@ src := packets.c radv.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,radv_build) tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/radv/radv.c b/proto/radv/radv.c index 3a159462..10d5e3ed 100644 --- a/proto/radv/radv.c +++ b/proto/radv/radv.c @@ -394,10 +394,10 @@ radv_net_match_trigger(struct radv_config *cf, const net_addr *n) } int -radv_preexport(struct channel *c, rte *new) +radv_preexport(struct channel *C, rte *new) { // struct radv_proto *p = (struct radv_proto *) P; - struct radv_config *cf = (struct radv_config *) (c->proto->cf); + struct radv_config *cf = (struct radv_config *) (C->proto->cf); if (radv_net_match_trigger(cf, new->net)) return RIC_PROCESS; diff --git a/proto/rip/Makefile b/proto/rip/Makefile index b9ff62d6..f4a6fa72 100644 --- a/proto/rip/Makefile +++ b/proto/rip/Makefile @@ -2,6 +2,5 @@ src := packets.c rip.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,rip_build) tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/rip/rip.c b/proto/rip/rip.c index f5c01380..183fc265 100644 --- a/proto/rip/rip.c +++ b/proto/rip/rip.c @@ -377,7 +377,7 @@ rip_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net, s en->valid = RIP_ENTRY_VALID; en->metric = rt_metric; en->tag = rt_tag; - en->from = (new->src->proto == P) ? rt_from : NULL; + en->from = (new->src->owner == &P->sources) ? rt_from : NULL; eattr *nhea = ea_find(new->attrs, &ea_gen_nexthop); if (nhea) @@ -1112,16 +1112,13 @@ rip_reload_routes(struct channel *C) rip_kick_timer(p); } -static int -rip_rte_better(struct rte *new, struct rte *old) -{ - ASSERT_DIE(new->src == old->src); - struct rip_proto *p = (struct rip_proto *) new->src->proto; - - u32 new_metric = ea_get_int(new->attrs, &ea_rip_metric, p->infinity); - u32 old_metric = ea_get_int(old->attrs, &ea_rip_metric, p->infinity); +static struct rte_owner_class rip_rte_owner_class; - return new_metric < old_metric; +static inline struct rip_proto * +rip_rte_proto(struct rte *rte) +{ + return (rte->src->owner->class == &rip_rte_owner_class) ? + SKIP_BACK(struct rip_proto, p.sources, rte->src->owner) : NULL; } static u32 @@ -1130,6 +1127,12 @@ rip_rte_igp_metric(const rte *rt) return ea_get_int(rt->attrs, &ea_rip_metric, IGP_METRIC_UNKNOWN); } +static int +rip_rte_better(struct rte *new, struct rte *old) +{ + return rip_rte_igp_metric(new) < rip_rte_igp_metric(old); +} + static void rip_postconfig(struct proto_config *CF) { @@ -1151,8 +1154,7 @@ rip_init(struct proto_config *CF) P->rt_notify = rip_rt_notify; P->neigh_notify = rip_neigh_notify; P->reload_routes = rip_reload_routes; - P->rte_better = rip_rte_better; - P->rte_igp_metric = rip_rte_igp_metric; + P->sources.class = &rip_rte_owner_class; return P; } @@ -1227,7 +1229,7 @@ rip_reconfigure(struct proto *P, struct proto_config *CF) static void rip_get_route_info(rte *rte, byte *buf) { - struct rip_proto *p = (struct rip_proto *) rte->src->proto; + struct rip_proto *p = rip_rte_proto(rte); u32 rt_metric = ea_get_int(rte->attrs, &ea_rip_metric, p->infinity); u32 rt_tag = ea_get_int(rte->attrs, &ea_rip_tag, 0); @@ -1359,6 +1361,12 @@ rip_dump(struct proto *P) } +static struct rte_owner_class rip_rte_owner_class = { + .get_route_info = rip_get_route_info, + .rte_better = rip_rte_better, + .rte_igp_metric = rip_rte_igp_metric, +}; + struct protocol proto_rip = { .name = "RIP", .template = "rip%d", @@ -1372,7 +1380,6 @@ struct protocol proto_rip = { .start = rip_start, .shutdown = rip_shutdown, .reconfigure = rip_reconfigure, - .get_route_info = rip_get_route_info, }; void diff --git a/proto/rpki/Makefile b/proto/rpki/Makefile index 8e3a2761..0f60b2a0 100644 --- a/proto/rpki/Makefile +++ b/proto/rpki/Makefile @@ -2,6 +2,5 @@ src := rpki.c packets.c tcp_transport.c ssh_transport.c transport.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,rpki_build) tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/rpki/config.Y b/proto/rpki/config.Y index d6d326b8..743b5b42 100644 --- a/proto/rpki/config.Y +++ b/proto/rpki/config.Y @@ -42,6 +42,7 @@ proto: rpki_proto ; rpki_proto_start: proto_start RPKI { this_proto = proto_config_new(&proto_rpki, $1); + this_proto->loop_order = DOMAIN_ORDER(proto); RPKI_CFG->retry_interval = RPKI_RETRY_INTERVAL; RPKI_CFG->refresh_interval = RPKI_REFRESH_INTERVAL; RPKI_CFG->expire_interval = RPKI_EXPIRE_INTERVAL; diff --git a/proto/rpki/packets.c b/proto/rpki/packets.c index 4a52b54b..d7895a22 100644 --- a/proto/rpki/packets.c +++ b/proto/rpki/packets.c @@ -233,7 +233,12 @@ static const size_t min_pdu_size[] = { [ERROR] = 16, }; -static int rpki_send_error_pdu(struct rpki_cache *cache, const enum pdu_error_type error_code, const u32 err_pdu_len, const struct pdu_header *erroneous_pdu, const char *fmt, ...); +static int rpki_send_error_pdu_(struct rpki_cache *cache, const enum pdu_error_type error_code, const u32 err_pdu_len, const struct pdu_header *erroneous_pdu, const char *fmt, ...); + +#define rpki_send_error_pdu(cache, error_code, err_pdu_len, erroneous_pdu, fmt...) ({ \ + rpki_send_error_pdu_(cache, error_code, err_pdu_len, erroneous_pdu, #fmt); \ + CACHE_TRACE(D_PACKETS, cache, #fmt); \ + }) static void rpki_pdu_to_network_byte_order(struct pdu_header *pdu) @@ -595,6 +600,7 @@ rpki_handle_error_pdu(struct rpki_cache *cache, const struct pdu_error *pdu) case INTERNAL_ERROR: case INVALID_REQUEST: case UNSUPPORTED_PDU_TYPE: + CACHE_TRACE(D_PACKETS, cache, "Got UNSUPPORTED_PDU_TYPE"); rpki_cache_change_state(cache, RPKI_CS_ERROR_FATAL); break; @@ -652,21 +658,7 @@ rpki_handle_cache_response_pdu(struct rpki_cache *cache, const struct pdu_cache_ { if (cache->request_session_id) { - if (cache->last_update) - { - /* - * This isn't the first sync and we already received records. This point - * is after Reset Query and before importing new records from cache - * server. We need to load new ones and kick out missing ones. So start - * a refresh cycle. - */ - if (cache->p->roa4_channel) - rt_refresh_begin(cache->p->roa4_channel->table, &cache->p->roa4_channel->in_req); - if (cache->p->roa6_channel) - rt_refresh_begin(cache->p->roa6_channel->table, &cache->p->roa6_channel->in_req); - - cache->p->refresh_channels = 1; - } + rpki_start_refresh(cache->p); cache->session_id = pdu->session_id; cache->request_session_id = 0; } @@ -842,14 +834,7 @@ rpki_handle_end_of_data_pdu(struct rpki_cache *cache, const struct pdu_end_of_da (cf->keep_expire_interval ? "keeps " : ""), cache->expire_interval); } - if (cache->p->refresh_channels) - { - cache->p->refresh_channels = 0; - if (cache->p->roa4_channel) - rt_refresh_end(cache->p->roa4_channel->table, &cache->p->roa4_channel->in_req); - if (cache->p->roa6_channel) - rt_refresh_end(cache->p->roa6_channel->table, &cache->p->roa6_channel->in_req); - } + rpki_stop_refresh(cache->p); cache->last_update = current_time(); cache->serial_num = pdu->serial_num; @@ -1040,7 +1025,7 @@ rpki_connected_hook(sock *sk) * This function prepares Error PDU and sends it to a cache server. */ static int -rpki_send_error_pdu(struct rpki_cache *cache, const enum pdu_error_type error_code, const u32 err_pdu_len, const struct pdu_header *erroneous_pdu, const char *fmt, ...) +rpki_send_error_pdu_(struct rpki_cache *cache, const enum pdu_error_type error_code, const u32 err_pdu_len, const struct pdu_header *erroneous_pdu, const char *fmt, ...) { va_list args; char msg[128]; diff --git a/proto/rpki/rpki.c b/proto/rpki/rpki.c index 56615e36..7ec8d72f 100644 --- a/proto/rpki/rpki.c +++ b/proto/rpki/rpki.c @@ -109,6 +109,7 @@ static void rpki_schedule_next_expire_check(struct rpki_cache *cache); static void rpki_stop_refresh_timer_event(struct rpki_cache *cache); static void rpki_stop_retry_timer_event(struct rpki_cache *cache); static void rpki_stop_expire_timer_event(struct rpki_cache *cache); +static void rpki_stop_all_timers(struct rpki_cache *cache); /* @@ -136,6 +137,30 @@ rpki_table_remove_roa(struct rpki_cache *cache, struct channel *channel, const n rte_update(channel, &pfxr->n, NULL, p->p.main_source); } +void +rpki_start_refresh(struct rpki_proto *p) +{ + if (p->roa4_channel) + rt_refresh_begin(&p->roa4_channel->in_req); + if (p->roa6_channel) + rt_refresh_begin(&p->roa6_channel->in_req); + + p->refresh_channels = 1; +} + +void +rpki_stop_refresh(struct rpki_proto *p) +{ + if (!p->refresh_channels) + return; + + p->refresh_channels = 0; + + if (p->roa4_channel) + rt_refresh_end(&p->roa4_channel->in_req); + if (p->roa6_channel) + rt_refresh_end(&p->roa6_channel->in_req); +} /* * RPKI Protocol Logic @@ -192,6 +217,8 @@ rpki_force_restart_proto(struct rpki_proto *p) { if (p->cache) { + rpki_tr_close(p->cache->tr_sock); + rpki_stop_all_timers(p->cache); CACHE_DBG(p->cache, "Connection object destroying"); } @@ -315,7 +342,7 @@ rpki_schedule_next_refresh(struct rpki_cache *cache) btime t = cache->refresh_interval S; CACHE_DBG(cache, "after %t s", t); - tm_start(cache->refresh_timer, t); + tm_start_in(cache->refresh_timer, t, cache->p->p.loop); } static void @@ -324,7 +351,7 @@ rpki_schedule_next_retry(struct rpki_cache *cache) btime t = cache->retry_interval S; CACHE_DBG(cache, "after %t s", t); - tm_start(cache->retry_timer, t); + tm_start_in(cache->retry_timer, t, cache->p->p.loop); } static void @@ -335,7 +362,7 @@ rpki_schedule_next_expire_check(struct rpki_cache *cache) t = MAX(t, 1 S); CACHE_DBG(cache, "after %t s", t); - tm_start(cache->expire_timer, t); + tm_start_in(cache->expire_timer, t, cache->p->p.loop); } static void @@ -352,13 +379,21 @@ rpki_stop_retry_timer_event(struct rpki_cache *cache) tm_stop(cache->retry_timer); } -static void UNUSED +static void rpki_stop_expire_timer_event(struct rpki_cache *cache) { CACHE_DBG(cache, "Stop"); tm_stop(cache->expire_timer); } +static void +rpki_stop_all_timers(struct rpki_cache *cache) +{ + rpki_stop_refresh_timer_event(cache); + rpki_stop_retry_timer_event(cache); + rpki_stop_expire_timer_event(cache); +} + static int rpki_do_we_recv_prefix_pdu_in_last_seconds(struct rpki_cache *cache) { @@ -623,6 +658,7 @@ rpki_close_connection(struct rpki_cache *cache) { CACHE_TRACE(D_EVENTS, cache, "Closing a connection"); rpki_tr_close(cache->tr_sock); + rpki_stop_refresh(cache->p); proto_notify_state(&cache->p->p, PS_START); } diff --git a/proto/rpki/rpki.h b/proto/rpki/rpki.h index 26fbb46e..20253844 100644 --- a/proto/rpki/rpki.h +++ b/proto/rpki/rpki.h @@ -83,6 +83,8 @@ const char *rpki_cache_state_to_str(enum rpki_cache_state state); void rpki_table_add_roa(struct rpki_cache *cache, struct channel *channel, const net_addr_union *pfxr); void rpki_table_remove_roa(struct rpki_cache *cache, struct channel *channel, const net_addr_union *pfxr); +void rpki_start_refresh(struct rpki_proto *p); +void rpki_stop_refresh(struct rpki_proto *p); /* * RPKI Protocol Logic diff --git a/proto/rpki/ssh_transport.c b/proto/rpki/ssh_transport.c index 6333f367..223afa80 100644 --- a/proto/rpki/ssh_transport.c +++ b/proto/rpki/ssh_transport.c @@ -38,6 +38,8 @@ rpki_tr_ssh_open(struct rpki_tr_sock *tr) if (sk_open(sk) != 0) return RPKI_TR_ERROR; + sk_start(sk); + return RPKI_TR_SUCCESS; } diff --git a/proto/rpki/tcp_transport.c b/proto/rpki/tcp_transport.c index 132f8e2d..4e850c44 100644 --- a/proto/rpki/tcp_transport.c +++ b/proto/rpki/tcp_transport.c @@ -31,6 +31,8 @@ rpki_tr_tcp_open(struct rpki_tr_sock *tr) if (sk_open(sk) != 0) return RPKI_TR_ERROR; + sk_start(sk); + return RPKI_TR_SUCCESS; } diff --git a/proto/rpki/transport.c b/proto/rpki/transport.c index 81bd6dd8..4026fca4 100644 --- a/proto/rpki/transport.c +++ b/proto/rpki/transport.c @@ -85,6 +85,7 @@ rpki_tr_open(struct rpki_tr_sock *tr) sk->rbsize = RPKI_RX_BUFFER_SIZE; sk->tbsize = RPKI_TX_BUFFER_SIZE; sk->tos = IP_PREC_INTERNET_CONTROL; + sk->flags |= SKF_THREAD; sk->vrf = cache->p->p.vrf; if (ipa_zero(sk->daddr) && sk->host) @@ -120,6 +121,7 @@ rpki_tr_close(struct rpki_tr_sock *tr) if (tr->sk) { + sk_stop(tr->sk); rfree(tr->sk); tr->sk = NULL; } diff --git a/proto/static/Makefile b/proto/static/Makefile index 26aed31f..de6e819b 100644 --- a/proto/static/Makefile +++ b/proto/static/Makefile @@ -2,6 +2,5 @@ src := static.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,static_build) tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/static/config.Y b/proto/static/config.Y index 41e10dbf..9d26ee82 100644 --- a/proto/static/config.Y +++ b/proto/static/config.Y @@ -40,7 +40,7 @@ static_route_finish(void) if (net_type_match(this_srt->net, NB_DEST) == !this_srt->dest) cf_error("Unexpected or missing nexthop/type"); - this_srt->cmds = f_linearize(this_srt_cmds); + this_srt->cmds = f_linearize(this_srt_cmds, 0); } CF_DECLS diff --git a/proto/static/static.c b/proto/static/static.c index f0a514f7..42fd20b7 100644 --- a/proto/static/static.c +++ b/proto/static/static.c @@ -50,11 +50,14 @@ static inline struct rte_src * static_get_source(struct static_proto *p, uint i) { return i ? rt_get_source(&p->p, i) : p->p.main_source; } +static inline void static_free_source(struct rte_src *src, uint i) +{ if (i) rt_unlock_source(src); } + static void static_announce_rte(struct static_proto *p, struct static_route *r) { + struct rte_src *src; ea_list *ea = NULL; - struct rte_src *src = static_get_source(p, r->index); ea_set_attr_u32(&ea, &ea_gen_preference, 0, p->p.main_channel->preference); ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_STATIC); @@ -114,6 +117,7 @@ static_announce_rte(struct static_proto *p, struct static_route *r) return; /* We skip rta_lookup() here */ + src = static_get_source(p, r->index); rte e0 = { .attrs = ea, .src = src, .net = r->net, }, *e = &e0; /* Evaluate the filter */ @@ -121,6 +125,8 @@ static_announce_rte(struct static_proto *p, struct static_route *r) f_eval_rte(r->cmds, e); rte_update(p->p.main_channel, r->net, e, src); + static_free_source(src, r->index); + r->state = SRS_CLEAN; return; @@ -128,7 +134,9 @@ withdraw: if (r->state == SRS_DOWN) return; + src = static_get_source(p, r->index); rte_update(p->p.main_channel, r->net, NULL, src); + static_free_source(src, r->index); r->state = SRS_DOWN; } @@ -294,7 +302,11 @@ static void static_remove_rte(struct static_proto *p, struct static_route *r) { if (r->state) - rte_update(p->p.main_channel, r->net, NULL, static_get_source(p, r->index)); + { + struct rte_src *src = static_get_source(p, r->index); + rte_update(p->p.main_channel, r->net, NULL, src); + static_free_source(src, r->index); + } static_reset_rte(p, r); } @@ -424,11 +436,11 @@ static_postconfig(struct proto_config *CF) if (!cf->igp_table_ip4) cf->igp_table_ip4 = (cc->table->addr_type == NET_IP4) ? - cc->table : cf->c.global->def_tables[NET_IP4]; + cc->table : rt_get_default_table(cf->c.global, NET_IP4); if (!cf->igp_table_ip6) cf->igp_table_ip6 = (cc->table->addr_type == NET_IP6) ? - cc->table : cf->c.global->def_tables[NET_IP6]; + cc->table : rt_get_default_table(cf->c.global, NET_IP6); WALK_LIST(r, cf->routes) if (r->net && (r->net->type != CF->net_type)) @@ -437,6 +449,8 @@ static_postconfig(struct proto_config *CF) static_index_routes(cf); } +static struct rte_owner_class static_rte_owner_class; + static struct proto * static_init(struct proto_config *CF) { @@ -448,8 +462,7 @@ static_init(struct proto_config *CF) P->neigh_notify = static_neigh_notify; P->reload_routes = static_reload_routes; - P->rte_better = static_rte_better; - P->rte_mergable = static_rte_mergable; + P->sources.class = &static_rte_owner_class; if (cf->igp_table_ip4) p->igp_table_ip4 = cf->igp_table_ip4->table; @@ -481,7 +494,12 @@ static_start(struct proto *P) proto_notify_state(P, PS_UP); WALK_LIST(r, cf->routes) + { + struct lp_state lps; + lp_save(tmp_linpool, &lps); static_add_rte(p, r); + lp_restore(tmp_linpool, &lps); + } return PS_UP; } @@ -748,6 +766,11 @@ static_show(struct proto *P) static_show_rt(r); } +static struct rte_owner_class static_rte_owner_class = { + .get_route_info = static_get_route_info, + .rte_better = static_rte_better, + .rte_mergable = static_rte_mergable, +}; struct protocol proto_static = { .name = "Static", @@ -763,7 +786,6 @@ struct protocol proto_static = { .shutdown = static_shutdown, .reconfigure = static_reconfigure, .copy_config = static_copy_config, - .get_route_info = static_get_route_info, }; void diff --git a/sysdep/cf/README b/sysdep/cf/README index 9a7a4afa..68078bbe 100644 --- a/sysdep/cf/README +++ b/sysdep/cf/README @@ -4,7 +4,6 @@ Available configuration variables: CONFIG_AUTO_ROUTES Device routes are added automagically by the kernel CONFIG_SELF_CONSCIOUS We're able to recognize whether route was installed by us CONFIG_MULTIPLE_TABLES The kernel supports multiple routing tables -CONFIG_ALL_TABLES_AT_ONCE Kernel scanner wants to process all tables at once CONFIG_SINGLE_ROUTE There is only one route per network CONFIG_MC_PROPER_SRC Multicast packets have source address according to socket saddr field diff --git a/sysdep/cf/linux.h b/sysdep/cf/linux.h index 047d3764..c640bef4 100644 --- a/sysdep/cf/linux.h +++ b/sysdep/cf/linux.h @@ -9,7 +9,6 @@ #define CONFIG_AUTO_ROUTES #define CONFIG_SELF_CONSCIOUS #define CONFIG_MULTIPLE_TABLES -#define CONFIG_ALL_TABLES_AT_ONCE #define CONFIG_IP6_SADR_KERNEL #define CONFIG_MC_PROPER_SRC diff --git a/sysdep/config.h b/sysdep/config.h index b0531844..5cdadbb0 100644 --- a/sysdep/config.h +++ b/sysdep/config.h @@ -13,7 +13,7 @@ #ifdef GIT_LABEL #define BIRD_VERSION XSTR1(GIT_LABEL) #else -#define BIRD_VERSION "2.0.9" +#define BIRD_VERSION "2.0.10" #endif /* Include parameters determined by configure script */ diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 656202ac..94a37a73 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -75,51 +75,16 @@ #endif #define krt_ipv4(p) ((p)->af == AF_INET) -#define krt_ecmp6(p) ((p)->af == AF_INET6) const int rt_default_ecmp = 16; -/* - * Structure nl_parse_state keeps state of received route processing. Ideally, - * we could just independently parse received Netlink messages and immediately - * propagate received routes to the rest of BIRD, but older Linux kernel (before - * version 4.11) represents and announces IPv6 ECMP routes not as one route with - * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of - * routes with the same prefix. More recent kernels work as with IPv4. - * - * Therefore, BIRD keeps currently processed route in nl_parse_state structure - * and postpones its propagation until we expect it to be final; i.e., when - * non-matching route is received or when the scan ends. When another matching - * route is received, it is merged with the already processed route to form an - * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the - * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6 - * routes with RTA_MULTIPATH set are just considered non-matching. - * - * This is ignored for asynchronous notifications (every notification is handled - * as a separate route). It is not an issue for our routes, as we ignore such - * notifications anyways. But importing alien IPv6 ECMP routes does not work - * properly with older kernels. - * - * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes - * for the same prefix. - */ - struct nl_parse_state { + struct krt_proto *proto; struct linpool *pool; int scan; - int merge; - net_addr *net; - ea_list *attrs; - struct krt_proto *proto; - s8 new; - s8 krt_src; - u8 krt_type; - u8 krt_proto; - u32 krt_metric; - - u32 rta_flow; /* Used during parsing */ + u32 rta_flow; }; /* @@ -257,16 +222,13 @@ nl_open_sock(struct nl_sock *nl) } } -static void +static int nl_set_strict_dump(struct nl_sock *nl UNUSED, int strict UNUSED) { - /* - * Strict checking is not necessary, it improves behavior on newer kernels. - * If it is not available (missing SOL_NETLINK compile-time, or ENOPROTOOPT - * run-time), we can just ignore it. - */ #ifdef SOL_NETLINK - setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); + return setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +#else + return -1; #endif } @@ -294,10 +256,17 @@ nl_cfg_rx_buffer_size(struct config *cfg) static void nl_open(void) { + if ((nl_scan.fd >= 0) && (nl_req.fd >= 0)) + return; + nl_open_sock(&nl_scan); nl_open_sock(&nl_req); - nl_set_strict_dump(&nl_scan, 1); + if (nl_set_strict_dump(&nl_scan, 1) < 0) + { + log(L_WARN "KRT: Netlink strict checking failed, will scan all tables at once"); + krt_use_shared_scan(); + } } static void @@ -352,11 +321,13 @@ nl_request_dump_addr(int af) } static void -nl_request_dump_route(int af) +nl_request_dump_route(int af, int table_id) { struct { struct nlmsghdr nh; struct rtmsg rtm; + struct rtattr rta; + u32 table_id; } req = { .nh.nlmsg_type = RTM_GETROUTE, .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), @@ -365,7 +336,17 @@ nl_request_dump_route(int af) .rtm.rtm_family = af, }; - send(nl_scan.fd, &req, sizeof(req), 0); + if (table_id < 256) + req.rtm.rtm_table = table_id; + else + { + req.rta.rta_type = RTA_TABLE; + req.rta.rta_len = RTA_LENGTH(4); + req.table_id = table_id; + req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + req.rta.rta_len; + } + + send(nl_scan.fd, &req, req.nh.nlmsg_len, 0); nl_scan.last_hdr = NULL; } @@ -1437,10 +1418,13 @@ nh_bufsize(struct nexthop_adata *nhad) } static int -nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nexthop_adata *nh) +nl_send_route(struct krt_proto *p, const rte *e, int op) { eattr *ea; ea_list *eattrs = e->attrs; + eattr *nhea = ea_find(eattrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); int bufsize = 128 + KRT_METRICS_MAX*8 + (nh ? nh_bufsize(nh) : 0); u32 priority = 0; @@ -1518,15 +1502,17 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho /* For route delete, we do not specify remaining route attributes */ if (op == NL_OP_DELETE) - goto dest; + goto done; /* Default scope is LINK for device routes, UNIVERSE otherwise */ if (p->af == AF_MPLS) r->r.rtm_scope = RT_SCOPE_UNIVERSE; else if (ea = ea_find(eattrs, &ea_krt_scope)) r->r.rtm_scope = ea->u.data; + else if (dest == RTD_UNICAST && ipa_zero(nh->nh.gw)) + r->r.rtm_scope = RT_SCOPE_LINK; else - r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->nh.gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; + r->r.rtm_scope = RT_SCOPE_UNIVERSE; if (ea = ea_find(eattrs, &ea_krt_prefsrc)) nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data); @@ -1549,13 +1535,11 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho if (metrics[0]) nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX); - -dest: switch (dest) { case RTD_UNICAST: r->r.rtm_type = RTN_UNICAST; - if (!NEXTHOP_ONE(nh) && !krt_ecmp6(p)) + if (!NEXTHOP_ONE(nh)) nl_add_multipath(&r->h, rsize, nh, p->af, eattrs); else { @@ -1581,99 +1565,56 @@ dest: bug("krt_capable inconsistent with nl_send_route"); } +done: /* Ignore missing for DELETE */ return nl_exchange(&r->h, (op == NL_OP_DELETE)); } static inline int -nl_add_rte(struct krt_proto *p, rte *e) -{ - ea_list *ea = e->attrs; - int err = 0; - - eattr *nhea = ea_find(ea, &ea_gen_nexthop); - struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; - - if (krt_ecmp6(p) && nhad && NEXTHOP_IS_REACHABLE(nhad) && !NEXTHOP_ONE(nhad)) - { - uint cnt = 0; - NEXTHOP_WALK(nh, nhad) - { - struct { - struct nexthop_adata nhad; - u32 labels[MPLS_MAX_LABEL_STACK]; - } nhx; - memcpy(&nhx.nhad.nh, nh, NEXTHOP_SIZE(nh)); - nhx.nhad.ad.length = (void *) NEXTHOP_NEXT(&nhx.nhad.nh) - (void *) nhx.nhad.ad.data; - - if (!cnt++) - { - err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, &nhx.nhad); - if (err < 0) - return err; - } - else - err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, &nhx.nhad); - } - - return err; - } - - return nl_send_route(p, e, NL_OP_ADD, - NEXTHOP_IS_REACHABLE(nhad) ? RTD_UNICAST : nhad->dest, nhad); -} - -static inline int -nl_delete_rte(struct krt_proto *p, const rte *e) +nl_allow_replace(struct krt_proto *p, rte *new) { - int err = 0; + /* + * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for + * matching rtm_protocol, but that is OK when dedicated priority is used. + * + * For IPv6, the NL_OP_REPLACE is still broken even in Linux 4.19 LTS + * (although it seems to be fixed in Linux 5.10 LTS) for sequence: + * + * ip route add 2001:db8::/32 via fe80::1 dev eth0 + * ip route replace 2001:db8::/32 dev eth0 + * + * (it ends with two routes instead of replacing the first by the second one) + * + * Replacing with direct and special type (e.g. unreachable) routes does not + * work, but replacing with regular routes work reliably + */ - /* For IPv6, we just repeatedly request DELETE until we get error */ - do - err = nl_send_route(p, e, NL_OP_DELETE, RTD_NONE, NULL); - while (krt_ecmp6(p) && !err); + if (krt_ipv4(p)) + return 1; - return err; -} + eattr *nhea = ea_find(new->attrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); -static inline int -nl_replace_rte(struct krt_proto *p, rte *e) -{ - eattr *nhea = ea_find(e->attrs, &ea_gen_nexthop); - struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; - return nl_send_route(p, e, NL_OP_REPLACE, - NEXTHOP_IS_REACHABLE(nhad) ? RTD_UNICAST : nhad->dest, nhad); + return (dest == RTD_UNICAST) && ipa_nonzero(nh->nh.gw); } - void krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const rte *old) { int err = 0; - /* - * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for - * matching rtm_protocol, but that is OK when dedicated priority is used. - * - * We do not use NL_OP_REPLACE for IPv6, as it has broken semantics for ECMP - * and with some kernel versions ECMP replace crashes kernel. Would need more - * testing and checks for kernel versions. - * - * For IPv6, we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the - * old route value, so we do not try to optimize IPv6 ECMP reconfigurations. - */ - - if (krt_ipv4(p) && old && new) + if (old && new && nl_allow_replace(p, new)) { - err = nl_replace_rte(p, new); + err = nl_send_route(p, new, NL_OP_REPLACE); } else { if (old) - nl_delete_rte(p, old); + nl_send_route(p, old, NL_OP_DELETE); if (new) - err = nl_add_rte(p, new); + err = nl_send_route(p, new, NL_OP_ADD); } if (new) @@ -1685,68 +1626,6 @@ krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const r } } -static int -nl_mergable_route(struct nl_parse_state *s, const net_addr *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family) -{ - /* Route merging is used for IPv6 scans */ - if (!s->scan || (rtm_family != AF_INET6)) - return 0; - - /* Saved and new route must have same network, proto/table, and priority */ - if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority)) - return 0; - - /* Both must be regular unicast routes */ - if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST)) - return 0; - - return 1; -} - -static void -nl_announce_route(struct nl_parse_state *s) -{ - rte e0 = { - .attrs = s->attrs, - .net = s->net, - }; - - EA_LOCAL_LIST(2) ea = { - .l = { .count = 2, .next = e0.attrs }, - .a = { - EA_LITERAL_EMBEDDED(&ea_krt_source, 0, s->krt_proto), - EA_LITERAL_EMBEDDED(&ea_krt_metric, 0, s->krt_metric), - }, - }; - - e0.attrs = &ea.l; - - if (s->scan) - krt_got_route(s->proto, &e0, s->krt_src); - else - krt_got_route_async(s->proto, &e0, s->new, s->krt_src); - - s->net = NULL; - s->attrs = NULL; - s->proto = NULL; - lp_flush(s->pool); -} - -static inline void -nl_parse_begin(struct nl_parse_state *s, int scan) -{ - memset(s, 0, sizeof (struct nl_parse_state)); - s->pool = nl_linpool; - s->scan = scan; -} - -static inline void -nl_parse_end(struct nl_parse_state *s) -{ - if (s->net) - nl_announce_route(s); -} - #define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0) #define SKIP(ARG, ...) do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0) @@ -1882,11 +1761,10 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) net6_prefix(&src), net6_pxlen(&src)); } - if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type, i->rtm_family)) - nl_announce_route(s); - ea_list *ra = NULL; ea_set_attr_u32(&ra, &ea_gen_source, 0, RTS_INHERIT); + ea_set_attr_u32(&ra, &ea_krt_source, 0, i->rtm_protocol); + ea_set_attr_u32(&ra, &ea_krt_metric, 0, priority); if (a[RTA_FLOW]) s->rta_flow = rta_get_u32(a[RTA_FLOW]); @@ -2028,63 +1906,42 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) EA_LITERAL_EMBEDDED(&ea_krt_metrics[t], 0, metrics[t])); } - /* - * Ideally, now we would send the received route to the rest of kernel code. - * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we - * postpone it and merge next hops until the end of the sequence. Note that - * when doing merging of next hops, we expect the new route to be unipath. - * Otherwise, we ignore additional next hops in nexthop_insert(). - */ + rte e0 = { + .net = net, + .attrs = ra, + }; - if (!s->net) - { - /* Store the new route */ - s->net = lp_alloc(s->pool, net->length); - net_copy(s->net, net); - - ea_set_attr_data(&ra, &ea_gen_nexthop, 0, - nhad.ad.data, nhad.ad.length); - - s->attrs = ra; - s->proto = p; - s->new = new; - s->krt_src = krt_src; - s->krt_type = i->rtm_type; - s->krt_proto = i->rtm_protocol; - s->krt_metric = priority; - } + if (s->scan) + krt_got_route(p, &e0, krt_src); else - { - /* Merge next hops with the stored route */ - eattr *nhea = ea_find(s->attrs, &ea_gen_nexthop); - struct nexthop_adata *nhad_old = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; - - if (nhad_old) - ea_set_attr(&s->attrs, - EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, - &(nexthop_merge(nhad_old, &nhad.nhad, - KRT_CF->merge_paths, s->pool)->ad) - )); - else - ea_set_attr_data(&s->attrs, &ea_gen_nexthop, 0, - nhad.ad.data, nhad.ad.length); - } + krt_got_route_async(p, &e0, new, krt_src); + + lp_flush(s->pool); } void -krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */ +krt_do_scan(struct krt_proto *p) { - struct nlmsghdr *h; - struct nl_parse_state s; + struct nl_parse_state s = { + .proto = p, + .pool = nl_linpool, + .scan = 1, + }; + + /* Table-specific scan or shared scan */ + if (p) + nl_request_dump_route(p->af, krt_table_id(p)); + else + nl_request_dump_route(AF_UNSPEC, 0); - nl_parse_begin(&s, 1); - nl_request_dump_route(AF_UNSPEC); + struct nlmsghdr *h; while (h = nl_get_scan()) + { if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) nl_parse_route(&s, h); else log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); - nl_parse_end(&s); + } } /* @@ -2099,16 +1956,18 @@ static struct config *nl_last_config; /* For tracking changes to nl_async_bufsiz static void nl_async_msg(struct nlmsghdr *h) { - struct nl_parse_state s; + struct nl_parse_state s = { + .proto = NULL, + .pool = nl_linpool, + .scan = 0, + }; switch (h->nlmsg_type) { case RTM_NEWROUTE: case RTM_DELROUTE: DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type); - nl_parse_begin(&s, 0); nl_parse_route(&s, h); - nl_parse_end(&s); break; case RTM_NEWLINK: case RTM_DELLINK: diff --git a/sysdep/unix/Makefile b/sysdep/unix/Makefile index 51ab98a9..6f6b0d26 100644 --- a/sysdep/unix/Makefile +++ b/sysdep/unix/Makefile @@ -1,9 +1,7 @@ -src := alloc.c io.c krt.c log.c main.c random.c +src := alloc.c io.c io-loop.c krt.c log.c main.c random.c domain.c obj := $(src-o-files) $(all-daemon) $(cf-local) -$(call proto-build,kif_build) -$(call proto-build,krt_build) $(conf-y-targets): $(s)krt.Y src := $(filter-out main.c, $(src)) diff --git a/sysdep/unix/alloc.c b/sysdep/unix/alloc.c index edad6209..847def30 100644 --- a/sysdep/unix/alloc.c +++ b/sysdep/unix/alloc.c @@ -10,6 +10,7 @@ #include "lib/resource.h" #include "lib/lists.h" #include "lib/event.h" +#include "lib/rcu.h" #include <errno.h> #include <stdlib.h> @@ -22,46 +23,43 @@ long page_size = 0; #ifdef HAVE_MMAP -#define KEEP_PAGES_MAIN_MAX 256 -#define KEEP_PAGES_MAIN_MIN 8 -#define CLEANUP_PAGES_BULK 256 +#define KEEP_PAGES_MAX 512 +#define KEEP_PAGES_MIN 32 +#define KEEP_PAGES_MAX_LOCAL 16 +#define ALLOC_PAGES_AT_ONCE 8 -STATIC_ASSERT(KEEP_PAGES_MAIN_MIN * 4 < KEEP_PAGES_MAIN_MAX); +STATIC_ASSERT(KEEP_PAGES_MIN * 4 < KEEP_PAGES_MAX); +STATIC_ASSERT(ALLOC_PAGES_AT_ONCE < KEEP_PAGES_MAX_LOCAL); static _Bool use_fake = 0; +static _Bool initialized = 0; #if DEBUGGING struct free_page { node unused[42]; - node n; + struct free_page * _Atomic next; }; #else struct free_page { - node n; + struct free_page * _Atomic next; }; #endif -struct free_pages { - list pages; - u16 min, max; /* Minimal and maximal number of free pages kept */ - uint cnt; /* Number of empty pages */ - event cleanup; -}; - -static void global_free_pages_cleanup_event(void *); +static struct free_page * _Atomic page_stack = NULL; +static _Thread_local struct free_page * local_page_stack = NULL; -static struct free_pages global_free_pages = { - .min = KEEP_PAGES_MAIN_MIN, - .max = KEEP_PAGES_MAIN_MAX, - .cleanup = { .hook = global_free_pages_cleanup_event }, -}; +static void page_cleanup(void *); +static event page_cleanup_event = { .hook = page_cleanup, }; +#define SCHEDULE_CLEANUP do if (initialized && !shutting_down) ev_send(&global_event_list, &page_cleanup_event); while (0) -uint *pages_kept = &global_free_pages.cnt; +_Atomic int pages_kept = 0; +_Atomic int pages_kept_locally = 0; +static int pages_kept_here = 0; static void * alloc_sys_page(void) { - void *ptr = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + void *ptr = mmap(NULL, page_size * ALLOC_PAGES_AT_ONCE, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr == MAP_FAILED) bug("mmap(%lu) failed: %m", page_size); @@ -90,20 +88,33 @@ alloc_page(void) } #ifdef HAVE_MMAP - struct free_pages *fps = &global_free_pages; - - if (fps->cnt) + struct free_page *fp = local_page_stack; + if (fp) { - struct free_page *fp = SKIP_BACK(struct free_page, n, HEAD(fps->pages)); - rem_node(&fp->n); - if ((--fps->cnt < fps->min) && !shutting_down) - ev_schedule(&fps->cleanup); - - bzero(fp, page_size); + local_page_stack = atomic_load_explicit(&fp->next, memory_order_acquire); + atomic_fetch_sub_explicit(&pages_kept_locally, 1, memory_order_relaxed); + pages_kept_here--; return fp; } - return alloc_sys_page(); + rcu_read_lock(); + fp = atomic_load_explicit(&page_stack, memory_order_acquire); + while (fp && !atomic_compare_exchange_strong_explicit( + &page_stack, &fp, atomic_load_explicit(&fp->next, memory_order_acquire), + memory_order_acq_rel, memory_order_acquire)) + ; + rcu_read_unlock(); + + if (!fp) + { + void *ptr = alloc_sys_page(); + for (int i=1; i<ALLOC_PAGES_AT_ONCE; i++) + free_page(ptr + page_size * i); + return ptr; + } + + atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed); + return fp; #endif } @@ -117,45 +128,95 @@ free_page(void *ptr) } #ifdef HAVE_MMAP - struct free_pages *fps = &global_free_pages; struct free_page *fp = ptr; + if (shutting_down || (pages_kept_here < KEEP_PAGES_MAX_LOCAL)) + { + atomic_store_explicit(&fp->next, local_page_stack, memory_order_relaxed); + atomic_fetch_add_explicit(&pages_kept_locally, 1, memory_order_relaxed); + pages_kept_here++; + return; + } + + rcu_read_lock(); + struct free_page *next = atomic_load_explicit(&page_stack, memory_order_acquire); - fp->n = (node) {}; - add_tail(&fps->pages, &fp->n); + do atomic_store_explicit(&fp->next, next, memory_order_release); + while (!atomic_compare_exchange_strong_explicit( + &page_stack, &next, fp, + memory_order_acq_rel, memory_order_acquire)); + rcu_read_unlock(); - if ((++fps->cnt > fps->max) && !shutting_down) - ev_schedule(&fps->cleanup); + if (atomic_fetch_add_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX) + SCHEDULE_CLEANUP; #endif } +void +flush_local_pages(void) +{ + if (use_fake || !local_page_stack || shutting_down) + return; + + struct free_page *last = local_page_stack, *next; + int check_count = 1; + while (next = atomic_load_explicit(&last->next, memory_order_acquire)) + { + check_count++; + last = next; + } + + ASSERT_DIE(check_count == pages_kept_here); + + rcu_read_lock(); + next = atomic_load_explicit(&page_stack, memory_order_acquire); + + do atomic_store_explicit(&last->next, next, memory_order_release); + while (!atomic_compare_exchange_strong_explicit( + &page_stack, &next, local_page_stack, + memory_order_acq_rel, memory_order_acquire)); + rcu_read_unlock(); + + local_page_stack = NULL; + pages_kept_here = 0; + + atomic_fetch_sub_explicit(&pages_kept_locally, check_count, memory_order_relaxed); + if (atomic_fetch_add_explicit(&pages_kept, check_count, memory_order_relaxed) >= KEEP_PAGES_MAX) + SCHEDULE_CLEANUP; +} + #ifdef HAVE_MMAP static void -global_free_pages_cleanup_event(void *data UNUSED) +page_cleanup(void *_ UNUSED) { if (shutting_down) return; - struct free_pages *fps = &global_free_pages; + struct free_page *stack = atomic_exchange_explicit(&page_stack, NULL, memory_order_acq_rel); + if (!stack) + return; - while (fps->cnt / 2 < fps->min) - { - struct free_page *fp = alloc_sys_page(); - fp->n = (node) {}; - add_tail(&fps->pages, &fp->n); - fps->cnt++; + synchronize_rcu(); + + do { + struct free_page *f = stack; + stack = atomic_load_explicit(&f->next, memory_order_acquire); + + if (munmap(f, page_size) == 0) + continue; + else if (errno != ENOMEM) + bug("munmap(%p) failed: %m", f); + else + free_page(f); } + while ((atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX / 2) && stack); - for (uint seen = 0; (seen < CLEANUP_PAGES_BULK) && (fps->cnt > fps->max / 2); seen++) + while (stack) { - struct free_page *fp = SKIP_BACK(struct free_page, n, TAIL(fps->pages)); - rem_node(&fp->n); + struct free_page *f = stack; + stack = atomic_load_explicit(&f->next, memory_order_acquire); + free_page(f); - if (munmap(fp, page_size) == 0) - fps->cnt--; - else if (errno == ENOMEM) - add_head(&fps->pages, &fp->n); - else - bug("munmap(%p) failed: %m", fp); + atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed); } } #endif @@ -164,17 +225,17 @@ void resource_sys_init(void) { #ifdef HAVE_MMAP - ASSERT_DIE(global_free_pages.cnt == 0); - if (!(page_size = sysconf(_SC_PAGESIZE))) die("System page size must be non-zero"); if (u64_popcount(page_size) == 1) { - struct free_pages *fps = &global_free_pages; - init_list(&fps->pages); - global_free_pages_cleanup_event(NULL); + for (int i = 0; i < (KEEP_PAGES_MIN * 2); i++) + free_page(alloc_page()); + + page_cleanup(NULL); + initialized = 1; return; } @@ -184,4 +245,5 @@ resource_sys_init(void) #endif page_size = 4096; + initialized = 1; } diff --git a/sysdep/unix/domain.c b/sysdep/unix/domain.c new file mode 100644 index 00000000..0a5858a6 --- /dev/null +++ b/sysdep/unix/domain.c @@ -0,0 +1,116 @@ +/* + * BIRD Locking + * + * (c) 2020 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#undef LOCAL_DEBUG + +#undef DEBUG_LOCKING + +#include "lib/birdlib.h" +#include "lib/locking.h" +#include "lib/resource.h" +#include "lib/timer.h" + +#include "conf/conf.h" + +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <pthread.h> +#include <semaphore.h> +#include <stdatomic.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +/* + * Locking subsystem + */ + +_Thread_local struct lock_order locking_stack = {}; +_Thread_local struct domain_generic **last_locked = NULL; + +#define ASSERT_NO_LOCK ASSERT_DIE(last_locked == NULL) + +struct domain_generic { + pthread_mutex_t mutex; + uint order; + struct domain_generic **prev; + struct lock_order *locked_by; + const char *name; +}; + +#define DOMAIN_INIT(_name, _order) { .mutex = PTHREAD_MUTEX_INITIALIZER, .name = _name, .order = _order } + +static struct domain_generic the_bird_domain_gen = DOMAIN_INIT("The BIRD", OFFSETOF(struct lock_order, the_bird)); + +DOMAIN(the_bird) the_bird_domain = { .the_bird = &the_bird_domain_gen }; + +struct domain_generic * +domain_new(const char *name, uint order) +{ + ASSERT_DIE(order < sizeof(struct lock_order)); + struct domain_generic *dg = xmalloc(sizeof(struct domain_generic)); + *dg = (struct domain_generic) DOMAIN_INIT(name, order); + return dg; +} + +void +domain_free(struct domain_generic *dg) +{ + pthread_mutex_destroy(&dg->mutex); + xfree(dg); +} + +uint dg_order(struct domain_generic *dg) +{ + return dg->order; +} + +void do_lock(struct domain_generic *dg, struct domain_generic **lsp) +{ + if ((char *) lsp - (char *) &locking_stack != dg->order) + bug("Trying to lock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack); + + if (lsp <= last_locked) + bug("Trying to lock in a bad order"); + if (*lsp) + bug("Inconsistent locking stack state on lock"); + + btime lock_begin = current_time(); + pthread_mutex_lock(&dg->mutex); + btime duration = current_time() - lock_begin; + if (config && (duration > config->watchdog_warning)) + log(L_WARN "Locking of %s took %d ms", dg->name, (int) (duration TO_MS)); + + if (dg->prev || dg->locked_by) + bug("Previous unlock not finished correctly"); + dg->prev = last_locked; + *lsp = dg; + last_locked = lsp; + dg->locked_by = &locking_stack; +} + +void do_unlock(struct domain_generic *dg, struct domain_generic **lsp) +{ + if ((char *) lsp - (char *) &locking_stack != dg->order) + bug("Trying to unlock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack); + + if (dg->locked_by != &locking_stack) + bug("Inconsistent domain state on unlock"); + if ((last_locked != lsp) || (*lsp != dg)) + bug("Inconsistent locking stack state on unlock"); + dg->locked_by = NULL; + last_locked = dg->prev; + *lsp = NULL; + dg->prev = NULL; + pthread_mutex_unlock(&dg->mutex); +} diff --git a/sysdep/unix/io-loop.c b/sysdep/unix/io-loop.c new file mode 100644 index 00000000..dbca36e9 --- /dev/null +++ b/sysdep/unix/io-loop.c @@ -0,0 +1,643 @@ +/* + * BIRD -- I/O and event loop + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <pthread.h> +#include <time.h> +#include <sys/time.h> + +#include "nest/bird.h" + +#include "lib/buffer.h" +#include "lib/lists.h" +#include "lib/resource.h" +#include "lib/event.h" +#include "lib/timer.h" +#include "lib/socket.h" + +#include "lib/io-loop.h" +#include "sysdep/unix/io-loop.h" +#include "conf/conf.h" + +#define THREAD_STACK_SIZE 65536 /* To be lowered in near future */ + +/* + * Current thread context + */ + +_Thread_local struct birdloop *birdloop_current; +static _Thread_local struct birdloop *birdloop_wakeup_masked; +static _Thread_local uint birdloop_wakeup_masked_count; + +event_list * +birdloop_event_list(struct birdloop *loop) +{ + return &loop->event_list; +} + +struct timeloop * +birdloop_time_loop(struct birdloop *loop) +{ + return &loop->time; +} + +_Bool +birdloop_inside(struct birdloop *loop) +{ + for (struct birdloop *c = birdloop_current; c; c = c->prev_loop) + if (loop == c) + return 1; + + return 0; +} + +void +birdloop_flag(struct birdloop *loop, u32 flag) +{ + atomic_fetch_or_explicit(&loop->flags, flag, memory_order_acq_rel); + birdloop_ping(loop); +} + +void +birdloop_flag_set_handler(struct birdloop *loop, struct birdloop_flag_handler *fh) +{ + ASSERT_DIE(birdloop_inside(loop)); + loop->flag_handler = fh; +} + +static int +birdloop_process_flags(struct birdloop *loop) +{ + if (!loop->flag_handler) + return 0; + + u32 flags = atomic_exchange_explicit(&loop->flags, 0, memory_order_acq_rel); + loop->flag_handler->hook(loop->flag_handler, flags); + return !!flags; +} + +static int +birdloop_run_events(struct birdloop *loop) +{ + btime begin = current_time(); + while (current_time() - begin < 5 MS) + { + if (!ev_run_list(&loop->event_list)) + return 0; + + times_update(); + } + + return 1; +} + +/* + * Wakeup code for birdloop + */ + +void +pipe_new(struct pipe *p) +{ + int rv = pipe(p->fd); + if (rv < 0) + die("pipe: %m"); + + if (fcntl(p->fd[0], F_SETFL, O_NONBLOCK) < 0) + die("fcntl(O_NONBLOCK): %m"); + + if (fcntl(p->fd[1], F_SETFL, O_NONBLOCK) < 0) + die("fcntl(O_NONBLOCK): %m"); +} + +void +pipe_drain(struct pipe *p) +{ + while (1) { + char buf[64]; + int rv = read(p->fd[0], buf, sizeof(buf)); + if ((rv < 0) && (errno == EAGAIN)) + return; + + if (rv == 0) + bug("wakeup read eof"); + if ((rv < 0) && (errno != EINTR)) + bug("wakeup read: %m"); + } +} + +int +pipe_read_one(struct pipe *p) +{ + while (1) { + char v; + int rv = read(p->fd[0], &v, sizeof(v)); + if (rv == 1) + return 1; + if ((rv < 0) && (errno == EAGAIN)) + return 0; + if (rv > 1) + bug("wakeup read more bytes than expected: %d", rv); + if (rv == 0) + bug("wakeup read eof"); + if (errno != EINTR) + bug("wakeup read: %m"); + } +} + +void +pipe_kick(struct pipe *p) +{ + char v = 1; + int rv; + + while (1) { + rv = write(p->fd[1], &v, sizeof(v)); + if ((rv >= 0) || (errno == EAGAIN)) + return; + if (errno != EINTR) + bug("wakeup write: %m"); + } +} + +void +pipe_pollin(struct pipe *p, struct pollfd *pfd) +{ + pfd->fd = p->fd[0]; + pfd->events = POLLIN; + pfd->revents = 0; +} + +static inline void +wakeup_init(struct birdloop *loop) +{ + pipe_new(&loop->wakeup); +} + +static inline void +wakeup_drain(struct birdloop *loop) +{ + pipe_drain(&loop->wakeup); +} + +static inline void +wakeup_do_kick(struct birdloop *loop) +{ + pipe_kick(&loop->wakeup); +} + +static inline void +birdloop_do_ping(struct birdloop *loop) +{ + if (atomic_fetch_add_explicit(&loop->ping_sent, 1, memory_order_acq_rel)) + return; + + if (loop == birdloop_wakeup_masked) + birdloop_wakeup_masked_count++; + else + wakeup_do_kick(loop); +} + +void +birdloop_ping(struct birdloop *loop) +{ + if (birdloop_inside(loop) && !loop->ping_pending) + loop->ping_pending++; + else + birdloop_do_ping(loop); +} + + +/* + * Sockets + */ + +static void +sockets_init(struct birdloop *loop) +{ + init_list(&loop->sock_list); + loop->sock_num = 0; + + BUFFER_INIT(loop->poll_sk, loop->pool, 4); + BUFFER_INIT(loop->poll_fd, loop->pool, 4); + loop->poll_changed = 1; /* add wakeup fd */ +} + +static void +sockets_add(struct birdloop *loop, sock *s) +{ + add_tail(&loop->sock_list, &s->n); + loop->sock_num++; + + s->index = -1; + loop->poll_changed = 1; + + birdloop_ping(loop); +} + +void +sk_start(sock *s) +{ + ASSERT_DIE(birdloop_current != &main_birdloop); + sockets_add(birdloop_current, s); +} + +static void +sockets_remove(struct birdloop *loop, sock *s) +{ + rem_node(&s->n); + loop->sock_num--; + + if (s->index >= 0) + { + loop->poll_sk.data[s->index] = NULL; + s->index = -1; + loop->poll_changed = 1; + loop->close_scheduled = 1; + birdloop_ping(loop); + } + else + close(s->fd); +} + +void +sk_stop(sock *s) +{ + sockets_remove(birdloop_current, s); +} + +static inline uint sk_want_events(sock *s) +{ return (s->rx_hook ? POLLIN : 0) | ((s->ttx != s->tpos) ? POLLOUT : 0); } + +/* +FIXME: this should be called from sock code + +static void +sockets_update(struct birdloop *loop, sock *s) +{ + if (s->index >= 0) + loop->poll_fd.data[s->index].events = sk_want_events(s); +} +*/ + +static void +sockets_prepare(struct birdloop *loop) +{ + BUFFER_SET(loop->poll_sk, loop->sock_num + 1); + BUFFER_SET(loop->poll_fd, loop->sock_num + 1); + + struct pollfd *pfd = loop->poll_fd.data; + sock **psk = loop->poll_sk.data; + uint i = 0; + node *n; + + WALK_LIST(n, loop->sock_list) + { + sock *s = SKIP_BACK(sock, n, n); + + ASSERT(i < loop->sock_num); + + s->index = i; + *psk = s; + pfd->fd = s->fd; + pfd->events = sk_want_events(s); + pfd->revents = 0; + + pfd++; + psk++; + i++; + } + + ASSERT(i == loop->sock_num); + + /* Add internal wakeup fd */ + *psk = NULL; + pipe_pollin(&loop->wakeup, pfd); + + loop->poll_changed = 0; +} + +static void +sockets_close_fds(struct birdloop *loop) +{ + struct pollfd *pfd = loop->poll_fd.data; + sock **psk = loop->poll_sk.data; + int poll_num = loop->poll_fd.used - 1; + + int i; + for (i = 0; i < poll_num; i++) + if (psk[i] == NULL) + close(pfd[i].fd); + + loop->close_scheduled = 0; +} + +int sk_read(sock *s, int revents); +int sk_write(sock *s); + +static void +sockets_fire(struct birdloop *loop) +{ + struct pollfd *pfd = loop->poll_fd.data; + sock **psk = loop->poll_sk.data; + int poll_num = loop->poll_fd.used - 1; + + times_update(); + + /* Last fd is internal wakeup fd */ + if (pfd[poll_num].revents & POLLIN) + wakeup_drain(loop); + + int i; + for (i = 0; i < poll_num; pfd++, psk++, i++) + { + int e = 1; + + if (! pfd->revents) + continue; + + if (pfd->revents & POLLNVAL) + die("poll: invalid fd %d", pfd->fd); + + if (pfd->revents & POLLIN) + while (e && *psk && (*psk)->rx_hook) + e = sk_read(*psk, pfd->revents); + + e = 1; + if (pfd->revents & POLLOUT) + { + loop->poll_changed = 1; + while (e && *psk) + e = sk_write(*psk); + } + } +} + + +/* + * Birdloop + */ + +struct birdloop main_birdloop; + +static void birdloop_enter_locked(struct birdloop *loop); + +void +birdloop_init(void) +{ + wakeup_init(&main_birdloop); + + main_birdloop.time.domain = the_bird_domain.the_bird; + main_birdloop.time.loop = &main_birdloop; + + times_update(); + timers_init(&main_birdloop.time, &root_pool); + + birdloop_enter_locked(&main_birdloop); +} + +static void *birdloop_main(void *arg); + +struct birdloop * +birdloop_new(pool *pp, uint order, const char *name) +{ + struct domain_generic *dg = domain_new(name, order); + + pool *p = rp_new(pp, name); + struct birdloop *loop = mb_allocz(p, sizeof(struct birdloop)); + loop->pool = p; + + loop->time.domain = dg; + loop->time.loop = loop; + + birdloop_enter(loop); + + wakeup_init(loop); + ev_init_list(&loop->event_list, loop, name); + timers_init(&loop->time, p); + sockets_init(loop); + + int e = 0; + + if (e = pthread_attr_init(&loop->thread_attr)) + die("pthread_attr_init() failed: %M", e); + + if (e = pthread_attr_setstacksize(&loop->thread_attr, THREAD_STACK_SIZE)) + die("pthread_attr_setstacksize(%u) failed: %M", THREAD_STACK_SIZE, e); + + if (e = pthread_attr_setdetachstate(&loop->thread_attr, PTHREAD_CREATE_DETACHED)) + die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e); + + if (e = pthread_create(&loop->thread_id, &loop->thread_attr, birdloop_main, loop)) + die("pthread_create() failed: %M", e); + + birdloop_leave(loop); + + return loop; +} + +static void +birdloop_do_stop(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + loop->stopped = stopped; + loop->stop_data = data; + wakeup_do_kick(loop); +} + +void +birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + DG_LOCK(loop->time.domain); + birdloop_do_stop(loop, stopped, data); + DG_UNLOCK(loop->time.domain); +} + +void +birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + ASSERT_DIE(loop == birdloop_current); + ASSERT_DIE(DG_IS_LOCKED(loop->time.domain)); + + birdloop_do_stop(loop, stopped, data); +} + +void +birdloop_free(struct birdloop *loop) +{ + ASSERT_DIE(loop->links == 0); + ASSERT_DIE(pthread_equal(pthread_self(), loop->thread_id)); + + rcu_birdloop_stop(&loop->rcu); + pthread_attr_destroy(&loop->thread_attr); + + domain_free(loop->time.domain); + rfree(loop->pool); +} + +static void +birdloop_enter_locked(struct birdloop *loop) +{ + ASSERT_DIE(DG_IS_LOCKED(loop->time.domain)); + ASSERT_DIE(!birdloop_inside(loop)); + + /* Store the old context */ + loop->prev_loop = birdloop_current; + + /* Put the new context */ + birdloop_current = loop; +} + +void +birdloop_enter(struct birdloop *loop) +{ + DG_LOCK(loop->time.domain); + return birdloop_enter_locked(loop); +} + +static void +birdloop_leave_locked(struct birdloop *loop) +{ + /* Check the current context */ + ASSERT_DIE(birdloop_current == loop); + + /* Send pending pings */ + if (loop->ping_pending) + { + loop->ping_pending = 0; + birdloop_do_ping(loop); + } + + /* Restore the old context */ + birdloop_current = loop->prev_loop; +} + +void +birdloop_leave(struct birdloop *loop) +{ + birdloop_leave_locked(loop); + DG_UNLOCK(loop->time.domain); +} + +void +birdloop_mask_wakeups(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_wakeup_masked == NULL); + birdloop_wakeup_masked = loop; +} + +void +birdloop_unmask_wakeups(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_wakeup_masked == loop); + birdloop_wakeup_masked = NULL; + if (birdloop_wakeup_masked_count) + wakeup_do_kick(loop); + + birdloop_wakeup_masked_count = 0; +} + +void +birdloop_link(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_inside(loop)); + loop->links++; +} + +void +birdloop_unlink(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_inside(loop)); + loop->links--; +} + +static void * +birdloop_main(void *arg) +{ + struct birdloop *loop = arg; + timer *t; + int rv, timeout; + + rcu_birdloop_start(&loop->rcu); + + btime loop_begin = current_time(); + + tmp_init(loop->pool); + + birdloop_enter(loop); + while (1) + { + timers_fire(&loop->time, 0); + if (birdloop_process_flags(loop) + birdloop_run_events(loop)) + timeout = 0; + else if (t = timers_first(&loop->time)) + timeout = (tm_remains(t) TO_MS) + 1; + else + timeout = -1; + + if (loop->poll_changed) + sockets_prepare(loop); + else + if ((timeout < 0) || (timeout > 5000)) + flush_local_pages(); + + btime duration = current_time() - loop_begin; + if (duration > config->watchdog_warning) + log(L_WARN "I/O loop cycle took %d ms", (int) (duration TO_MS)); + + birdloop_leave(loop); + + try: + rv = poll(loop->poll_fd.data, loop->poll_fd.used, timeout); + if (rv < 0) + { + if (errno == EINTR || errno == EAGAIN) + goto try; + die("poll: %m"); + } + + birdloop_enter(loop); + + if (loop->close_scheduled) + sockets_close_fds(loop); + + if (loop->stopped) + break; + + loop_begin = current_time(); + + if (rv) + sockets_fire(loop); + + atomic_exchange_explicit(&loop->ping_sent, 0, memory_order_acq_rel); + } + + /* Flush remaining events */ + ASSERT_DIE(!ev_run_list(&loop->event_list)); + + /* Drop timers */ + while (t = timers_first(&loop->time)) + tm_stop(t); + + /* No sockets allowed */ + ASSERT_DIE(EMPTY_LIST(loop->sock_list)); + ASSERT_DIE(loop->sock_num == 0); + + birdloop_leave(loop); + loop->stopped(loop->stop_data); + + flush_local_pages(); + return NULL; +} + +void +birdloop_yield(void) +{ + usleep(100); +} diff --git a/sysdep/unix/io-loop.h b/sysdep/unix/io-loop.h new file mode 100644 index 00000000..29ca96d6 --- /dev/null +++ b/sysdep/unix/io-loop.h @@ -0,0 +1,56 @@ +/* + * BIRD -- I/O and event loop + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_SYSDEP_UNIX_IO_LOOP_H_ +#define _BIRD_SYSDEP_UNIX_IO_LOOP_H_ + +#include "lib/rcu.h" + +struct pipe +{ + int fd[2]; +}; + +void pipe_new(struct pipe *); +void pipe_pollin(struct pipe *, struct pollfd *); +void pipe_drain(struct pipe *); +void pipe_kick(struct pipe *); + +struct birdloop +{ + pool *pool; + + struct timeloop time; + event_list event_list; + list sock_list; + uint sock_num; + + BUFFER(sock *) poll_sk; + BUFFER(struct pollfd) poll_fd; + u8 poll_changed; + u8 close_scheduled; + + uint ping_pending; + _Atomic u32 ping_sent; + struct pipe wakeup; + + pthread_t thread_id; + pthread_attr_t thread_attr; + + struct rcu_birdloop rcu; + + uint links; + + _Atomic u32 flags; + struct birdloop_flag_handler *flag_handler; + + void (*stopped)(void *data); + void *stop_data; + + struct birdloop *prev_loop; +}; + +#endif diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index 8a116789..6454f15f 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -36,12 +36,14 @@ #include "lib/resource.h" #include "lib/socket.h" #include "lib/event.h" +#include "lib/locking.h" #include "lib/timer.h" #include "lib/string.h" #include "nest/iface.h" #include "conf/conf.h" #include "sysdep/unix/unix.h" +#include "sysdep/unix/io-loop.h" #include CONFIG_INCLUDE_SYSIO_H /* Maximum number of calls of tx handler for one socket in one @@ -122,55 +124,50 @@ rf_fileno(struct rfile *f) btime boot_time; + void -times_init(struct timeloop *loop) +times_update(void) { struct timespec ts; int rv; + btime old_time = current_time(); + btime old_real_time = current_real_time(); + rv = clock_gettime(CLOCK_MONOTONIC, &ts); if (rv < 0) die("Monotonic clock is missing"); if ((ts.tv_sec < 0) || (((u64) ts.tv_sec) > ((u64) 1 << 40))) log(L_WARN "Monotonic clock is crazy"); - - loop->last_time = ts.tv_sec S + ts.tv_nsec NS; - loop->real_time = 0; -} - -void -times_update(struct timeloop *loop) -{ - struct timespec ts; - int rv; - - rv = clock_gettime(CLOCK_MONOTONIC, &ts); - if (rv < 0) - die("clock_gettime: %m"); - + btime new_time = ts.tv_sec S + ts.tv_nsec NS; - if (new_time < loop->last_time) + if (new_time < old_time) log(L_ERR "Monotonic clock is broken"); - loop->last_time = new_time; - loop->real_time = 0; -} - -void -times_update_real_time(struct timeloop *loop) -{ - struct timespec ts; - int rv; - rv = clock_gettime(CLOCK_REALTIME, &ts); if (rv < 0) die("clock_gettime: %m"); - loop->real_time = ts.tv_sec S + ts.tv_nsec NS; -} + btime new_real_time = ts.tv_sec S + ts.tv_nsec NS; + if (!atomic_compare_exchange_strong_explicit( + &last_time, + &old_time, + new_time, + memory_order_acq_rel, + memory_order_relaxed)) + DBG("Time update collision: last_time"); + + if (!atomic_compare_exchange_strong_explicit( + &real_time, + &old_real_time, + new_real_time, + memory_order_acq_rel, + memory_order_relaxed)) + DBG("Time update collision: real_time"); +} /** * DOC: Sockets @@ -804,18 +801,16 @@ sk_free(resource *r) sk_ssh_free(s); #endif - if (s->fd < 0) + if ((s->fd < 0) || (s->flags & SKF_THREAD)) return; - /* FIXME: we should call sk_stop() for SKF_THREAD sockets */ - if (!(s->flags & SKF_THREAD)) - { - if (s == current_sock) - current_sock = sk_next(s); - if (s == stored_sock) - stored_sock = sk_next(s); + if (s == current_sock) + current_sock = sk_next(s); + if (s == stored_sock) + stored_sock = sk_next(s); + + if (enlisted(&s->n)) rem_node(&s->n); - } if (s->type != SK_SSH && s->type != SK_SSH_ACTIVE) close(s->fd); @@ -1108,7 +1103,11 @@ sk_passive_connected(sock *s, int type) return 1; } - sk_insert(t); + if (s->flags & SKF_PASSIVE_THREAD) + t->flags |= SKF_THREAD; + else + sk_insert(t); + sk_alloc_bufs(t); s->rx_hook(t, 0); return 1; @@ -1516,6 +1515,36 @@ sk_open_unix(sock *s, char *name) return 0; } +static void +sk_reloop_hook(void *_vs) +{ + sock *s = _vs; + if (birdloop_inside(&main_birdloop)) + { + s->flags &= ~SKF_THREAD; + sk_insert(s); + } + else + { + s->flags |= SKF_THREAD; + sk_start(s); + } +} + +void +sk_reloop(sock *s, struct birdloop *loop) +{ + if (enlisted(&s->n)) + rem_node(&s->n); + + s->reloop = (event) { + .hook = sk_reloop_hook, + .data = s, + }; + + ev_send_loop(loop, &s->reloop); +} + #define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \ CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL) @@ -2037,34 +2066,21 @@ struct event_log_entry static struct event_log_entry event_log[EVENT_LOG_LENGTH]; static struct event_log_entry *event_open; static int event_log_pos, event_log_num, watchdog_active; -static btime last_time; +static btime last_io_time; static btime loop_time; static void io_update_time(void) { - struct timespec ts; - int rv; - - /* - * This is third time-tracking procedure (after update_times() above and - * times_update() in BFD), dedicated to internal event log and latency - * tracking. Hopefully, we consolidate these sometimes. - */ - - rv = clock_gettime(CLOCK_MONOTONIC, &ts); - if (rv < 0) - die("clock_gettime: %m"); - - last_time = ts.tv_sec S + ts.tv_nsec NS; + last_io_time = current_time(); if (event_open) { - event_open->duration = last_time - event_open->timestamp; + event_open->duration = last_io_time - event_open->timestamp; if (event_open->duration > config->latency_limit) - log(L_WARN "Event 0x%p 0x%p took %d ms", - event_open->hook, event_open->data, (int) (event_open->duration TO_MS)); + log(L_WARN "Event 0x%p 0x%p took %u.%03u ms", + event_open->hook, event_open->data, (uint) (event_open->duration TO_MS), (uint) (event_open->duration % 1000)); event_open = NULL; } @@ -2089,7 +2105,7 @@ io_log_event(void *hook, void *data) en->hook = hook; en->data = data; - en->timestamp = last_time; + en->timestamp = last_io_time; en->duration = 0; event_log_num++; @@ -2117,14 +2133,14 @@ io_log_dump(void) struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH; if (en->hook) log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data, - (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS)); + (int) ((last_io_time - en->timestamp) TO_MS), (int) (en->duration TO_MS)); } } void watchdog_sigalrm(int sig UNUSED) { - /* Update last_time and duration, but skip latency check */ + /* Update last_io_time and duration, but skip latency check */ config->latency_limit = 0xffffffff; io_update_time(); @@ -2137,7 +2153,7 @@ watchdog_start1(void) { io_update_time(); - loop_time = last_time; + loop_time = last_io_time; } static inline void @@ -2145,7 +2161,7 @@ watchdog_start(void) { io_update_time(); - loop_time = last_time; + loop_time = last_io_time; event_log_num = 0; if (config->watchdog_timeout) @@ -2166,10 +2182,10 @@ watchdog_stop(void) watchdog_active = 0; } - btime duration = last_time - loop_time; + btime duration = last_io_time - loop_time; if (duration > config->watchdog_warning) - log(L_WARN "I/O loop cycle took %d ms for %d events", - (int) (duration TO_MS), event_log_num); + log(L_WARN "I/O loop cycle took %u.%03u ms for %d events", + (uint) (duration TO_MS), (uint) (duration % 1000), event_log_num); } @@ -2181,8 +2197,9 @@ void io_init(void) { init_list(&sock_list); - init_list(&global_event_list); - init_list(&global_work_list); + ev_init_list(&global_event_list, &main_birdloop, "Global event list"); + ev_init_list(&global_work_list, &main_birdloop, "Global work list"); + ev_init_list(&main_birdloop.event_list, &main_birdloop, "Global fast event list"); krt_io_init(); // XXX init_times(); // XXX update_times(); @@ -2210,22 +2227,27 @@ io_loop(void) watchdog_start1(); for(;;) { - times_update(&main_timeloop); + times_update(); events = ev_run_list(&global_event_list); events = ev_run_list_limited(&global_work_list, WORK_EVENTS_MAX) || events; - timers_fire(&main_timeloop); + events = ev_run_list(&main_birdloop.event_list) || events; + timers_fire(&main_birdloop.time, 1); io_close_event(); // FIXME poll_tout = (events ? 0 : 3000); /* Time in milliseconds */ - if (t = timers_first(&main_timeloop)) + if (t = timers_first(&main_birdloop.time)) { - times_update(&main_timeloop); + times_update(); timeout = (tm_remains(t) TO_MS) + 1; poll_tout = MIN(poll_tout, timeout); } - nfds = 0; + /* A hack to reload main io_loop() when something has changed asynchronously. */ + pipe_pollin(&main_birdloop.wakeup, &pfd[0]); + + nfds = 1; + WALK_LIST(n, sock_list) { pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */ @@ -2284,7 +2306,9 @@ io_loop(void) /* And finally enter poll() to find active sockets */ watchdog_stop(); + birdloop_leave(&main_birdloop); pout = poll(pfd, nfds, poll_tout); + birdloop_enter(&main_birdloop); watchdog_start(); if (pout < 0) @@ -2295,7 +2319,15 @@ io_loop(void) } if (pout) { - times_update(&main_timeloop); + if (pfd[0].revents & POLLIN) + { + /* IO loop reload requested */ + pipe_drain(&main_birdloop.wakeup); + atomic_exchange_explicit(&main_birdloop.ping_sent, 0, memory_order_acq_rel); + continue; + } + + times_update(); /* guaranteed to be non-empty */ current_sock = SKIP_BACK(sock, n, HEAD(sock_list)); diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index 93d19614..080c3a03 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -314,6 +314,7 @@ krt_learn_scan(struct krt_proto *p, rte *e) ea_set_attr_u32(&e0.attrs, &ea_gen_preference, 0, p->p.main_channel->preference); rte_update(p->p.main_channel, e->net, &e0, e0.src); + rt_unlock_source(e0.src); } static void @@ -322,9 +323,9 @@ krt_learn_async(struct krt_proto *p, rte *e, int new) if (new) return krt_learn_scan(p, e); - struct rte_src *src = rt_find_source(&p->p, krt_metric(e)); - if (src) - rte_update(p->p.main_channel, e->net, NULL, src); + struct rte_src *src = rt_get_source(&p->p, krt_metric(e)); + rte_update(p->p.main_channel, e->net, NULL, src); + rt_unlock_source(src); } #endif @@ -365,6 +366,13 @@ rte_feed_obtain(net *n, rte **feed, uint count) static struct rte * krt_export_net(struct krt_proto *p, net *net) { + /* FIXME: Here we are calling filters in table-locked context when exporting + * to kernel. Here BIRD can crash if the user requested ROA check in kernel + * export filter. It doesn't make much sense to write the filters like this, + * therefore we may keep this unfinished piece of work here for later as it + * won't really affect anybody. */ + ASSERT_DIE(RT_IS_LOCKED(p->p.main_channel->table)); + struct channel *c = p->p.main_channel; const struct filter *filter = c->out_filter; @@ -445,6 +453,9 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src) #endif /* The rest is for KRT_SRC_BIRD (or KRT_SRC_UNKNOWN) */ + RT_LOCKED(p->p.main_channel->table, tab) + { + /* Deleting all routes if flush is requested */ if (p->flush_routes) goto delete; @@ -453,7 +464,7 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src) if (!p->ready) goto ignore; - net *net = net_find(p->p.main_channel->table, e->net); + net *net = net_find(tab, e->net); if (!net || !krt_is_installed(p, net)) goto delete; @@ -498,7 +509,9 @@ delete: krt_replace_rte(p, e->net, NULL, e); goto done; -done: +done:; + } + lp_flush(krt_filter_lp); } @@ -511,7 +524,8 @@ krt_init_scan(struct krt_proto *p) static void krt_prune(struct krt_proto *p) { - struct rtable *t = p->p.main_channel->table; + RT_LOCKED(p->p.main_channel->table, t) + { KRT_TRACE(p, D_EVENTS, "Pruning table %s", t->name); FIB_WALK(&t->fib, net, n) @@ -533,6 +547,8 @@ krt_prune(struct krt_proto *p) if (p->ready) p->initialized = 1; + + } } static void @@ -577,18 +593,17 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src) } } + /* * Periodic scanning */ - -#ifdef CONFIG_ALL_TABLES_AT_ONCE - -static timer *krt_scan_timer; -static int krt_scan_count; +static timer *krt_scan_all_timer; +static int krt_scan_all_count; +static _Bool krt_scan_all_tables; static void -krt_scan(timer *t UNUSED) +krt_scan_all(timer *t UNUSED) { struct krt_proto *p; node *n; @@ -609,35 +624,42 @@ krt_scan(timer *t UNUSED) } static void -krt_scan_timer_start(struct krt_proto *p) +krt_scan_all_timer_start(struct krt_proto *p) { - if (!krt_scan_count) - krt_scan_timer = tm_new_init(krt_pool, krt_scan, NULL, KRT_CF->scan_time, 0); + if (!krt_scan_all_count) + krt_scan_all_timer = tm_new_init(krt_pool, krt_scan_all, NULL, KRT_CF->scan_time, 0); - krt_scan_count++; + krt_scan_all_count++; - tm_start(krt_scan_timer, 1 S); + tm_start(krt_scan_all_timer, 1 S); } static void -krt_scan_timer_stop(struct krt_proto *p UNUSED) +krt_scan_all_timer_stop(void) { - krt_scan_count--; + ASSERT(krt_scan_all_count > 0); + + krt_scan_all_count--; - if (!krt_scan_count) + if (!krt_scan_all_count) { - rfree(krt_scan_timer); - krt_scan_timer = NULL; + rfree(krt_scan_all_timer); + krt_scan_all_timer = NULL; } } static void -krt_scan_timer_kick(struct krt_proto *p UNUSED) +krt_scan_all_timer_kick(void) { - tm_start(krt_scan_timer, 0); + tm_start(krt_scan_all_timer, 0); +} + +void +krt_use_shared_scan(void) +{ + krt_scan_all_tables = 1; } -#else static void krt_scan(timer *t) @@ -655,35 +677,42 @@ krt_scan(timer *t) static void krt_scan_timer_start(struct krt_proto *p) { - p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0); - tm_start(p->scan_timer, 1 S); + if (krt_scan_all_tables) + krt_scan_all_timer_start(p); + else + { + p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0); + tm_start(p->scan_timer, 1 S); + } } static void krt_scan_timer_stop(struct krt_proto *p) { - tm_stop(p->scan_timer); + if (krt_scan_all_tables) + krt_scan_all_timer_stop(); + else + tm_stop(p->scan_timer); } static void krt_scan_timer_kick(struct krt_proto *p) { - tm_start(p->scan_timer, 0); + if (krt_scan_all_tables) + krt_scan_all_timer_kick(); + else + tm_start(p->scan_timer, 0); } -#endif - - - /* * Updates */ static int -krt_preexport(struct channel *c, rte *e) +krt_preexport(struct channel *C, rte *e) { - if (e->src->proto == c->proto) + if (e->src->owner == &C->proto->sources) return -1; if (!krt_capable(e)) @@ -791,11 +820,6 @@ krt_postconfig(struct proto_config *CF) if (! proto_cf_main_channel(CF)) cf_error("Channel not specified"); -#ifdef CONFIG_ALL_TABLES_AT_ONCE - if (krt_cf->scan_time != cf->scan_time) - cf_error("All kernel syncers must use the same table scan interval"); -#endif - struct channel_config *cc = proto_cf_main_channel(CF); struct rtable_config *tab = cc->table; if (tab->krt_attached) diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index e0d60cbd..9f7ebb4f 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -21,11 +21,6 @@ struct kif_proto; #define KRT_DEFAULT_ECMP_LIMIT 16 -#if 0 -#define EA_KRT_SOURCE EA_CODE(PROTOCOL_KERNEL, 0) -#define EA_KRT_METRIC EA_CODE(PROTOCOL_KERNEL, 1) -#endif - extern struct ea_class ea_krt_source, ea_krt_metric; #define KRT_REF_SEEN 0x1 /* Seen in table */ @@ -55,10 +50,7 @@ struct krt_proto { struct proto p; struct krt_state sys; /* Sysdep state */ -#ifndef CONFIG_ALL_TABLES_AT_ONCE timer *scan_timer; -#endif - struct bmap sync_map; /* Keeps track which exported routes were successfully written to kernel */ struct bmap seen_map; /* Routes seen during last periodic scan */ node krt_node; /* Node in krt_proto_list */ @@ -80,6 +72,7 @@ extern pool *krt_pool; struct proto_config * kif_init_config(int class); void kif_request_scan(void); +void krt_use_shared_scan(void); void krt_got_route(struct krt_proto *p, struct rte *e, s8 src); void krt_got_route_async(struct krt_proto *p, struct rte *e, int new, s8 src); diff --git a/sysdep/unix/log.c b/sysdep/unix/log.c index 4e9df069..185231e8 100644 --- a/sysdep/unix/log.c +++ b/sysdep/unix/log.c @@ -15,6 +15,7 @@ * user's manual. */ +#include <stdatomic.h> #include <stdio.h> #include <stdlib.h> #include <stdarg.h> @@ -35,8 +36,10 @@ static FILE *dbgf; static list *current_log_list; static char *current_syslog_name; /* NULL -> syslog closed */ +static _Atomic uint max_thread_id = ATOMIC_VAR_INIT(1); +static _Thread_local uint this_thread_id; -#ifdef USE_PTHREADS +#define THIS_THREAD_ID (this_thread_id ?: (this_thread_id = atomic_fetch_add_explicit(&max_thread_id, 1, memory_order_acq_rel))) #include <pthread.h> @@ -48,15 +51,6 @@ static pthread_t main_thread; void main_thread_init(void) { main_thread = pthread_self(); } static int main_thread_self(void) { return pthread_equal(pthread_self(), main_thread); } -#else - -static inline void log_lock(void) { } -static inline void log_unlock(void) { } -void main_thread_init(void) { } -static int main_thread_self(void) { return 1; } - -#endif - #ifdef HAVE_SYSLOG_H #include <sys/syslog.h> @@ -189,7 +183,7 @@ log_commit(int class, buffer *buf) l->pos += msg_len; } - fprintf(l->fh, "%s <%s> ", tbuf, class_names[class]); + fprintf(l->fh, "%s [%04x] <%s> ", tbuf, THIS_THREAD_ID, class_names[class]); } fputs(buf->start, l->fh); fputc('\n', l->fh); @@ -299,6 +293,8 @@ die(const char *msg, ...) exit(1); } +static struct timespec dbg_time_start; + /** * debug - write to debug output * @msg: a printf-like message @@ -311,12 +307,33 @@ debug(const char *msg, ...) { #define MAX_DEBUG_BUFSIZE 16384 va_list args; - char buf[MAX_DEBUG_BUFSIZE]; + char buf[MAX_DEBUG_BUFSIZE], *pos = buf; + int max = MAX_DEBUG_BUFSIZE; va_start(args, msg); if (dbgf) { - if (bvsnprintf(buf, MAX_DEBUG_BUFSIZE, msg, args) < 0) + struct timespec dbg_time; + clock_gettime(CLOCK_MONOTONIC, &dbg_time); + uint nsec; + uint sec; + + if (dbg_time.tv_nsec > dbg_time_start.tv_nsec) + { + nsec = dbg_time.tv_nsec - dbg_time_start.tv_nsec; + sec = dbg_time.tv_sec - dbg_time_start.tv_sec; + } + else + { + nsec = 1000000000 + dbg_time.tv_nsec - dbg_time_start.tv_nsec; + sec = dbg_time.tv_sec - dbg_time_start.tv_sec - 1; + } + + int n = bsnprintf(pos, max, "%u.%09u: [%04x] ", sec, nsec, THIS_THREAD_ID); + pos += n; + max -= n; + + if (bvsnprintf(pos, max, msg, args) < 0) bug("Extremely long debug output, split it."); fputs(buf, dbgf); @@ -422,6 +439,8 @@ done: void log_init_debug(char *f) { + clock_gettime(CLOCK_MONOTONIC, &dbg_time_start); + if (dbgf && dbgf != stderr) fclose(dbgf); if (!f) diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index fd4934d9..bf9f2be0 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -28,6 +28,7 @@ #include "lib/resource.h" #include "lib/socket.h" #include "lib/event.h" +#include "lib/locking.h" #include "lib/timer.h" #include "lib/string.h" #include "nest/rt.h" @@ -116,7 +117,7 @@ add_num_const(char *name, int val, const char *file, const uint line) struct f_val *v = cfg_alloc(sizeof(struct f_val)); *v = (struct f_val) { .type = T_INT, .val.i = val }; struct symbol *sym = cf_get_symbol(name); - if (sym->class && (sym->scope == conf_this_scope)) + if (sym->class && cf_symbol_is_local(sym)) cf_error("Error reading value for %s from %s:%d: already defined", name, file, line); cf_define_symbol(sym, SYM_CONSTANT | T_INT, val, v); @@ -873,13 +874,16 @@ main(int argc, char **argv) dmalloc_debug(0x2f03d00); #endif + times_update(); parse_args(argc, argv); log_switch(1, NULL, NULL); + the_bird_lock(); + random_init(); net_init(); resource_init(); - timer_init(); + birdloop_init(); olock_init(); rt_init(); io_init(); @@ -927,6 +931,7 @@ main(int argc, char **argv) dup2(0, 2); } + main_thread_init(); write_pid_file(); diff --git a/test/birdtest.c b/test/birdtest.c index ae05d1a5..5e3de1c5 100644 --- a/test/birdtest.c +++ b/test/birdtest.c @@ -21,6 +21,7 @@ #include "test/birdtest.h" #include "lib/string.h" #include "lib/event.h" +#include "lib/io-loop.h" #ifdef HAVE_EXECINFO_H #include <execinfo.h> @@ -64,6 +65,9 @@ bt_init(int argc, char *argv[]) { int c; + /* We have no interest in stdin */ + close(0); + initstate(BT_RANDOM_SEED, (char *) bt_random_state, sizeof(bt_random_state)); bt_verbose = 0; @@ -120,9 +124,11 @@ bt_init(int argc, char *argv[]) clock_gettime(CLOCK_MONOTONIC, &bt_begin); bt_suite_case_begin = bt_suite_begin = bt_begin; + the_bird_lock(); resource_init(); - ev_init_list(&global_event_list); - + ev_init_list(&global_event_list, &main_birdloop, "Global event list in unit tests"); + ev_init_list(&global_work_list, &main_birdloop, "Global work list in unit tests"); + birdloop_init(); return; usage: diff --git a/test/birdtest.h b/test/birdtest.h index ad5f8f9c..b8978b3e 100644 --- a/test/birdtest.h +++ b/test/birdtest.h @@ -40,7 +40,7 @@ static inline u64 bt_random(void) void bt_log_suite_result(int result, const char *fmt, ...); void bt_log_suite_case_result(int result, const char *fmt, ...); -#define BT_TIMEOUT 5 /* Default timeout in seconds */ +#define BT_TIMEOUT 20 /* Default timeout in seconds */ #define BT_FORKING 1 /* Forking is enabled in default */ #define BT_RANDOM_SEED 0x5097d2bb diff --git a/test/bt-utils.c b/test/bt-utils.c index 509b5ed4..36e44da4 100644 --- a/test/bt-utils.c +++ b/test/bt-utils.c @@ -53,6 +53,8 @@ cf_file_read(byte *dest, uint max_len, int fd) return l; } +void resource_sys_init(void); + void bt_bird_init(void) { @@ -61,7 +63,6 @@ bt_bird_init(void) log_switch(bt_verbose != 0, NULL, NULL); olock_init(); - timer_init(); rt_init(); io_init(); if_init(); @@ -73,6 +74,7 @@ bt_bird_init(void) void bt_bird_cleanup(void) { config = new_config = NULL; + the_bird_unlock(); } static char * |