diff options
166 files changed, 13593 insertions, 7559 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 39202098..0a758cff 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,7 +3,7 @@ variables: LC_ALL: C.UTF-8 GIT_STRATEGY: fetch DOCKER_CMD: docker --config="$HOME/.docker/$CI_JOB_ID/" - IMG_BASE: registry.labs.nic.cz/labs/bird + IMG_BASE: registry.nic.cz/labs/bird TOOLS_DIR: /var/lib/gitlab-runner/bird-tools stages: @@ -16,7 +16,7 @@ stages: stage: image allow_failure: true script: - - $DOCKER_CMD login -u gitlab-ci-token -p $CI_BUILD_TOKEN registry.labs.nic.cz + - $DOCKER_CMD login -u gitlab-ci-token -p $CI_BUILD_TOKEN registry.nic.cz # Make sure we refresh the base image if it updates (eg. security updates, etc) # If we do just the build, cache is always reused and the freshness of the # base image is never checked. However, pull always asks and updates the @@ -154,9 +154,9 @@ docker_ubuntu-20_04-amd64: IMG_NAME: "ubuntu-20.04-amd64" <<: *docker_build -docker_ubuntu-20_10-amd64: +docker_ubuntu-21_10-amd64: variables: - IMG_NAME: "ubuntu-20.10-amd64" + IMG_NAME: "ubuntu-21.10-amd64" <<: *docker_build # GPG error @@ -224,135 +224,135 @@ docker_opensuse-15.3-amd64: build-debian-8-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:debian-8-amd64 + image: registry.nic.cz/labs/bird:debian-8-amd64 build-debian-8-i386: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:debian-8-i386 + image: registry.nic.cz/labs/bird:debian-8-i386 build-debian-9-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:debian-9-amd64 + image: registry.nic.cz/labs/bird:debian-9-amd64 build-debian-9-i386: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:debian-9-i386 + image: registry.nic.cz/labs/bird:debian-9-i386 build-debian-10-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:debian-10-amd64 + image: registry.nic.cz/labs/bird:debian-10-amd64 build-debian-10-i386: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:debian-10-i386 + image: registry.nic.cz/labs/bird:debian-10-i386 build-debian-11-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:debian-11-amd64 + image: registry.nic.cz/labs/bird:debian-11-amd64 #build-debian-11-i386: # <<: *build-linux -# image: registry.labs.nic.cz/labs/bird:debian-11-i386 +# image: registry.nic.cz/labs/bird:debian-11-i386 build-debian-testing-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:debian-testing-amd64 + image: registry.nic.cz/labs/bird:debian-testing-amd64 #build-debian-testing-i386: # <<: *build-linux -# image: registry.labs.nic.cz/labs/bird:debian-testing-i386 +# image: registry.nic.cz/labs/bird:debian-testing-i386 build-fedora-25-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:fedora-25-amd64 + image: registry.nic.cz/labs/bird:fedora-25-amd64 build-fedora-26-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:fedora-26-amd64 + image: registry.nic.cz/labs/bird:fedora-26-amd64 build-fedora-27-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:fedora-27-amd64 + image: registry.nic.cz/labs/bird:fedora-27-amd64 build-fedora-28-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:fedora-28-amd64 + image: registry.nic.cz/labs/bird:fedora-28-amd64 build-fedora-29-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:fedora-29-amd64 + image: registry.nic.cz/labs/bird:fedora-29-amd64 build-fedora-30-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:fedora-30-amd64 + image: registry.nic.cz/labs/bird:fedora-30-amd64 build-fedora-31-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:fedora-31-amd64 + image: registry.nic.cz/labs/bird:fedora-31-amd64 build-fedora-32-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:fedora-32-amd64 + image: registry.nic.cz/labs/bird:fedora-32-amd64 build-fedora-33-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:fedora-33-amd64 + image: registry.nic.cz/labs/bird:fedora-33-amd64 build-fedora-34-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:fedora-33-amd64 + image: registry.nic.cz/labs/bird:fedora-33-amd64 build-centos-8-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:centos-8-amd64 + image: registry.nic.cz/labs/bird:centos-8-amd64 build-ubuntu-16_04-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:ubuntu-16.04-amd64 + image: registry.nic.cz/labs/bird:ubuntu-16.04-amd64 build-ubuntu-18_04-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:ubuntu-18.04-amd64 + image: registry.nic.cz/labs/bird:ubuntu-18.04-amd64 build-ubuntu-20_04-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:ubuntu-20.04-amd64 + image: registry.nic.cz/labs/bird:ubuntu-20.04-amd64 -build-ubuntu-20_10-amd64: +build-ubuntu-21_10-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:ubuntu-20.10-amd64 + image: registry.nic.cz/labs/bird:ubuntu-21.10-amd64 #build-ubuntu-21_04-amd64: # <<: *build-linux -# image: registry.labs.nic.cz/labs/bird:ubuntu-21.04-amd64 +# image: registry.nic.cz/labs/bird:ubuntu-21.04-amd64 build-opensuse-15.0-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:opensuse-15.0-amd64 + image: registry.nic.cz/labs/bird:opensuse-15.0-amd64 build-opensuse-15.1-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:opensuse-15.1-amd64 + image: registry.nic.cz/labs/bird:opensuse-15.1-amd64 build-opensuse-15.2-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:opensuse-15.2-amd64 + image: registry.nic.cz/labs/bird:opensuse-15.2-amd64 build-opensuse-15.3-amd64: <<: *build-linux - image: registry.labs.nic.cz/labs/bird:opensuse-15.3-amd64 + image: registry.nic.cz/labs/bird:opensuse-15.3-amd64 -build-freebsd-11-amd64: - <<: *build-base - tags: - - freebsd - - amd64 +#build-freebsd-11-amd64: +# <<: *build-base +# tags: +# - freebsd +# - amd64 -build-freebsd-11-i386: - <<: *build-base - tags: - - freebsd - - i386 +#build-freebsd-11-i386: +# <<: *build-base +# tags: +# - freebsd +# - i386 .pkg-deb: &pkg-deb @@ -390,60 +390,60 @@ build-freebsd-11-i386: #pkg-debian-8-amd64: # <<: *pkg-deb # needs: [build-debian-8-amd64] -# image: registry.labs.nic.cz/labs/bird:debian-8-amd64 +# image: registry.nic.cz/labs/bird:debian-8-amd64 # Dpkg error: PATH is not set #pkg-debian-8-i386: # <<: *pkg-deb # needs: [build-debian-8-i386] -# image: registry.labs.nic.cz/labs/bird:debian-8-i386 +# image: registry.nic.cz/labs/bird:debian-8-i386 # Dpkg error: PATH is not set pkg-debian-9-amd64: <<: *pkg-deb needs: [build-debian-9-amd64] - image: registry.labs.nic.cz/labs/bird:debian-9-amd64 + image: registry.nic.cz/labs/bird:debian-9-amd64 # Dpkg error: PATH is not set pkg-debian-9-i386: <<: *pkg-deb needs: [build-debian-9-i386] - image: registry.labs.nic.cz/labs/bird:debian-9-i386 + image: registry.nic.cz/labs/bird:debian-9-i386 pkg-debian-10-amd64: <<: *pkg-deb needs: [build-debian-10-amd64] - image: registry.labs.nic.cz/labs/bird:debian-10-amd64 + image: registry.nic.cz/labs/bird:debian-10-amd64 pkg-debian-10-i386: <<: *pkg-deb needs: [build-debian-10-i386] - image: registry.labs.nic.cz/labs/bird:debian-10-i386 + image: registry.nic.cz/labs/bird:debian-10-i386 pkg-debian-11-amd64: <<: *pkg-deb needs: [build-debian-11-amd64] - image: registry.labs.nic.cz/labs/bird:debian-11-amd64 + image: registry.nic.cz/labs/bird:debian-11-amd64 pkg-fedora-30-amd64: <<: *pkg-rpm-wa needs: [build-fedora-30-amd64] - image: registry.labs.nic.cz/labs/bird:fedora-30-amd64 + image: registry.nic.cz/labs/bird:fedora-30-amd64 pkg-fedora-31-amd64: <<: *pkg-rpm-wa needs: [build-fedora-31-amd64] - image: registry.labs.nic.cz/labs/bird:fedora-31-amd64 + image: registry.nic.cz/labs/bird:fedora-31-amd64 pkg-fedora-32-amd64: <<: *pkg-rpm-wa needs: [build-fedora-32-amd64] - image: registry.labs.nic.cz/labs/bird:fedora-32-amd64 + image: registry.nic.cz/labs/bird:fedora-32-amd64 pkg-fedora-33-amd64: <<: *pkg-rpm-wa needs: [build-fedora-33-amd64] - image: registry.labs.nic.cz/labs/bird:fedora-33-amd64 + image: registry.nic.cz/labs/bird:fedora-33-amd64 pkg-fedora-34-amd64: <<: *pkg-rpm @@ -453,42 +453,43 @@ pkg-fedora-34-amd64: pkg-centos-8-amd64: <<: *pkg-rpm-wa needs: [build-centos-8-amd64] - image: registry.labs.nic.cz/labs/bird:centos-8-amd64 + image: registry.nic.cz/labs/bird:centos-8-amd64 pkg-ubuntu-18.04-amd64: <<: *pkg-deb needs: [build-ubuntu-18_04-amd64] - image: registry.labs.nic.cz/labs/bird:ubuntu-18.04-amd64 + image: registry.nic.cz/labs/bird:ubuntu-18.04-amd64 pkg-ubuntu-20.04-amd64: <<: *pkg-deb needs: [build-ubuntu-20_04-amd64] - image: registry.labs.nic.cz/labs/bird:ubuntu-20.04-amd64 + image: registry.nic.cz/labs/bird:ubuntu-20.04-amd64 + -pkg-ubuntu-20.10-amd64: +pkg-ubuntu-21.10-amd64: <<: *pkg-deb - needs: [build-ubuntu-20_10-amd64] - image: registry.labs.nic.cz/labs/bird:ubuntu-20.10-amd64 + needs: [build-ubuntu-21_10-amd64] + image: registry.nic.cz/labs/bird:ubuntu-21.10-amd64 #pkg-ubuntu-21.04-amd64: # <<: *pkg-deb # needs: [build-ubuntu-21_04-amd64] -# image: registry.labs.nic.cz/labs/bird:ubuntu-21.04-amd64 +# image: registry.nic.cz/labs/bird:ubuntu-21.04-amd64 pkg-opensuse-15.1-amd64: <<: *pkg-rpm-wa needs: [build-opensuse-15.1-amd64] - image: registry.labs.nic.cz/labs/bird:opensuse-15.1-amd64 + image: registry.nic.cz/labs/bird:opensuse-15.1-amd64 pkg-opensuse-15.2-amd64: <<: *pkg-rpm-wa needs: [build-opensuse-15.2-amd64] - image: registry.labs.nic.cz/labs/bird:opensuse-15.2-amd64 + image: registry.nic.cz/labs/bird:opensuse-15.2-amd64 pkg-opensuse-15.3-amd64: <<: *pkg-rpm-wa needs: [build-opensuse-15.3-amd64] - image: registry.labs.nic.cz/labs/bird:opensuse-15.3-amd64 + image: registry.nic.cz/labs/bird:opensuse-15.3-amd64 build-birdlab: @@ -557,6 +558,11 @@ test-ospf-custom: variables: TEST_NAME: cf-ospf-custom +test-ospf-area: + <<: *test-base + variables: + TEST_NAME: cf-ospf-area + test-ospf-vrf: <<: *test-base variables: @@ -611,3 +617,8 @@ test-babel-auth: <<: *test-base variables: TEST_NAME: cf-babel-auth + +test-rip-base: + <<: *test-base + variables: + TEST_NAME: cf-rip-base diff --git a/Makefile.in b/Makefile.in index e0ff4a1d..fa534872 100644 --- a/Makefile.in +++ b/Makefile.in @@ -26,6 +26,7 @@ INSTALL_DATA=@INSTALL_DATA@ client=$(addprefix $(exedir)/,@CLIENT@) daemon=$(exedir)/bird protocols=@protocols@ +PROTO_BUILD := $(protocols) dev kif krt prefix=@prefix@ exec_prefix=@exec_prefix@ @@ -1,3 +1,46 @@ +Version 2.0.10 (2022-06-16) + o BGP performance improvements + o BFD: New 'strict bind' option + o RPKI: VRF support + o Allow use of 240.0.0.0/4 as a private range + o BIRD client uses exit status to report errors + o Important bugfixes + +Version 2.0.9 (2022-02-09) + o BGP: Flowspec validation procedure + o Babel: MAC authentication support + o Routing table configuration blocks + o Optional prefix trie in routing table for faster LPM/interval queries + o CLI: New 'show route in <prefix>' command + o Filter: Faster (16-way) prefix sets + o Filter: MPLS label route attribute + o Filter: Operators to pick community components + o Filter: Operators to find minimum and maximum element of lists + o BGP: New 'free bind' option + o BGP: Log route updates that were changed to withdraws + o BGP: Improved 'invalid next hop' error reporting + o OSPF: Allow ifaces with host address as unnumbered PtP or PtMP ifaces + o OSPF: All packets on PtP networks should be sent to AllSPFRouters address + o Scripts for apkg-powered upstream packaging for deb and rpm + o Support for Blake2s and Blake2b hash functions + o Security keys / passwords can be entered in hexadecimal digits + o Memory statistics split into Effective and Overhead + o Linux: New option 'netlink rx buffer' to specify netlink socket buffer size + o BSD: Assume onlink flag on ifaces with only host addresses + o Many bugfixes + + Notes: + + For OSPF on PtP network, BIRD now sends all packets to multicast AllSPFRouters + address (as required in RFC 2328 8.1). This likely breaks setups with multiple + neighbors on a network configured as PtP, which worked in previous versions. + Such links should be configured as PtMP. + + Since Linux 5.3, netlink socket can be flooded by route cache entries during + route table scan. This version mitigates that issue by using strict netlink + filtering. + + Version 2.0.8 (2021-03-18) o Automatic channel reloads based on RPKI changes o Multiple static routes with the same network @@ -2,6 +2,32 @@ dnl ** Additional Autoconf tests for BIRD configure script dnl ** (c) 1999 Martin Mares <mj@ucw.cz> dnl ** (c) 2021 Maria Matejka <mq@jmq.cz> +AC_DEFUN([BIRD_CHECK_POINTER_ALIGNMENT], +[ + AC_CACHE_CHECK( + [how pointers are aligned], + [bird_cv_pointer_alignment], + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM( + [ + _Static_assert(_Alignof(void *) == 8, "bad"); + ], [] + ) + ], + [bird_cv_pointer_alignment=8], + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM( + [ + _Static_assert(_Alignof(void *) == 4, "bad"); + ], [] + ) + ], + [bird_cv_pointer_alignment=4], + [bird_cv_pointer_alignment=unknown] + )) + ) +]) + AC_DEFUN([BIRD_CHECK_THREAD_LOCAL], [ AC_CACHE_CHECK( diff --git a/bird-gdb.py b/bird-gdb.py index c7351d6b..077703f9 100644 --- a/bird-gdb.py +++ b/bird-gdb.py @@ -25,7 +25,6 @@ class BIRDFValPrinter(BIRDPrinter): "T_ENUM_RTS": "i", "T_ENUM_BGP_ORIGIN": "i", "T_ENUM_SCOPE": "i", - "T_ENUM_RTC": "i", "T_ENUM_RTD": "i", "T_ENUM_ROA": "i", "T_ENUM_NETTYPE": "i", diff --git a/client/client.c b/client/client.c index 97cf6639..934e16e0 100644 --- a/client/client.c +++ b/client/client.c @@ -50,6 +50,7 @@ static byte *server_read_pos = server_read_buf; int init = 1; /* During intial sequence */ int busy = 1; /* Executing BIRD command */ int interactive; /* Whether stdin is terminal */ +int last_code; /* Last return code */ static int num_lines, skip_input; int term_lns, term_cls; @@ -196,7 +197,7 @@ init_commands(void) { /* Initial command is finished and we want to exit */ cleanup(); - exit(0); + exit((last_code < 8000) ? 0 : 1); } input_init(); @@ -283,6 +284,8 @@ server_got_reply(char *x) if (code) PRINTF(len, "%s\n", verbose ? x : x+5); + last_code = code; + if (x[4] == ' ') { busy = 0; diff --git a/conf/cf-lex.l b/conf/cf-lex.l index 704a1750..04e0b3a5 100644 --- a/conf/cf-lex.l +++ b/conf/cf-lex.l @@ -42,7 +42,7 @@ #define PARSER 1 #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "filter/filter.h" #include "filter/f-inst.h" @@ -77,19 +77,22 @@ static uint cf_hash(const byte *c); #define SYM_NEXT(n) n->next #define SYM_EQ(a,s1,b,s2) !strcmp(a,b) && s1 == s2 #define SYM_FN(k,s) cf_hash(k) -#define SYM_ORDER 6 /* Initial */ +#define SYM_ORDER 4 /* Initial */ #define SYM_REHASH sym_rehash -#define SYM_PARAMS /8, *1, 2, 2, 6, 20 +#define SYM_PARAMS /8, *1, 2, 2, 4, 20 HASH_DEFINE_REHASH_FN(SYM, struct symbol) HASH(struct keyword) kw_hash; - +HASH(struct ea_class) ea_name_hash; struct sym_scope *conf_this_scope; +static struct sym_scope global_root_scope__init = { .active = 1, }; +struct sym_scope *global_root_scope = &global_root_scope__init; + linpool *cfg_mem; int (*cf_read_hook)(byte *buf, unsigned int max, int fd); @@ -255,7 +258,7 @@ WHITE [ \t] return IP4; } -{XIGIT}{2}(:{XIGIT}{2}|{XIGIT}{2}){15,} { +{XIGIT}{2}((:{XIGIT}{2}){15,}|({XIGIT}{2}){15,}) { char *s = yytext; size_t len = 0, i; struct bytestring *bytes; @@ -347,7 +350,7 @@ else: { return DDOT; } -[={}:;,.()+*/%<>~\[\]?!\|-] { +[={}:;,.()+*/%<>~\[\]?!\|&-] { return yytext[0]; } @@ -574,6 +577,8 @@ check_eof(void) return 0; } +static inline void cf_swap_soft_scope(void); + static struct symbol * cf_new_symbol(const byte *c) { @@ -583,45 +588,64 @@ cf_new_symbol(const byte *c) if (l > SYM_MAX_LEN) cf_error("Symbol too long"); + cf_swap_soft_scope(); + s = cfg_allocz(sizeof(struct symbol) + l + 1); *s = (struct symbol) { .scope = conf_this_scope, .class = SYM_VOID, }; strcpy(s->name, c); - if (!new_config->sym_hash.data) - HASH_INIT(new_config->sym_hash, new_config->pool, SYM_ORDER); + if (!conf_this_scope->hash.data) + HASH_INIT(conf_this_scope->hash, new_config->pool, SYM_ORDER); + + HASH_INSERT2(conf_this_scope->hash, SYM, new_config->pool, s); + + if (conf_this_scope == new_config->root_scope) + add_tail(&(new_config->symbols), &(s->n)); + + return s; +} + +static struct symbol * +cf_root_symbol(const byte *c) +{ + uint l = strlen(c); + if (l > SYM_MAX_LEN) + bug("Root symbol %s too long", c); - HASH_INSERT2(new_config->sym_hash, SYM, new_config->pool, s); + struct symbol *s = mb_alloc(&root_pool, sizeof(struct symbol) + l + 1); + *s = (struct symbol) { .scope = global_root_scope, .class = SYM_VOID, }; + memcpy(s->name, c, l+1); - add_tail(&(new_config->symbols), &(s->n)); + if (!global_root_scope->hash.data) + HASH_INIT(global_root_scope->hash, &root_pool, SYM_ORDER); + HASH_INSERT2(global_root_scope->hash, SYM, &root_pool, s); return s; } + /** - * cf_find_symbol - find a symbol by name - * @cfg: specificed config + * cf_find_symbol_scope - find a symbol by name + * @scope: config scope * @c: symbol name * - * This functions searches the symbol table in the config @cfg for a symbol of - * given name. First it examines the current scope, then the second recent one + * This functions searches the symbol table in the scope @scope for a symbol of + * given name. First it examines the current scope, then the underlying one * and so on until it either finds the symbol and returns a pointer to its * &symbol structure or reaches the end of the scope chain and returns %NULL to * signify no match. */ struct symbol * -cf_find_symbol(const struct config *cfg, const byte *c) +cf_find_symbol_scope(const struct sym_scope *scope, const byte *c) { struct symbol *s; - if (cfg->sym_hash.data && - (s = HASH_FIND(cfg->sym_hash, SYM, c, 1))) - return s; - - /* In CLI command parsing, fallback points to the current config, otherwise it is NULL. */ - if (cfg->fallback && - cfg->fallback->sym_hash.data && - (s = HASH_FIND(cfg->fallback->sym_hash, SYM, c, 1))) - return s; + /* Find the symbol here or anywhere below */ + while (scope) + if (scope->hash.data && (s = HASH_FIND(scope->hash, SYM, c, 1))) + return s; + else + scope = scope->next; return NULL; } @@ -638,7 +662,7 @@ cf_find_symbol(const struct config *cfg, const byte *c) struct symbol * cf_get_symbol(const byte *c) { - return cf_find_symbol(new_config, c) ?: cf_new_symbol(c); + return cf_find_symbol_scope(conf_this_scope, c) ?: cf_new_symbol(c); } /** @@ -654,10 +678,10 @@ cf_localize_symbol(struct symbol *sym) /* If the symbol type is void, it has been recently allocated just in this scope. */ if (!sym->class) return sym; - + /* If the scope is the current, it is already defined in this scope. */ - if (sym->scope == conf_this_scope) - cf_error("Symbol already defined"); + if (cf_symbol_is_local(sym)) + cf_error("Symbol '%s' already defined", sym->name); /* Not allocated here yet, doing it now. */ return cf_new_symbol(sym->name); @@ -689,10 +713,7 @@ cf_lex_symbol(const char *data) struct symbol *sym = cf_get_symbol(data); cf_lval.s = sym; - if (sym->class != SYM_VOID) - return CF_SYM_KNOWN; - - /* Is it a keyword? */ + /* Is it a keyword? Prefer the keyword. */ struct keyword *k = HASH_FIND(kw_hash, KW, data); if (k) { @@ -705,9 +726,11 @@ cf_lex_symbol(const char *data) } } - /* OK, undefined symbol */ - cf_lval.s = sym; - return CF_SYM_UNDEFINED; + /* OK, only a symbol. */ + if (sym->class == SYM_VOID) + return CF_SYM_UNDEFINED; + else + return CF_SYM_KNOWN; } static void @@ -720,6 +743,34 @@ cf_lex_init_kh(void) HASH_INSERT(kw_hash, KW, k); } +void +ea_lex_register(struct ea_class *def) +{ + struct symbol *sym = cf_root_symbol(def->name); + sym->class = SYM_ATTRIBUTE; + sym->attribute = def; + def->sym = sym; +} + +void +ea_lex_unregister(struct ea_class *def) +{ + struct symbol *sym = def->sym; + HASH_REMOVE2(global_root_scope->hash, SYM, &root_pool, sym); + mb_free(sym); + def->sym = NULL; +} + +struct ea_class * +ea_class_find_by_name(const char *name) +{ + struct symbol *sym = cf_find_symbol(global_root_scope, name); + if (!sym || (sym->class != SYM_ATTRIBUTE)) + return NULL; + else + return sym->attribute; +} + /** * cf_lex_init - initialize the lexer * @is_cli: true if we're going to parse CLI command, false for configuration @@ -753,6 +804,11 @@ cf_lex_init(int is_cli, struct config *c) c->root_scope = cfg_allocz(sizeof(struct sym_scope)); conf_this_scope = c->root_scope; conf_this_scope->active = 1; + + if (is_cli) + conf_this_scope->next = config->root_scope; + else + conf_this_scope->next = global_root_scope; } /** @@ -787,6 +843,8 @@ cf_push_scope(struct symbol *sym) void cf_pop_scope(void) { + ASSERT(!conf_this_scope->soft_scopes); + conf_this_scope->active = 0; conf_this_scope = conf_this_scope->next; @@ -794,6 +852,52 @@ cf_pop_scope(void) } /** + * cf_push_soft_scope - enter new soft scope + * + * If we want to enter a new anonymous scope that most likely will not contain + * any symbols, we can use cf_push_soft_scope() insteas of cf_push_scope(). + * Such scope will be converted to a regular scope on first use. + */ +void +cf_push_soft_scope(void) +{ + if (conf_this_scope->soft_scopes < 0xfe) + conf_this_scope->soft_scopes++; + else + cf_push_scope(NULL); +} + +/** + * cf_pop_soft_scope - leave a soft scope + * + * Leave a soft scope entered by cf_push_soft_scope(). + */ +void +cf_pop_soft_scope(void) +{ + if (conf_this_scope->soft_scopes) + conf_this_scope->soft_scopes--; + else + cf_pop_scope(); +} + +/** + * cf_swap_soft_scope - convert soft scope to regular scope + * + * Soft scopes cannot hold symbols, so they must be converted to regular scopes + * on first use. It is done automatically by cf_new_symbol(). + */ +static inline void +cf_swap_soft_scope(void) +{ + if (conf_this_scope->soft_scopes) + { + conf_this_scope->soft_scopes--; + cf_push_scope(NULL); + } +} + +/** * cf_symbol_class_name - get name of a symbol class * @sym: symbol * diff --git a/conf/conf.c b/conf/conf.c index 58abcde1..17424402 100644 --- a/conf/conf.c +++ b/conf/conf.c @@ -46,7 +46,7 @@ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "lib/resource.h" @@ -140,6 +140,7 @@ config_parse(struct config *c) protos_preconfig(c); rt_preconfig(c); cf_parse(); + rt_postconfig(c); if (EMPTY_LIST(c->protos)) cf_error("No protocol is specified in the config file"); @@ -168,7 +169,6 @@ int cli_parse(struct config *c) { int done = 0; - c->fallback = config; new_config = c; cfg_mem = c->mem; if (setjmp(conf_jmpbuf)) @@ -179,7 +179,6 @@ cli_parse(struct config *c) done = 1; cleanup: - c->fallback = NULL; new_config = NULL; cfg_mem = NULL; return done; @@ -520,6 +519,8 @@ order_shutdown(int gr) memcpy(c, config, sizeof(struct config)); init_list(&c->protos); init_list(&c->tables); + init_list(&c->symbols); + memset(c->def_tables, 0, sizeof(c->def_tables)); c->shutdown = 1; c->gr_down = gr; diff --git a/conf/conf.h b/conf/conf.h index 4f6aa6eb..ffefa519 100644 --- a/conf/conf.h +++ b/conf/conf.h @@ -16,7 +16,6 @@ #include "lib/timer.h" /* Configuration structure */ - struct config { pool *pool; /* Pool the configuration is stored in */ linpool *mem; /* Linear pool containing configuration data */ @@ -28,7 +27,7 @@ struct config { int mrtdump_file; /* Configured MRTDump file (sysdep, fd in unix) */ const char *syslog_name; /* Name used for syslog (NULL -> no syslog) */ - struct rtable_config *def_tables[NET_MAX]; /* Default routing tables for each network */ + struct symbol *def_tables[NET_MAX]; /* Default routing tables for each network */ struct iface_patt *router_id_from; /* Configured list of router ID iface patterns */ u32 router_id; /* Our Router ID */ @@ -55,8 +54,7 @@ struct config { char *err_file_name; /* File name containing error */ char *file_name; /* Name of main configuration file */ int file_fd; /* File descriptor of main configuration file */ - HASH(struct symbol) sym_hash; /* Lexer: symbol hash table */ - struct config *fallback; /* Link to regular config for CLI parsing */ + struct sym_scope *root_scope; /* Scope for root symbols */ int obstacle_count; /* Number of items blocking freeing of this config */ int shutdown; /* This is a pseudo-config for daemon shutdown */ @@ -123,7 +121,7 @@ struct symbol { const struct f_line *function; /* For SYM_FUNCTION */ const struct filter *filter; /* For SYM_FILTER */ struct rtable_config *table; /* For SYM_TABLE */ - struct f_dynamic_attr *attribute; /* For SYM_ATTRIBUTE */ + struct ea_class *attribute; /* For SYM_ATTRIBUTE */ struct f_val *val; /* For SYM_CONSTANT */ uint offset; /* For SYM_VARIABLE */ }; @@ -134,10 +132,16 @@ struct symbol { struct sym_scope { struct sym_scope *next; /* Next on scope stack */ struct symbol *name; /* Name of this scope */ + + HASH(struct symbol) hash; /* Local symbol hash */ + uint slots; /* Variable slots */ - int active; /* Currently entered */ + byte active; /* Currently entered */ + byte soft_scopes; /* Number of soft scopes above */ }; +extern struct sym_scope *global_root_scope; + struct bytestring { size_t length; byte data[]; @@ -186,12 +190,22 @@ int cf_lex(void); void cf_lex_init(int is_cli, struct config *c); void cf_lex_unwind(void); -struct symbol *cf_find_symbol(const struct config *cfg, const byte *c); +struct symbol *cf_find_symbol_scope(const struct sym_scope *scope, const byte *c); +static inline struct symbol *cf_find_symbol_cfg(const struct config *cfg, const byte *c) +{ return cf_find_symbol_scope(cfg->root_scope, c); } + +#define cf_find_symbol(where, what) _Generic(*(where), \ + struct config: cf_find_symbol_cfg, \ + struct sym_scope: cf_find_symbol_scope \ + )((where), (what)) struct symbol *cf_get_symbol(const byte *c); struct symbol *cf_default_name(char *template, int *counter); struct symbol *cf_localize_symbol(struct symbol *sym); +static inline int cf_symbol_is_local(struct symbol *sym) +{ return (sym->scope == conf_this_scope) && !conf_this_scope->soft_scopes; } + /** * cf_define_symbol - define meaning of a symbol * @sym: symbol to be defined @@ -215,6 +229,9 @@ struct symbol *cf_localize_symbol(struct symbol *sym); void cf_push_scope(struct symbol *); void cf_pop_scope(void); +void cf_push_soft_scope(void); +void cf_pop_soft_scope(void); + char *cf_symbol_class_name(struct symbol *sym); /* Parser */ diff --git a/conf/confbase.Y b/conf/confbase.Y index 6985783b..8e5da9e3 100644 --- a/conf/confbase.Y +++ b/conf/confbase.Y @@ -14,11 +14,12 @@ CF_HDR #include "conf/conf.h" #include "lib/resource.h" #include "lib/socket.h" +#include "lib/settle.h" #include "lib/timer.h" #include "lib/string.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/bfd.h" #include "nest/cli.h" #include "filter/filter.h" @@ -71,10 +72,12 @@ CF_DECLS } xp; enum filter_return fret; enum ec_subtype ecs; - struct f_dynamic_attr fda; + struct ea_class *ea_class; struct f_static_attr fsa; + struct f_attr_bit fab; struct f_lval flv; struct f_line *fl; + struct f_arg *fa; const struct filter *f; struct f_tree *e; struct f_trie *trie; @@ -91,7 +94,8 @@ CF_DECLS struct proto_spec ps; struct channel_limit cl; struct timeformat *tf; - mpls_label_stack *mls; + struct settle_config settle; + struct adata *ad; struct bytestring *bs; } @@ -109,17 +113,19 @@ CF_DECLS %type <i> expr bool pxlen4 %type <time> expr_us time +%type <settle> settle %type <a> ipa -%type <net> net_ip4_ net_ip6_ net_ip6 net_ip_ net_ip net_or_ipa +%type <net> net_ip4_ net_ip4 net_ip6_ net_ip6 net_ip_ net_ip net_or_ipa %type <net_ptr> net_ net_any net_vpn4_ net_vpn6_ net_vpn_ net_roa4_ net_roa6_ net_roa_ net_ip6_sadr_ net_mpls_ -%type <mls> label_stack_start label_stack +%type <ad> label_stack_start label_stack %type <t> text opttext -%type <s> symbol +%type <s> symbol symbol_known toksym %nonassoc PREFIX_DUMMY %left AND OR %nonassoc '=' '<' '>' '~' GEQ LEQ NEQ NMA PO PC +%left '|' '&' %left '+' '-' %left '*' '/' '%' %left '!' @@ -151,16 +157,16 @@ conf: definition ; definition: DEFINE symbol '=' term ';' { - struct f_val *val = cfg_allocz(sizeof(struct f_val)); - if (f_eval(f_linearize($4), cfg_mem, val) > F_RETURN) cf_error("Runtime error"); - cf_define_symbol($2, SYM_CONSTANT | val->type, val, val); + struct f_val val; + if (f_eval(f_linearize($4, 1), &val) > F_RETURN) cf_error("Runtime error"); + cf_define_symbol($2, SYM_CONSTANT | val.type, val, lp_val_copy(cfg_mem, &val)); } ; expr: NUM - | '(' term ')' { $$ = f_eval_int(f_linearize($2)); } - | CF_SYM_KNOWN { + | '(' term ')' { $$ = f_eval_int(f_linearize($2, 1)); } + | symbol_known { if ($1->class != (SYM_CONSTANT | T_INT)) cf_error("Number constant expected"); $$ = SYM_VAL($1).i; } ; @@ -171,7 +177,9 @@ expr_us: | expr US { $$ = $1 US_; } ; -symbol: CF_SYM_UNDEFINED | CF_SYM_KNOWN ; +toksym: FROM | PREFERENCE ; +symbol: CF_SYM_UNDEFINED | CF_SYM_KNOWN | toksym ; +symbol_known: CF_SYM_KNOWN | toksym ; /* Switches */ @@ -303,6 +311,15 @@ net_: /* Networks - regular */ +net_ip4: + net_ip4_ + | CF_SYM_KNOWN { + if (($1->class != (SYM_CONSTANT | T_NET)) || (SYM_VAL($1).net->type != NET_IP4)) + cf_error("IPv4 network constant expected"); + $$ = * SYM_VAL($1).net; + } + ; + net_ip6: net_ip6_ | CF_SYM_KNOWN { @@ -347,17 +364,19 @@ net_or_ipa: label_stack_start: NUM { - $$ = cfg_allocz(sizeof(mpls_label_stack)); - $$->len = 1; - $$->stack[0] = $1; + $$ = cfg_allocz(ADATA_SIZE(MPLS_MAX_LABEL_STACK * sizeof(u32))); + $$->length = sizeof(u32); + *((u32 *)$$->data) = $1; }; label_stack: label_stack_start | label_stack '/' NUM { - if ($1->len >= MPLS_MAX_LABEL_STACK) + if ($1->length >= MPLS_MAX_LABEL_STACK * sizeof(u32)) cf_error("Too many labels in stack"); - $1->stack[$1->len++] = $3; + + *((u32 *)($$->data + $1->length)) = $3; + $1->length += sizeof(u32); $$ = $1; } ; @@ -370,6 +389,13 @@ time: } ; +/* Settle timer configuration */ +settle: expr_us expr_us { + if ($1 > $2) cf_error("Minimum settle time %t is bigger than maximum settle time %t", $1, $2); + $$.min = $1; + $$.max = $2; +}; + text: TEXT | CF_SYM_KNOWN { diff --git a/conf/flowspec.Y b/conf/flowspec.Y index 56a7c5dc..dbdbdda5 100644 --- a/conf/flowspec.Y +++ b/conf/flowspec.Y @@ -142,7 +142,7 @@ flow_frag_opts: ; flow4_item: - flow_srcdst net_ip { + flow_srcdst net_ip4 { flow_builder_set_type(this_flow, $1); flow_builder4_add_pfx(this_flow, (net_addr_ip4 *) &($2)); } diff --git a/conf/gen_keywords.m4 b/conf/gen_keywords.m4 index 0c1dc545..53226e4d 100644 --- a/conf/gen_keywords.m4 +++ b/conf/gen_keywords.m4 @@ -26,8 +26,7 @@ m4_define(CF_DEFINES, `m4_divert(-1)') m4_define(CF_handle_kw, `m4_divert(1){ "m4_translit($1,[[A-Z]],[[a-z]])", $1, NULL }, m4_divert(-1)') m4_define(CF_keywd, `m4_ifdef([[CF_tok_$1]],,[[m4_define([[CF_tok_$1]],1)CF_handle_kw($1)]])') -m4_define(CF_KEYWORDS, `m4_define([[CF_toks]],[[]])CF_iterate([[CF_keywd]], [[$@]])m4_ifelse(CF_toks,,,%token[[]]CF_toks -)DNL') +m4_define(CF_KEYWORDS, `CF_iterate([[CF_keywd]], [[$@]])DNL') # CLI commands generate keywords as well m4_define(CF_CLI, `CF_KEYWORDS(m4_translit($1, [[ ]], [[,]])) diff --git a/conf/gen_parser.m4 b/conf/gen_parser.m4 index 5b378a93..af4b1455 100644 --- a/conf/gen_parser.m4 +++ b/conf/gen_parser.m4 @@ -31,7 +31,7 @@ m4_define(CF_iterate, `m4_define([[CF_iter]], m4_defn([[$1]]))CF_itera($2)') # Keywords act as untyped %token m4_define(CF_keywd, `m4_ifdef([[CF_tok_$1]],,[[m4_define([[CF_tok_$1]],1)m4_define([[CF_toks]],CF_toks $1)]])') -m4_define(CF_KEYWORDS, `m4_define([[CF_toks]],[[]])CF_iterate([[CF_keywd]], [[$@]])m4_ifelse(CF_toks,,,%token[[]]CF_toks +m4_define(CF_KEYWORDS, `m4_define([[CF_toks]],[[]])CF_iterate([[CF_keywd]], [[$@]])m4_ifelse(CF_toks,,,%token<s>[[]]CF_toks )DNL') # CLI commands diff --git a/configure.ac b/configure.ac index 5c0cf002..f942ec1b 100644 --- a/configure.ac +++ b/configure.ac @@ -276,6 +276,12 @@ if test "$enable_libssh" != no ; then enable_libssh=no fi fi + + AC_CHECK_LIB([ssh], [ssh_session_is_known_server], [ssh_old_server_validation_api=no], [ssh_old_server_validation_api=yes]) + + if test "$ssh_old_server_validation_api" = yes; then + AC_DEFINE([HAVE_SSH_OLD_SERVER_VALIDATION_API], [1], [Define to 1 if ssh_session_is_known_server isn't defined]) + fi fi if test "$enable_mpls_kernel" != no ; then @@ -337,7 +343,7 @@ case $sysdesc in ;; esac -AC_CHECK_HEADERS_ONCE([alloca.h syslog.h]) +AC_CHECK_HEADERS_ONCE([alloca.h syslog.h stdatomic.h]) AC_CHECK_HEADER([sys/mman.h], [AC_DEFINE([HAVE_MMAP], [1], [Define to 1 if mmap() is available.])], have_mman=no) AC_CHECK_FUNC([aligned_alloc], [AC_DEFINE([HAVE_ALIGNED_ALLOC], [1], [Define to 1 if aligned_alloc() is available.])], have_aligned_alloc=no) AC_CHECK_MEMBERS([struct sockaddr.sa_len], [], [], [#include <sys/socket.h>]) @@ -353,6 +359,13 @@ AC_C_BIGENDIAN( [AC_MSG_ERROR([Cannot determine CPU endianity.])] ) +BIRD_CHECK_POINTER_ALIGNMENT +if test "$bird_cv_pointer_alignment" = "unknown" ; then + AC_MSG_ERROR([Couldn't determine pointer alignment]) +else + AC_DEFINE_UNQUOTED([CPU_POINTER_ALIGNMENT], [$bird_cv_pointer_alignment], [Pointer alignment for macro usage]) +fi + BIRD_CHECK_ANDROID_GLOB if test "$bird_cv_lib_glob" = no ; then AC_MSG_ERROR([glob.h not found.]) diff --git a/distro/pkg/rpm/bird.service b/distro/pkg/rpm/bird.service index fa203c78..26bcb8a0 100644 --- a/distro/pkg/rpm/bird.service +++ b/distro/pkg/rpm/bird.service @@ -5,8 +5,9 @@ After=network.target [Service] Type=simple +ExecStartPre=/usr/sbin/bird -p ExecStart=/usr/sbin/bird -f -u bird -g bird -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=/usr/sbin/birdc configure Restart=on-failure [Install] diff --git a/doc/bird.sgml b/doc/bird.sgml index d1a3b70f..0cfe19c4 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -14,7 +14,7 @@ configuration - something in config which is not keyword. (set-fill-column 80) - Copyright 1999,2000 Pavel Machek <pavel@ucw.cz>, distribute under GPL version 2 or later. + Copyright 1999 - 2022 CZ.NIC, z.s.p.o , distribute under GPL version 2 or later. --> @@ -23,7 +23,6 @@ configuration - something in config which is not keyword. <title>BIRD 2.0 User's Guide <author> Ondrej Filip <it/<feela@network.cz>/, -Pavel Machek <it/<pavel@ucw.cz>/, Martin Mares <it/<mj@ucw.cz>/, Maria Matejka <it/<mq@jmq.cz>/, Ondrej Zajicek <it/<santiago@crfreenet.org>/ @@ -145,13 +144,6 @@ BIRD executable by configuring out routing protocols you don't use, and <p>You can pass several command-line options to bird: <descrip> - <tag><label id="argv-block">-B <m/exp/</tag> - allocate memory using 2^<cf/exp/ byte sized blocks; - if you're expecting high memory load, raise this to - reduce number of allocated memory pages. For a million routes - in one table, the recommended setting is 18. - Default is your system page size, typically 12 for 4096 bytes. - <tag><label id="argv-config">-c <m/config name/</tag> use given configuration file instead of <it/prefix/<file>/etc/bird.conf</file>. @@ -259,16 +251,9 @@ The global best route selection algorithm is (roughly) as follows: </itemize> <p><label id="dsc-table-sorted">Usually, a routing table just chooses a selected -route from a list of entries for one network. But if the <cf/sorted/ option is -activated, these lists of entries are kept completely sorted (according to -preference or some protocol-dependent metric). This is needed for some features -of some protocols (e.g. <cf/secondary/ option of BGP protocol, which allows to -accept not just a selected route, but the first route (in the sorted list) that -is accepted by filters), but it is incompatible with some other features (e.g. -<cf/deterministic med/ option of BGP protocol, which activates a way of choosing -selected route that cannot be described using comparison and ordering). Minor -advantage is that routes are shown sorted in <cf/show route/, minor disadvantage -is that it is slightly more computationally expensive. +route from a list of entries for one network. Optionally, these lists of entries +are kept completely sorted (according to preference or some protocol-dependent +metric). See <ref id="rtable-sorted" name="sorted"> table option for details. <sect>Routes and network types <label id="routes"> @@ -520,6 +505,11 @@ include "tablename.conf";; See <ref id="channel-debug" name="debug"> in the channel section. Default: off. + <tag><label id="opt-debug-tables">debug tables all|off|{ states|routes|filters|events [, <m/.../] }</tag> + Set global defaults of table debugging options. + See <ref id="rtable-debug" name="debug"> in the table section. + Default: off. + <tag><label id="opt-debug-commands">debug commands <m/number/</tag> Control logging of client connections (0 for no logging, 1 for logging of connects and disconnects, 2 and higher for logging of all client @@ -635,18 +625,107 @@ include "tablename.conf";; <cf/protocol/ times, and the <cf/iso long ms/ format for <cf/base/ and <cf/log/ times. - <tag><label id="opt-table"><m/nettype/ table <m/name/ [sorted]</tag> - Create a new routing table. The default routing tables <cf/master4/ and - <cf/master6/ are created implicitly, other routing tables have to be - added by this command. Option <cf/sorted/ can be used to enable sorting - of routes, see <ref id="dsc-table-sorted" name="sorted table"> - description for details. + <tag><label id="opt-table"><m/nettype/ table <m/name/ [ { <m/option/; [<m/.../] } ]</tag> + Define a new routing table. The default routing tables <cf/master4/ and + <cf/master6/ are defined implicitly, other routing tables have to be + defined by this option. See the <ref id="rtable-opts" + name="routing table configuration section"> for routing table options. <tag><label id="opt-eval">eval <m/expr/</tag> Evaluates given filter expression. It is used by the developers for testing of filters. </descrip> +<sect>Routing table options +<label id="rtable-opts"> + +<p>Most routing tables do not need any options and are defined without an option +block, but there are still some options to tweak routing table behavior. Note +that implicit tables (<cf/master4/ and <cf/master6/) can be redefined in order +to set options. + +<descrip> + <tag><label id="rtable-sorted">sorted <m/switch/</tag> + Usually, a routing table just chooses the selected (best) route from a + list of routes for each network, while keeping remaining routes unsorted. + If enabled, these lists of routes are kept completely sorted (according + to preference or some protocol-dependent metric). + + This is needed for some protocol features (e.g. <cf/secondary/ option of + BGP protocol, which allows to accept not just a selected route, but the + first route (in the sorted list) that is accepted by filters), but it is + incompatible with some other features (e.g. <cf/deterministic med/ + option of BGP protocol, which activates a way of choosing selected route + that cannot be described using comparison and ordering). Minor advantage + is that routes are shown sorted in <cf/show route/, minor disadvantage + is that it is slightly more computationally expensive. Default: off. + + <tag><label id="rtable-trie">trie <m/switch/</tag> + BIRD routing tables are implemented with hash tables, which is efficient + for exact-match lookups, but inconvenient for longest-match lookups or + interval lookups (finding superprefix or subprefixes). This option + activates additional trie structure that is used to accelerate these + lookups, while using the hash table for exact-match lookups. + + This has advantage for <ref id="rpki" name="RPKI"> (on ROA tables), + for <ref id="bgp-gateway" name="recursive next-hops"> (on IGP tables), + and is required for <ref id="bgp-validate" name="flowspec validation"> + (on base IP tables). Another advantage is that interval results (like + from <cf/show route in .../ command) are lexicographically sorted. The + disadvantage is that trie-enabled routing tables require more memory, + which may be an issue especially in multi-table setups. Default: off. + + <tag><label id="rtable-gc-threshold">gc threshold <m/number/</tag> + Specify a minimum amount of removed networks that triggers a garbage + collection (GC) cycle. Default: 1000. + + <tag><label id="rtable-gc-period">gc period <m/time/</tag> + Specify a period of time between consecutive GC cycles. When there is a + significant amount of route withdraws, GC cycles are executed repeatedly + with given period time (with some random factor). When there is just + small amount of changes, GC cycles are not executed. In extensive route + server setups, running GC on hundreds of full BGP routing tables can + take significant amount of time, therefore they should use higher GC + periods. Default: adaptive, based on number of routing tables in the + configuration. From 10 s (with <= 25 routing tables) up to 600 s (with + >= 1500 routing tables). + + <tag><label id="rtable-cork-threshold">cork threshold <m/number/ <m/number/</tag> + Too many pending exports may lead to memory bloating. In such cases, + BIRD tries to relieve the memory pressure by pausing some routines until + the queue sizes get low enough. This option allows the user to set the + thresholds; first value is the low threshold (when to resume), the + second one is the high threshold (when to pause). The higher is the + threshold, the more memory can get used. In most cases, the defaults + should work for you. Default: 128, 512. + + <tag><label id="rtable-export-settle-time">export settle time <m/time/ <m/time/</tag> + Minimum and maximum settle times, respectively, for export announcements. + When multiple routes are changing, this mechanism waits for the changes + to settle before waking up sleeping export threads but if the changes are coming + steadily, BIRD isn't waiting forever; at most the maximum time. + Default values: <cf/1 ms 100 ms/. You have to always provide both values. + + <tag><label id="rtable-route-refresh-export-settle-time">route refresh export settle time <m/time/ <m/time/</tag> + Minimum and maximum settle times, respectively, for export announcements + (the same as above), valid when any channel is currently doing a route refresh. + This serves a purpose of even more aggresive change bundling, knowing that there + is some active process generating changes in a fast pace. If you don't want + this feature, set this to the same values as <ref id="rtable-export-settle-time" name="export settle time">. + Default values: <cf/100 ms 3 s/. + + <tag><label id="rtable-debug">debug all|off|{ states|routes|events [, <m/.../] }</tag> + Set table debugging options. Each table can write some trace messages + into log with category <cf/trace/. You can request <cf/all/ trace messages + or select some types: <cf/states/ for table state changes and auxiliary + processes, <cf/routes/ for auxiliary route notifications (next hop update, + flowspec revalidation) and <cf/events/ for more detailed auxiliary routine + debug. See also <ref id="channel-debug" name="channel debugging option">. + Default: off. + +</descrip> + + <sect>Protocol options <label id="protocol-opts"> @@ -874,10 +953,12 @@ inherited from templates can be updated by new definitions. <cf/none/ is for dropping all routes. Default: <cf/all/ (except for EBGP). - <tag><label id="proto-export">export <m/filter/</tag> + <tag><label id="proto-export">export [ in <m/prefix/ ] <m/filter/</tag> This is similar to the <cf>import</cf> keyword, except that it works in - the direction from the routing table to the protocol. Default: <cf/none/ - (except for EBGP). + the direction from the routing table to the protocol. If <cf/in/ keyword is used, + only routes inside the given prefix are exported. Other routes are completely + ignored (e.g. no logging and no statistics). + Default: <cf/none/ (except for EBGP). <tag><label id="proto-import-keep-filtered">import keep filtered <m/switch/</tag> Usually, if an import filter rejects a route, the route is forgotten. @@ -899,6 +980,16 @@ inherited from templates can be updated by new definitions. <ref id="bgp-export-table" name="export table"> (for respective direction). Default: on. + <tag><label id="rtable-min-settle-time">roa settle time <m/time/ <m/time/</tag> + Minimum and maximum settle times, respectively, for ROA table changes. + The automatic reload is triggered after the minimum time after the last + ROA table change has been received but not later than the maximum time after + first unprocessed ROA table change. Therefore with default values, the + automatic reload happens 1 second after the ROA table stops updating, yet if it + were to be later than 20 seconds after the ROA table starts updating, + the automatic reload is triggered anyway. Default values: <cf/1 s 20 s/. + You have to always provide both values. + <tag><label id="proto-import-limit">import limit [<m/number/ | off ] [action warn | block | restart | disable]</tag> Specify an import route limit (a maximum number of routes imported from the protocol) and optionally the action to be taken when the limit is @@ -1205,8 +1296,8 @@ this: <code> filter not_too_far -int var; { + int var; if defined( rip_metric ) then var = rip_metric; else { @@ -1235,9 +1326,9 @@ local variables. Recursion is not allowed. Function definitions look like this: <code> function name () -int local_variable; { - local_variable = 5; + int local_variable; + int another_variable = 5; } function with_parameters (int parameter) @@ -1246,16 +1337,19 @@ function with_parameters (int parameter) } </code> -<p>Unlike in C, variables are declared after the <cf/function/ line, but before -the first <cf/{/. You can't declare variables in nested blocks. Functions are -called like in C: <cf>name(); with_parameters(5);</cf>. Function may return -values using the <cf>return <m/[expr]/</cf> command. Returning a value exits -from current function (this is similar to C). +<p>Like in C programming language, variables are declared inside function body, +either at the beginning, or mixed with other statements. Declarations may +contain initialization. You can also declare variables in nested blocks, such +variables have scope restricted to such block. There is a deprecated syntax to +declare variables after the <cf/function/ line, but before the first <cf/{/. +Functions are called like in C: <cf>name(); with_parameters(5);</cf>. Function +may return values using the <cf>return <m/[expr]/</cf> command. Returning a +value exits from current function (this is similar to C). -<p>Filters are defined in a way similar to functions except they can't have +<p>Filters are defined in a way similar to functions except they cannot have explicit parameters. They get a route table entry as an implicit parameter, it is also passed automatically to any functions called. The filter must terminate -with either <cf/accept/ or <cf/reject/ statement. If there's a runtime error in +with either <cf/accept/ or <cf/reject/ statement. If there is a runtime error in filter, the route is rejected. <p>A nice trick to debug filters is to use <cf>show route filter <m/name/</cf> @@ -1299,6 +1393,9 @@ in the foot). The same syntax can also be used to construct a pair from two arbitrary integer expressions (for example <cf/(1+2,a)/). + Operators <cf/.asn/ and <cf/.data/ can be used to extract corresponding + components of a pair: <cf>(<m/asn/, <m/data/)</cf>. + <tag><label id="type-quad">quad</tag> This is a dotted quad of numbers used to represent router IDs (and others). Each component can have a value from 0 to 255. Literals of @@ -1389,6 +1486,10 @@ in the foot). pairs, LCs can be constructed using expressions for its parts, (e.g. <cf/(myas, 10+20, 3*10)/, where <cf/myas/ is an integer variable). + Operators <cf/.asn/, <cf/.data1/, and <cf/.data2/ can be used + to extract corresponding components of LCs: + <cf>(<m/asn/, <m/data1/, <m/data2/)</cf>. + <tag><label id="type-set">int|pair|quad|ip|prefix|ec|lc|enum set</tag> Filters recognize four types of sets. Sets are similar to strings: you can pass them around but you can't modify them. Literals of type <cf>int @@ -1532,7 +1633,7 @@ in the foot). Clist is similar to a set, except that unlike other sets, it can be modified. The type is used for community list (a set of pairs) and for cluster list (a set of quads). There exist no literals of this type. - There are three special operators on clists: + There are special operators on clists: <cf><m/C/.len</cf> returns the length of clist <m/C/. @@ -1559,6 +1660,15 @@ in the foot). <cf><m/C/.add(<m/P/);</cf> if <m/C/ is appropriate route attribute (for example <cf/bgp_community/). Similarly for <cf/delete/ and <cf/filter/. + <cf><m/C/.min</cf> returns the minimum element of clist <m/C/. + + <cf><m/C/.max</cf> returns the maximum element of clist <m/C/. + + Operators <cf/.min/, <cf/.max/ can be used together with <cf/filter/ + to extract the community from the specific subset of communities + (e.g. localpref or prepend) without the need to check every possible + value (e.g. <cf/filter(bgp_community, [(23456, 1000..1099)]).min/). + <tag><label id="type-eclist">eclist</tag> Eclist is a data type used for BGP extended community lists. Eclists are very similar to clists, but they are sets of ECs instead of pairs. @@ -1609,7 +1719,8 @@ prefix and an ASN as arguments. <sect>Control structures <label id="control-structures"> -<p>Filters support two control structures: conditions and case switches. +<p>Filters support several control structures: conditions, for loops and case +switches. <p>Syntax of a condition is: <cf>if <M>boolean expression</M> then <m/commandT/; else <m/commandF/;</cf> and you can use <cf>{ <m/command1/; <m/command2/; @@ -1617,6 +1728,14 @@ else <m/commandF/;</cf> and you can use <cf>{ <m/command1/; <m/command2/; omitted. If the <cf><m>boolean expression</m></cf> is true, <m/commandT/ is executed, otherwise <m/commandF/ is executed. +<p>For loops allow to iterate over elements in compound data like BGP paths or +community lists. The syntax is: <cf>for [ <m/type/ ] <m/variable/ in <m/expr/ +do <m/command/;</cf> and you can also use compound command like in conditions. +The expression is evaluated to a compound data, then for each element from such +data the command is executed with the item assigned to the variable. A variable +may be an existing one (when just name is used) or a locally defined (when type +and name is used). In both cases, it must have the same type as elements. + <p>The <cf>case</cf> is similar to case from Pascal. Syntax is <cf>case <m/expr/ { else: | <m/num_or_prefix [ .. num_or_prefix]/: <m/statement/ ; [ ... ] }</cf>. The expression after <cf>case</cf> can be of any type which can be @@ -1629,16 +1748,21 @@ neither of the <cf/:/ clauses, the statements after <cf/else:/ are executed. <p>Here is example that uses <cf/if/ and <cf/case/ structures: <code> +if 1234 = i then printn "."; else { + print "not 1234"; + print "You need {} around multiple commands"; +} + +for int asn in bgp_path do { + printn "ASN: ", asn; + if asn < 65536 then print " (2B)"; else print " (4B)"; +} + case arg1 { 2: print "two"; print "I can do more commands without {}"; 3 .. 5: print "three to five"; else: print "something else"; } - -if 1234 = i then printn "."; else { - print "not 1234"; - print "You need {} around multiple commands"; -} </code> @@ -1667,17 +1791,8 @@ Common route attributes are: primary key of the routing table. Read-only. (See the <ref id="routes" name="chapter about routes">.) - <tag><label id="rta-scope"><m/enum/ scope</tag> - The scope of the route. Possible values: <cf/SCOPE_HOST/ for routes - local to this host, <cf/SCOPE_LINK/ for those specific for a physical - link, <cf/SCOPE_SITE/ and <cf/SCOPE_ORGANIZATION/ for private routes and - <cf/SCOPE_UNIVERSE/ for globally visible routes. This attribute is not - interpreted by BIRD and can be used to mark routes in filters. The - default value for new routes is <cf/SCOPE_UNIVERSE/. - <tag><label id="rta-preference"><m/int/ preference</tag> - Preference of the route. Valid values are 0-65535. (See the chapter - about routing tables.) + Preference of the route. <tag><label id="rta-from"><m/ip/ from</tag> The router which the route has originated from. @@ -2097,6 +2212,13 @@ protocol bfd [<name>] { to configure separate BFD protocol instances for IPv4 and for IPv6 sessions. + <tag><label id="bfd-strict-bind">strict bind <m/switch/</tag> + Specify whether each BFD interface should use a separate listening + socket bound to its local address, or just use a shared listening socket + accepting all addresses. Binding to a specific address could be useful + in cases like running multiple BIRD instances on a machine, each + handling a different set of interfaces. Default: disabled. + <tag><label id="bfd-iface">interface <m/pattern/ [, <m/.../] { <m/options/ }</tag> Interface definitions allow to specify options for sessions associated with such interfaces and also may contain interface specific options. @@ -2281,6 +2403,8 @@ avoid routing loops. <item> <rfc id="8092"> - BGP Large Communities Attribute <item> <rfc id="8203"> - BGP Administrative Shutdown Communication <item> <rfc id="8212"> - Default EBGP Route Propagation Behavior without Policies +<item> <rfc id="9117"> - Revised Validation Procedure for BGP Flow Specifications +<item> <rfc id="9234"> - Route Leak Prevention and Detection Using Roles </itemize> <sect1>Route selection rules @@ -2403,6 +2527,12 @@ using the following configuration parameters: same address family and using the same local port) should have set <cf/strict bind/, or none of them. Default: disabled. + <tag><label id="bgp-free-bind">free bind <m/switch/</tag> + Use IP_FREEBIND socket option for the listening socket, which allows + binding to an IP address not (yet) assigned to an interface. Note that + all BGP instances that share a listening socket should have the same + value of the <cf/freebind/ option. Default: disabled. + <tag><label id="bgp-check-link">check link <M>switch</M></tag> BGP could use hardware link state into consideration. If enabled, BIRD tracks the link state of the associated interface and when link @@ -2614,13 +2744,6 @@ using the following configuration parameters: disabled. Default: on, with automatic fallback to off when received capability-related error. - <tag><label id="bgp-advertise-ipv4">advertise ipv4 <m/switch/</tag> - Advertise IPv4 multiprotocol capability. This is not a correct behavior - according to the strict interpretation of <rfc id="4760">, but it is - widespread and required by some BGP implementations (Cisco and Quagga). - This option is relevant to IPv4 mode with enabled capability - advertisement only. Default: on. - <tag><label id="bgp-advertise-hostname">advertise hostname <m/switch/</tag> Advertise hostname capability along with the hostname. Default: off. @@ -2666,7 +2789,7 @@ using the following configuration parameters: <tag><label id="bgp-error-wait-time">error wait time <m/number/,<m/number/</tag> Minimum and maximum delay in seconds between a protocol failure (either - local or reported by the peer) and automatic restart. Doesn't apply + local or reported by the peer) and automatic restart. Doesn not apply when <cf/disable after error/ is configured. If consecutive errors happen, the delay is increased exponentially until it reaches the maximum. Default: 60, 300. @@ -2722,6 +2845,29 @@ using the following configuration parameters: protocol itself (for example, if a route is received through eBGP and therefore does not have such attribute). Default: 100 (0 in pre-1.2.0 versions of BIRD). + + <tag><label id="bgp-local-role">local role <m/role-name/</tag> + BGP roles are a mechanism for route leak prevention and automatic route + filtering based on common BGP topology relationships. They are defined + in <rfc id="9234">. Instead of manually configuring filters and + communities, automatic filtering is done with the help of the OTC + attribute - a flag for routes that should be sent only to customers. + The same attribute is also used to automatically detect and filter route + leaks created by third parties. + + This option is valid for EBGP sessions, but it is not recommended to be + used within AS confederations (which would require manual filtering of + <cf/bgp_otc/ attribute on confederation boundaries). + + Possible <cf><m/role-name/</cf> values are: <cf/provider/, + <cf/rs_server/, <cf/rs_client/, <cf/customer/ and <cf/peer/. + Default: No local role assigned. + + <tag><label id="bgp-require-roles">require roles <m/switch/</tag> + If this option is set, the BGP roles must be defined on both sides, + otherwise the session will not be established. This behavior is defined + in <rfc id="9234"> as "strict mode" and is used to enforce corresponding + configuration at your conterpart side. Default: disabled. </descrip> <sect1>Channel configuration @@ -2844,6 +2990,31 @@ be used in explicit configuration. explicitly (to conserve memory). This option requires that the connected routing table is <ref id="dsc-table-sorted" name="sorted">. Default: off. + <tag><label id="bgp-validate">validate <m/switch/</tag> + Apply flowspec validation procedure as described in <rfc id="8955"> + section 6 and <rfc id="9117">. The Validation procedure enforces that + only routers in the forwarding path for a network can originate flowspec + rules for that network. The validation procedure should be used for EBGP + to prevent injection of malicious flowspec rules from outside, but it + should also be used for IBGP to ensure that selected flowspec rules are + consistent with selected IP routes. The validation procedure uses an IP + routing table (<ref id="bgp-base-table" name="base table">, see below) + against which flowspec rules are validated. This option is limited to + flowspec channels. Default: off (for compatibility reasons). + + Note that currently the flowspec validation does not work reliably + together with <ref id="bgp-import-table" name="import table"> option + enabled on flowspec channels. + + <tag><label id="bgp-base-table">base table <m/name/</tag> + Specifies an IP table used for the flowspec validation procedure. The + table must have enabled <cf/trie/ option, otherwise the validation + procedure would not work. The type of the table must be <cf/ipv4/ for + <cf/flow4/ channels and <cf/ipv6/ for <cf/flow6/ channels. This option + is limited to flowspec channels. Default: the main table of the + <cf/ipv4/ / <cf/ipv6/ channel of the same BGP instance, or the + <cf/master4/ / <cf/master6/ table if there is no such channel. + <tag><label id="bgp-extended-next-hop">extended next hop <m/switch/</tag> BGP expects that announced next hops have the same address family as associated network prefixes. This option provides an extension to use @@ -3004,6 +3175,11 @@ some of them (marked with `<tt/O/') are optional. This attribute contains accumulated IGP metric, which is a total distance to the destination through multiple autonomous systems. Currently, the attribute is not accessible from filters. + + <tag><label id="bgp-otc">int bgp_otc [O]</tag> + This attribute is defined in <rfc id="9234">. OTC is a flag that marks + routes that should be sent only to customers. If <ref id="bgp-role" + name="local Role"> is configured it set automatically. </descrip> <sect1>Example @@ -3240,6 +3416,12 @@ channels. allows to specify a limit on maximal number of nexthops in one route. By default, multipath merging is disabled. If enabled, default value of the limit is 16. + + <tag><label id="krt-netlink-rx-buffer">netlink rx buffer <m/number/</tag> (Linux) + Set kernel receive buffer size (in bytes) for the netlink socket. The default + value is OS-dependent (from the <file>/proc/sys/net/core/rmem_default</file> + file), If you get some "Kernel dropped some netlink message ..." warnings, + you may increase this value. </descrip> <sect1>Attributes @@ -5165,7 +5347,7 @@ Note that for negated matches, value must be either zero or equal to bitmask <cf>port 1..1023,1194,3306</cf>). <tag><label id="flow-dport">dport <m/numbers-match/</tag> - Set a mating destination port numbers (e.g. <cf>dport 49151</cf>). + Set a matching destination port numbers (e.g. <cf>dport 49151</cf>). <tag><label id="flow-sport">sport <m/numbers-match/</tag> Set a matching source port numbers (e.g. <cf>sport = 0</cf>). @@ -5356,15 +5538,15 @@ name="atrey.karlin.mff.cuni.cz:/pub/rfc">). </book> <!-- -LocalWords: GPL IPv GateD BGPv RIPv OSPFv Linux sgml html dvi sgmltools Pavel +LocalWords: GPL IPv GateD BGPv RIPv OSPFv Linux sgml html dvi sgmltools LocalWords: linuxdoc dtd descrip config conf syslog stderr auth ospf bgp Mbps LocalWords: router's eval expr num birdc ctl UNIX if's enums bool int ip GCC LocalWords: len ipaddress pxlen netmask enum bgppath bgpmask clist gw md eth -LocalWords: RTS printn quitbird iBGP AS'es eBGP RFC multiprotocol IGP Machek +LocalWords: RTS printn quitbird iBGP AS'es eBGP RFC multiprotocol IGP LocalWords: EGP misconfigurations keepalive pref aggr aggregator BIRD's RTC LocalWords: OS'es AS's multicast nolisten misconfigured UID blackhole MRTD MTU LocalWords: uninstalls ethernets IP binutils ANYCAST anycast dest RTD ICMP rfc LocalWords: compat multicasts nonbroadcast pointopoint loopback sym stats LocalWords: Perl SIGHUP dd mm yy HH MM SS EXT IA UNICAST multihop Discriminator txt -LocalWords: proto wildcard Ondrej Filip +LocalWords: proto wildcard --> diff --git a/doc/prog-head.sgml b/doc/prog-head.sgml index 0eec367e..4daca3a3 100644 --- a/doc/prog-head.sgml +++ b/doc/prog-head.sgml @@ -12,9 +12,9 @@ <title>BIRD Programmer's Documentation <author> Ondrej Filip <it/<feela@network.cz>/, -Pavel Machek <it/<pavel@ucw.cz>/, Martin Mares <it/<mj@ucw.cz>/, Ondrej Zajicek <it/<santiago@crfreenet.org>/ +Maria Matejka <it/<mq@jmq.cz>/, </author> <abstract> diff --git a/filter/config.Y b/filter/config.Y index 8034b790..5ba4f7e6 100644 --- a/filter/config.Y +++ b/filter/config.Y @@ -22,6 +22,46 @@ static inline u32 pair_b(u32 p) { return p & 0xFFFF; } #define f_generate_complex(fi_code, da, arg) \ f_new_inst(FI_EA_SET, f_new_inst(fi_code, f_new_inst(FI_EA_GET, da), arg), da) +#define f_generate_complex_sym(fi_code, sym, arg) ({ \ + if (sym->class != SYM_ATTRIBUTE) \ + cf_error("Can't empty %s: not an attribute", sym->name); \ + f_generate_complex(fi_code, sym->attribute, arg); \ +}) + +#define f_generate_complex_default(fi_code, da, arg, def) \ + f_new_inst(FI_EA_SET, f_new_inst(fi_code, f_new_inst(FI_DEFAULT, f_new_inst(FI_EA_GET, da), f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = def })), arg), da) + + +static int +f_new_var(struct sym_scope *s) +{ + /* + * - A variable is an offset on vstack from vbase. + * - Vbase is set on filter start / function call. + * - Scopes contain anonymous scopes (blocks) inside filter/function scope + * - Each scope knows number of vars in that scope + * - Offset is therefore a sum of 'slots' up to named scope + * - New variables are added on top of vstk, so intermediate values cannot + * be there during FI_VAR_INIT. I.e. no 'var' inside 'term'. + * - Also, each f_line must always have its scope, otherwise a variable may + * be defined but not initialized if relevant f_line is not executed. + */ + + int offset = s->slots++; + + while (!s->name) + { + s = s->next; + ASSERT(s); + offset += s->slots; + } + + if (offset >= 0xff) + cf_error("Too many variables, at most 255 allowed"); + + return offset; +} + /* * Sets and their items are during parsing handled as lists, linked * through left ptr. The first item in a list also contains a pointer @@ -161,28 +201,32 @@ f_new_lc_item(u32 f1, u32 t1, u32 f2, u32 t2, u32 f3, u32 t3) } static inline struct f_inst * -f_generate_empty(struct f_dynamic_attr dyn) +f_generate_empty(const struct symbol *sym) { - struct f_val empty; + if (sym->class != SYM_ATTRIBUTE) + cf_error("Can't empty %s: not an attribute", sym->name); - switch (dyn.type & EAF_TYPE_MASK) { - case EAF_TYPE_AS_PATH: - empty = f_const_empty_path; - break; - case EAF_TYPE_INT_SET: - empty = f_const_empty_clist; - break; - case EAF_TYPE_EC_SET: - empty = f_const_empty_eclist; - break; - case EAF_TYPE_LC_SET: - empty = f_const_empty_lclist; - break; - default: - cf_error("Can't empty that attribute"); - } + const struct ea_class *def = sym->attribute; + const struct f_val *empty = f_get_empty(def->type); + if (!empty) + cf_error("Can't empty attribute %s", def->name); + + return f_new_inst(FI_EA_SET, f_new_inst(FI_CONSTANT, *empty), def); +} - return f_new_inst(FI_EA_SET, f_new_inst(FI_CONSTANT, empty), dyn); +static inline struct f_inst * +f_implicit_roa_check(struct rtable_config *tab) +{ + const struct ea_class *def = ea_class_find("bgp_path"); + if (!def) + cf_error("Fatal: Couldn't find BGP path attribute definition."); + + struct f_static_attr fsa = f_new_static_attr(T_NET, SA_NET, 1); + + return f_new_inst(FI_ROA_CHECK, + f_new_inst(FI_RTA_GET, fsa), + f_new_inst(FI_AS_PATH_LAST, f_new_inst(FI_EA_GET, def)), + tab); } /* @@ -262,7 +306,7 @@ assert_assign(struct f_lval *lval, struct f_inst *expr, const char *start, const checker = f_new_inst(FI_EQ, expr, getter); setter->next = checker; - + return assert_done(setter, start, end); } @@ -273,15 +317,17 @@ CF_KEYWORDS(FUNCTION, PRINT, PRINTN, UNSET, RETURN, INT, BOOL, IP, TYPE, PREFIX, RD, PAIR, QUAD, EC, LC, SET, STRING, BGPMASK, BGPPATH, CLIST, ECLIST, LCLIST, IF, THEN, ELSE, CASE, + FOR, IN, DO, TRUE, FALSE, RT, RO, UNKNOWN, GENERIC, - FROM, GW, NET, MASK, PROTO, SOURCE, SCOPE, DEST, IFNAME, IFINDEX, WEIGHT, GW_MPLS, - PREFERENCE, + FROM, GW, NET, MASK, PROTO, SCOPE, DEST, IFNAME, IFINDEX, WEIGHT, GW_MPLS, ROA_CHECK, ASN, SRC, DST, IS_V4, IS_V6, LEN, MAXLEN, + DATA, DATA1, DATA2, DEFINED, - ADD, DELETE, CONTAINS, RESET, - PREPEND, FIRST, LAST, LAST_NONAGGREGATED, MATCH, + ADD, DELETE, RESET, + PREPEND, FIRST, LAST, LAST_NONAGGREGATED, + MIN, MAX, EMPTY, FILTER, WHERE, EVAL, ATTRIBUTE, BT_ASSERT, BT_TEST_SUITE, BT_CHECK_ASSIGN, BT_TEST_SAME, FORMAT, STACKS) @@ -290,21 +336,23 @@ CF_KEYWORDS(FUNCTION, PRINT, PRINTN, UNSET, RETURN, %nonassoc ELSE %type <xp> cmds_int cmd_prep -%type <x> term block cmd cmds constant constructor print_list var_list function_call symbol_value bgp_path_expr bgp_path bgp_path_tail -%type <fda> dynamic_attr +%type <x> term cmd cmd_var cmds cmds_scoped constant constructor print_list var var_init var_list function_call symbol_value bgp_path_expr bgp_path bgp_path_tail %type <fsa> static_attr +%type <fab> attr_bit %type <f> filter where_filter %type <fl> filter_body function_body %type <flv> lvalue -%type <i> type function_args function_vars +%type <i> type function_vars +%type <fa> function_argsn function_args %type <ecs> ec_kind -%type <fret> break_command +%type <fret> break_command %type <i32> cnum %type <e> pair_item ec_item lc_item set_item switch_item set_items switch_items switch_body %type <trie> fprefix_set %type <v> set_atom switch_atom fipa %type <px> fprefix %type <t> get_cf_position +%type <s> for_var CF_GRAMMAR @@ -328,17 +376,24 @@ filter_def: conf: filter_eval ; filter_eval: - EVAL term { f_eval_int(f_linearize($2)); } + EVAL term { f_eval_int(f_linearize($2, 1)); } ; conf: custom_attr ; custom_attr: ATTRIBUTE type symbol ';' { - cf_define_symbol($3, SYM_ATTRIBUTE, attribute, ca_lookup(new_config->pool, $3->name, $2)->fda); + if (($3->class == SYM_ATTRIBUTE) && ($3->scope == new_config->root_scope)) + cf_error("Duplicate attribute %s definition", $3->name); + + cf_define_symbol($3, SYM_ATTRIBUTE, attribute, + ea_register_alloc(new_config->pool, (struct ea_class) { + .name = $3->name, + .type = $2, + })->class); }; conf: bt_test_suite ; bt_test_suite: - BT_TEST_SUITE '(' CF_SYM_KNOWN ',' text ')' { + BT_TEST_SUITE '(' symbol_known ',' text ')' { cf_assert_symbol($3, SYM_FUNCTION); struct f_bt_test_suite *t = cfg_allocz(sizeof(struct f_bt_test_suite)); t->fn = $3->function; @@ -351,7 +406,7 @@ bt_test_suite: conf: bt_test_same ; bt_test_same: - BT_TEST_SAME '(' CF_SYM_KNOWN ',' CF_SYM_KNOWN ',' NUM ')' { + BT_TEST_SAME '(' symbol_known ',' symbol_known ',' NUM ')' { cf_assert_symbol($3, SYM_FUNCTION); cf_assert_symbol($5, SYM_FUNCTION); struct f_bt_test_suite *t = cfg_allocz(sizeof(struct f_bt_test_suite)); @@ -403,25 +458,28 @@ type: ; function_argsn: - /* EMPTY */ + /* EMPTY */ { $$ = NULL; } | function_argsn type symbol ';' { if ($3->scope->slots >= 0xfe) cf_error("Too many declarations, at most 255 allowed"); - cf_define_symbol($3, SYM_VARIABLE | $2, offset, $3->scope->slots++); + $$ = cfg_alloc(sizeof(struct f_arg)); + $$->arg = cf_define_symbol($3, SYM_VARIABLE | $2, offset, sym_->scope->slots++); + $$->next = $1; } ; function_args: - '(' ')' { $$ = 0; } + '(' ')' { $$ = NULL; } | '(' function_argsn type symbol ')' { - cf_define_symbol($4, SYM_VARIABLE | $3, offset, $4->scope->slots++); - $$ = $4->scope->slots; + $$ = cfg_alloc(sizeof(struct f_arg)); + $$->arg = cf_define_symbol($4, SYM_VARIABLE | $3, offset, sym_->scope->slots++); + $$->next = $2; } ; function_vars: /* EMPTY */ { $$ = 0; } | function_vars type symbol ';' { - cf_define_symbol($3, SYM_VARIABLE | $2, offset, $3->scope->slots++); + cf_define_symbol($3, SYM_VARIABLE | $2, offset, f_new_var(sym_->scope)); $$ = $1 + 1; } ; @@ -429,7 +487,7 @@ function_vars: filter_body: function_body ; filter: - CF_SYM_KNOWN { + symbol_known { cf_assert_symbol($1, SYM_FILTER); $$ = $1->filter; } @@ -449,20 +507,35 @@ where_filter: function_body: function_vars '{' cmds '}' { - $$ = f_linearize($3); + $$ = f_linearize($3, 0); $$->vars = $1; } ; conf: function_def ; function_def: - FUNCTION symbol { DBG( "Beginning of function %s\n", $2->name ); + FUNCTION symbol { + DBG( "Beginning of function %s\n", $2->name ); $2 = cf_define_symbol($2, SYM_FUNCTION, function, NULL); cf_push_scope($2); - } function_args function_body { - DBG("Definition of function %s with %u args and %u local vars.\n", $2->name, $4, $5->vars); - $5->args = $4; - $2->function = $5; + } function_args { + /* Make dummy f_line for storing function prototype */ + struct f_line *dummy = cfg_allocz(sizeof(struct f_line)); + $2->function = dummy; + + /* Revert the args */ + while ($4) { + struct f_arg *tmp = $4; + $4 = $4->next; + + tmp->next = dummy->arg_list; + dummy->arg_list = tmp; + dummy->args++; + } + } function_body { + $6->args = $2->function->args; + $6->arg_list = $2->function->arg_list; + $2->function = $6; cf_pop_scope(); } ; @@ -473,7 +546,11 @@ cmds: /* EMPTY */ { $$ = NULL; } | cmds_int { $$ = $1.begin; } ; -cmd_prep: cmd { +cmds_scoped: { cf_push_soft_scope(); } cmds { cf_pop_soft_scope(); $$ = $2; } ; + +cmd_var: var | cmd ; + +cmd_prep: cmd_var { $$.begin = $$.end = $1; if ($1) while ($$.end->next) @@ -495,15 +572,6 @@ cmds_int: cmd_prep } ; -block: - cmd { - $$=$1; - } - | '{' cmds '}' { - $$=$2; - } - ; - /* * Complex types, their bison value is struct f_val */ @@ -527,10 +595,10 @@ set_atom: | VPN_RD { $$.type = T_RD; $$.val.ec = $1; } | ENUM { $$.type = pair_a($1); $$.val.i = pair_b($1); } | '(' term ')' { - if (f_eval(f_linearize($2), cfg_mem, &($$)) > F_RETURN) cf_error("Runtime error"); + if (f_eval(f_linearize($2, 1), &($$)) > F_RETURN) cf_error("Runtime error"); if (!f_valid_set_type($$.type)) cf_error("Set-incompatible type"); } - | CF_SYM_KNOWN { + | symbol_known { cf_assert_symbol($1, SYM_CONSTANT); if (!f_valid_set_type(SYM_TYPE($1))) cf_error("%s: set-incompatible type", $1->name); $$ = *$1->val; @@ -539,13 +607,13 @@ set_atom: switch_atom: NUM { $$.type = T_INT; $$.val.i = $1; } - | '(' term ')' { $$.type = T_INT; $$.val.i = f_eval_int(f_linearize($2)); } + | '(' term ')' { $$.type = T_INT; $$.val.i = f_eval_int(f_linearize($2, 1)); } | fipa { $$ = $1; } | ENUM { $$.type = pair_a($1); $$.val.i = pair_b($1); } ; cnum: - term { $$ = f_eval_int(f_linearize($1)); } + term { $$ = f_eval_int(f_linearize($1, 1)); } pair_item: '(' cnum ',' cnum ')' { $$ = f_new_pair_item($2, $2, $4, $4); } @@ -629,19 +697,19 @@ fprefix_set: ; switch_body: /* EMPTY */ { $$ = NULL; } - | switch_body switch_items ':' cmds { + | switch_body switch_items ':' cmds_scoped { /* Fill data fields */ struct f_tree *t; - struct f_line *line = f_linearize($4); + struct f_line *line = f_linearize($4, 0); for (t = $2; t; t = t->left) t->data = line; $$ = f_merge_items($1, $2); } - | switch_body ELSECOL cmds { + | switch_body ELSECOL cmds_scoped { struct f_tree *t = f_new_tree(); t->from.type = t->to.type = T_VOID; t->right = t; - t->data = f_linearize($3); + t->data = f_linearize($3, 0); $$ = f_merge_items($1, t); } ; @@ -658,6 +726,7 @@ bgp_path: bgp_path_tail: NUM bgp_path_tail { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_PATH_MASK_ITEM, .val.pmi = { .asn = $1, .kind = PM_ASN, }, }); $$->next = $2; } | NUM DDOT NUM bgp_path_tail { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_PATH_MASK_ITEM, .val.pmi = { .from = $1, .to = $3, .kind = PM_ASN_RANGE }, }); $$->next = $4; } + | '[' ']' bgp_path_tail { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_PATH_MASK_ITEM, .val.pmi = { .set = NULL, .kind = PM_ASN_SET }, }); $$->next = $3; } | '[' set_items ']' bgp_path_tail { if ($2->from.type != T_INT) cf_error("Only integer sets allowed in path mask"); $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_PATH_MASK_ITEM, .val.pmi = { .set = build_tree($2), .kind = PM_ASN_SET }, }); $$->next = $4; @@ -677,6 +746,7 @@ constant: | fipa { $$ = f_new_inst(FI_CONSTANT, $1); } | VPN_RD { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_RD, .val.ec = $1, }); } | net_ { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_NET, .val.net = $1, }); } + | '[' ']' { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_SET, .val.t = NULL, }); } | '[' set_items ']' { DBG( "We've got a set here..." ); $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_SET, .val.t = build_tree($2), }); @@ -700,31 +770,26 @@ var_list: /* EMPTY */ { $$ = NULL; } | var_list ',' term { $$ = $3; $$->next = $1; } function_call: - CF_SYM_KNOWN '(' var_list ')' { + symbol_known '(' var_list ')' + { if ($1->class != SYM_FUNCTION) cf_error("You can't call something which is not a function. Really."); - struct f_inst *fc = f_new_inst(FI_CALL, $1); - uint args = 0; + /* Revert the var_list */ + struct f_inst *args = NULL; while ($3) { - args++; - struct f_inst *tmp = $3->next; - $3->next = fc; + struct f_inst *tmp = $3; + $3 = $3->next; - fc = $3; - $3 = tmp; + tmp->next = args; + args = tmp; } - if (args != $1->function->args) - cf_error("Function call '%s' got %u arguments, need %u arguments.", - $1->name, args, $1->function->args); - - $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_VOID }); - $$->next = fc; + $$ = f_new_inst(FI_CALL, args, $1); } ; -symbol_value: CF_SYM_KNOWN +symbol_value: symbol_known { switch ($1->class) { case SYM_CONSTANT_RANGE: @@ -734,7 +799,7 @@ symbol_value: CF_SYM_KNOWN $$ = f_new_inst(FI_VAR_GET, $1); break; case SYM_ATTRIBUTE: - $$ = f_new_inst(FI_EA_GET, *$1->attribute); + $$ = f_new_inst(FI_EA_GET, $1->attribute); break; default: cf_error("Can't get value of symbol %s", $1->name); @@ -743,17 +808,13 @@ symbol_value: CF_SYM_KNOWN ; static_attr: - FROM { $$ = f_new_static_attr(T_IP, SA_FROM, 0); } - | GW { $$ = f_new_static_attr(T_IP, SA_GW, 0); } + GW { $$ = f_new_static_attr(T_IP, SA_GW, 0); } | NET { $$ = f_new_static_attr(T_NET, SA_NET, 1); } | PROTO { $$ = f_new_static_attr(T_STRING, SA_PROTO, 1); } - | SOURCE { $$ = f_new_static_attr(T_ENUM_RTS, SA_SOURCE, 1); } - | SCOPE { $$ = f_new_static_attr(T_ENUM_SCOPE, SA_SCOPE, 0); } | DEST { $$ = f_new_static_attr(T_ENUM_RTD, SA_DEST, 0); } | IFNAME { $$ = f_new_static_attr(T_STRING, SA_IFNAME, 0); } | IFINDEX { $$ = f_new_static_attr(T_INT, SA_IFINDEX, 1); } | WEIGHT { $$ = f_new_static_attr(T_INT, SA_WEIGHT, 0); } - | PREFERENCE { $$ = f_new_static_attr(T_INT, SA_PREF, 0); } | GW_MPLS { $$ = f_new_static_attr(T_INT, SA_GW_MPLS, 0); } ; @@ -763,6 +824,8 @@ term: | term '-' term { $$ = f_new_inst(FI_SUBTRACT, $1, $3); } | term '*' term { $$ = f_new_inst(FI_MULTIPLY, $1, $3); } | term '/' term { $$ = f_new_inst(FI_DIVIDE, $1, $3); } + | term '&' term { $$ = f_new_inst(FI_BITAND, $1, $3); } + | term '|' term { $$ = f_new_inst(FI_BITOR, $1, $3); } | term AND term { $$ = f_new_inst(FI_AND, $1, $3); } | term OR term { $$ = f_new_inst(FI_OR, $1, $3); } | term '=' term { $$ = f_new_inst(FI_EQ, $1, $3); } @@ -781,8 +844,10 @@ term: | constructor { $$ = $1; } | static_attr { $$ = f_new_inst(FI_RTA_GET, $1); } - - | dynamic_attr { $$ = f_new_inst(FI_EA_GET, $1); } + | attr_bit { + struct f_inst *c = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = (1U << $1.bit)}); + $$ = f_new_inst(FI_EQ, c, f_new_inst(FI_BITAND, f_new_inst(FI_EA_GET, $1.class), c)); + } | term '.' IS_V4 { $$ = f_new_inst(FI_IS_V4, $1); } | term '.' TYPE { $$ = f_new_inst(FI_TYPE, $1); } @@ -790,13 +855,18 @@ term: | term '.' RD { $$ = f_new_inst(FI_ROUTE_DISTINGUISHER, $1); } | term '.' LEN { $$ = f_new_inst(FI_LENGTH, $1); } | term '.' MAXLEN { $$ = f_new_inst(FI_ROA_MAXLEN, $1); } - | term '.' ASN { $$ = f_new_inst(FI_ROA_ASN, $1); } + | term '.' ASN { $$ = f_new_inst(FI_ASN, $1); } | term '.' SRC { $$ = f_new_inst(FI_NET_SRC, $1); } | term '.' DST { $$ = f_new_inst(FI_NET_DST, $1); } | term '.' MASK '(' term ')' { $$ = f_new_inst(FI_IP_MASK, $1, $5); } | term '.' FIRST { $$ = f_new_inst(FI_AS_PATH_FIRST, $1); } | term '.' LAST { $$ = f_new_inst(FI_AS_PATH_LAST, $1); } | term '.' LAST_NONAGGREGATED { $$ = f_new_inst(FI_AS_PATH_LAST_NAG, $1); } + | term '.' DATA { $$ = f_new_inst(FI_PAIR_DATA, $1); } + | term '.' DATA1 { $$ = f_new_inst(FI_LC_DATA1, $1); } + | term '.' DATA2 { $$ = f_new_inst(FI_LC_DATA2, $1); } + | term '.' MIN { $$ = f_new_inst(FI_MIN, $1); } + | term '.' MAX { $$ = f_new_inst(FI_MAX, $1); } /* Communities */ /* This causes one shift/reduce conflict @@ -815,13 +885,11 @@ term: | DELETE '(' term ',' term ')' { $$ = f_new_inst(FI_CLIST_DEL, $3, $5); } | FILTER '(' term ',' term ')' { $$ = f_new_inst(FI_CLIST_FILTER, $3, $5); } - | ROA_CHECK '(' rtable ')' { $$ = f_new_inst(FI_ROA_CHECK_IMPLICIT, $3); } - | ROA_CHECK '(' rtable ',' term ',' term ')' { $$ = f_new_inst(FI_ROA_CHECK_EXPLICIT, $5, $7, $3); } + | ROA_CHECK '(' rtable ')' { $$ = f_implicit_roa_check($3); } + | ROA_CHECK '(' rtable ',' term ',' term ')' { $$ = f_new_inst(FI_ROA_CHECK, $5, $7, $3); } | FORMAT '(' term ')' { $$ = f_new_inst(FI_FORMAT, $3); } -/* | term '.' LEN { $$->code = P('P','l'); } */ - | function_call ; @@ -841,20 +909,53 @@ print_list: /* EMPTY */ { $$ = NULL; } } ; +var_init: + /* empty */ { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { }); } + | '=' term { $$ = $2; } + ; + +var: + type symbol var_init ';' { + struct symbol *sym = cf_define_symbol($2, SYM_VARIABLE | $1, offset, f_new_var(sym_->scope)); + $$ = f_new_inst(FI_VAR_INIT, $3, sym); + } + +for_var: + type symbol { $$ = cf_define_symbol($2, SYM_VARIABLE | $1, offset, f_new_var(sym_->scope)); } + | CF_SYM_KNOWN { $$ = $1; cf_assert_symbol($1, SYM_VARIABLE); } + ; + cmd: - IF term THEN block { + '{' cmds_scoped '}' { + $$ = $2; + } + | IF term THEN cmd { $$ = f_new_inst(FI_CONDITION, $2, $4, NULL); } - | IF term THEN block ELSE block { + | IF term THEN cmd ELSE cmd { $$ = f_new_inst(FI_CONDITION, $2, $4, $6); } - | CF_SYM_KNOWN '=' term ';' { + | FOR { + /* Reserve space for walk data on stack */ + cf_push_scope(NULL); + conf_this_scope->slots += 2; + } for_var IN + /* Parse term in the parent scope */ + { conf_this_scope->active = 0; } term { conf_this_scope->active = 1; } + DO cmd { + cf_pop_scope(); + $$ = f_new_inst(FI_FOR_INIT, $6, $3); + $$->next = f_new_inst(FI_FOR_NEXT, $3, $9); + } + | symbol_known '=' term ';' { switch ($1->class) { case SYM_VARIABLE_RANGE: $$ = f_new_inst(FI_VAR_SET, $3, $1); break; case SYM_ATTRIBUTE: - $$ = f_new_inst(FI_EA_SET, $3, *$1->attribute); + if ($1->attribute->readonly) + cf_error("Attribute %s is read-only", $1->attribute->name); + $$ = f_new_inst(FI_EA_SET, $3, $1->attribute); break; default: cf_error("Can't assign to symbol %s", $1->name); @@ -864,16 +965,25 @@ cmd: DBG( "Ook, we'll return the value\n" ); $$ = f_new_inst(FI_RETURN, $2); } - | dynamic_attr '=' term ';' { - $$ = f_new_inst(FI_EA_SET, $3, $1); - } | static_attr '=' term ';' { if ($1.readonly) cf_error( "This static attribute is read-only."); $$ = f_new_inst(FI_RTA_SET, $3, $1); } - | UNSET '(' dynamic_attr ')' ';' { - $$ = f_new_inst(FI_EA_UNSET, $3); + | UNSET '(' symbol_known ')' ';' { + if ($3->class != SYM_ATTRIBUTE) + cf_error("Can't unset %s", $3->name); + if ($3->attribute->readonly) + cf_error("Attribute %s is read-only", $3->attribute->name); + $$ = f_new_inst(FI_EA_UNSET, $3->attribute); + } + | attr_bit '=' term ';' { + $$ = f_new_inst(FI_CONDITION, $3, + f_generate_complex_default(FI_BITOR, $1.class, + f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = (1U << $1.bit)}), 0), + f_generate_complex_default(FI_BITAND, $1.class, + f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = ~(1U << $1.bit)}), 0) + ); } | break_command print_list ';' { struct f_inst *breaker = f_new_inst(FI_DIE, $1); @@ -893,16 +1003,16 @@ cmd: | PRINTN print_list ';' { $$ = f_new_inst(FI_PRINT, $2); } - | function_call ';' { $$ = f_new_inst(FI_DROP_RESULT, $1); } + | function_call ';' { $$ = f_new_inst(FI_DROP_RESULT, $1); } | CASE term '{' switch_body '}' { $$ = f_new_inst(FI_SWITCH, $2, build_tree($4)); } - | dynamic_attr '.' EMPTY ';' { $$ = f_generate_empty($1); } - | dynamic_attr '.' PREPEND '(' term ')' ';' { $$ = f_generate_complex( FI_PATH_PREPEND, $1, $5 ); } - | dynamic_attr '.' ADD '(' term ')' ';' { $$ = f_generate_complex( FI_CLIST_ADD, $1, $5 ); } - | dynamic_attr '.' DELETE '(' term ')' ';' { $$ = f_generate_complex( FI_CLIST_DEL, $1, $5 ); } - | dynamic_attr '.' FILTER '(' term ')' ';' { $$ = f_generate_complex( FI_CLIST_FILTER, $1, $5 ); } + | symbol_known '.' EMPTY ';' { $$ = f_generate_empty($1); } + | symbol_known '.' PREPEND '(' term ')' ';' { $$ = f_generate_complex_sym( FI_PATH_PREPEND, $1, $5 ); } + | symbol_known '.' ADD '(' term ')' ';' { $$ = f_generate_complex_sym( FI_CLIST_ADD, $1, $5 ); } + | symbol_known '.' DELETE '(' term ')' ';' { $$ = f_generate_complex_sym( FI_CLIST_DEL, $1, $5 ); } + | symbol_known '.' FILTER '(' term ')' ';' { $$ = f_generate_complex_sym( FI_CLIST_FILTER, $1, $5 ); } | BT_ASSERT '(' get_cf_position term get_cf_position ')' ';' { $$ = assert_done($4, $3 + 1, $5 - 1); } | BT_CHECK_ASSIGN '(' get_cf_position lvalue get_cf_position ',' term ')' ';' { $$ = assert_assign(&$4, $7, $3 + 1, $5 - 1); } ; @@ -913,8 +1023,17 @@ get_cf_position: }; lvalue: - CF_SYM_KNOWN { cf_assert_symbol($1, SYM_VARIABLE); $$ = (struct f_lval) { .type = F_LVAL_VARIABLE, .sym = $1 }; } + symbol_known { + switch ($1->class) { + case SYM_VARIABLE_RANGE: + $$ = (struct f_lval) { .type = F_LVAL_VARIABLE, .sym = $1 }; + break; + case SYM_ATTRIBUTE: + $$ = (struct f_lval) { .type = F_LVAL_EA, .da = $1->attribute }; + break; + } + } | static_attr { $$ = (struct f_lval) { .type = F_LVAL_SA, .sa = $1 }; } - | dynamic_attr { $$ = (struct f_lval) { .type = F_LVAL_EA, .da = $1 }; }; + ; CF_END diff --git a/filter/data.c b/filter/data.c index 7c33d2cb..d26b07f5 100644 --- a/filter/data.c +++ b/filter/data.c @@ -16,10 +16,10 @@ #include "lib/unaligned.h" #include "lib/net.h" #include "lib/ip.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/attrs.h" +#include "lib/attrs.h" #include "conf/conf.h" #include "filter/filter.h" #include "filter/f-inst.h" @@ -27,6 +27,8 @@ static const char * const f_type_str[] = { [T_VOID] = "void", + [T_OPAQUE] = "opaque byte string", + [T_IFACE] = "interface", [T_INT] = "int", [T_BOOL] = "bool", @@ -36,7 +38,6 @@ static const char * const f_type_str[] = { [T_ENUM_RTS] = "enum rts", [T_ENUM_BGP_ORIGIN] = "enum bgp_origin", [T_ENUM_SCOPE] = "enum scope", - [T_ENUM_RTC] = "enum rtc", [T_ENUM_RTD] = "enum rtd", [T_ENUM_ROA] = "enum roa", [T_ENUM_NETTYPE] = "enum nettype", @@ -47,6 +48,7 @@ static const char * const f_type_str[] = { [T_NET] = "prefix", [T_STRING] = "string", [T_PATH_MASK] = "bgpmask", + [T_PATH_MASK_ITEM] = "bgpmask item", [T_PATH] = "bgppath", [T_CLIST] = "clist", [T_EC] = "ec", @@ -54,20 +56,31 @@ static const char * const f_type_str[] = { [T_LC] = "lc", [T_LCLIST] = "lclist", [T_RD] = "rd", + + [T_SET] = "set", + [T_PREFIX_SET] = "prefix set", }; const char * -f_type_name(enum f_type t) +f_type_name(btype t) { - if (t < ARRAY_SIZE(f_type_str)) - return f_type_str[t] ?: "?"; - - if ((t == T_SET) || (t == T_PREFIX_SET)) - return "set"; + return (t < ARRAY_SIZE(f_type_str)) ? (f_type_str[t] ?: "?") : "?"; +} - return "?"; +btype +f_type_element_type(btype t) +{ + switch(t) { + case T_PATH: return T_INT; + case T_CLIST: return T_PAIR; + case T_ECLIST: return T_EC; + case T_LCLIST: return T_LC; + default: return T_VOID; + }; } +const struct f_trie f_const_empty_trie = { .ipv4 = -1, }; + const struct f_val f_const_empty_path = { .type = T_PATH, .val.ad = &null_adata, @@ -80,16 +93,11 @@ const struct f_val f_const_empty_path = { }, f_const_empty_lclist = { .type = T_LCLIST, .val.ad = &null_adata, +}, f_const_empty_prefix_set = { + .type = T_PREFIX_SET, + .val.ti = &f_const_empty_trie, }; -static struct adata * -adata_empty(struct linpool *pool, int l) -{ - struct adata *res = lp_alloc(pool, sizeof(struct adata) + l); - res->length = l; - return res; -} - static void pm_format(const struct f_path_mask *p, buffer *buf) { @@ -176,7 +184,7 @@ val_compare(const struct f_val *v1, const struct f_val *v2) if (val_is_ip4(v1) && (v2->type == T_QUAD)) return uint_cmp(ipa_to_u32(v1->val.ip), v2->val.i); - debug( "Types do not match in val_compare\n" ); + DBG( "Types do not match in val_compare\n" ); return F_CMP_ERROR; } @@ -290,6 +298,12 @@ val_same(const struct f_val *v1, const struct f_val *v2) int clist_set_type(const struct f_tree *set, struct f_val *v) { + if (!set) + { + v->type = T_VOID; + return 1; + } + switch (set->from.type) { case T_PAIR: @@ -412,7 +426,7 @@ clist_filter(struct linpool *pool, const struct adata *list, const struct f_val if (nl == list->length) return list; - struct adata *res = adata_empty(pool, nl); + struct adata *res = lp_alloc_adata(pool, nl); memcpy(res->data, tmp, nl); return res; } @@ -446,7 +460,7 @@ eclist_filter(struct linpool *pool, const struct adata *list, const struct f_val if (nl == list->length) return list; - struct adata *res = adata_empty(pool, nl); + struct adata *res = lp_alloc_adata(pool, nl); memcpy(res->data, tmp, nl); return res; } @@ -478,7 +492,7 @@ lclist_filter(struct linpool *pool, const struct adata *list, const struct f_val if (nl == list->length) return list; - struct adata *res = adata_empty(pool, nl); + struct adata *res = lp_alloc_adata(pool, nl); memcpy(res->data, tmp, nl); return res; } @@ -526,6 +540,9 @@ val_in_range(const struct f_val *v1, const struct f_val *v2) if (v2->type != T_SET) return F_CMP_ERROR; + if (!v2->val.t) + return 0; + /* With integrated Quad<->IP implicit conversion */ if ((v1->type == v2->val.t->from.type) || ((v1->type == T_QUAD) && val_is_ip4(&(v2->val.t->from)) && val_is_ip4(&(v2->val.t->to)))) @@ -599,3 +616,75 @@ val_dump(const struct f_val *v) { return val_dump_buffer; } + +struct f_val * +lp_val_copy(struct linpool *lp, const struct f_val *v) +{ + switch (v->type) + { + case T_VOID: + case T_BOOL: + case T_INT: + case T_IP: + case T_PAIR: + case T_QUAD: + case T_EC: + case T_LC: + case T_RD: + case T_ENUM: + case T_PATH_MASK_ITEM: + /* These aren't embedded but there is no need to copy them */ + case T_SET: + case T_PREFIX_SET: + case T_PATH_MASK: + case T_IFACE: + { + struct f_val *out = lp_alloc(lp, sizeof(*out)); + *out = *v; + return out; + } + + case T_NET: + { + struct { + struct f_val val; + net_addr net[0]; + } *out = lp_alloc(lp, sizeof(*out) + v->val.net->length); + out->val = *v; + out->val.val.net = out->net; + net_copy(out->net, v->val.net); + return &out->val; + } + + case T_STRING: + { + uint len = strlen(v->val.s); + struct { + struct f_val val; + char buf[0]; + } *out = lp_alloc(lp, sizeof(*out) + len + 1); + out->val = *v; + out->val.val.s = out->buf; + memcpy(out->buf, v->val.s, len+1); + return &out->val; + } + + case T_PATH: + case T_CLIST: + case T_ECLIST: + case T_LCLIST: + { + struct { + struct f_val val; + struct adata ad; + } *out = lp_alloc(lp, sizeof(*out) + v->val.ad->length); + out->val = *v; + out->val.val.ad = &out->ad; + memcpy(&out->ad, v->val.ad, v->val.ad->length); + return &out->val; + } + + default: + bug("Unknown type in value copy: %d", v->type); + } +} diff --git a/filter/data.h b/filter/data.h index 45246f9f..c1e7c736 100644 --- a/filter/data.h +++ b/filter/data.h @@ -11,110 +11,37 @@ #define _BIRD_FILTER_DATA_H_ #include "nest/bird.h" - -/* Type numbers must be in 0..0xff range */ -#define T_MASK 0xff - -/* Internal types */ -enum f_type { -/* Nothing. Simply nothing. */ - T_VOID = 0, - -/* User visible types, which fit in int */ - T_INT = 0x10, - T_BOOL = 0x11, - T_PAIR = 0x12, /* Notice that pair is stored as integer: first << 16 | second */ - T_QUAD = 0x13, - -/* Put enumerational types in 0x30..0x3f range */ - T_ENUM_LO = 0x30, - T_ENUM_HI = 0x3f, - - T_ENUM_RTS = 0x30, - T_ENUM_BGP_ORIGIN = 0x31, - T_ENUM_SCOPE = 0x32, - T_ENUM_RTC = 0x33, - T_ENUM_RTD = 0x34, - T_ENUM_ROA = 0x35, - T_ENUM_NETTYPE = 0x36, - T_ENUM_RA_PREFERENCE = 0x37, - T_ENUM_AF = 0x38, - -/* new enums go here */ - T_ENUM_EMPTY = 0x3f, /* Special hack for atomic_aggr */ - -#define T_ENUM T_ENUM_LO ... T_ENUM_HI - -/* Bigger ones */ - T_IP = 0x20, - T_NET = 0x21, - T_STRING = 0x22, - T_PATH_MASK = 0x23, /* mask for BGP path */ - T_PATH = 0x24, /* BGP path */ - T_CLIST = 0x25, /* Community list */ - T_EC = 0x26, /* Extended community value, u64 */ - T_ECLIST = 0x27, /* Extended community list */ - T_LC = 0x28, /* Large community value, lcomm */ - T_LCLIST = 0x29, /* Large community list */ - T_RD = 0x2a, /* Route distinguisher for VPN addresses */ - T_PATH_MASK_ITEM = 0x2b, /* Path mask item for path mask constructors */ - - T_SET = 0x80, - T_PREFIX_SET = 0x81, -} PACKED; +#include "lib/type.h" /* Filter value; size of this affects filter memory consumption */ struct f_val { - enum f_type type; /* T_* */ - union { - uint i; - u64 ec; - lcomm lc; - ip_addr ip; - const net_addr *net; - const char *s; - const struct f_tree *t; - const struct f_trie *ti; - const struct adata *ad; - const struct f_path_mask *path_mask; - struct f_path_mask_item pmi; - } val; + btype type; /* T_* */ + union bval_long val; }; -/* Dynamic attribute definition (eattrs) */ -struct f_dynamic_attr { - u8 type; /* EA type (EAF_*) */ - u8 bit; /* For bitfield accessors */ - enum f_type f_type; /* Filter type */ - uint ea_code; /* EA code */ -}; +#define fputip(a) ({ ip_addr *ax = falloc(sizeof(*ax)); *ax = (a); ax; }) enum f_sa_code { - SA_FROM = 1, - SA_GW, + SA_GW = 1, SA_NET, SA_PROTO, - SA_SOURCE, - SA_SCOPE, SA_DEST, SA_IFNAME, SA_IFINDEX, SA_WEIGHT, - SA_PREF, SA_GW_MPLS, } PACKED; /* Static attribute definition (members of struct rta) */ struct f_static_attr { - enum f_type f_type; /* Filter type */ + btype type; /* Data type */ enum f_sa_code sa_code; /* Static attribute id */ - int readonly:1; /* Don't allow writing */ + int readonly:1; /* Don't allow writing */ }; /* Filter l-value type */ enum f_lval_type { F_LVAL_VARIABLE, - F_LVAL_PREFERENCE, F_LVAL_SA, F_LVAL_EA, }; @@ -124,7 +51,7 @@ struct f_lval { enum f_lval_type type; union { struct symbol *sym; - struct f_dynamic_attr da; + const struct ea_class *da; struct f_static_attr sa; }; }; @@ -141,18 +68,23 @@ struct f_tree { void *data; }; +#define TRIE_STEP 4 +#define TRIE_STACK_LENGTH 33 + struct f_trie_node4 { ip4_addr addr, mask, accept; - uint plen; - struct f_trie_node4 *c[2]; + u16 plen; + u16 local; + struct f_trie_node4 *c[1 << TRIE_STEP]; }; struct f_trie_node6 { ip6_addr addr, mask, accept; - uint plen; - struct f_trie_node6 *c[2]; + u16 plen; + u16 local; + struct f_trie_node6 *c[1 << TRIE_STEP]; }; struct f_trie_node @@ -169,9 +101,20 @@ struct f_trie u8 zero; s8 ipv4; /* -1 for undefined / empty */ u16 data_size; /* Additional data for each trie node */ + u32 prefix_count; /* Works only for restricted tries (pxlen == l == h) */ struct f_trie_node root; /* Root trie node */ }; +struct f_trie_walk_state +{ + u8 ipv4; + u8 accept_length; /* Current inter-node prefix position */ + u8 start_pos; /* Initial prefix position in stack[0] */ + u8 local_pos; /* Current intra-node prefix position */ + u8 stack_pos; /* Current node in stack below */ + const struct f_trie_node *stack[TRIE_STACK_LENGTH]; +}; + struct f_tree *f_new_tree(void); struct f_tree *build_tree(struct f_tree *); const struct f_tree *find_tree(const struct f_tree *t, const struct f_val *val); @@ -182,12 +125,75 @@ void tree_walk(const struct f_tree *t, void (*hook)(const struct f_tree *, void struct f_trie *f_new_trie(linpool *lp, uint data_size); void *trie_add_prefix(struct f_trie *t, const net_addr *n, uint l, uint h); int trie_match_net(const struct f_trie *t, const net_addr *n); +int trie_match_longest_ip4(const struct f_trie *t, const net_addr_ip4 *net, net_addr_ip4 *dst, ip4_addr *found0); +int trie_match_longest_ip6(const struct f_trie *t, const net_addr_ip6 *net, net_addr_ip6 *dst, ip6_addr *found0); +void trie_walk_init(struct f_trie_walk_state *s, const struct f_trie *t, const net_addr *from); +int trie_walk_next(struct f_trie_walk_state *s, net_addr *net); int trie_same(const struct f_trie *t1, const struct f_trie *t2); void trie_format(const struct f_trie *t, buffer *buf); +static inline int +trie_match_next_longest_ip4(net_addr_ip4 *n, ip4_addr *found) +{ + while (n->pxlen) + { + n->pxlen--; + ip4_clrbit(&n->prefix, n->pxlen); + + if (ip4_getbit(*found, n->pxlen)) + return 1; + } + + return 0; +} + +static inline int +trie_match_next_longest_ip6(net_addr_ip6 *n, ip6_addr *found) +{ + while (n->pxlen) + { + n->pxlen--; + ip6_clrbit(&n->prefix, n->pxlen); + + if (ip6_getbit(*found, n->pxlen)) + return 1; + } + + return 0; +} + + +#define TRIE_WALK_TO_ROOT_IP4(trie, net, dst) ({ \ + net_addr_ip4 dst; \ + ip4_addr _found; \ + for (int _n = trie_match_longest_ip4(trie, net, &dst, &_found); \ + _n; \ + _n = trie_match_next_longest_ip4(&dst, &_found)) + +#define TRIE_WALK_TO_ROOT_IP6(trie, net, dst) ({ \ + net_addr_ip6 dst; \ + ip6_addr _found; \ + for (int _n = trie_match_longest_ip6(trie, net, &dst, &_found); \ + _n; \ + _n = trie_match_next_longest_ip6(&dst, &_found)) + +#define TRIE_WALK_TO_ROOT_END }) + + +#define TRIE_WALK(trie, net, from) ({ \ + net_addr net; \ + struct f_trie_walk_state tws_; \ + trie_walk_init(&tws_, trie, from); \ + while (trie_walk_next(&tws_, &net)) + +#define TRIE_WALK_END }) + + #define F_CMP_ERROR 999 -const char *f_type_name(enum f_type t); +const char *f_type_name(btype t); + +enum btype f_type_element_type(btype t); int val_same(const struct f_val *v1, const struct f_val *v2); int val_compare(const struct f_val *v1, const struct f_val *v2); @@ -195,15 +201,19 @@ void val_format(const struct f_val *v, buffer *buf); char *val_format_str(struct linpool *lp, const struct f_val *v); const char *val_dump(const struct f_val *v); +struct f_val *lp_val_copy(struct linpool *lp, const struct f_val *v); + static inline int val_is_ip4(const struct f_val *v) { return (v->type == T_IP) && ipa_is_ip4(v->val.ip); } int val_in_range(const struct f_val *v1, const struct f_val *v2); int clist_set_type(const struct f_tree *set, struct f_val *v); static inline int eclist_set_type(const struct f_tree *set) -{ return set->from.type == T_EC; } +{ return !set || set->from.type == T_EC; } static inline int lclist_set_type(const struct f_tree *set) -{ return set->from.type == T_LC; } +{ return !set || set->from.type == T_LC; } +static inline int path_set_type(const struct f_tree *set) +{ return !set || set->from.type == T_INT; } const struct adata *clist_filter(struct linpool *pool, const struct adata *list, const struct f_val *set, int pos); const struct adata *eclist_filter(struct linpool *pool, const struct adata *list, const struct f_val *set, int pos); @@ -219,8 +229,18 @@ undef_value(struct f_val v) (v.val.ad == &null_adata); } -extern const struct f_val f_const_empty_path, f_const_empty_clist, f_const_empty_eclist, f_const_empty_lclist; +extern const struct f_val f_const_empty_path, f_const_empty_clist, f_const_empty_eclist, f_const_empty_lclist, f_const_empty_prefix_set; +static inline const struct f_val *f_get_empty(btype t) +{ + switch (t) { + case T_PATH: return &f_const_empty_path; + case T_CLIST: return &f_const_empty_clist; + case T_ECLIST: return &f_const_empty_eclist; + case T_LCLIST: return &f_const_empty_lclist; + default: return NULL; + } +} -enum filter_return f_eval(const struct f_line *expr, struct linpool *tmp_pool, struct f_val *pres); +enum filter_return f_eval(const struct f_line *expr, struct f_val *pres); #endif diff --git a/filter/decl.m4 b/filter/decl.m4 index 44537aaa..e2472127 100644 --- a/filter/decl.m4 +++ b/filter/decl.m4 @@ -94,7 +94,7 @@ FID_DUMP_BODY()m4_dnl debug("%s" $4 "\n", INDENT, $5); ]]) FID_INTERPRET_EXEC()m4_dnl -const $1 $2 = whati->$2 +$1 $2 = whati->$2 FID_INTERPRET_BODY') # Instruction arguments are needed only until linearization is done. @@ -191,6 +191,12 @@ if (f$1->type && f$2->type && (f$1->type != f$2->type) && cf_error("Arguments $1 and $2 of %s must be of the same type", f_instruction_name(what->fi_code)); FID_INTERPRET_BODY()') +m4_define(ARG_PREFER_SAME_TYPE, ` +FID_NEW_BODY()m4_dnl +if (f$1->type && f$2->type && (f$1->type != f$2->type)) + (void) (f_const_promotion(f$2, f$1->type) || f_const_promotion(f$1, f$2->type)); +FID_INTERPRET_BODY()') + # Executing another filter line. This replaces the recursion # that was needed in the former implementation. m4_define(LINEX, `FID_INTERPRET_EXEC()LINEX_($1)FID_INTERPRET_NEW()return $1 FID_INTERPRET_BODY()') @@ -216,7 +222,7 @@ whati->f$1 = f$1; FID_DUMP_BODY()m4_dnl f_dump_line(item->fl$1, indent + 1); FID_LINEARIZE_BODY()m4_dnl -item->fl$1 = f_linearize(whati->f$1); +item->fl$1 = f_linearize(whati->f$1, $2); FID_SAME_BODY()m4_dnl if (!f_same(f1->fl$1, f2->fl$1)) return 0; FID_ITERATE_BODY()m4_dnl @@ -244,9 +250,13 @@ m4_define(ERROR, # This macro specifies result type and makes there are no conflicting definitions m4_define(RESULT_TYPE, `m4_ifdef([[INST_RESULT_TYPE]], - [[m4_ifelse(INST_RESULT_TYPE,$1,,[[ERROR([[Multiple type definitons]])]])]], + [[m4_ifelse(INST_RESULT_TYPE,$1,,[[ERROR([[Multiple type definitions in]] INST_NAME)]])]], [[m4_define(INST_RESULT_TYPE,$1) RESULT_TYPE_($1)]])') +m4_define(RESULT_TYPE_CHECK, + `m4_ifelse(INST_OUTVAL,0,, + [[m4_ifdef([[INST_RESULT_TYPE]],,[[ERROR([[Missing type definition in]] INST_NAME)]])]])') + m4_define(RESULT_TYPE_, ` FID_NEW_BODY()m4_dnl what->type = $1; @@ -256,7 +266,7 @@ FID_INTERPRET_BODY()') m4_define(SYMBOL, `FID_MEMBER(struct symbol *, sym, [[strcmp(f1->sym->name, f2->sym->name) || (f1->sym->class != f2->sym->class)]], "symbol %s", item->sym->name)') m4_define(RTC, `FID_MEMBER(struct rtable_config *, rtc, [[strcmp(f1->rtc->name, f2->rtc->name)]], "route table %s", item->rtc->name)') m4_define(STATIC_ATTR, `FID_MEMBER(struct f_static_attr, sa, f1->sa.sa_code != f2->sa.sa_code,,)') -m4_define(DYNAMIC_ATTR, `FID_MEMBER(struct f_dynamic_attr, da, f1->da.ea_code != f2->da.ea_code,,)') +m4_define(DYNAMIC_ATTR, `FID_MEMBER(const struct ea_class *, da, f1->da != f2->da,,)') m4_define(ACCESS_RTE, `FID_HIC(,[[do { if (!fs->rte) runtime("No route to access"); } while (0)]],NEVER_CONSTANT())') # 2) Code wrapping @@ -300,6 +310,7 @@ m4_define(FID_ITERATE, `FID_ZONE(10, Iteration)') # This macro does all the code wrapping. See inline comments. m4_define(INST_FLUSH, `m4_ifdef([[INST_NAME]], [[ +RESULT_TYPE_CHECK()m4_dnl Check for defined RESULT_TYPE() FID_ENUM()m4_dnl Contents of enum fi_code { ... } INST_NAME(), FID_ENUM_STR()m4_dnl Contents of const char * indexed by enum fi_code @@ -375,6 +386,7 @@ case INST_NAME(): { #undef whati #undef item dest->items[pos].fi_code = what->fi_code; + dest->items[pos].flags = what->flags; dest->items[pos].lineno = what->lineno; break; } @@ -402,6 +414,7 @@ m4_define(INST, `m4_dnl This macro is called on beginning of each instruction INST_FLUSH()m4_dnl First, old data is flushed m4_define([[INST_NAME]], [[$1]])m4_dnl Then we store instruction name, m4_define([[INST_INVAL]], [[$2]])m4_dnl instruction input value count, +m4_define([[INST_OUTVAL]], [[$3]])m4_dnl instruction output value count, m4_undefine([[INST_NEVER_CONSTANT]])m4_dnl reset NEVER_CONSTANT trigger, m4_undefine([[INST_RESULT_TYPE]])m4_dnl and reset RESULT_TYPE value. FID_INTERPRET_BODY()m4_dnl By default, every code is interpreter code. @@ -490,7 +503,7 @@ fi_constant(struct f_inst *what, struct f_val val) } static int -f_const_promotion(struct f_inst *arg, enum f_type want) +f_const_promotion(struct f_inst *arg, btype want) { if (arg->fi_code != FI_CONSTANT) return 0; @@ -505,6 +518,11 @@ f_const_promotion(struct f_inst *arg, enum f_type want) return 1; } + else if ((c->type == T_SET) && (!c->val.t) && (want == T_PREFIX_SET)) { + *c = f_const_empty_prefix_set; + return 1; + } + return 0; } @@ -560,7 +578,7 @@ FID_WR_PUT(8) } struct f_line * -f_linearize_concat(const struct f_inst * const inst[], uint count) +f_linearize_concat(const struct f_inst * const inst[], uint count, uint results) { uint len = 0; for (uint i=0; i<count; i++) @@ -572,6 +590,8 @@ f_linearize_concat(const struct f_inst * const inst[], uint count) for (uint i=0; i<count; i++) out->len = linearize(out, inst[i], out->len); + out->results = results; + #ifdef LOCAL_DEBUG f_dump_line(out, 0); #endif @@ -640,7 +660,8 @@ FID_WR_PUT(4)m4_dnl struct f_inst { struct f_inst *next; /* Next instruction */ enum f_instruction_code fi_code; /* Instruction code */ - enum f_type type; /* Type of returned value, if known */ + enum f_instruction_flags flags; /* Flags, instruction-specific */ + btype type; /* Type of returned value, if known */ int size; /* How many instructions are underneath */ int lineno; /* Line number */ union { diff --git a/filter/f-inst.c b/filter/f-inst.c index 0341a2f1..801eceec 100644 --- a/filter/f-inst.c +++ b/filter/f-inst.c @@ -62,14 +62,14 @@ * m4_dnl INST(FI_NOP, in, out) { enum value, input args, output args * m4_dnl ARG(num, type); argument, its id (in data fields) and type accessible by v1, v2, v3 * m4_dnl ARG_ANY(num); argument with no type check accessible by v1, v2, v3 + * m4_dnl ARG_TYPE(num, type); just declare the type of argument * m4_dnl VARARG; variable-length argument list; accessible by vv(i) and whati->varcount - * m4_dnl LINE(num, unused); this argument has to be converted to its own f_line + * m4_dnl LINE(num, out); this argument has to be converted to its own f_line * m4_dnl SYMBOL; symbol handed from config * m4_dnl STATIC_ATTR; static attribute definition * m4_dnl DYNAMIC_ATTR; dynamic attribute definition * m4_dnl RTC; route table config * m4_dnl ACCESS_RTE; this instruction needs route - * m4_dnl ACCESS_EATTRS; this instruction needs extended attributes * * m4_dnl FID_MEMBER( custom instruction member * m4_dnl C type, for storage in structs @@ -80,10 +80,17 @@ * m4_dnl ) * * m4_dnl RESULT(type, union-field, value); putting this on value stack + * m4_dnl RESULT_(type, union-field, value); like RESULT(), but do not declare the type * m4_dnl RESULT_VAL(value-struct); pass the struct f_val directly + * m4_dnl RESULT_TYPE(type); just declare the type of result value * m4_dnl RESULT_VOID; return undef * m4_dnl } * + * Note that runtime arguments m4_dnl (ARG*, VARARG) must be defined before + * parse-time arguments m4_dnl (LINE, SYMBOL, ...). During linearization, + * first ones move position in f_line by linearizing arguments first, while + * second ones store data to the current position. + * * Also note that the { ... } blocks are not respected by M4 at all. * If you get weird unmatched-brace-pair errors, check what it generated and why. * What is really considered as one instruction is not the { ... } block @@ -91,6 +98,24 @@ * * Other code is just copied into the interpreter part. * + * The filter language uses a simple type system, where values have types + * (constants T_*) and also terms (instructions) are statically typed. Our + * static typing is partial (some terms do not declare types of arguments + * or results), therefore it can detect most but not all type errors and + * therefore we still have runtime type checks. + * + * m4_dnl Types of arguments are declared by macros ARG() and ARG_TYPE(), + * m4_dnl types of results are declared by RESULT() and RESULT_TYPE(). + * m4_dnl Macros ARG_ANY(), RESULT_() and RESULT_VAL() do not declare types + * m4_dnl themselves, but can be combined with ARG_TYPE() / RESULT_TYPE(). + * + * m4_dnl Note that types should be declared only once. If there are + * m4_dnl multiple RESULT() macros in an instruction definition, they must + * m4_dnl use the exact same expression for type, or they should be replaced + * m4_dnl by multiple RESULT_() macros and a common RESULT_TYPE() macro. + * m4_dnl See e.g. FI_EA_GET or FI_MIN instructions. + * + * * If you are satisfied with this, you don't need to read the following * detailed description of what is really done with the instruction definitions. * @@ -211,11 +236,40 @@ * m4_dnl fpool -> the current linpool * m4_dnl NEVER_CONSTANT-> don't generate pre-interpretation code at all * m4_dnl ACCESS_RTE -> check that route is available, also NEVER_CONSTANT - * m4_dnl ACCESS_EATTRS -> pre-cache the eattrs; use only with ACCESS_RTE - * m4_dnl f_rta_cow(fs) -> function to call before any change to route should be done * * m4_dnl If you are stymied, see FI_CALL or FI_CONSTANT or just search for * m4_dnl the mentioned macros in this file to see what is happening there in wild. + * + * + * A note about soundness of the type system: + * + * A type system is sound when types of expressions are consistent with + * types of values resulting from evaluation of such expressions. Untyped + * expressions are ok, but badly typed expressions are not sound. So is + * the type system of BIRD filtering code sound? There are some points: + * + * All cases of (one) m4_dnl RESULT() macro are obviously ok, as the macro + * both declares a type and returns a value. One have to check instructions + * that use m4_dnl RESULT_TYPE() macro. There are two issues: + * + * FI_AND, FI_OR - second argument is statically checked to be T_BOOL and + * passed as result without dynamic typecheck, declared to be T_BOOL. If + * an untyped non-bool expression is used as a second argument, then + * the mismatched type is returned. + * + * FI_VAR_GET - soundness depends on consistency of declared symbol types + * and stored values. This is maintained when values are stored by + * FI_VAR_SET, but when they are stored by FI_CALL, only static checking is + * used, so when an untyped expression returning mismatched value is used + * as a function argument, then inconsistent value is stored and subsequent + * FI_VAR_GET would be unsound. + * + * Both of these issues are inconsequential, as mismatched values from + * unsound expressions will be caught by dynamic typechecks like mismatched + * values from untyped expressions. + * + * Also note that FI_CALL is the only expression without properly declared + * result type. */ /* Binary operators */ @@ -240,13 +294,23 @@ if (v2.val.i == 0) runtime( "Mother told me not to divide by 0" ); RESULT(T_INT, i, v1.val.i / v2.val.i); } + INST(FI_BITOR, 2, 1) { + ARG(1,T_INT); + ARG(2,T_INT); + RESULT(T_INT, i, v1.val.i | v2.val.i); + } + INST(FI_BITAND, 2, 1) { + ARG(1,T_INT); + ARG(2,T_INT); + RESULT(T_INT, i, v1.val.i & v2.val.i); + } INST(FI_AND, 1, 1) { ARG(1,T_BOOL); ARG_TYPE_STATIC(2,T_BOOL); RESULT_TYPE(T_BOOL); if (v1.val.i) - LINE(2,0); + LINE(2,1); else RESULT_VAL(v1); } @@ -256,7 +320,7 @@ RESULT_TYPE(T_BOOL); if (!v1.val.i) - LINE(2,0); + LINE(2,1); else RESULT_VAL(v1); } @@ -349,7 +413,7 @@ break; case T_SET: - if (vv(i).val.t->from.type != T_INT) + if (!path_set_type(vv(i).val.t)) runtime("Only integer sets allowed in path mask"); pm->item[i] = (struct f_path_mask_item) { @@ -371,12 +435,14 @@ INST(FI_NEQ, 2, 1) { ARG_ANY(1); ARG_ANY(2); + ARG_PREFER_SAME_TYPE(1, 2); RESULT(T_BOOL, i, !val_same(&v1, &v2)); } INST(FI_EQ, 2, 1) { ARG_ANY(1); ARG_ANY(2); + ARG_PREFER_SAME_TYPE(1, 2); RESULT(T_BOOL, i, val_same(&v1, &v2)); } @@ -447,6 +513,18 @@ RESULT(T_BOOL, i, ipa_is_ip4(v1.val.ip)); } + INST(FI_VAR_INIT, 1, 0) { + NEVER_CONSTANT; + ARG_ANY(1); + SYMBOL; + ARG_TYPE(1, sym->class & 0xff); + + /* New variable is always the last on stack */ + uint pos = curline.vbase + sym->offset; + fstk->vstk[pos] = v1; + fstk->vcnt = pos + 1; + } + /* Set to indirect value prepared in v1 */ INST(FI_VAR_SET, 1, 0) { NEVER_CONSTANT; @@ -477,12 +555,100 @@ RESULT_VAL(val); } + INST(FI_FOR_INIT, 1, 0) { + NEVER_CONSTANT; + ARG_ANY(1); + SYMBOL; + + FID_NEW_BODY() + ASSERT((sym->class & ~0xff) == SYM_VARIABLE); + + /* Static type check */ + if (f1->type) + { + enum btype t_var = (sym->class & 0xff); + enum btype t_arg = f_type_element_type(f1->type); + if (!t_arg) + cf_error("Value of expression in FOR must be iterable, got %s", + f_type_name(f1->type)); + if (t_var != t_arg) + cf_error("Loop variable '%s' in FOR must be %s, is %s", + sym->name, f_type_name(t_arg), f_type_name(t_var)); + } + + FID_INTERPRET_BODY() + + /* Dynamic type check */ + if ((sym->class & 0xff) != f_type_element_type(v1.type)) + runtime("Mismatched argument and variable type"); + + /* Setup the index */ + v2 = (struct f_val) { .type = T_INT, .val.i = 0 }; + + /* Keep v1 and v2 on the stack */ + fstk->vcnt += 2; + } + + INST(FI_FOR_NEXT, 2, 0) { + NEVER_CONSTANT; + SYMBOL; + + /* Type checks are done in FI_FOR_INIT */ + + /* Loop variable */ + struct f_val *var = &fstk->vstk[curline.vbase + sym->offset]; + int step = 0; + + switch(v1.type) + { + case T_PATH: + var->type = T_INT; + step = as_path_walk(v1.val.ad, &v2.val.i, &var->val.i); + break; + + case T_CLIST: + var->type = T_PAIR; + step = int_set_walk(v1.val.ad, &v2.val.i, &var->val.i); + break; + + case T_ECLIST: + var->type = T_EC; + step = ec_set_walk(v1.val.ad, &v2.val.i, &var->val.ec); + break; + + case T_LCLIST: + var->type = T_LC; + step = lc_set_walk(v1.val.ad, &v2.val.i, &var->val.lc); + break; + + default: + runtime( "Clist or lclist expected" ); + } + + if (step) + { + /* Keep v1 and v2 on the stack */ + fstk->vcnt += 2; + + /* Repeat this instruction */ + curline.pos--; + + /* Execute the loop body */ + LINE(1, 0); + + /* Space for loop variable, may be unused */ + fstk->vcnt += 1; + } + else + var->type = T_VOID; + } + INST(FI_CONDITION, 1, 0) { ARG(1, T_BOOL); if (v1.val.i) LINE(2,0); else - LINE(3,1); + LINE(3,0); } INST(FI_PRINT, 0, 0) { @@ -519,25 +685,43 @@ { STATIC_ATTR; ACCESS_RTE; - struct rta *rta = fs->rte->attrs; switch (sa.sa_code) { - case SA_FROM: RESULT(sa.f_type, ip, rta->from); break; - case SA_GW: RESULT(sa.f_type, ip, rta->nh.gw); break; - case SA_NET: RESULT(sa.f_type, net, fs->rte->net); break; - case SA_PROTO: RESULT(sa.f_type, s, fs->rte->src->owner->name); break; - case SA_SOURCE: RESULT(sa.f_type, i, rta->source); break; - case SA_SCOPE: RESULT(sa.f_type, i, rta->scope); break; - case SA_DEST: RESULT(sa.f_type, i, rta->dest); break; - case SA_IFNAME: RESULT(sa.f_type, s, rta->nh.iface ? rta->nh.iface->name : ""); break; - case SA_IFINDEX: RESULT(sa.f_type, i, rta->nh.iface ? rta->nh.iface->index : 0); break; - case SA_WEIGHT: RESULT(sa.f_type, i, rta->nh.weight + 1); break; - case SA_PREF: RESULT(sa.f_type, i, rta->pref); break; - case SA_GW_MPLS: RESULT(sa.f_type, i, rta->nh.labels ? rta->nh.label[0] : MPLS_NULL); break; - + case SA_NET: RESULT(sa.type, net, fs->rte->net); break; + case SA_PROTO: RESULT(sa.type, s, fs->rte->src->owner->name); break; default: - bug("Invalid static attribute access (%u/%u)", sa.f_type, sa.sa_code); + { + struct eattr *nhea = ea_find(fs->rte->attrs, &ea_gen_nexthop); + struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + struct nexthop *nh = nhad ? &nhad->nh : NULL; + + switch (sa.sa_code) + { + case SA_DEST: + RESULT(sa.type, i, nhad ? + (NEXTHOP_IS_REACHABLE(nhad) ? RTD_UNICAST : nhad->dest) + : RTD_NONE); + break; + case SA_GW: + RESULT(sa.type, ip, nh ? nh->gw : IPA_NONE); + break; + case SA_IFNAME: + RESULT(sa.type, s, (nh && nh->iface) ? nh->iface->name : ""); + break; + case SA_IFINDEX: + RESULT(sa.type, i, (nh && nh->iface) ? nh->iface->index : 0); + break; + case SA_WEIGHT: + RESULT(sa.type, i, (nh ? nh->weight : 0) + 1); + break; + case SA_GW_MPLS: + RESULT(sa.type, i, (nh && nh->labels) ? nh->label[0] : MPLS_NULL); + break; + default: + bug("Invalid static attribute access (%u/%u)", sa.type, sa.sa_code); + } + } } } } @@ -546,52 +730,52 @@ ACCESS_RTE; ARG_ANY(1); STATIC_ATTR; - ARG_TYPE(1, sa.f_type); - - f_rta_cow(fs); + ARG_TYPE(1, sa.type); { - struct rta *rta = fs->rte->attrs; + union { + struct nexthop_adata nha; + struct { + struct adata ad; + struct nexthop nh; + u32 label; + }; + } nha; + + nha.ad = (struct adata) { + .length = sizeof (struct nexthop_adata) - sizeof (struct adata), + }; + + eattr *a = NULL; switch (sa.sa_code) { - case SA_FROM: - rta->from = v1.val.ip; - break; + case SA_DEST: + { + int i = v1.val.i; + if ((i != RTD_BLACKHOLE) && (i != RTD_UNREACHABLE) && (i != RTD_PROHIBIT)) + runtime( "Destination can be changed only to blackhole, unreachable or prohibit" ); + nha.nha.dest = i; + nha.ad.length = NEXTHOP_DEST_SIZE; + break; + } case SA_GW: { + struct eattr *nh_ea = ea_find(fs->rte->attrs, &ea_gen_nexthop); + ip_addr ip = v1.val.ip; - struct iface *ifa = ipa_is_link_local(ip) ? rta->nh.iface : NULL; + struct iface *ifa = (ipa_is_link_local(ip) && nh_ea) ? + ((struct nexthop_adata *) nh_ea->u.ptr)->nh.iface : NULL; + /* XXX this code supposes that every owner is a protocol XXX */ neighbor *n = neigh_find(SKIP_BACK(struct proto, sources, fs->rte->src->owner), ip, ifa, 0); if (!n || (n->scope == SCOPE_HOST)) runtime( "Invalid gw address" ); - rta->dest = RTD_UNICAST; - rta->nh.gw = ip; - rta->nh.iface = n->iface; - rta->nh.next = NULL; - rta->hostentry = NULL; - rta->nh.labels = 0; - } - break; - - case SA_SCOPE: - rta->scope = v1.val.i; - break; - - case SA_DEST: - { - int i = v1.val.i; - if ((i != RTD_BLACKHOLE) && (i != RTD_UNREACHABLE) && (i != RTD_PROHIBIT)) - runtime( "Destination can be changed only to blackhole, unreachable or prohibit" ); - - rta->dest = i; - rta->nh.gw = IPA_NONE; - rta->nh.iface = NULL; - rta->nh.next = NULL; - rta->hostentry = NULL; - rta->nh.labels = 0; + nha.nh = (struct nexthop) { + .gw = ip, + .iface = n->iface, + }; } break; @@ -601,12 +785,9 @@ if (!ifa) runtime( "Invalid iface name" ); - rta->dest = RTD_UNICAST; - rta->nh.gw = IPA_NONE; - rta->nh.iface = ifa; - rta->nh.next = NULL; - rta->hostentry = NULL; - rta->nh.labels = 0; + nha.nh = (struct nexthop) { + .iface = ifa, + }; } break; @@ -615,13 +796,20 @@ if (v1.val.i >= 0x100000) runtime( "Invalid MPLS label" ); + struct eattr *nh_ea = ea_find(fs->rte->attrs, &ea_gen_nexthop); + if (!nh_ea) + runtime( "No nexthop to add a MPLS label to" ); + + nha.nh = ((struct nexthop_adata *) nh_ea->u.ptr)->nh; + if (v1.val.i != MPLS_NULL) { - rta->nh.label[0] = v1.val.i; - rta->nh.labels = 1; + nha.nh.label[0] = v1.val.i; + nha.nh.labels = 1; + nha.ad.length = sizeof nha - sizeof (struct adata); } else - rta->nh.labels = 0; + nha.nh.labels = 0; } break; @@ -630,184 +818,120 @@ int i = v1.val.i; if (i < 1 || i > 256) runtime( "Setting weight value out of bounds" ); - if (rta->dest != RTD_UNICAST) + + struct eattr *nh_ea = ea_find(fs->rte->attrs, &ea_gen_nexthop); + if (!nh_ea) + runtime( "No nexthop to set weight on" ); + + struct nexthop_adata *nhad = (struct nexthop_adata *) nh_ea->u.ptr; + if (!NEXTHOP_IS_REACHABLE(nhad)) runtime( "Setting weight needs regular nexthop " ); + struct nexthop_adata *nhax = (struct nexthop_adata *) tmp_copy_adata(&nhad->ad); + /* Set weight on all next hops */ - for (struct nexthop *nh = &rta->nh; nh; nh = nh->next) + NEXTHOP_WALK(nh, nhax) nh->weight = i - 1; - } - break; - case SA_PREF: - rta->pref = v1.val.i; + a = ea_set_attr(&fs->rte->attrs, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhax->ad)); + } break; default: - bug("Invalid static attribute access (%u/%u)", sa.f_type, sa.sa_code); + bug("Invalid static attribute access (%u/%u)", sa.type, sa.sa_code); } + + if (!a) + a = ea_set_attr(&fs->rte->attrs, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, tmp_copy_adata(&nha.ad))); + + a->originated = 1; + a->fresh = 1; } } INST(FI_EA_GET, 0, 1) { /* Access to extended attributes */ DYNAMIC_ATTR; ACCESS_RTE; - ACCESS_EATTRS; - RESULT_TYPE(da.f_type); + RESULT_TYPE(da->type); { - eattr *e = ea_find(*fs->eattrs, da.ea_code); - - if (!e) { - /* A special case: undefined as_path looks like empty as_path */ - if (da.type == EAF_TYPE_AS_PATH) { - RESULT_(T_PATH, ad, &null_adata); - break; - } - - /* The same special case for int_set */ - if (da.type == EAF_TYPE_INT_SET) { - RESULT_(T_CLIST, ad, &null_adata); - break; - } + const struct f_val *empty; + const eattr *e = ea_find(fs->rte->attrs, da->id); - /* The same special case for ec_set */ - if (da.type == EAF_TYPE_EC_SET) { - RESULT_(T_ECLIST, ad, &null_adata); - break; - } + if (e) + { + ASSERT_DIE(e->type == da->type); - /* The same special case for lc_set */ - if (da.type == EAF_TYPE_LC_SET) { - RESULT_(T_LCLIST, ad, &null_adata); - break; + switch (e->type) { + case T_IP: + RESULT_(T_IP, ip, *((const ip_addr *) e->u.ptr->data)); + break; + default: + RESULT_VAL([[(struct f_val) { + .type = e->type, + .val.bval = e->u, + }]]); } - - /* Undefined value */ - RESULT_VOID; - break; } - - switch (e->type & EAF_TYPE_MASK) { - case EAF_TYPE_INT: - RESULT_(da.f_type, i, e->u.data); - break; - case EAF_TYPE_ROUTER_ID: - RESULT_(T_QUAD, i, e->u.data); - break; - case EAF_TYPE_OPAQUE: - RESULT_(T_ENUM_EMPTY, i, 0); - break; - case EAF_TYPE_IP_ADDRESS: - RESULT_(T_IP, ip, *((ip_addr *) e->u.ptr->data)); - break; - case EAF_TYPE_AS_PATH: - RESULT_(T_PATH, ad, e->u.ptr); - break; - case EAF_TYPE_BITFIELD: - RESULT_(T_BOOL, i, !!(e->u.data & (1u << da.bit))); - break; - case EAF_TYPE_INT_SET: - RESULT_(T_CLIST, ad, e->u.ptr); - break; - case EAF_TYPE_EC_SET: - RESULT_(T_ECLIST, ad, e->u.ptr); - break; - case EAF_TYPE_LC_SET: - RESULT_(T_LCLIST, ad, e->u.ptr); - break; - case EAF_TYPE_UNDEF: + else if (empty = f_get_empty(da->type)) + RESULT_VAL(*empty); + else RESULT_VOID; - break; - default: - bug("Unknown dynamic attribute type"); - } } } INST(FI_EA_SET, 1, 0) { ACCESS_RTE; - ACCESS_EATTRS; ARG_ANY(1); DYNAMIC_ATTR; - ARG_TYPE(1, da.f_type); + ARG_TYPE(1, da->type); { - struct ea_list *l = lp_alloc(fs->pool, sizeof(struct ea_list) + sizeof(eattr)); - - l->next = NULL; - l->flags = EALF_SORTED; - l->count = 1; - l->attrs[0].id = da.ea_code; - l->attrs[0].flags = 0; - l->attrs[0].type = da.type | EAF_ORIGINATED | EAF_FRESH; - - switch (da.type) { - case EAF_TYPE_INT: - case EAF_TYPE_ROUTER_ID: - l->attrs[0].u.data = v1.val.i; - break; + struct eattr *a; - case EAF_TYPE_OPAQUE: - runtime( "Setting opaque attribute is not allowed" ); - break; + if (da->type >= EAF_TYPE__MAX) + bug("Unsupported attribute type"); - case EAF_TYPE_IP_ADDRESS:; - int len = sizeof(ip_addr); - struct adata *ad = lp_alloc(fs->pool, sizeof(struct adata) + len); - ad->length = len; - (* (ip_addr *) ad->data) = v1.val.ip; - l->attrs[0].u.ptr = ad; + switch (da->type) { + case T_OPAQUE: + case T_IFACE: + runtime( "Setting opaque attribute is not allowed" ); break; - case EAF_TYPE_AS_PATH: - case EAF_TYPE_INT_SET: - case EAF_TYPE_EC_SET: - case EAF_TYPE_LC_SET: - l->attrs[0].u.ptr = v1.val.ad; - break; - - case EAF_TYPE_BITFIELD: - { - /* First, we have to find the old value */ - eattr *e = ea_find(*fs->eattrs, da.ea_code); - u32 data = e ? e->u.data : 0; - - if (v1.val.i) - l->attrs[0].u.data = data | (1u << da.bit); - else - l->attrs[0].u.data = data & ~(1u << da.bit); - } + case T_IP: + a = ea_set_attr(&fs->rte->attrs, + EA_LITERAL_STORE_ADATA(da, 0, &v1.val.ip, sizeof(ip_addr))); break; default: - bug("Unknown dynamic attribute type"); + a = ea_set_attr(&fs->rte->attrs, + EA_LITERAL_GENERIC(da->id, da->type, 0, .u = v1.val.bval)); + break; } - f_rta_cow(fs); - l->next = *fs->eattrs; - *fs->eattrs = l; + a->originated = 1; + a->fresh = 1; } } INST(FI_EA_UNSET, 0, 0) { DYNAMIC_ATTR; ACCESS_RTE; - ACCESS_EATTRS; - { - struct ea_list *l = lp_alloc(fs->pool, sizeof(struct ea_list) + sizeof(eattr)); - - l->next = NULL; - l->flags = EALF_SORTED; - l->count = 1; - l->attrs[0].id = da.ea_code; - l->attrs[0].flags = 0; - l->attrs[0].type = EAF_TYPE_UNDEF | EAF_ORIGINATED | EAF_FRESH; - l->attrs[0].u.data = 0; - - f_rta_cow(fs); - l->next = *fs->eattrs; - *fs->eattrs = l; - } + ea_unset_attr(&fs->rte->attrs, 1, da); + } + + INST(FI_DEFAULT, 2, 1) { + ARG_ANY(1); + ARG_ANY(2); + RESULT_TYPE(f_type_element_type(v2.type)); + + log(L_INFO "Type of arg 1 is: %d", v1.type); + + if (v1.type == T_VOID) + RESULT_VAL(v2); + else + RESULT_VAL(v1); } INST(FI_LENGTH, 1, 1) { /* Get length of */ @@ -902,14 +1026,31 @@ ((net_addr_roa6 *) v1.val.net)->max_pxlen); } - INST(FI_ROA_ASN, 1, 1) { /* Get ROA ASN */ - ARG(1, T_NET); - if (!net_is_roa(v1.val.net)) - runtime( "ROA expected" ); + INST(FI_ASN, 1, 1) { /* Get ROA ASN or community ASN part */ + ARG_ANY(1); + RESULT_TYPE(T_INT); + switch(v1.type) + { + case T_NET: + if (!net_is_roa(v1.val.net)) + runtime( "ROA expected" ); - RESULT(T_INT, i, (v1.val.net->type == NET_ROA4) ? - ((net_addr_roa4 *) v1.val.net)->asn : - ((net_addr_roa6 *) v1.val.net)->asn); + RESULT_(T_INT, i, (v1.val.net->type == NET_ROA4) ? + ((net_addr_roa4 *) v1.val.net)->asn : + ((net_addr_roa6 *) v1.val.net)->asn); + break; + + case T_PAIR: + RESULT_(T_INT, i, v1.val.i >> 16); + break; + + case T_LC: + RESULT_(T_INT, i, v1.val.lc.asn); + break; + + default: + runtime( "Net, pair or lc expected" ); + } } INST(FI_IP, 1, 1) { /* Convert prefix to ... */ @@ -943,7 +1084,90 @@ RESULT(T_INT, i, as_path_get_last_nonaggregated(v1.val.ad)); } - INST(FI_RETURN, 1, 1) { + INST(FI_PAIR_DATA, 1, 1) { /* Get data part from the standard community */ + ARG(1, T_PAIR); + RESULT(T_INT, i, v1.val.i & 0xFFFF); + } + + INST(FI_LC_DATA1, 1, 1) { /* Get data1 part from the large community */ + ARG(1, T_LC); + RESULT(T_INT, i, v1.val.lc.ldp1); + } + + INST(FI_LC_DATA2, 1, 1) { /* Get data2 part from the large community */ + ARG(1, T_LC); + RESULT(T_INT, i, v1.val.lc.ldp2); + } + + INST(FI_MIN, 1, 1) { /* Get minimum element from list */ + ARG_ANY(1); + RESULT_TYPE(f_type_element_type(v1.type)); + switch(v1.type) + { + case T_CLIST: + { + u32 val = 0; + int_set_min(v1.val.ad, &val); + RESULT_(T_PAIR, i, val); + } + break; + + case T_ECLIST: + { + u64 val = 0; + ec_set_min(v1.val.ad, &val); + RESULT_(T_EC, ec, val); + } + break; + + case T_LCLIST: + { + lcomm val = { 0, 0, 0 }; + lc_set_min(v1.val.ad, &val); + RESULT_(T_LC, lc, val); + } + break; + + default: + runtime( "Clist or lclist expected" ); + } + } + + INST(FI_MAX, 1, 1) { /* Get maximum element from list */ + ARG_ANY(1); + RESULT_TYPE(f_type_element_type(v1.type)); + switch(v1.type) + { + case T_CLIST: + { + u32 val = 0; + int_set_max(v1.val.ad, &val); + RESULT_(T_PAIR, i, val); + } + break; + + case T_ECLIST: + { + u64 val = 0; + ec_set_max(v1.val.ad, &val); + RESULT_(T_EC, ec, val); + } + break; + + case T_LCLIST: + { + lcomm val = { 0, 0, 0 }; + lc_set_max(v1.val.ad, &val); + RESULT_(T_LC, lc, val); + } + break; + + default: + runtime( "Clist or lclist expected" ); + } + } + + INST(FI_RETURN, 1, 0) { NEVER_CONSTANT; /* Acquire the return value */ ARG_ANY(1); @@ -971,28 +1195,59 @@ INST(FI_CALL, 0, 1) { NEVER_CONSTANT; + VARARG; SYMBOL; + /* Fake result type declaration */ + RESULT_TYPE(T_VOID); + + FID_NEW_BODY() + ASSERT(sym->class == SYM_FUNCTION); + + if (whati->varcount != sym->function->args) + cf_error("Function '%s' expects %u arguments, got %u arguments", + sym->name, sym->function->args, whati->varcount); + + /* Typecheck individual arguments */ + struct f_inst *a = fvar; + struct f_arg *b = sym->function->arg_list; + for (uint i = 1; a && b; a = a->next, b = b->next, i++) + { + enum btype b_type = b->arg->class & 0xff; + + if (a->type && (a->type != b_type) && !f_const_promotion(a, b_type)) + cf_error("Argument %u of '%s' must be %s, got %s", + i, sym->name, f_type_name(b_type), f_type_name(a->type)); + } + ASSERT(!a && !b); + + /* Add implicit void slot for the return value */ + struct f_inst *tmp = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_VOID }); + tmp->next = whati->fvar; + whati->fvar = tmp; + what->size += tmp->size; + + /* Mark recursive calls, they have dummy f_line */ + if (!sym->function->len) + what->flags |= FIF_RECURSIVE; + FID_SAME_BODY() - if (!(f1->sym->flags & SYM_FLAG_SAME)) - return 0; + if (!(f1->sym->flags & SYM_FLAG_SAME) && !(f1_->flags & FIF_RECURSIVE)) + return 0; FID_ITERATE_BODY() + if (!(what->flags & FIF_RECURSIVE)) BUFFER_PUSH(fit->lines) = whati->sym->function; FID_INTERPRET_BODY() /* Push the body on stack */ LINEX(sym->function); + curline.vbase = curline.ventry; curline.emask |= FE_RETURN; - /* Before this instruction was called, there was the T_VOID - * automatic return value pushed on value stack and also - * sym->function->args function arguments. Setting the - * vbase to point to first argument. */ - ASSERT(curline.ventry >= sym->function->args); - curline.ventry -= sym->function->args; - curline.vbase = curline.ventry; + /* Arguments on stack */ + fstk->vcnt += sym->function->args; /* Storage for local variables */ f_vcnt_check_overflow(sym->function->vars); @@ -1106,17 +1361,10 @@ if (v1.type == T_PATH) { - const struct f_tree *set = NULL; - u32 key = 0; - - if (v2.type == T_INT) - key = v2.val.i; - else if ((v2.type == T_SET) && (v2.val.t->from.type == T_INT)) - set = v2.val.t; + if ((v2.type == T_SET) && path_set_type(v2.val.t) || (v2.type == T_INT)) + RESULT_(T_PATH, ad, [[ as_path_filter(fpool, v1.val.ad, &v2, 0) ]]); else runtime("Can't delete non-integer (set)"); - - RESULT_(T_PATH, ad, [[ as_path_filter(fpool, v1.val.ad, set, key, 0) ]]); } else if (v1.type == T_CLIST) @@ -1168,10 +1416,8 @@ if (v1.type == T_PATH) { - u32 key = 0; - - if ((v2.type == T_SET) && (v2.val.t->from.type == T_INT)) - RESULT_(T_PATH, ad, [[ as_path_filter(fpool, v1.val.ad, v2.val.t, key, 1) ]]); + if ((v2.type == T_SET) && path_set_type(v2.val.t)) + RESULT_(T_PATH, ad, [[ as_path_filter(fpool, v1.val.ad, &v2, 1) ]]); else runtime("Can't filter integer"); } @@ -1209,37 +1455,7 @@ runtime("Can't filter non-[e|l]clist"); } - INST(FI_ROA_CHECK_IMPLICIT, 0, 1) { /* ROA Check */ - NEVER_CONSTANT; - RTC(1); - rtable *table = rtc->table; - ACCESS_RTE; - ACCESS_EATTRS; - const net_addr *net = fs->rte->net; - - /* We ignore temporary attributes, probably not a problem here */ - /* 0x02 is a value of BA_AS_PATH, we don't want to include BGP headers */ - eattr *e = ea_find(*fs->eattrs, EA_CODE(PROTOCOL_BGP, 0x02)); - - if (!e || ((e->type & EAF_TYPE_MASK) != EAF_TYPE_AS_PATH)) - runtime("Missing AS_PATH attribute"); - - u32 as = 0; - as_path_get_last(e->u.ptr, &as); - - if (!table) - runtime("Missing ROA table"); - - if (table->addr_type != NET_ROA4 && table->addr_type != NET_ROA6) - runtime("Table type must be either ROA4 or ROA6"); - - if (table->addr_type != (net->type == NET_IP4 ? NET_ROA4 : NET_ROA6)) - RESULT(T_ENUM_ROA, i, ROA_UNKNOWN); /* Prefix and table type mismatch */ - else - RESULT(T_ENUM_ROA, i, [[ net_roa_check(table, net, as) ]]); - } - - INST(FI_ROA_CHECK_EXPLICIT, 2, 1) { /* ROA Check */ + INST(FI_ROA_CHECK, 2, 1) { /* ROA Check */ NEVER_CONSTANT; ARG(1, T_NET); ARG(2, T_INT); @@ -1261,7 +1477,7 @@ } - INST(FI_FORMAT, 1, 0) { /* Format */ + INST(FI_FORMAT, 1, 1) { /* Format */ ARG_ANY(1); RESULT(T_STRING, s, val_format_str(fpool, &v1)); } diff --git a/filter/f-inst.h b/filter/f-inst.h index df45f88e..fbc59de7 100644 --- a/filter/f-inst.h +++ b/filter/f-inst.h @@ -22,7 +22,7 @@ /* Flags for instructions */ enum f_instruction_flags { - FIF_PRINTED = 1, /* FI_PRINT_AND_DIE: message put in buffer */ + FIF_RECURSIVE = 1, /* FI_CALL: function is directly recursive */ } PACKED; /* Include generated filter instruction declarations */ @@ -35,19 +35,26 @@ const char *f_instruction_name_(enum f_instruction_code fi); static inline const char *f_instruction_name(enum f_instruction_code fi) { return f_instruction_name_(fi) + 3; } +struct f_arg { + struct symbol *arg; + struct f_arg *next; +}; + /* Filter structures for execution */ /* Line of instructions to be unconditionally executed one after another */ struct f_line { uint len; /* Line length */ u8 args; /* Function: Args required */ u8 vars; + u8 results; /* Results left on stack: cmd -> 0, term -> 1 */ + struct f_arg *arg_list; struct f_line_item items[0]; /* The items themselves */ }; /* Convert the f_inst infix tree to the f_line structures */ -struct f_line *f_linearize_concat(const struct f_inst * const inst[], uint count); -static inline struct f_line *f_linearize(const struct f_inst *root) -{ return f_linearize_concat(&root, 1); } +struct f_line *f_linearize_concat(const struct f_inst * const inst[], uint count, uint results); +static inline struct f_line *f_linearize(const struct f_inst *root, uint results) +{ return f_linearize_concat(&root, 1, results); } void f_dump_line(const struct f_line *, uint indent); @@ -87,15 +94,17 @@ void f_add_lines(const struct f_line_item *what, struct filter_iterator *fit); struct filter *f_new_where(struct f_inst *); -static inline struct f_dynamic_attr f_new_dynamic_attr(u8 type, enum f_type f_type, uint code) /* Type as core knows it, type as filters know it, and code of dynamic attribute */ -{ return (struct f_dynamic_attr) { .type = type, .f_type = f_type, .ea_code = code }; } /* f_type currently unused; will be handy for static type checking */ -static inline struct f_dynamic_attr f_new_dynamic_attr_bit(u8 bit, enum f_type f_type, uint code) /* Type as core knows it, type as filters know it, and code of dynamic attribute */ -{ return (struct f_dynamic_attr) { .type = EAF_TYPE_BITFIELD, .bit = bit, .f_type = f_type, .ea_code = code }; } /* f_type currently unused; will be handy for static type checking */ -static inline struct f_static_attr f_new_static_attr(int f_type, int code, int readonly) -{ return (struct f_static_attr) { .f_type = f_type, .sa_code = code, .readonly = readonly }; } -struct f_inst *f_generate_complex(enum f_instruction_code fi_code, struct f_dynamic_attr da, struct f_inst *argument); +static inline struct f_static_attr f_new_static_attr(btype type, int code, int readonly) +{ return (struct f_static_attr) { .type = type, .sa_code = code, .readonly = readonly }; } struct f_inst *f_generate_roa_check(struct rtable_config *table, struct f_inst *prefix, struct f_inst *asn); +struct f_attr_bit { + const struct ea_class *class; + uint bit; +}; + +#define f_new_dynamic_attr_bit(_bit, _name) ((struct f_attr_bit) { .bit = _bit, .class = ea_class_find(_name) }) + /* Hook for call bt_assert() function in configuration */ extern void (*bt_assert_hook)(int result, const struct f_line_item *assert); diff --git a/filter/f-util.c b/filter/f-util.c index 410999a6..82a06bdd 100644 --- a/filter/f-util.c +++ b/filter/f-util.c @@ -2,7 +2,7 @@ * Filters: utility functions * * Copyright 1998 Pavel Machek <pavel@ucw.cz> - * 2017 Jan Maria Matejka <mq@ucw.cz> + * 2017 Maria Matejka <mq@ucw.cz> * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -13,7 +13,7 @@ #include "filter/f-inst.h" #include "lib/idm.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #define P(a,b) ((a<<8) | b) @@ -37,147 +37,6 @@ struct filter *f_new_where(struct f_inst *where) f_new_inst(FI_DIE, F_REJECT)); struct filter *f = cfg_allocz(sizeof(struct filter)); - f->root = f_linearize(cond); + f->root = f_linearize(cond, 0); return f; } - -#define CA_KEY(n) n->name, n->fda.type -#define CA_NEXT(n) n->next -#define CA_EQ(na,ta,nb,tb) (!strcmp(na,nb) && (ta == tb)) -#define CA_FN(n,t) (mem_hash(n, strlen(n)) ^ (t*0xaae99453U)) -#define CA_ORDER 8 /* Fixed */ - -struct ca_storage { - struct ca_storage *next; - struct f_dynamic_attr fda; - u32 uc; - char name[0]; -}; - -HASH(struct ca_storage) ca_hash; - -static struct idm ca_idm; -static struct ca_storage **ca_storage; -static uint ca_storage_max; - -static void -ca_free(resource *r) -{ - struct custom_attribute *ca = (void *) r; - struct ca_storage *cas = HASH_FIND(ca_hash, CA, ca->name, ca->fda->type); - ASSERT(cas); - - ca->name = NULL; - ca->fda = NULL; - if (!--cas->uc) { - uint id = EA_CUSTOM_ID(cas->fda.ea_code); - idm_free(&ca_idm, id); - HASH_REMOVE(ca_hash, CA, cas); - ca_storage[id] = NULL; - mb_free(cas); - } -} - -static void -ca_dump(resource *r) -{ - struct custom_attribute *ca = (void *) r; - debug("name \"%s\" id 0x%04x ea_type 0x%02x f_type 0x%02x\n", - ca->name, ca->fda->ea_code, ca->fda->type, ca->fda->f_type); -} - -static struct resclass ca_class = { - .name = "Custom attribute", - .size = sizeof(struct custom_attribute), - .free = ca_free, - .dump = ca_dump, - .lookup = NULL, - .memsize = NULL, -}; - -struct custom_attribute * -ca_lookup(pool *p, const char *name, int f_type) -{ - int ea_type; - - switch (f_type) { - case T_INT: - ea_type = EAF_TYPE_INT; - break; - case T_IP: - ea_type = EAF_TYPE_IP_ADDRESS; - break; - case T_QUAD: - ea_type = EAF_TYPE_ROUTER_ID; - break; - case T_PATH: - ea_type = EAF_TYPE_AS_PATH; - break; - case T_CLIST: - ea_type = EAF_TYPE_INT_SET; - break; - case T_ECLIST: - ea_type = EAF_TYPE_EC_SET; - break; - case T_LCLIST: - ea_type = EAF_TYPE_LC_SET; - break; - default: - cf_error("Custom route attribute of unsupported type"); - } - - static int inited = 0; - if (!inited) { - idm_init(&ca_idm, &root_pool, 8); - HASH_INIT(ca_hash, &root_pool, CA_ORDER); - - ca_storage_max = 256; - ca_storage = mb_allocz(&root_pool, sizeof(struct ca_storage *) * ca_storage_max); - - inited++; - } - - struct ca_storage *cas = HASH_FIND(ca_hash, CA, name, ea_type); - if (cas) { - cas->uc++; - } else { - - uint id = idm_alloc(&ca_idm); - - if (id >= EA_CUSTOM_BIT) - cf_error("Too many custom attributes."); - - if (id >= ca_storage_max) { - ca_storage_max *= 2; - ca_storage = mb_realloc(ca_storage, sizeof(struct ca_storage *) * ca_storage_max * 2); - } - - cas = mb_allocz(&root_pool, sizeof(struct ca_storage) + strlen(name) + 1); - cas->fda = f_new_dynamic_attr(ea_type, f_type, EA_CUSTOM(id)); - cas->uc = 1; - - strcpy(cas->name, name); - ca_storage[id] = cas; - - HASH_INSERT(ca_hash, CA, cas); - } - - struct custom_attribute *ca = ralloc(p, &ca_class); - ca->fda = &(cas->fda); - ca->name = cas->name; - return ca; -} - -const char * -ea_custom_name(uint ea) -{ - uint id = EA_CUSTOM_ID(ea); - if (id >= ca_storage_max) - return NULL; - - if (!ca_storage[id]) - return NULL; - - return ca_storage[id]->name; -} - diff --git a/filter/filter.c b/filter/filter.c index 625b3ade..0aff4d30 100644 --- a/filter/filter.c +++ b/filter/filter.c @@ -35,10 +35,10 @@ #include "lib/ip.h" #include "lib/net.h" #include "lib/flowspec.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/attrs.h" +#include "lib/attrs.h" #include "conf/conf.h" #include "filter/filter.h" #include "filter/f-inst.h" @@ -76,12 +76,6 @@ struct filter_state { /* The route we are processing. This may be NULL to indicate no route available. */ struct rte *rte; - /* Cached pointer to ea_list */ - struct ea_list **eattrs; - - /* Linpool for adata allocation */ - struct linpool *pool; - /* Buffer for log output */ struct buffer buf; @@ -97,32 +91,6 @@ void (*bt_assert_hook)(int result, const struct f_line_item *assert); #define f_stack_init(fs) ( _f_stack_init(fs, v, 128), _f_stack_init(fs, e, 128) ) -static inline void f_cache_eattrs(struct filter_state *fs) -{ - fs->eattrs = &(fs->rte->attrs->eattrs); -} - -/* - * rta_cow - prepare rta for modification by filter - */ -static void -f_rta_cow(struct filter_state *fs) -{ - if (!rta_is_cached(fs->rte->attrs)) - return; - - /* - * Get shallow copy of rta. Fields eattrs and nexthops of rta are shared - * with fs->old_rta (they will be copied when the cached rta will be obtained - * at the end of f_run()), also the lock of hostentry is inherited (we - * suppose hostentry is not changed by filters). - */ - fs->rte->attrs = rta_do_cow(fs->rte->attrs, fs->pool); - - /* Re-cache the ea_list */ - f_cache_eattrs(fs); -} - static struct tbf rl_runtime_err = TBF_DEFAULT_LOG_LIMITS; /** @@ -185,10 +153,8 @@ interpret(struct filter_state *fs, const struct f_line *line, struct f_val *val) return F_ERROR; \ } while(0) -#define falloc(size) lp_alloc(fs->pool, size) -#define fpool fs->pool - -#define ACCESS_EATTRS do { if (!fs->eattrs) f_cache_eattrs(fs); } while (0) +#define falloc(size) tmp_alloc(size) +#define fpool tmp_linpool #include "filter/inst-interpret.c" #undef res @@ -198,13 +164,11 @@ interpret(struct filter_state *fs, const struct f_line *line, struct f_val *val) #undef runtime #undef falloc #undef fpool -#undef ACCESS_EATTRS } } /* End of current line. Drop local variables before exiting. */ - fstk->vcnt -= curline.line->vars; - fstk->vcnt -= curline.line->args; + fstk->vcnt = curline.ventry + curline.line->results; fstk->ecnt--; } @@ -237,7 +201,7 @@ interpret(struct filter_state *fs, const struct f_line *line, struct f_val *val) * tmp_pool, otherwise the filters may modify it. */ enum filter_return -f_run(const struct filter *filter, struct rte *rte, struct linpool *tmp_pool, int flags) +f_run(const struct filter *filter, struct rte *rte, int flags) { if (filter == FILTER_ACCEPT) return F_ACCEPT; @@ -250,7 +214,6 @@ f_run(const struct filter *filter, struct rte *rte, struct linpool *tmp_pool, in /* Initialize the filter state */ filter_state = (struct filter_state) { .rte = rte, - .pool = tmp_pool, .flags = flags, }; @@ -285,11 +248,10 @@ f_run(const struct filter *filter, struct rte *rte, struct linpool *tmp_pool, in */ enum filter_return -f_eval_rte(const struct f_line *expr, struct rte *rte, struct linpool *tmp_pool) +f_eval_rte(const struct f_line *expr, struct rte *rte) { filter_state = (struct filter_state) { .rte = rte, - .pool = tmp_pool, }; f_stack_init(filter_state); @@ -308,11 +270,9 @@ f_eval_rte(const struct f_line *expr, struct rte *rte, struct linpool *tmp_pool) * @pres: here the output will be stored */ enum filter_return -f_eval(const struct f_line *expr, struct linpool *tmp_pool, struct f_val *pres) +f_eval(const struct f_line *expr, struct f_val *pres) { - filter_state = (struct filter_state) { - .pool = tmp_pool, - }; + filter_state = (struct filter_state) {}; f_stack_init(filter_state); @@ -331,9 +291,7 @@ uint f_eval_int(const struct f_line *expr) { /* Called independently in parse-time to eval expressions */ - filter_state = (struct filter_state) { - .pool = cfg_mem, - }; + filter_state = (struct filter_state) {}; f_stack_init(filter_state); @@ -354,10 +312,10 @@ f_eval_int(const struct f_line *expr) * f_eval_buf - get a value of a term and print it to the supplied buffer */ enum filter_return -f_eval_buf(const struct f_line *expr, struct linpool *tmp_pool, buffer *buf) +f_eval_buf(const struct f_line *expr, buffer *buf) { struct f_val val; - enum filter_return fret = f_eval(expr, tmp_pool, &val); + enum filter_return fret = f_eval(expr, &val); if (fret <= F_RETURN) val_format(&val, buf); return fret; diff --git a/filter/filter.h b/filter/filter.h index 9964831c..3f2e62eb 100644 --- a/filter/filter.h +++ b/filter/filter.h @@ -13,8 +13,8 @@ #include "lib/resource.h" #include "lib/ip.h" #include "lib/macro.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" /* Possible return values of filter execution */ enum filter_return { @@ -51,10 +51,10 @@ struct filter { struct rte; -enum filter_return f_run(const struct filter *filter, struct rte *rte, struct linpool *tmp_pool, int flags); -enum filter_return f_eval_rte(const struct f_line *expr, struct rte *rte, struct linpool *tmp_pool); +enum filter_return f_run(const struct filter *filter, struct rte *rte, int flags); +enum filter_return f_eval_rte(const struct f_line *expr, struct rte *rte); uint f_eval_int(const struct f_line *expr); -enum filter_return f_eval_buf(const struct f_line *expr, struct linpool *tmp_pool, buffer *buf); +enum filter_return f_eval_buf(const struct f_line *expr, buffer *buf); const char *filter_name(const struct filter *filter); int filter_same(const struct filter *new, const struct filter *old); @@ -70,13 +70,4 @@ void filters_dump_all(void); #define FF_SILENT 2 /* Silent filter execution */ -/* Custom route attributes */ -struct custom_attribute { - resource r; - struct f_dynamic_attr *fda; - const char *name; -}; - -struct custom_attribute *ca_lookup(pool *p, const char *name, int ea_type); - #endif diff --git a/filter/filter_test.c b/filter/filter_test.c index 2a0b5431..671dba94 100644 --- a/filter/filter_test.c +++ b/filter/filter_test.c @@ -46,9 +46,7 @@ run_function(const void *arg) if (t->cmp) return t->result == f_same(t->fn, t->cmp); - linpool *tmp = lp_new_default(&root_pool); - enum filter_return fret = f_eval(t->fn, tmp, NULL); - rfree(tmp); + enum filter_return fret = f_eval(t->fn, NULL); return (fret < F_REJECT); } @@ -81,11 +79,11 @@ main(int argc, char *argv[]) if (!bt_config_file_parse(BT_CONFIG_FILE)) abort(); - bt_test_suite(t_reconfig, "Testing reconfiguration"); + bt_test_suite_extra(t_reconfig, 0, BT_TIMEOUT, "Testing reconfiguration"); struct f_bt_test_suite *t; WALK_LIST(t, config->tests) - bt_test_suite_base(run_function, t->fn_name, t, BT_FORKING, BT_TIMEOUT, "%s", t->dsc); + bt_test_suite_base(run_function, t->fn_name, t, 0, BT_TIMEOUT, "%s", t->dsc); bt_bird_cleanup(); return bt_exit_value(); diff --git a/filter/test.conf b/filter/test.conf index 93e7a770..22f5dea3 100644 --- a/filter/test.conf +++ b/filter/test.conf @@ -9,7 +9,109 @@ router id 62.168.0.1; /* We have to setup any protocol */ protocol device { } - +/* Setting some custom attributes, enough to force BIRD to reallocate the attribute idmap */ +attribute int test_ca_int1; +attribute int test_ca_int2; +attribute int test_ca_int3; +attribute int test_ca_int4; +attribute int test_ca_int5; +attribute int test_ca_int6; +attribute int test_ca_int7; +attribute int test_ca_int8; +attribute int test_ca_int9; +attribute int test_ca_int10; + +attribute ip test_ca_ip1; +attribute ip test_ca_ip2; +attribute ip test_ca_ip3; +attribute ip test_ca_ip4; +attribute ip test_ca_ip5; +attribute ip test_ca_ip6; +attribute ip test_ca_ip7; +attribute ip test_ca_ip8; +attribute ip test_ca_ip9; +attribute ip test_ca_ip10; + +attribute quad test_ca_quad1; +attribute quad test_ca_quad2; +attribute quad test_ca_quad3; +attribute quad test_ca_quad4; +attribute quad test_ca_quad5; +attribute quad test_ca_quad6; +attribute quad test_ca_quad7; +attribute quad test_ca_quad8; +attribute quad test_ca_quad9; +attribute quad test_ca_quad10; + +attribute bgppath test_ca_bgppath1; +attribute bgppath test_ca_bgppath2; +attribute bgppath test_ca_bgppath3; +attribute bgppath test_ca_bgppath4; +attribute bgppath test_ca_bgppath5; +attribute bgppath test_ca_bgppath6; +attribute bgppath test_ca_bgppath7; +attribute bgppath test_ca_bgppath8; +attribute bgppath test_ca_bgppath9; +attribute bgppath test_ca_bgppath10; + +attribute clist test_ca_clist1; +attribute clist test_ca_clist2; +attribute clist test_ca_clist3; +attribute clist test_ca_clist4; +attribute clist test_ca_clist5; +attribute clist test_ca_clist6; +attribute clist test_ca_clist7; +attribute clist test_ca_clist8; +attribute clist test_ca_clist9; +attribute clist test_ca_clist10; + +attribute eclist test_ca_eclist1; +attribute eclist test_ca_eclist2; +attribute eclist test_ca_eclist3; +attribute eclist test_ca_eclist4; +attribute eclist test_ca_eclist5; +attribute eclist test_ca_eclist6; +attribute eclist test_ca_eclist7; +attribute eclist test_ca_eclist8; +attribute eclist test_ca_eclist9; +attribute eclist test_ca_eclist10; + +attribute lclist test_ca_lclist1; +attribute lclist test_ca_lclist2; +attribute lclist test_ca_lclist3; +attribute lclist test_ca_lclist4; +attribute lclist test_ca_lclist5; +attribute lclist test_ca_lclist6; +attribute lclist test_ca_lclist7; +attribute lclist test_ca_lclist8; +attribute lclist test_ca_lclist9; +attribute lclist test_ca_lclist10; + +attribute lclist test_ca_lclist_max1; +attribute lclist test_ca_lclist_max2; +attribute lclist test_ca_lclist_max3; +attribute lclist test_ca_lclist_max4; +attribute lclist test_ca_lclist_max5; +attribute lclist test_ca_lclist_max6; +attribute lclist test_ca_lclist_max7; +attribute lclist test_ca_lclist_max8; +attribute lclist test_ca_lclist_max9; +attribute lclist test_ca_lclist_max10; +attribute lclist test_ca_lclist_max11; +attribute lclist test_ca_lclist_max12; +attribute lclist test_ca_lclist_max13; +attribute lclist test_ca_lclist_max14; +attribute lclist test_ca_lclist_max15; +attribute lclist test_ca_lclist_max16; +attribute lclist test_ca_lclist_max17; +attribute lclist test_ca_lclist_max18; +attribute lclist test_ca_lclist_max19; +attribute lclist test_ca_lclist_max20; +attribute lclist test_ca_lclist_max21; + + +/* Uncomment this to get an error */ +#attribute int bgp_path; /* * Common definitions and functions @@ -44,9 +146,8 @@ bt_test_same(onef, twof, 0); */ function t_bool() -bool b; { - b = true; + bool b = true; bt_assert(b); bt_assert(!!b); @@ -82,12 +183,11 @@ define xyzzy = (120+10); define '1a-a1' = (xyzzy-100); function t_int() -int i; { bt_assert(xyzzy = 130); bt_assert('1a-a1' = 30); - i = four; + int i = four; i = 12*100 + 60/2 + i; i = (i + 0); bt_assert(i = 1234); @@ -111,6 +211,14 @@ int i; bt_assert(!(i = 4)); bt_assert(1 <= 1); bt_assert(!(1234 < 1234)); + + bt_assert(10 - 5 = 5); + bt_assert(4294967295 + 1 = 0); + bt_assert(6*9=54); + bt_assert(984/41 = 24); + bt_assert(123/45 = 2); + bt_assert(0xfee1a | 0xbeef = 0xffeff); + bt_assert(0xfee1a & 0xbeef = 0xae0a); } bt_test_suite(t_int, "Testing integers"); @@ -128,14 +236,19 @@ define is2 = [(17+2), 17, 15, 11, 8, 5, 3, 2]; define is3 = [5, 17, 2, 11, 8, 15, 3, 19]; function t_int_set() -int set is; { + int set is = []; + bt_assert(is = []); + bt_assert(0 !~ is); + bt_assert(1 ~ [1,2,3]); bt_assert(5 ~ [1..20]); bt_assert(2 ~ [ 1, 2, 3 ]); bt_assert(5 ~ [ 4 .. 7 ]); bt_assert(1 !~ [ 2, 3, 4 ]); bt_assert(999 !~ [ 666, 333 ]); + bt_assert(1 !~ []); + bt_assert(1 !~ is); is = [ 2, 3, 4, 7..11 ]; bt_assert(10 ~ is); @@ -170,6 +283,7 @@ int set is; bt_assert([1,4..10,20] = [1,4..10,20]); bt_assert(format([ 1, 2, 1, 1, 1, 3, 4, 1, 1, 1, 5 ]) = "[1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5]"); + bt_assert(format([]) = "[]"); } bt_test_suite(t_int_set, "Testing sets of integers"); @@ -183,9 +297,8 @@ bt_test_suite(t_int_set, "Testing sets of integers"); */ function t_string() -string st; { - st = "Hello"; + string st = "Hello"; bt_assert(format(st) = "Hello"); bt_assert(st ~ "Hell*"); bt_assert(st ~ "?ello"); @@ -210,9 +323,8 @@ function 'mkpair-a'(int a) } function t_pair() -pair pp; { - pp = (1, 2); + pair pp = (1, 2); bt_assert(format(pp) = "(1,2)"); bt_assert((1,2) = pp); bt_assert((1,1+1) = pp); @@ -233,10 +345,11 @@ bt_test_suite(t_pair, "Testing pairs"); */ function t_pair_set() -pair pp; -pair set ps; { - pp = (1, 2); + pair pp = (1, 2); + pair set ps = []; + bt_assert(pp !~ ps); + ps = [(1,(one+one)), (3,4)..(4,8), (5,*), (6,3..6)]; bt_assert(format(ps) = "[(1,2), (3,4)..(4,8), (5,0)..(5,65535), (6,3)..(6,6)]"); bt_assert(pp ~ ps); @@ -253,6 +366,7 @@ pair set ps; bt_assert((6,6+one) !~ ps); bt_assert(((one+6),2) !~ ps); bt_assert((1,1) !~ ps); + bt_assert(pp !~ []); ps = [(20..150, 200..300), (50100..50200, 1000..50000), (*, 5+5)]; bt_assert((100,200) ~ ps); @@ -304,6 +418,7 @@ quad qq; qq = 1.2.3.4; bt_assert(qq ~ [1.2.3.4, 5.6.7.8]); bt_assert(qq !~ [1.2.1.1, 1.2.3.5]); + bt_assert(qq !~ []); } bt_test_suite(t_quad_set, "Testing sets of quads"); @@ -335,6 +450,26 @@ ip p; p = 1234:5678::; bt_assert(!p.is_v4); bt_assert(p.mask(24) = 1234:5600::); + + p = 1:2:3:4:5:6:7:8; + bt_assert(!p.is_v4); + bt_assert(format(p) = "1:2:3:4:5:6:7:8"); + bt_assert(p.mask(64) = 1:2:3:4::); + + p = 10:20:30:40:50:60:70:80; + bt_assert(!p.is_v4); + bt_assert(format(p) = "10:20:30:40:50:60:70:80"); + bt_assert(p.mask(64) = 10:20:30:40::); + + p = 1090:20a0:30b0:40c0:50d0:60e0:70f0:8000; + bt_assert(!p.is_v4); + bt_assert(format(p) = "1090:20a0:30b0:40c0:50d0:60e0:70f0:8000"); + bt_assert(p.mask(64) = 1090:20a0:30b0:40c0::); + + p = ::fffe:6:c0c:936d:88c7:35d3; + bt_assert(!p.is_v4); + bt_assert(format(p) = "::fffe:6:c0c:936d:88c7:35d3"); + bt_assert(p.mask(64) = 0:0:fffe:6::); } bt_test_suite(t_ip, "Testing ip address"); @@ -364,6 +499,7 @@ ip set ips; bt_assert(1.2.3.4 !~ [ 1.2.3.3, 1.2.3.5 ]); bt_assert(1.2.3.4 ~ [ 1.2.3.3..1.2.3.5 ]); + bt_assert(1.2.3.4 !~ []); } bt_test_suite(t_ip_set, "Testing sets of ip address"); @@ -378,9 +514,9 @@ bt_test_suite(t_ip_set, "Testing sets of ip address"); function t_enum() { - bt_assert(format(RTS_STATIC) = "(enum 30)1"); - bt_assert(format(NET_IP4) = "(enum 36)1"); - bt_assert(format(NET_VPN6) = "(enum 36)4"); + bt_assert(format(RTS_STATIC) = "(enum 31)1"); + bt_assert(format(NET_IP4) = "(enum 3b)1"); + bt_assert(format(NET_VPN6) = "(enum 3b)4"); bt_assert(RTS_STATIC ~ [RTS_STATIC, RTS_DEVICE]); bt_assert(RTS_BGP !~ [RTS_STATIC, RTS_DEVICE]); @@ -452,13 +588,34 @@ function test_pxset(prefix set pxs) bt_assert(1.0.0.0/8 ~ [ 1.0.0.0/8+ ]); bt_assert(1.0.0.0/9 !~ [ 1.0.0.0/8- ]); bt_assert(1.2.0.0/17 !~ [ 1.0.0.0/8{ 15 , 16 } ]); + bt_assert(net10 !~ []); bt_assert([ 10.0.0.0/8{ 15 , 17 } ] = [ 10.0.0.0/8{ 15 , 17 } ]); } +function test_empty_pxset(prefix set pxs) +int set s0; +prefix set s1; +{ + s0 = []; + s1 = []; + bt_assert(pxs != s0); + bt_assert(pxs = s1); + bt_assert(pxs = []); +} + function t_prefix_set() prefix set pxs; { + pxs = []; + bt_assert(format(pxs) = "[]"); + bt_assert(pxs = []); + bt_assert(1.2.0.0/16 !~ []); + bt_assert(1.2.0.0/16 !~ pxs); + + test_empty_pxset([]); + test_empty_pxset(pxs); + pxs = [ 1.2.0.0/16, 1.4.0.0/16+, 44.66.88.64/30{24,28}, 12.34.56.0/24{8,16} ]; bt_assert(format(pxs) = "[1.2.0.0/16{0.1.0.0}, 1.4.0.0/16{0.1.255.255}, 12.34.0.0/16{1.255.0.0}, 44.66.88.64/28{0.0.1.240}]"); @@ -478,6 +635,33 @@ prefix set pxs; bt_assert(1.2.0.0/16 ~ [ 1.0.0.0/8{ 15 , 17 } ]); bt_assert([ 10.0.0.0/8{ 15 , 17 } ] != [ 11.0.0.0/8{ 15 , 17 } ]); + + /* Formatting of prefix sets, some cases are a bit strange */ + bt_assert(format([ 0.0.0.0/0 ]) = "[0.0.0.0/0]"); + bt_assert(format([ 10.10.0.0/32 ]) = "[10.10.0.0/32{0.0.0.1}]"); + bt_assert(format([ 10.10.0.0/17 ]) = "[10.10.0.0/17{0.0.128.0}]"); + bt_assert(format([ 10.10.0.0/17{17,19} ]) = "[10.10.0.0/17{0.0.224.0}]"); # 224 = 128+64+32 + bt_assert(format([ 10.10.128.0/17{18,19} ]) = "[10.10.128.0/18{0.0.96.0}, 10.10.192.0/18{0.0.96.0}]"); # 96 = 64+32 + bt_assert(format([ 10.10.64.0/18- ]) = "[0.0.0.0/0, 0.0.0.0/1{128.0.0.0}, 0.0.0.0/2{64.0.0.0}, 0.0.0.0/3{32.0.0.0}, 10.10.0.0/16{255.255.0.0}, 10.10.0.0/17{0.0.128.0}, 10.10.64.0/18{0.0.64.0}]"); + bt_assert(format([ 10.10.64.0/18+ ]) = "[10.10.64.0/18{0.0.96.0}, 10.10.64.0/20{0.0.31.255}, 10.10.80.0/20{0.0.31.255}, 10.10.96.0/20{0.0.31.255}, 10.10.112.0/20{0.0.31.255}]"); + + bt_assert(format([ 10.10.160.0/19 ]) = "[10.10.160.0/19{0.0.32.0}]"); + bt_assert(format([ 10.10.160.0/19{19,22} ]) = "[10.10.160.0/19{0.0.32.0}, 10.10.160.0/20{0.0.28.0}, 10.10.176.0/20{0.0.28.0}]"); # 28 = 16+8+4 + bt_assert(format([ 10.10.160.0/19+ ]) = "[10.10.160.0/19{0.0.32.0}, 10.10.160.0/20{0.0.31.255}, 10.10.176.0/20{0.0.31.255}]"); + + bt_assert(format([ ::/0 ]) = "[::/0]"); + bt_assert(format([ 11:22:33:44:55:66:77:88/128 ]) = "[11:22:33:44:55:66:77:88/128{::1}]"); + bt_assert(format([ 11:22:33:44::/64 ]) = "[11:22:33:44::/64{0:0:0:1::}]"); + bt_assert(format([ 11:22:33:44::/64+ ]) = "[11:22:33:44::/64{::1:ffff:ffff:ffff:ffff}]"); + + bt_assert(format([ 11:22:33:44::/65 ]) = "[11:22:33:44::/65{::8000:0:0:0}]"); + bt_assert(format([ 11:22:33:44::/65{65,67} ]) = "[11:22:33:44::/65{::e000:0:0:0}]"); # e = 8+4+2 + bt_assert(format([ 11:22:33:44:8000::/65{66,67} ]) = "[11:22:33:44:8000::/66{::6000:0:0:0}, 11:22:33:44:c000::/66{::6000:0:0:0}]"); # 6 = 4+2 + bt_assert(format([ 11:22:33:44:4000::/66- ]) = "[::/0, ::/1{8000::}, ::/2{4000::}, ::/3{2000::}, 11:22:33:44::/64{ffff:ffff:ffff:ffff::}, 11:22:33:44::/65{::8000:0:0:0}, 11:22:33:44:4000::/66{::4000:0:0:0}]"); + bt_assert(format([ 11:22:33:44:4000::/66+ ]) = "[11:22:33:44:4000::/66{::6000:0:0:0}, 11:22:33:44:4000::/68{::1fff:ffff:ffff:ffff}, 11:22:33:44:5000::/68{::1fff:ffff:ffff:ffff}, 11:22:33:44:6000::/68{::1fff:ffff:ffff:ffff}, 11:22:33:44:7000::/68{::1fff:ffff:ffff:ffff}]"); + bt_assert(format([ 11:22:33:44:c000::/67 ]) = "[11:22:33:44:c000::/67{::2000:0:0:0}]"); + bt_assert(format([ 11:22:33:44:c000::/67{67,71} ]) = "[11:22:33:44:c000::/67{::2000:0:0:0}, 11:22:33:44:c000::/68{::1e00:0:0:0}, 11:22:33:44:d000::/68{::1e00:0:0:0}]"); + bt_assert(format([ 11:22:33:44:c000::/67+ ]) = "[11:22:33:44:c000::/67{::2000:0:0:0}, 11:22:33:44:c000::/68{::1fff:ffff:ffff:ffff}, 11:22:33:44:d000::/68{::1fff:ffff:ffff:ffff}]"); } bt_test_suite(t_prefix_set, "Testing prefix sets"); @@ -516,6 +700,12 @@ bt_test_suite(t_prefix6, "Testing prefix IPv6"); function t_prefix6_set() prefix set pxs; { + pxs = []; + bt_assert(format(pxs) = "[]"); + bt_assert(pxs = []); + bt_assert(12::34/128 !~ []); + bt_assert(12::34/128 !~ pxs); + bt_assert(1180::/16 ~ [ 1100::/8{15, 17} ]); bt_assert(12::34 = 12::34); bt_assert(12::34 ~ [ 12::33..12::35 ]); @@ -557,6 +747,12 @@ prefix set pxs; bt_assert(2000::/29 !~ pxs); bt_assert(1100::/10 !~ pxs); bt_assert(2010::/26 !~ pxs); + + pxs = [ 52E0::/13{13,128} ]; + bt_assert(52E7:BE81:379B:E6FD:541F:B0D0::/93 ~ pxs); + + pxs = [ 41D8:8718::/30{0,30}, 413A:99A8:6C00::/38{38,128} ]; + bt_assert(4180::/9 ~ pxs); } bt_test_suite(t_prefix6_set, "Testing prefix IPv6 sets"); @@ -627,6 +823,7 @@ int set set12; bt_assert(3 ~ p2); bt_assert(p2 ~ [2, 10..20]); bt_assert(p2 ~ [4, 10..20]); + bt_assert(p2 !~ []); p2 = prepend(p2, 5); bt_assert(p2 !~ pm1); @@ -637,6 +834,8 @@ int set set12; bt_assert(p2 ~ [= 5 [2, 4, 6] 3 [1..2] 1 =]); bt_assert(p2 ~ [= 5 set35 3 set12 set12 =]); bt_assert(p2 ~ mkpath(5, 4)); + bt_assert(p2 ~ [= * [3] * =]); + bt_assert(p2 !~ [= * [] * =]); bt_assert(p2.len = 5); bt_assert(p2.first = 5); @@ -645,6 +844,10 @@ int set set12; bt_assert(p2.len = 5); bt_assert(delete(p2, 3) = prepend(prepend(prepend(prepend(+empty+, 1), 2), 4), 5)); bt_assert(filter(p2, [1..3]) = prepend(prepend(prepend(+empty+, 1), 2), 3)); + bt_assert(delete(p2, []) = p2); + bt_assert(filter(p2, []) = +empty+); + bt_assert(delete(prepend(prepend(+empty+, 0), 1), []) = prepend(prepend(+empty+, 0), 1)); + bt_assert(filter(prepend(prepend(+empty+, 0), 1), []) = +empty+); p2 = prepend( + empty +, 5 ); p2 = prepend( p2, 4 ); @@ -664,6 +867,15 @@ int set set12; bt_assert(delete(p2, [4..5]) = prepend(prepend(prepend(prepend(+empty+, 3), 3), 2), 1)); bt_assert(format([= 1 2+ 3 =]) = "[= 1 2 + 3 =]"); + + # iteration over path + int x = 0; + int y = 0; + for int i in p2 do { + x = x + i; + y = y + x; + } + bt_assert(x = 18 && y = 50); } bt_test_suite(t_path, "Testing paths"); @@ -683,6 +895,11 @@ clist l; clist l2; clist r; { + bt_assert((10, 20).asn = 10); + bt_assert((10, 20).data = 20); + bt_assert(p23.asn = 2); + bt_assert(p23.data = 3); + l = - empty -; bt_assert(l !~ [(*,*)]); bt_assert((l ~ [(*,*)]) != (l !~ [(*,*)])); @@ -700,6 +917,7 @@ clist r; bt_assert(l ~ [(2,2..3)]); bt_assert(l ~ [(1,1..2)]); bt_assert(l ~ [(1,1)..(1,2)]); + bt_assert(l !~ []); l = add(l, (2,5)); l = add(l, (5,one)); @@ -737,6 +955,9 @@ clist r; bt_assert(l !~ [(*,(one+6))]); bt_assert(l !~ [(*, (one+one+one))]); + bt_assert(delete(l, []) = l); + bt_assert(filter(l, []) = -empty-); + l = delete(l, [(*,(one+onef(3)))]); l = delete(l, [(*,(4+one))]); bt_assert(l = add(-empty-, (3,1))); @@ -775,6 +996,18 @@ clist r; r = filter(l, [(3,1), (*,2)]); bt_assert(r = add(add(-empty-, (3,1)), (3,2))); bt_assert(format(r) = "(clist (3,1) (3,2))"); + + # minimim & maximum element + r = add(add(add(add(add(-empty-, (2,1)), (1,3)), (2,2)), (3,1)), (2,3)); + bt_assert(format(r) = "(clist (2,1) (1,3) (2,2) (3,1) (2,3))"); + bt_assert(r.min = (1,3)); + bt_assert(r.max = (3,1)); + + # iteration over clist + int x = 0; + for pair c in r do + x = x + c.asn * c.asn * c.data; + bt_assert(x = 36); } bt_test_suite(t_clist, "Testing lists of communities"); @@ -848,11 +1081,15 @@ eclist r; bt_assert((ro, 10.20.30.40, 100) !~ el); bt_assert(el !~ [(rt, 10, 35..40)]); bt_assert(el !~ [(ro, 10, *)]); + bt_assert(el !~ []); el = add(el, (rt, 10, 40)); el2 = filter(el, [(rt, 10, 20..40)] ); el2 = add(el2, (rt, 10, 50)); + bt_assert(delete(el, []) = el); + bt_assert(filter(el, []) = --empty--); + # eclist A (1,30,40) bt_assert(el = add(add(add(--empty--, (rt, 10, 1)), (rt, 10, 30)), (rt, 10, 40))); bt_assert(format(el) = "(eclist (rt, 10, 1) (rt, 10, 30) (rt, 10, 40))"); @@ -880,6 +1117,19 @@ eclist r; r = filter(el, [(rt, 10, 1), (rt, 10, 25..30), (ro, 10, 40)]); bt_assert(r = add(add(--empty--, (rt, 10, 1)), (rt, 10, 30))); bt_assert(format(r) = "(eclist (rt, 10, 1) (rt, 10, 30))"); + + # minimim & maximum element + r = add(add(add(add(add(--empty--, (rt, 2, 1)), (rt, 1, 3)), (rt, 2, 2)), (rt, 3, 1)), (rt, 2, 3)); + bt_assert(format(r) = "(eclist (rt, 2, 1) (rt, 1, 3) (rt, 2, 2) (rt, 3, 1) (rt, 2, 3))"); + bt_assert(r.min = (rt, 1, 3)); + bt_assert(r.max = (rt, 3, 1)); + + # iteration over eclist + int x = 0; + for ec c in r do + if c > (rt, 2, 0) && c < (rt, 3, 0) then + x = x + 1; + bt_assert(x = 3); } bt_test_suite(t_eclist, "Testing lists of extended communities"); @@ -939,6 +1189,10 @@ lclist r; bt_assert(---empty--- = ---empty---); bt_assert((10, 20, 30) !~ ---empty---); + bt_assert((10, 20, 30).asn = 10); + bt_assert((10, 20, 30).data1 = 20); + bt_assert((10, 20, 30).data2 = 30); + ll = --- empty ---; ll = add(ll, (ten, 20, 30)); ll = add(ll, (1000, 2000, 3000)); @@ -960,6 +1214,9 @@ lclist r; ll2 = add(ll2, (30, 30, 30)); ll2 = add(ll2, (40, 40, 40)); + bt_assert(delete(ll, []) = ll); + bt_assert(filter(ll, []) = ---empty---); + # lclist A (10, 20, 30) bt_assert(format(ll) = "(lclist (10, 10, 10) (20, 20, 20) (30, 30, 30))"); @@ -985,6 +1242,25 @@ lclist r; r = filter(ll, [(5..15, *, *), (20, 15..25, *)]); bt_assert(r = add(add(---empty---, (10, 10, 10)), (20, 20, 20))); bt_assert(format(r) = "(lclist (10, 10, 10) (20, 20, 20))"); + + # minimim & maximum element + r = add(add(add(add(add(---empty---, (2, 3, 3)), (1, 2, 3)), (2, 3, 1)), (3, 1, 2)), (2, 1, 3)); + bt_assert(format(r) = "(lclist (2, 3, 3) (1, 2, 3) (2, 3, 1) (3, 1, 2) (2, 1, 3))"); + bt_assert(r.min = (1, 2, 3)); + bt_assert(r.max = (3, 1, 2)); + + # iteration over lclist + int x = 0; + int y = 0; + lc mx = (0, 0, 0); + for lc c in r do { + int asn2 = c.asn * c.asn; + x = x + asn2 * c.data1; + y = y + asn2 * c.data2; + if c > mx then mx = c; + } + bt_assert(x = 39 && y = 49); + bt_assert(mx = r.max); } bt_test_suite(t_lclist, "Testing lists of large communities"); @@ -1013,6 +1289,7 @@ lc set lls; bt_assert(ll !~ [(5,10,15), (10,21,30)]); bt_assert(ll !~ [(10,21..25,*)]); bt_assert(ll !~ [(11, *, *)]); + bt_assert(ll !~ []); lls = [(10, 10, 10), (20, 20, 15..25), (30, 30, *), (40, 35..45, *), (50, *, *), (55..65, *, *)]; bt_assert(format(lls) = "[(10, 10, 10), (20, 20, 15)..(20, 20, 25), (30, 30, 0)..(30, 30, 4294967295), (40, 35, 0)..(40, 45, 4294967295), (50, 0, 0)..(50, 4294967295, 4294967295), (55, 0, 0)..(65, 4294967295, 4294967295)]"); @@ -1069,6 +1346,10 @@ bt_test_suite(t_rd, "Testing route distinguishers"); function t_rd_set() rd set rds; { + rds = []; + bt_assert(rds = []); + bt_assert(10:20 !~ rds); + rds = [10:20, 100000:100..100000:200]; bt_assert(format(rds) = "[10:20, 100000:100..100000:200]"); @@ -1079,6 +1360,7 @@ rd set rds; bt_assert(100000:128 ~ rds); bt_assert(100000:200 ~ rds); bt_assert(100010:150 !~ rds); + bt_assert(100010:150 !~ []); } bt_test_suite(t_rd_set, "Testing sets of route distinguishers"); @@ -1145,7 +1427,85 @@ function fifteen() return 15; } +function local_vars(int j) +{ + int k = 10; + bt_assert(j = 5 && k = 10); + { + int j = 15; + k = 20; + bt_assert(j = 15 && k = 20); + } + bt_assert(j = 5 && k = 20); + + if j < 10 then + { + int j = 25; + string k = "hello"; + bt_assert(j = 25 && k = "hello"); + } + bt_assert(j = 5 && k = 20); + + int m = 100; + { + j = 35; + int k = 40; + bt_assert(j = 35 && k = 40 && m = 100); + } + bt_assert(j = 35 && k = 20 && m = 100); +} + +function factorial(int x) +{ + if x = 0 then return 0; + if x = 1 then return 1; + else return x * factorial(x - 1); +} + +function fibonacci(int x) +{ + if x = 0 then return 0; + if x = 1 then return 1; + else return fibonacci(x - 1) + fibonacci(x - 2); +} + +function hanoi_init(int a; int b) +{ + if b = 0 + then return +empty+; + else return prepend(hanoi_init(a + 1, b - 1), a); +} + +function hanoi_solve(int n; bgppath h_src; bgppath h_dst; bgppath h_aux; bool x; bool y) +{ + # x -> return src or dst + # y -> print state + + if n = 0 then { if x then return h_src; else return h_dst; } + + bgppath tmp1 = hanoi_solve(n - 1, h_src, h_aux, h_dst, true, y); + bgppath tmp2 = hanoi_solve(n - 1, h_src, h_aux, h_dst, false, false); + h_src = tmp1; + h_aux = tmp2; + + int v = h_src.first; + # bt_assert(h_dst = +empty+ || v < h_dst.first); + h_src = delete(h_src, v); + h_dst = prepend(h_dst, v); + + if y then + print "move: ", v, " src: ", h_src, " dst:", h_dst, " aux:", h_aux; + + tmp1 = hanoi_solve(n - 1, h_aux, h_dst, h_src, true, y); + tmp2 = hanoi_solve(n - 1, h_aux, h_dst, h_src, false, false); + h_aux = tmp1; + h_dst = tmp2; + + if x then return h_src; else return h_dst; +} + function t_call_function() +bgppath h_src; { bt_assert(fifteen() = 15); @@ -1157,6 +1517,17 @@ function t_call_function() bt_assert(callme(4, 4) = 16); bt_assert(callme(7, 2) = 14); bt_assert(callmeagain(1, 2, 3) = 6); + local_vars(5); + + bt_assert(factorial(5) = 120); + bt_assert(factorial(10) = 3628800); + + bt_assert(fibonacci(10) = 55); + bt_assert(fibonacci(20) = 6765); + + h_src = hanoi_init(1, 6); + bt_assert(format(h_src) = "(path 1 2 3 4 5 6)"); + bt_assert(hanoi_solve(6, h_src, +empty+, +empty+, false, false) = h_src); } bt_test_suite(t_call_function, "Testing calling functions"); @@ -1243,6 +1614,7 @@ function __test2() filter testf int j; +bool t; { print "Heya, filtering route to ", net.ip, " prefixlen ", net.len, " source ", source; print "This route was from ", from; @@ -1254,6 +1626,54 @@ int j; rip_metric = 14; unset(rip_metric); + preference = 1234; + + test_ca_int1 = 42; + test_ca_ip2 = 1.3.5.7; + test_ca_quad3 = 2.4.6.8; + test_ca_bgppath4 = +empty+; + test_ca_clist5 = -empty-; + test_ca_eclist6 = --empty--; + test_ca_lclist7 = ---empty---; + + igp_metric = 53; + babel_metric = 64; + t = defined(babel_router_id); + j = babel_seqno; + + bgp_origin = ORIGIN_IGP; + bgp_path = +empty+; + bgp_next_hop = 3456:789a:bcde:f012::3456:789a; + bgp_med = 71; + bgp_local_pref = 942; + t = defined(bgp_atomic_aggr); + t = defined(bgp_aggregator); + bgp_community = -empty-; + bgp_originator_id = 9.7.5.3; + bgp_cluster_list = -empty-; + bgp_ext_community = --empty--; + t = defined(bgp_aigp); + bgp_large_community = ---empty---; + t = defined(bgp_mpls_label_stack); + + ospf_metric1 = 64; + ospf_metric2 = 111; + ospf_tag = 654432; + + radv_preference = RA_PREF_LOW; + radv_lifetime = 28; + + rip_metric = 2; + rip_tag = 4; + t = defined(rip_from); + + krt_source = 17; + krt_metric = 19; + +# krt_lock_mtu = false; +# krt_lock_window = true; +# krt_lock_rtt = krt_lock_rttvar && krt_lock_sstresh || krt_lock_cwnd; + accept "ok I take that"; } @@ -1353,13 +1773,16 @@ filter vpn_filter bt_assert(net.type != NET_IP6); bt_assert(net.rd = 0:1:2); + bool b = false; case (net.type) { NET_IP4: print "IPV4"; NET_IP6: print "IPV6"; + else: b = true; } + bt_assert(b); bt_check_assign(from, 10.20.30.40); - bt_check_assign(gw, 55.55.55.44); + # bt_check_assign(gw, 55.55.55.44); bgp_community.add((3,5)); bgp_ext_community.add((ro, 135, 999)); diff --git a/filter/test.conf2 b/filter/test.conf2 index e95f9563..9fc8330f 100644 --- a/filter/test.conf2 +++ b/filter/test.conf2 @@ -38,12 +38,6 @@ protocol static { print from; from = 1.2.3.4; print from; - print scope; - scope = SCOPE_HOST; - print scope; - if !(scope ~ [ SCOPE_HOST, SCOPE_SITE ]) then { - print "Failed in test"; - } preference = 15; print preference; diff --git a/filter/tree_test.c b/filter/tree_test.c index 6472d17e..d180efbc 100644 --- a/filter/tree_test.c +++ b/filter/tree_test.c @@ -19,10 +19,7 @@ static void start_conf_env(void) { bt_bird_init(); - - pool *p = rp_new(&root_pool, "helper_pool"); - linpool *l = lp_new_default(p); - cfg_mem = l; + cfg_mem = tmp_linpool; } static struct f_tree * @@ -173,6 +170,8 @@ t_balancing(void) show_tree(balanced_tree_from_simple); bt_assert(same_tree(balanced_tree_from_simple, expected_balanced_tree)); + + tmp_flush(); } return 1; @@ -194,6 +193,9 @@ t_balancing_random(void) uint i; for(i = 0; i < 10; i++) { + struct lp_state lps; + lp_save(tmp_linpool, &lps); + struct f_tree *random_degenerated_tree = get_random_degenerated_left_tree(nodes_count); show_tree(random_degenerated_tree); @@ -203,7 +205,11 @@ t_balancing_random(void) show_tree(balanced_tree_from_random); bt_assert(same_tree(balanced_tree_from_random, expected_balanced_tree)); + + lp_restore(tmp_linpool, &lps); } + + tmp_flush(); } return 1; @@ -230,6 +236,8 @@ t_find(void) const struct f_tree *found_tree = find_tree(tree, &looking_up_value); bt_assert((val_compare(&looking_up_value, &(found_tree->from)) == 0) && (val_compare(&looking_up_value, &(found_tree->to)) == 0)); } + + tmp_flush(); } return 1; @@ -286,6 +294,8 @@ t_find_ranges(void) ((val_compare(&needle, &(found_tree->from)) == 1) && (val_compare(&needle, &(found_tree->to)) == -1)) ); } + + tmp_flush(); } return 1; diff --git a/filter/trie.c b/filter/trie.c index 1a4e1ac3..12ba0b82 100644 --- a/filter/trie.c +++ b/filter/trie.c @@ -1,7 +1,8 @@ /* * Filters: Trie for prefix sets * - * Copyright 2009 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2009--2021 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2009--2021 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -9,53 +10,68 @@ /** * DOC: Trie for prefix sets * - * We use a (compressed) trie to represent prefix sets. Every node - * in the trie represents one prefix (&addr/&plen) and &plen also - * indicates the index of the bit in the address that is used to - * branch at the node. If we need to represent just a set of - * prefixes, it would be simple, but we have to represent a - * set of prefix patterns. Each prefix pattern consists of - * &ppaddr/&pplen and two integers: &low and &high, and a prefix - * &paddr/&plen matches that pattern if the first MIN(&plen, &pplen) - * bits of &paddr and &ppaddr are the same and &low <= &plen <= &high. - * - * We use a bitmask (&accept) to represent accepted prefix lengths - * at a node. As there are 33 prefix lengths (0..32 for IPv4), but - * there is just one prefix of zero length in the whole trie so we - * have &zero flag in &f_trie (indicating whether the trie accepts - * prefix 0.0.0.0/0) as a special case, and &accept bitmask + * We use a (compressed) trie to represent prefix sets. Every node in the trie + * represents one prefix (&addr/&plen) and &plen also indicates the index of + * bits in the address that are used to branch at the node. Note that such + * prefix is not necessary a member of the prefix set, it is just a canonical + * prefix associated with a node. Prefix lengths of nodes are aligned to + * multiples of &TRIE_STEP (4) and there is 16-way branching in each + * node. Therefore, we say that a node is associated with a range of prefix + * lengths (&plen .. &plen + TRIE_STEP - 1). + * + * The prefix set is not just a set of prefixes, it is defined by a set of + * prefix patterns. Each prefix pattern consists of &ppaddr/&pplen and two + * integers: &low and &high. The tested prefix &paddr/&plen matches that pattern + * if the first MIN(&plen, &pplen) bits of &paddr and &ppaddr are the same and + * &low <= &plen <= &high. + * + * There are two ways to represent accepted prefixes for a node. First, there is + * a bitmask &local, which represents independently all 15 prefixes that extend + * the canonical prefix of the node and are within a range of prefix lengths + * associated with the node. E.g., for node 10.0.0.0/8 they are 10.0.0.0/8, + * 10.0.0.0/9, 10.128.0.0/9, .. 10.224.0.0/11. This order (first by length, then + * lexicographically) is used for indexing the bitmask &local, starting at + * position 1. I.e., index is 2^(plen - base) + offset within the same length, + * see function trie_local_mask6() for details. + * + * Second, we use a bitmask &accept to represent accepted prefix lengths at a + * node. The bit is set means that all prefixes of given length that are either + * subprefixes or superprefixes of the canonical prefix are accepted. As there + * are 33 prefix lengths (0..32 for IPv4), but there is just one prefix of zero + * length in the whole trie so we have &zero flag in &f_trie (indicating whether + * the trie accepts prefix 0.0.0.0/0) as a special case, and &accept bitmask * represents accepted prefix lengths from 1 to 32. * - * There are two cases in prefix matching - a match when the length - * of the prefix is smaller that the length of the prefix pattern, - * (&plen < &pplen) and otherwise. The second case is simple - we - * just walk through the trie and look at every visited node - * whether that prefix accepts our prefix length (&plen). The - * first case is tricky - we don't want to examine every descendant - * of a final node, so (when we create the trie) we have to propagate - * that information from nodes to their ascendants. - * - * Suppose that we have two masks (M1 and M2) for a node. Mask M1 - * represents accepted prefix lengths by just the node and mask M2 - * represents accepted prefix lengths by the node or any of its - * descendants. Therefore M2 is a bitwise or of M1 and children's - * M2 and this is a maintained invariant during trie building. - * Basically, when we want to match a prefix, we walk through the trie, - * check mask M1 for our prefix length and when we came to - * final node, we check mask M2. - * - * There are two differences in the real implementation. First, - * we use a compressed trie so there is a case that we skip our - * final node (if it is not in the trie) and we came to node that - * is either extension of our prefix, or completely out of path - * In the first case, we also have to check M2. - * - * Second, we really need not to maintain two separate bitmasks. - * Checks for mask M1 are always larger than &applen and we need - * just the first &pplen bits of mask M2 (if trie compression - * hadn't been used it would suffice to know just $applen-th bit), - * so we have to store them together in &accept mask - the first - * &pplen bits of mask M2 and then mask M1. + * One complication is handling of prefix patterns with unaligned prefix length. + * When such pattern is to be added, we add a primary node above (with rounded + * down prefix length &nlen) and a set of secondary nodes below (with rounded up + * prefix lengths &slen). Accepted prefix lengths of the original prefix pattern + * are then represented in different places based on their lengths. For prefixes + * shorter than &nlen, it is &accept bitmask of the primary node, for prefixes + * between &nlen and &slen - 1 it is &local bitmask of the primary node, and for + * prefixes longer of equal &slen it is &accept bitmasks of secondary nodes. + * + * There are two cases in prefix matching - a match when the length of the + * prefix is smaller that the length of the prefix pattern, (&plen < &pplen) and + * otherwise. The second case is simple - we just walk through the trie and look + * at every visited node whether that prefix accepts our prefix length (&plen). + * The first case is tricky - we do not want to examine every descendant of a + * final node, so (when we create the trie) we have to propagate that + * information from nodes to their ascendants. + * + * There are two kinds of propagations - propagation from child's &accept + * bitmask to parent's &accept bitmask, and propagation from child's &accept + * bitmask to parent's &local bitmask. The first kind is simple - as all + * superprefixes of a parent are also all superprefixes of appropriate length of + * a child, then we can just add (by bitwise or) a child &accept mask masked by + * parent prefix length mask to the parent &accept mask. This handles prefixes + * shorter than node &plen. + * + * The second kind of propagation is necessary to handle superprefixes of a + * child that are represented by parent &local mask - that are in the range of + * prefix lengths associated with the parent. For each accepted (by child + * &accept mask) prefix length from that range, we need to set appropriate bit + * in &local mask. See function trie_amask_to_local() for details. * * There are four cases when we walk through a trie: * @@ -65,8 +81,32 @@ * - we are beyond the end of path (node length > &plen) * - we are still on path and keep walking (node length < &plen) * - * The walking code in trie_match_prefix() is structured according to - * these cases. + * The walking code in trie_match_net() is structured according to these cases. + * + * Iteration over prefixes in a trie can be done using TRIE_WALK() macro, or + * directly using trie_walk_init() and trie_walk_next() functions. The second + * approach allows suspending the iteration and continuing in it later. + * Prefixes are enumerated in the usual lexicographic order and may be + * restricted to a subset of the trie (all subnets of a specified prefix). + * + * Note that the trie walk does not reliably enumerate `implicit' prefixes + * defined by &low and &high fields in prefix patterns, it is supposed to be + * used on tries constructed from `explicit' prefixes (&low == &plen == &high + * in call to trie_add_prefix()). + * + * The trie walk has three basic state variables stored in the struct + * &f_trie_walk_state -- the current node in &stack[stack_pos], &accept_length + * for iteration over inter-node prefixes (non-branching prefixes on compressed + * path between the current node and its parent node, stored in the bitmap + * &accept of the current node) and &local_pos for iteration over intra-node + * prefixes (stored in the bitmap &local). + * + * The trie also supports longest-prefix-match query by trie_match_longest_ip4() + * and it can be extended to iteration over all covering prefixes for a given + * prefix (from longest to shortest) using TRIE_WALK_TO_ROOT_IP4() macro. There + * are also IPv6 versions (for practical reasons, these functions and macros are + * separate for IPv4 and IPv6). There is the same limitation to enumeration of + * `implicit' prefixes like with the previous TRIE_WALK() macro. */ #include "nest/bird.h" @@ -86,7 +126,10 @@ #define ipa_mkmask(x) ip6_mkmask(x) #define ipa_masklen(x) ip6_masklen(&x) #define ipa_pxlen(x,y) ip6_pxlen(x,y) -#define ipa_getbit(x,n) ip6_getbit(x,n) +#define ipa_getbit(a,p) ip6_getbit(a,p) +#define ipa_getbits(a,p,n) ip6_getbits(a,p,n) +#define ipa_setbits(a,p,n) ip6_setbits(a,p,n) +#define trie_local_mask(a,b,c) trie_local_mask6(a,b,c) #define ipt_from_ip4(x) _MI6(_I(x), 0, 0, 0) #define ipt_to_ip4(x) _MI4(_I0(x)) @@ -109,10 +152,11 @@ f_new_trie(linpool *lp, uint data_size) } static inline struct f_trie_node4 * -new_node4(struct f_trie *t, int plen, ip4_addr paddr, ip4_addr pmask, ip4_addr amask) +new_node4(struct f_trie *t, uint plen, uint local, ip4_addr paddr, ip4_addr pmask, ip4_addr amask) { struct f_trie_node4 *n = lp_allocz(t->lp, sizeof(struct f_trie_node4) + t->data_size); n->plen = plen; + n->local = local; n->addr = paddr; n->mask = pmask; n->accept = amask; @@ -120,10 +164,11 @@ new_node4(struct f_trie *t, int plen, ip4_addr paddr, ip4_addr pmask, ip4_addr a } static inline struct f_trie_node6 * -new_node6(struct f_trie *t, int plen, ip6_addr paddr, ip6_addr pmask, ip6_addr amask) +new_node6(struct f_trie *t, uint plen, uint local, ip6_addr paddr, ip6_addr pmask, ip6_addr amask) { struct f_trie_node6 *n = lp_allocz(t->lp, sizeof(struct f_trie_node6) + t->data_size); n->plen = plen; + n->local = local; n->addr = paddr; n->mask = pmask; n->accept = amask; @@ -131,24 +176,24 @@ new_node6(struct f_trie *t, int plen, ip6_addr paddr, ip6_addr pmask, ip6_addr a } static inline struct f_trie_node * -new_node(struct f_trie *t, int plen, ip_addr paddr, ip_addr pmask, ip_addr amask) +new_node(struct f_trie *t, uint plen, uint local, ip_addr paddr, ip_addr pmask, ip_addr amask) { if (t->ipv4) - return (struct f_trie_node *) new_node4(t, plen, ipt_to_ip4(paddr), ipt_to_ip4(pmask), ipt_to_ip4(amask)); + return (struct f_trie_node *) new_node4(t, plen, local, ipt_to_ip4(paddr), ipt_to_ip4(pmask), ipt_to_ip4(amask)); else - return (struct f_trie_node *) new_node6(t, plen, ipa_to_ip6(paddr), ipa_to_ip6(pmask), ipa_to_ip6(amask)); + return (struct f_trie_node *) new_node6(t, plen, local, ipa_to_ip6(paddr), ipa_to_ip6(pmask), ipa_to_ip6(amask)); } static inline void attach_node4(struct f_trie_node4 *parent, struct f_trie_node4 *child) { - parent->c[ip4_getbit(child->addr, parent->plen) ? 1 : 0] = child; + parent->c[ip4_getbits(child->addr, parent->plen, TRIE_STEP)] = child; } static inline void attach_node6(struct f_trie_node6 *parent, struct f_trie_node6 *child) { - parent->c[ip6_getbit(child->addr, parent->plen) ? 1 : 0] = child; + parent->c[ip6_getbits(child->addr, parent->plen, TRIE_STEP)] = child; } static inline void @@ -160,63 +205,96 @@ attach_node(struct f_trie_node *parent, struct f_trie_node *child, int v4) attach_node6(&parent->v6, &child->v6); } -#define GET_ADDR(N,F,X) ((X) ? ipt_from_ip4((N)->v4.F) : ipa_from_ip6((N)->v6.F)) -#define SET_ADDR(N,F,X,V) ({ if (X) (N)->v4.F =ipt_to_ip4(V); else (N)->v6.F =ipa_to_ip6(V); }) -#define GET_CHILD(N,F,X,I) ((X) ? (struct f_trie_node *) (N)->v4.c[I] : (struct f_trie_node *) (N)->v6.c[I]) -/** - * trie_add_prefix - * @t: trie to add to - * @net: IP network prefix - * @l: prefix lower bound - * @h: prefix upper bound +/* + * Internal prefixes of a node a represented by the local bitmask, each bit for + * one prefix. Bit 0 is unused, Bit 1 is for the main prefix of the node, + * remaining bits correspond to subprefixes by this pattern: * - * Adds prefix (prefix pattern) @n to trie @t. @l and @h are lower - * and upper bounds on accepted prefix lengths, both inclusive. - * 0 <= l, h <= 32 (128 for IPv6). + * 1 + * 2 3 + * 4 5 6 7 + * 8 9 A B C D E F * - * Returns a pointer to the allocated node. The function can return a pointer to - * an existing node if @px and @plen are the same. If px/plen == 0/0 (or ::/0), - * a pointer to the root node is returned. Returns NULL when called with - * mismatched IPv4/IPv6 net type. + * E.g. for 10.0.0.0/8 node, the 10.64.0.0/10 would be position 5. */ -void * -trie_add_prefix(struct f_trie *t, const net_addr *net, uint l, uint h) +/* + * Compute appropriate mask representing prefix px/plen in local bitmask of node + * with prefix length nlen. Assuming that nlen <= plen < (nlen + TRIE_STEP). + */ +static inline uint +trie_local_mask4(ip4_addr px, uint plen, uint nlen) { - uint plen = net_pxlen(net); - ip_addr px; - int v4; + uint step = plen - nlen; + uint pos = (1u << step) + ip4_getbits(px, nlen, step); + return 1u << pos; +} - switch (net->type) - { - case NET_IP4: px = ipt_from_ip4(net4_prefix(net)); v4 = 1; break; - case NET_IP6: px = ipa_from_ip6(net6_prefix(net)); v4 = 0; break; - default: bug("invalid type"); - } +static inline uint +trie_local_mask6(ip6_addr px, uint plen, uint nlen) +{ + uint step = plen - nlen; + uint pos = (1u << step) + ip6_getbits(px, nlen, step); + return 1u << pos; +} - if (t->ipv4 != v4) - { - if (t->ipv4 < 0) - t->ipv4 = v4; - else - return NULL; - } +/* + * Compute an appropriate local mask (for a node with prefix length nlen) + * representing prefixes of px that are accepted by amask and fall within the + * range associated with that node. Used for propagation of child accept mask + * to parent local mask. + */ +static inline uint +trie_amask_to_local(ip_addr px, ip_addr amask, uint nlen) +{ + uint local = 0; - if (l == 0) - t->zero = 1; - else - l--; + for (uint plen = MAX(nlen, 1); plen < (nlen + TRIE_STEP); plen++) + if (ipa_getbit(amask, plen - 1)) + local |= trie_local_mask(px, plen, nlen); - if (h < plen) - plen = h; + return local; +} + +/* + * Compute a bitmask representing a level of subprefixes (of the same length), + * using specified position as a root. E.g., level 2 from root position 3 would + * be bit positions C-F, returned as bitmask 0xf000. + */ +static inline uint +trie_level_mask(uint pos, uint level) +{ + return ((1u << (1u << level)) - 1) << (pos << level); +} - ip_addr amask = ipa_xor(ipa_mkmask(l), ipa_mkmask(h)); + +#define GET_ADDR(N,F,X) ((X) ? ipt_from_ip4((N)->v4.F) : ipa_from_ip6((N)->v6.F)) +#define SET_ADDR(N,F,X,V) ({ if (X) (N)->v4.F =ipt_to_ip4(V); else (N)->v6.F =ipa_to_ip6(V); }) + +#define GET_LOCAL(N,X) ((X) ? (N)->v4.local : (N)->v6.local) +#define ADD_LOCAL(N,X,V) ({ uint v_ = (V); if (X) (N)->v4.local |= v_; else (N)->v6.local |= v_; }) + +#define GET_CHILD(N,X,I) ((X) ? (struct f_trie_node *) (N)->v4.c[I] : (struct f_trie_node *) (N)->v6.c[I]) + + +static void * +trie_add_node(struct f_trie *t, uint plen, ip_addr px, uint local, uint l, uint h) +{ + uint l_ = l ? (l - 1) : 0; + ip_addr amask = (l_ < h) ? ipa_xor(ipa_mkmask(l_), ipa_mkmask(h)) : IPA_NONE; ip_addr pmask = ipa_mkmask(plen); ip_addr paddr = ipa_and(px, pmask); struct f_trie_node *o = NULL; struct f_trie_node *n = &t->root; + int v4 = t->ipv4; + /* Add all bits for each active level (0x0002 0x000c 0x00f0 0xff00) */ + for (uint i = 0; i < TRIE_STEP; i++) + if ((l <= (plen + i)) && ((plen + i) <= h)) + local |= trie_level_mask(1, i); + + DBG("Insert node %I/%u (%I %x)\n", paddr, plen, amask, local); while (n) { ip_addr naddr = GET_ADDR(n, addr, v4); @@ -225,23 +303,31 @@ trie_add_prefix(struct f_trie *t, const net_addr *net, uint l, uint h) ip_addr cmask = ipa_and(nmask, pmask); uint nlen = v4 ? n->v4.plen : n->v6.plen; + DBG("Found node %I/%u (%I %x)\n", + naddr, nlen, accept, v4 ? n->v4.local : n->v6.local); + if (ipa_compare(ipa_and(paddr, cmask), ipa_and(naddr, cmask))) { /* We are out of path - we have to add branching node 'b' between node 'o' and node 'n', and attach new node 'a' as the other child of 'b'. */ - int blen = ipa_pxlen(paddr, naddr); + int blen = ROUND_DOWN_POW2(ipa_pxlen(paddr, naddr), TRIE_STEP); ip_addr bmask = ipa_mkmask(blen); ip_addr baddr = ipa_and(px, bmask); /* Merge accept masks from children to get accept mask for node 'b' */ ip_addr baccm = ipa_and(ipa_or(amask, accept), bmask); + uint bloc = trie_amask_to_local(naddr, accept, blen) | + trie_amask_to_local(paddr, amask, blen); - struct f_trie_node *a = new_node(t, plen, paddr, pmask, amask); - struct f_trie_node *b = new_node(t, blen, baddr, bmask, baccm); + struct f_trie_node *a = new_node(t, plen, local, paddr, pmask, amask); + struct f_trie_node *b = new_node(t, blen, bloc, baddr, bmask, baccm); attach_node(o, b, v4); attach_node(b, n, v4); attach_node(b, a, v4); + t->prefix_count++; + + DBG("Case 1\n"); return a; } @@ -249,66 +335,195 @@ trie_add_prefix(struct f_trie *t, const net_addr *net, uint l, uint h) { /* We add new node 'a' between node 'o' and node 'n' */ amask = ipa_or(amask, ipa_and(accept, pmask)); - struct f_trie_node *a = new_node(t, plen, paddr, pmask, amask); + local |= trie_amask_to_local(naddr, accept, plen); + struct f_trie_node *a = new_node(t, plen, local, paddr, pmask, amask); attach_node(o, a, v4); attach_node(a, n, v4); + t->prefix_count++; + + DBG("Case 2\n"); return a; } if (plen == nlen) { - /* We already found added node in trie. Just update accept mask */ + /* We already found added node in trie. Just update accept and local mask */ accept = ipa_or(accept, amask); SET_ADDR(n, accept, v4, accept); + + if ((GET_LOCAL(n, v4) & local) != local) + t->prefix_count++; + + ADD_LOCAL(n, v4, local); + + DBG("Case 3\n"); return n; } /* Update accept mask part M2 and go deeper */ accept = ipa_or(accept, ipa_and(amask, nmask)); SET_ADDR(n, accept, v4, accept); + ADD_LOCAL(n, v4, trie_amask_to_local(paddr, amask, nlen)); + + DBG("Step %u\n", ipa_getbits(paddr, nlen)); /* n->plen < plen and plen <= 32 (128) */ o = n; - n = GET_CHILD(n, c, v4, ipa_getbit(paddr, nlen) ? 1 : 0); + n = GET_CHILD(n, v4, ipa_getbits(paddr, nlen, TRIE_STEP)); } /* We add new tail node 'a' after node 'o' */ - struct f_trie_node *a = new_node(t, plen, paddr, pmask, amask); + struct f_trie_node *a = new_node(t, plen, local, paddr, pmask, amask); attach_node(o, a, v4); + t->prefix_count++; + DBG("Case 4\n"); return a; } +/** + * trie_add_prefix + * @t: trie to add to + * @net: IP network prefix + * @l: prefix lower bound + * @h: prefix upper bound + * + * Adds prefix (prefix pattern) @n to trie @t. @l and @h are lower + * and upper bounds on accepted prefix lengths, both inclusive. + * 0 <= l, h <= 32 (128 for IPv6). + * + * Returns a pointer to the allocated node. The function can return a pointer to + * an existing node if @px and @plen are the same. If px/plen == 0/0 (or ::/0), + * a pointer to the root node is returned. Returns NULL when called with + * mismatched IPv4/IPv6 net type. + */ +void * +trie_add_prefix(struct f_trie *t, const net_addr *net, uint l, uint h) +{ + uint plen = net_pxlen(net); + ip_addr px; + int v4; + + switch (net->type) + { + case NET_IP4: + case NET_VPN4: + case NET_ROA4: + px = ipt_from_ip4(net4_prefix(net)); + v4 = 1; + break; + + case NET_IP6: + case NET_VPN6: + case NET_ROA6: + case NET_IP6_SADR: + px = ipa_from_ip6(net6_prefix(net)); + v4 = 0; + break; + + default: + bug("invalid type"); + } + + if (t->ipv4 != v4) + { + if (t->ipv4 < 0) + t->ipv4 = v4; + else + return NULL; + } + + DBG("\nInsert net %N (%u-%u)\n", net, l, h); + + if (l == 0) + t->zero = 1; + + if (h < plen) + plen = h; + + /* Primary node length, plen rounded down */ + uint nlen = ROUND_DOWN_POW2(plen, TRIE_STEP); + + if (plen == nlen) + return trie_add_node(t, nlen, px, 0, l, h); + + /* Secondary node length, plen rouned up */ + uint slen = nlen + TRIE_STEP; + void *node = NULL; + + /* + * For unaligned prefix lengths it is more complicated. We need to encode + * matching prefixes of lengths from l to h. There are three cases of lengths: + * + * 1) 0..nlen are encoded by the accept mask of the primary node + * 2) nlen..(slen-1) are encoded by the local mask of the primary node + * 3) slen..max are encoded in secondary nodes + */ + + if (l < slen) + { + uint local = 0; + + /* Compute local bits for accepted nlen..(slen-1) prefixes */ + for (uint i = 0; i < TRIE_STEP; i++) + if ((l <= (nlen + i)) && ((nlen + i) <= h)) + { + uint pos = (1u << i) + ipa_getbits(px, nlen, i); + uint len = ((nlen + i) <= plen) ? 1 : (1u << (nlen + i - plen)); + + /* We need to fill 'len' bits starting at 'pos' position */ + local |= ((1u << len) - 1) << pos; + } + + /* Add the primary node */ + node = trie_add_node(t, nlen, px, local, l, nlen); + } + + if (slen <= h) + { + uint l2 = MAX(l, slen); + uint max = (1u << (slen - plen)); + + /* Add secondary nodes */ + for (uint i = 0; i < max; i++) + node = trie_add_node(t, slen, ipa_setbits(px, slen - 1, i), 0, l2, h); + } + + return node; +} + + static int trie_match_net4(const struct f_trie *t, ip4_addr px, uint plen) { - ip4_addr pmask = ip4_mkmask(plen); - ip4_addr paddr = ip4_and(px, pmask); - if (plen == 0) return t->zero; int plentest = plen - 1; + uint nlen = ROUND_DOWN_POW2(plen, TRIE_STEP); + uint local = trie_local_mask4(px, plen, nlen); const struct f_trie_node4 *n = &t->root.v4; while (n) { - ip4_addr cmask = ip4_and(n->mask, pmask); - /* We are out of path */ - if (ip4_compare(ip4_and(paddr, cmask), ip4_and(n->addr, cmask))) + if (!ip4_prefix_equal(px, n->addr, MIN(plen, n->plen))) return 0; + /* Check local mask */ + if ((n->plen == nlen) && (n->local & local)) + return 1; + /* Check accept mask */ if (ip4_getbit(n->accept, plentest)) return 1; /* We finished trie walk and still no match */ - if (plen <= n->plen) + if (nlen <= n->plen) return 0; /* Choose children */ - n = n->c[(ip4_getbit(paddr, n->plen)) ? 1 : 0]; + n = n->c[ip4_getbits(px, n->plen, TRIE_STEP)]; } return 0; @@ -317,33 +532,34 @@ trie_match_net4(const struct f_trie *t, ip4_addr px, uint plen) static int trie_match_net6(const struct f_trie *t, ip6_addr px, uint plen) { - ip6_addr pmask = ip6_mkmask(plen); - ip6_addr paddr = ip6_and(px, pmask); - if (plen == 0) return t->zero; int plentest = plen - 1; + uint nlen = ROUND_DOWN_POW2(plen, TRIE_STEP); + uint local = trie_local_mask6(px, plen, nlen); const struct f_trie_node6 *n = &t->root.v6; while (n) { - ip6_addr cmask = ip6_and(n->mask, pmask); - /* We are out of path */ - if (ip6_compare(ip6_and(paddr, cmask), ip6_and(n->addr, cmask))) + if (!ip6_prefix_equal(px, n->addr, MIN(plen, n->plen))) return 0; + /* Check local mask */ + if ((n->plen == nlen) && (n->local & local)) + return 1; + /* Check accept mask */ if (ip6_getbit(n->accept, plentest)) return 1; /* We finished trie walk and still no match */ - if (plen <= n->plen) + if (nlen <= n->plen) return 0; /* Choose children */ - n = n->c[(ip6_getbit(paddr, n->plen)) ? 1 : 0]; + n = n->c[ip6_getbits(px, n->plen, TRIE_STEP)]; } return 0; @@ -378,6 +594,412 @@ trie_match_net(const struct f_trie *t, const net_addr *n) } } + +/** + * trie_match_longest_ip4 + * @t: trie + * @net: net address + * @dst: return value + * @found0: optional returned bitmask of found nodes + * + * Perform longest prefix match for the address @net and return the resulting + * prefix in the buffer @dst. The bitmask @found0 is used to report lengths of + * prefixes on the path from the root to the resulting prefix. E.g., if there is + * also a /20 shorter matching prefix, then 20-th bit is set in @found0. This + * can be used to enumerate all matching prefixes for the network @net using + * function trie_match_next_longest_ip4() or macro TRIE_WALK_TO_ROOT_IP4(). + * + * This function assumes IPv4 trie, there is also an IPv6 variant. The @net + * argument is typed as net_addr_ip4, but would accept any IPv4-based net_addr, + * like net4_prefix(). Anyway, returned @dst is always net_addr_ip4. + * + * Result: 1 if a matching prefix was found, 0 if not. + */ +int +trie_match_longest_ip4(const struct f_trie *t, const net_addr_ip4 *net, net_addr_ip4 *dst, ip4_addr *found0) +{ + ASSERT(t->ipv4); + + const ip4_addr prefix = net->prefix; + const int pxlen = net->pxlen; + + const struct f_trie_node4 *n = &t->root.v4; + int len = 0; + + ip4_addr found = IP4_NONE; + int last = -1; + + while (n) + { + /* We are out of path */ + if (!ip4_prefix_equal(prefix, n->addr, MIN(pxlen, n->plen))) + goto done; + + /* Check accept mask */ + for (; len < n->plen; len++) + { + if (len > pxlen) + goto done; + + if (ip4_getbit(n->accept, len - 1)) + { + /* len is always < 32 due to len < n->plen */ + ip4_setbit(&found, len); + last = len; + } + } + + /* Special case for max length, there is only one valid local position */ + if (len == IP4_MAX_PREFIX_LENGTH) + { + if (n->local & (1u << 1)) + last = len; + + goto done; + } + + /* Check local mask */ + for (int pos = 1; pos < (1 << TRIE_STEP); pos = 2 * pos + ip4_getbit(prefix, len), len++) + { + if (len > pxlen) + goto done; + + if (n->local & (1u << pos)) + { + /* len is always < 32 due to special case above */ + ip4_setbit(&found, len); + last = len; + } + } + + /* Choose child */ + n = n->c[ip4_getbits(prefix, n->plen, TRIE_STEP)]; + } + +done: + if (last < 0) + return 0; + + *dst = NET_ADDR_IP4(ip4_and(prefix, ip4_mkmask(last)), last); + + if (found0) + *found0 = found; + + return 1; +} + + +/** + * trie_match_longest_ip6 + * @t: trie + * @net: net address + * @dst: return value + * @found0: optional returned bitmask of found nodes + * + * Perform longest prefix match for the address @net and return the resulting + * prefix in the buffer @dst. The bitmask @found0 is used to report lengths of + * prefixes on the path from the root to the resulting prefix. E.g., if there is + * also a /20 shorter matching prefix, then 20-th bit is set in @found0. This + * can be used to enumerate all matching prefixes for the network @net using + * function trie_match_next_longest_ip6() or macro TRIE_WALK_TO_ROOT_IP6(). + * + * This function assumes IPv6 trie, there is also an IPv4 variant. The @net + * argument is typed as net_addr_ip6, but would accept any IPv6-based net_addr, + * like net6_prefix(). Anyway, returned @dst is always net_addr_ip6. + * + * Result: 1 if a matching prefix was found, 0 if not. + */ +int +trie_match_longest_ip6(const struct f_trie *t, const net_addr_ip6 *net, net_addr_ip6 *dst, ip6_addr *found0) +{ + ASSERT(!t->ipv4); + + const ip6_addr prefix = net->prefix; + const int pxlen = net->pxlen; + + const struct f_trie_node6 *n = &t->root.v6; + int len = 0; + + ip6_addr found = IP6_NONE; + int last = -1; + + while (n) + { + /* We are out of path */ + if (!ip6_prefix_equal(prefix, n->addr, MIN(pxlen, n->plen))) + goto done; + + /* Check accept mask */ + for (; len < n->plen; len++) + { + if (len > pxlen) + goto done; + + if (ip6_getbit(n->accept, len - 1)) + { + /* len is always < 128 due to len < n->plen */ + ip6_setbit(&found, len); + last = len; + } + } + + /* Special case for max length, there is only one valid local position */ + if (len == IP6_MAX_PREFIX_LENGTH) + { + if (n->local & (1u << 1)) + last = len; + + goto done; + } + + /* Check local mask */ + for (int pos = 1; pos < (1 << TRIE_STEP); pos = 2 * pos + ip6_getbit(prefix, len), len++) + { + if (len > pxlen) + goto done; + + if (n->local & (1u << pos)) + { + /* len is always < 128 due to special case above */ + ip6_setbit(&found, len); + last = len; + } + } + + /* Choose child */ + n = n->c[ip6_getbits(prefix, n->plen, TRIE_STEP)]; + } + +done: + if (last < 0) + return 0; + + *dst = NET_ADDR_IP6(ip6_and(prefix, ip6_mkmask(last)), last); + + if (found0) + *found0 = found; + + return 1; +} + +#define SAME_PREFIX(A,B,X,L) ((X) ? ip4_prefix_equal((A)->v4.addr, net4_prefix(B), (L)) : ip6_prefix_equal((A)->v6.addr, net6_prefix(B), (L))) +#define GET_NET_BITS(N,X,A,B) ((X) ? ip4_getbits(net4_prefix(N), (A), (B)) : ip6_getbits(net6_prefix(N), (A), (B))) + +/** + * trie_walk_init + * @s: walk state + * @t: trie + * @net: optional subnet for walk + * + * Initialize walk state for subsequent walk through nodes of the trie @t by + * trie_walk_next(). The argument @net allows to restrict walk to given subnet, + * otherwise full walk over all nodes is used. This is done by finding node at + * or below @net and starting position in it. + */ +void +trie_walk_init(struct f_trie_walk_state *s, const struct f_trie *t, const net_addr *net) +{ + *s = (struct f_trie_walk_state) { + .ipv4 = t->ipv4, + .accept_length = 0, + .start_pos = 1, + .local_pos = 1, + .stack_pos = 0, + .stack[0] = &t->root + }; + + if (!net) + return; + + /* We want to find node of level at least plen */ + int plen = ROUND_DOWN_POW2(net->pxlen, TRIE_STEP); + const struct f_trie_node *n = &t->root; + const int v4 = t->ipv4; + + while (n) + { + int nlen = v4 ? n->v4.plen : n->v6.plen; + + /* We are out of path */ + if (!SAME_PREFIX(n, net, v4, MIN(net->pxlen, nlen))) + break; + + /* We found final node */ + if (nlen >= plen) + { + if (nlen == plen) + { + /* Find proper local_pos, while accept_length is not used */ + int step = net->pxlen - plen; + s->start_pos = s->local_pos = (1u << step) + GET_NET_BITS(net, v4, plen, step); + s->accept_length = plen; + } + else + { + /* Start from pos 1 in local node, but first try accept mask */ + s->accept_length = net->pxlen; + } + + s->stack[0] = n; + return; + } + + /* Choose child */ + n = GET_CHILD(n, v4, GET_NET_BITS(net, v4, nlen, TRIE_STEP)); + } + + s->stack[0] = NULL; + return; +} + +#define GET_ACCEPT_BIT(N,X,B) ((X) ? ip4_getbit((N)->v4.accept, (B)) : ip6_getbit((N)->v6.accept, (B))) +#define GET_LOCAL_BIT(N,X,B) (((X) ? (N)->v4.local : (N)->v6.local) & (1u << (B))) + +/** + * trie_walk_next + * @s: walk state + * @net: return value + * + * Find the next prefix in the trie walk and return it in the buffer @net. + * Prefixes are walked in the usual lexicographic order and may be restricted + * to a subset of the trie during walk setup by trie_walk_init(). Note that the + * trie walk does not iterate reliably over 'implicit' prefixes defined by &low + * and &high fields in prefix patterns, it is supposed to be used on tries + * constructed from 'explicit' prefixes (&low == &plen == &high in call to + * trie_add_prefix()). + * + * Result: 1 if the next prefix was found, 0 for the end of walk. + */ +int +trie_walk_next(struct f_trie_walk_state *s, net_addr *net) +{ + const struct f_trie_node *n = s->stack[s->stack_pos]; + int len = s->accept_length; + int pos = s->local_pos; + int v4 = s->ipv4; + + /* + * The walk has three basic state variables -- n, len and pos. In each node n, + * we first walk superprefixes (by len in &accept bitmask), and then we walk + * internal positions (by pos in &local bitmask). These positions are: + * + * 1 + * 2 3 + * 4 5 6 7 + * 8 9 A B C D E F + * + * We walk them depth-first, including virtual positions 10-1F that are + * equivalent of position 1 in child nodes 0-F. + */ + + if (!n) + { + memset(net, 0, v4 ? sizeof(net_addr_ip4) : sizeof(net_addr_ip6)); + return 0; + } + +next_node:; + /* Current node prefix length */ + int nlen = v4 ? n->v4.plen : n->v6.plen; + + /* First, check for accept prefix */ + for (; len < nlen; len++) + if (GET_ACCEPT_BIT(n, v4, len - 1)) + { + if (v4) + net_fill_ip4(net, ip4_and(n->v4.addr, ip4_mkmask(len)), len); + else + net_fill_ip6(net, ip6_and(n->v6.addr, ip6_mkmask(len)), len); + + s->local_pos = pos; + s->accept_length = len + 1; + return 1; + } + +next_pos: + /* Bottom of this node */ + if (pos >= (1 << TRIE_STEP)) + { + const struct f_trie_node *child = GET_CHILD(n, v4, pos - (1 << TRIE_STEP)); + int dir = 0; + + /* No child node */ + if (!child) + { + /* Step up until return from left child (pos is even) */ + do + { + /* Step up from start node */ + if ((s->stack_pos == 0) && (pos == s->start_pos)) + { + s->stack[0] = NULL; + memset(net, 0, v4 ? sizeof(net_addr_ip4) : sizeof(net_addr_ip6)); + return 0; + } + + /* Top of this node */ + if (pos == 1) + { + ASSERT(s->stack_pos); + const struct f_trie_node *old = n; + + /* Move to parent node */ + s->stack_pos--; + n = s->stack[s->stack_pos]; + nlen = v4 ? n->v4.plen : n->v6.plen; + + pos = v4 ? + ip4_getbits(old->v4.addr, nlen, TRIE_STEP) : + ip6_getbits(old->v6.addr, nlen, TRIE_STEP); + pos += (1 << TRIE_STEP); + len = nlen; + + ASSERT(GET_CHILD(n, v4, pos - (1 << TRIE_STEP)) == old); + } + + /* Step up */ + dir = pos % 2; + pos = pos / 2; + } + while (dir); + + /* Continue with step down to the right child */ + pos = 2 * pos + 1; + goto next_pos; + } + + /* Move to child node */ + pos = 1; + len = nlen + TRIE_STEP; + + s->stack_pos++; + n = s->stack[s->stack_pos] = child; + goto next_node; + } + + /* Check for local prefix */ + if (GET_LOCAL_BIT(n, v4, pos)) + { + /* Convert pos to address of local network */ + int x = (pos >= 2) + (pos >= 4) + (pos >= 8); + int y = pos & ((1u << x) - 1); + + if (v4) + net_fill_ip4(net, !x ? n->v4.addr : ip4_setbits(n->v4.addr, nlen + x - 1, y), nlen + x); + else + net_fill_ip6(net, !x ? n->v6.addr : ip6_setbits(n->v6.addr, nlen + x - 1, y), nlen + x); + + s->local_pos = 2 * pos; + s->accept_length = len; + return 1; + } + + /* Step down */ + pos = 2 * pos; + goto next_pos; +} + + static int trie_node_same4(const struct f_trie_node4 *t1, const struct f_trie_node4 *t2) { @@ -392,7 +1014,11 @@ trie_node_same4(const struct f_trie_node4 *t1, const struct f_trie_node4 *t2) (! ip4_equal(t1->accept, t2->accept))) return 0; - return trie_node_same4(t1->c[0], t2->c[0]) && trie_node_same4(t1->c[1], t2->c[1]); + for (uint i = 0; i < (1 << TRIE_STEP); i++) + if (! trie_node_same4(t1->c[i], t2->c[i])) + return 0; + + return 1; } static int @@ -409,7 +1035,11 @@ trie_node_same6(const struct f_trie_node6 *t1, const struct f_trie_node6 *t2) (! ip6_equal(t1->accept, t2->accept))) return 0; - return trie_node_same6(t1->c[0], t2->c[0]) && trie_node_same6(t1->c[1], t2->c[1]); + for (uint i = 0; i < (1 << TRIE_STEP); i++) + if (! trie_node_same6(t1->c[i], t2->c[i])) + return 0; + + return 1; } /** @@ -431,30 +1061,70 @@ trie_same(const struct f_trie *t1, const struct f_trie *t2) return trie_node_same6(&t1->root.v6, &t2->root.v6); } + +static const u8 log2[16] = {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3}; + static void -trie_node_format4(const struct f_trie_node4 *t, buffer *buf) +trie_node_format(const struct f_trie_node *n, buffer *buf, int v4) { - if (t == NULL) + if (n == NULL) return; - if (ip4_nonzero(t->accept)) - buffer_print(buf, "%I4/%d{%I4}, ", t->addr, t->plen, t->accept); + if (v4) + { + if (ip4_nonzero(n->v4.accept)) + buffer_print(buf, "%I4/%d{%I4}, ", n->v4.addr, n->v4.plen, n->v4.accept); + } + else + { + if (ip6_nonzero(n->v6.accept)) + buffer_print(buf, "%I6/%d{%I6}, ", n->v6.addr, n->v6.plen, n->v6.accept); + } - trie_node_format4(t->c[0], buf); - trie_node_format4(t->c[1], buf); -} + int nlen = v4 ? n->v4.plen : n->v6.plen; + uint local = v4 ? n->v4.local : n->v6.local; -static void -trie_node_format6(const struct f_trie_node6 *t, buffer *buf) -{ - if (t == NULL) - return; + for (int i = (nlen ? 0 : 1); i < TRIE_STEP; i++) + if (GET_ACCEPT_BIT(n, v4, nlen + i - 1)) + local &= ~trie_level_mask(1, i); - if (ip6_nonzero(t->accept)) - buffer_print(buf, "%I6/%d{%I6}, ", t->addr, t->plen, t->accept); + for (int pos = 2; local && (pos < (1 << TRIE_STEP)); pos++) + if (local & (1u << pos)) + { + int lvl = log2[pos]; + int plen = nlen + lvl; + + int i; + for (i = 0; lvl + i < TRIE_STEP; i++) + { + uint lmask = trie_level_mask(pos, i); + + if ((local & lmask) != lmask) + break; + + local &= ~lmask; + } + + uint addr_bits = pos & ((1u << lvl) - 1); + uint accept_bits = (1u << i) - 1; + int h = plen + i - 1; + + if (v4) + { + ip4_addr addr = ip4_setbits(n->v4.addr, plen - 1, addr_bits); + ip4_addr mask = ip4_setbits(IP4_NONE, h - 1, accept_bits); + buffer_print(buf, "%I4/%d{%I4}, ", addr, plen, mask); + } + else + { + ip6_addr addr = ip6_setbits(n->v6.addr, plen - 1, addr_bits); + ip6_addr mask = ip6_setbits(IP6_NONE, h - 1, accept_bits); + buffer_print(buf, "%I6/%d{%I6}, ", addr, plen, mask); + } + } - trie_node_format6(t->c[0], buf); - trie_node_format6(t->c[1], buf); + for (int i = 0; i < (1 << TRIE_STEP); i++) + trie_node_format(GET_CHILD(n, v4, i), buf, v4); } /** @@ -472,10 +1142,7 @@ trie_format(const struct f_trie *t, buffer *buf) if (t->zero) buffer_print(buf, "%I/%d, ", t->ipv4 ? IPA_NONE4 : IPA_NONE6, 0); - if (t->ipv4) - trie_node_format4(&t->root.v4, buf); - else - trie_node_format6(&t->root.v6, buf); + trie_node_format(&t->root, buf, t->ipv4); if (buf->pos == buf->end) return; diff --git a/filter/trie_test.c b/filter/trie_test.c index b2b36716..ddce2daa 100644 --- a/filter/trie_test.c +++ b/filter/trie_test.c @@ -14,9 +14,12 @@ #include "conf/conf.h" #define TESTS_NUM 10 -#define PREFIXES_NUM 10 +#define PREFIXES_NUM 32 #define PREFIX_TESTS_NUM 10000 +#define PREFIX_BENCH_NUM 100000000 +#define TRIE_BUFFER_SIZE 1024 +#define TEST_BUFFER_SIZE (1024*1024) #define BIG_BUFFER_SIZE 10000 /* Wrapping structure for storing f_prefixes structures in list */ @@ -31,146 +34,849 @@ xrandom(u32 max) return (bt_random() % max); } +static inline uint +get_exp_random(void) +{ + uint r, n = 0; + + for (r = bt_random(); r & 1; r = r >> 1) + n++; + + return n; +} + static int -is_prefix_included(list *prefixes, struct f_prefix *needle) +compare_prefixes(const void *a, const void *b) { - struct f_prefix_node *n; - WALK_LIST(n, *prefixes) - { - ip6_addr cmask = ip6_mkmask(MIN(n->prefix.net.pxlen, needle->net.pxlen)); + return net_compare(&((const struct f_prefix *) a)->net, + &((const struct f_prefix *) b)->net); +} + +static inline int +matching_ip4_nets(const net_addr_ip4 *a, const net_addr_ip4 *b) +{ + ip4_addr cmask = ip4_mkmask(MIN(a->pxlen, b->pxlen)); + return ip4_compare(ip4_and(a->prefix, cmask), ip4_and(b->prefix, cmask)) == 0; +} + +static inline int +matching_ip6_nets(const net_addr_ip6 *a, const net_addr_ip6 *b) +{ + ip6_addr cmask = ip6_mkmask(MIN(a->pxlen, b->pxlen)); + return ip6_compare(ip6_and(a->prefix, cmask), ip6_and(b->prefix, cmask)) == 0; +} - ip6_addr ip = net6_prefix(&n->prefix.net); - ip6_addr needle_ip = net6_prefix(&needle->net); +static inline int +matching_nets(const net_addr *a, const net_addr *b) +{ + if (a->type != b->type) + return 0; + + return (a->type == NET_IP4) ? + matching_ip4_nets((const net_addr_ip4 *) a, (const net_addr_ip4 *) b) : + matching_ip6_nets((const net_addr_ip6 *) a, (const net_addr_ip6 *) b); +} - if ((ipa_compare(ipa_and(ip, cmask), ipa_and(needle_ip, cmask)) == 0) && - (n->prefix.lo <= needle->net.pxlen) && (needle->net.pxlen <= n->prefix.hi)) +static int +is_prefix_included(list *prefixes, const net_addr *needle) +{ + struct f_prefix_node *n; + WALK_LIST(n, *prefixes) + if (matching_nets(&n->prefix.net, needle) && + (n->prefix.lo <= needle->pxlen) && (needle->pxlen <= n->prefix.hi)) { - bt_debug("FOUND\t" PRIip6 "/%d %d-%d\n", ARGip6(net6_prefix(&n->prefix.net)), n->prefix.net.pxlen, n->prefix.lo, n->prefix.hi); + char buf[64]; + bt_format_net(buf, 64, &n->prefix.net); + bt_debug("FOUND %s %d-%d\n", buf, n->prefix.lo, n->prefix.hi); + return 1; /* OK */ } - } + return 0; /* FAIL */ } -static struct f_prefix -get_random_ip6_prefix(void) +static void +get_random_net(net_addr *net, int v6) { - struct f_prefix p; - u8 pxlen = xrandom(120)+8; - ip6_addr ip6 = ip6_build(bt_random(),bt_random(),bt_random(),bt_random()); - net_addr_ip6 net6 = NET_ADDR_IP6(ip6, pxlen); + if (!v6) + { + uint pxlen = xrandom(24)+8; + ip4_addr ip4 = ip4_from_u32((u32) bt_random()); + net_fill_ip4(net, ip4_and(ip4, ip4_mkmask(pxlen)), pxlen); + } + else + { + uint pxlen = xrandom(120)+8; + ip6_addr ip6 = ip6_build(bt_random(), bt_random(), bt_random(), bt_random()); + net_fill_ip6(net, ip6_and(ip6, ip6_mkmask(pxlen)), pxlen); + } +} - p.net = *((net_addr*) &net6); +static void +get_random_prefix(struct f_prefix *px, int v6, int tight) +{ + get_random_net(&px->net, v6); + + if (tight) + { + px->lo = px->hi = px->net.pxlen; + } + else if (bt_random() % 2) + { + px->lo = 0; + px->hi = px->net.pxlen; + } + else + { + px->lo = px->net.pxlen; + px->hi = net_max_prefix_length[px->net.type]; + } +} + +static void +get_random_ip4_subnet(net_addr_ip4 *net, const net_addr_ip4 *src, int pxlen) +{ + *net = NET_ADDR_IP4(ip4_and(src->prefix, ip4_mkmask(pxlen)), pxlen); + + if (pxlen > src->pxlen) + { + ip4_addr rnd = ip4_from_u32((u32) bt_random()); + ip4_addr mask = ip4_xor(ip4_mkmask(src->pxlen), ip4_mkmask(pxlen)); + net->prefix = ip4_or(net->prefix, ip4_and(rnd, mask)); + } +} + +static void +get_random_ip6_subnet(net_addr_ip6 *net, const net_addr_ip6 *src, int pxlen) +{ + *net = NET_ADDR_IP6(ip6_and(src->prefix, ip6_mkmask(pxlen)), pxlen); + + if (pxlen > src->pxlen) + { + ip6_addr rnd = ip6_build(bt_random(), bt_random(), bt_random(), bt_random()); + ip6_addr mask = ip6_xor(ip6_mkmask(src->pxlen), ip6_mkmask(pxlen)); + net->prefix = ip6_or(net->prefix, ip6_and(rnd, mask)); + } +} + +static void +get_random_subnet(net_addr *net, const net_addr *src, int pxlen) +{ + if (src->type == NET_IP4) + get_random_ip4_subnet((net_addr_ip4 *) net, (const net_addr_ip4 *) src, pxlen); + else + get_random_ip6_subnet((net_addr_ip6 *) net, (const net_addr_ip6 *) src, pxlen); +} + +static void +get_inner_net(net_addr *net, const struct f_prefix *src) +{ + int pxlen, step; if (bt_random() % 2) { - p.lo = 0; - p.hi = p.net.pxlen; + step = get_exp_random(); + step = MIN(step, src->hi - src->lo); + pxlen = (bt_random() % 2) ? (src->lo + step) : (src->hi - step); } else + pxlen = src->lo + bt_random() % (src->hi - src->lo + 1); + + get_random_subnet(net, &src->net, pxlen); +} + +static void +swap_random_bits_ip4(net_addr_ip4 *net, int num) +{ + for (int i = 0; i < num; i++) { - p.lo = p.net.pxlen; - p.hi = net_max_prefix_length[p.net.type]; + ip4_addr swap = IP4_NONE; + ip4_setbit(&swap, bt_random() % net->pxlen); + net->prefix = ip4_xor(net->prefix, swap); } +} - return p; +static void +swap_random_bits_ip6(net_addr_ip6 *net, int num) +{ + for (int i = 0; i < num; i++) + { + ip6_addr swap = IP6_NONE; + ip6_setbit(&swap, bt_random() % net->pxlen); + net->prefix = ip6_xor(net->prefix, swap); + } } static void -generate_random_ipv6_prefixes(list *prefixes) +swap_random_bits(net_addr *net, int num) { - int i; - for (i = 0; i < PREFIXES_NUM; i++) + if (net->type == NET_IP4) + swap_random_bits_ip4((net_addr_ip4 *) net, num); + else + swap_random_bits_ip6((net_addr_ip6 *) net, num); +} + +static void +get_outer_net(net_addr *net, const struct f_prefix *src) +{ + int pxlen, step; + int inside = 0; + int max = net_max_prefix_length[src->net.type]; + + if ((src->lo > 0) && (bt_random() % 3)) + { + step = 1 + get_exp_random(); + step = MIN(step, src->lo); + pxlen = src->lo - step; + } + else if ((src->hi < max) && (bt_random() % 2)) { - struct f_prefix f = get_random_ip6_prefix(); + step = 1 + get_exp_random(); + step = MIN(step, max - src->hi); + pxlen = src->hi + step; + } + else + { + pxlen = src->lo + bt_random() % (src->hi - src->lo + 1); + inside = 1; + } - struct f_prefix_node *px = calloc(1, sizeof(struct f_prefix_node)); - px->prefix = f; + get_random_subnet(net, &src->net, pxlen); - bt_debug("ADD\t" PRIip6 "/%d %d-%d\n", ARGip6(net6_prefix(&px->prefix.net)), px->prefix.net.pxlen, px->prefix.lo, px->prefix.hi); + /* Perhaps swap some bits in prefix */ + if ((net->pxlen > 0) && (inside || (bt_random() % 4))) + swap_random_bits(net, 1 + get_exp_random()); +} + +static list * +make_random_prefix_list(int num, int v6, int tight) +{ + list *prefixes = tmp_allocz(sizeof(struct f_prefix_node)); + init_list(prefixes); + + for (int i = 0; i < num; i++) + { + struct f_prefix_node *px = tmp_allocz(sizeof(struct f_prefix_node)); + get_random_prefix(&px->prefix, v6, tight); add_tail(prefixes, &px->n); + + char buf[64]; + bt_format_net(buf, 64, &px->prefix.net); + bt_debug("ADD %s{%d,%d}\n", buf, px->prefix.lo, px->prefix.hi); + } + + return prefixes; +} + +static struct f_trie * +make_trie_from_prefix_list(list *prefixes) +{ + struct f_trie *trie = f_new_trie(tmp_linpool, 0); + + struct f_prefix_node *n; + WALK_LIST(n, *prefixes) + trie_add_prefix(trie, &n->prefix.net, n->prefix.lo, n->prefix.hi); + + return trie; +} + +/* + * Read sequence of prefixes from file handle and return prefix list. + * Each prefix is on one line, sequence terminated by empty line or eof. + * Arg @plus means prefix should include all longer ones. + */ +static list * +read_prefix_list(FILE *f, int v6, int plus) +{ + ASSERT(!v6); + + uint a0, a1, a2, a3, pl; + char s[32]; + int n; + + list *pxlist = tmp_allocz(sizeof(struct f_prefix_node)); + init_list(pxlist); + + errno = 0; + while (fgets(s, 32, f)) + { + if (s[0] == '\n') + return pxlist; + + n = sscanf(s, "%u.%u.%u.%u/%u", &a0, &a1, &a2, &a3, &pl); + + if (n != 5) + bt_abort_msg("Invalid content of trie_data"); + + struct f_prefix_node *px = tmp_allocz(sizeof(struct f_prefix_node)); + net_fill_ip4(&px->prefix.net, ip4_build(a0, a1, a2, a3), pl); + px->prefix.lo = pl; + px->prefix.hi = plus ? IP4_MAX_PREFIX_LENGTH : pl; + add_tail(pxlist, &px->n); + + char buf[64]; + bt_format_net(buf, 64, &px->prefix.net); + bt_debug("ADD %s{%d,%d}\n", buf, px->prefix.lo, px->prefix.hi); } + + bt_syscall(errno, "fgets()"); + return EMPTY_LIST(*pxlist) ? NULL : pxlist; } +/* + * Open file, read multiple sequences of prefixes from it. Fill @data with + * prefix lists and @trie with generated tries. Return number of sequences / + * tries. Use separate linpool @lp0 for prefix lists and @lp1 for tries. + * Arg @plus means prefix should include all longer ones. + */ static int -t_match_net(void) +read_prefix_file(const char *filename, int plus, + list *data[], struct f_trie *trie[]) +{ + FILE *f = fopen(filename, "r"); + bt_syscall(!f, "fopen(%s)", filename); + + int n = 0; + list *pxlist; + while (pxlist = read_prefix_list(f, 0, plus)) + { + data[n] = pxlist; + trie[n] = make_trie_from_prefix_list(pxlist); + bt_debug("NEXT\n"); + n++; + } + + fclose(f); + bt_debug("DONE reading %d tries\n", n); + + return n; +} + +/* + * Select random subset of @dn prefixes from prefix list @src of length @sn, + * and store them to buffer @dst (of size @dn). Prefixes may be chosen multiple + * times. Randomize order of prefixes in @dst buffer. + */ +static void +select_random_prefix_subset(list *src[], net_addr dst[], int sn, int dn) +{ + int pn = 0; + + if (!dn) + return; + + /* Compute total prefix number */ + for (int i = 0; i < sn; i++) + pn += list_length(src[i]); + + /* Change of selecting a prefix */ + int rnd = (pn / dn) + 10; + int n = 0; + + /* Iterate indefinitely over src array */ + for (int i = 0; 1; i++, i = (i < sn) ? i : 0) + { + struct f_prefix_node *px; + WALK_LIST(px, *src[i]) + { + if (xrandom(rnd) != 0) + continue; + + net_copy(&dst[n], &px->prefix.net); + n++; + + /* We have enough */ + if (n == dn) + goto done; + } + } + +done: + /* Shuffle networks */ + for (int i = 0; i < dn; i++) + { + int j = xrandom(dn); + + if (i == j) + continue; + + net_addr tmp; + net_copy(&tmp, &dst[i]); + net_copy(&dst[i], &dst[j]); + net_copy(&dst[j], &tmp); + } +} + +/* Fill @dst buffer with @dn randomly generated /32 prefixes */ +static void +make_random_addresses(net_addr dst[], int dn) +{ + for (int i = 0; i < dn; i++) + net_fill_ip4(&dst[i], ip4_from_u32((u32) bt_random()), IP4_MAX_PREFIX_LENGTH); +} + +static void +test_match_net(list *prefixes, struct f_trie *trie, const net_addr *net) +{ + char buf[64]; + bt_format_net(buf, 64, net); + bt_debug("TEST %s\n", buf); + + int should_be = is_prefix_included(prefixes, net); + int is_there = trie_match_net(trie, net); + + bt_assert_msg(should_be == is_there, "Prefix %s %s match", buf, + (should_be ? "should" : "should not")); +} + +static int +t_match_random_net(void) { bt_bird_init(); bt_config_parse(BT_CONFIG_SIMPLE); - uint round; - for (round = 0; round < TESTS_NUM; round++) + int v6 = 0; + for (int round = 0; round < TESTS_NUM; round++) { - list prefixes; /* of structs f_extended_prefix */ - init_list(&prefixes); - struct f_trie *trie = f_new_trie(config->mem, 0); + list *prefixes = make_random_prefix_list(PREFIXES_NUM, v6, 0); + struct f_trie *trie = make_trie_from_prefix_list(prefixes); - generate_random_ipv6_prefixes(&prefixes); - struct f_prefix_node *n; - WALK_LIST(n, prefixes) + for (int i = 0; i < PREFIX_TESTS_NUM; i++) { - trie_add_prefix(trie, &n->prefix.net, n->prefix.lo, n->prefix.hi); + net_addr net; + get_random_net(&net, v6); + test_match_net(prefixes, trie, &net); } - int i; - for (i = 0; i < PREFIX_TESTS_NUM; i++) + v6 = !v6; + tmp_flush(); + } + + bt_bird_cleanup(); + return 1; +} + +static int +t_match_inner_net(void) +{ + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); + + int v6 = 0; + for (int round = 0; round < TESTS_NUM; round++) + { + list *prefixes = make_random_prefix_list(PREFIXES_NUM, v6, 0); + struct f_trie *trie = make_trie_from_prefix_list(prefixes); + + struct f_prefix_node *n = HEAD(*prefixes); + for (int i = 0; i < PREFIX_TESTS_NUM; i++) { - struct f_prefix f = get_random_ip6_prefix(); - bt_debug("TEST\t" PRIip6 "/%d\n", ARGip6(net6_prefix(&f.net)), f.net.pxlen); + net_addr net; + get_inner_net(&net, &n->prefix); + test_match_net(prefixes, trie, &net); - int should_be = is_prefix_included(&prefixes, &f); - int is_there = trie_match_net(trie, &f.net); - bt_assert_msg(should_be == is_there, "Prefix " PRIip6 "/%d %s", ARGip6(net6_prefix(&f.net)), f.net.pxlen, (should_be ? "should be found in trie" : "should not be found in trie")); + n = NODE_VALID(NODE_NEXT(n)) ? NODE_NEXT(n) : HEAD(*prefixes); } - struct f_prefix_node *nxt; - WALK_LIST_DELSAFE(n, nxt, prefixes) + v6 = !v6; + tmp_flush(); + } + + bt_bird_cleanup(); + return 1; +} + +static int +t_match_outer_net(void) +{ + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); + + int v6 = 0; + for (int round = 0; round < TESTS_NUM; round++) + { + list *prefixes = make_random_prefix_list(PREFIXES_NUM, v6, 0); + struct f_trie *trie = make_trie_from_prefix_list(prefixes); + + struct f_prefix_node *n = HEAD(*prefixes); + for (int i = 0; i < PREFIX_TESTS_NUM; i++) { - free(n); + net_addr net; + get_outer_net(&net, &n->prefix); + test_match_net(prefixes, trie, &net); + + n = NODE_VALID(NODE_NEXT(n)) ? NODE_NEXT(n) : HEAD(*prefixes); } + + v6 = !v6; + tmp_flush(); } + v6 = !v6; + bt_bird_cleanup(); + return 1; +} + +/* + * Read prefixes from @filename, build set of tries, prepare test data and do + * PREFIX_BENCH_NUM trie lookups. With @plus = 0, use random subset of known + * prefixes as test data, with @plus = 1, use randomly generated /32 prefixes + * as test data. + */ +static int +benchmark_trie_dataset(const char *filename, int plus) +{ + int n = 0; + list *data[TRIE_BUFFER_SIZE]; + struct f_trie *trie[TRIE_BUFFER_SIZE]; + net_addr *nets; + + bt_reset_suite_case_timer(); + bt_log_suite_case_result(1, "Reading %s", filename, n); + n = read_prefix_file(filename, plus, data, trie); + bt_log_suite_case_result(1, "Read prefix data, %d lists, ", n); + + size_t trie_size = rmemsize(tmp_linpool).effective * 1000 / (1024*1024); + bt_log_suite_case_result(1, "Trie size %u.%03u MB", + (uint) (trie_size / 1000), (uint) (trie_size % 1000)); + + int t = PREFIX_BENCH_NUM / n; + int tb = MIN(t, TEST_BUFFER_SIZE); + nets = tmp_alloc(tb * sizeof(net_addr)); + + if (!plus) + select_random_prefix_subset(data, nets, n, tb); + else + make_random_addresses(nets, tb); + + bt_log_suite_case_result(1, "Make test data, %d (%d) tests", t, tb); + bt_reset_suite_case_timer(); + + /* + int match = 0; + for (int i = 0; i < t; i++) + for (int j = 0; j < n; j++) + test_match_net(data[j], trie[j], &nets[i]); + */ + + int match = 0; + for (int i = 0; i < t; i++) + for (int j = 0; j < n; j++) + if (trie_match_net(trie[j], &nets[i % TEST_BUFFER_SIZE])) + match++; + + bt_log_suite_case_result(1, "Matching done, %d / %d matches", match, t * n); + + tmp_flush(); + return 1; +} + +static int UNUSED +t_bench_trie_datasets_subset(void) +{ + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); + + /* Specific datasets, not included */ + benchmark_trie_dataset("trie-data-bgp-1", 0); + benchmark_trie_dataset("trie-data-bgp-10", 0); + benchmark_trie_dataset("trie-data-bgp-100", 0); + benchmark_trie_dataset("trie-data-bgp-1000", 0); + bt_bird_cleanup(); + return 1; } +static int UNUSED +t_bench_trie_datasets_random(void) +{ + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); + + /* Specific datasets, not included */ + benchmark_trie_dataset("trie-data-bgp-1", 1); + benchmark_trie_dataset("trie-data-bgp-10", 1); + benchmark_trie_dataset("trie-data-bgp-100", 1); + benchmark_trie_dataset("trie-data-bgp-1000", 1); + + bt_bird_cleanup(); + + return 1; +} + + static int t_trie_same(void) { bt_bird_init(); bt_config_parse(BT_CONFIG_SIMPLE); - int round; - for (round = 0; round < TESTS_NUM*4; round++) + int v6 = 0; + for (int round = 0; round < TESTS_NUM*4; round++) { - struct f_trie * trie1 = f_new_trie(config->mem, 0); - struct f_trie * trie2 = f_new_trie(config->mem, 0); + list *prefixes = make_random_prefix_list(100 * PREFIXES_NUM, v6, 0); + struct f_trie *trie1 = f_new_trie(tmp_linpool, 0); + struct f_trie *trie2 = f_new_trie(tmp_linpool, 0); - list prefixes; /* a list of f_extended_prefix structures */ - init_list(&prefixes); - int i; - for (i = 0; i < 100; i++) - generate_random_ipv6_prefixes(&prefixes); + struct f_prefix_node *n; + WALK_LIST(n, *prefixes) + trie_add_prefix(trie1, &n->prefix.net, n->prefix.lo, n->prefix.hi); + + WALK_LIST_BACKWARDS(n, *prefixes) + trie_add_prefix(trie2, &n->prefix.net, n->prefix.lo, n->prefix.hi); + + bt_assert(trie_same(trie1, trie2)); + + v6 = !v6; + tmp_flush(); + } + + bt_bird_cleanup(); + return 1; +} + +static inline void +log_networks(const net_addr *a, const net_addr *b) +{ + if (bt_verbose >= BT_VERBOSE_ABSOLUTELY_ALL) + { + char buf0[64]; + char buf1[64]; + bt_format_net(buf0, 64, a); + bt_format_net(buf1, 64, b); + bt_debug("Found %s expected %s\n", buf0, buf1); + } +} + +static int +t_trie_walk(void) +{ + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); + + for (int round = 0; round < TESTS_NUM*8; round++) + { + int level = round / TESTS_NUM; + int v6 = level % 2; + int num = PREFIXES_NUM * (int[]){1, 10, 100, 1000}[level / 2]; + int pos = 0, end = 0; + list *prefixes = make_random_prefix_list(num, v6, 1); + struct f_trie *trie = make_trie_from_prefix_list(prefixes); + struct f_prefix *pxset = malloc((num + 1) * sizeof(struct f_prefix)); struct f_prefix_node *n; - WALK_LIST(n, prefixes) + WALK_LIST(n, *prefixes) + pxset[pos++] = n->prefix; + memset(&pxset[pos], 0, sizeof (struct f_prefix)); + + qsort(pxset, num, sizeof(struct f_prefix), compare_prefixes); + + + /* Full walk */ + bt_debug("Full walk (round %d, %d nets)\n", round, num); + + pos = 0; + uint pxc = 0; + TRIE_WALK(trie, net, NULL) { - trie_add_prefix(trie1, &n->prefix.net, n->prefix.lo, n->prefix.hi); + log_networks(&net, &pxset[pos].net); + bt_assert(net_equal(&net, &pxset[pos].net)); + + /* Skip possible duplicates */ + while (net_equal(&pxset[pos].net, &pxset[pos + 1].net)) + pos++; + + pos++; + pxc++; } - WALK_LIST_BACKWARDS(n, prefixes) + TRIE_WALK_END; + + bt_assert(pos == num); + bt_assert(pxc == trie->prefix_count); + bt_debug("Full walk done\n"); + + + /* Prepare net for subnet walk - start with random prefix */ + pos = bt_random() % num; + end = pos + (int[]){2, 2, 3, 4}[level / 2]; + end = MIN(end, num); + + struct f_prefix from = pxset[pos]; + + /* Find a common superprefix to several subsequent prefixes */ + for (; pos < end; pos++) { - trie_add_prefix(trie2, &n->prefix.net, n->prefix.lo, n->prefix.hi); + if (net_equal(&from.net, &pxset[pos].net)) + continue; + + int common = !v6 ? + ip4_pxlen(net4_prefix(&from.net), net4_prefix(&pxset[pos].net)) : + ip6_pxlen(net6_prefix(&from.net), net6_prefix(&pxset[pos].net)); + from.net.pxlen = MIN(from.net.pxlen, common); + + if (!v6) + ((net_addr_ip4 *) &from.net)->prefix = + ip4_and(net4_prefix(&from.net), net4_prefix(&pxset[pos].net)); + else + ((net_addr_ip6 *) &from.net)->prefix = + ip6_and(net6_prefix(&from.net), net6_prefix(&pxset[pos].net)); } - bt_assert(trie_same(trie1, trie2)); + /* Fix irrelevant bits */ + if (!v6) + ((net_addr_ip4 *) &from.net)->prefix = + ip4_and(net4_prefix(&from.net), ip4_mkmask(net4_pxlen(&from.net))); + else + ((net_addr_ip6 *) &from.net)->prefix = + ip6_and(net6_prefix(&from.net), ip6_mkmask(net6_pxlen(&from.net))); + + + /* Find initial position for final prefix */ + for (pos = 0; pos < num; pos++) + if (compare_prefixes(&pxset[pos], &from) >= 0) + break; + + int p0 = pos; + char buf0[64]; + bt_format_net(buf0, 64, &from.net); + bt_debug("Subnet walk for %s (round %d, %d nets)\n", buf0, round, num); + + /* Subnet walk */ + TRIE_WALK(trie, net, &from.net) + { + log_networks(&net, &pxset[pos].net); + bt_assert(net_equal(&net, &pxset[pos].net)); + bt_assert(net_in_netX(&net, &from.net)); + + /* Skip possible duplicates */ + while (net_equal(&pxset[pos].net, &pxset[pos + 1].net)) + pos++; - struct f_prefix_node *nxt; - WALK_LIST_DELSAFE(n, nxt, prefixes) + pos++; + } + TRIE_WALK_END; + + bt_assert((pos == num) || !net_in_netX(&pxset[pos].net, &from.net)); + bt_debug("Subnet walk done for %s (found %d nets)\n", buf0, pos - p0); + + tmp_flush(); + } + + bt_bird_cleanup(); + return 1; +} + +static int +find_covering_nets(struct f_prefix *prefixes, int num, const net_addr *net, net_addr *found) +{ + struct f_prefix key; + net_addr *n = &key.net; + int found_num = 0; + + net_copy(n, net); + + while (1) + { + struct f_prefix *px = + bsearch(&key, prefixes, num, sizeof(struct f_prefix), compare_prefixes); + + if (px) + { + net_copy(&found[found_num], n); + found_num++; + } + + if (n->pxlen == 0) + return found_num; + + n->pxlen--; + + if (n->type == NET_IP4) + ip4_clrbit(&((net_addr_ip4 *) n)->prefix, n->pxlen); + else + ip6_clrbit(&((net_addr_ip6 *) n)->prefix, n->pxlen); + } +} + +static int +t_trie_walk_to_root(void) +{ + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); + + for (int round = 0; round < TESTS_NUM * 4; round++) + { + int level = round / TESTS_NUM; + int v6 = level % 2; + int num = PREFIXES_NUM * (int[]){32, 512}[level / 2]; + int pos = 0; + int st = 0, sn = 0, sm = 0; + + list *prefixes = make_random_prefix_list(num, v6, 1); + struct f_trie *trie = make_trie_from_prefix_list(prefixes); + struct f_prefix *pxset = malloc((num + 1) * sizeof(struct f_prefix)); + + struct f_prefix_node *pxn; + WALK_LIST(pxn, *prefixes) + pxset[pos++] = pxn->prefix; + memset(&pxset[pos], 0, sizeof (struct f_prefix)); + + qsort(pxset, num, sizeof(struct f_prefix), compare_prefixes); + + int i; + for (i = 0; i < (PREFIX_TESTS_NUM / 10); i++) { - free(n); + net_addr from; + get_random_net(&from, v6); + + net_addr found[129]; + int found_num = find_covering_nets(pxset, num, &from, found); + int n = 0; + + if (bt_verbose >= BT_VERBOSE_ABSOLUTELY_ALL) + { + char buf[64]; + bt_format_net(buf, 64, &from); + bt_debug("Lookup for %s (expect %d)\n", buf, found_num); + } + + /* Walk to root, separate for IPv4 and IPv6 */ + if (!v6) + { + TRIE_WALK_TO_ROOT_IP4(trie, (net_addr_ip4 *) &from, net) + { + log_networks((net_addr *) &net, &found[n]); + bt_assert((n < found_num) && net_equal((net_addr *) &net, &found[n])); + n++; + } + TRIE_WALK_TO_ROOT_END; + } + else + { + TRIE_WALK_TO_ROOT_IP6(trie, (net_addr_ip6 *) &from, net) + { + log_networks((net_addr *) &net, &found[n]); + bt_assert((n < found_num) && net_equal((net_addr *) &net, &found[n])); + n++; + } + TRIE_WALK_TO_ROOT_END; + } + + bt_assert(n == found_num); + + /* Stats */ + st += n; + sn += !!n; + sm = MAX(sm, n); } + + bt_debug("Success in %d / %d, sum %d, max %d\n", sn, i, st, sm); + + tmp_flush(); } + bt_bird_cleanup(); return 1; } @@ -179,8 +885,15 @@ main(int argc, char *argv[]) { bt_init(argc, argv); - bt_test_suite(t_match_net, "Testing random prefix matching"); + bt_test_suite(t_match_random_net, "Testing random prefix matching"); + bt_test_suite(t_match_inner_net, "Testing random inner prefix matching"); + bt_test_suite(t_match_outer_net, "Testing random outer prefix matching"); bt_test_suite(t_trie_same, "A trie filled forward should be same with a trie filled backward."); + bt_test_suite(t_trie_walk, "Testing TRIE_WALK() on random tries"); + bt_test_suite(t_trie_walk_to_root, "Testing TRIE_WALK_TO_ROOT() on random tries"); + + // bt_test_suite(t_bench_trie_datasets_subset, "Benchmark tries from datasets by random subset of nets"); + // bt_test_suite(t_bench_trie_datasets_random, "Benchmark tries from datasets by generated addresses"); return bt_exit_value(); } diff --git a/lib/Makefile b/lib/Makefile index 98c5db3c..f4ade9a6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -1,7 +1,7 @@ -src := bitmap.c bitops.c blake2s.c blake2b.c checksum.c event.c flowspec.c idm.c ip.c lists.c mac.c md5.c mempool.c net.c patmatch.c printf.c resource.c rcu.c sha1.c sha256.c sha512.c slab.c slists.c strtoul.c tbf.c timer.c xmalloc.c +src := a-path.c a-set.c bitmap.c bitops.c blake2s.c blake2b.c checksum.c event.c flowspec.c idm.c ip.c lists.c mac.c md5.c mempool.c net.c patmatch.c printf.c rcu.c resource.c sha1.c sha256.c sha512.c slab.c slists.c strtoul.c tbf.c timer.c xmalloc.c obj := $(src-o-files) $(all-daemon) -tests_src := bitmap_test.c heap_test.c buffer_test.c event_test.c flowspec_test.c bitops_test.c patmatch_test.c fletcher16_test.c slist_test.c checksum_test.c lists_test.c mac_test.c ip_test.c hash_test.c printf_test.c +tests_src := a-set_test.c a-path_test.c bitmap_test.c heap_test.c buffer_test.c event_test.c flowspec_test.c bitops_test.c patmatch_test.c fletcher16_test.c slist_test.c checksum_test.c lists_test.c mac_test.c ip_test.c hash_test.c printf_test.c slab_test.c type_test.c tests_targets := $(tests_targets) $(tests-target-files) tests_objs := $(tests_objs) $(src-o-files) diff --git a/nest/a-path.c b/lib/a-path.c index 2e34a3d1..a7a22e40 100644 --- a/nest/a-path.c +++ b/lib/a-path.c @@ -8,8 +8,8 @@ */ #include "nest/bird.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "lib/resource.h" #include "lib/unaligned.h" #include "lib/string.h" @@ -591,7 +591,7 @@ as_path_match_set(const struct adata *path, const struct f_tree *set) p += 2; for (i=0; i<n; i++) { - struct f_val v = {T_INT, .val.i = get_as(p)}; + struct f_val v = { .type = T_INT, .val.i = get_as(p)}; if (find_tree(set, &v)) return 1; p += BS; @@ -602,8 +602,10 @@ as_path_match_set(const struct adata *path, const struct f_tree *set) } const struct adata * -as_path_filter(struct linpool *pool, const struct adata *path, const struct f_tree *set, u32 key, int pos) +as_path_filter(struct linpool *pool, const struct adata *path, const struct f_val *set, int pos) { + ASSERT((set->type == T_SET) || (set->type == T_INT)); + if (!path) return NULL; @@ -629,13 +631,13 @@ as_path_filter(struct linpool *pool, const struct adata *path, const struct f_tr u32 as = get_as(p); int match; - if (set) + if (set->type == T_SET) { - struct f_val v = {T_INT, .val.i = as}; - match = !!find_tree(set, &v); + struct f_val v = { .type = T_INT, .val.i = as}; + match = !!find_tree(set->val.t, &v); } - else - match = (as == key); + else /* T_INT */ + match = (as == set->val.i); if (match == pos) { @@ -667,6 +669,35 @@ as_path_filter(struct linpool *pool, const struct adata *path, const struct f_tr return res; } +int +as_path_walk(const struct adata *path, uint *pos, uint *val) +{ + if (!path) + return 0; + + const u8 *p = path->data; + const u8 *q = p + path->length; + uint n, x = *pos; + + while (p < q) + { + n = p[1]; + p += 2; + + if (x < n) + { + *val = get_as(p + x * BS); + *pos += 1; + return 1; + } + + p += n * BS; + x -= n; + } + + return 0; +} + struct pm_pos { diff --git a/nest/a-path_test.c b/lib/a-path_test.c index 2e6e4956..c6f8ce8b 100644 --- a/nest/a-path_test.c +++ b/lib/a-path_test.c @@ -9,9 +9,10 @@ #include "test/birdtest.h" #include "test/bt-utils.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "lib/resource.h" +#include "filter/data.h" #define TESTS_NUM 30 #define AS_PATH_LENGTH 1000 @@ -23,8 +24,6 @@ static int t_as_path_match(void) { - resource_init(); - int round; for (round = 0; round < TESTS_NUM; round++) { @@ -32,14 +31,13 @@ t_as_path_match(void) struct adata *as_path = &empty_as_path; u32 first_prepended, last_prepended; first_prepended = last_prepended = 0; - struct linpool *lp = lp_new_default(&root_pool); struct f_path_mask *mask = alloca(sizeof(struct f_path_mask) + AS_PATH_LENGTH * sizeof(struct f_path_mask_item)); mask->len = AS_PATH_LENGTH; for (int i = AS_PATH_LENGTH - 1; i >= 0; i--) { u32 val = bt_random(); - as_path = as_path_prepend(lp, as_path, val); + as_path = as_path_prepend(tmp_linpool, as_path, val); bt_debug("Prepending ASN: %10u \n", val); if (i == 0) @@ -61,7 +59,7 @@ t_as_path_match(void) bt_assert(as_path_get_last(as_path, &asn)); bt_assert_msg(asn == first_prepended, "as_path_get_last() should return the first prepended ASN"); - rfree(lp); + tmp_flush(); } return 1; @@ -70,16 +68,13 @@ t_as_path_match(void) static int t_path_format(void) { - resource_init(); - struct adata empty_as_path = {}; struct adata *as_path = &empty_as_path; - struct linpool *lp = lp_new_default(&root_pool); uint i; for (i = 4294967285; i <= 4294967294; i++) { - as_path = as_path_prepend(lp, as_path, i); + as_path = as_path_prepend(tmp_linpool, as_path, i); bt_debug("Prepending ASN: %10u \n", i); } @@ -97,7 +92,7 @@ t_path_format(void) as_path_format(as_path, buf2, SMALL_BUFFER_SIZE); bt_assert_msg(strcmp(buf2, "4294967294 42...") == 0, "Small Buffer(%zu): '%s'", strlen(buf2), buf2); - rfree(lp); + tmp_flush(); return 1; } @@ -116,11 +111,8 @@ count_asn_in_array(const u32 *array, u32 asn) static int t_path_include(void) { - resource_init(); - struct adata empty_as_path = {}; struct adata *as_path = &empty_as_path; - struct linpool *lp = lp_new_default(&root_pool); u32 as_nums[AS_PATH_LENGTH] = {}; int i; @@ -128,7 +120,7 @@ t_path_include(void) { u32 val = bt_random(); as_nums[i] = val; - as_path = as_path_prepend(lp, as_path, val); + as_path = as_path_prepend(tmp_linpool, as_path, val); } for (i = 0; i < AS_PATH_LENGTH; i++) @@ -136,8 +128,9 @@ t_path_include(void) int counts_of_contains = count_asn_in_array(as_nums, as_nums[i]); bt_assert_msg(as_path_contains(as_path, as_nums[i], counts_of_contains), "AS Path should contains %d-times number %d", counts_of_contains, as_nums[i]); - bt_assert(as_path_filter(lp, as_path, NULL, as_nums[i], 0) != NULL); - bt_assert(as_path_filter(lp, as_path, NULL, as_nums[i], 1) != NULL); + struct f_val v = { .type = T_INT, .val.i = as_nums[i] }; + bt_assert(as_path_filter(tmp_linpool, as_path, &v, 0) != NULL); + bt_assert(as_path_filter(tmp_linpool, as_path, &v, 1) != NULL); } for (i = 0; i < 10000; i++) @@ -152,7 +145,7 @@ t_path_include(void) bt_assert_msg(result == 0, "As path should not contain the number %u", test_val); } - rfree(lp); + tmp_flush(); return 1; } @@ -161,16 +154,13 @@ t_path_include(void) static int t_as_path_converting(void) { - resource_init(); - struct adata empty_as_path = {}; struct adata *as_path = &empty_as_path; - struct linpool *lp = lp_new_default(&root_pool); #define AS_PATH_LENGTH_FOR_CONVERTING_TEST 10 int i; for (i = 0; i < AS_PATH_LENGTH_FOR_CONVERTING_TEST; i++) - as_path = as_path_prepend(lp, as_path, i); + as_path = as_path_prepend(tmp_linpool, as_path, i); bt_debug("data length: %u \n", as_path->length); @@ -204,13 +194,10 @@ t_as_path_converting(void) } #endif -void resource_sys_init(void); - int main(int argc, char *argv[]) { bt_init(argc, argv); - resource_sys_init(); bt_test_suite(t_as_path_match, "Testing AS path matching and some a-path utilities."); bt_test_suite(t_path_format, "Testing formating as path into byte buffer"); diff --git a/nest/a-set.c b/lib/a-set.c index 1186eb56..dcb86058 100644 --- a/nest/a-set.c +++ b/lib/a-set.c @@ -10,8 +10,8 @@ #include <stdlib.h> #include "nest/bird.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "lib/resource.h" #include "lib/string.h" @@ -516,6 +516,48 @@ int_set_sort(struct linpool *pool, const struct adata *src) return dst; } +int +int_set_min(const struct adata *list, u32 *val) +{ + if (!list) + return 0; + + u32 *l = (u32 *) list->data; + int len = int_set_get_size(list); + int i; + + if (len < 1) + return 0; + + *val = *l++; + for (i = 1; i < len; i++, l++) + if (int_set_cmp(val, l) > 0) + *val = *l; + + return 1; +} + +int +int_set_max(const struct adata *list, u32 *val) +{ + if (!list) + return 0; + + u32 *l = (u32 *) list->data; + int len = int_set_get_size(list); + int i; + + if (len < 1) + return 0; + + *val = *l++; + for (i = 1; i < len; i++, l++) + if (int_set_cmp(val, l) < 0) + *val = *l; + + return 1; +} + static int ec_set_cmp(const void *X, const void *Y) @@ -541,6 +583,50 @@ ec_set_sort_x(struct adata *set) qsort(set->data, set->length / 8, 8, ec_set_cmp); } +int +ec_set_min(const struct adata *list, u64 *val) +{ + if (!list) + return 0; + + u32 *l = int_set_get_data(list); + int len = int_set_get_size(list); + int i; + + if (len < 1) + return 0; + + u32 *res = l; l += 2; + for (i = 2; i < len; i += 2, l += 2) + if (ec_set_cmp(res, l) > 0) + res = l; + + *val = ec_generic(res[0], res[1]); + return 1; +} + +int +ec_set_max(const struct adata *list, u64 *val) +{ + if (!list) + return 0; + + u32 *l = int_set_get_data(list); + int len = int_set_get_size(list); + int i; + + if (len < 1) + return 0; + + u32 *res = l; l += 2; + for (i = 2; i < len; i += 2, l += 2) + if (ec_set_cmp(res, l) < 0) + res = l; + + *val = ec_generic(res[0], res[1]); + return 1; +} + static int lc_set_cmp(const void *X, const void *Y) @@ -563,3 +649,95 @@ lc_set_sort(struct linpool *pool, const struct adata *src) qsort(dst->data, dst->length / LCOMM_LENGTH, LCOMM_LENGTH, lc_set_cmp); return dst; } + +int +lc_set_min(const struct adata *list, lcomm *val) +{ + if (!list) + return 0; + + u32 *l = int_set_get_data(list); + int len = int_set_get_size(list); + int i; + + if (len < 1) + return 0; + + u32 *res = l; l += 3; + for (i = 3; i < len; i += 3, l += 3) + if (lc_set_cmp(res, l) > 0) + res = l; + + *val = (lcomm) { res[0], res[1], res[2] }; + return 1; +} + +int +lc_set_max(const struct adata *list, lcomm *val) +{ + if (!list) + return 0; + + u32 *l = int_set_get_data(list); + int len = int_set_get_size(list); + int i; + + if (len < 1) + return 0; + + u32 *res = l; l += 3; + for (i = 3; i < len; i += 3, l += 3) + if (lc_set_cmp(res, l) < 0) + res = l; + + *val = (lcomm) { res[0], res[1], res[2] }; + return 1; +} + +int +int_set_walk(const struct adata *list, uint *pos, uint *val) +{ + if (!list) + return 0; + + if (*pos >= (uint) int_set_get_size(list)) + return 0; + + u32 *res = int_set_get_data(list) + *pos; + *val = *res; + *pos += 1; + + return 1; +} + +int +ec_set_walk(const struct adata *list, uint *pos, u64 *val) +{ + if (!list) + return 0; + + if (*pos >= (uint) int_set_get_size(list)) + return 0; + + u32 *res = int_set_get_data(list) + *pos; + *val = ec_generic(res[0], res[1]); + *pos += 2; + + return 1; +} + +int +lc_set_walk(const struct adata *list, uint *pos, lcomm *val) +{ + if (!list) + return 0; + + if (*pos >= (uint) int_set_get_size(list)) + return 0; + + u32 *res = int_set_get_data(list) + *pos; + *val = (lcomm) { res[0], res[1], res[2] }; + *pos += 3; + + return 1; +} diff --git a/nest/a-set_test.c b/lib/a-set_test.c index efd1b67d..693b8f08 100644 --- a/nest/a-set_test.c +++ b/lib/a-set_test.c @@ -10,8 +10,8 @@ #include "test/bt-utils.h" #include "lib/net.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "lib/resource.h" #define SET_SIZE 10 @@ -25,8 +25,6 @@ static byte buf[BUFFER_SIZE] = {}; #define SET_SIZE_FOR_FORMAT_OUTPUT 10 -struct linpool *lp; - enum set_type { SET_TYPE_INT, @@ -38,24 +36,23 @@ generate_set_sequence(enum set_type type, int len) { struct adata empty_as_path = {}; set_sequence = set_sequence_same = set_sequence_higher = set_random = &empty_as_path; - lp = lp_new_default(&root_pool); int i; for (i = 0; i < len; i++) { if (type == SET_TYPE_INT) { - set_sequence = int_set_add(lp, set_sequence, i); - set_sequence_same = int_set_add(lp, set_sequence_same, i); - set_sequence_higher = int_set_add(lp, set_sequence_higher, i + SET_SIZE); - set_random = int_set_add(lp, set_random, bt_random()); + set_sequence = int_set_add(tmp_linpool, set_sequence, i); + set_sequence_same = int_set_add(tmp_linpool, set_sequence_same, i); + set_sequence_higher = int_set_add(tmp_linpool, set_sequence_higher, i + SET_SIZE); + set_random = int_set_add(tmp_linpool, set_random, bt_random()); } else if (type == SET_TYPE_EC) { - set_sequence = ec_set_add(lp, set_sequence, i); - set_sequence_same = ec_set_add(lp, set_sequence_same, i); - set_sequence_higher = ec_set_add(lp, set_sequence_higher, i + SET_SIZE); - set_random = ec_set_add(lp, set_random, (bt_random() << 32 | bt_random())); + set_sequence = ec_set_add(tmp_linpool, set_sequence, i); + set_sequence_same = ec_set_add(tmp_linpool, set_sequence_same, i); + set_sequence_higher = ec_set_add(tmp_linpool, set_sequence_higher, i + SET_SIZE); + set_random = ec_set_add(tmp_linpool, set_random, (bt_random() << 32 | bt_random())); } else bt_abort_msg("This should be unreachable"); @@ -71,7 +68,6 @@ t_set_int_contains(void) { int i; - resource_init(); generate_set_sequence(SET_TYPE_INT, SET_SIZE); bt_assert(int_set_get_size(set_sequence) == SET_SIZE); @@ -85,33 +81,29 @@ t_set_int_contains(void) for (i = 0; i < SET_SIZE; i++) bt_assert_msg(data[i] == i, "(data[i] = %d) == i = %d)", data[i], i); - rfree(lp); return 1; } static int t_set_int_union(void) { - resource_init(); generate_set_sequence(SET_TYPE_INT, SET_SIZE); const struct adata *set_union; - set_union = int_set_union(lp, set_sequence, set_sequence_same); + set_union = int_set_union(tmp_linpool, set_sequence, set_sequence_same); bt_assert(int_set_get_size(set_union) == SET_SIZE); bt_assert(int_set_format(set_union, 0, 2, buf, BUFFER_SIZE) == 0); - set_union = int_set_union(lp, set_sequence, set_sequence_higher); + set_union = int_set_union(tmp_linpool, set_sequence, set_sequence_higher); bt_assert_msg(int_set_get_size(set_union) == SET_SIZE*2, "int_set_get_size(set_union) %d, SET_SIZE*2 %d", int_set_get_size(set_union), SET_SIZE*2); bt_assert(int_set_format(set_union, 0, 2, buf, BUFFER_SIZE) == 0); - rfree(lp); return 1; } static int t_set_int_format(void) { - resource_init(); generate_set_sequence(SET_TYPE_INT, SET_SIZE_FOR_FORMAT_OUTPUT); bt_assert(int_set_format(set_sequence, 0, 0, buf, BUFFER_SIZE) == 0); @@ -125,21 +117,19 @@ t_set_int_format(void) bt_assert(int_set_format(set_sequence, 1, 0, buf, BUFFER_SIZE) == 0); bt_assert(strcmp(buf, "(0,0) (0,1) (0,2) (0,3) (0,4) (0,5) (0,6) (0,7) (0,8) (0,9)") == 0); - rfree(lp); return 1; } static int t_set_int_delete(void) { - resource_init(); generate_set_sequence(SET_TYPE_INT, SET_SIZE); const struct adata *deleting_sequence = set_sequence; u32 i; for (i = 0; i < SET_SIZE; i++) { - deleting_sequence = int_set_del(lp, deleting_sequence, i); + deleting_sequence = int_set_del(tmp_linpool, deleting_sequence, i); bt_assert_msg(int_set_get_size(deleting_sequence) == (int) (SET_SIZE-1-i), "int_set_get_size(deleting_sequence) %d == SET_SIZE-1-i %d", int_set_get_size(deleting_sequence), @@ -160,7 +150,6 @@ t_set_ec_contains(void) { u32 i; - resource_init(); generate_set_sequence(SET_TYPE_EC, SET_SIZE); bt_assert(ec_set_get_size(set_sequence) == SET_SIZE); @@ -174,62 +163,54 @@ t_set_ec_contains(void) // for (i = 0; i < SET_SIZE; i++) // bt_assert_msg(data[i] == (SET_SIZE-1-i), "(data[i] = %d) == ((SET_SIZE-1-i) = %d)", data[i], SET_SIZE-1-i); - rfree(lp); return 1; } static int t_set_ec_union(void) { - resource_init(); generate_set_sequence(SET_TYPE_EC, SET_SIZE); const struct adata *set_union; - set_union = ec_set_union(lp, set_sequence, set_sequence_same); + set_union = ec_set_union(tmp_linpool, set_sequence, set_sequence_same); bt_assert(ec_set_get_size(set_union) == SET_SIZE); bt_assert(ec_set_format(set_union, 0, buf, BUFFER_SIZE) == 0); - set_union = ec_set_union(lp, set_sequence, set_sequence_higher); + set_union = ec_set_union(tmp_linpool, set_sequence, set_sequence_higher); bt_assert_msg(ec_set_get_size(set_union) == SET_SIZE*2, "ec_set_get_size(set_union) %d, SET_SIZE*2 %d", ec_set_get_size(set_union), SET_SIZE*2); bt_assert(ec_set_format(set_union, 0, buf, BUFFER_SIZE) == 0); - rfree(lp); return 1; } static int t_set_ec_format(void) { - resource_init(); - const struct adata empty_as_path = {}; set_sequence = set_sequence_same = set_sequence_higher = set_random = &empty_as_path; - lp = lp_new_default(&root_pool); u64 i = 0; - set_sequence = ec_set_add(lp, set_sequence, i); + set_sequence = ec_set_add(tmp_linpool, set_sequence, i); for (i = 1; i < SET_SIZE_FOR_FORMAT_OUTPUT; i++) - set_sequence = ec_set_add(lp, set_sequence, i + ((i%2) ? ((u64)EC_RO << 48) : ((u64)EC_RT << 48))); + set_sequence = ec_set_add(tmp_linpool, set_sequence, i + ((i%2) ? ((u64)EC_RO << 48) : ((u64)EC_RT << 48))); bt_assert(ec_set_format(set_sequence, 0, buf, BUFFER_SIZE) == 0); bt_assert_msg(strcmp(buf, "(unknown 0x0, 0, 0) (ro, 0, 1) (rt, 0, 2) (ro, 0, 3) (rt, 0, 4) (ro, 0, 5) (rt, 0, 6) (ro, 0, 7) (rt, 0, 8) (ro, 0, 9)") == 0, "ec_set_format() returns '%s'", buf); - rfree(lp); return 1; } static int t_set_ec_delete(void) { - resource_init(); generate_set_sequence(SET_TYPE_EC, SET_SIZE); const struct adata *deleting_sequence = set_sequence; u32 i; for (i = 0; i < SET_SIZE; i++) { - deleting_sequence = ec_set_del(lp, deleting_sequence, i); + deleting_sequence = ec_set_del(tmp_linpool, deleting_sequence, i); bt_assert_msg(ec_set_get_size(deleting_sequence) == (int) (SET_SIZE-1-i), "ec_set_get_size(deleting_sequence) %d == SET_SIZE-1-i %d", ec_set_get_size(deleting_sequence), SET_SIZE-1-i); @@ -241,13 +222,10 @@ t_set_ec_delete(void) } -void resource_sys_init(void); - int main(int argc, char *argv[]) { bt_init(argc, argv); - resource_sys_init(); bt_test_suite(t_set_int_contains, "Testing sets of integers: contains, get_data"); bt_test_suite(t_set_int_format, "Testing sets of integers: format"); diff --git a/nest/attrs.h b/lib/attrs.h index 50da817b..af2f1036 100644 --- a/nest/attrs.h +++ b/lib/attrs.h @@ -11,7 +11,39 @@ #include <stdint.h> #include "lib/unaligned.h" -#include "nest/route.h" + +typedef struct adata { + uint length; /* Length of data */ + byte data[0]; +} adata; + +#define ADATA_SIZE(s) BIRD_CPU_ALIGN(sizeof(struct adata) + s) + +extern const adata null_adata; /* adata of length 0 */ + +static inline struct adata * +lp_alloc_adata(struct linpool *pool, uint len) +{ + struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len); + ad->length = len; + return ad; +} + +static inline struct adata * +lp_store_adata(struct linpool *pool, const void *buf, uint len) +{ + struct adata *ad = lp_alloc_adata(pool, len); + memcpy(ad->data, buf, len); + return ad; +} + +#define tmp_alloc_adata(len) lp_alloc_adata(tmp_linpool, len) +#define tmp_store_adata(buf, len) lp_store_adata(tmp_linpool, buf, len) +#define tmp_copy_adata(ad) tmp_store_adata((ad)->data, (ad)->length) + +static inline int adata_same(const struct adata *a, const struct adata *b) +{ return (!a && !b) || (a->length == b->length && !memcmp(a->data, b->data, a->length)); } + /* a-path.c */ @@ -28,6 +60,7 @@ * to 16bit slot (like in 16bit AS_PATH). See RFC 4893 for details */ +struct f_val; struct f_tree; int as_path_valid(byte *data, uint len, int bs, int sets, int confed, char *err, uint elen); @@ -49,7 +82,8 @@ int as_path_get_last(const struct adata *path, u32 *last_as); u32 as_path_get_last_nonaggregated(const struct adata *path); int as_path_contains(const struct adata *path, u32 as, int min); int as_path_match_set(const struct adata *path, const struct f_tree *set); -const struct adata *as_path_filter(struct linpool *pool, const struct adata *path, const struct f_tree *set, u32 key, int pos); +const struct adata *as_path_filter(struct linpool *pool, const struct adata *path, const struct f_val *set, int pos); +int as_path_walk(const struct adata *path, uint *pos, uint *val); static inline struct adata *as_path_prepend(struct linpool *pool, const struct adata *path, u32 as) { return as_path_prepend2(pool, path, AS_PATH_SEQUENCE, as); } @@ -218,6 +252,15 @@ struct adata *ec_set_del_nontrans(struct linpool *pool, const struct adata *set) struct adata *int_set_sort(struct linpool *pool, const struct adata *src); struct adata *ec_set_sort(struct linpool *pool, const struct adata *src); struct adata *lc_set_sort(struct linpool *pool, const struct adata *src); +int int_set_min(const struct adata *list, u32 *val); +int ec_set_min(const struct adata *list, u64 *val); +int lc_set_min(const struct adata *list, lcomm *val); +int int_set_max(const struct adata *list, u32 *val); +int ec_set_max(const struct adata *list, u64 *val); +int lc_set_max(const struct adata *list, lcomm *val); +int int_set_walk(const struct adata *list, uint *pos, u32 *val); +int ec_set_walk(const struct adata *list, uint *pos, u64 *val); +int lc_set_walk(const struct adata *list, uint *pos, lcomm *val); void ec_set_sort_x(struct adata *set); /* Sort in place */ diff --git a/lib/birdlib.h b/lib/birdlib.h index 385bf75c..d743ecdf 100644 --- a/lib/birdlib.h +++ b/lib/birdlib.h @@ -14,12 +14,25 @@ /* Ugly structure offset handling macros */ -struct align_probe { char x; long int y; }; - +#define SAME_TYPE(a, b) ({ int _ = ((a) != (b)); !_; }) #define OFFSETOF(s, i) ((size_t) &((s *)0)->i) -#define SKIP_BACK(s, i, p) ({ s *_ptr = ((s *)((char *)p - OFFSETOF(s, i))); ASSERT_DIE(&_ptr->i == p); _ptr; }) +#define SKIP_BACK(s, i, p) ({ s *_ptr = ((s *)((char *)p - OFFSETOF(s, i))); SAME_TYPE(&_ptr->i, p); _ptr; }) #define BIRD_ALIGN(s, a) (((s)+a-1)&~(a-1)) -#define CPU_STRUCT_ALIGN (sizeof(struct align_probe)) +#define CPU_STRUCT_ALIGN (MAX_(_Alignof(void*), _Alignof(u64))) +#define BIRD_CPU_ALIGN(s) BIRD_ALIGN((s), CPU_STRUCT_ALIGN) + +/* Structure item alignment macros */ + +#define PADDING_NAME(id) _padding_##id +#define PADDING_(id, sz) u8 PADDING_NAME(id)[sz] + +#if CPU_POINTER_ALIGNMENT == 4 +#define PADDING(id, n32, n64) PADDING_(id, n32) +#elif CPU_POINTER_ALIGNMENT == 8 +#define PADDING(id, n32, n64) PADDING_(id, n64) +#else +#error "Strange CPU pointer alignment: " CPU_POINTER_ALIGNMENT +#endif /* Utility macros */ @@ -33,6 +46,9 @@ struct align_probe { char x; long int y; }; #define MAX(a,b) MAX_(a,b) #endif +#define ROUND_DOWN_POW2(a,b) ((a) & ~((b)-1)) +#define ROUND_UP_POW2(a,b) (((a)+((b)-1)) & ~((b)-1)) + #define U64(c) UINT64_C(c) #define ABS(a) ((a)>=0 ? (a) : -(a)) #define DELTA(a,b) (((a)>=(b))?(a)-(b):(b)-(a)) @@ -160,8 +176,13 @@ void debug(const char *msg, ...); /* Printf to debug output */ #if defined(LOCAL_DEBUG) || defined(GLOBAL_DEBUG) #define DBG(x, y...) debug(x, ##y) +#define DBGL(x, y...) debug(x "\n", ##y) +#elif defined(DEBUG_TO_LOG) +#define DBG(...) do { } while (0) +#define DBGL(...) log(L_DEBUG __VA_ARGS__) #else -#define DBG(x, y...) do { } while(0) +#define DBG(...) do { } while(0) +#define DBGL(...) do { } while (0) #endif #define ASSERT_DIE(x) do { if (!(x)) bug("Assertion '%s' failed at %s:%d", #x, __FILE__, __LINE__); } while(0) diff --git a/lib/bitmap_test.c b/lib/bitmap_test.c index 0595a4d0..07860c94 100644 --- a/lib/bitmap_test.c +++ b/lib/bitmap_test.c @@ -24,7 +24,6 @@ t_bmap_set_clear_random(void) { struct bmap b; - resource_init(); bmap_init(&b, &root_pool, 1024); char expected[MAX_NUM] = {}; @@ -60,7 +59,6 @@ t_hmap_set_clear_random(void) { struct hmap b; - resource_init(); hmap_init(&b, &root_pool, 1024); char expected[MAX_NUM] = {}; @@ -119,7 +117,6 @@ t_hmap_set_clear_fill(void) { struct hmap b; - resource_init(); hmap_init(&b, &root_pool, 1024); char expected[MAX_NUM] = {}; diff --git a/lib/buffer_test.c b/lib/buffer_test.c index 5b7de330..0629e901 100644 --- a/lib/buffer_test.c +++ b/lib/buffer_test.c @@ -41,7 +41,6 @@ fill_expected_array(void) static void init_buffer(void) { - resource_init(); buffer_pool = &root_pool; BUFFER_INIT(buf, buffer_pool, MAX_NUM); } diff --git a/lib/coro.h b/lib/coro.h deleted file mode 100644 index b36f1d2c..00000000 --- a/lib/coro.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * BIRD Coroutines - * - * (c) 2017 Martin Mares <mj@ucw.cz> - * (c) 2020-2021 Maria Matejka <mq@jmq.cz> - * - * Can be freely distributed and used under the terms of the GNU GPL. - */ - -#ifndef _BIRD_CORO_H_ -#define _BIRD_CORO_H_ - -#include "lib/resource.h" - -/* A completely opaque coroutine handle. */ -struct coroutine; - -/* Coroutines are independent threads bound to pools. - * You request a coroutine by calling coro_run(). - * It is forbidden to free a running coroutine from outside. - * The running coroutine must free itself by rfree() before returning. - */ -struct coroutine *coro_run(pool *, void (*entry)(void *), void *data); - -/* Get self. */ -extern _Thread_local struct coroutine *this_coro; - -/* Just wait for a little while. Not intended for general use; use events if possible. */ -void coro_yield(void); - -#endif diff --git a/lib/event.c b/lib/event.c index 5031f314..68ee4c06 100644 --- a/lib/event.c +++ b/lib/event.c @@ -23,27 +23,148 @@ #include "nest/bird.h" #include "lib/event.h" -#include "lib/locking.h" #include "lib/io-loop.h" -extern _Thread_local struct coroutine *this_coro; - event_list global_event_list; event_list global_work_list; +//#ifdef DEBUGGING +#if 0 +#define EDL_MAX 16384 +enum edl_caller { + EDL_REMOVE_FROM = 1, + EDL_POSTPONE = 2, + EDL_RUN = 3, + EDL_SEND = 4, + EDL_RUN_LIST = 5, +} caller; +static struct event_debug_log { + event_list *target_list; + event *event; + event *receiver; + uint pos; + uint prev_edl_pos; + uint thread; + enum edl_caller caller; +} edl[EDL_MAX]; +static _Atomic uint edl_cnt; +_Thread_local static uint edl_thread; +_Thread_local static uint prev_edl_pos = ~0; +static inline void edlog(event_list *list, event *e, event *receiver, uint pos, enum edl_caller caller) +{ + uint edl_pos = atomic_fetch_add_explicit(&edl_cnt, 1, memory_order_acq_rel); + if (!edl_thread) + edl_thread = edl_pos; + + edl[edl_pos % EDL_MAX] = (struct event_debug_log) { + .target_list = list, + .event = e, + .receiver = receiver, + .pos = pos, + .prev_edl_pos = prev_edl_pos, + .thread = edl_thread, + .caller = caller, + }; + + prev_edl_pos = edl_pos; +} +#else +#define edlog(...) +#endif + + +void +ev_init_list(event_list *el, struct birdloop *loop, const char *name) +{ + el->name = name; + el->loop = loop; + + atomic_store_explicit(&el->receiver, NULL, memory_order_release); + atomic_store_explicit(&el->_executor, NULL, memory_order_release); +} + +/* + * The event list should work as a message passing point. Sending a message + * must be a fairly fast process with no locks and low waiting times. OTOH, + * processing messages always involves running the assigned code and the + * receiver is always a single one thread with no concurrency at all. There is + * also a postponing requirement to synchronously remove an event from a queue, + * yet we allow this only when the caller has its receiver event loop locked. + * It still means that the event may get postponed from other event in the same + * list, therefore we have to be careful. + */ + +static inline int +ev_remove_from(event *e, event * _Atomic * head) +{ + /* The head pointer stores where cur is pointed to from */ + event * _Atomic *prev = head; + + /* The current event in queue to check */ + event *cur = atomic_load_explicit(prev, memory_order_acquire); + + /* This part of queue is empty! */ + if (!cur) + return 0; + + edlog(NULL, e, cur, 1, EDL_REMOVE_FROM); + while (cur) + { + /* Pre-loaded next pointer */ + event *next = atomic_load_explicit(&cur->next, memory_order_acquire); + + if (e == cur) + { + edlog(NULL, e, next, 3, EDL_REMOVE_FROM); + + /* Check whether we have collided with somebody else + * adding an item to the queue. */ + if (!atomic_compare_exchange_strong_explicit( + prev, &cur, next, + memory_order_acq_rel, memory_order_acquire)) + { + /* This may happen only on list head */ + ASSERT_DIE(prev == head); + + /* Restart. The collision should never happen again. */ + return ev_remove_from(e, head); + } + + /* Successfully removed from the list; inactivate this event. */ + atomic_store_explicit(&cur->next, NULL, memory_order_release); + return 1; + } + + edlog(NULL, e, next, 2, EDL_REMOVE_FROM); + + /* Go to the next event. */ + prev = &cur->next; + cur = next; + } + + edlog(NULL, e, cur, 4, EDL_REMOVE_FROM); + + return 0; +} + inline void ev_postpone(event *e) { - event_list *el = e->list; - if (!el) + /* Find the list to remove the event from */ + event_list *sl = ev_get_list(e); + edlog(sl, e, NULL, 1, EDL_POSTPONE); + if (!sl) return; - ASSERT_DIE(birdloop_inside(el->loop)); + /* Postponing allowed only from the target loop */ + ASSERT_DIE(birdloop_inside(sl->loop)); + + /* Remove from one of these lists. */ + ASSERT(ev_remove_from(e, &sl->_executor) || ev_remove_from(e, &sl->receiver)); - LOCK_DOMAIN(event, el->lock); - if (ev_active(e)) - rem_node(&e->n); - UNLOCK_DOMAIN(event, el->lock); + /* Mark as inactive */ + ASSERT_DIE(sl == atomic_exchange_explicit(&e->list, NULL, memory_order_acq_rel)); + edlog(sl, e, NULL, 2, EDL_POSTPONE); } static void @@ -54,7 +175,7 @@ ev_dump(resource *r) debug("(code %p, data %p, %s)\n", e->hook, e->data, - e->n.next ? "scheduled" : "inactive"); + atomic_load_explicit(&e->next, memory_order_relaxed) ? "scheduled" : "inactive"); } static struct resclass ev_class = { @@ -93,8 +214,10 @@ ev_new(pool *p) inline void ev_run(event *e) { + edlog(NULL, e, NULL, 1, EDL_RUN); ev_postpone(e); e->hook(e->data); + edlog(NULL, e, NULL, 2, EDL_RUN); } /** @@ -108,48 +231,37 @@ ev_run(event *e) inline void ev_send(event_list *l, event *e) { - DBG("ev_send(%p, %p)\n", l, e); - ASSERT_DIE(e->hook); - ASSERT_DIE(!e->list || (e->list == l) || (e->list->loop == l->loop)); - - e->list = l; - - struct event_cork *ec = e->cork; - - uint ping = 0; - - if (ec) - { - LOCK_DOMAIN(cork, ec->lock); - LOCK_DOMAIN(event, l->lock); - - if (!enlisted(&e->n)) - if (ec->count) - add_tail(&ec->events, &e->n); - else - { - add_tail(&l->events, &e->n); - ping = 1; - } - - UNLOCK_DOMAIN(event, l->lock); - UNLOCK_DOMAIN(cork, ec->lock); - } - else - { - LOCK_DOMAIN(event, l->lock); - - if (!enlisted(&e->n)) + edlog(l, e, NULL, 1, EDL_SEND); + /* Set the target list */ + event_list *ol = NULL; + if (!atomic_compare_exchange_strong_explicit( + &e->list, &ol, l, + memory_order_acq_rel, memory_order_acquire)) + if (ol == l) + return; + else + bug("Queuing an already queued event to another queue is not supported."); + + /* Here should be no concurrent senders */ + event *next = atomic_load_explicit(&l->receiver, memory_order_acquire); + edlog(l, e, next, 2, EDL_SEND); + event *old_next = NULL; + do + if (!atomic_compare_exchange_strong_explicit( + &e->next, &old_next, next, + memory_order_acq_rel, memory_order_acquire)) + bug("Event %p in inconsistent state"); + else { - add_tail(&l->events, &e->n); - ping = 1; + old_next = next; + edlog(l, old_next, next, 3, EDL_SEND); } + while (!atomic_compare_exchange_strong_explicit( + &l->receiver, &next, e, + memory_order_acq_rel, memory_order_acquire)); - UNLOCK_DOMAIN(event, l->lock); - } - - if (ping) - birdloop_ping(l->loop); + edlog(l, e, next, 4, EDL_SEND); + birdloop_ping(l->loop); } void io_log_event(void *hook, void *data); @@ -161,128 +273,66 @@ void io_log_event(void *hook, void *data); * This function calls ev_run() for all events enqueued in the list @l. */ int -ev_run_list(event_list *l) -{ - const _Bool legacy = LEGACY_EVENT_LIST(l); - - if (legacy) - ASSERT_THE_BIRD_LOCKED; - - node *n; - - list tmp_list; - init_list(&tmp_list); - - /* Move the event list contents to a local list to avoid executing repeatedly added events */ - LOCK_DOMAIN(event, l->lock); - add_tail_list(&tmp_list, &l->events); - init_list(&l->events); - UNLOCK_DOMAIN(event, l->lock); - - WALK_LIST_FIRST(n, tmp_list) - { - event *e = SKIP_BACK(event, n, n); - - if (legacy) - { - /* The legacy way of event execution */ - io_log_event(e->hook, e->data); - ev_postpone(e); - e->hook(e->data); - } - else - { - // io_log_event(e->hook, e->data); /* TODO: add support for event logging in other io loops */ - ASSERT_DIE(e->list == l); - LOCK_DOMAIN(event, l->lock); - rem_node(&e->n); - UNLOCK_DOMAIN(event, l->lock); - e->hook(e->data); - } - } - - LOCK_DOMAIN(event, l->lock); - int repeat = ! EMPTY_LIST(l->events); - UNLOCK_DOMAIN(event, l->lock); - return repeat; -} - -int ev_run_list_limited(event_list *l, uint limit) { - ASSERT_DIE(LEGACY_EVENT_LIST(l)); - ASSERT_THE_BIRD_LOCKED; + event * _Atomic *ep = &l->_executor; + edlog(l, NULL, NULL, 1, EDL_RUN_LIST); - node *n; - list tmp_list; - - LOCK_DOMAIN(event, l->lock); - init_list(&tmp_list); - add_tail_list(&tmp_list, &l->events); - init_list(&l->events); - UNLOCK_DOMAIN(event, l->lock); - - WALK_LIST_FIRST(n, tmp_list) - { - event *e = SKIP_BACK(event, n, n); - - if (!limit) - break; - - io_log_event(e->hook, e->data); - - ev_run(e); - limit--; - } - - LOCK_DOMAIN(event, l->lock); - if (!EMPTY_LIST(tmp_list)) + /* No pending events, refill the queue. */ + if (!atomic_load_explicit(ep, memory_order_acquire)) { - /* Attach new items after the unprocessed old items */ - add_tail_list(&tmp_list, &l->events); - init_list(&l->events); - add_tail_list(&l->events, &tmp_list); - } + /* Move the current event list aside and create a new one. */ + event *received = atomic_exchange_explicit(&l->receiver, NULL, memory_order_acq_rel); + edlog(l, NULL, received, 2, EDL_RUN_LIST); - int repeat = ! EMPTY_LIST(l->events); - UNLOCK_DOMAIN(event, l->lock); + /* No event to run. */ + if (!received) + return 0; - return repeat; -} + /* Setup the executor queue */ + event *head = NULL; -void ev_cork(struct event_cork *ec) -{ - LOCK_DOMAIN(cork, ec->lock); - ec->count++; - UNLOCK_DOMAIN(cork, ec->lock); -} - -void ev_uncork(struct event_cork *ec) -{ - LOCK_DOMAIN(cork, ec->lock); + /* Flip the order of the events by relinking them one by one (push-pop) */ + while (received) + { + event *cur = received; + received = atomic_exchange_explicit(&cur->next, head, memory_order_acq_rel); + edlog(l, head, received, 3, EDL_RUN_LIST); + head = cur; + } - if (--ec->count) - { - UNLOCK_DOMAIN(cork, ec->lock); - return; + /* Store the executor queue to its designated place */ + ASSERT_DIE(atomic_exchange_explicit(ep, head, memory_order_acq_rel) == NULL); + edlog(l, NULL, head, 4, EDL_RUN_LIST); } - node *n; - WALK_LIST_FIRST(n, ec->events) + /* Run the events in order. */ + event *e; + while (e = atomic_load_explicit(ep, memory_order_acquire)) { - event *e = SKIP_BACK(event, n, n); - event_list *el = e->list; + edlog(l, e, NULL, 5, EDL_RUN_LIST); + /* Check limit */ + if (!--limit) + return 1; - rem_node(&e->n); + /* This is ugly hack, we want to log just events executed from the main I/O loop */ + if ((l == &global_event_list) || (l == &global_work_list)) + io_log_event(e->hook, e->data); - LOCK_DOMAIN(event, el->lock); - add_tail(&el->events, &e->n); - UNLOCK_DOMAIN(event, el->lock); + edlog(l, e, NULL, 6, EDL_RUN_LIST); + /* Inactivate the event */ + event *next = atomic_load_explicit(&e->next, memory_order_relaxed); + ASSERT_DIE(e == atomic_exchange_explicit(ep, next, memory_order_acq_rel)); + ASSERT_DIE(next == atomic_exchange_explicit(&e->next, NULL, memory_order_acq_rel)); + ASSERT_DIE(l == atomic_exchange_explicit(&e->list, NULL, memory_order_acq_rel)); + edlog(l, e, next, 7, EDL_RUN_LIST); - birdloop_ping(el->loop); - } + /* Run the event */ + e->hook(e->data); + tmp_flush(); - UNLOCK_DOMAIN(cork, ec->lock); + edlog(l, e, next, 8, EDL_RUN_LIST); + } - birdloop_ping(&main_birdloop); + return !!atomic_load_explicit(&l->receiver, memory_order_acquire); } diff --git a/lib/event.h b/lib/event.h index cd85bf78..0bef737a 100644 --- a/lib/event.h +++ b/lib/event.h @@ -11,91 +11,56 @@ #include "lib/resource.h" #include "lib/locking.h" +#include "lib/rcu.h" #include <stdatomic.h> -DEFINE_DOMAIN(event); -DEFINE_DOMAIN(cork); +struct birdloop; typedef struct event { resource r; void (*hook)(void *); void *data; - node n; /* Internal link */ - struct event_list *list; /* List where this event is put in */ - struct event_cork *cork; /* Event execution limiter */ - node cork_node; + struct event * _Atomic next; + struct event_list * _Atomic list; } event; typedef struct event_list { - list events; - pool *pool; - struct birdloop *loop; - DOMAIN(event) lock; + event * _Atomic receiver; /* Event receive list */ + event * _Atomic _executor; /* Event execute list */ + const char *name; + struct birdloop *loop; /* The executor loop */ } event_list; -struct event_cork { - DOMAIN(cork) lock; - u32 count; - list events; -}; - extern event_list global_event_list; extern event_list global_work_list; event *ev_new(pool *); void ev_run(event *); - -static inline void ev_init_list(event_list *el, struct birdloop *loop, const char *name) -{ - init_list(&el->events); - el->loop = loop; - el->lock = DOMAIN_NEW(event, name); -} - -static inline void ev_init_cork(struct event_cork *ec, const char *name) -{ - init_list(&ec->events); - ec->lock = DOMAIN_NEW(cork, name); - ec->count = 0; -}; - -void ev_send(event_list *, event *); +void ev_init_list(event_list *, struct birdloop *loop, const char *name); +void ev_enqueue(event_list *, event *); +#define ev_send ev_enqueue #define ev_send_loop(l, e) ev_send(birdloop_event_list((l)), (e)) #define ev_schedule(e) ({ ASSERT_THE_BIRD_LOCKED; if (!ev_active((e))) ev_send(&global_event_list, (e)); }) #define ev_schedule_work(e) ({ ASSERT_THE_BIRD_LOCKED; if (!ev_active((e))) ev_send(&global_work_list, (e)); }) void ev_postpone(event *); -int ev_run_list(event_list *); int ev_run_list_limited(event_list *, uint); +#define ev_run_list(l) ev_run_list_limited((l), ~0) #define LEGACY_EVENT_LIST(l) (((l) == &global_event_list) || ((l) == &global_work_list)) -void ev_cork(struct event_cork *); -void ev_uncork(struct event_cork *); - -static inline u32 ev_corked(struct event_cork *ec) -{ - if (!ec) - return 0; - - LOCK_DOMAIN(cork, ec->lock); - u32 out = ec->count; - UNLOCK_DOMAIN(cork, ec->lock); - return out; -} - -_Bool birdloop_inside(struct birdloop *loop); - static inline int ev_active(event *e) { - if (e->list == NULL) - return 0; + return atomic_load_explicit(&e->list, memory_order_acquire) != NULL; +} - ASSERT_DIE(birdloop_inside(e->list->loop)); - return enlisted(&e->n); +static inline event_list * +ev_get_list(event *e) +{ + return atomic_load_explicit(&e->list, memory_order_acquire); } static inline event* diff --git a/lib/event_test.c b/lib/event_test.c index 9dda3e2a..612deb25 100644 --- a/lib/event_test.c +++ b/lib/event_test.c @@ -15,7 +15,7 @@ #include "nest/locks.h" #include "sysdep/unix/unix.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #define MAX_NUM 4 @@ -48,19 +48,14 @@ init_event_check_points(void) event_check_points[i] = 0; } -void resource_sys_init(void); - static int t_ev_run_list(void) { int i; - resource_sys_init(); - resource_init(); olock_init(); - birdloop_init(); - io_init(); rt_init(); + io_init(); if_init(); // roa_init(); config_init(); @@ -85,9 +80,7 @@ main(int argc, char *argv[]) { bt_init(argc, argv); - the_bird_lock(); bt_test_suite(t_ev_run_list, "Schedule and run 3 events in right order."); - the_bird_unlock(); return bt_exit_value(); } diff --git a/lib/fib.h b/lib/fib.h new file mode 100644 index 00000000..bec2a8d4 --- /dev/null +++ b/lib/fib.h @@ -0,0 +1,119 @@ +/* + * BIRD Internet Routing Daemon -- Network prefix storage + * + * (c) 1998--2000 Martin Mares <mj@ucw.cz> + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LIB_FIB_H_ +#define _BIRD_LIB_FIB_H_ + +/* + * BIRD FIBs are generic data structure for storing network prefixes. + * Also used for the master routing table. Currently implemented as + * a hash table. + * + * Available operations: + * - insertion of new entry + * - deletion of entry + * - searching for entry by network prefix + * - asynchronous retrieval of fib contents + */ + +struct fib; + +struct fib_node { + struct fib_node *next; /* Next in hash chain */ + struct fib_iterator *readers; /* List of readers of this node */ + net_addr addr[0]; +}; + +struct fib_iterator { /* See lib/slists.h for an explanation */ + struct fib_iterator *prev, *next; /* Must be synced with struct fib_node! */ + byte efef; /* 0xff to distinguish between iterator and node */ + byte pad[3]; + struct fib_node *node; /* Or NULL if freshly merged */ + uint hash; +}; + +typedef void (*fib_init_fn)(struct fib *, void *); + +struct fib { + pool *fib_pool; /* Pool holding all our data */ + slab *fib_slab; /* Slab holding all fib nodes */ + struct fib_node **hash_table; /* Node hash table */ + uint hash_size; /* Number of hash table entries (a power of two) */ + uint hash_order; /* Binary logarithm of hash_size */ + uint hash_shift; /* 32 - hash_order */ + uint addr_type; /* Type of address data stored in fib (NET_*) */ + uint node_size; /* FIB node size, 0 for nonuniform */ + uint node_offset; /* Offset of fib_node struct inside of user data */ + uint entries; /* Number of entries */ + uint entries_min, entries_max; /* Entry count limits (else start rehashing) */ + fib_init_fn init; /* Constructor */ +}; + +static inline void * fib_node_to_user(struct fib *f, struct fib_node *e) +{ return e ? (void *) ((char *) e - f->node_offset) : NULL; } + +static inline struct fib_node * fib_user_to_node(struct fib *f, void *e) +{ return e ? (void *) ((char *) e + f->node_offset) : NULL; } + +void fib_init(struct fib *f, pool *p, uint addr_type, uint node_size, uint node_offset, uint hash_order, fib_init_fn init); +void *fib_find(struct fib *, const net_addr *); /* Find or return NULL if doesn't exist */ +void *fib_get_chain(struct fib *f, const net_addr *a); /* Find first node in linked list from hash table */ +void *fib_get(struct fib *, const net_addr *); /* Find or create new if nonexistent */ +void *fib_route(struct fib *, const net_addr *); /* Longest-match routing lookup */ +void fib_delete(struct fib *, void *); /* Remove fib entry */ +void fib_free(struct fib *); /* Destroy the fib */ +void fib_check(struct fib *); /* Consistency check for debugging */ + +void fit_init(struct fib_iterator *, struct fib *); /* Internal functions, don't call */ +struct fib_node *fit_get(struct fib *, struct fib_iterator *); +void fit_put(struct fib_iterator *, struct fib_node *); +void fit_put_next(struct fib *f, struct fib_iterator *i, struct fib_node *n, uint hpos); +void fit_put_end(struct fib_iterator *i); +void fit_copy(struct fib *f, struct fib_iterator *dst, struct fib_iterator *src); + + +#define FIB_WALK(fib, type, z) do { \ + struct fib_node *fn_, **ff_ = (fib)->hash_table; \ + uint count_ = (fib)->hash_size; \ + type *z; \ + while (count_--) \ + for (fn_ = *ff_++; z = fib_node_to_user(fib, fn_); fn_=fn_->next) + +#define FIB_WALK_END } while (0) + +#define FIB_ITERATE_INIT(it, fib) fit_init(it, fib) + +#define FIB_ITERATE_START(fib, it, type, z) do { \ + struct fib_node *fn_ = fit_get(fib, it); \ + uint count_ = (fib)->hash_size; \ + uint hpos_ = (it)->hash; \ + type *z; \ + for(;;) { \ + if (!fn_) \ + { \ + if (++hpos_ >= count_) \ + break; \ + fn_ = (fib)->hash_table[hpos_]; \ + continue; \ + } \ + z = fib_node_to_user(fib, fn_); + +#define FIB_ITERATE_END fn_ = fn_->next; } } while(0) + +#define FIB_ITERATE_PUT(it) fit_put(it, fn_) + +#define FIB_ITERATE_PUT_NEXT(it, fib) fit_put_next(fib, it, fn_, hpos_) + +#define FIB_ITERATE_PUT_END(it) fit_put_end(it) + +#define FIB_ITERATE_UNLINK(it, fib) fit_get(fib, it) + +#define FIB_ITERATE_COPY(dst, src, fib) fit_copy(fib, dst, src) + +#endif diff --git a/lib/flowspec_test.c b/lib/flowspec_test.c index f7f70982..03649b99 100644 --- a/lib/flowspec_test.c +++ b/lib/flowspec_test.c @@ -446,10 +446,7 @@ t_validation6(void) static int t_builder4(void) { - resource_init(); - struct flow_builder *fb = flow_builder_init(&root_pool); - linpool *lp = lp_new_default(&root_pool); /* Expectation */ @@ -492,7 +489,7 @@ t_builder4(void) flow_builder_set_type(fb, FLOW_TYPE_TCP_FLAGS); flow_builder_add_op_val(fb, 0, 0x55); - net_addr_flow4 *res = flow_builder4_finalize(fb, lp); + net_addr_flow4 *res = flow_builder4_finalize(fb, tmp_linpool); bt_assert(memcmp(res, expect, expect->length) == 0); @@ -529,8 +526,6 @@ t_builder6(void) { net_addr_ip6 ip; - resource_init(); - linpool *lp = lp_new_default(&root_pool); struct flow_builder *fb = flow_builder_init(&root_pool); fb->ipv6 = 1; @@ -574,7 +569,7 @@ t_builder6(void) flow_builder_set_type(fb, FLOW_TYPE_LABEL); flow_builder_add_op_val(fb, 0, 0x55); - net_addr_flow6 *res = flow_builder6_finalize(fb, lp); + net_addr_flow6 *res = flow_builder6_finalize(fb, tmp_linpool); bt_assert(memcmp(res, expect, expect->length) == 0); /* Reverse order */ @@ -601,7 +596,7 @@ t_builder6(void) flow_builder_set_type(fb, FLOW_TYPE_DST_PREFIX); flow_builder6_add_pfx(fb, &ip, 61); - res = flow_builder6_finalize(fb, lp); + res = flow_builder6_finalize(fb, tmp_linpool); bt_assert(memcmp(res, expect, expect->length) == 0); return 1; @@ -666,13 +661,10 @@ t_formatting6(void) return 1; } -void resource_sys_init(void); - int main(int argc, char *argv[]) { bt_init(argc, argv); - resource_sys_init(); bt_test_suite(t_read_length, "Testing get NLRI length"); bt_test_suite(t_write_length, "Testing set NLRI length"); @@ -10,7 +10,7 @@ #ifndef _BIRD_HASH_H_ #define _BIRD_HASH_H_ -#define HASH(type) struct { type **data; uint count, order; } +#define HASH(type) struct { type **data; uint count; u16 iterators; u8 order; u8 down_requested:1; } #define HASH_TYPE(v) typeof(** (v).data) #define HASH_SIZE(v) (1U << (v).order) @@ -125,20 +125,26 @@ #define HASH_MAY_STEP_DOWN_(v,pool,rehash_fn,args) \ ({ \ - if (((v).count < (HASH_SIZE(v) REHASH_LO_MARK(args))) && \ - ((v).order > (REHASH_LO_BOUND(args)))) \ + if ((v).iterators) \ + (v).down_requested = 1; \ + else if (((v).count < (HASH_SIZE(v) REHASH_LO_MARK(args))) && \ + ((v).order > (REHASH_LO_BOUND(args)))) \ rehash_fn(&(v), pool, -(REHASH_LO_STEP(args))); \ }) #define HASH_MAY_RESIZE_DOWN_(v,pool,rehash_fn,args) \ ({ \ - uint _o = (v).order; \ - while (((v).count < ((1U << _o) REHASH_LO_MARK(args))) && \ - (_o > (REHASH_LO_BOUND(args)))) \ - _o -= (REHASH_LO_STEP(args)); \ - if (_o < (v).order) \ - rehash_fn(&(v), pool, _o - (v).order); \ - }) + if ((v).iterators) \ + (v).down_requested = 1; \ + else { \ + uint _o = (v).order; \ + while (((v).count < ((1U << _o) REHASH_LO_MARK(args))) && \ + (_o > (REHASH_LO_BOUND(args)))) \ + _o -= (REHASH_LO_STEP(args)); \ + if (_o < (v).order) \ + rehash_fn(&(v), pool, _o - (v).order); \ + } \ + }) #define HASH_INSERT2(v,id,pool,node) \ @@ -195,6 +201,20 @@ #define HASH_WALK_FILTER_END } while (0) +#define HASH_WALK_ITER(v, id, n, iter) \ + do { \ + uint _hash_walk_iter_put = 0; \ + uint _shift = 32 - (v).order; \ + for ( ; !_hash_walk_iter_put; (iter) += (1U << _shift)) { \ + _hash_walk_iter_put = ((iter) + (1U << _shift) == 0); \ + for (HASH_TYPE(v) *n = (v).data[(iter) >> _shift]; n; n = id##_NEXT((n)))\ + if (HASH_FN(v, id, id##_KEY(n)) >= ((iter) >> _shift)) \ + +#define HASH_WALK_ITER_PUT (_hash_walk_iter_put = 1) + +#define HASH_WALK_ITER_END } } while (0) + + static inline void mem_hash_init(u64 *h) { diff --git a/lib/hash_test.c b/lib/hash_test.c index 59beb7c0..ecfcdd66 100644 --- a/lib/hash_test.c +++ b/lib/hash_test.c @@ -61,7 +61,6 @@ dump_nodes(void) static void init_hash_(uint order) { - resource_init(); my_pool = rp_new(&root_pool, "Test pool"); HASH_INIT(hash, my_pool, order); @@ -286,6 +285,46 @@ t_walk_filter(void) return 1; } +static int +t_walk_iter(void) +{ + init_hash(); + fill_hash(); + + u32 hit = 0; + + u32 prev_hash = ~0; + for (uint cnt = 0; cnt < MAX_NUM; ) + { + u32 last_hash = ~0; +// printf("PUT!\n"); + HASH_WALK_ITER(hash, TEST, n, hit) + { + cnt++; + u32 cur_hash = HASH_FN(hash, TEST, n->key); + /* + printf("C%08x L%08x P%08x K%08x H%08x N%p S%d I%ld\n", + cur_hash, last_hash, prev_hash, n->key, hit, n, _shift, n - &nodes[0]); + */ + + if (last_hash == ~0U) + { + if (prev_hash != ~0U) + bt_assert(prev_hash < cur_hash); + last_hash = prev_hash = cur_hash; + } + else + bt_assert(last_hash == cur_hash); + + if (cnt < MAX_NUM) + HASH_WALK_ITER_PUT; + } + HASH_WALK_ITER_END; + } + + return 1; +} + int main(int argc, char *argv[]) { @@ -300,6 +339,7 @@ main(int argc, char *argv[]) bt_test_suite(t_walk_delsafe_remove, "HASH_WALK_DELSAFE and HASH_REMOVE"); bt_test_suite(t_walk_delsafe_remove2, "HASH_WALK_DELSAFE and HASH_REMOVE2. HASH_REMOVE2 is HASH_REMOVE and smart auto-resize function"); bt_test_suite(t_walk_filter, "HASH_WALK_FILTER"); + bt_test_suite(t_walk_iter, "HASH_WALK_ITER"); return bt_exit_value(); } diff --git a/lib/io-loop.h b/lib/io-loop.h index 25f1b2a3..ae58bbee 100644 --- a/lib/io-loop.h +++ b/lib/io-loop.h @@ -14,12 +14,12 @@ #include "lib/event.h" #include "lib/socket.h" +extern struct birdloop main_birdloop; + void sk_start(sock *s); void sk_stop(sock *s); void sk_reloop(sock *s, struct birdloop *loop); -extern struct birdloop main_birdloop; - /* Start a new birdloop owned by given pool and domain */ struct birdloop *birdloop_new(pool *p, uint order, const char *name); @@ -50,5 +50,17 @@ void birdloop_unlink(struct birdloop *loop); void birdloop_ping(struct birdloop *loop); +struct birdloop_flag_handler { + void (*hook)(struct birdloop_flag_handler *, u32 flags); + void *data; +}; + +void birdloop_flag(struct birdloop *loop, u32 flag); +void birdloop_flag_set_handler(struct birdloop *, struct birdloop_flag_handler *); + void birdloop_init(void); + +/* Yield for a little while. Use only in special cases. */ +void birdloop_yield(void); + #endif /* _BIRD_IO_LOOP_H_ */ @@ -85,25 +85,29 @@ ip4_classify(ip4_addr ad) u32 a = _I(ad); u32 b = a >> 24U; - if (b && b <= 0xdf) + if (b < 0xe0) { - if (b == 0x7f) + if (b == 0x00) /* 0.0.0.0/8 This network */ + return IADDR_INVALID; + + if (b == 0x7f) /* 127.0.0.0/8 Loopback address */ return IADDR_HOST | SCOPE_HOST; - else if ((b == 0x0a) || - ((a & 0xffff0000) == 0xc0a80000) || - ((a & 0xfff00000) == 0xac100000)) + + if ((b == 0x0a) || /* 10.0.0.0/8 Private range */ + ((a & 0xffff0000) == 0xc0a80000) || /* 192.168.0.0/16 Private range */ + ((a & 0xfff00000) == 0xac100000)) /* 172.16.0.0/12 Private range */ return IADDR_HOST | SCOPE_SITE; - else - return IADDR_HOST | SCOPE_UNIVERSE; + + return IADDR_HOST | SCOPE_UNIVERSE; } - if (b >= 0xe0 && b <= 0xef) + if (b < 0xf0) /* 224.0.0.0/4 Multicast address */ return IADDR_MULTICAST | SCOPE_UNIVERSE; - if (a == 0xffffffff) + if (a == 0xffffffff) /* 255.255.255.255 Broadcast address */ return IADDR_BROADCAST | SCOPE_LINK; - return IADDR_INVALID; + return IADDR_HOST | SCOPE_SITE; /* 240.0.0.0/4 Reserved / private */ } int @@ -279,11 +279,35 @@ static inline uint ip6_pxlen(ip6_addr a, ip6_addr b) return 32 * i + 31 - u32_log2(a.addr[i] ^ b.addr[i]); } +static inline int ip4_prefix_equal(ip4_addr a, ip4_addr b, uint n) +{ + return (_I(a) ^ _I(b)) < ((u64) 1 << (32 - n)); +} + +static inline int ip6_prefix_equal(ip6_addr a, ip6_addr b, uint n) +{ + uint n0 = n / 32; + uint n1 = n % 32; + + return + ((n0 <= 0) || (_I0(a) == _I0(b))) && + ((n0 <= 1) || (_I1(a) == _I1(b))) && + ((n0 <= 2) || (_I2(a) == _I2(b))) && + ((n0 <= 3) || (_I3(a) == _I3(b))) && + (!n1 || ((a.addr[n0] ^ b.addr[n0]) < (1u << (32 - n1)))); +} + static inline u32 ip4_getbit(ip4_addr a, uint pos) -{ return _I(a) & (0x80000000 >> pos); } +{ return (_I(a) >> (31 - pos)) & 1; } + +static inline u32 ip4_getbits(ip4_addr a, uint pos, uint n) +{ return (_I(a) >> ((32 - n) - pos)) & ((1u << n) - 1); } static inline u32 ip6_getbit(ip6_addr a, uint pos) -{ return a.addr[pos / 32] & (0x80000000 >> (pos % 32)); } +{ return (a.addr[pos / 32] >> (31 - (pos % 32))) & 0x1; } + +static inline u32 ip6_getbits(ip6_addr a, uint pos, uint n) +{ return (a.addr[pos / 32] >> ((32 - n) - (pos % 32))) & ((1u << n) - 1); } static inline u32 ip4_setbit(ip4_addr *a, uint pos) { return _I(*a) |= (0x80000000 >> pos); } @@ -297,6 +321,13 @@ static inline u32 ip4_clrbit(ip4_addr *a, uint pos) static inline u32 ip6_clrbit(ip6_addr *a, uint pos) { return a->addr[pos / 32] &= ~(0x80000000 >> (pos % 32)); } +static inline ip4_addr ip4_setbits(ip4_addr a, uint pos, uint val) +{ _I(a) |= val << (31 - pos); return a; } + +static inline ip6_addr ip6_setbits(ip6_addr a, uint pos, uint val) +{ a.addr[pos / 32] |= val << (31 - pos % 32); return a; } + + static inline ip4_addr ip4_opposite_m1(ip4_addr a) { return _MI4(_I(a) ^ 1); } @@ -331,11 +362,7 @@ static inline ip6_addr ip6_hton(ip6_addr a) static inline ip6_addr ip6_ntoh(ip6_addr a) { return _MI6(ntohl(_I0(a)), ntohl(_I1(a)), ntohl(_I2(a)), ntohl(_I3(a))); } -#define MPLS_MAX_LABEL_STACK 8 -typedef struct mpls_label_stack { - uint len; - u32 stack[MPLS_MAX_LABEL_STACK]; -} mpls_label_stack; +#define MPLS_MAX_LABEL_STACK 16 static inline int mpls_get(const char *buf, int buflen, u32 *stack) diff --git a/lib/ip_test.c b/lib/ip_test.c index 36d10d68..eee0a427 100644 --- a/lib/ip_test.c +++ b/lib/ip_test.c @@ -167,6 +167,70 @@ t_ip6_ntop(void) return bt_assert_batch(test_vectors, test_ipa_ntop, bt_fmt_ipa, bt_fmt_str); } +static int +t_ip4_prefix_equal(void) +{ + bt_assert( ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x1234ffff), 16)); + bt_assert(!ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x1234ffff), 17)); + bt_assert( ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345000), 21)); + bt_assert(!ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345000), 22)); + + bt_assert( ip4_prefix_equal(ip4_from_u32(0x00000000), ip4_from_u32(0xffffffff), 0)); + bt_assert( ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345678), 0)); + + bt_assert( ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345678), 32)); + bt_assert(!ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345679), 32)); + bt_assert(!ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x92345678), 32)); + + return 1; +} + +static int +t_ip6_prefix_equal(void) +{ + bt_assert( ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x1234ffff, 0xfefefefe, 0xdcdcdcdc), + 48)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x1234ffff, 0xfefefefe, 0xdcdcdcdc), + 49)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20020db8, 0x12345678, 0xfefefefe, 0xdcdcdcdc), + 48)); + + bt_assert( ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x12345678, 0xfefefefe, 0xdcdcdcdc), + 64)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x1234567e, 0xfefefefe, 0xdcdcdcdc), + 64)); + + bt_assert( ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20002020), + ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + 106)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20002020), + ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + 107)); + + bt_assert( ip6_prefix_equal(ip6_build(0xfeef0db8, 0x87654321, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x12345678, 0xfefefefe, 0xdcdcdcdc), + 0)); + + bt_assert( ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + 128)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202021), + 128)); + + return 1; +} + int main(int argc, char *argv[]) { @@ -176,6 +240,8 @@ main(int argc, char *argv[]) bt_test_suite(t_ip6_pton, "Converting IPv6 string to ip6_addr struct"); bt_test_suite(t_ip4_ntop, "Converting ip4_addr struct to IPv4 string"); bt_test_suite(t_ip6_ntop, "Converting ip6_addr struct to IPv6 string"); + bt_test_suite(t_ip4_prefix_equal, "Testing ip4_prefix_equal()"); + bt_test_suite(t_ip6_prefix_equal, "Testing ip6_prefix_equal()"); return bt_exit_value(); } diff --git a/lib/lists.c b/lib/lists.c index dc2e4cbb..8f95c7c2 100644 --- a/lib/lists.c +++ b/lib/lists.c @@ -110,15 +110,6 @@ add_head(list *l, node *n) l->head = n; } -LIST_INLINE void -self_link(node *n) -{ - ASSUME(n->prev == NULL); - ASSUME(n->next == NULL); - - n->prev = n->next = n; -} - /** * insert_node - insert a node to a list * @n: a new list node diff --git a/lib/lists.h b/lib/lists.h index dc49ec8a..86ff59c9 100644 --- a/lib/lists.h +++ b/lib/lists.h @@ -42,6 +42,7 @@ typedef union list { /* In fact two overlayed nodes */ }; } list; +#define STATIC_LIST_INIT(name) name = { .head = &name.tail_node, .tail = &name.head_node, .null = NULL } #define NODE (node *) #define HEAD(list) ((void *)((list).head)) @@ -90,7 +91,6 @@ enlisted(node *n) #define LIST_INLINE void add_tail(list *, node *); void add_head(list *, node *); -void self_link(node *); void rem_node(node *); void add_tail_list(list *, list *); void init_list(list *); diff --git a/lib/locking.h b/lib/locking.h index 1a8bdcd4..498afdc8 100644 --- a/lib/locking.h +++ b/lib/locking.h @@ -15,10 +15,9 @@ struct domain_generic; struct lock_order { struct domain_generic *the_bird; struct domain_generic *proto; + struct domain_generic *service; struct domain_generic *rtable; struct domain_generic *attrs; - struct domain_generic *cork; - struct domain_generic *event; struct domain_generic *resource; }; diff --git a/lib/mempool.c b/lib/mempool.c index 8f300b81..33eaec86 100644 --- a/lib/mempool.c +++ b/lib/mempool.c @@ -27,26 +27,24 @@ struct lp_chunk { struct lp_chunk *next; - uint size; uintptr_t data_align[0]; byte data[0]; }; -const int lp_chunk_size = sizeof(struct lp_chunk); +#define LP_DATA_SIZE (page_size - OFFSETOF(struct lp_chunk, data)) struct linpool { resource r; byte *ptr, *end; - pool *p; struct lp_chunk *first, *current; /* Normal (reusable) chunks */ struct lp_chunk *first_large; /* Large chunks */ - uint chunk_size, threshold, total:31, use_pages:1, total_large; + uint total, total_large; }; static void lp_free(resource *); static void lp_dump(resource *); static resource *lp_lookup(resource *, unsigned long); -static size_t lp_memsize(resource *r); +static struct resmem lp_memsize(resource *r); static struct resclass lp_class = { "LinPool", @@ -60,26 +58,14 @@ static struct resclass lp_class = { /** * lp_new - create a new linear memory pool * @p: pool - * @blk: block size * * lp_new() creates a new linear memory pool resource inside the pool @p. - * The linear pool consists of a list of memory chunks of size at least - * @blk. + * The linear pool consists of a list of memory chunks of page size. */ linpool -*lp_new(pool *p, uint blk) +*lp_new(pool *p) { - linpool *m = ralloc(p, &lp_class); - m->p = p; - if (!blk) - { - m->use_pages = 1; - blk = page_size - lp_chunk_size; - } - - m->chunk_size = blk; - m->threshold = 3*blk/4; - return m; + return ralloc(p, &lp_class); } /** @@ -110,14 +96,13 @@ lp_alloc(linpool *m, uint size) else { struct lp_chunk *c; - if (size >= m->threshold) + if (size > LP_DATA_SIZE) { /* Too large => allocate large chunk */ c = xmalloc(sizeof(struct lp_chunk) + size); m->total_large += size; c->next = m->first_large; m->first_large = c; - c->size = size; } else { @@ -129,14 +114,10 @@ lp_alloc(linpool *m, uint size) else { /* Need to allocate a new chunk */ - if (m->use_pages) - c = alloc_page(m->p); - else - c = xmalloc(sizeof(struct lp_chunk) + m->chunk_size); + c = alloc_page(); - m->total += m->chunk_size; + m->total += LP_DATA_SIZE; c->next = NULL; - c->size = m->chunk_size; if (m->current) m->current->next = c; @@ -145,7 +126,7 @@ lp_alloc(linpool *m, uint size) } m->current = c; m->ptr = c->data + size; - m->end = c->data + m->chunk_size; + m->end = c->data + LP_DATA_SIZE; } return c->data; } @@ -207,7 +188,7 @@ lp_flush(linpool *m) /* Move ptr to the first chunk and free all large chunks */ m->current = c = m->first; m->ptr = c ? c->data : NULL; - m->end = c ? c->data + m->chunk_size : NULL; + m->end = c ? c->data + LP_DATA_SIZE : NULL; while (c = m->first_large) { @@ -230,6 +211,7 @@ lp_save(linpool *m, lp_state *p) { p->current = m->current; p->large = m->first_large; + p->total_large = m->total_large; p->ptr = m->ptr; } @@ -251,12 +233,12 @@ lp_restore(linpool *m, lp_state *p) /* Move ptr to the saved pos and free all newer large chunks */ m->current = c = p->current; m->ptr = p->ptr; - m->end = c ? c->data + m->chunk_size : NULL; + m->end = c ? c->data + LP_DATA_SIZE : NULL; + m->total_large = p->total_large; while ((c = m->first_large) && (c != p->large)) { m->first_large = c->next; - m->total_large -= c->size; xfree(c); } } @@ -270,10 +252,7 @@ lp_free(resource *r) for(d=m->first; d; d = c) { c = d->next; - if (m->use_pages) - free_page(m->p, d); - else - xfree(d); + free_page(d); } for(d=m->first_large; d; d = c) { @@ -293,30 +272,33 @@ lp_dump(resource *r) ; for(cntl=0, c=m->first_large; c; c=c->next, cntl++) ; - debug("(chunk=%d threshold=%d count=%d+%d total=%d+%d)\n", - m->chunk_size, - m->threshold, + debug("(count=%d+%d total=%d+%d)\n", cnt, cntl, m->total, m->total_large); } -static size_t +static struct resmem lp_memsize(resource *r) { linpool *m = (linpool *) r; - struct lp_chunk *c; - int cnt = 0; + struct resmem sz = { + .overhead = sizeof(struct linpool) + ALLOC_OVERHEAD, + .effective = m->total_large, + }; - for(c=m->first; c; c=c->next) - cnt++; - for(c=m->first_large; c; c=c->next) - cnt++; + for (struct lp_chunk *c = m->first_large; c; c = c->next) + sz.overhead += sizeof(struct lp_chunk) + ALLOC_OVERHEAD; - return ALLOC_OVERHEAD + sizeof(struct linpool) + - cnt * (ALLOC_OVERHEAD + sizeof(struct lp_chunk)) + - m->total + m->total_large; + uint regular = 0; + for (struct lp_chunk *c = m->first; c; c = c->next) + regular++; + + sz.effective += LP_DATA_SIZE * regular; + sz.overhead += (sizeof(struct lp_chunk) + ALLOC_OVERHEAD) * regular; + + return sz; } @@ -327,10 +309,7 @@ lp_lookup(resource *r, unsigned long a) struct lp_chunk *c; for(c=m->first; c; c=c->next) - if ((unsigned long) c->data <= a && (unsigned long) c->data + c->size > a) - return r; - for(c=m->first_large; c; c=c->next) - if ((unsigned long) c->data <= a && (unsigned long) c->data + c->size > a) + if ((unsigned long) c->data <= a && (unsigned long) c->data + LP_DATA_SIZE > a) return r; return NULL; } @@ -38,6 +38,7 @@ #define NB_IP (NB_IP4 | NB_IP6) #define NB_VPN (NB_VPN4 | NB_VPN6) +#define NB_ROA (NB_ROA4 | NB_ROA6) #define NB_FLOW (NB_FLOW4 | NB_FLOW6) #define NB_DEST (NB_IP | NB_IP6_SADR | NB_VPN | NB_MPLS) #define NB_ANY 0xffffffff diff --git a/lib/printf.c b/lib/printf.c index 236df427..424d545f 100644 --- a/lib/printf.c +++ b/lib/printf.c @@ -568,3 +568,51 @@ buffer_puts(buffer *buf, const char *str) buf->pos = (bp < be) ? bp : buf->end; } + +#define POOL_PRINTF_MAXBUF 1024 + +char *mb_vsprintf(pool *p, const char *fmt, va_list args) +{ + char buf[POOL_PRINTF_MAXBUF]; + int count = bvsnprintf(buf, POOL_PRINTF_MAXBUF, fmt, args); + + if (count < 0) + bug("Attempted to mb_vsprintf() a too long string"); + + char *out = mb_alloc(p, count + 1); + memcpy(out, buf, count + 1); + return out; +} + +char *mb_sprintf(pool *p, const char *fmt, ...) +{ + va_list args; + char *out; + va_start(args, fmt); + out = mb_vsprintf(p, fmt, args); + va_end(args); + return out; +} + +char *lp_vsprintf(linpool *p, const char *fmt, va_list args) +{ + char buf[POOL_PRINTF_MAXBUF]; + int count = bvsnprintf(buf, POOL_PRINTF_MAXBUF, fmt, args); + + if (count < 0) + bug("Attempted to mb_vsprintf() a too long string"); + + char *out = lp_alloc(p, count + 1); + memcpy(out, buf, count + 1); + return out; +} + +char *lp_sprintf(linpool *p, const char *fmt, ...) +{ + va_list args; + char *out; + va_start(args, fmt); + out = lp_vsprintf(p, fmt, args); + va_end(args); + return out; +} @@ -13,15 +13,15 @@ */ #include "lib/rcu.h" -#include "lib/coro.h" +#include "lib/io-loop.h" #include "lib/locking.h" _Atomic uint rcu_gp_ctl = RCU_NEST_CNT; -_Thread_local struct rcu_coro *this_rcu_coro = NULL; +_Thread_local struct rcu_birdloop *this_rcu_birdloop = NULL; -static list rcu_coro_list; +static list rcu_birdloop_list; -static struct rcu_coro main_rcu_coro; +static struct rcu_birdloop main_rcu_birdloop; DEFINE_DOMAIN(resource); static DOMAIN(resource) rcu_domain; @@ -37,10 +37,10 @@ static void update_counter_and_wait(void) { atomic_fetch_xor(&rcu_gp_ctl, RCU_GP_PHASE); - struct rcu_coro *rc; - WALK_LIST(rc, rcu_coro_list) + struct rcu_birdloop *rc; + WALK_LIST(rc, rcu_birdloop_list) while (rcu_gp_ongoing(&rc->ctl)) - coro_yield(); + birdloop_yield(); } void @@ -53,19 +53,19 @@ synchronize_rcu(void) } void -rcu_coro_start(struct rcu_coro *rc) +rcu_birdloop_start(struct rcu_birdloop *rc) { LOCK_DOMAIN(resource, rcu_domain); - add_tail(&rcu_coro_list, &rc->n); - this_rcu_coro = rc; + add_tail(&rcu_birdloop_list, &rc->n); + this_rcu_birdloop = rc; UNLOCK_DOMAIN(resource, rcu_domain); } void -rcu_coro_stop(struct rcu_coro *rc) +rcu_birdloop_stop(struct rcu_birdloop *rc) { LOCK_DOMAIN(resource, rcu_domain); - this_rcu_coro = NULL; + this_rcu_birdloop = NULL; rem_node(&rc->n); UNLOCK_DOMAIN(resource, rcu_domain); } @@ -74,6 +74,6 @@ void rcu_init(void) { rcu_domain = DOMAIN_NEW(resource, "Read-Copy-Update"); - init_list(&rcu_coro_list); - rcu_coro_start(&main_rcu_coro); + init_list(&rcu_birdloop_list); + rcu_birdloop_start(&main_rcu_birdloop); } @@ -21,33 +21,33 @@ extern _Atomic uint rcu_gp_ctl; -struct rcu_coro { +struct rcu_birdloop { node n; _Atomic uint ctl; }; -extern _Thread_local struct rcu_coro *this_rcu_coro; +extern _Thread_local struct rcu_birdloop *this_rcu_birdloop; static inline void rcu_read_lock(void) { - uint cmp = atomic_load_explicit(&this_rcu_coro->ctl, memory_order_acquire); + uint cmp = atomic_load_explicit(&this_rcu_birdloop->ctl, memory_order_acquire); if (cmp & RCU_NEST_MASK) - atomic_store_explicit(&this_rcu_coro->ctl, cmp + RCU_NEST_CNT, memory_order_relaxed); + atomic_store_explicit(&this_rcu_birdloop->ctl, cmp + RCU_NEST_CNT, memory_order_relaxed); else - atomic_store(&this_rcu_coro->ctl, atomic_load_explicit(&rcu_gp_ctl, memory_order_acquire)); + atomic_store(&this_rcu_birdloop->ctl, atomic_load_explicit(&rcu_gp_ctl, memory_order_acquire)); } static inline void rcu_read_unlock(void) { - atomic_fetch_sub(&this_rcu_coro->ctl, RCU_NEST_CNT); + atomic_fetch_sub(&this_rcu_birdloop->ctl, RCU_NEST_CNT); } void synchronize_rcu(void); -/* Registering and unregistering a coroutine. To be called from coroutine implementation */ -void rcu_coro_start(struct rcu_coro *); -void rcu_coro_stop(struct rcu_coro *); +/* Registering and unregistering a birdloop. To be called from birdloop implementation */ +void rcu_birdloop_start(struct rcu_birdloop *); +void rcu_birdloop_stop(struct rcu_birdloop *); /* Run this from resource init */ void rcu_init(void); diff --git a/lib/resource.c b/lib/resource.c index 0651406f..2e367132 100644 --- a/lib/resource.c +++ b/lib/resource.c @@ -2,6 +2,7 @@ * BIRD Resource Manager * * (c) 1998--2000 Martin Mares <mj@ucw.cz> + * (c) 2021 Maria Matejka <mq@jmq.cz> * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -29,25 +30,10 @@ * is freed upon shutdown of the module. */ -struct pool { - resource r; - list inside; - struct pool_pages *pages; - const char *name; -}; - -struct pool_pages { - uint free; - uint used; - void *ptr[0]; -}; - -#define POOL_PAGES_MAX ((page_size - sizeof(struct pool_pages)) / sizeof (void *)) - static void pool_dump(resource *); static void pool_free(resource *); static resource *pool_lookup(resource *, unsigned long); -static size_t pool_memsize(resource *P); +static struct resmem pool_memsize(resource *P); static struct resclass pool_class = { "Pool", @@ -60,9 +46,6 @@ static struct resclass pool_class = { pool root_pool; -void *alloc_sys_page(void); -int free_sys_page(void *); - static int indent; /** @@ -82,6 +65,20 @@ rp_new(pool *p, const char *name) return z; } +pool * +rp_newf(pool *p, const char *fmt, ...) +{ + pool *z = rp_new(p, NULL); + + va_list args; + va_start(args, fmt); + z->name = mb_vsprintf(p, fmt, args); + va_end(args); + + return z; +} + + static void pool_free(resource *P) { @@ -95,16 +92,6 @@ pool_free(resource *P) xfree(r); r = rr; } - - if (p->pages) - { - ASSERT_DIE(!p->pages->used); - - for (uint i = 0; i < p->pages->free; i++) - free_sys_page(p->pages->ptr[i]); - - free_sys_page(p->pages); - } } static void @@ -120,18 +107,22 @@ pool_dump(resource *P) indent -= 3; } -static size_t +static struct resmem pool_memsize(resource *P) { pool *p = (pool *) P; resource *r; - size_t sum = sizeof(pool) + ALLOC_OVERHEAD; + struct resmem sum = { + .effective = 0, + .overhead = sizeof(pool) + ALLOC_OVERHEAD, + }; WALK_LIST(r, p->inside) - sum += rmemsize(r); - - if (p->pages) - sum += page_size * (p->pages->used + p->pages->free + 1); + { + struct resmem add = rmemsize(r); + sum.effective += add.effective; + sum.overhead += add.overhead; + } return sum; } @@ -219,14 +210,17 @@ rdump(void *res) debug("NULL\n"); } -size_t +struct resmem rmemsize(void *res) { resource *r = res; if (!r) - return 0; + return (struct resmem) {}; if (!r->class->memsize) - return r->class->size + ALLOC_OVERHEAD; + return (struct resmem) { + .effective = r->class->size - sizeof(resource), + .overhead = ALLOC_OVERHEAD + sizeof(resource), + }; return r->class->memsize(r); } @@ -286,12 +280,33 @@ void resource_init(void) { rcu_init(); + resource_sys_init(); root_pool.r.class = &pool_class; root_pool.name = "Root"; init_list(&root_pool.inside); + tmp_init(&root_pool); +} + +_Thread_local struct tmp_resources tmp_res; + +void +tmp_init(pool *p) +{ + tmp_res.lp = lp_new_default(p); + tmp_res.parent = p; + tmp_res.pool = rp_new(p, "TMP"); +} + +void +tmp_flush(void) +{ + lp_flush(tmp_linpool); + rfree(tmp_res.pool); + tmp_res.pool = rp_new(tmp_res.parent, "TMP"); } + /** * DOC: Memory blocks * @@ -333,11 +348,14 @@ mbl_lookup(resource *r, unsigned long a) return NULL; } -static size_t +static struct resmem mbl_memsize(resource *r) { struct mblock *m = (struct mblock *) r; - return ALLOC_OVERHEAD + sizeof(struct mblock) + m->size; + return (struct resmem) { + .effective = m->size, + .overhead = ALLOC_OVERHEAD + sizeof(struct mblock), + }; } static struct resclass mb_class = { @@ -421,21 +439,6 @@ mb_realloc(void *m, unsigned size) return b->data; } -/** - * mb_move - move a memory block - * @m: memory block - * @p: target pool - * - * mb_move() moves the given memory block to another pool in the same way - * as rmove() moves a plain resource. - */ -void -mb_move(void *m, pool *p) -{ - struct mblock *b = SKIP_BACK(struct mblock, data, m); - rmove(b, p); -} - /** * mb_free - free a memory block @@ -453,48 +456,6 @@ mb_free(void *m) rfree(b); } -void * -alloc_page(pool *p) -{ - if (!p->pages) - { - p->pages = alloc_sys_page(); - p->pages->free = 0; - p->pages->used = 1; - } - else - p->pages->used++; - - if (p->pages->free) - { - void *ptr = p->pages->ptr[--p->pages->free]; - bzero(ptr, page_size); - return ptr; - } - else - return alloc_sys_page(); -} - -void -free_page(pool *p, void *ptr) -{ - ASSERT_DIE(p->pages); - p->pages->used--; - - ASSERT_DIE(p->pages->free <= POOL_PAGES_MAX); - - if (p->pages->free == POOL_PAGES_MAX) - { - const unsigned long keep = POOL_PAGES_MAX / 4; - - for (uint i = keep; i < p->pages->free; i++) - free_sys_page(p->pages->ptr[i]); - - p->pages->free = keep; - } - - p->pages->ptr[p->pages->free++] = ptr; -} #define STEP_UP(x) ((x) + (x)/2 + 4) diff --git a/lib/resource.h b/lib/resource.h index 26030aea..5d9e2165 100644 --- a/lib/resource.h +++ b/lib/resource.h @@ -2,6 +2,7 @@ * BIRD Resource Manager * * (c) 1998--1999 Martin Mares <mj@ucw.cz> + * (c) 2021 Maria Matejka <mq@jmq.cz> * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -11,6 +12,11 @@ #include "lib/lists.h" +struct resmem { + size_t effective; /* Memory actually used for data storage */ + size_t overhead; /* Overhead memory imposed by allocator strategies */ +}; + /* Resource */ typedef struct resource { @@ -26,21 +32,27 @@ struct resclass { void (*free)(resource *); /* Freeing function */ void (*dump)(resource *); /* Dump to debug output */ resource *(*lookup)(resource *, unsigned long); /* Look up address (only for debugging) */ - size_t (*memsize)(resource *); /* Return size of memory used by the resource, may be NULL */ + struct resmem (*memsize)(resource *); /* Return size of memory used by the resource, may be NULL */ }; /* Estimate of system allocator overhead per item, for memory consumtion stats */ -#define ALLOC_OVERHEAD 8 +#define ALLOC_OVERHEAD 16 /* Generic resource manipulation */ -typedef struct pool pool; +typedef struct pool { + resource r; + list inside; + const char *name; +} pool; + void resource_init(void); pool *rp_new(pool *, const char *); /* Create new pool */ +pool *rp_newf(pool *, const char *, ...); /* Create a new pool with a formatted string as its name */ void rfree(void *); /* Free single resource */ void rdump(void *); /* Dump to debug output */ -size_t rmemsize(void *res); /* Return size of memory used by the resource */ +struct resmem rmemsize(void *res); /* Return size of memory used by the resource */ void rlookup(unsigned long); /* Look up address (only for debugging) */ void rmove(void *, pool *); /* Move to a different pool */ @@ -53,7 +65,6 @@ extern pool root_pool; void *mb_alloc(pool *, unsigned size); void *mb_allocz(pool *, unsigned size); void *mb_realloc(void *m, unsigned size); -void mb_move(void *, pool *); void mb_free(void *); /* Memory pools with linear allocation */ @@ -63,9 +74,10 @@ typedef struct linpool linpool; typedef struct lp_state { void *current, *large; byte *ptr; + uint total_large; } lp_state; -linpool *lp_new(pool *, unsigned blk); +linpool *lp_new(pool *); void *lp_alloc(linpool *, unsigned size); /* Aligned */ void *lp_allocu(linpool *, unsigned size); /* Unaligned */ void *lp_allocz(linpool *, unsigned size); /* With clear */ @@ -73,10 +85,23 @@ void lp_flush(linpool *); /* Free everything, but leave linpool */ void lp_save(linpool *m, lp_state *p); /* Save state */ void lp_restore(linpool *m, lp_state *p); /* Restore state */ -extern const int lp_chunk_size; -#define LP_GAS 1024 -#define LP_GOOD_SIZE(x) (((x + LP_GAS - 1) & (~(LP_GAS - 1))) - lp_chunk_size) -#define lp_new_default(p) lp_new(p, 0) +struct tmp_resources { + pool *pool, *parent; + linpool *lp; +}; + +extern _Thread_local struct tmp_resources tmp_res; + +#define tmp_linpool tmp_res.lp +#define tmp_alloc(sz) lp_alloc(tmp_linpool, sz) +#define tmp_allocu(sz) lp_allocu(tmp_linpool, sz) +#define tmp_allocz(sz) lp_allocz(tmp_linpool, sz) + +void tmp_init(pool *p); +void tmp_flush(void); + + +#define lp_new_default lp_new /* Slabs */ @@ -85,7 +110,7 @@ typedef struct slab slab; slab *sl_new(pool *, unsigned size); void *sl_alloc(slab *); void *sl_allocz(slab *); -void sl_free(slab *, void *); +void sl_free(void *); /* * Low-level memory allocation functions, please don't use @@ -94,12 +119,16 @@ void sl_free(slab *, void *); void buffer_realloc(void **buf, unsigned *size, unsigned need, unsigned item_size); +/* Allocator of whole pages; for use in slabs and other high-level allocators. */ +#define PAGE_HEAD(x) ((void *) (((uintptr_t) (x)) & ~(page_size-1))) extern long page_size; +extern _Atomic int pages_kept; +extern _Atomic int pages_kept_locally; +void *alloc_page(void); +void free_page(void *); +void flush_local_pages(void); -/* Allocator of whole pages; for use in slabs and other high-level allocators. */ -void *alloc_page(pool *); -void free_page(pool *, void *); -#define PAGE_HEAD(x) ((void *) (((intptr_t) (x)) & ~(page_size-1))) +void resource_sys_init(void); #ifdef HAVE_LIBDMALLOC /* diff --git a/lib/route.h b/lib/route.h new file mode 100644 index 00000000..eae251e7 --- /dev/null +++ b/lib/route.h @@ -0,0 +1,549 @@ +/* + * BIRD Internet Routing Daemon -- Routing data structures + * + * (c) 1998--2000 Martin Mares <mj@ucw.cz> + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LIB_ROUTE_H_ +#define _BIRD_LIB_ROUTE_H_ + +#undef RT_SOURCE_DEBUG + +#include "lib/type.h" +#include "lib/rcu.h" +#include "lib/hash.h" +#include "lib/event.h" + +struct network; +struct proto; +struct cli; +struct rtable_private; + +typedef struct rte { + struct ea_list *attrs; /* Attributes of this route */ + const net_addr *net; /* Network this RTE belongs to */ + struct rte_src *src; /* Route source that created the route */ + struct rt_import_hook *sender; /* Import hook used to send the route to the routing table */ + btime lastmod; /* Last modified (set by table) */ + u32 id; /* Table specific route id */ + byte flags; /* Table-specific flags */ + byte pflags; /* Protocol-specific flags */ + u8 generation; /* If this route import is based on other previously exported route, + this value should be 1 + MAX(generation of the parent routes). + Otherwise the route is independent and this value is zero. */ + u8 stale_cycle; /* Auxiliary value for route refresh */ +} rte; + +#define REF_FILTERED 2 /* Route is rejected by import filter */ +#define REF_PENDING 32 /* Route has not propagated completely yet */ + +/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ +static inline int rte_is_valid(rte *r) { return r && !(r->flags & REF_FILTERED); } + +/* Route just has REF_FILTERED flag */ +static inline int rte_is_filtered(rte *r) { return !!(r->flags & REF_FILTERED); } + +struct rte_src { + struct rte_src *next; /* Hash chain */ + struct rte_owner *owner; /* Route source owner */ + u32 private_id; /* Private ID, assigned by the protocol */ + u32 global_id; /* Globally unique ID of the source */ + _Atomic u64 uc; /* Use count */ +}; + +struct rte_owner_class { + void (*get_route_info)(struct rte *, byte *buf); /* Get route information (for `show route' command) */ + int (*rte_better)(struct rte *, struct rte *); + int (*rte_mergable)(struct rte *, struct rte *); + u32 (*rte_igp_metric)(const rte *); +}; + +struct rte_owner { + struct rte_owner_class *class; + int (*rte_recalculate)(struct rtable_private *, struct network *, struct rte *, struct rte *, struct rte *); + HASH(struct rte_src) hash; + const char *name; + u32 hash_key; + u32 uc; + event_list *list; + event *prune; + event *stop; +}; + +DEFINE_DOMAIN(attrs); +extern DOMAIN(attrs) attrs_domain; + +#define RTA_LOCK LOCK_DOMAIN(attrs, attrs_domain) +#define RTA_UNLOCK UNLOCK_DOMAIN(attrs, attrs_domain) + +#define RTE_SRC_PU_SHIFT 44 +#define RTE_SRC_IN_PROGRESS (1ULL << RTE_SRC_PU_SHIFT) + +/* Get a route source. This also locks the source, therefore the caller has to + * unlock the source after the route has been propagated. */ +struct rte_src *rt_get_source_o(struct rte_owner *o, u32 id); +#define rt_get_source(p, id) rt_get_source_o(&(p)->sources, (id)) + +struct rte_src *rt_find_source_global(u32 id); + +#ifdef RT_SOURCE_DEBUG +#define rt_lock_source _rt_lock_source_internal +#define rt_unlock_source _rt_unlock_source_internal +#endif + +static inline void rt_lock_source(struct rte_src *src) +{ + /* Locking a source is trivial; somebody already holds it so we just increase + * the use count. Nothing can be freed underneath our hands. */ + u64 uc = atomic_fetch_add_explicit(&src->uc, 1, memory_order_acq_rel); + ASSERT_DIE(uc > 0); +} + +static inline void rt_unlock_source(struct rte_src *src) +{ + /* Unlocking is tricky. We do it lockless so at the same time, the prune + * event may be running, therefore if the unlock gets us to zero, it must be + * the last thing in this routine, otherwise the prune routine may find the + * source's usecount zeroed, freeing it prematurely. + * + * The usecount is split into two parts: + * the top 20 bits are an in-progress indicator + * the bottom 44 bits keep the actual usecount. + * + * Therefore at most 1 million of writers can simultaneously unlock the same + * source, while at most ~17T different routes can reference it. Both limits + * are insanely high from the 2022 point of view. Let's suppose that when 17T + * routes or 1M writers get real, we get also 128bit atomic variables in the + * C norm. */ + + /* First, we push the in-progress indicator */ + u64 uc = atomic_fetch_add_explicit(&src->uc, RTE_SRC_IN_PROGRESS, memory_order_acq_rel); + + /* Then we split the indicator to its parts. Remember, we got the value before the operation happened. */ + u64 pending = (uc >> RTE_SRC_PU_SHIFT) + 1; + uc &= RTE_SRC_IN_PROGRESS - 1; + + /* We per-use the RCU critical section indicator to make the prune event wait + * until we finish here in the rare case we get preempted. */ + rcu_read_lock(); + + /* Obviously, there can't be more pending unlocks than the usecount itself */ + if (uc == pending) + /* If we're the last unlocker, schedule the owner's prune event */ + ev_send(src->owner->list, src->owner->prune); + else + ASSERT_DIE(uc > pending); + + /* And now, finally, simultaneously pop the in-progress indicator and the + * usecount, possibly allowing the source pruning routine to free this structure */ + atomic_fetch_sub_explicit(&src->uc, RTE_SRC_IN_PROGRESS + 1, memory_order_acq_rel); + + /* ... and to reduce the load a bit, the source pruning routine will better wait for + * RCU synchronization instead of a busy loop. */ + rcu_read_unlock(); +} + +#ifdef RT_SOURCE_DEBUG +#undef rt_lock_source +#undef rt_unlock_source + +#define rt_lock_source(x) ( log(L_INFO "Lock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_lock_source_internal(x) ) +#define rt_unlock_source(x) ( log(L_INFO "Unlock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_unlock_source_internal(x) ) +#endif + +void rt_init_sources(struct rte_owner *, const char *name, event_list *list); +void rt_destroy_sources(struct rte_owner *, event *); + +/* + * Route Attributes + * + * Beware: All standard BGP attributes must be represented here instead + * of making them local to the route. This is needed to ensure proper + * construction of BGP route attribute lists. + */ + +/* Nexthop structure */ +struct nexthop { + ip_addr gw; /* Next hop */ + struct iface *iface; /* Outgoing interface */ + byte flags; + byte weight; + byte labels; /* Number of all labels */ + u32 label[0]; +}; + +/* For packing one into eattrs */ +struct nexthop_adata { + struct adata ad; + /* There is either a set of nexthops or a special destination (RTD_*) */ + union { + struct nexthop nh; + uint dest; + }; +}; + +#define NEXTHOP_DEST_SIZE (OFFSETOF(struct nexthop_adata, dest) + sizeof(uint) - OFFSETOF(struct adata, data)) +#define NEXTHOP_DEST_LITERAL(x) ((struct nexthop_adata) { \ + .ad.length = NEXTHOP_DEST_SIZE, .dest = (x), }) + +#define RNF_ONLINK 0x1 /* Gateway is onlink regardless of IP ranges */ + + +#define RTS_STATIC 1 /* Normal static route */ +#define RTS_INHERIT 2 /* Route inherited from kernel */ +#define RTS_DEVICE 3 /* Device route */ +#define RTS_STATIC_DEVICE 4 /* Static device route */ +#define RTS_REDIRECT 5 /* Learned via redirect */ +#define RTS_RIP 6 /* RIP route */ +#define RTS_OSPF 7 /* OSPF route */ +#define RTS_OSPF_IA 8 /* OSPF inter-area route */ +#define RTS_OSPF_EXT1 9 /* OSPF external route type 1 */ +#define RTS_OSPF_EXT2 10 /* OSPF external route type 2 */ +#define RTS_BGP 11 /* BGP route */ +#define RTS_PIPE 12 /* Inter-table wormhole */ +#define RTS_BABEL 13 /* Babel route */ +#define RTS_RPKI 14 /* Route Origin Authorization */ +#define RTS_PERF 15 /* Perf checker */ +#define RTS_MAX 16 + +#define RTD_NONE 0 /* Undefined next hop */ +#define RTD_UNICAST 1 /* A standard next hop */ +#define RTD_BLACKHOLE 2 /* Silently drop packets */ +#define RTD_UNREACHABLE 3 /* Reject as unreachable */ +#define RTD_PROHIBIT 4 /* Administratively prohibited */ +#define RTD_MAX 5 + +extern const char * rta_dest_names[RTD_MAX]; + +static inline const char *rta_dest_name(uint n) +{ return (n < RTD_MAX) ? rta_dest_names[n] : "???"; } + + +/* + * Extended Route Attributes + */ + +typedef struct eattr { + word id; /* EA_CODE(PROTOCOL_..., protocol-dependent ID) */ + byte flags; /* Protocol-dependent flags */ + byte type; /* Attribute type */ + byte rfu:5; + byte originated:1; /* The attribute has originated locally */ + byte fresh:1; /* An uncached attribute (e.g. modified in export filter) */ + byte undef:1; /* Explicitly undefined */ + + PADDING(unused, 3, 3); + + union bval u; +} eattr; + + +#define EA_CODE_MASK 0xffff +#define EA_ALLOW_UNDEF 0x10000 /* ea_find: allow EAF_TYPE_UNDEF */ +#define EA_BIT(n) ((n) << 24) /* Used in bitfield accessors */ +#define EA_BIT_GET(ea) ((ea) >> 24) + +typedef struct ea_list { + struct ea_list *next; /* In case we have an override list */ + byte flags; /* Flags: EALF_... */ + byte rfu; + word count; /* Number of attributes */ + eattr attrs[0]; /* Attribute definitions themselves */ +} ea_list; + +struct ea_storage { + struct ea_storage *next_hash; /* Next in hash chain */ + struct ea_storage **pprev_hash; /* Previous in hash chain */ + _Atomic u32 uc; /* Use count */ + u32 hash_key; /* List hash */ + ea_list l[0]; /* The list itself */ +}; + +#define EALF_SORTED 1 /* Attributes are sorted by code */ +#define EALF_BISECT 2 /* Use interval bisection for searching */ +#define EALF_CACHED 4 /* List is cached */ + +struct ea_class { +#define EA_CLASS_INSIDE \ + const char *name; /* Name (both print and filter) */ \ + struct symbol *sym; /* Symbol to export to configs */ \ + uint id; /* Autoassigned attribute ID */ \ + uint uc; /* Reference count */ \ + btype type; /* Data type ID */ \ + uint readonly:1; /* This attribute can't be changed by filters */ \ + uint conf:1; /* Requested by config */ \ + uint hidden:1; /* Technical attribute, do not show, do not expose to filters */ \ + void (*format)(const eattr *ea, byte *buf, uint size); \ + void (*stored)(const eattr *ea); /* When stored into global hash */ \ + void (*freed)(const eattr *ea); /* When released from global hash */ \ + + EA_CLASS_INSIDE; +}; + +struct ea_class_ref { + resource r; + struct ea_class *class; +}; + +void ea_register_init(struct ea_class *); +struct ea_class_ref *ea_register_alloc(pool *, struct ea_class); + +#define EA_REGISTER_ALL_HELPER(x) ea_register_init(x); +#define EA_REGISTER_ALL(...) MACRO_FOREACH(EA_REGISTER_ALL_HELPER, __VA_ARGS__) + +struct ea_class *ea_class_find_by_id(uint id); +struct ea_class *ea_class_find_by_name(const char *name); +static inline struct ea_class *ea_class_self(struct ea_class *self) { return self; } +#define ea_class_find(_arg) _Generic((_arg), \ + uint: ea_class_find_by_id, \ + word: ea_class_find_by_id, \ + char *: ea_class_find_by_name, \ + const char *: ea_class_find_by_name, \ + struct ea_class *: ea_class_self)(_arg) + +struct ea_walk_state { + ea_list *eattrs; /* Ccurrent ea_list, initially set by caller */ + eattr *ea; /* Current eattr, initially NULL */ + u32 visited[4]; /* Bitfield, limiting max to 128 */ +}; + +#define ea_find(_l, _arg) _Generic((_arg), uint: ea_find_by_id, struct ea_class *: ea_find_by_class, char *: ea_find_by_name)(_l, _arg) +eattr *ea_find_by_id(ea_list *, unsigned ea); +static inline eattr *ea_find_by_class(ea_list *l, const struct ea_class *def) +{ return ea_find_by_id(l, def->id); } +static inline eattr *ea_find_by_name(ea_list *l, const char *name) +{ + const struct ea_class *def = ea_class_find_by_name(name); + return def ? ea_find_by_class(l, def) : NULL; +} + +#define ea_get_int(_l, _ident, _def) ({ \ + struct ea_class *cls = ea_class_find((_ident)); \ + ASSERT_DIE(cls->type & EAF_EMBEDDED); \ + const eattr *ea = ea_find((_l), cls->id); \ + (ea ? ea->u.data : (_def)); \ + }) + +#define ea_get_ip(_l, _ident, _def) ({ \ + struct ea_class *cls = ea_class_find((_ident)); \ + ASSERT_DIE(cls->type == T_IP); \ + const eattr *ea = ea_find((_l), cls->id); \ + (ea ? *((const ip_addr *) ea->u.ptr->data) : (_def)); \ + }) + +eattr *ea_walk(struct ea_walk_state *s, uint id, uint max); +void ea_dump(ea_list *); +int ea_same(ea_list *x, ea_list *y); /* Test whether two ea_lists are identical */ +uint ea_hash(ea_list *e); /* Calculate 16-bit hash value */ +ea_list *ea_append(ea_list *to, ea_list *what); +void ea_format_bitfield(const struct eattr *a, byte *buf, int bufsize, const char **names, int min, int max); + +/* Normalize ea_list; allocates the result from tmp_linpool */ +ea_list *ea_normalize(ea_list *e, int overlay); + +uint ea_list_size(ea_list *); +void ea_list_copy(ea_list *dest, ea_list *src, uint size); + +#define EA_LOCAL_LIST(N) struct { ea_list l; eattr a[N]; } + +#define EA_LITERAL_EMBEDDED(_class, _flags, _val) ({ \ + btype _type = (_class)->type; \ + ASSERT_DIE(_type & EAF_EMBEDDED); \ + EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.i = _val); \ + }) + +#define EA_LITERAL_STORE_ADATA(_class, _flags, _buf, _len) ({ \ + btype _type = (_class)->type; \ + ASSERT_DIE(!(_type & EAF_EMBEDDED)); \ + EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.ad = tmp_store_adata((_buf), (_len))); \ + }) + +#define EA_LITERAL_DIRECT_ADATA(_class, _flags, _adata) ({ \ + btype _type = (_class)->type; \ + ASSERT_DIE(!(_type & EAF_EMBEDDED)); \ + EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.ad = _adata); \ + }) + +#define EA_LITERAL_GENERIC(_id, _type, _flags, ...) \ + ((eattr) { .id = _id, .type = _type, .flags = _flags, __VA_ARGS__ }) + +static inline eattr * +ea_set_attr(ea_list **to, eattr a) +{ + EA_LOCAL_LIST(1) *ea = tmp_alloc(sizeof(*ea)); + *ea = (typeof(*ea)) { + .l.flags = EALF_SORTED, + .l.count = 1, + .l.next = *to, + .a[0] = a, + }; + + *to = &ea->l; + return &ea->a[0]; +} + +static inline void +ea_unset_attr(ea_list **to, _Bool local, const struct ea_class *def) +{ + ea_set_attr(to, EA_LITERAL_GENERIC(def->id, 0, 0, + .fresh = local, .originated = local, .undef = 1)); +} + +static inline void +ea_set_attr_u32(ea_list **to, const struct ea_class *def, uint flags, u64 data) +{ ea_set_attr(to, EA_LITERAL_EMBEDDED(def, flags, data)); } + +static inline void +ea_set_attr_data(ea_list **to, const struct ea_class *def, uint flags, const void *data, uint len) +{ ea_set_attr(to, EA_LITERAL_STORE_ADATA(def, flags, data, len)); } + +static inline void +ea_copy_attr(ea_list **to, ea_list *from, const struct ea_class *def) +{ + eattr *e = ea_find_by_class(from, def); + if (e) + if (e->type & EAF_EMBEDDED) + ea_set_attr_u32(to, def, e->flags, e->u.data); + else + ea_set_attr_data(to, def, e->flags, e->u.ptr->data, e->u.ptr->length); + else + ea_unset_attr(to, 0, def); +} + +/* + * Common route attributes + */ + +/* Preference: first-order comparison */ +extern struct ea_class ea_gen_preference; +static inline u32 rt_get_preference(rte *rt) +{ return ea_get_int(rt->attrs, &ea_gen_preference, 0); } + +/* IGP metric: second-order comparison */ +extern struct ea_class ea_gen_igp_metric; +u32 rt_get_igp_metric(const rte *rt); +#define IGP_METRIC_UNKNOWN 0x80000000 /* Default igp_metric used when no other + protocol-specific metric is availabe */ + +/* From: Advertising router */ +extern struct ea_class ea_gen_from; + +/* Source: An old method to devise the route source protocol and kind. + * To be superseded in a near future by something more informative. */ +extern struct ea_class ea_gen_source; +static inline u32 rt_get_source_attr(const rte *rt) +{ return ea_get_int(rt->attrs, &ea_gen_source, 0); } + +/* Flowspec validation result */ +enum flowspec_valid { + FLOWSPEC_UNKNOWN = 0, + FLOWSPEC_VALID = 1, + FLOWSPEC_INVALID = 2, + FLOWSPEC__MAX, +}; + +extern const char * flowspec_valid_names[FLOWSPEC__MAX]; +static inline const char *flowspec_valid_name(enum flowspec_valid v) +{ return (v < FLOWSPEC__MAX) ? flowspec_valid_names[v] : "???"; } + +extern struct ea_class ea_gen_flowspec_valid; +static inline enum flowspec_valid rt_get_flowspec_valid(rte *rt) +{ return ea_get_int(rt->attrs, &ea_gen_flowspec_valid, FLOWSPEC_UNKNOWN); } + +/* Next hop: For now, stored as adata */ +extern struct ea_class ea_gen_nexthop; + +static inline void ea_set_dest(struct ea_list **to, uint flags, uint dest) +{ + struct nexthop_adata nhad = NEXTHOP_DEST_LITERAL(dest); + ea_set_attr_data(to, &ea_gen_nexthop, flags, &nhad.ad.data, nhad.ad.length); +} + +/* Next hop structures */ + +#define NEXTHOP_ALIGNMENT (_Alignof(struct nexthop)) +#define NEXTHOP_MAX_SIZE (sizeof(struct nexthop) + sizeof(u32)*MPLS_MAX_LABEL_STACK) +#define NEXTHOP_SIZE(_nh) NEXTHOP_SIZE_CNT(((_nh)->labels)) +#define NEXTHOP_SIZE_CNT(cnt) BIRD_ALIGN((sizeof(struct nexthop) + sizeof(u32) * (cnt)), NEXTHOP_ALIGNMENT) +#define nexthop_size(nh) NEXTHOP_SIZE((nh)) + +#define NEXTHOP_NEXT(_nh) ((void *) (_nh) + NEXTHOP_SIZE(_nh)) +#define NEXTHOP_END(_nhad) ((_nhad)->ad.data + (_nhad)->ad.length) +#define NEXTHOP_VALID(_nh, _nhad) ((void *) (_nh) < (void *) NEXTHOP_END(_nhad)) +#define NEXTHOP_ONE(_nhad) (NEXTHOP_NEXT(&(_nhad)->nh) == NEXTHOP_END(_nhad)) + +#define NEXTHOP_WALK(_iter, _nhad) for ( \ + struct nexthop *_iter = &(_nhad)->nh; \ + (void *) _iter < (void *) NEXTHOP_END(_nhad); \ + _iter = NEXTHOP_NEXT(_iter)) + + +static inline int nexthop_same(struct nexthop_adata *x, struct nexthop_adata *y) +{ return adata_same(&x->ad, &y->ad); } +struct nexthop_adata *nexthop_merge(struct nexthop_adata *x, struct nexthop_adata *y, int max, linpool *lp); +struct nexthop_adata *nexthop_sort(struct nexthop_adata *x, linpool *lp); +int nexthop_is_sorted(struct nexthop_adata *x); + +#define NEXTHOP_IS_REACHABLE(nhad) ((nhad)->ad.length > NEXTHOP_DEST_SIZE) + +/* Route has regular, reachable nexthop (i.e. not RTD_UNREACHABLE and like) */ +static inline int rte_is_reachable(rte *r) +{ + eattr *nhea = ea_find(r->attrs, &ea_gen_nexthop); + if (!nhea) + return 0; + + struct nexthop_adata *nhad = (void *) nhea->u.ptr; + return NEXTHOP_IS_REACHABLE(nhad); +} + +static inline int nhea_dest(eattr *nhea) +{ + if (!nhea) + return RTD_NONE; + + struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + if (NEXTHOP_IS_REACHABLE(nhad)) + return RTD_UNICAST; + else + return nhad->dest; +} + +static inline int rte_dest(const rte *r) +{ + return nhea_dest(ea_find(r->attrs, &ea_gen_nexthop)); +} + +void rta_init(void); +ea_list *ea_lookup(ea_list *, int overlay); /* Get a cached (and normalized) variant of this attribute list */ +static inline int ea_is_cached(const ea_list *r) { return r->flags & EALF_CACHED; } +static inline struct ea_storage *ea_get_storage(ea_list *r) +{ + ASSERT_DIE(ea_is_cached(r)); + return SKIP_BACK(struct ea_storage, l[0], r); +} + +static inline ea_list *ea_clone(ea_list *r) { + ASSERT_DIE(0 < atomic_fetch_add_explicit(&ea_get_storage(r)->uc, 1, memory_order_acq_rel)); + return r; +} +void ea__free(struct ea_storage *r); +static inline void ea_free(ea_list *l) { + if (!l) return; + struct ea_storage *r = ea_get_storage(l); + if (1 == atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel)) ea__free(r); +} + +void ea_dump(ea_list *); +void ea_dump_all(void); +void ea_show_list(struct cli *, ea_list *); + +#define rta_lookup ea_lookup +#define rta_is_cached ea_is_cached +#define rta_clone ea_clone +#define rta_free ea_free + +#endif diff --git a/lib/settle.h b/lib/settle.h new file mode 100644 index 00000000..d274599d --- /dev/null +++ b/lib/settle.h @@ -0,0 +1,64 @@ +/* + * BIRD -- Settle timer + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * (c) 2022 CZ.NIC z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_SETTLE_H_ +#define _BIRD_SETTLE_H_ + +#include "lib/birdlib.h" +#include "lib/timer.h" + +struct settle_config { + btime min, max; +}; + +struct settle { + union { + /* Timer hook polymorphism. */ + struct { + resource _r; + void (*hook)(struct settle *); + }; + timer tm; + }; + struct settle_config cf; + btime started; +}; + +STATIC_ASSERT(OFFSETOF(struct settle, hook) == OFFSETOF(struct settle, tm) + OFFSETOF(timer, hook)); + +#define SETTLE_INIT(_cfp, _hook, _data) (struct settle) { .tm = { .data = (_data), }, .hook = (_hook), .cf = ({ASSERT_DIE((_cfp)->min <= (_cfp)->max); *(_cfp); }), } + + +static inline void settle_init(struct settle *s, struct settle_config *cf, void (*hook)(struct settle *), void *data) +{ + *s = SETTLE_INIT(cf, hook, data); +} + +#define settle_active(s) tm_active(&(s)->tm) + +static inline void settle_kick(struct settle *s, struct birdloop *loop) +{ + if (!tm_active(&s->tm)) + { + s->started = current_time(); + tm_set_in(&s->tm, s->started + s->cf.min, loop); + } + else + { + btime now = current_time(); + tm_set_in(&s->tm, MIN_(now + s->cf.min, s->started + s->cf.max), loop); + } +} + +static inline void settle_cancel(struct settle *s) +{ + tm_stop(&s->tm); +} + +#endif @@ -32,6 +32,7 @@ #include "nest/bird.h" #include "lib/resource.h" #include "lib/string.h" +#include "lib/tlists.h" #undef FAKE_SLAB /* Turn on if you want to debug memory allocations */ @@ -42,7 +43,7 @@ static void slab_free(resource *r); static void slab_dump(resource *r); static resource *slab_lookup(resource *r, unsigned long addr); -static size_t slab_memsize(resource *r); +static struct resmem slab_memsize(resource *r); #ifdef FAKE_SLAB @@ -98,7 +99,7 @@ sl_allocz(slab *s) } void -sl_free(slab *s, void *oo) +sl_free(void *oo) { struct sl_obj *o = SKIP_BACK(struct sl_obj, data, oo); @@ -128,7 +129,7 @@ slab_dump(resource *r) debug("(%d objects per %d bytes)\n", cnt, s->size); } -static size_t +static struct resmem slab_memsize(resource *r) { slab *s = (slab *) r; @@ -138,7 +139,10 @@ slab_memsize(resource *r) WALK_LIST(o, s->objs) cnt++; - return ALLOC_OVERHEAD + sizeof(struct slab) + cnt * (ALLOC_OVERHEAD + s->size); + return (struct resmem) { + .effective = cnt * s->size, + .overhead = ALLOC_OVERHEAD + sizeof(struct slab) + cnt * ALLOC_OVERHEAD, + }; } @@ -150,12 +154,38 @@ slab_memsize(resource *r) #define MAX_EMPTY_HEADS 1 +enum sl_head_state { + slh_empty = 2, + slh_partial = 0, + slh_full = 1, +} PACKED; + +struct sl_head { + struct slab *slab; + TLIST_NODE(sl_head, struct sl_head) n; + u16 num_full; + enum sl_head_state state; + u32 used_bits[0]; +}; + +struct sl_alignment { /* Magic structure for testing of alignment */ + byte data; + int x[0]; +}; + +#define TLIST_PREFIX sl_head +#define TLIST_TYPE struct sl_head +#define TLIST_ITEM n +#define TLIST_WANT_WALK +#define TLIST_WANT_ADD_HEAD + +#include "lib/tlists.h" + struct slab { resource r; - pool *p; uint obj_size, head_size, head_bitfield_len; uint objs_per_slab, num_empty_heads, data_size; - list empty_heads, partial_heads, full_heads; + struct sl_head_list empty_heads, partial_heads, full_heads; }; static struct resclass sl_class = { @@ -167,18 +197,15 @@ static struct resclass sl_class = { slab_memsize }; -struct sl_head { - node n; - u32 num_full; - u32 used_bits[0]; -}; +#define SL_GET_HEAD(x) PAGE_HEAD(x) -struct sl_alignment { /* Magic structure for testing of alignment */ - byte data; - int x[0]; -}; +#define SL_HEAD_CHANGE_STATE(_s, _h, _from, _to) ({ \ + ASSERT_DIE(_h->state == slh_##_from); \ + sl_head_rem_node(&_s->_from##_heads, _h); \ + sl_head_add_head(&_s->_to##_heads, _h); \ + _h->state = slh_##_to; \ + }) -#define SL_GET_HEAD(x) ((struct sl_head *) PAGE_HEAD(x)) /** * sl_new - create a new Slab @@ -192,10 +219,9 @@ slab * sl_new(pool *p, uint size) { slab *s = ralloc(p, &sl_class); - s->p = p; uint align = sizeof(struct sl_alignment); - if (align < sizeof(int)) - align = sizeof(int); + if (align < sizeof(void *)) + align = sizeof(void *); s->data_size = size; size = (size + align - 1) / align * align; s->obj_size = size; @@ -210,15 +236,12 @@ sl_new(pool *p, uint size) + sizeof(u32) * s->head_bitfield_len + align - 1) / align * align; - } while (s->objs_per_slab * size + s->head_size > page_size); + } while (s->objs_per_slab * size + s->head_size > (size_t) page_size); if (!s->objs_per_slab) bug("Slab: object too large"); s->num_empty_heads = 0; - init_list(&s->empty_heads); - init_list(&s->partial_heads); - init_list(&s->full_heads); return s; } @@ -235,8 +258,7 @@ sl_alloc(slab *s) struct sl_head *h; redo: - h = HEAD(s->partial_heads); - if (!h->n.next) + if (!(h = s->partial_heads.first)) goto no_partial; okay: for (uint i=0; i<s->head_bitfield_len; i++) @@ -256,26 +278,27 @@ okay: return out; } - rem_node(&h->n); - add_tail(&s->full_heads, &h->n); + SL_HEAD_CHANGE_STATE(s, h, partial, full); goto redo; no_partial: - h = HEAD(s->empty_heads); - if (h->n.next) + if (h = s->empty_heads.first) { - rem_node(&h->n); - add_head(&s->partial_heads, &h->n); + SL_HEAD_CHANGE_STATE(s, h, empty, partial); s->num_empty_heads--; goto okay; } - h = alloc_page(s->p); + + h = alloc_page(); + ASSERT_DIE(SL_GET_HEAD(h) == h); + #ifdef POISON memset(h, 0xba, page_size); #endif - ASSERT_DIE(SL_GET_HEAD(h) == h); + memset(h, 0, s->head_size); - add_head(&s->partial_heads, &h->n); + h->slab = s; + sl_head_add_head(&s->partial_heads, h); goto okay; } @@ -304,9 +327,10 @@ sl_allocz(slab *s) * and returns it back to the Slab @s. */ void -sl_free(slab *s, void *oo) +sl_free(void *oo) { struct sl_head *h = SL_GET_HEAD(oo); + struct slab *s = h->slab; #ifdef POISON memset(oo, 0xdb, s->data_size); @@ -319,24 +343,22 @@ sl_free(slab *s, void *oo) h->used_bits[pos / 32] &= ~(1 << (pos % 32)); - if (h->num_full-- == s->objs_per_slab) - { - rem_node(&h->n); - add_head(&s->partial_heads, &h->n); - } + if ((h->num_full-- == s->objs_per_slab) && (h->state == slh_full)) + SL_HEAD_CHANGE_STATE(s, h, full, partial); else if (!h->num_full) { - rem_node(&h->n); + sl_head_rem_node(&s->partial_heads, h); if (s->num_empty_heads >= MAX_EMPTY_HEADS) { #ifdef POISON memset(h, 0xde, page_size); #endif - free_page(s->p, h); + free_page(h); } else { - add_head(&s->empty_heads, &h->n); + sl_head_add_head(&s->empty_heads, h); + h->state = slh_empty; s->num_empty_heads++; } } @@ -346,14 +368,13 @@ static void slab_free(resource *r) { slab *s = (slab *) r; - struct sl_head *h, *g; - - WALK_LIST_DELSAFE(h, g, s->empty_heads) - free_page(s->p, h); - WALK_LIST_DELSAFE(h, g, s->partial_heads) - free_page(s->p, h); - WALK_LIST_DELSAFE(h, g, s->full_heads) - free_page(s->p, h); + + WALK_TLIST_DELSAFE(sl_head, h, &s->empty_heads) + free_page(h); + WALK_TLIST_DELSAFE(sl_head, h, &s->partial_heads) + free_page(h); + WALK_TLIST_DELSAFE(sl_head, h, &s->full_heads) + free_page(h); } static void @@ -361,45 +382,53 @@ slab_dump(resource *r) { slab *s = (slab *) r; int ec=0, pc=0, fc=0; - struct sl_head *h; - WALK_LIST(h, s->empty_heads) + WALK_TLIST(sl_head, h, &s->empty_heads) ec++; - WALK_LIST(h, s->partial_heads) + WALK_TLIST(sl_head, h, &s->partial_heads) pc++; - WALK_LIST(h, s->full_heads) + WALK_TLIST(sl_head, h, &s->full_heads) fc++; debug("(%de+%dp+%df blocks per %d objs per %d bytes)\n", ec, pc, fc, s->objs_per_slab, s->obj_size); } -static size_t +static struct resmem slab_memsize(resource *r) { slab *s = (slab *) r; size_t heads = 0; - struct sl_head *h; - WALK_LIST(h, s->empty_heads) + WALK_TLIST(sl_head, h, &s->full_heads) heads++; - WALK_LIST(h, s->partial_heads) + + size_t items = heads * s->objs_per_slab; + + WALK_TLIST(sl_head, h, &s->partial_heads) + { heads++; - WALK_LIST(h, s->full_heads) + items += h->num_full; + } + + WALK_TLIST(sl_head, h, &s->empty_heads) heads++; -// return ALLOC_OVERHEAD + sizeof(struct slab) + heads * (ALLOC_OVERHEAD + page_size); - return ALLOC_OVERHEAD + sizeof(struct slab); /* The page sizes are accounted for in the pool */ + size_t eff = items * s->data_size; + + return (struct resmem) { + .effective = eff, + .overhead = ALLOC_OVERHEAD + sizeof(struct slab) + heads * page_size - eff, + }; } static resource * slab_lookup(resource *r, unsigned long a) { slab *s = (slab *) r; - struct sl_head *h; - WALK_LIST(h, s->partial_heads) + WALK_TLIST(sl_head, h, &s->partial_heads) if ((unsigned long) h < a && (unsigned long) h + page_size < a) return r; - WALK_LIST(h, s->full_heads) + WALK_TLIST(sl_head, h, &s->full_heads) if ((unsigned long) h < a && (unsigned long) h + page_size < a) return r; return NULL; diff --git a/lib/slab_test.c b/lib/slab_test.c new file mode 100644 index 00000000..803d0215 --- /dev/null +++ b/lib/slab_test.c @@ -0,0 +1,171 @@ +/* + * BIRD Library -- Slab Alloc / Dealloc Tests + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include "test/birdtest.h" +#include "lib/resource.h" +#include "lib/bitops.h" + +static const int sizes[] = { + 8, 12, 18, 27, 41, 75, 131, 269, +}; + +#define TEST_SIZE 1024 * 128 +#define ITEMS(sz) TEST_SIZE / ( (sz) >> u32_log2((sz))/2 ) + +struct test_request { + int size; + enum strategy { + TEST_NONE, + TEST_FORWARDS, + TEST_BACKWARDS, + TEST_RANDOM, + TEST_MIXED, + TEST__MAX, + } strategy; +}; + +const char * const strategy_name[TEST__MAX] = { + [TEST_FORWARDS] = "forwards", + [TEST_BACKWARDS] = "backwards", + [TEST_RANDOM] = "random", + [TEST_MIXED] = "mixed", +}; + +static inline byte *test_alloc(slab *s, int sz, struct resmem *sliz) +{ + byte *out = sl_alloc(s); + + for (int p=0; p < sz; p++) + out[p] = p & 0xff; + + struct resmem ns = rmemsize((resource *) s); + + bt_assert(sliz->effective + sz == ns.effective); + bt_assert((sliz->overhead - sz - ns.overhead) % page_size == 0); + + *sliz = ns; + + return out; +} + +static inline void test_free(slab *s, byte *block, int sz, struct resmem *sliz) +{ + for (int p=0; p < sz; p++) + { + bt_assert(block[p] == (p & 0xff)); + block[p]++; + } + + sl_free(block); + + struct resmem ns = rmemsize((resource *) s); + + bt_assert(sliz->effective - sz == ns.effective); + bt_assert((sliz->overhead + sz - ns.overhead) % page_size == 0); + + *sliz = ns; +} + +static inline struct resmem get_memsize(slab *s) +{ + struct resmem sz = rmemsize((resource *) s); + bt_assert(sz.effective == 0); + return sz; +} + +static int +t_slab(const void *data) +{ + const struct test_request *tr = data; + int sz = tr->size; + + slab *s = sl_new(&root_pool, sz); + struct resmem sliz = get_memsize(s); + + int n = ITEMS(sz); + byte **block = mb_alloc(&root_pool, n * sizeof(*block)); + + switch (tr->strategy) { + case TEST_FORWARDS: + for (int i = 0; i < n; i++) + block[i] = test_alloc(s, sz, &sliz); + + for (int i = 0; i < n; i++) + test_free(s, block[i], sz, &sliz); + + break; + + case TEST_BACKWARDS: + for (int i = 0; i < n; i++) + block[i] = test_alloc(s, sz, &sliz); + + for (int i = n - 1; i >= 0; i--) + test_free(s, block[i], sz, &sliz); + + break; + + case TEST_RANDOM: + for (int i = 0; i < n; i++) + block[i] = test_alloc(s, sz, &sliz); + + for (int i = 0; i < n; i++) + { + int pos = bt_random() % (n - i); + test_free(s, block[pos], sz, &sliz); + if (pos != n - i - 1) + block[pos] = block[n - i - 1]; + } + + break; + + case TEST_MIXED: + { + int cur = 0; + int pending = n; + + while (cur + pending > 0) { + int action = bt_random() % (cur + pending); + + if (action < cur) { + test_free(s, block[action], sz, &sliz); + if (action != --cur) + block[action] = block[cur]; + } else { + block[cur++] = test_alloc(s, sz, &sliz); + pending--; + } + } + + break; + } + + default: bug("This shouldn't happen"); + } + + mb_free(block); + return 1; +} +int main(int argc, char *argv[]) +{ + bt_init(argc, argv); + + struct test_request tr; + + for (uint i = 0; i < sizeof(sizes) / sizeof(*sizes); i++) + for (uint strategy = TEST_FORWARDS; strategy < TEST__MAX; strategy++) + { + tr = (struct test_request) { + .size = sizes[i], + .strategy = strategy, + }; + bt_test_suite_arg(t_slab, &tr, "Slab allocator test, size=%d, strategy=%s", + tr.size, strategy_name[strategy]); + } + + return bt_exit_value(); +} diff --git a/lib/socket.h b/lib/socket.h index ff07660f..5c69482e 100644 --- a/lib/socket.h +++ b/lib/socket.h @@ -57,7 +57,6 @@ typedef struct birdsock { uint fast_rx; /* RX has higher priority in event loop */ uint rbsize; int (*rx_hook)(struct birdsock *, uint size); /* NULL=receiving turned off, returns 1 to clear rx buffer */ - struct event_cork *cork; /* Cork to temporarily stop receiving data */ byte *tbuf, *tpos; /* NULL=allocate automatically */ byte *ttx; /* Internal */ @@ -126,6 +125,7 @@ extern int sk_priority_control; /* Suggested priority for control traffic, shou #define SKF_TTL_RX 0x08 /* Report TTL / Hop Limit for RX packets */ #define SKF_BIND 0x10 /* Bind datagram socket to given source address */ #define SKF_HIGH_PORT 0x20 /* Choose port from high range if possible */ +#define SKF_FREEBIND 0x40 /* Allow socket to bind to a nonlocal address */ #define SKF_THREAD 0x100 /* Socked used in thread, Do not add to main loop */ #define SKF_TRUNCATED 0x200 /* Received packet was truncated, set by IO layer */ diff --git a/lib/string.h b/lib/string.h index 976b1c24..2829943d 100644 --- a/lib/string.h +++ b/lib/string.h @@ -20,6 +20,11 @@ int bvsprintf(char *str, const char *fmt, va_list args); int bsnprintf(char *str, int size, const char *fmt, ...); int bvsnprintf(char *str, int size, const char *fmt, va_list args); +char *mb_sprintf(pool *p, const char *fmt, ...); +char *mb_vsprintf(pool *p, const char *fmt, va_list args); +char *lp_sprintf(linpool *p, const char *fmt, ...); +char *lp_vsprintf(linpool *p, const char *fmt, va_list args); + int buffer_vprint(buffer *buf, const char *fmt, va_list args); int buffer_print(buffer *buf, const char *fmt, ...); void buffer_puts(buffer *buf, const char *str); diff --git a/lib/timer.c b/lib/timer.c index eb7ea690..ff6975a4 100644 --- a/lib/timer.c +++ b/lib/timer.c @@ -32,7 +32,6 @@ #include "nest/bird.h" -#include "lib/coro.h" #include "lib/heap.h" #include "lib/resource.h" #include "lib/timer.h" @@ -117,7 +116,7 @@ tm_set_in_tl(timer *t, btime when, struct timeloop *local_timeloop) t->loop = local_timeloop; - if ((t->index == 1) && (local_timeloop->coro != this_coro)) + if (t->index == 1) birdloop_ping(local_timeloop->loop); } @@ -193,6 +192,7 @@ timers_fire(struct timeloop *loop, int io_log) io_log_event(t->hook, t->data); t->hook(t); + tmp_flush(); } } diff --git a/lib/timer.h b/lib/timer.h index 04544ace..555fc96f 100644 --- a/lib/timer.h +++ b/lib/timer.h @@ -41,7 +41,6 @@ struct timeloop BUFFER_(timer *) timers; struct domain_generic *domain; struct birdloop *loop; - struct coroutine *coro; }; #define TLOCK_TIMER_ASSERT(loop) ASSERT_DIE((loop)->domain && DG_IS_LOCKED((loop)->domain)) diff --git a/lib/tlists.h b/lib/tlists.h new file mode 100644 index 00000000..e1ed79ea --- /dev/null +++ b/lib/tlists.h @@ -0,0 +1,172 @@ +/* + * BIRD Library -- Typed Linked Lists + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + * + * + * This implementation of linked lists forces its members to be + * typed. On the other hand, it needs to be implemented as ugly macros to + * keep the needed genericity. + * + * Usage: + * 1. Include this file + * 2. Define the node structure + * 3. For every list type you need to define: + * A. #define TLIST_PREFIX and other macros + * B. Include this file once again + * + * Macros to define: + * TLIST_PREFIX: prefix to prepend to everything generated + * TLIST_TYPE: the actual node type + * TLIST_ITEM: where the tlist structure is + * TLIST_WANT_WALK: if defined, generates a helper functions for list walking macros + * TLIST_WANT_ADD_HEAD: if defined, TLIST_PREFIX_add_head() is generated to + * add an item to the beginning of the list + * TLIST_WANT_ADD_TAIL: if defined, TLIST_PREFIX_add_tail() is generated to + * add an item to the end of the list + * + * TLIST_PREFIX_rem_node() is generated always. + * + * All these macros are #undef-ed by including this file. + * + * Example: + * + * #include "lib/tlists.h" + * + * struct foo { + * ... + * TLIST_NODE(bar, struct foo) baz; + * ... + * }; + * + * #define TLIST_PREFIX bar + * #define TLIST_TYPE struct foo + * #define TLIST_ITEM baz + * + * #define TLIST_WANT_WALK + * #define TLIST_WANT_ADD_HEAD + * + * #include "lib/tlists.h" + * + * ... + * (end of example) + * + */ + +#ifdef _BIRD_LIB_TLISTS_H_ +# ifdef TLIST_PREFIX + +/* Check for mandatory arguments */ +#ifndef TLIST_TYPE +#error "TLIST_TYPE must be defined" +#endif +#ifndef TLIST_ITEM +#error "TLIST_ITEM must be defined" +#endif +#ifndef TLIST_PREFIX +#error "TLIST_PREFIX must be defined" +#endif + +#define TLIST_NAME(x) MACRO_CONCAT_AFTER(TLIST_PREFIX,_##x) +#ifndef TLIST_LIST_STRUCT +#define TLIST_LIST_STRUCT TLIST_NAME(list) +#endif + +typedef struct TLIST_LIST_STRUCT { + TLIST_TYPE *first; + TLIST_TYPE *last; +} TLIST_LIST_STRUCT; + +#ifdef TLIST_WANT_WALK +static inline struct TLIST_NAME(node) * TLIST_NAME(node_get)(TLIST_TYPE *node) +{ return &(node->TLIST_ITEM); } +#endif + +#ifdef TLIST_WANT_ADD_HEAD +static inline void TLIST_NAME(add_head)(TLIST_LIST_STRUCT *list, TLIST_TYPE *node) +{ + ASSERT_DIE(!node->TLIST_ITEM.prev && !node->TLIST_ITEM.next); + if (node->TLIST_ITEM.next = list->first) + list->first->TLIST_ITEM.prev = node; + else + list->last = node; + list->first = node; +} +#endif + +#ifdef TLIST_WANT_ADD_TAIL +static inline void TLIST_NAME(add_tail)(TLIST_LIST_STRUCT *list, TLIST_TYPE *node) +{ + ASSERT_DIE(!node->TLIST_ITEM.prev && !node->TLIST_ITEM.next); + if (node->TLIST_ITEM.prev = list->last) + list->last->TLIST_ITEM.next = node; + else + list->first = node; + list->last = node; +} +#endif + +static inline void TLIST_NAME(rem_node)(TLIST_LIST_STRUCT *list, TLIST_TYPE *node) +{ + if (node->TLIST_ITEM.prev) + node->TLIST_ITEM.prev->TLIST_ITEM.next = node->TLIST_ITEM.next; + else + { + ASSERT_DIE(list->first == node); + list->first = node->TLIST_ITEM.next; + } + + if (node->TLIST_ITEM.next) + node->TLIST_ITEM.next->TLIST_ITEM.prev = node->TLIST_ITEM.prev; + else + { + ASSERT_DIE(list->last == node); + list->last = node->TLIST_ITEM.prev; + } + + node->TLIST_ITEM.next = node->TLIST_ITEM.prev = NULL; +} + +#undef TLIST_PREFIX +#undef TLIST_NAME +#undef TLIST_LIST_STRUCT +#undef TLIST_TYPE +#undef TLIST_ITEM +#undef TLIST_WANT_ADD_HEAD +#undef TLIST_WANT_ADD_TAIL + +# endif +#else +#define _BIRD_LIB_TLISTS_H_ + +#include "lib/macro.h" + +#if defined(TLIST_NAME) || defined(TLIST_PREFIX) +#error "You should first include lib/tlists.h without requesting a TLIST" +#endif + +#define TLIST_NODE(_name, _type) struct _name##_node { _type *next; _type *prev; } +#define TLIST_LIST(_name) struct _name##_list + +/* Use ->first and ->last to access HEAD and TAIL */ +#define THEAD(_name, _list) (_list)->first +#define TTAIL(_name, _list) (_list)->last + +/* Walkaround macros: simple and resilient to node removal */ +#define WALK_TLIST(_name, _node, _list) \ + for (typeof((_list)->first) _node = (_list)->first; \ + _node; _node = _name##_node_get((_node))->next) + +#define WALK_TLIST_DELSAFE(_name, _node, _list) \ + for (typeof((_list)->first) _node = (_list)->first, \ + _helper = _node ? _name##_node_get((_list)->first)->next : NULL; \ + _node; \ + (_node = _helper) ? (_helper = _name##_node_get(_helper)->next) : 0) + +/* Empty check */ +#define EMPTY_TLIST(_name, _list) (!(_list)->first) + +#endif + diff --git a/lib/type.h b/lib/type.h new file mode 100644 index 00000000..b54744c1 --- /dev/null +++ b/lib/type.h @@ -0,0 +1,112 @@ +/* + * BIRD Internet Routing Daemon -- Internal Data Types + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_TYPE_H_ +#define _BIRD_TYPE_H_ + +#include "lib/birdlib.h" +#include "lib/attrs.h" + +union bval { +#define BVAL_ITEMS \ + struct { \ + u32 data; /* Integer type inherited from eattrs */ \ + PADDING(data, 0, 4); /* Must be padded on 64-bits */ \ + }; \ + struct { \ + u32 i; /* Integer type inherited from filters */ \ + PADDING(i, 0, 4); /* Must be padded on 64-bits */ \ + }; \ + const struct adata *ptr; /* Generic attribute data inherited from eattrs */ \ + const struct adata *ad; /* Generic attribute data inherited from filters */ \ + + BVAL_ITEMS; +}; + +union bval_long { + union bval bval; /* For direct assignments */ + BVAL_ITEMS; /* For item-wise access */ + + u64 ec; + lcomm lc; + ip_addr ip; + const net_addr *net; + const char *s; + const struct f_tree *t; + const struct f_trie *ti; + const struct f_path_mask *path_mask; + struct f_path_mask_item pmi; +}; + + +/* Internal types */ +enum btype { +/* Nothing. Simply nothing. */ + T_VOID = 0, + +/* Something but inaccessible. */ + T_OPAQUE = 0x02, /* Opaque byte string (not filterable) */ + T_IFACE = 0x0c, /* Pointer to an interface (inside adata) */ + T_NEXTHOP_LIST = 0x2c, /* The whole nexthop block */ + T_HOSTENTRY = 0x2e, /* Hostentry with possible MPLS labels */ + +/* Types shared with eattrs */ + T_INT = 0x01, /* 32-bit unsigned integer number */ + T_IP = 0x04, /* IP address */ + T_QUAD = 0x05, /* Router ID (IPv4 address) */ + T_PATH = 0x06, /* BGP AS path (encoding per RFC 1771:4.3) */ + T_CLIST = 0x0a, /* Set of u32's (e.g., a community list) */ + T_ECLIST = 0x0e, /* Set of pairs of u32's - ext. community list */ + T_LCLIST = 0x08, /* Set of triplets of u32's - large community list */ + + T_ENUM_BGP_ORIGIN = 0x11, /* BGP Origin enum */ + T_ENUM_RA_PREFERENCE = 0x13, /* RA Preference enum */ + T_ENUM_FLOWSPEC_VALID = 0x15, /* Flowspec validation result */ + +#define EAF_TYPE__MAX 0x1f +#define EAF_EMBEDDED 0x01 /* Data stored in eattr.u.data (part of type spec) */ + /* Otherwise, attribute data is adata */ + +/* Other user visible types which fit in int */ + T_BOOL = 0xa0, + T_PAIR = 0xa4, /* Notice that pair is stored as integer: first << 16 | second */ + +/* Put enumerational types in 0x20..0x3f range */ + T_ENUM_LO = 0x10, + T_ENUM_HI = 0x3f, + + T_ENUM_RTS = 0x31, + T_ENUM_SCOPE = 0x33, + T_ENUM_RTD = 0x37, + T_ENUM_ROA = 0x39, + T_ENUM_NETTYPE = 0x3b, + T_ENUM_AF = 0x3d, + +/* new enums go here */ + +#define T_ENUM T_ENUM_LO ... T_ENUM_HI + +/* Bigger ones */ + T_NET = 0xb0, + T_STRING = 0xb4, + T_PATH_MASK = 0xb8, /* mask for BGP path */ + T_EC = 0xbc, /* Extended community value, u64 */ + T_LC = 0xc0, /* Large community value, lcomm */ + T_RD = 0xc4, /* Route distinguisher for VPN addresses */ + T_PATH_MASK_ITEM = 0xc8, /* Path mask item for path mask constructors */ + + T_SET = 0x80, + T_PREFIX_SET = 0x84, +} PACKED; + +typedef enum btype btype; + +STATIC_ASSERT(sizeof(btype) == sizeof(byte)); + + +#endif diff --git a/lib/type_test.c b/lib/type_test.c new file mode 100644 index 00000000..b526db69 --- /dev/null +++ b/lib/type_test.c @@ -0,0 +1,79 @@ +/* + * BIRD Library -- Data Type Alignment Tests + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include "test/birdtest.h" +#include "lib/type.h" +#include "lib/route.h" + +#define CHECK_ONE(val) \ + for (uint i=0; i<sizeof(val); i++) \ + bt_assert(((const u8 *) &val)[i] == (u8) ~0); + +#define SET_PADDING(val, name) \ + for (uint i=0; i<sizeof(val.PADDING_NAME(name)); i++) \ + val.PADDING_NAME(name)[i] = ~0; + + +static int +t_bval(void) +{ + union bval v; + + memset(&v, 0, sizeof(v)); + v.data = ~0; + SET_PADDING(v, data); + CHECK_ONE(v); + + memset(&v, 0, sizeof(v)); + v.i = ~0; + SET_PADDING(v, i); + CHECK_ONE(v); + + memset(&v, 0, sizeof(v)); + v.ptr = (void *) ~0; + CHECK_ONE(v); + + memset(&v, 0, sizeof(v)); + v.ad = (void *) ~0; + CHECK_ONE(v); + + return 1; +} + +static int +t_eattr(void) +{ + struct eattr e; + memset(&e, 0, sizeof(e)); + + e.id = ~0; + e.flags = ~0; + e.type = ~0; + e.rfu = ~0; + e.originated = ~0; + e.fresh = ~0; + e.undef = ~0; + memset(&e.u, ~0, sizeof(e.u)); /* Assumes t_bval passed */ + + SET_PADDING(e, unused); + + CHECK_ONE(e); + + return 1; +} + + +int main(int argc, char *argv[]) +{ + bt_init(argc, argv); + + bt_test_suite(t_bval, "Structure alignment test: bval"); + bt_test_suite(t_eattr, "Structure alignment test: eattr"); + + return bt_exit_value(); +} diff --git a/misc/bird.spec b/misc/bird.spec index 26b43011..c3fdd7d6 100644 --- a/misc/bird.spec +++ b/misc/bird.spec @@ -1,6 +1,6 @@ Summary: BIRD Internet Routing Daemon Name: bird -Version: 2.0.8 +Version: 2.0.10 Release: 1 Copyright: GPL Group: Networking/Daemons diff --git a/misc/docker/ubuntu-20.10-amd64/Dockerfile b/misc/docker/ubuntu-21.10-amd64/Dockerfile index 19cb1b85..aa0987b6 100644 --- a/misc/docker/ubuntu-20.10-amd64/Dockerfile +++ b/misc/docker/ubuntu-21.10-amd64/Dockerfile @@ -1,9 +1,10 @@ -FROM ubuntu:20.10 +FROM ubuntu:21.10 ENV DEBIAN_FRONTEND=noninteractive RUN sed -i 's/deb.debian.org/ftp.cz.debian.org/' /etc/apt/sources.list RUN apt-get -y update RUN apt-get -y upgrade -RUN apt-get -y install \ +RUN apt-get -y --no-install-recommends install \ + tzdata \ build-essential \ flex \ bison \ diff --git a/nest/Makefile b/nest/Makefile index 884d3950..5b27da0c 100644 --- a/nest/Makefile +++ b/nest/Makefile @@ -1,8 +1,16 @@ -src := a-path.c a-set.c cli.c cmds.c iface.c locks.c neighbor.c password.c proto.c rt-attr.c rt-dev.c rt-fib.c rt-show.c rt-table.c +src := cli.c cmds.c iface.c locks.c neighbor.c password.c proto.c proto-build.c rt-attr.c rt-dev.c rt-fib.c rt-show.c rt-table.c obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_src := a-set_test.c a-path_test.c +$(objdir)/nest/proto-build.c: $(lastword $(MAKEFILE_LIST)) + $(E)echo GEN $@ + $(Q)echo "#include \"lib/birdlib.h\"" > $@ + $(Q)$(patsubst %,echo 'void %_build(void);' >> $@;,$(PROTO_BUILD)) + $(Q)echo "void protos_build_gen(void) {" >> $@ + $(Q)$(patsubst %,echo ' %_build();'>>$@;,$(PROTO_BUILD)) + $(Q)echo "}" >> $@ + +tests_src := tests_targets := $(tests_targets) $(tests-target-files) tests_objs := $(tests_objs) $(src-o-files) diff --git a/nest/bird.h b/nest/bird.h index 55712abe..931974a0 100644 --- a/nest/bird.h +++ b/nest/bird.h @@ -9,7 +9,6 @@ #ifndef _BIRD_BIRD_H_ #define _BIRD_BIRD_H_ -#include "sysdep/config.h" #include "lib/birdlib.h" #include "lib/ip.h" #include "lib/net.h" @@ -302,7 +302,8 @@ cli_event(void *data) cli_command(c); } - cli_write_trigger(c); + if (c->tx_pos) + cli_write_trigger(c); } cli * @@ -319,7 +320,6 @@ cli_new(void *priv) c->event->data = c; c->cont = cli_hello; c->parser_pool = lp_new_default(c->pool); - c->show_pool = lp_new_default(c->pool); c->rx_buf = mb_alloc(c->pool, CLI_RX_BUF_SIZE); ev_schedule(c->event); return c; @@ -409,11 +409,14 @@ void cli_free(cli *c) { cli_set_log_echo(c, 0, 0); + int defer = 0; if (c->cleanup) - c->cleanup(c); + defer = c->cleanup(c); if (c == cmd_reconfig_stored_cli) cmd_reconfig_stored_cli = NULL; - rfree(c->pool); + + if (!defer) + rfree(c->pool); } /** @@ -33,12 +33,12 @@ typedef struct cli { struct cli_out *tx_buf, *tx_pos, *tx_write; event *event; void (*cont)(struct cli *c); - void (*cleanup)(struct cli *c); + int (*cleanup)(struct cli *c); /* Return 0 if finished and cli may be freed immediately. + Otherwise return 1 and call rfree(c->pool) when appropriate. */ void *rover; /* Private to continuation routine */ int last_reply; int restricted; /* CLI is restricted to read-only commands */ struct linpool *parser_pool; /* Pool used during parsing */ - struct linpool *show_pool; /* Pool used during route show */ byte *ring_buf; /* Ring buffer for asynchronous messages */ byte *ring_end, *ring_read, *ring_write; /* Pointers to the ring buffer */ uint ring_overflow; /* Counter of ring overflows */ diff --git a/nest/cmds.c b/nest/cmds.c index 18f39eb5..8a5bbdd4 100644 --- a/nest/cmds.c +++ b/nest/cmds.c @@ -8,7 +8,7 @@ #include "nest/bird.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "conf/conf.h" #include "nest/cmds.h" @@ -51,34 +51,60 @@ cmd_show_symbols(struct sym_show_data *sd) cli_msg(1010, "%-8s\t%s", sd->sym->name, cf_symbol_class_name(sd->sym)); else { - HASH_WALK(config->sym_hash, next, sym) - { - if (!sym->scope->active) - continue; + for (const struct sym_scope *scope = config->root_scope; scope; scope = scope->next) + HASH_WALK(scope->hash, next, sym) + { + if (!sym->scope->active) + continue; - if (sd->type && (sym->class != sd->type)) - continue; + if (sd->type && (sym->class != sd->type)) + continue; - cli_msg(-1010, "%-8s\t%s", sym->name, cf_symbol_class_name(sym)); - } - HASH_WALK_END; + cli_msg(-1010, "%-8s\t%s", sym->name, cf_symbol_class_name(sym)); + } + HASH_WALK_END; cli_msg(0, ""); } } -static void -print_size(char *dsc, size_t val) +#define SIZE_SUFFIX " kMGT" +#define SIZE_FORMAT "% 4u.%1u % 1cB" +#define SIZE_ARGS(a) (a).val, (a).decimal, SIZE_SUFFIX[(a).magnitude] + +struct size_args { + u64 val:48; + u64 decimal:8; + u64 magnitude:8; +}; + +static struct size_args +get_size_args(u64 val) { - char *px = " kMG"; - int i = 0; - while ((val >= 10000) && (i < 3)) +#define VALDEC 10 /* One decimal place */ + val *= VALDEC; + + uint i = 0; + while ((val >= 10000 * VALDEC) && (i < 4)) { val = (val + 512) / 1024; i++; } - cli_msg(-1018, "%-17s %4u %cB", dsc, (unsigned) val, px[i]); + return (struct size_args) { + .val = (val / VALDEC), + .decimal = (val % VALDEC), + .magnitude = i, + }; +} + +static void +print_size(char *dsc, struct resmem vals) +{ + struct size_args effective = get_size_args(vals.effective); + struct size_args overhead = get_size_args(vals.overhead); + + cli_msg(-1018, "%-17s " SIZE_FORMAT " " SIZE_FORMAT, dsc, SIZE_ARGS(effective), SIZE_ARGS(overhead)); } extern pool *rt_table_pool; @@ -88,10 +114,18 @@ void cmd_show_memory(void) { cli_msg(-1018, "BIRD memory usage"); + cli_msg(-1018, "%-17s Effective Overhead", ""); print_size("Routing tables:", rmemsize(rt_table_pool)); print_size("Route attributes:", rmemsize(rta_pool)); print_size("Protocols:", rmemsize(proto_pool)); - print_size("Total:", rmemsize(&root_pool)); + struct resmem total = rmemsize(&root_pool); +#ifdef HAVE_MMAP + int pk = atomic_load_explicit(&pages_kept, memory_order_relaxed) + + atomic_load_explicit(&pages_kept_locally, memory_order_relaxed); + print_size("Standby memory:", (struct resmem) { .overhead = page_size * pk }); + total.overhead += page_size * pk; +#endif + print_size("Total:", total); cli_msg(0, ""); } @@ -101,7 +135,7 @@ cmd_eval(const struct f_line *expr) buffer buf; LOG_BUFFER_INIT(buf); - if (f_eval_buf(expr, this_cli->parser_pool, &buf) > F_RETURN) + if (f_eval_buf(expr, &buf) > F_RETURN) { cli_msg(8008, "runtime error"); return; diff --git a/nest/config.Y b/nest/config.Y index 0914048b..f2904882 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -17,6 +17,7 @@ CF_HDR CF_DEFINES +static struct rtable_config *this_table; static struct proto_config *this_proto; static struct channel_config *this_channel; static struct iface_patt *this_ipatt; @@ -117,13 +118,14 @@ CF_KEYWORDS(IPV4, IPV6, VPN4, VPN6, ROA4, ROA6, FLOW4, FLOW6, SADR, MPLS) CF_KEYWORDS(RECEIVE, LIMIT, ACTION, WARN, BLOCK, RESTART, DISABLE, KEEP, FILTERED, RPKI) CF_KEYWORDS(PASSWORD, KEY, FROM, PASSIVE, TO, ID, EVENTS, PACKETS, PROTOCOLS, CHANNELS, INTERFACES) CF_KEYWORDS(ALGORITHM, KEYED, HMAC, MD5, SHA1, SHA256, SHA384, SHA512, BLAKE2S128, BLAKE2S256, BLAKE2B256, BLAKE2B512) -CF_KEYWORDS(PRIMARY, STATS, COUNT, BY, FOR, COMMANDS, PREEXPORT, NOEXPORT, EXPORTED, GENERATE) -CF_KEYWORDS(BGP, PASSWORDS, DESCRIPTION, SORTED) -CF_KEYWORDS(RELOAD, IN, OUT, MRTDUMP, MESSAGES, RESTRICT, MEMORY, IGP_METRIC, CLASS, DSCP) +CF_KEYWORDS(PRIMARY, STATS, COUNT, FOR, IN, COMMANDS, PREEXPORT, NOEXPORT, EXPORTED, GENERATE) +CF_KEYWORDS(BGP, PASSWORDS, DESCRIPTION) +CF_KEYWORDS(RELOAD, IN, OUT, MRTDUMP, MESSAGES, RESTRICT, MEMORY, CLASS, DSCP) CF_KEYWORDS(TIMEFORMAT, ISO, SHORT, LONG, ROUTE, PROTOCOL, BASE, LOG, S, MS, US) -CF_KEYWORDS(GRACEFUL, RESTART, WAIT, MAX, FLUSH, AS) +CF_KEYWORDS(GRACEFUL, RESTART, WAIT, MAX, AS) CF_KEYWORDS(MIN, IDLE, RX, TX, INTERVAL, MULTIPLIER, PASSIVE) CF_KEYWORDS(CHECK, LINK) +CF_KEYWORDS(CORK, SORTED, TRIE, MIN, MAX, ROA, ROUTE, REFRESH, SETTLE, TIME, GC, THRESHOLD, PERIOD) /* For r_args_channel */ CF_KEYWORDS(IPV4, IPV4_MC, IPV4_MPLS, IPV6, IPV6_MC, IPV6_MPLS, IPV6_SADR, VPN4, VPN4_MC, VPN4_MPLS, VPN6, VPN6_MC, VPN6_MPLS, ROA4, ROA6, FLOW4, FLOW6, MPLS, PRI, SEC) @@ -131,7 +133,7 @@ CF_KEYWORDS(IPV4, IPV4_MC, IPV4_MPLS, IPV6, IPV6_MC, IPV6_MPLS, IPV6_SADR, VPN4, CF_ENUM(T_ENUM_RTS, RTS_, STATIC, INHERIT, DEVICE, STATIC_DEVICE, REDIRECT, RIP, OSPF, OSPF_IA, OSPF_EXT1, OSPF_EXT2, BGP, PIPE, BABEL) CF_ENUM(T_ENUM_SCOPE, SCOPE_, HOST, LINK, SITE, ORGANIZATION, UNIVERSE, UNDEFINED) -CF_ENUM(T_ENUM_RTD, RTD_, UNICAST, BLACKHOLE, UNREACHABLE, PROHIBIT) +CF_ENUM(T_ENUM_RTD, RTD_, BLACKHOLE, UNREACHABLE, PROHIBIT) CF_ENUM(T_ENUM_ROA, ROA_, UNKNOWN, VALID, INVALID) CF_ENUM_PX(T_ENUM_AF, AF_, AFI_, IPV4, IPV6) @@ -141,7 +143,7 @@ CF_ENUM_PX(T_ENUM_AF, AF_, AFI_, IPV4, IPV6) %type <s> optproto %type <ra> r_args %type <sd> sym_args -%type <i> proto_start echo_mask echo_size debug_mask debug_list debug_flag mrtdump_mask mrtdump_list mrtdump_flag export_mode limit_action net_type table_sorted tos password_algorithm +%type <i> proto_start echo_mask echo_size debug_mask debug_list debug_flag mrtdump_mask mrtdump_list mrtdump_flag export_mode limit_action net_type tos password_algorithm %type <ps> proto_patt proto_patt2 %type <cc> channel_start proto_channel %type <cl> limit_spec @@ -163,7 +165,7 @@ rtrid: idval: NUM { $$ = $1; } - | '(' term ')' { $$ = f_eval_int(f_linearize($2)); } + | '(' term ')' { $$ = f_eval_int(f_linearize($2, 1)); } | IP4 { $$ = ip4_to_u32($1); } | CF_SYM_KNOWN { if ($1->class == (SYM_CONSTANT | T_INT) || $1->class == (SYM_CONSTANT | T_QUAD)) @@ -206,16 +208,44 @@ CF_ENUM(T_ENUM_NETTYPE, NET_, IP4, IP6, VPN4, VPN6, ROA4, ROA6, FLOW4, FLOW6, IP conf: table ; +table: table_start table_sorted table_opt_list ; + +table_start: net_type TABLE symbol { + this_table = rt_new_table($3, $1); + } + ; + table_sorted: - { $$ = 0; } - | SORTED { $$ = 1; } + /* empty */ + | SORTED { this_table->sorted = 1; } ; -table: net_type TABLE symbol table_sorted { - struct rtable_config *cf; - cf = rt_new_table($3, $1); - cf->sorted = $4; +table_opt: + SORTED bool { this_table->sorted = $2; } + | TRIE bool { + if (!net_val_match(this_table->addr_type, NB_IP | NB_VPN | NB_ROA | NB_IP6_SADR)) + cf_error("Trie option not supported for %s table", net_label[this_table->addr_type]); + this_table->trie_used = $2; } + | GC THRESHOLD expr { this_table->gc_threshold = $3; } + | GC PERIOD expr_us { this_table->gc_period = (uint) $3; if ($3 > 3600 S_) cf_error("GC period must be at most 3600 s"); } + | CORK THRESHOLD expr expr { + if ($3 > $4) cf_error("Cork low threshold must be lower than the high threshold."); + this_table->cork_threshold.low = $3; + this_table->cork_threshold.high = $4; } + | EXPORT SETTLE TIME settle { this_table->export_settle = $4; } + | ROUTE REFRESH EXPORT SETTLE TIME settle { this_table->export_rr_settle = $6; } + | DEBUG bool { this_table->debug = $2; } + ; + +table_opts: + /* empty */ + | table_opts table_opt ';' + ; + +table_opt_list: + /* empty */ + | '{' table_opts '}' ; @@ -283,12 +313,26 @@ channel_item_: this_channel->table = $2; } | IMPORT imexport { this_channel->in_filter = $2; } + | EXPORT IN net_any imexport { + if (this_channel->net_type && ($3->type != this_channel->net_type)) + cf_error("Incompatible export prefilter type"); + this_channel->out_subprefix = $3; + this_channel->out_filter = $4; + } | EXPORT imexport { this_channel->out_filter = $2; } | RECEIVE LIMIT limit_spec { this_channel->rx_limit = $3; } | IMPORT LIMIT limit_spec { this_channel->in_limit = $3; } | EXPORT LIMIT limit_spec { this_channel->out_limit = $3; } + | ROA SETTLE TIME settle { this_channel->roa_settle = $4; } | PREFERENCE expr { this_channel->preference = $2; check_u16($2); } - | IMPORT KEEP FILTERED bool { this_channel->in_keep_filtered = $4; } + | IMPORT KEEP FILTERED bool { + if ($4) + this_channel->in_keep |= RIK_REJECTED; + else if ((this_channel->in_keep & RIK_PREFILTER) == RIK_PREFILTER) + cf_error("Import keep filtered is implied by the import table."); + else + this_channel->in_keep &= ~RIK_REJECTED; + } | RPKI RELOAD bool { this_channel->rpki_reload = $3; } ; @@ -319,7 +363,11 @@ channel_end: proto_channel: channel_start channel_opt_list channel_end; -rtable: CF_SYM_KNOWN { cf_assert_symbol($1, SYM_TABLE); $$ = $1->table; } ; +rtable: CF_SYM_KNOWN { + cf_assert_symbol($1, SYM_TABLE); + if (!$1->table) rt_new_default_table($1); + $$ = $1->table; +} ; imexport: FILTER filter { $$ = $2; } @@ -348,7 +396,7 @@ debug_default: DEBUG PROTOCOLS debug_mask { new_config->proto_default_debug = $3; } | DEBUG CHANNELS debug_mask { new_config->channel_default_debug = $3; } | DEBUG COMMANDS expr { new_config->cli_debug = $3; } - | DEBUG TABLES bool { new_config->table_debug = $3; } + | DEBUG TABLES debug_mask { new_config->table_debug = $3; } ; /* MRTDUMP PROTOCOLS is in systep/unix/config.Y */ @@ -377,7 +425,6 @@ timeformat_base: TIMEFORMAT timeformat_spec ';' ; - /* Interface patterns */ iface_patt_node_init: @@ -617,21 +664,31 @@ r_args: $$ = cfg_allocz(sizeof(struct rt_show_data)); init_list(&($$->tables)); $$->filter = FILTER_ACCEPT; - $$->running_on_config = new_config->fallback; + $$->running_on_config = config; + $$->cli = this_cli; } | r_args net_any { $$ = $1; if ($$->addr) cf_error("Only one prefix expected"); $$->addr = $2; + $$->addr_mode = TE_ADDR_EQUAL; } | r_args FOR r_args_for { $$ = $1; if ($$->addr) cf_error("Only one prefix expected"); - $$->show_for = 1; $$->addr = $3; + $$->addr_mode = TE_ADDR_FOR; + } + | r_args IN net_any { + $$ = $1; + if ($$->addr) cf_error("Only one prefix expected"); + if (!net_type_match($3, NB_IP)) cf_error("Only IP networks accepted for 'in' argument"); + $$->addr = $3; + $$->addr_mode = TE_ADDR_IN; } - | r_args TABLE CF_SYM_KNOWN { +| r_args TABLE symbol_known { cf_assert_symbol($3, SYM_TABLE); + if (!$3->table) cf_error("Table %s not configured", $3->name); $$ = $1; rt_show_add_table($$, $3->table->table); $$->tables_defined_by = RSD_TDB_DIRECT; @@ -644,13 +701,14 @@ r_args: $$->tables_defined_by = RSD_TDB_ALL; } | r_args IMPORT TABLE channel_arg { - if (!$4->in_table) cf_error("No import table in channel %s.%s", $4->proto->name, $4->name); - rt_show_add_table($$, $4->in_table->tab); + if (!($4->in_keep & RIK_PREFILTER)) cf_error("No import table in channel %s.%s", $4->proto->name, $4->name); + RT_LOCKED($4->table, tab) + rt_show_add_exporter($$, &tab->exporter.e, "import")->prefilter = $4; $$->tables_defined_by = RSD_TDB_DIRECT; } | r_args EXPORT TABLE channel_arg { if (!$4->out_table) cf_error("No export table in channel %s.%s", $4->proto->name, $4->name); - rt_show_add_table($$, $4->out_table->tab); + rt_show_add_exporter($$, $4->out_table, "export"); $$->tables_defined_by = RSD_TDB_DIRECT; } | r_args FILTER filter { @@ -675,7 +733,7 @@ r_args: $$ = $1; $$->filtered = 1; } - | r_args export_mode CF_SYM_KNOWN { + | r_args export_mode symbol_known { cf_assert_symbol($3, SYM_PROTO); struct proto_config *c = (struct proto_config *) $3->proto; $$ = $1; @@ -692,7 +750,7 @@ r_args: $$->export_channel = $3; $$->tables_defined_by = RSD_TDB_INDIRECT; } - | r_args PROTOCOL CF_SYM_KNOWN { + | r_args PROTOCOL symbol_known { cf_assert_symbol($3, SYM_PROTO); struct proto_config *c = (struct proto_config *) $3->proto; $$ = $1; @@ -820,7 +878,7 @@ CF_CLI(DUMP INTERFACES,,, [[Dump interface information]]) CF_CLI(DUMP NEIGHBORS,,, [[Dump neighbor cache]]) { neigh_dump_all(); cli_msg(0, ""); } ; CF_CLI(DUMP ATTRIBUTES,,, [[Dump attribute cache]]) -{ rta_dump_all(); cli_msg(0, ""); } ; +{ ea_dump_all(); cli_msg(0, ""); } ; CF_CLI(DUMP ROUTES,,, [[Dump routes]]) { rt_dump_all(); cli_msg(0, ""); } ; CF_CLI(DUMP TABLES,,, [[Dump table connections]]) @@ -831,7 +889,7 @@ CF_CLI(DUMP FILTER ALL,,, [[Dump all filters in linearized form]]) { filters_dump_all(); cli_msg(0, ""); } ; CF_CLI(EVAL, term, <expr>, [[Evaluate an expression]]) -{ cmd_eval(f_linearize($2)); } ; +{ cmd_eval(f_linearize($2, 1)); } ; CF_CLI_HELP(ECHO, ..., [[Control echoing of log messages]]) CF_CLI(ECHO, echo_mask echo_size, (all | off | { debug|trace|info|remote|warning|error|auth [, ...] }) [<buffer-size>], [[Control echoing of log messages]]) { @@ -894,9 +952,6 @@ proto_patt2: | TEXT { $$.ptr = $1; $$.patt = 1; } ; -dynamic_attr: IGP_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_GEN_IGP_METRIC); } ; - - CF_CODE CF_END diff --git a/nest/iface.c b/nest/iface.c index 5cb9e814..fc896e26 100644 --- a/nest/iface.c +++ b/nest/iface.c @@ -595,7 +595,7 @@ ifa_update(struct ifa *a) if (ipa_equal(b->brd, a->brd) && ipa_equal(b->opposite, a->opposite) && b->scope == a->scope && - !((b->flags ^ a->flags) & IA_PEER)) + !((b->flags ^ a->flags) & (IA_SECONDARY | IA_PEER | IA_HOST))) { b->flags |= IA_UPDATED; return b; diff --git a/nest/limit.h b/nest/limit.h index 5838ad3b..f8d4b212 100644 --- a/nest/limit.h +++ b/nest/limit.h @@ -32,6 +32,7 @@ static inline int limit_push(struct limit *l, void *data) static inline void limit_pop(struct limit *l) { + ASSERT_DIE(l->count > 0); --l->count; } diff --git a/nest/neighbor.c b/nest/neighbor.c index cb2d1b2b..81da24d5 100644 --- a/nest/neighbor.c +++ b/nest/neighbor.c @@ -345,7 +345,7 @@ neigh_free(neighbor *n) { rem_node(&n->n); rem_node(&n->if_n); - sl_free(neigh_slab, n); + sl_free(n); } /** diff --git a/nest/proto.c b/nest/proto.c index 978582ca..e39fbdfe 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -15,19 +15,17 @@ #include "lib/event.h" #include "lib/timer.h" #include "lib/string.h" -#include "lib/coro.h" #include "conf/conf.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/iface.h" #include "nest/cli.h" #include "filter/filter.h" #include "filter/f-inst.h" pool *proto_pool; -list proto_list; +list STATIC_LIST_INIT(proto_list); -static list protocol_list; -struct protocol *class_to_protocol[PROTOCOL__MAX]; +static list STATIC_LIST_INIT(protocol_list); #define CD(c, msg, args...) ({ if (c->debug & D_STATES) log(L_TRACE "%s.%s: " msg, c->proto->name, c->name ?: "?", ## args); }) #define PD(p, msg, args...) ({ if (p->debug & D_STATES) log(L_TRACE "%s: " msg, p->name, ## args); }) @@ -48,7 +46,7 @@ static char *c_states[] = { "DOWN", "START", "UP", "STOP", "RESTART" }; extern struct protocol proto_unix_iface; -static void channel_aux_request_refeed(struct channel_aux_table *cat); +static void channel_request_reload(struct channel *c); static void proto_shutdown_loop(timer *); static void proto_rethink_goal(struct proto *p); static char *proto_state_name(struct proto *p); @@ -57,6 +55,7 @@ static void channel_update_limit(struct channel *c, struct limit *l, int dir, st static void channel_reset_limit(struct channel *c, struct limit *l, int dir); static void channel_feed_end(struct channel *c); static void channel_export_stopped(struct rt_export_request *req); +static void channel_check_stopped(struct channel *c); static inline int proto_is_done(struct proto *p) { return (p->proto_state == PS_DOWN) && proto_is_inactive(p); } @@ -110,9 +109,7 @@ channel_export_log_state_change(struct rt_export_request *req, u8 state) switch (state) { case TES_FEEDING: - if (c->out_table) - rt_refresh_begin(&c->out_table->push); - else if (c->proto->feed_begin) + if (c->proto->feed_begin) c->proto->feed_begin(c, !c->refeeding); break; case TES_READY: @@ -202,8 +199,7 @@ proto_find_channel_by_name(struct proto *p, const char *n) return NULL; } -rte * channel_preimport(struct rt_import_request *req, rte *new, rte *old); -rte * channel_in_preimport(struct rt_import_request *req, rte *new, rte *old); +int channel_preimport(struct rt_import_request *req, rte *new, rte *old); void rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); void rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); @@ -236,25 +232,22 @@ proto_add_channel(struct proto *p, struct channel_config *cf) c->channel = cf->channel; c->proto = p; c->table = cf->table->table; - - RT_LOCKED(c->table, t) - rt_lock_table(t); + rt_lock_table(c->table); c->in_filter = cf->in_filter; c->out_filter = cf->out_filter; + c->out_subprefix = cf->out_subprefix; channel_init_limit(c, &c->rx_limit, PLD_RX, &cf->rx_limit); channel_init_limit(c, &c->in_limit, PLD_IN, &cf->in_limit); channel_init_limit(c, &c->out_limit, PLD_OUT, &cf->out_limit); - c->rte_update_pool = lp_new_default(proto_pool); - c->net_type = cf->net_type; c->ra_mode = cf->ra_mode; c->preference = cf->preference; c->debug = cf->debug; c->merge_limit = cf->merge_limit; - c->in_keep_filtered = cf->in_keep_filtered; + c->in_keep = cf->in_keep; c->rpki_reload = cf->rpki_reload; c->channel_state = CS_DOWN; @@ -279,9 +272,7 @@ proto_remove_channel(struct proto *p UNUSED, struct channel *c) CD(c, "Removed", c->name); - RT_LOCKED(c->table, t) - rt_unlock_table(t); - + rt_unlock_table(c->table); rem_node(&c->n); mb_free(c); } @@ -322,20 +313,34 @@ proto_remove_channels(struct proto *p) proto_remove_channel(p, c); } +struct roa_subscription { + node roa_node; + struct settle settle; + struct channel *c; + struct rt_export_request req; +}; + static void -channel_roa_in_changed(void *_data) +channel_roa_in_changed(struct settle *se) { - struct channel *c = _data; + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, settle, se); + struct channel *c = s->c; + int active = !!c->reload_req.hook; - CD(c, "Reload triggered by RPKI change"); + CD(c, "Reload triggered by RPKI change%s", active ? " - already active" : ""); - channel_request_reload(c); + if (!active) + channel_request_reload(c); + else + c->reload_pending = 1; } static void -channel_roa_out_changed(void *_data) +channel_roa_out_changed(struct settle *se) { - struct channel *c = _data; + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, settle, se); + struct channel *c = s->c; + CD(c, "Feeding triggered by RPKI change"); c->refeed_pending = 1; @@ -344,29 +349,47 @@ channel_roa_out_changed(void *_data) rt_stop_export(&c->out_req, channel_export_stopped); } -/* Temporary code, subscriptions should be changed to resources */ -struct roa_subscription { - struct rt_subscription s; - node roa_node; -}; +static void +channel_export_one_roa(struct rt_export_request *req, const net_addr *net UNUSED, struct rt_pending_export *first) +{ + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, req, req); + + /* TODO: use the information about what roa has changed */ + settle_kick(&s->settle, s->c->proto->loop); + + rpe_mark_seen_all(req->hook, first, NULL); +} + +static void +channel_dump_roa_req(struct rt_export_request *req) +{ + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, req, req); + struct channel *c = s->c; + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter.e, req->hook->table); + + debug(" Channel %s.%s ROA %s change notifier from table %s request %p\n", + c->proto->name, c->name, + (s->settle.hook == channel_roa_in_changed) ? "import" : "export", + tab->name, req); +} static int channel_roa_is_subscribed(struct channel *c, rtable *tab, int dir) { - void (*hook)(void *) = + void (*hook)(struct settle *) = dir ? channel_roa_in_changed : channel_roa_out_changed; struct roa_subscription *s; node *n; WALK_LIST2(s, n, c->roa_subscriptions, roa_node) - if ((s->s.tab == tab) && (s->s.event->hook == hook)) + if ((tab == SKIP_BACK(rtable, priv.exporter.e, s->req.hook->table)) + && (s->settle.hook == hook)) return 1; return 0; } - static void channel_roa_subscribe(struct channel *c, rtable *tab, int dir) { @@ -374,21 +397,40 @@ channel_roa_subscribe(struct channel *c, rtable *tab, int dir) return; struct roa_subscription *s = mb_allocz(c->proto->pool, sizeof(struct roa_subscription)); - s->s.event = ev_new_init(c->proto->pool, dir ? channel_roa_in_changed : channel_roa_out_changed, c); - s->s.event->list = proto_work_list(c->proto); - rt_subscribe(tab, &s->s); + *s = (struct roa_subscription) { + .settle = SETTLE_INIT(&c->roa_settle, dir ? channel_roa_in_changed : channel_roa_out_changed, NULL), + .c = c, + .req = { + .name = mb_sprintf(c->proto->pool, "%s.%s.roa-%s.%s", + c->proto->name, c->name, dir ? "in" : "out", tab->name), + .list = proto_work_list(c->proto), + .trace_routes = c->debug | c->proto->debug, + .dump_req = channel_dump_roa_req, + .export_one = channel_export_one_roa, + }, + }; add_tail(&c->roa_subscriptions, &s->roa_node); + rt_request_export(tab, &s->req); } static void -channel_roa_unsubscribe(struct roa_subscription *s) +channel_roa_unsubscribed(struct rt_export_request *req) { - rt_unsubscribe(&s->s); + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, req, req); + struct channel *c = s->c; + rem_node(&s->roa_node); - rfree(s->s.event); mb_free(s); + + channel_check_stopped(c); +} + +static void +channel_roa_unsubscribe(struct roa_subscription *s) +{ + rt_stop_export(&s->req, channel_roa_unsubscribed); } static void @@ -408,7 +450,7 @@ channel_roa_subscribe_filter(struct channel *c, int dir) #ifdef CONFIG_BGP /* No automatic reload for BGP channels without in_table / out_table */ if (c->channel == &channel_bgp) - valid = dir ? !!c->in_table : !!c->out_table; + valid = dir ? ((c->in_keep & RIK_PREFILTER) == RIK_PREFILTER) : !!c->out_table; #endif struct filter_iterator fit; @@ -418,14 +460,8 @@ channel_roa_subscribe_filter(struct channel *c, int dir) { switch (fi->fi_code) { - case FI_ROA_CHECK_IMPLICIT: - tab = fi->i_FI_ROA_CHECK_IMPLICIT.rtc->table; - if (valid) channel_roa_subscribe(c, tab, dir); - found = 1; - break; - - case FI_ROA_CHECK_EXPLICIT: - tab = fi->i_FI_ROA_CHECK_EXPLICIT.rtc->table; + case FI_ROA_CHECK: + tab = fi->i_FI_ROA_CHECK.rtc->table; if (valid) channel_roa_subscribe(c, tab, dir); found = 1; break; @@ -462,14 +498,10 @@ channel_start_import(struct channel *c) return; } - int nlen = strlen(c->name) + strlen(c->proto->name) + 2; - char *rn = mb_allocz(c->proto->pool, nlen); - bsprintf(rn, "%s.%s", c->proto->name, c->name); - c->in_req = (struct rt_import_request) { - .name = rn, - .list = proto_work_list(c->proto), + .name = mb_sprintf(c->proto->pool, "%s.%s", c->proto->name, c->name), .trace_routes = c->debug | c->proto->debug, + .list = proto_work_list(c->proto), .dump_req = channel_dump_import_req, .log_state_change = channel_import_log_state_change, .preimport = channel_preimport, @@ -491,19 +523,17 @@ channel_start_export(struct channel *c) { if (c->out_req.hook) { - c->restart_export = 1; - log(L_WARN "%s.%s: Fast channel export restart", c->proto->name, c->name); + log(L_WARN "%s.%s: Attempted to start channel's already started export", c->proto->name, c->name); return; } ASSERT(c->channel_state == CS_UP); - int nlen = strlen(c->name) + strlen(c->proto->name) + 2; - char *rn = mb_allocz(c->proto->pool, nlen); - bsprintf(rn, "%s.%s", c->proto->name, c->name); c->out_req = (struct rt_export_request) { - .name = rn, + .name = mb_sprintf(c->proto->pool, "%s.%s", c->proto->name, c->name), .list = proto_work_list(c->proto), + .addr = c->out_subprefix, + .addr_mode = c->out_subprefix ? TE_ADDR_IN : TE_ADDR_NONE, .trace_routes = c->debug | c->proto->debug, .dump_req = channel_dump_export_req, .log_state_change = channel_export_log_state_change, @@ -544,7 +574,7 @@ channel_check_stopped(struct channel *c) switch (c->channel_state) { case CS_STOP: - if (c->out_req.hook || c->in_req.hook || c->out_table || c->in_table) + if (!EMPTY_LIST(c->roa_subscriptions) || c->out_req.hook || c->in_req.hook) return; channel_set_state(c, CS_DOWN); @@ -552,7 +582,7 @@ channel_check_stopped(struct channel *c) break; case CS_PAUSE: - if (c->out_req.hook) + if (!EMPTY_LIST(c->roa_subscriptions) || c->out_req.hook) return; channel_set_state(c, CS_START); @@ -565,11 +595,9 @@ channel_check_stopped(struct channel *c) } void -channel_import_stopped(void *_c) +channel_import_stopped(struct rt_import_request *req) { - struct channel *c = _c; - - c->in_req.hook = NULL; + struct channel *c = SKIP_BACK(struct channel, in_req, req); mb_free(c->in_req.name); c->in_req.name = NULL; @@ -596,13 +624,7 @@ channel_export_stopped(struct rt_export_request *req) mb_free(c->out_req.name); c->out_req.name = NULL; - if (c->restart_export) - { - c->restart_export = 0; - channel_start_export(c); - } - else - channel_check_stopped(c); + channel_check_stopped(c); } static void @@ -624,342 +646,72 @@ channel_feed_end(struct channel *c) return; } - if (c->out_table) - rt_refresh_end(&c->out_table->push); - else if (c->proto->feed_end) + if (c->proto->feed_end) c->proto->feed_end(c); if (c->refeed_pending) rt_stop_export(req, channel_export_stopped); -} - -#define CHANNEL_AUX_TABLE_DUMP_REQ(inout, imex, pgimex, pushget) static void \ - channel_##inout##_##pushget##_dump_req(struct rt_##pgimex##_request *req) { \ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, pushget, req); \ - debug(" Channel %s.%s " #imex " table " #pushget " request %p\n", cat->c->proto->name, cat->c->name, req); } - -CHANNEL_AUX_TABLE_DUMP_REQ(in, import, import, push) -CHANNEL_AUX_TABLE_DUMP_REQ(in, import, export, get) -CHANNEL_AUX_TABLE_DUMP_REQ(out, export, import, push) -CHANNEL_AUX_TABLE_DUMP_REQ(out, export, export, get) - -#undef CHANNEL_AUX_TABLE_DUMP_REQ - -static uint channel_aux_imex(struct channel_aux_table *cat) -{ - if (cat->c->in_table == cat) - return 0; - else if (cat->c->out_table == cat) - return 1; else - bug("Channel aux table must be in_table or out_table"); -} - -static void -channel_aux_stopped(void *data) -{ - struct channel_aux_table *cat = data; - struct channel *c = cat->c; - - if (channel_aux_imex(cat)) - c->out_table = NULL; - else - c->in_table = NULL; - - rfree(cat->tab->priv.rp); - mb_free(cat); - channel_check_stopped(c); -} - -static void -channel_aux_import_stopped(void *_cat) -{ - struct channel_aux_table *cat = _cat; - cat->push.hook = NULL; -} - -static void -channel_aux_export_stopped(struct rt_export_request *req) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - req->hook = NULL; - - int del; - RT_LOCKED(cat->tab, t) - del = !!t->delete_event; - - if (del) - return; - - ASSERT_DIE(cat->refeed_pending); - cat->refeed_pending = 0; - rt_request_export(cat->tab, req); -} - -static void -channel_aux_stop(struct channel_aux_table *cat) -{ - RT_LOCKED(cat->tab, t) - { - t->delete_event = ev_new_init(t->rp, channel_aux_stopped, cat); - t->delete_event->list = proto_event_list(cat->c->proto); - } - - cat->push_stopped = (event) { - .hook = channel_aux_import_stopped, - .data = cat, - .list = proto_event_list(cat->c->proto), - }; - - rt_stop_import(&cat->push, &cat->push_stopped); - rt_stop_export(&cat->get, channel_aux_export_stopped); -} - -static void -channel_push_log_state_change(struct rt_import_request *req, u8 state) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, push, req); - const char *imex = channel_aux_imex(cat) ? "export" : "import"; - CD(cat->c, "Channel %s table import state changed to %s", imex, rt_import_state_name(state)); -} - -static void -channel_get_log_state_change(struct rt_export_request *req, u8 state) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - const char *imex = channel_aux_imex(cat) ? "export" : "import"; - CD(cat->c, "Channel %s table export state changed to %s", imex, rt_export_state_name(state)); - - switch (state) - { - case TES_FEEDING: - if (imex && cat->c->proto->feed_begin) - cat->c->proto->feed_begin(cat->c, !cat->c->refeeding); - else if (!imex) - rt_refresh_begin(&cat->c->in_req); - break; - - case TES_READY: - if (imex && cat->c->proto->feed_end) - cat->c->proto->feed_end(cat->c); - else if (!imex) - rt_refresh_end(&cat->c->in_req); - - if (cat->refeed_pending) - rt_stop_export(&cat->get, channel_aux_export_stopped); - - break; - } -} - -void rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); - -static int -channel_aux_export_one_any(struct rt_export_request *req, struct rt_pending_export *rpe, rte **new, rte **old) -{ - struct rte_src *src = rpe->new ? rpe->new->rte.src : rpe->old->rte.src; - *old = RTES_OR_NULL(rpe->old); - struct rte_storage *new_stored; - - while (rpe) - { - new_stored = rpe->new; - rpe_mark_seen(req->hook, rpe); - rpe = rpe_next(rpe, src); - } - - *new = RTES_CLONE(new_stored, *new); - - return (*new || *old) && (&new_stored->rte != *old); + c->refeeding = 0; } -static int -channel_aux_export_one_best(struct rt_export_request *req, struct rt_pending_export *rpe, rte **new, rte **old) +/* Called by protocol for reload from in_table */ +void +channel_schedule_reload(struct channel *c) { - *old = RTES_OR_NULL(rpe->old_best); - struct rte_storage *new_stored; - - while (rpe) - { - new_stored = rpe->new_best; - rpe_mark_seen(req->hook, rpe); - rpe = rpe_next(rpe, NULL); - } - - *new = RTES_CLONE(new_stored, *new); + ASSERT(c->in_req.hook); - return (*new || *old) && (&new_stored->rte != *old); + rt_refresh_begin(&c->in_req); + rt_request_export(c->table, &c->reload_req); } static void -channel_in_export_one_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +channel_reload_stopped(struct rt_export_request *req) { - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); + struct channel *c = SKIP_BACK(struct channel, reload_req, req); - rte n0, *new = &n0, *old; - if (channel_aux_export_one_any(req, rpe, &new, &old)) - rte_update_direct(cat->c, net, new, old ? old->src : new->src); + /* Restart reload */ + if (c->reload_pending) + channel_request_reload(c); } static void -channel_in_export_one_best(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +channel_reload_log_state_change(struct rt_export_request *req, u8 state) { - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); + struct channel *c = SKIP_BACK(struct channel, reload_req, req); - rte n0, *new = &n0, *old; - if (channel_aux_export_one_best(req, rpe, &new, &old)) - rte_update_direct(cat->c, net, new, old ? old->src : new->src); -} - -static void -channel_in_export_bulk_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - for (uint i=0; i<count; i++) + if (state == TES_READY) { - rte n0 = *feed[i]; - rte_update_direct(cat->c, net, &n0, n0.src); + rt_refresh_end(&c->in_req); + rt_stop_export(req, channel_reload_stopped); } } static void -channel_in_export_bulk_best(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - if (!count) - return; - - rte n0 = *feed[0]; - rte_update_direct(cat->c, net, &n0, n0.src); -} - -void do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte *old); - -static void -channel_out_export_one_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - rte n0, *new = &n0, *old; - if (channel_aux_export_one_any(req, rpe, &new, &old)) - do_rt_notify_direct(cat->c, net, new, old); -} - -static void -channel_out_export_one_best(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +channel_reload_dump_req(struct rt_export_request *req) { - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - rte n0, *new = &n0, *old; - if (channel_aux_export_one_best(req, rpe, &new, &old)) - do_rt_notify_direct(cat->c, net, new, old); + struct channel *c = SKIP_BACK(struct channel, reload_req, req); + debug(" Channel %s.%s import reload request %p\n", c->proto->name, c->name, req); } -static void -channel_out_export_bulk(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - if (cat->c->ra_mode != RA_ANY) - ASSERT_DIE(count <= 1); - - for (uint i=0; i<count; i++) - { - rte n0 = *feed[i]; - do_rt_notify_direct(cat->c, net, &n0, NULL); - } -} +void channel_reload_export_bulk(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe, rte **feed, uint count); /* Called by protocol to activate in_table */ void -channel_setup_in_table(struct channel *c, int best) +channel_setup_in_table(struct channel *c) { - int nlen = sizeof("import") + strlen(c->name) + strlen(c->proto->name) + 3; - - struct { - struct channel_aux_table cat; - struct rtable_config tab_cf; - char name[0]; - } *cat = mb_allocz(c->proto->pool, sizeof(*cat) + nlen); - - bsprintf(cat->name, "%s.%s.import", c->proto->name, c->name); - - cat->tab_cf.name = cat->name; - cat->tab_cf.addr_type = c->net_type; - cat->tab_cf.cork_limit = 4 * page_size / sizeof(struct rt_pending_export); - - c->in_table = &cat->cat; - c->in_table->push = (struct rt_import_request) { - .name = cat->name, - .list = proto_work_list(c->proto), - .trace_routes = c->debug | c->proto->debug, - .dump_req = channel_in_push_dump_req, - .log_state_change = channel_push_log_state_change, - .preimport = channel_in_preimport, - }; - c->in_table->get = (struct rt_export_request) { - .name = cat->name, + c->reload_req = (struct rt_export_request) { + .name = mb_sprintf(c->proto->pool, "%s.%s.import", c->proto->name, c->name), .list = proto_work_list(c->proto), .trace_routes = c->debug | c->proto->debug, - .dump_req = channel_in_get_dump_req, - .log_state_change = channel_get_log_state_change, - .export_one = best ? channel_in_export_one_best : channel_in_export_one_any, - .export_bulk = best ? channel_in_export_bulk_best : channel_in_export_bulk_any, + .export_bulk = channel_reload_export_bulk, + .dump_req = channel_reload_dump_req, + .log_state_change = channel_reload_log_state_change, }; - c->in_table->c = c; - c->in_table->tab = rt_setup(c->proto->pool, &cat->tab_cf); - - rt_request_import(c->in_table->tab, &c->in_table->push); - rt_request_export(c->in_table->tab, &c->in_table->get); + c->in_keep |= RIK_PREFILTER; } -/* Called by protocol to activate out_table */ -void -channel_setup_out_table(struct channel *c) -{ - int nlen = sizeof("export") + strlen(c->name) + strlen(c->proto->name) + 3; - - struct { - struct channel_aux_table cat; - struct rtable_config tab_cf; - char name[0]; - } *cat = mb_allocz(c->proto->pool, sizeof(*cat) + nlen); - - bsprintf(cat->name, "%s.%s.export", c->proto->name, c->name); - - cat->tab_cf.name = cat->name; - cat->tab_cf.addr_type = c->net_type; - cat->tab_cf.cork_limit = 4 * page_size / sizeof(struct rt_pending_export); - - c->out_table = &cat->cat; - c->out_table->push = (struct rt_import_request) { - .name = cat->name, - .list = proto_work_list(c->proto), - .trace_routes = c->debug | c->proto->debug, - .dump_req = channel_out_push_dump_req, - .log_state_change = channel_push_log_state_change, - }; - c->out_table->get = (struct rt_export_request) { - .name = cat->name, - .list = proto_work_list(c->proto), - .trace_routes = c->debug | c->proto->debug, - .dump_req = channel_out_get_dump_req, - .log_state_change = channel_get_log_state_change, - .export_one = (c->ra_mode == RA_ANY) ? channel_out_export_one_any : channel_out_export_one_best, - .export_bulk = channel_out_export_bulk, - }; - - c->out_table->c = c; - c->out_table->tab = rt_setup(c->proto->pool, &cat->tab_cf); - - rt_request_import(c->out_table->tab, &c->out_table->push); - rt_request_export(c->out_table->tab, &c->out_table->get); -} - -static void -channel_aux_request_refeed(struct channel_aux_table *cat) -{ - cat->refeed_pending = 1; - rt_stop_export(&cat->get, channel_aux_export_stopped); -} static void channel_do_start(struct channel *c) @@ -985,13 +737,19 @@ channel_do_up(struct channel *c) static void channel_do_pause(struct channel *c) { - /* Stop export */ - if (c->out_req.hook) + /* Need to abort feeding */ + if (c->reload_req.hook) { - rt_stop_export(&c->out_req, channel_export_stopped); - c->refeeding = 0; + c->reload_pending = 0; + rt_stop_export(&c->reload_req, channel_reload_stopped); } + /* Stop export */ + if (c->refeed_pending) + c->refeed_pending = 0; + else if (c->out_req.hook) + rt_stop_export(&c->out_req, channel_export_stopped); + channel_roa_unsubscribe_all(c); bmap_free(&c->export_map); @@ -1001,23 +759,9 @@ channel_do_pause(struct channel *c) static void channel_do_stop(struct channel *c) { - /* Drop auxiliary tables */ - if (c->in_table) - channel_aux_stop(c->in_table); - - if (c->out_table) - channel_aux_stop(c->out_table); - /* Stop import */ if (c->in_req.hook) - { - c->in_stopped = (event) { - .hook = channel_import_stopped, - .data = c, - .list = proto_event_list(c->proto), - }; - rt_stop_import(&c->in_req, &c->in_stopped); - } + rt_stop_import(&c->in_req, channel_import_stopped); c->gr_wait = 0; if (c->gr_lock) @@ -1025,13 +769,12 @@ channel_do_stop(struct channel *c) CALL(c->channel->shutdown, c); - channel_roa_unsubscribe_all(c); } static void channel_do_down(struct channel *c) { - ASSERT(!c->out_req.hook && !c->in_req.hook && !c->out_table && !c->in_table); + ASSERT(!c->reload_req.hook); c->proto->active_channels--; @@ -1039,11 +782,11 @@ channel_do_down(struct channel *c) memset(&c->import_stats, 0, sizeof(struct channel_import_stats)); memset(&c->export_stats, 0, sizeof(struct channel_export_stats)); - CALL(c->channel->cleanup, c); + c->out_table = NULL; - /* This have to be done in here, as channel pool is freed before channel_do_down() */ - bmap_free(&c->export_map); - bmap_free(&c->export_reject_map); + /* The in_table and out_table are going to be freed by freeing their resource pools. */ + + CALL(c->channel->cleanup, c); /* Schedule protocol shutddown */ if (proto_is_done(c->proto)) @@ -1073,7 +816,7 @@ channel_set_state(struct channel *c, uint state) break; case CS_UP: - ASSERT(cs == CS_DOWN || cs == CS_START || cs == CS_PAUSE); + ASSERT(cs == CS_DOWN || cs == CS_START); if (cs == CS_DOWN) channel_do_start(c); @@ -1123,32 +866,19 @@ channel_set_state(struct channel *c, uint state) * completed, it will switch back to ES_READY. This function can be called * even when feeding is already running, in that case it is restarted. */ -static void -channel_request_table_feeding(struct channel *c) -{ - ASSERT(c->out_req.hook); - - c->refeed_pending = 1; - rt_stop_export(&c->out_req, channel_export_stopped); -} - void channel_request_feeding(struct channel *c) { - if (c->gr_wait || !c->proto->rt_notify) - return; - - CD(c, "Refeed requested"); + ASSERT(c->out_req.hook); - ASSERT_DIE(c->out_req.hook); + if (c->refeed_pending) + return; - if (c->out_table) - channel_aux_request_refeed(c->out_table); - else - channel_request_table_feeding(c); + c->refeed_pending = 1; + rt_stop_export(&c->out_req, channel_export_stopped); } -void +static void channel_request_reload(struct channel *c) { ASSERT(c->in_req.hook); @@ -1156,31 +886,12 @@ channel_request_reload(struct channel *c) CD(c, "Reload requested"); - if (c->in_table) - channel_aux_request_refeed(c->in_table); + if (c->in_keep & RIK_PREFILTER) + channel_schedule_reload(c); else c->proto->reload_routes(c); } -void -channel_refresh_begin(struct channel *c) -{ - CD(c, "Channel route refresh begin"); - if (c->in_table) - rt_refresh_begin(&c->in_table->push); - else - rt_refresh_begin(&c->in_req); -} - -void -channel_refresh_end(struct channel *c) -{ - if (c->in_table) - rt_refresh_end(&c->in_table->push); - else - rt_refresh_end(&c->in_req); -} - const struct channel_class channel_basic = { .channel_size = sizeof(struct channel), .config_size = sizeof(struct channel_config) @@ -1200,7 +911,7 @@ channel_config_new(const struct channel_class *cc, const char *name, uint net_ty if (proto->net_type && (net_type != proto->net_type)) cf_error("Different channel type"); - tab = new_config->def_tables[net_type]; + tab = rt_get_default_table(new_config, net_type); } if (!cc) @@ -1219,6 +930,11 @@ channel_config_new(const struct channel_class *cc, const char *name, uint net_ty cf->debug = new_config->channel_default_debug; cf->rpki_reload = 1; + cf->roa_settle = (struct settle_config) { + .min = 1 S, + .max = 20 S, + }; + add_tail(&proto->channels, &cf->n); return cf; @@ -1264,7 +980,12 @@ int channel_reconfigure(struct channel *c, struct channel_config *cf) { /* FIXME: better handle these changes, also handle in_keep_filtered */ - if ((c->table != cf->table->table) || (cf->ra_mode && (c->ra_mode != cf->ra_mode))) + if ((c->table != cf->table->table) || + (cf->ra_mode && (c->ra_mode != cf->ra_mode)) || + (cf->in_keep != c->in_keep) || + cf->out_subprefix && c->out_subprefix && + !net_equal(cf->out_subprefix, c->out_subprefix) || + (!cf->out_subprefix != !c->out_subprefix)) return 0; /* Note that filter_same() requires arguments in (new, old) order */ @@ -1289,11 +1010,27 @@ channel_reconfigure(struct channel *c, struct channel_config *cf) // c->ra_mode = cf->ra_mode; c->merge_limit = cf->merge_limit; c->preference = cf->preference; + c->out_req.addr = c->out_subprefix = cf->out_subprefix; c->debug = cf->debug; c->in_req.trace_routes = c->out_req.trace_routes = c->debug | c->proto->debug; - c->in_keep_filtered = cf->in_keep_filtered; c->rpki_reload = cf->rpki_reload; + if ( (c->roa_settle.min != cf->roa_settle.min) + || (c->roa_settle.max != cf->roa_settle.max)) + { + c->roa_settle = cf->roa_settle; + + struct roa_subscription *s; + node *n; + + WALK_LIST2(s, n, c->roa_subscriptions, roa_node) + { + s->settle.cf = cf->roa_settle; + if (settle_active(&s->settle)) + settle_kick(&s->settle, &main_birdloop); + } + } + /* Execute channel-specific reconfigure hook */ if (c->channel->reconfigure && !c->channel->reconfigure(c, cf, &import_changed, &export_changed)) return 0; @@ -1336,7 +1073,7 @@ channel_reconfigure(struct channel *c, struct channel_config *cf) channel_request_reload(c); if (export_changed) - channel_request_table_feeding(c); + channel_request_feeding(c); done: CD(c, "Reconfigured"); @@ -1403,7 +1140,6 @@ proto_loop_stopped(void *ptr) birdloop_enter(&main_birdloop); p->loop = &main_birdloop; - p->event->list = NULL; proto_cleanup(p); birdloop_leave(&main_birdloop); @@ -1486,21 +1222,13 @@ proto_start(struct proto *p) DBG("Kicking %s up\n", p->name); PD(p, "Starting"); - int ns = strlen("Protocol ") + strlen(p->cf->name) + 1; - void *nb = mb_alloc(proto_pool, ns); - ASSERT_DIE(ns - 1 == bsnprintf(nb, ns, "Protocol %s", p->cf->name)); - - p->pool = rp_new(proto_pool, nb); + p->pool = rp_newf(proto_pool, "Protocol %s", p->cf->name); if (graceful_restart_state == GRS_INIT) p->gr_recovery = 1; if (p->cf->loop_order != DOMAIN_ORDER(the_bird)) - p->loop = birdloop_new(p->pool, p->cf->loop_order, nb); - - p->event->list = proto_event_list(p); - - mb_move(nb, p->pool); + p->loop = birdloop_new(p->pool, p->cf->loop_order, p->pool->name); PROTO_LOCKED_FROM_MAIN(p) proto_notify_state(p, (p->proto->start ? p->proto->start(p) : PS_UP)); @@ -2069,7 +1797,7 @@ protos_dump_all(void) WALK_LIST(p, proto_list) { #define DPF(x) (p->x ? " " #x : "") - debug(" protocol %s (%p) state %s with %d active channels flags: %s%s%s%s\n", + debug(" protocol %s (%p) state %s with %d active channels flags: %s%s%s%s%s\n", p->name, p, p_states[p->proto_state], p->active_channels, DPF(disabled), DPF(active), DPF(do_stop), DPF(reconfiguring)); #undef DPF @@ -2085,20 +1813,6 @@ protos_dump_all(void) debug("\tChannel state: %s/%s/%s\n", c_states[c->channel_state], c->in_req.hook ? rt_import_state_name(rt_import_get_state(c->in_req.hook)) : "-", c->out_req.hook ? rt_export_state_name(rt_export_get_state(c->out_req.hook)) : "-"); - if (c->in_table) - { - debug("\tInput aux table:\n"); - rt_dump_hooks(c->in_table->tab); - rt_dump(c->in_table->tab); - debug("\tEnd of input aux table.\n"); - } - if (c->out_table) - { - debug("\tOutput aux table:\n"); - rt_dump_hooks(c->in_table->tab); - rt_dump(c->in_table->tab); - debug("\tEnd of output aux table.\n"); - } } if (p->proto->dump && (p->proto_state != PS_DOWN)) @@ -2118,14 +1832,13 @@ void proto_build(struct protocol *p) { add_tail(&protocol_list, &p->n); - ASSERT(p->class); - ASSERT(!class_to_protocol[p->class]); - class_to_protocol[p->class] = p; } /* FIXME: convert this call to some protocol hook */ extern void bfd_init_all(void); +void protos_build_gen(void); + /** * protos_build - build a protocol list * @@ -2138,44 +1851,7 @@ extern void bfd_init_all(void); void protos_build(void) { - init_list(&proto_list); - init_list(&protocol_list); - - proto_build(&proto_device); -#ifdef CONFIG_RADV - proto_build(&proto_radv); -#endif -#ifdef CONFIG_RIP - proto_build(&proto_rip); -#endif -#ifdef CONFIG_STATIC - proto_build(&proto_static); -#endif -#ifdef CONFIG_MRT - proto_build(&proto_mrt); -#endif -#ifdef CONFIG_OSPF - proto_build(&proto_ospf); -#endif -#ifdef CONFIG_PIPE - proto_build(&proto_pipe); -#endif -#ifdef CONFIG_BGP - proto_build(&proto_bgp); -#endif -#ifdef CONFIG_BFD - proto_build(&proto_bfd); - bfd_init_all(); -#endif -#ifdef CONFIG_BABEL - proto_build(&proto_babel); -#endif -#ifdef CONFIG_RPKI - proto_build(&proto_rpki); -#endif -#ifdef CONFIG_PERF - proto_build(&proto_perf); -#endif + protos_build_gen(); proto_pool = rp_new(&root_pool, "Protocols"); proto_shutdown_timer = tm_new(proto_pool); @@ -2362,7 +2038,7 @@ proto_do_start(struct proto *p) { p->active = 1; - rt_init_sources(&p->sources, p->name, proto_work_list(p)); + rt_init_sources(&p->sources, p->name, proto_event_list(p)); if (!p->sources.class) p->sources.class = &default_rte_owner_class; @@ -2522,18 +2198,18 @@ channel_show_stats(struct channel *c) u32 in_routes = c->in_limit.count; u32 out_routes = c->out_limit.count; - if (c->in_keep_filtered) + if (c->in_keep) cli_msg(-1006, " Routes: %u imported, %u filtered, %u exported, %u preferred", in_routes, (rx_routes - in_routes), out_routes, SRI(pref)); else cli_msg(-1006, " Routes: %u imported, %u exported, %u preferred", in_routes, out_routes, SRI(pref)); - cli_msg(-1006, " Route change stats: received rejected filtered ignored limited accepted"); - cli_msg(-1006, " Import updates: %10u %10u %10u %10u %10u %10u", + cli_msg(-1006, " Route change stats: received rejected filtered ignored RX limit IN limit accepted"); + cli_msg(-1006, " Import updates: %10u %10u %10u %10u %10u %10u %10u", SCI(updates_received), SCI(updates_invalid), SCI(updates_filtered), SRI(updates_ignored), - SCI(updates_limited_rx) + SCI(updates_limited_in), + SCI(updates_limited_rx), SCI(updates_limited_in), SRI(updates_accepted)); cli_msg(-1006, " Import withdraws: %10u %10u --- %10u --- %10u", SCI(withdraws_received), SCI(withdraws_invalid), diff --git a/nest/protocol.h b/nest/protocol.h index 1d4f2059..892d1890 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -12,7 +12,8 @@ #include "lib/lists.h" #include "lib/resource.h" #include "lib/event.h" -#include "nest/route.h" +#include "lib/settle.h" +#include "nest/rt.h" #include "nest/limit.h" #include "conf/conf.h" @@ -37,38 +38,20 @@ struct symbol; * Routing Protocol */ -enum protocol_class { - PROTOCOL_NONE, - PROTOCOL_BABEL, - PROTOCOL_BFD, - PROTOCOL_BGP, - PROTOCOL_DEVICE, - PROTOCOL_DIRECT, - PROTOCOL_KERNEL, - PROTOCOL_OSPF, - PROTOCOL_MRT, - PROTOCOL_PERF, - PROTOCOL_PIPE, - PROTOCOL_RADV, - PROTOCOL_RIP, - PROTOCOL_RPKI, - PROTOCOL_STATIC, - PROTOCOL__MAX -}; - -extern struct protocol *class_to_protocol[PROTOCOL__MAX]; struct protocol { node n; char *name; char *template; /* Template for automatic generation of names */ int name_counter; /* Counter for automatic name generation */ - enum protocol_class class; /* Machine readable protocol class */ uint preference; /* Default protocol preference */ uint channel_mask; /* Mask of accepted channel types (NB_*) */ uint proto_size; /* Size of protocol data structure */ uint config_size; /* Size of protocol config data structure */ + uint eattr_begin; /* First ID of registered eattrs */ + uint eattr_end; /* End of eattr id zone */ + void (*preconfig)(struct protocol *, struct config *); /* Just before configuring */ void (*postconfig)(struct proto_config *); /* After configuring each instance */ struct proto * (*init)(struct proto_config *); /* Create new instance */ @@ -77,13 +60,13 @@ struct protocol { int (*start)(struct proto *); /* Start the instance */ int (*shutdown)(struct proto *); /* Stop the instance */ void (*get_status)(struct proto *, byte *buf); /* Get instance status (for `show protocols' command) */ - int (*get_attr)(const struct eattr *, byte *buf, int buflen); /* ASCIIfy dynamic attribute (returns GA_*) */ +// int (*get_attr)(const struct eattr *, byte *buf, int buflen); /* ASCIIfy dynamic attribute (returns GA_*) */ void (*show_proto_info)(struct proto *); /* Show protocol info (for `show protocols all' command) */ void (*copy_config)(struct proto_config *, struct proto_config *); /* Copy config from given protocol instance */ }; -void protos_build(void); -void proto_build(struct protocol *); +void protos_build(void); /* Called from sysdep to initialize protocols */ +void proto_build(struct protocol *); /* Called from protocol to register itself */ void protos_preconfig(struct config *); void protos_commit(struct config *new, struct config *old, int force_restart, int type); struct proto * proto_spawn(struct proto_config *cf, uint disabled); @@ -150,7 +133,7 @@ struct proto { u32 debug; /* Debugging flags */ u32 mrtdump; /* MRTDump flags */ uint active_channels; /* Number of active channels */ - uint active_coroutines; /* Number of active coroutines */ + uint active_loops; /* Number of active IO loops */ byte net_type; /* Protocol network type (NET_*), 0 for undefined */ byte disabled; /* Manually disabled */ byte proto_state; /* Protocol state machine (PS_*, see below) */ @@ -197,19 +180,16 @@ struct proto { * Routing entry hooks (called only for routes belonging to this protocol): * * rte_recalculate Called at the beginning of the best route selection - * rte_better Compare two rte's and decide which one is better (1=first, 0=second). - * rte_same Compare two rte's and decide whether they are identical (1=yes, 0=no). * rte_mergable Compare two rte's and decide whether they could be merged (1=yes, 0=no). * rte_insert Called whenever a rte is inserted to a routing table. * rte_remove Called whenever a rte is removed from the routing table. */ - int (*rte_recalculate)(rtable *, struct network *, struct rte *, struct rte *, struct rte *); - int (*rte_better)(struct rte *, struct rte *); + int (*rte_recalculate)(struct rtable_private *, struct network *, struct rte *, struct rte *, struct rte *); int (*rte_mergable)(struct rte *, struct rte *); void (*rte_insert)(struct network *, struct rte *); void (*rte_remove)(struct network *, struct rte *); - u32 (*rte_igp_metric)(struct rte *); + u32 (*rte_igp_metric)(const struct rte *); /* Hic sunt protocol-specific data */ }; @@ -357,7 +337,7 @@ void proto_notify_state(struct proto *p, unsigned state); */ static inline int proto_is_inactive(struct proto *p) -{ return (p->active_channels == 0) && (p->active_coroutines == 0) && (p->sources.uc == 0); } +{ return (p->active_channels == 0) && (p->active_loops == 0) && (p->sources.uc == 0); } /* @@ -471,18 +451,21 @@ struct channel_config { struct proto_config *parent; /* Where channel is defined (proto or template) */ struct rtable_config *table; /* Table we're attached to */ const struct filter *in_filter, *out_filter; /* Attached filters */ + const net_addr *out_subprefix; /* Export only subprefixes of this net */ struct channel_limit rx_limit; /* Limit for receiving routes from protocol - (relevant when in_keep_filtered is active) */ + (relevant when in_keep & RIK_REJECTED) */ struct channel_limit in_limit; /* Limit for importing routes from protocol */ struct channel_limit out_limit; /* Limit for exporting routes to protocol */ + struct settle_config roa_settle; /* Settle times for ROA-induced reload */ + u8 net_type; /* Routing table network type (NET_*), 0 for undefined */ u8 ra_mode; /* Mode of received route advertisements (RA_*) */ u16 preference; /* Default route preference */ u32 debug; /* Debugging flags (D_*) */ u8 merge_limit; /* Maximal number of nexthops for RA_MERGED */ - u8 in_keep_filtered; /* Routes rejected in import filter are kept */ + u8 in_keep; /* Which states of routes to keep (RIK_*) */ u8 rpki_reload; /* RPKI changes trigger channel reload */ }; @@ -496,19 +479,19 @@ struct channel { rtable *table; const struct filter *in_filter; /* Input filter */ const struct filter *out_filter; /* Output filter */ + const net_addr *out_subprefix; /* Export only subprefixes of this net */ struct bmap export_map; /* Keeps track which routes were really exported */ struct bmap export_reject_map; /* Keeps track which routes were rejected by export filter */ - struct limit rx_limit; /* Receive limit (for in_keep_filtered) */ + struct limit rx_limit; /* Receive limit (for in_keep & RIK_REJECTED) */ struct limit in_limit; /* Input limit */ struct limit out_limit; /* Output limit */ + struct settle_config roa_settle; /* Settle times for ROA-induced reload */ + u8 limit_actions[PLD_MAX]; /* Limit actions enum */ u8 limit_active; /* Flags for active limits */ - linpool *rte_update_pool; - uint rte_update_nest_cnt; - struct channel_import_stats { /* Import - from protocol to core */ u32 updates_received; /* Number of route updates received */ @@ -539,7 +522,7 @@ struct channel { u16 preference; /* Default route preference */ u32 debug; /* Debugging flags (D_*) */ u8 merge_limit; /* Maximal number of nexthops for RA_MERGED */ - u8 in_keep_filtered; /* Routes rejected in import filter are kept */ + u8 in_keep; /* Which states of routes to keep (RIK_*) */ u8 disabled; u8 stale; /* Used in reconfiguration */ @@ -548,31 +531,22 @@ struct channel { u8 reloadable; /* Hook reload_routes() is allowed on the channel */ u8 gr_lock; /* Graceful restart mechanism should wait for this channel */ u8 gr_wait; /* Route export to channel is postponed until graceful restart */ - u8 restart_export; /* Route export should restart as soon as it stops */ btime last_state_change; /* Time of last state transition */ - struct channel_aux_table *in_table; /* Internal table for received routes */ - struct event in_stopped; /* Import stop callback */ + struct rt_export_request reload_req; /* Feeder for import reload */ u8 reload_pending; /* Reloading and another reload is scheduled */ u8 refeed_pending; /* Refeeding and another refeed is scheduled */ u8 rpki_reload; /* RPKI changes trigger channel reload */ - struct channel_aux_table *out_table; /* Internal table for exported routes */ + struct rt_exporter *out_table; /* Internal table for exported routes */ - list roa_subscriptions; /* List of active ROA table subscriptions based on filters roa_check() */ + list roa_subscriptions; /* List of active ROA table subscriptions based on filters' roa_check() calls */ }; -struct channel_aux_table { - struct channel *c; - struct rt_import_request push; - struct rt_export_request get; - event push_stopped; - rtable *tab; - event *stop; - u8 refeed_pending; -}; +#define RIK_REJECTED 1 /* Routes rejected in import filter are kept */ +#define RIK_PREFILTER (2 | RIK_REJECTED) /* All routes' attribute state before import filter is kept */ /* * Channel states @@ -638,8 +612,7 @@ struct channel *proto_add_channel(struct proto *p, struct channel_config *cf); int proto_configure_channel(struct proto *p, struct channel **c, struct channel_config *cf); void channel_set_state(struct channel *c, uint state); -void channel_setup_in_table(struct channel *c, int best); -void channel_setup_out_table(struct channel *c); +void channel_setup_in_table(struct channel *c); void channel_schedule_reload(struct channel *c); static inline void channel_init(struct channel *c) { channel_set_state(c, CS_START); } @@ -647,9 +620,6 @@ static inline void channel_open(struct channel *c) { channel_set_state(c, CS_UP) static inline void channel_close(struct channel *c) { channel_set_state(c, CS_STOP); } void channel_request_feeding(struct channel *c); -void channel_request_reload(struct channel *c); -void channel_refresh_begin(struct channel *c); -void channel_refresh_end(struct channel *c); void *channel_config_new(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto); void *channel_config_get(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto); int channel_reconfigure(struct channel *c, struct channel_config *cf); diff --git a/nest/route.h b/nest/route.h deleted file mode 100644 index b5d44040..00000000 --- a/nest/route.h +++ /dev/null @@ -1,959 +0,0 @@ -/* - * BIRD Internet Routing Daemon -- Routing Table - * - * (c) 1998--2000 Martin Mares <mj@ucw.cz> - * (c) 2019--2021 Maria Matejka <mq@jmq.cz> - * - * Can be freely distributed and used under the terms of the GNU GPL. - */ - -#ifndef _BIRD_ROUTE_H_ -#define _BIRD_ROUTE_H_ - -#include "lib/lists.h" -#include "lib/event.h" -#include "lib/bitmap.h" -#include "lib/resource.h" -#include "lib/net.h" -#include "lib/hash.h" -#include "lib/event.h" - -#include <stdatomic.h> - -struct ea_list; -struct protocol; -struct proto; -struct channel; -struct rte_src; -struct symbol; -struct timer; -struct filter; -struct cli; - -/* - * Generic data structure for storing network prefixes. Also used - * for the master routing table. Currently implemented as a hash - * table. - * - * Available operations: - * - insertion of new entry - * - deletion of entry - * - searching for entry by network prefix - * - asynchronous retrieval of fib contents - */ - -struct fib_node { - struct fib_node *next; /* Next in hash chain */ - struct fib_iterator *readers; /* List of readers of this node */ - net_addr addr[0]; -}; - -struct fib_iterator { /* See lib/slists.h for an explanation */ - struct fib_iterator *prev, *next; /* Must be synced with struct fib_node! */ - byte efef; /* 0xff to distinguish between iterator and node */ - byte pad[3]; - struct fib_node *node; /* Or NULL if freshly merged */ - uint hash; -}; - -typedef void (*fib_init_fn)(void *); - -struct fib { - pool *fib_pool; /* Pool holding all our data */ - slab *fib_slab; /* Slab holding all fib nodes */ - struct fib_node **hash_table; /* Node hash table */ - uint hash_size; /* Number of hash table entries (a power of two) */ - uint hash_order; /* Binary logarithm of hash_size */ - uint hash_shift; /* 32 - hash_order */ - uint addr_type; /* Type of address data stored in fib (NET_*) */ - uint node_size; /* FIB node size, 0 for nonuniform */ - uint node_offset; /* Offset of fib_node struct inside of user data */ - uint entries; /* Number of entries */ - uint entries_min, entries_max; /* Entry count limits (else start rehashing) */ - fib_init_fn init; /* Constructor */ -}; - -static inline void * fib_node_to_user(struct fib *f, struct fib_node *e) -{ return e ? (void *) ((char *) e - f->node_offset) : NULL; } - -static inline struct fib_node * fib_user_to_node(struct fib *f, void *e) -{ return e ? (void *) ((char *) e + f->node_offset) : NULL; } - -void fib_init(struct fib *f, pool *p, uint addr_type, uint node_size, uint node_offset, uint hash_order, fib_init_fn init); -void *fib_find(struct fib *, const net_addr *); /* Find or return NULL if doesn't exist */ -void *fib_get_chain(struct fib *f, const net_addr *a); /* Find first node in linked list from hash table */ -void *fib_get(struct fib *, const net_addr *); /* Find or create new if nonexistent */ -void *fib_route(struct fib *, const net_addr *); /* Longest-match routing lookup */ -void fib_delete(struct fib *, void *); /* Remove fib entry */ -void fib_free(struct fib *); /* Destroy the fib */ -void fib_check(struct fib *); /* Consistency check for debugging */ - -void fit_init(struct fib_iterator *, struct fib *); /* Internal functions, don't call */ -struct fib_node *fit_get(struct fib *, struct fib_iterator *); -void fit_put(struct fib_iterator *, struct fib_node *); -void fit_put_next(struct fib *f, struct fib_iterator *i, struct fib_node *n, uint hpos); -void fit_put_end(struct fib_iterator *i); -void fit_copy(struct fib *f, struct fib_iterator *dst, struct fib_iterator *src); - - -#define FIB_WALK(fib, type, z) do { \ - struct fib_node *fn_, **ff_ = (fib)->hash_table; \ - uint count_ = (fib)->hash_size; \ - type *z; \ - while (count_--) \ - for (fn_ = *ff_++; z = fib_node_to_user(fib, fn_); fn_=fn_->next) - -#define FIB_WALK_END } while (0) - -#define FIB_ITERATE_INIT(it, fib) fit_init(it, fib) - -#define FIB_ITERATE_START(fib, it, type, z) do { \ - struct fib_node *fn_ = fit_get(fib, it); \ - uint count_ = (fib)->hash_size; \ - uint hpos_ = (it)->hash; \ - type *z; \ - for(;;) { \ - if (!fn_) \ - { \ - if (++hpos_ >= count_) \ - break; \ - fn_ = (fib)->hash_table[hpos_]; \ - continue; \ - } \ - z = fib_node_to_user(fib, fn_); - -#define FIB_ITERATE_END fn_ = fn_->next; } } while(0) - -#define FIB_ITERATE_PUT(it) fit_put(it, fn_) - -#define FIB_ITERATE_PUT_NEXT(it, fib) fit_put_next(fib, it, fn_, hpos_) - -#define FIB_ITERATE_PUT_END(it) fit_put_end(it) - -#define FIB_ITERATE_UNLINK(it, fib) fit_get(fib, it) - -#define FIB_ITERATE_COPY(dst, src, fib) fit_copy(fib, dst, src) - - -/* - * Master Routing Tables. Generally speaking, each of them contains a FIB - * with each entry pointing to a list of route entries representing routes - * to given network (with the selected one at the head). - * - * Each of the RTE's contains variable data (the preference and protocol-dependent - * metrics) and a pointer to a route attribute block common for many routes). - * - * It's guaranteed that there is at most one RTE for every (prefix,proto) pair. - */ - -typedef struct rtable_private { -#define RTABLE_PUBLIC \ - resource r; \ - node n; /* Node in list of all tables */ \ - struct birdloop *loop; /* This loop runs the table */ \ - char *name; /* Name of this table */ \ - uint addr_type; /* Type of address data stored in table (NET_*) */ \ - struct rtable_config *config; /* Configuration of this table */ \ - struct event *nhu_event; /* Event to update next hops */ \ - _Atomic byte nhu_state; /* Next Hop Update state */ \ - - RTABLE_PUBLIC; - pool *rp; /* Resource pool to allocate everything from, including itself */ - struct slab *rte_slab; /* Slab to allocate route objects */ - struct fib fib; - int use_count; /* Number of protocols using this table */ - u32 rt_count; /* Number of routes in the table */ - u32 rr_count; /* Number of running route refresh requests */ - - list imports; /* Registered route importers */ - list exports; /* Registered route exporters */ - - struct hmap id_map; - struct hostcache *hostcache; - struct event *prune_event; /* Event to prune abandoned routes */ - struct event *announce_event; /* Event to announce pending exports */ - struct event *ec_event; /* Event to prune finished exports */ - struct event *hcu_event; /* Event to update host cache */ - struct event *delete_event; /* Event to delete the table */ - btime last_rt_change; /* Last time when route changed */ - btime base_settle_time; /* Start time of rtable settling interval */ - btime gc_time; /* Time of last GC */ - int gc_counter; /* Number of operations since last GC */ - byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ - - byte cork_active; /* Congestion control activated */ - - struct fib_iterator prune_fit; /* Rtable prune FIB iterator */ - struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ - struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */ - - linpool *nhu_lp; /* Linpool used for NHU */ - - list subscribers; /* Subscribers for notifications */ - struct timer *settle_timer; /* Settle time for notifications */ - - list pending_exports; /* List of packed struct rt_pending_export */ - - struct rt_pending_export *first_export; /* First export to announce */ - u64 next_export_seq; /* The next export will have this ID */ -} rtable_private; - -typedef union { - struct { RTABLE_PUBLIC }; - rtable_private priv; -} rtable; - -#define RT_LOCK(tab) ({ birdloop_enter((tab)->loop); &(tab)->priv; }) -#define RT_UNLOCK(tab) birdloop_leave((tab)->loop) -#define RT_PRIV(tab) ({ ASSERT_DIE(birdloop_inside((tab)->loop)); &(tab)->priv; }) - -#define RT_LOCKED(tpub, tpriv) for (rtable_private *tpriv = RT_LOCK(tpub); tpriv; RT_UNLOCK(tpriv), (tpriv = NULL)) - -struct rtable_config { - node n; - char *name; - struct config *config; - rtable *table; - struct proto_config *krt_attached; /* Kernel syncer attached to this table */ - uint addr_type; /* Type of address data stored in table (NET_*) */ - int gc_max_ops; /* Maximum number of operations before GC is run */ - int gc_min_time; /* Minimum time between two consecutive GC runs */ - byte sorted; /* Routes of network are sorted according to rte_better() */ - btime min_settle_time; /* Minimum settle time for notifications */ - btime max_settle_time; /* Maximum settle time for notifications */ - btime min_rr_settle_time; /* Minimum settle time for notifications when route refresh is running */ - btime max_rr_settle_time; /* Maximum settle time for notifications when route refresh is running */ - uint cork_limit; /* Amount of routes to be pending on export to cork imports */ -}; - -struct rt_subscription { - node n; - rtable *tab; - event *event; -}; - -#define NHU_CLEAN 0 -#define NHU_SCHEDULED 1 -#define NHU_RUNNING 2 -#define NHU_DIRTY 3 - -typedef struct network { - struct rte_storage *routes; /* Available routes for this network */ - struct rt_pending_export *last, *first; /* Routes with unfinished exports */ - struct fib_node n; /* FIB flags reserved for kernel syncer */ -} net; - -struct hostcache { - slab *slab; /* Slab holding all hostentries */ - struct hostentry **hash_table; /* Hash table for hostentries */ - unsigned hash_order, hash_shift; - unsigned hash_max, hash_min; - unsigned hash_items; - linpool *lp; /* Linpool for trie */ - struct f_trie *trie; /* Trie of prefixes that might affect hostentries */ - list hostentries; /* List of all hostentries */ - byte update_hostcache; -}; - -struct hostentry { - node ln; - ip_addr addr; /* IP address of host, part of key */ - ip_addr link; /* (link-local) IP address of host, used as gw - if host is directly attached */ - rtable *tab; /* Dependent table, part of key */ - struct hostentry *next; /* Next in hash chain */ - unsigned hash_key; /* Hash key */ - unsigned uc; /* Use count */ - struct rta *src; /* Source rta entry */ - byte dest; /* Chosen route destination type (RTD_...) */ - byte nexthop_linkable; /* Nexthop list is completely non-device */ - u32 igp_metric; /* Chosen route IGP metric */ -}; - -typedef struct rte { - struct rta *attrs; /* Attributes of this route */ - const net_addr *net; /* Network this RTE belongs to */ - struct rte_src *src; /* Route source that created the route */ - struct rt_import_hook *sender; /* Import hook used to send the route to the routing table */ - btime lastmod; /* Last modified (set by table) */ - u32 id; /* Table specific route id */ - byte flags; /* Table-specific flags */ - byte pflags; /* Protocol-specific flags */ - u8 generation; /* If this route import is based on other previously exported route, - this value should be 1 + MAX(generation of the parent routes). - Otherwise the route is independent and this value is zero. */ - u8 stale_cycle; /* Auxiliary value for route refresh */ -} rte; - -struct rte_storage { - struct rte_storage *next; /* Next in chain */ - struct rte rte; /* Route data */ -}; - -#define RTES_CLONE(r, l) ((r) ? (((*(l)) = (r)->rte), (l)) : NULL) -#define RTES_OR_NULL(r) ((r) ? &((r)->rte) : NULL) - -#define REF_FILTERED 2 /* Route is rejected by import filter */ -#define REF_USE_STALE 4 /* Do not reset route's stale_cycle to the actual value */ - -/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ -static inline int rte_is_valid(const rte *r) { return r && !(r->flags & REF_FILTERED); } - -/* Route just has REF_FILTERED flag */ -static inline int rte_is_filtered(const rte *r) { return !!(r->flags & REF_FILTERED); } - - -/* Table-channel connections */ - -struct rt_import_request { - struct rt_import_hook *hook; /* The table part of importer */ - char *name; - u8 trace_routes; - - event_list *list; /* Where to schedule import events */ - - void (*dump_req)(struct rt_import_request *req); - void (*log_state_change)(struct rt_import_request *req, u8 state); - /* Preimport is called when the @new route is just-to-be inserted, replacing @old. - * Return a route (may be different or modified in-place) to continue or NULL to withdraw. */ - struct rte *(*preimport)(struct rt_import_request *req, struct rte *new, struct rte *old); -}; - -struct rt_import_hook { - node n; - rtable *table; /* The connected table */ - struct rt_import_request *req; /* The requestor */ - - struct rt_import_stats { - /* Import - from protocol to core */ - u32 pref; /* Number of routes selected as best in the (adjacent) routing table */ - u32 updates_ignored; /* Number of route updates rejected as already in route table */ - u32 updates_accepted; /* Number of route updates accepted and imported */ - u32 withdraws_ignored; /* Number of route withdraws rejected as already not in route table */ - u32 withdraws_accepted; /* Number of route withdraws accepted and processed */ - } stats; - - u64 flush_seq; /* Table export seq when the channel announced flushing */ - btime last_state_change; /* Time of last state transition */ - - u8 import_state; /* IS_* */ - u8 stale_set; /* Set this stale_cycle to imported routes */ - u8 stale_valid; /* Routes with this stale_cycle and bigger are considered valid */ - u8 stale_pruned; /* Last prune finished when this value was set at stale_valid */ - u8 stale_pruning; /* Last prune started when this value was set at stale_valid */ - - struct event *export_announce_event; /* Event to run to announce new exports */ - struct event *stopped; /* Event to run when import is stopped */ -}; - -struct rt_pending_export { - struct rt_pending_export * _Atomic next; /* Next export for the same destination */ - struct rte_storage *new, *new_best, *old, *old_best; - u64 seq; /* Sequential ID (table-local) of the pending export */ -}; - -struct rt_export_request { - struct rt_export_hook *hook; /* Table part of the export */ - char *name; - u8 trace_routes; - - event_list *list; /* Where to schedule export events */ - - /* There are two methods of export. You can either request feeding every single change - * or feeding the whole route feed. In case of regular export, &export_one is preferred. - * Anyway, when feeding, &export_bulk is preferred, falling back to &export_one. - * Thus, for RA_OPTIMAL, &export_one is only set, - * for RA_MERGED and RA_ACCEPTED, &export_bulk is only set - * and for RA_ANY, both are set to accomodate for feeding all routes but receiving single changes - */ - void (*export_one)(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); - void (*export_bulk)(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe, rte **feed, uint count); - - void (*dump_req)(struct rt_export_request *req); - void (*log_state_change)(struct rt_export_request *req, u8); -}; - -struct rt_export_hook { - node n; - rtable *table; /* The connected table */ - - pool *pool; - - struct rt_export_request *req; /* The requestor */ - - struct rt_export_stats { - /* Export - from core to protocol */ - u32 updates_received; /* Number of route updates received */ - u32 withdraws_received; /* Number of route withdraws received */ - } stats; - - struct fib_iterator feed_fit; /* Routing table iterator used during feeding */ - - struct bmap seq_map; /* Keep track which exports were already procesed */ - - struct rt_pending_export * _Atomic last_export;/* Last export processed */ - struct rt_pending_export *rpe_next; /* Next pending export to process */ - - btime last_state_change; /* Time of last state transition */ - - u8 refeed_pending; /* Refeeding and another refeed is scheduled */ - _Atomic u8 export_state; /* Route export state (TES_*, see below) */ - - struct event *event; /* Event running all the export operations */ - - void (*stopped)(struct rt_export_request *); /* Stored callback when export is stopped */ -}; - -extern struct event_cork rt_cork; - -#define TIS_DOWN 0 -#define TIS_UP 1 -#define TIS_STOP 2 -#define TIS_FLUSHING 3 -#define TIS_WAITING 4 -#define TIS_CLEARED 5 -#define TIS_MAX 6 - -#define TES_DOWN 0 -#define TES_HUNGRY 1 -#define TES_FEEDING 2 -#define TES_READY 3 -#define TES_STOP 4 -#define TES_MAX 5 - -void rt_request_import(rtable *tab, struct rt_import_request *req); -void rt_request_export(rtable *tab, struct rt_export_request *req); - -void rt_stop_import(struct rt_import_request *, struct event *stopped); -void rt_stop_export(struct rt_export_request *, void (*stopped)(struct rt_export_request *)); - -const char *rt_import_state_name(u8 state); -const char *rt_export_state_name(u8 state); - -static inline u8 rt_import_get_state(struct rt_import_hook *ih) { return ih ? ih->import_state : TIS_DOWN; } -static inline u8 rt_export_get_state(struct rt_export_hook *eh) { return eh ? eh->export_state : TES_DOWN; } - -void rte_import(struct rt_import_request *req, const net_addr *net, rte *new, struct rte_src *src); - -/* Get next rpe. If src is given, it must match. */ -struct rt_pending_export *rpe_next(struct rt_pending_export *rpe, struct rte_src *src); - -/* Mark the pending export processed */ -void rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); - -/* Get pending export seen status */ -int rpe_get_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); - - -/* Types of route announcement, also used as flags */ -#define RA_UNDEF 0 /* Undefined RA type */ -#define RA_OPTIMAL 1 /* Announcement of optimal route change */ -#define RA_ACCEPTED 2 /* Announcement of first accepted route */ -#define RA_ANY 3 /* Announcement of any route change */ -#define RA_MERGED 4 /* Announcement of optimal route merged with next ones */ - -/* Return value of preexport() callback */ -#define RIC_ACCEPT 1 /* Accepted by protocol */ -#define RIC_PROCESS 0 /* Process it through import filter */ -#define RIC_REJECT -1 /* Rejected by protocol */ -#define RIC_DROP -2 /* Silently dropped by protocol */ - -#define rte_update channel_rte_import -/** - * rte_update - enter a new update to a routing table - * @c: channel doing the update - * @net: network address - * @rte: a &rte representing the new route - * @src: old route source identifier - * - * This function imports a new route to the appropriate table (via the channel). - * Table keys are @net (obligatory) and @rte->attrs->src. - * Both the @net and @rte pointers can be local. - * - * The route attributes (@rte->attrs) are obligatory. They can be also allocated - * locally. Anyway, if you use an already-cached attribute object, you shall - * call rta_clone() on that object yourself. (This semantics may change in future.) - * - * If the route attributes are local, you may set @rte->attrs->src to NULL, then - * the protocol's default route source will be supplied. - * - * When rte_update() gets a route, it automatically validates it. This includes - * checking for validity of the given network and next hop addresses and also - * checking for host-scope or link-scope routes. Then the import filters are - * processed and if accepted, the route is passed to route table recalculation. - * - * The accepted routes are then inserted into the table, replacing the old route - * for the same @net identified by @src. Then the route is announced - * to all the channels connected to the table using the standard export mechanism. - * Setting @rte to NULL makes this a withdraw, otherwise @rte->src must be the same - * as @src. - * - * All memory used for temporary allocations is taken from a special linpool - * @rte_update_pool and freed when rte_update() finishes. - */ -void rte_update(struct channel *c, const net_addr *net, struct rte *rte, struct rte_src *src); - -extern list routing_tables; -struct config; - -void rt_init(void); -void rt_preconfig(struct config *); -void rt_commit(struct config *new, struct config *old); -void rt_lock_table(rtable_private *); -void rt_unlock_table(rtable_private *); -void rt_subscribe(rtable *tab, struct rt_subscription *s); -void rt_unsubscribe(struct rt_subscription *s); -rtable *rt_setup(pool *, struct rtable_config *); - -static inline net *net_find(rtable_private *tab, const net_addr *addr) { return (net *) fib_find(&tab->fib, addr); } -static inline net *net_find_valid(rtable_private *tab, const net_addr *addr) -{ net *n = net_find(tab, addr); return (n && n->routes && rte_is_valid(&n->routes->rte)) ? n : NULL; } -static inline net *net_get(rtable_private *tab, const net_addr *addr) { return (net *) fib_get(&tab->fib, addr); } -void *net_route(rtable_private *tab, const net_addr *n); -int net_roa_check(rtable *tab, const net_addr *n, u32 asn); -int rt_examine(rtable_private *t, net_addr *a, struct channel *c, const struct filter *filter); -rte *rt_export_merged(struct channel *c, rte ** feed, uint count, linpool *pool, int silent); - -void rt_refresh_begin(struct rt_import_request *); -void rt_refresh_end(struct rt_import_request *); -void rt_schedule_prune(rtable_private *t); -void rte_dump(struct rte_storage *); -void rte_free(struct rte_storage *, rtable_private *); -struct rte_storage *rte_store(const rte *, net *net, rtable_private *); -void rt_dump(rtable *); -void rt_dump_all(void); -void rt_dump_hooks(rtable *); -void rt_dump_hooks_all(void); -void rt_prune_sync(rtable *t, int all); -struct rtable_config *rt_new_table(struct symbol *s, uint addr_type); - -/* Default limit for ECMP next hops, defined in sysdep code */ -extern const int rt_default_ecmp; - -struct rt_show_data_rtable { - node n; - rtable *table; - struct channel *export_channel; -}; - -struct rt_show_data { - net_addr *addr; - list tables; - struct rt_show_data_rtable *tab; /* Iterator over table list */ - struct rt_show_data_rtable *last_table; /* Last table in output */ - struct fib_iterator fit; /* Iterator over networks in table */ - int verbose, tables_defined_by; - const struct filter *filter; - struct proto *show_protocol; - struct proto *export_protocol; - struct channel *export_channel; - struct config *running_on_config; - struct krt_proto *kernel; - struct rt_export_hook *kernel_export_hook; - int export_mode, primary_only, filtered, stats, show_for; - - int table_open; /* Iteration (fit) is open */ - int net_counter, rt_counter, show_counter, table_counter; - int net_counter_last, rt_counter_last, show_counter_last; -}; - -void rt_show(struct rt_show_data *); -struct rt_show_data_rtable * rt_show_add_table(struct rt_show_data *d, rtable *t); - -/* Value of table definition mode in struct rt_show_data */ -#define RSD_TDB_DEFAULT 0 /* no table specified */ -#define RSD_TDB_INDIRECT 0 /* show route ... protocol P ... */ -#define RSD_TDB_ALL RSD_TDB_SET /* show route ... table all ... */ -#define RSD_TDB_DIRECT RSD_TDB_SET | RSD_TDB_NMN /* show route ... table X table Y ... */ - -#define RSD_TDB_SET 0x1 /* internal: show empty tables */ -#define RSD_TDB_NMN 0x2 /* internal: need matching net */ - -/* Value of export_mode in struct rt_show_data */ -#define RSEM_NONE 0 /* Export mode not used */ -#define RSEM_PREEXPORT 1 /* Routes ready for export, before filtering */ -#define RSEM_EXPORT 2 /* Routes accepted by export filter */ -#define RSEM_NOEXPORT 3 /* Routes rejected by export filter */ -#define RSEM_EXPORTED 4 /* Routes marked in export map */ - -/* - * Route Attributes - * - * Beware: All standard BGP attributes must be represented here instead - * of making them local to the route. This is needed to ensure proper - * construction of BGP route attribute lists. - */ - -/* Nexthop structure */ -struct nexthop { - ip_addr gw; /* Next hop */ - struct iface *iface; /* Outgoing interface */ - struct nexthop *next; - byte flags; - byte weight; - byte labels_orig; /* Number of labels before hostentry was applied */ - byte labels; /* Number of all labels */ - u32 label[0]; -}; - -#define RNF_ONLINK 0x1 /* Gateway is onlink regardless of IP ranges */ - - -struct rte_src { - struct rte_src *next; /* Hash chain */ - struct rte_owner *owner; /* Route source owner */ - u32 private_id; /* Private ID, assigned by the protocol */ - u32 global_id; /* Globally unique ID of the source */ - _Atomic u64 uc; /* Use count */ -}; - - -typedef struct rta { - struct rta * _Atomic next, * _Atomic *pprev; /* Hash chain */ - _Atomic u32 uc; /* Use count */ - u32 hash_key; /* Hash over important fields */ - struct ea_list *eattrs; /* Extended Attribute chain */ - struct hostentry *hostentry; /* Hostentry for recursive next-hops */ - ip_addr from; /* Advertising router */ - u32 igp_metric; /* IGP metric to next hop (for iBGP routes) */ - u16 cached:1; /* Are attributes cached? */ - u16 source:7; /* Route source (RTS_...) */ - u16 scope:4; /* Route scope (SCOPE_... -- see ip.h) */ - u16 dest:4; /* Route destination type (RTD_...) */ - word pref; - struct nexthop nh; /* Next hop */ -} rta; - -#define RTS_STATIC 1 /* Normal static route */ -#define RTS_INHERIT 2 /* Route inherited from kernel */ -#define RTS_DEVICE 3 /* Device route */ -#define RTS_STATIC_DEVICE 4 /* Static device route */ -#define RTS_REDIRECT 5 /* Learned via redirect */ -#define RTS_RIP 6 /* RIP route */ -#define RTS_OSPF 7 /* OSPF route */ -#define RTS_OSPF_IA 8 /* OSPF inter-area route */ -#define RTS_OSPF_EXT1 9 /* OSPF external route type 1 */ -#define RTS_OSPF_EXT2 10 /* OSPF external route type 2 */ -#define RTS_BGP 11 /* BGP route */ -#define RTS_PIPE 12 /* Inter-table wormhole */ -#define RTS_BABEL 13 /* Babel route */ -#define RTS_RPKI 14 /* Route Origin Authorization */ -#define RTS_PERF 15 /* Perf checker */ -#define RTS_MAX 16 - -#define RTD_NONE 0 /* Undefined next hop */ -#define RTD_UNICAST 1 /* Next hop is neighbor router */ -#define RTD_BLACKHOLE 2 /* Silently drop packets */ -#define RTD_UNREACHABLE 3 /* Reject as unreachable */ -#define RTD_PROHIBIT 4 /* Administratively prohibited */ -#define RTD_MAX 5 - -#define IGP_METRIC_UNKNOWN 0x80000000 /* Default igp_metric used when no other - protocol-specific metric is availabe */ - - -extern const char * rta_dest_names[RTD_MAX]; - -static inline const char *rta_dest_name(uint n) -{ return (n < RTD_MAX) ? rta_dest_names[n] : "???"; } - -/* Route has regular, reachable nexthop (i.e. not RTD_UNREACHABLE and like) */ -static inline int rte_is_reachable(rte *r) -{ return r->attrs->dest == RTD_UNICAST; } - - -/* - * Extended Route Attributes - */ - -typedef struct eattr { - word id; /* EA_CODE(PROTOCOL_..., protocol-dependent ID) */ - byte flags; /* Protocol-dependent flags */ - byte type; /* Attribute type and several flags (EAF_...) */ - union { - uintptr_t data; - const struct adata *ptr; /* Attribute data elsewhere */ - } u; -} eattr; - - -#define EA_CODE(proto,id) (((proto) << 8) | (id)) -#define EA_ID(ea) ((ea) & 0xff) -#define EA_PROTO(ea) ((ea) >> 8) -#define EA_CUSTOM(id) ((id) | EA_CUSTOM_BIT) -#define EA_IS_CUSTOM(ea) ((ea) & EA_CUSTOM_BIT) -#define EA_CUSTOM_ID(ea) ((ea) & ~EA_CUSTOM_BIT) - -const char *ea_custom_name(uint ea); - -#define EA_GEN_IGP_METRIC EA_CODE(PROTOCOL_NONE, 0) - -#define EA_CODE_MASK 0xffff -#define EA_CUSTOM_BIT 0x8000 -#define EA_ALLOW_UNDEF 0x10000 /* ea_find: allow EAF_TYPE_UNDEF */ -#define EA_BIT(n) ((n) << 24) /* Used in bitfield accessors */ -#define EA_BIT_GET(ea) ((ea) >> 24) - -#define EAF_TYPE_MASK 0x1f /* Mask with this to get type */ -#define EAF_TYPE_INT 0x01 /* 32-bit unsigned integer number */ -#define EAF_TYPE_OPAQUE 0x02 /* Opaque byte string (not filterable) */ -#define EAF_TYPE_IP_ADDRESS 0x04 /* IP address */ -#define EAF_TYPE_ROUTER_ID 0x05 /* Router ID (IPv4 address) */ -#define EAF_TYPE_AS_PATH 0x06 /* BGP AS path (encoding per RFC 1771:4.3) */ -#define EAF_TYPE_BITFIELD 0x09 /* 32-bit embedded bitfield */ -#define EAF_TYPE_INT_SET 0x0a /* Set of u32's (e.g., a community list) */ -#define EAF_TYPE_PTR 0x0d /* Pointer to an object */ -#define EAF_TYPE_EC_SET 0x0e /* Set of pairs of u32's - ext. community list */ -#define EAF_TYPE_LC_SET 0x12 /* Set of triplets of u32's - large community list */ -#define EAF_TYPE_UNDEF 0x1f /* `force undefined' entry */ -#define EAF_EMBEDDED 0x01 /* Data stored in eattr.u.data (part of type spec) */ -#define EAF_VAR_LENGTH 0x02 /* Attribute length is variable (part of type spec) */ -#define EAF_ORIGINATED 0x20 /* The attribute has originated locally */ -#define EAF_FRESH 0x40 /* An uncached attribute (e.g. modified in export filter) */ - -typedef struct adata { - uint length; /* Length of data */ - byte data[0]; -} adata; - -extern const adata null_adata; /* adata of length 0 */ - -static inline struct adata * -lp_alloc_adata(struct linpool *pool, uint len) -{ - struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len); - ad->length = len; - return ad; -} - -static inline int adata_same(const struct adata *a, const struct adata *b) -{ return (a->length == b->length && !memcmp(a->data, b->data, a->length)); } - - -typedef struct ea_list { - struct ea_list *next; /* In case we have an override list */ - byte flags; /* Flags: EALF_... */ - byte rfu; - word count; /* Number of attributes */ - eattr attrs[0]; /* Attribute definitions themselves */ -} ea_list; - -#define EALF_SORTED 1 /* Attributes are sorted by code */ -#define EALF_BISECT 2 /* Use interval bisection for searching */ -#define EALF_CACHED 4 /* Attributes belonging to cached rta */ - -struct rte_owner_class { - void (*get_route_info)(struct rte *, byte *buf); /* Get route information (for `show route' command) */ - int (*rte_better)(struct rte *, struct rte *); - int (*rte_mergable)(struct rte *, struct rte *); - u32 (*rte_igp_metric)(struct rte *); -}; - -struct rte_owner { - struct rte_owner_class *class; - int (*rte_recalculate)(rtable_private *, struct network *, struct rte *, struct rte *, struct rte *); - HASH(struct rte_src) hash; - const char *name; - u32 hash_key; - u32 uc; - event_list *list; - event *prune; - event *stop; -}; - -#define RTE_SRC_PU_SHIFT 44 -#define RTE_SRC_IN_PROGRESS (1ULL << RTE_SRC_PU_SHIFT) - -struct rte_src *rt_get_source_o(struct rte_owner *o, u32 id); -#define rt_get_source(p, id) rt_get_source_o(&(p)->sources, (id)) -static inline void rt_lock_source(struct rte_src *src) -{ - u64 uc = atomic_fetch_add_explicit(&src->uc, 1, memory_order_acq_rel); - ASSERT_DIE(uc > 0); -} - -static inline void rt_unlock_source(struct rte_src *src) -{ - u64 uc = atomic_fetch_add_explicit(&src->uc, RTE_SRC_IN_PROGRESS, memory_order_acq_rel); - u64 pending = uc >> RTE_SRC_PU_SHIFT; - uc &= RTE_SRC_IN_PROGRESS - 1; - - ASSERT_DIE(uc > pending); - if (uc == pending + 1) - ev_send(src->owner->list, src->owner->prune); - - atomic_fetch_sub_explicit(&src->uc, RTE_SRC_IN_PROGRESS + 1, memory_order_acq_rel); -} - -void rt_init_sources(struct rte_owner *, const char *name, event_list *list); -void rt_destroy_sources(struct rte_owner *, event *); - -struct ea_walk_state { - ea_list *eattrs; /* Ccurrent ea_list, initially set by caller */ - eattr *ea; /* Current eattr, initially NULL */ - u32 visited[4]; /* Bitfield, limiting max to 128 */ -}; - -eattr *ea_find(ea_list *, unsigned ea); -eattr *ea_walk(struct ea_walk_state *s, uint id, uint max); -uintptr_t ea_get_int(ea_list *, unsigned ea, uintptr_t def); -void ea_dump(ea_list *); -void ea_sort(ea_list *); /* Sort entries in all sub-lists */ -unsigned ea_scan(ea_list *); /* How many bytes do we need for merged ea_list */ -void ea_merge(ea_list *from, ea_list *to); /* Merge sub-lists to allocated buffer */ -int ea_same(ea_list *x, ea_list *y); /* Test whether two ea_lists are identical */ -uint ea_hash(ea_list *e); /* Calculate 16-bit hash value */ -ea_list *ea_append(ea_list *to, ea_list *what); -void ea_format_bitfield(const struct eattr *a, byte *buf, int bufsize, const char **names, int min, int max); - -#define ea_normalize(ea) do { \ - if (ea->next) { \ - ea_list *t = alloca(ea_scan(ea)); \ - ea_merge(ea, t); \ - ea = t; \ - } \ - ea_sort(ea); \ - if (ea->count == 0) \ - ea = NULL; \ -} while(0) \ - -static inline eattr * -ea_set_attr(ea_list **to, struct linpool *pool, uint id, uint flags, uint type, uintptr_t val) -{ - ea_list *a = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr)); - eattr *e = &a->attrs[0]; - - a->flags = EALF_SORTED; - a->count = 1; - a->next = *to; - *to = a; - - e->id = id; - e->type = type; - e->flags = flags; - - if (type & EAF_EMBEDDED) - e->u.data = (u32) val; - else - e->u.ptr = (struct adata *) val; - - return e; -} - -static inline void -ea_set_attr_u32(ea_list **to, struct linpool *pool, uint id, uint flags, uint type, u32 val) -{ ea_set_attr(to, pool, id, flags, type, (uintptr_t) val); } - -static inline void -ea_set_attr_ptr(ea_list **to, struct linpool *pool, uint id, uint flags, uint type, struct adata *val) -{ ea_set_attr(to, pool, id, flags, type, (uintptr_t) val); } - -static inline void -ea_set_attr_data(ea_list **to, struct linpool *pool, uint id, uint flags, uint type, void *data, uint len) -{ - struct adata *a = lp_alloc_adata(pool, len); - memcpy(a->data, data, len); - ea_set_attr(to, pool, id, flags, type, (uintptr_t) a); -} - - -#define NEXTHOP_MAX_SIZE (sizeof(struct nexthop) + sizeof(u32)*MPLS_MAX_LABEL_STACK) - -static inline size_t nexthop_size(const struct nexthop *nh) -{ return sizeof(struct nexthop) + sizeof(u32)*nh->labels; } -int nexthop__same(struct nexthop *x, struct nexthop *y); /* Compare multipath nexthops */ -static inline int nexthop_same(struct nexthop *x, struct nexthop *y) -{ return (x == y) || nexthop__same(x, y); } -struct nexthop *nexthop_merge(struct nexthop *x, struct nexthop *y, int rx, int ry, int max, linpool *lp); -struct nexthop *nexthop_sort(struct nexthop *x); -static inline void nexthop_link(struct rta *a, struct nexthop *from) -{ memcpy(&a->nh, from, nexthop_size(from)); } -void nexthop_insert(struct nexthop **n, struct nexthop *y); -int nexthop_is_sorted(struct nexthop *x); - -void rta_init(void); -static inline size_t rta_size(const rta *a) { return sizeof(rta) + sizeof(u32)*a->nh.labels; } -#define RTA_MAX_SIZE (sizeof(rta) + sizeof(u32)*MPLS_MAX_LABEL_STACK) -rta *rta_lookup(rta *); /* Get rta equivalent to this one, uc++ */ -static inline int rta_is_cached(rta *r) { return r->cached; } - -static inline rta *rta_clone(rta *r) { - u32 uc = atomic_fetch_add_explicit(&r->uc, 1, memory_order_acq_rel); - ASSERT_DIE(uc > 0); - return r; -} - -void rta__free(rta *r); -static inline void rta_free(rta *r) { - if (!r) - return; - - u32 uc = atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel); - if (uc == 1) - rta__free(r); -} - -rta *rta_do_cow(rta *o, linpool *lp); -static inline rta * rta_cow(rta *r, linpool *lp) { return rta_is_cached(r) ? rta_do_cow(r, lp) : r; } -static inline void rta_uncache(rta *r) { r->cached = 0; r->uc = 0; } -void rta_dump(const rta *); -void rta_dump_all(void); -void rta_show(struct cli *, const rta *); - -u32 rt_get_igp_metric(rte *); -struct hostentry * rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep); -void rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls, linpool *lp); - -static inline void -rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr gw, ip_addr ll, mpls_label_stack *mls, linpool *lp) -{ - rta_apply_hostentry(a, rt_get_hostentry(tab, gw, ll, dep), mls, lp); -} - -/* - * rta_set_recursive_next_hop() acquires hostentry from hostcache and fills - * rta->hostentry field. New hostentry has zero use count. Cached rta locks its - * hostentry (increases its use count), uncached rta does not lock it. Hostentry - * with zero use count is removed asynchronously during host cache update, - * therefore it is safe to hold such hostentry temorarily. Hostentry holds a - * lock for a 'source' rta, mainly to share multipath nexthops. - * - * There is no need to hold a lock for hostentry->dep table, because that table - * contains routes responsible for that hostentry, and therefore is non-empty if - * given hostentry has non-zero use count. If the hostentry has zero use count, - * the entry is removed before dep is referenced. - * - * The protocol responsible for routes with recursive next hops should hold a - * lock for a 'source' table governing that routes (argument tab to - * rta_set_recursive_next_hop()), because its routes reference hostentries - * (through rta) related to the governing table. When all such routes are - * removed, rtas are immediately removed achieving zero uc. Then the 'source' - * table lock could be immediately released, although hostentries may still - * exist - they will be freed together with the 'source' table. - */ - -static inline void rt_lock_hostentry(struct hostentry *he) { if (he) he->uc++; } -static inline void rt_unlock_hostentry(struct hostentry *he) { if (he) he->uc--; } - -/* - * Default protocol preferences - */ - -#define DEF_PREF_DIRECT 240 /* Directly connected */ -#define DEF_PREF_STATIC 200 /* Static route */ -#define DEF_PREF_OSPF 150 /* OSPF intra-area, inter-area and type 1 external routes */ -#define DEF_PREF_BABEL 130 /* Babel */ -#define DEF_PREF_RIP 120 /* RIP */ -#define DEF_PREF_BGP 100 /* BGP */ -#define DEF_PREF_RPKI 100 /* RPKI */ -#define DEF_PREF_INHERITED 10 /* Routes inherited from other routing daemons */ - -/* - * Route Origin Authorization - */ - -#define ROA_UNKNOWN 0 -#define ROA_VALID 1 -#define ROA_INVALID 2 - -#endif diff --git a/nest/rt-attr.c b/nest/rt-attr.c index 9a5498ed..471209ee 100644 --- a/nest/rt-attr.c +++ b/nest/rt-attr.c @@ -45,22 +45,37 @@ */ #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "nest/cli.h" -#include "nest/attrs.h" +#include "lib/attrs.h" #include "lib/alloca.h" #include "lib/hash.h" #include "lib/idm.h" #include "lib/resource.h" -#include "lib/rcu.h" #include "lib/string.h" #include <stddef.h> +#include <stdlib.h> const adata null_adata; /* adata of length 0 */ +struct ea_class ea_gen_igp_metric = { + .name = "igp_metric", + .type = T_INT, +}; + +struct ea_class ea_gen_preference = { + .name = "preference", + .type = T_INT, +}; + +struct ea_class ea_gen_from = { + .name = "from", + .type = T_IP, +}; + const char * const rta_src_names[RTS_MAX] = { [RTS_STATIC] = "static", [RTS_INHERIT] = "inherit", @@ -78,6 +93,71 @@ const char * const rta_src_names[RTS_MAX] = { [RTS_RPKI] = "RPKI", }; +static void +ea_gen_source_format(const eattr *a, byte *buf, uint size) +{ + if ((a->u.data >= RTS_MAX) || !rta_src_names[a->u.data]) + bsnprintf(buf, size, "unknown"); + else + bsnprintf(buf, size, "%s", rta_src_names[a->u.data]); +} + +struct ea_class ea_gen_source = { + .name = "source", + .type = T_ENUM_RTS, + .readonly = 1, + .format = ea_gen_source_format, +}; + +struct ea_class ea_gen_nexthop = { + .name = "nexthop", + .type = T_NEXTHOP_LIST, +}; + +/* + * ea_set_hostentry() acquires hostentry from hostcache. + * New hostentry has zero use count. Cached rta locks its + * hostentry (increases its use count), uncached rta does not lock it. + * Hostentry with zero use count is removed asynchronously + * during host cache update, therefore it is safe to hold + * such hostentry temporarily as long as you hold the table lock. + * + * There is no need to hold a lock for hostentry->dep table, because that table + * contains routes responsible for that hostentry, and therefore is non-empty if + * given hostentry has non-zero use count. If the hostentry has zero use count, + * the entry is removed before dep is referenced. + * + * The protocol responsible for routes with recursive next hops should hold a + * lock for a 'source' table governing that routes (argument tab), + * because its routes reference hostentries related to the governing table. + * When all such routes are + * removed, rtas are immediately removed achieving zero uc. Then the 'source' + * table lock could be immediately released, although hostentries may still + * exist - they will be freed together with the 'source' table. + */ + + static void +ea_gen_hostentry_stored(const eattr *ea) +{ + struct hostentry_adata *had = (struct hostentry_adata *) ea->u.ptr; + had->he->uc++; +} + +static void +ea_gen_hostentry_freed(const eattr *ea) +{ + struct hostentry_adata *had = (struct hostentry_adata *) ea->u.ptr; + had->he->uc--; +} + +struct ea_class ea_gen_hostentry = { + .name = "hostentry", + .type = T_HOSTENTRY, + .readonly = 1, + .stored = ea_gen_hostentry_stored, + .freed = ea_gen_hostentry_freed, +}; + const char * rta_dest_names[RTD_MAX] = { [RTD_NONE] = "", [RTD_UNICAST] = "unicast", @@ -86,17 +166,22 @@ const char * rta_dest_names[RTD_MAX] = { [RTD_PROHIBIT] = "prohibited", }; -DEFINE_DOMAIN(attrs); -static DOMAIN(attrs) src_domain; +struct ea_class ea_gen_flowspec_valid = { + .name = "flowspec_valid", + .type = T_ENUM_FLOWSPEC_VALID, + .readonly = 1, +}; -#define SRC_LOCK LOCK_DOMAIN(attrs, src_domain) -#define SRC_UNLOCK UNLOCK_DOMAIN(attrs, src_domain) +const char * flowspec_valid_names[FLOWSPEC__MAX] = { + [FLOWSPEC_UNKNOWN] = "unknown", + [FLOWSPEC_VALID] = "", + [FLOWSPEC_INVALID] = "invalid", +}; + +DOMAIN(attrs) attrs_domain; pool *rta_pool; -pool *src_pool; -static slab *rta_slab_[4]; -static slab *nexthop_slab_[4]; static slab *rte_src_slab; static struct idm src_ids; @@ -112,15 +197,16 @@ static struct idm src_ids; #define RSH_REHASH rte_src_rehash #define RSH_PARAMS /2, *2, 1, 1, 8, 20 #define RSH_INIT_ORDER 2 +static struct rte_src **rte_src_global; +static uint rte_src_global_max = SRC_ID_INIT_SIZE; static void rte_src_init(void) { - src_domain = DOMAIN_NEW(attrs, "Route sources"); - src_pool = rp_new(&root_pool, "Route sources"); - rte_src_slab = sl_new(src_pool, sizeof(struct rte_src)); + rte_src_slab = sl_new(rta_pool, sizeof(struct rte_src)); + rte_src_global = mb_allocz(rta_pool, sizeof(struct rte_src *) * rte_src_global_max); - idm_init(&src_ids, src_pool, SRC_ID_INIT_SIZE); + idm_init(&src_ids, rta_pool, SRC_ID_INIT_SIZE); } HASH_DEFINE_REHASH_FN(RSH, struct rte_src) @@ -145,7 +231,7 @@ rt_get_source_o(struct rte_owner *p, u32 id) return src; } - SRC_LOCK; + RTA_LOCK; src = sl_allocz(rte_src_slab); src->owner = p; src->private_id = id; @@ -154,23 +240,37 @@ rt_get_source_o(struct rte_owner *p, u32 id) atomic_store_explicit(&src->uc, 1, memory_order_release); p->uc++; - HASH_INSERT2(p->hash, RSH, src_pool, src); + HASH_INSERT2(p->hash, RSH, rta_pool, src); if (config->table_debug) log(L_TRACE "Allocated new rte_src for %s, ID %uL %uG, have %u sources now", p->name, src->private_id, src->global_id, p->uc); - SRC_UNLOCK; + if (src->global_id >= rte_src_global_max) + { + rte_src_global = mb_realloc(rte_src_global, sizeof(struct rte_src *) * (rte_src_global_max *= 2)); + memset(&rte_src_global[rte_src_global_max / 2], 0, + sizeof(struct rte_src *) * (rte_src_global_max / 2)); + } + + rte_src_global[src->global_id] = src; + RTA_UNLOCK; return src; } +struct rte_src * +rt_find_source_global(u32 id) +{ + if (id >= rte_src_global_max) + return NULL; + else + return rte_src_global[id]; +} + static inline void rt_done_sources(struct rte_owner *o) { - if (o->stop->list) - ev_send(o->stop->list, o->stop); - else - ev_send(o->list, o->stop); + ev_send(o->list, o->stop); } void @@ -182,7 +282,7 @@ rt_prune_sources(void *data) { u64 uc; while ((uc = atomic_load_explicit(&src->uc, memory_order_acquire)) >> RTE_SRC_PU_SHIFT) - ; + synchronize_rcu(); if (uc == 0) { @@ -190,21 +290,22 @@ rt_prune_sources(void *data) HASH_DO_REMOVE(o->hash, RSH, sp); - SRC_LOCK; + RTA_LOCK; + rte_src_global[src->global_id] = NULL; idm_free(&src_ids, src->global_id); - sl_free(rte_src_slab, src); - SRC_UNLOCK; + sl_free(src); + RTA_UNLOCK; } } HASH_WALK_FILTER_END; - SRC_LOCK; - HASH_MAY_RESIZE_DOWN(o->hash, RSH, src_pool); + RTA_LOCK; + HASH_MAY_RESIZE_DOWN(o->hash, RSH, rta_pool); if (o->stop && !o->uc) { rfree(o->prune); - SRC_UNLOCK; + RTA_UNLOCK; if (config->table_debug) log(L_TRACE "All rte_src's for %s pruned, scheduling stop event", o->name); @@ -212,21 +313,21 @@ rt_prune_sources(void *data) rt_done_sources(o); } else - SRC_UNLOCK; + RTA_UNLOCK; } void rt_init_sources(struct rte_owner *o, const char *name, event_list *list) { - SRC_LOCK; - HASH_INIT(o->hash, src_pool, RSH_INIT_ORDER); + RTA_LOCK; + HASH_INIT(o->hash, rta_pool, RSH_INIT_ORDER); o->hash_key = random_u32(); o->uc = 0; o->name = name; - o->prune = ev_new_init(src_pool, rt_prune_sources, o); + o->prune = ev_new_init(rta_pool, rt_prune_sources, o); o->stop = NULL; o->list = list; - SRC_UNLOCK; + RTA_UNLOCK; } void @@ -239,9 +340,9 @@ rt_destroy_sources(struct rte_owner *o, event *done) if (config->table_debug) log(L_TRACE "Source owner %s destroy requested. All rte_src's already pruned, scheduling stop event", o->name); - SRC_LOCK; + RTA_LOCK; rfree(o->prune); - SRC_UNLOCK; + RTA_UNLOCK; rt_done_sources(o); } @@ -254,50 +355,10 @@ rt_destroy_sources(struct rte_owner *o, event *done) * Multipath Next Hop */ -static inline u32 -nexthop_hash(struct nexthop *x) -{ - u32 h = 0; - for (; x; x = x->next) - { - h ^= ipa_hash(x->gw) ^ (h << 5) ^ (h >> 9); - - for (int i = 0; i < x->labels; i++) - h ^= x->label[i] ^ (h << 6) ^ (h >> 7); - } - - return h; -} - -int -nexthop__same(struct nexthop *x, struct nexthop *y) -{ - for (; x && y; x = x->next, y = y->next) - { - if (!ipa_equal(x->gw, y->gw) || (x->iface != y->iface) || - (x->flags != y->flags) || (x->weight != y->weight) || - (x->labels_orig != y->labels_orig) || (x->labels != y->labels)) - return 0; - - for (int i = 0; i < x->labels; i++) - if (x->label[i] != y->label[i]) - return 0; - } - - return x == y; -} - static int nexthop_compare_node(const struct nexthop *x, const struct nexthop *y) { int r; - - if (!x) - return 1; - - if (!y) - return -1; - /* Should we also compare flags ? */ r = ((int) y->weight) - ((int) x->weight); @@ -322,23 +383,16 @@ nexthop_compare_node(const struct nexthop *x, const struct nexthop *y) return ((int) x->iface->index) - ((int) y->iface->index); } -static inline struct nexthop * -nexthop_copy_node(const struct nexthop *src, linpool *lp) +static int +nexthop_compare_qsort(const void *x, const void *y) { - struct nexthop *n = lp_alloc(lp, nexthop_size(src)); - - memcpy(n, src, nexthop_size(src)); - n->next = NULL; - - return n; + return nexthop_compare_node( *(const struct nexthop **) x, *(const struct nexthop **) y ); } /** * nexthop_merge - merge nexthop lists * @x: list 1 * @y: list 2 - * @rx: reusability of list @x - * @ry: reusability of list @y * @max: max number of nexthops * @lp: linpool for allocating nexthops * @@ -355,138 +409,227 @@ nexthop_copy_node(const struct nexthop *src, linpool *lp) * resulting list is no longer needed. When reusability is not set, the * corresponding lists are not modified nor linked from the resulting list. */ -struct nexthop * -nexthop_merge(struct nexthop *x, struct nexthop *y, int rx, int ry, int max, linpool *lp) +struct nexthop_adata * +nexthop_merge(struct nexthop_adata *xin, struct nexthop_adata *yin, int max, linpool *lp) { - struct nexthop *root = NULL; - struct nexthop **n = &root; + uint outlen = ADATA_SIZE(xin->ad.length) + ADATA_SIZE(yin->ad.length); + struct nexthop_adata *out = lp_alloc(lp, outlen); + out->ad.length = outlen - sizeof (struct adata); + + struct nexthop *x = &xin->nh, *y = &yin->nh, *cur = &out->nh; + int xvalid, yvalid; - while ((x || y) && max--) + while (max--) { - int cmp = nexthop_compare_node(x, y); + xvalid = NEXTHOP_VALID(x, xin); + yvalid = NEXTHOP_VALID(y, yin); + + if (!xvalid && !yvalid) + break; + + ASSUME(NEXTHOP_VALID(cur, out)); + + int cmp = !xvalid ? 1 : !yvalid ? -1 : nexthop_compare_node(x, y); if (cmp < 0) { - ASSUME(x); - *n = rx ? x : nexthop_copy_node(x, lp); - x = x->next; + ASSUME(NEXTHOP_VALID(x, xin)); + memcpy(cur, x, nexthop_size(x)); + x = NEXTHOP_NEXT(x); } else if (cmp > 0) { - ASSUME(y); - *n = ry ? y : nexthop_copy_node(y, lp); - y = y->next; + ASSUME(NEXTHOP_VALID(y, yin)); + memcpy(cur, y, nexthop_size(y)); + y = NEXTHOP_NEXT(y); } else { - ASSUME(x && y); - *n = rx ? x : (ry ? y : nexthop_copy_node(x, lp)); - x = x->next; - y = y->next; + ASSUME(NEXTHOP_VALID(x, xin)); + memcpy(cur, x, nexthop_size(x)); + x = NEXTHOP_NEXT(x); + + ASSUME(NEXTHOP_VALID(y, yin)); + y = NEXTHOP_NEXT(y); } - n = &((*n)->next); + cur = NEXTHOP_NEXT(cur); } - *n = NULL; - return root; + out->ad.length = (void *) cur - (void *) out->ad.data; + + return out; } -void -nexthop_insert(struct nexthop **n, struct nexthop *x) +struct nexthop_adata * +nexthop_sort(struct nexthop_adata *nhad, linpool *lp) { - for (; *n; n = &((*n)->next)) - { - int cmp = nexthop_compare_node(*n, x); + /* Count the nexthops */ + uint cnt = 0; + NEXTHOP_WALK(nh, nhad) + cnt++; - if (cmp < 0) - continue; - else if (cmp > 0) - break; - else - return; - } + if (cnt <= 1) + return nhad; - x->next = *n; - *n = x; -} + /* Get pointers to them */ + struct nexthop **sptr = tmp_alloc(cnt * sizeof(struct nexthop *)); -struct nexthop * -nexthop_sort(struct nexthop *x) -{ - struct nexthop *s = NULL; + uint i = 0; + NEXTHOP_WALK(nh, nhad) + sptr[i++] = nh; + + /* Sort the pointers */ + qsort(sptr, cnt, sizeof(struct nexthop *), nexthop_compare_qsort); + + /* Allocate the output */ + struct nexthop_adata *out = (struct nexthop_adata *) lp_alloc_adata(lp, nhad->ad.length); + struct nexthop *dest = &out->nh; - /* Simple insert-sort */ - while (x) + /* Deduplicate nexthops while storing them */ + for (uint i = 0; i < cnt; i++) { - struct nexthop *n = x; - x = n->next; - n->next = NULL; + if (i && !nexthop_compare_node(sptr[i], sptr[i-1])) + continue; - nexthop_insert(&s, n); + memcpy(dest, sptr[i], NEXTHOP_SIZE(sptr[i])); + dest = NEXTHOP_NEXT(dest); } - return s; + out->ad.length = (void *) dest - (void *) out->ad.data; + return out; } int -nexthop_is_sorted(struct nexthop *x) +nexthop_is_sorted(struct nexthop_adata *nhad) { - for (; x && x->next; x = x->next) - if (nexthop_compare_node(x, x->next) >= 0) + struct nexthop *prev = NULL; + NEXTHOP_WALK(nh, nhad) + { + if (prev && (nexthop_compare_node(prev, nh) >= 0)) return 0; + prev = nh; + } + return 1; } -static inline slab * -nexthop_slab(struct nexthop *nh) +/* + * Extended Attributes + */ + +#define EA_CLASS_INITIAL_MAX 128 +static struct ea_class **ea_class_global = NULL; +static uint ea_class_max; +static struct idm ea_class_idm; + +/* Config parser lex register function */ +void ea_lex_register(struct ea_class *def); +void ea_lex_unregister(struct ea_class *def); + +static void +ea_class_free(struct ea_class *cl) { - return nexthop_slab_[MIN(nh->labels, 3)]; + /* No more ea class references. Unregister the attribute. */ + idm_free(&ea_class_idm, cl->id); + ea_class_global[cl->id] = NULL; + if (!cl->hidden) + ea_lex_unregister(cl); } -static struct nexthop * -nexthop_copy(struct nexthop *o) +static void +ea_class_ref_free(resource *r) { - struct nexthop *first = NULL; - struct nexthop **last = &first; - - for (; o; o = o->next) - { - struct nexthop *n = sl_allocz(nexthop_slab(o)); - n->gw = o->gw; - n->iface = o->iface; - n->next = NULL; - n->flags = o->flags; - n->weight = o->weight; - n->labels_orig = o->labels_orig; - n->labels = o->labels; - for (int i=0; i<o->labels; i++) - n->label[i] = o->label[i]; - - *last = n; - last = &(n->next); - } + struct ea_class_ref *ref = SKIP_BACK(struct ea_class_ref, r, r); + if (!--ref->class->uc) + ea_class_free(ref->class); +} - return first; +static void +ea_class_ref_dump(resource *r) +{ + struct ea_class_ref *ref = SKIP_BACK(struct ea_class_ref, r, r); + debug("name \"%s\", type=%d\n", ref->class->name, ref->class->type); } +static struct resclass ea_class_ref_class = { + .name = "Attribute class reference", + .size = sizeof(struct ea_class_ref), + .free = ea_class_ref_free, + .dump = ea_class_ref_dump, + .lookup = NULL, + .memsize = NULL, +}; + static void -nexthop_free(struct nexthop *o) +ea_class_init(void) { - struct nexthop *n; + idm_init(&ea_class_idm, rta_pool, EA_CLASS_INITIAL_MAX); + ea_class_global = mb_allocz(rta_pool, + sizeof(*ea_class_global) * (ea_class_max = EA_CLASS_INITIAL_MAX)); +} - while (o) - { - n = o->next; - sl_free(nexthop_slab(o), o); - o = n; - } +static struct ea_class_ref * +ea_ref_class(pool *p, struct ea_class *def) +{ + def->uc++; + struct ea_class_ref *ref = ralloc(p, &ea_class_ref_class); + ref->class = def; + return ref; } +static struct ea_class_ref * +ea_register(pool *p, struct ea_class *def) +{ + def->id = idm_alloc(&ea_class_idm); -/* - * Extended Attributes - */ + ASSERT_DIE(ea_class_global); + while (def->id >= ea_class_max) + ea_class_global = mb_realloc(ea_class_global, sizeof(*ea_class_global) * (ea_class_max *= 2)); + + ASSERT_DIE(def->id < ea_class_max); + ea_class_global[def->id] = def; + + if (!def->hidden) + ea_lex_register(def); + + return ea_ref_class(p, def); +} + +struct ea_class_ref * +ea_register_alloc(pool *p, struct ea_class cl) +{ + struct ea_class *clp = ea_class_find_by_name(cl.name); + if (clp && clp->type == cl.type) + return ea_ref_class(p, clp); + + uint namelen = strlen(cl.name) + 1; + + struct { + struct ea_class cl; + char name[0]; + } *cla = mb_alloc(rta_pool, sizeof(struct ea_class) + namelen); + cla->cl = cl; + memcpy(cla->name, cl.name, namelen); + cla->cl.name = cla->name; + + return ea_register(p, &cla->cl); +} + +void +ea_register_init(struct ea_class *clp) +{ + ASSERT_DIE(!ea_class_find_by_name(clp->name)); + ea_register(&root_pool, clp); +} + +struct ea_class * +ea_class_find_by_id(uint id) +{ + ASSERT_DIE(id < ea_class_max); + ASSERT_DIE(ea_class_global[id]); + return ea_class_global[id]; +} static inline eattr * ea__find(ea_list *e, unsigned id) @@ -531,12 +674,11 @@ ea__find(ea_list *e, unsigned id) * to its &eattr structure or %NULL if no such attribute exists. */ eattr * -ea_find(ea_list *e, unsigned id) +ea_find_by_id(ea_list *e, unsigned id) { eattr *a = ea__find(e, id & EA_CODE_MASK); - if (a && (a->type & EAF_TYPE_MASK) == EAF_TYPE_UNDEF && - !(id & EA_ALLOW_UNDEF)) + if (a && a->undef && !(id & EA_ALLOW_UNDEF)) return NULL; return a; } @@ -603,7 +745,7 @@ ea_walk(struct ea_walk_state *s, uint id, uint max) BIT32_SET(s->visited, n); - if ((a->type & EAF_TYPE_MASK) == EAF_TYPE_UNDEF) + if (a->undef) continue; s->eattrs = e; @@ -617,25 +759,6 @@ ea_walk(struct ea_walk_state *s, uint id, uint max) return NULL; } -/** - * ea_get_int - fetch an integer attribute - * @e: attribute list - * @id: attribute ID - * @def: default value - * - * This function is a shortcut for retrieving a value of an integer attribute - * by calling ea_find() to find the attribute, extracting its value or returning - * a provided default if no such attribute is present. - */ -uintptr_t -ea_get_int(ea_list *e, unsigned id, uintptr_t def) -{ - eattr *a = ea_find(e, id); - if (!a) - return def; - return a->u.data; -} - static inline void ea_do_sort(ea_list *e) { @@ -702,15 +825,18 @@ ea_do_prune(ea_list *e) s++; /* Now s0 is the most recent version, s[-1] the oldest one */ - /* Drop undefs */ - if ((s0->type & EAF_TYPE_MASK) == EAF_TYPE_UNDEF) + /* Drop undefs unless this is a true overlay */ + if (s0->undef && (s[-1].undef || !e->next)) continue; /* Copy the newest version to destination */ *d = *s0; /* Preserve info whether it originated locally */ - d->type = (d->type & ~(EAF_ORIGINATED|EAF_FRESH)) | (s[-1].type & EAF_ORIGINATED); + d->originated = s[-1].originated; + + /* Not fresh any more, we prefer surstroemming */ + d->fresh = 0; /* Next destination */ d++; @@ -730,21 +856,18 @@ ea_do_prune(ea_list *e) * If an attribute occurs multiple times in a single &ea_list, * ea_sort() leaves only the first (the only significant) occurrence. */ -void +static void ea_sort(ea_list *e) { - while (e) - { - if (!(e->flags & EALF_SORTED)) - { - ea_do_sort(e); - ea_do_prune(e); - e->flags |= EALF_SORTED; - } - if (e->count > 5) - e->flags |= EALF_BISECT; - e = e->next; - } + if (!(e->flags & EALF_SORTED)) + { + ea_do_sort(e); + ea_do_prune(e); + e->flags |= EALF_SORTED; + } + + if (e->count > 5) + e->flags |= EALF_BISECT; } /** @@ -754,8 +877,8 @@ ea_sort(ea_list *e) * This function calculates an upper bound of the size of * a given &ea_list after merging with ea_merge(). */ -unsigned -ea_scan(ea_list *e) +static unsigned +ea_scan(const ea_list *e, int overlay) { unsigned cnt = 0; @@ -763,6 +886,8 @@ ea_scan(ea_list *e) { cnt += e->count; e = e->next; + if (e && overlay && ea_is_cached(e)) + break; } return sizeof(ea_list) + sizeof(eattr)*cnt; } @@ -781,21 +906,36 @@ ea_scan(ea_list *e) * segments with ea_merge() and finally sort and prune the result * by calling ea_sort(). */ -void -ea_merge(ea_list *e, ea_list *t) +static void +ea_merge(ea_list *e, ea_list *t, int overlay) { eattr *d = t->attrs; t->flags = 0; t->count = 0; - t->next = NULL; + while (e) { memcpy(d, e->attrs, sizeof(eattr)*e->count); t->count += e->count; d += e->count; e = e->next; + + if (e && overlay && ea_is_cached(e)) + break; } + + t->next = e; +} + +ea_list * +ea_normalize(ea_list *e, int overlay) +{ + ea_list *t = tmp_alloc(ea_scan(e, overlay)); + ea_merge(e, t, overlay); + ea_sort(t); + + return t->count ? t : t->next; } /** @@ -813,7 +953,8 @@ ea_same(ea_list *x, ea_list *y) if (!x || !y) return x == y; - ASSERT(!x->next && !y->next); + if (x->next != y->next) + return 0; if (x->count != y->count) return 0; for(c=0; c<x->count; c++) @@ -824,39 +965,46 @@ ea_same(ea_list *x, ea_list *y) if (a->id != b->id || a->flags != b->flags || a->type != b->type || + a->originated != b->originated || + a->fresh != b->fresh || + a->undef != b->undef || ((a->type & EAF_EMBEDDED) ? a->u.data != b->u.data : !adata_same(a->u.ptr, b->u.ptr))) return 0; } return 1; } -static inline ea_list * -ea_list_copy(ea_list *o) +uint +ea_list_size(ea_list *o) { - ea_list *n; - unsigned i, adpos, elen; + unsigned i, elen; - if (!o) - return NULL; - ASSERT(!o->next); - elen = adpos = sizeof(ea_list) + sizeof(eattr) * o->count; + ASSERT_DIE(o); + elen = BIRD_CPU_ALIGN(sizeof(ea_list) + sizeof(eattr) * o->count); for(i=0; i<o->count; i++) { eattr *a = &o->attrs[i]; - if (!(a->type & EAF_EMBEDDED)) - elen += sizeof(struct adata) + a->u.ptr->length; + if (!a->undef && !(a->type & EAF_EMBEDDED)) + elen += ADATA_SIZE(a->u.ptr->length); } - n = mb_alloc(rta_pool, elen); + return elen; +} + +void +ea_list_copy(ea_list *n, ea_list *o, uint elen) +{ + uint adpos = sizeof(ea_list) + sizeof(eattr) * o->count; memcpy(n, o, adpos); - n->flags |= EALF_CACHED; - for(i=0; i<o->count; i++) + adpos = BIRD_CPU_ALIGN(adpos); + + for(uint i=0; i<o->count; i++) { eattr *a = &n->attrs[i]; - if (!(a->type & EAF_EMBEDDED)) + if (!a->undef && !(a->type & EAF_EMBEDDED)) { - unsigned size = sizeof(struct adata) + a->u.ptr->length; + unsigned size = ADATA_SIZE(a->u.ptr->length); ASSERT_DIE(adpos + size <= elen); struct adata *d = ((void *) n) + adpos; @@ -866,30 +1014,58 @@ ea_list_copy(ea_list *o) adpos += size; } } + ASSERT_DIE(adpos == elen); - return n; } -static inline void -ea_free(ea_list *o) +static void +ea_list_ref(ea_list *l) { - if (o) + for(uint i=0; i<l->count; i++) { - ASSERT(!o->next); - mb_free(o); + eattr *a = &l->attrs[i]; + ASSERT_DIE(a->id < ea_class_max); + + if (a->undef) + continue; + + struct ea_class *cl = ea_class_global[a->id]; + ASSERT_DIE(cl && cl->uc); + + CALL(cl->stored, a); + cl->uc++; } + + if (l->next) + { + ASSERT_DIE(ea_is_cached(l->next)); + ea_clone(l->next); + } } -static int -get_generic_attr(const eattr *a, byte **buf, int buflen UNUSED) +static void ea_free_nested(ea_list *l); + +static void +ea_list_unref(ea_list *l) { - if (a->id == EA_GEN_IGP_METRIC) + for(uint i=0; i<l->count; i++) { - *buf += bsprintf(*buf, "igp_metric"); - return GA_NAME; + eattr *a = &l->attrs[i]; + ASSERT_DIE(a->id < ea_class_max); + + if (a->undef) + continue; + + struct ea_class *cl = ea_class_global[a->id]; + ASSERT_DIE(cl && cl->uc); + + CALL(cl->freed, a); + if (!--cl->uc) + ea_class_free(cl); } - return GA_UNKNOWN; + if (l->next) + ea_free_nested(l->next); } void @@ -942,41 +1118,90 @@ opaque_format(const struct adata *ad, byte *buf, uint size) } static inline void -ea_show_int_set(struct cli *c, const struct adata *ad, int way, byte *pos, byte *buf, byte *end) +ea_show_int_set(struct cli *c, const char *name, const struct adata *ad, int way, byte *buf) { - int i = int_set_format(ad, way, 0, pos, end - pos); - cli_printf(c, -1012, "\t%s", buf); + int nlen = strlen(name); + int i = int_set_format(ad, way, 0, buf, CLI_MSG_SIZE - nlen - 3); + cli_printf(c, -1012, "\t%s: %s", name, buf); while (i) { - i = int_set_format(ad, way, i, buf, end - buf - 1); + i = int_set_format(ad, way, i, buf, CLI_MSG_SIZE - 1); cli_printf(c, -1012, "\t\t%s", buf); } } static inline void -ea_show_ec_set(struct cli *c, const struct adata *ad, byte *pos, byte *buf, byte *end) +ea_show_ec_set(struct cli *c, const char *name, const struct adata *ad, byte *buf) { - int i = ec_set_format(ad, 0, pos, end - pos); - cli_printf(c, -1012, "\t%s", buf); + int nlen = strlen(name); + int i = ec_set_format(ad, 0, buf, CLI_MSG_SIZE - nlen - 3); + cli_printf(c, -1012, "\t%s: %s", name, buf); while (i) { - i = ec_set_format(ad, i, buf, end - buf - 1); + i = ec_set_format(ad, i, buf, CLI_MSG_SIZE - 1); cli_printf(c, -1012, "\t\t%s", buf); } } static inline void -ea_show_lc_set(struct cli *c, const struct adata *ad, byte *pos, byte *buf, byte *end) +ea_show_lc_set(struct cli *c, const char *name, const struct adata *ad, byte *buf) { - int i = lc_set_format(ad, 0, pos, end - pos); - cli_printf(c, -1012, "\t%s", buf); + int nlen = strlen(name); + int i = lc_set_format(ad, 0, buf, CLI_MSG_SIZE - nlen - 3); + cli_printf(c, -1012, "\t%s: %s", name, buf); while (i) { - i = lc_set_format(ad, i, buf, end - buf - 1); + i = lc_set_format(ad, i, buf, CLI_MSG_SIZE - 1); cli_printf(c, -1012, "\t\t%s", buf); } } +void +ea_show_nexthop_list(struct cli *c, struct nexthop_adata *nhad) +{ + if (!NEXTHOP_IS_REACHABLE(nhad)) + return; + + NEXTHOP_WALK(nh, nhad) + { + char mpls[MPLS_MAX_LABEL_STACK*12 + 5], *lsp = mpls; + char *onlink = (nh->flags & RNF_ONLINK) ? " onlink" : ""; + char weight[16] = ""; + + if (nh->labels) + { + lsp += bsprintf(lsp, " mpls %d", nh->label[0]); + for (int i=1;i<nh->labels; i++) + lsp += bsprintf(lsp, "/%d", nh->label[i]); + } + *lsp = '\0'; + + if (!NEXTHOP_ONE(nhad)) + bsprintf(weight, " weight %d", nh->weight + 1); + + if (ipa_nonzero(nh->gw)) + if (nh->iface) + cli_printf(c, -1007, "\tvia %I on %s%s%s%s", + nh->gw, nh->iface->name, mpls, onlink, weight); + else + cli_printf(c, -1007, "\tvia %I", nh->gw); + else + cli_printf(c, -1007, "\tdev %s%s%s", + nh->iface->name, mpls, onlink, weight); + } +} + +void +ea_show_hostentry(const struct adata *ad, byte *buf, uint size) +{ + const struct hostentry_adata *had = (const struct hostentry_adata *) ad; + + if (ipa_nonzero(had->he->link) && !ipa_equal(had->he->link, had->he->addr)) + bsnprintf(buf, size, "via %I %I table %s", had->he->addr, had->he->link, had->he->tab->name); + else + bsnprintf(buf, size, "via %I table %s", had->he->addr, had->he->tab->name); +} + /** * ea_show - print an &eattr to CLI * @c: destination CLI @@ -988,79 +1213,80 @@ ea_show_lc_set(struct cli *c, const struct adata *ad, byte *pos, byte *buf, byte * If the protocol defining the attribute provides its own * get_attr() hook, it's consulted first. */ -void +static void ea_show(struct cli *c, const eattr *e) { - struct protocol *p; - int status = GA_UNKNOWN; const struct adata *ad = (e->type & EAF_EMBEDDED) ? NULL : e->u.ptr; byte buf[CLI_MSG_SIZE]; byte *pos = buf, *end = buf + sizeof(buf); - if (EA_IS_CUSTOM(e->id)) - { - const char *name = ea_custom_name(e->id); - if (name) - { - pos += bsprintf(pos, "%s", name); - status = GA_NAME; - } - else - pos += bsprintf(pos, "%02x.", EA_PROTO(e->id)); - } - else if (p = class_to_protocol[EA_PROTO(e->id)]) - { - pos += bsprintf(pos, "%s.", p->name); - if (p->get_attr) - status = p->get_attr(e, pos, end - pos); - pos += strlen(pos); - } - else if (EA_PROTO(e->id)) - pos += bsprintf(pos, "%02x.", EA_PROTO(e->id)); + ASSERT_DIE(e->id < ea_class_max); + + struct ea_class *cls = ea_class_global[e->id]; + ASSERT_DIE(cls); + + if (e->undef || cls->hidden) + return; + else if (cls->format) + cls->format(e, buf, end - buf); else - status = get_generic_attr(e, &pos, end - pos); + switch (e->type) + { + case T_INT: + if ((cls == &ea_gen_igp_metric) && e->u.data >= IGP_METRIC_UNKNOWN) + return; - if (status < GA_NAME) - pos += bsprintf(pos, "%02x", EA_ID(e->id)); - if (status < GA_FULL) - { - *pos++ = ':'; - *pos++ = ' '; - switch (e->type & EAF_TYPE_MASK) - { - case EAF_TYPE_INT: bsprintf(pos, "%u", e->u.data); break; - case EAF_TYPE_OPAQUE: + case T_OPAQUE: opaque_format(ad, pos, end - pos); break; - case EAF_TYPE_IP_ADDRESS: + case T_IP: bsprintf(pos, "%I", *(ip_addr *) ad->data); break; - case EAF_TYPE_ROUTER_ID: + case T_QUAD: bsprintf(pos, "%R", e->u.data); break; - case EAF_TYPE_AS_PATH: + case T_PATH: as_path_format(ad, pos, end - pos); break; - case EAF_TYPE_BITFIELD: - bsprintf(pos, "%08x", e->u.data); - break; - case EAF_TYPE_INT_SET: - ea_show_int_set(c, ad, 1, pos, buf, end); + case T_CLIST: + ea_show_int_set(c, cls->name, ad, 1, buf); return; - case EAF_TYPE_EC_SET: - ea_show_ec_set(c, ad, pos, buf, end); + case T_ECLIST: + ea_show_ec_set(c, cls->name, ad, buf); return; - case EAF_TYPE_LC_SET: - ea_show_lc_set(c, ad, pos, buf, end); + case T_LCLIST: + ea_show_lc_set(c, cls->name, ad, buf); return; - case EAF_TYPE_UNDEF: + case T_NEXTHOP_LIST: + ea_show_nexthop_list(c, (struct nexthop_adata *) e->u.ptr); + return; + case T_HOSTENTRY: + ea_show_hostentry(ad, pos, end - pos); + break; default: bsprintf(pos, "<type %02x>", e->type); - } + } + + cli_printf(c, -1012, "\t%s: %s", cls->name, buf); +} + +static void +nexthop_dump(const struct adata *ad) +{ + struct nexthop_adata *nhad = (struct nexthop_adata *) ad; + + debug(":"); + + NEXTHOP_WALK(nh, nhad) + { + if (ipa_nonzero(nh->gw)) debug(" ->%I", nh->gw); + if (nh->labels) debug(" L %d", nh->label[0]); + for (int i=1; i<nh->labels; i++) + debug("/%d", nh->label[i]); + debug(" [%s]", nh->iface ? nh->iface->name : "???"); } - cli_printf(c, -1012, "\t%s", buf); } /** @@ -1082,19 +1308,26 @@ ea_dump(ea_list *e) } while (e) { - debug("[%c%c%c]", + struct ea_storage *s = ea_is_cached(e) ? ea_get_storage(e) : NULL; + debug("[%c%c%c] uc=%d h=%08x", (e->flags & EALF_SORTED) ? 'S' : 's', (e->flags & EALF_BISECT) ? 'B' : 'b', - (e->flags & EALF_CACHED) ? 'C' : 'c'); + (e->flags & EALF_CACHED) ? 'C' : 'c', + s ? s->uc : 0, s ? s->hash_key : 0); for(i=0; i<e->count; i++) { eattr *a = &e->attrs[i]; - debug(" %02x:%02x.%02x", EA_PROTO(a->id), EA_ID(a->id), a->flags); - debug("=%c", "?iO?I?P???S?????" [a->type & EAF_TYPE_MASK]); - if (a->type & EAF_ORIGINATED) + debug(" %04x.%02x", a->id, a->flags); + debug("=%c", + "?iO?IRP???S??pE?" + "??L???N?????????" + "?o???r??????????" [a->type]); + if (a->originated) debug("o"); if (a->type & EAF_EMBEDDED) debug(":%08x", a->u.data); + else if (a->id == ea_gen_nexthop.id) + nexthop_dump(a->u.ptr); else { int j, len = a->u.ptr->length; @@ -1124,10 +1357,13 @@ ea_hash(ea_list *e) if (e) /* Assuming chain of length 1 */ { + h ^= mem_hash(&e->next, sizeof(e->next)); for(i=0; i<e->count; i++) { struct eattr *a = &e->attrs[i]; h ^= a->id; h *= mul; + if (a->undef) + continue; if (a->type & EAF_EMBEDDED) h ^= a->u.data; else @@ -1167,163 +1403,52 @@ ea_append(ea_list *to, ea_list *what) * rta's */ -static DOMAIN(attrs) attrs_domain; - -#define RTA_LOCK LOCK_DOMAIN(attrs, attrs_domain) -#define RTA_UNLOCK UNLOCK_DOMAIN(attrs, attrs_domain) - -struct rta_cache { - u32 count; - u32 size; - u32 limit; - u32 mask; - rta * _Atomic table[0]; -} * _Atomic rta_cache; -// rta_aux, rta_cache = { .size = ATOMIC_VAR_INIT(32), }; +static uint rta_cache_count; +static uint rta_cache_size = 32; +static uint rta_cache_limit; +static uint rta_cache_mask; +static struct ea_storage **rta_hash_table; -static struct rta_cache * -rta_alloc_hash(u32 size) -{ - struct rta_cache *c = mb_allocz(rta_pool, sizeof(struct rta_cache) + sizeof(rta * _Atomic) * size); - c->size = size; - c->limit = (size >> 20) ? (~0U) : (size * 2); - c->mask = size - 1; - return c; -} - -static inline uint -rta_hash(rta *a) -{ - u64 h; - mem_hash_init(&h); -#define MIX(f) mem_hash_mix(&h, &(a->f), sizeof(a->f)); -#define BMIX(f) mem_hash_mix_num(&h, a->f); - MIX(hostentry); - MIX(from); - MIX(igp_metric); - BMIX(source); - BMIX(scope); - BMIX(dest); - MIX(pref); -#undef MIX - - return mem_hash_value(&h) ^ nexthop_hash(&(a->nh)) ^ ea_hash(a->eattrs); -} - -static inline int -rta_same(rta *x, rta *y) -{ - return (x->source == y->source && - x->scope == y->scope && - x->dest == y->dest && - x->igp_metric == y->igp_metric && - ipa_equal(x->from, y->from) && - x->hostentry == y->hostentry && - nexthop_same(&(x->nh), &(y->nh)) && - ea_same(x->eattrs, y->eattrs)); -} - -static inline slab * -rta_slab(rta *a) -{ - return rta_slab_[a->nh.labels > 2 ? 3 : a->nh.labels]; -} - -static rta * -rta_copy(rta *o) +static void +rta_alloc_hash(void) { - rta *r = sl_alloc(rta_slab(o)); - - memcpy(r, o, rta_size(o)); - r->uc = 1; - r->nh.next = nexthop_copy(o->nh.next); - r->eattrs = ea_list_copy(o->eattrs); - return r; + rta_hash_table = mb_allocz(rta_pool, sizeof(struct ea_storage *) * rta_cache_size); + if (rta_cache_size < 32768) + rta_cache_limit = rta_cache_size * 2; + else + rta_cache_limit = ~0; + rta_cache_mask = rta_cache_size - 1; } static inline void -rta_insert(rta *r, struct rta_cache *c) +rta_insert(struct ea_storage *r) { - uint h = r->hash_key & c->mask; - rta *next = atomic_load_explicit(&c->table[h], memory_order_relaxed); - - atomic_store_explicit(&r->next, next, memory_order_relaxed); - r->pprev = &c->table[h]; - - if (next) - next->pprev = &r->next; - - /* This store MUST be the last and MUST have release order for thread-safety */ - atomic_store_explicit(&c->table[h], r, memory_order_release); + uint h = r->hash_key & rta_cache_mask; + r->next_hash = rta_hash_table[h]; + if (r->next_hash) + r->next_hash->pprev_hash = &r->next_hash; + r->pprev_hash = &rta_hash_table[h]; + rta_hash_table[h] = r; } static void -rta_rehash(struct rta_cache *c) +rta_rehash(void) { - u32 os = c->size; - - struct rta_cache *nc = rta_alloc_hash(os * 2); - nc->count = c->count; - - /* First we simply copy every chain to both new locations */ - for (u32 h = 0; h < os; h++) - { - rta *r = atomic_load_explicit(&c->table[h], memory_order_relaxed); - atomic_store_explicit(&nc->table[h], r, memory_order_relaxed); - atomic_store_explicit(&nc->table[h + os], r, memory_order_relaxed); - } - - /* Then we exchange the hashes; release semantics forces the previous code to be already done */ - atomic_store_explicit(&rta_cache, nc, memory_order_release); - - /* And now we pass through both chains and filter them */ - for (u32 h = 0; h < c->size; h++) - { - rta * _Atomic * ap = &nc->table[h]; - rta * _Atomic * bp = &nc->table[h + os]; - - rta *r = atomic_load_explicit(ap, memory_order_relaxed); - ASSERT_DIE(r == atomic_load_explicit(bp, memory_order_relaxed)); - - while (r) - { - if (r->hash_key & os) - { - r->pprev = bp; - atomic_store_explicit(bp, r, memory_order_release); - bp = &r->next; - } - else + uint ohs = rta_cache_size; + uint h; + struct ea_storage *r, *n; + struct ea_storage **oht = rta_hash_table; + + rta_cache_size = 2*rta_cache_size; + DBG("Rehashing rta cache from %d to %d entries.\n", ohs, rta_cache_size); + rta_alloc_hash(); + for(h=0; h<ohs; h++) + for(r=oht[h]; r; r=n) { - r->pprev = ap; - atomic_store_explicit(ap, r, memory_order_release); - ap = &r->next; + n = r->next_hash; + rta_insert(r); } - - r = atomic_load_explicit(&r->next, memory_order_acquire); - } - - atomic_store_explicit(ap, NULL, memory_order_release); - atomic_store_explicit(bp, NULL, memory_order_release); - } - - synchronize_rcu(); - mb_free(c); -} - -static rta * -rta_find(rta *o, u32 h, struct rta_cache *c) -{ - rta *r = NULL; - - for (r = atomic_load_explicit(&c->table[h & c->mask], memory_order_acquire); r; r = atomic_load_explicit(&r->next, memory_order_acquire)) - if (r->hash_key == h && rta_same(r, o)) - { - atomic_fetch_add_explicit(&r->uc, 1, memory_order_acq_rel); - return r; - } - - return NULL; + mb_free(oht); } /** @@ -1339,147 +1464,75 @@ rta_find(rta *o, u32 h, struct rta_cache *c) * The extended attribute lists attached to the &rta are automatically * converted to the normalized form. */ -rta * -rta_lookup(rta *o) +ea_list * +ea_lookup(ea_list *o, int overlay) { - rta *r; + struct ea_storage *r; uint h; - ASSERT(!o->cached); - if (o->eattrs) - ea_normalize(o->eattrs); - - h = rta_hash(o); - - /* Lockless lookup */ - rcu_read_lock(); - r = rta_find(o, h, atomic_load_explicit(&rta_cache, memory_order_acquire)); - rcu_read_unlock(); - - if (r) - return r; + ASSERT(!ea_is_cached(o)); + o = ea_normalize(o, overlay); + h = ea_hash(o); RTA_LOCK; - /* Locked lookup to avoid duplicates if possible */ - struct rta_cache *c = atomic_load_explicit(&rta_cache, memory_order_acquire); - r = rta_find(o, h, c); - if (r) - { - RTA_UNLOCK; - return r; - } + for(r=rta_hash_table[h & rta_cache_mask]; r; r=r->next_hash) + if (r->hash_key == h && ea_same(r->l, o)) + { + atomic_fetch_add_explicit(&r->uc, 1, memory_order_acq_rel); + RTA_UNLOCK; + return r->l; + } + + uint elen = ea_list_size(o); + r = mb_alloc(rta_pool, elen + sizeof(struct ea_storage)); + ea_list_copy(r->l, o, elen); + ea_list_ref(r->l); - /* Store the rta */ - r = rta_copy(o); + r->l->flags |= EALF_CACHED; r->hash_key = h; - r->cached = 1; - rt_lock_hostentry(r->hostentry); - rta_insert(r, c); + r->uc = 1; - if (++c->count > c->limit) - rta_rehash(c); + rta_insert(r); + + if (++rta_cache_count > rta_cache_limit) + rta_rehash(); RTA_UNLOCK; - return r; + return r->l; } -void -rta__free(rta *a) +static void +ea_free_locked(struct ea_storage *a) { - ASSERT(a->cached); - - RTA_LOCK; - struct rta_cache *c = atomic_load_explicit(&rta_cache, memory_order_acquire); - + /* Somebody has cloned this rta inbetween. This sometimes happens. */ if (atomic_load_explicit(&a->uc, memory_order_acquire)) - { - /* Acquired inbetween */ - RTA_UNLOCK; return; - } - - /* Relink the forward pointer */ - rta *next = atomic_load_explicit(&a->next, memory_order_acquire); - atomic_store_explicit(a->pprev, next, memory_order_release); - - /* Relink the backwards pointer */ - if (next) - next->pprev = a->pprev; - /* Wait until nobody knows about us */ - synchronize_rcu(); + ASSERT(rta_cache_count); + rta_cache_count--; + *a->pprev_hash = a->next_hash; + if (a->next_hash) + a->next_hash->pprev_hash = a->pprev_hash; - if (atomic_load_explicit(&a->uc, memory_order_acquire)) - { - /* Acquired inbetween, relink back */ - rta_insert(a, c); - RTA_UNLOCK; - return; - } - - /* Cleared to free the memory */ - rt_unlock_hostentry(a->hostentry); - if (a->nh.next) - nexthop_free(a->nh.next); - ea_free(a->eattrs); - a->cached = 0; - c->count--; - sl_free(rta_slab(a), a); - - RTA_UNLOCK; + ea_list_unref(a->l); + mb_free(a); } -rta * -rta_do_cow(rta *o, linpool *lp) +static void +ea_free_nested(struct ea_list *l) { - rta *r = lp_alloc(lp, rta_size(o)); - memcpy(r, o, rta_size(o)); - for (struct nexthop **nhn = &(r->nh.next), *nho = o->nh.next; nho; nho = nho->next) - { - *nhn = lp_alloc(lp, nexthop_size(nho)); - memcpy(*nhn, nho, nexthop_size(nho)); - nhn = &((*nhn)->next); - } - rta_uncache(r); - return r; + struct ea_storage *r = ea_get_storage(l); + if (1 == atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel)) + ea_free_locked(r); } -/** - * rta_dump - dump route attributes - * @a: attribute structure to dump - * - * This function takes a &rta and dumps its contents to the debug output. - */ void -rta_dump(const rta *a) +ea__free(struct ea_storage *a) { - static char *rts[] = { "", "RTS_STATIC", "RTS_INHERIT", "RTS_DEVICE", - "RTS_STAT_DEV", "RTS_REDIR", "RTS_RIP", - "RTS_OSPF", "RTS_OSPF_IA", "RTS_OSPF_EXT1", - "RTS_OSPF_EXT2", "RTS_BGP", "RTS_PIPE", "RTS_BABEL" }; - static char *rtd[] = { "", " DEV", " HOLE", " UNREACH", " PROHIBIT" }; - - debug("pref=%d uc=%d %s %s%s h=%04x", - a->pref, a->uc, rts[a->source], ip_scope_text(a->scope), - rtd[a->dest], a->hash_key); - if (!a->cached) - debug(" !CACHED"); - debug(" <-%I", a->from); - if (a->dest == RTD_UNICAST) - for (const struct nexthop *nh = &(a->nh); nh; nh = nh->next) - { - if (ipa_nonzero(nh->gw)) debug(" ->%I", nh->gw); - if (nh->labels) debug(" L %d", nh->label[0]); - for (int i=1; i<nh->labels; i++) - debug("/%d", nh->label[i]); - debug(" [%s]", nh->iface ? nh->iface->name : "???"); - } - if (a->eattrs) - { - debug(" EA: "); - ea_dump(a->eattrs); - } + RTA_LOCK; + ea_free_locked(a); + RTA_UNLOCK; } /** @@ -1489,23 +1542,16 @@ rta_dump(const rta *a) * to the debug output. */ void -rta_dump_all(void) +ea_dump_all(void) { - rta *a; - uint h; - RTA_LOCK; - struct rta_cache *c = atomic_load_explicit(&rta_cache, memory_order_acquire); - - debug("Route attribute cache (%d entries, rehash at %d):\n", c->count, c->limit); - for(h=0; h<c->size; h++) - for(a = atomic_load_explicit(&c->table[h], memory_order_acquire); - a; - a = atomic_load_explicit(&a->next, memory_order_acquire)) + debug("Route attribute cache (%d entries, rehash at %d):\n", rta_cache_count, rta_cache_limit); + for (uint h=0; h < rta_cache_size; h++) + for (struct ea_storage *a = rta_hash_table[h]; a; a = a->next_hash) { debug("%p ", a); - rta_dump(a); + ea_dump(a->l); debug("\n"); } debug("\n"); @@ -1514,13 +1560,11 @@ rta_dump_all(void) } void -rta_show(struct cli *c, const rta *a) +ea_show_list(struct cli *c, ea_list *eal) { - cli_printf(c, -1008, "\tType: %s %s", rta_src_names[a->source], ip_scope_text(a->scope)); - - for(ea_list *eal = a->eattrs; eal; eal=eal->next) - for(int i=0; i<eal->count; i++) - ea_show(c, &eal->attrs[i]); + ea_list *n = ea_normalize(eal, 0); + for (int i =0; i < n->count; i++) + ea_show(c, &n->attrs[i]); } /** @@ -1536,18 +1580,20 @@ rta_init(void) rta_pool = rp_new(&root_pool, "Attributes"); - rta_slab_[0] = sl_new(rta_pool, sizeof(rta)); - rta_slab_[1] = sl_new(rta_pool, sizeof(rta) + sizeof(u32)); - rta_slab_[2] = sl_new(rta_pool, sizeof(rta) + sizeof(u32)*2); - rta_slab_[3] = sl_new(rta_pool, sizeof(rta) + sizeof(u32)*MPLS_MAX_LABEL_STACK); - - nexthop_slab_[0] = sl_new(rta_pool, sizeof(struct nexthop)); - nexthop_slab_[1] = sl_new(rta_pool, sizeof(struct nexthop) + sizeof(u32)); - nexthop_slab_[2] = sl_new(rta_pool, sizeof(struct nexthop) + sizeof(u32)*2); - nexthop_slab_[3] = sl_new(rta_pool, sizeof(struct nexthop) + sizeof(u32)*MPLS_MAX_LABEL_STACK); - - atomic_store_explicit(&rta_cache, rta_alloc_hash(32), memory_order_relaxed); + rta_alloc_hash(); rte_src_init(); + ea_class_init(); + + /* These attributes are required to be first for nice "show route" output */ + ea_register_init(&ea_gen_nexthop); + ea_register_init(&ea_gen_hostentry); + + /* Other generic route attributes */ + ea_register_init(&ea_gen_preference); + ea_register_init(&ea_gen_igp_metric); + ea_register_init(&ea_gen_from); + ea_register_init(&ea_gen_source); + ea_register_init(&ea_gen_flowspec_valid); } /* diff --git a/nest/rt-dev.c b/nest/rt-dev.c index c1251675..4199e17c 100644 --- a/nest/rt-dev.c +++ b/nest/rt-dev.c @@ -18,7 +18,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/rt-dev.h" #include "conf/conf.h" #include "lib/resource.h" @@ -80,16 +80,18 @@ dev_ifa_notify(struct proto *P, uint flags, struct ifa *ad) /* Use iface ID as local source ID */ struct rte_src *src = rt_get_source(P, ad->iface->index); - rta a0 = { - .pref = c->preference, - .source = RTS_DEVICE, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, - .nh.iface = ad->iface, + ea_list *ea = NULL; + struct nexthop_adata nhad = { + .nh = { .iface = ad->iface, }, + .ad = { .length = (void *) NEXTHOP_NEXT(&nhad.nh) - (void *) nhad.ad.data, }, }; + ea_set_attr_u32(&ea, &ea_gen_preference, 0, c->preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_DEVICE); + ea_set_attr_data(&ea, &ea_gen_nexthop, 0, nhad.ad.data, nhad.ad.length); + rte e0 = { - .attrs = rta_lookup(&a0), + .attrs = ea, .src = src, }; @@ -186,7 +188,6 @@ dev_copy_config(struct proto_config *dest, struct proto_config *src) struct protocol proto_device = { .name = "Direct", .template = "direct%d", - .class = PROTOCOL_DIRECT, .preference = DEF_PREF_DIRECT, .channel_mask = NB_IP | NB_IP6_SADR, .proto_size = sizeof(struct rt_dev_proto), @@ -196,3 +197,9 @@ struct protocol proto_device = { .reconfigure = dev_reconfigure, .copy_config = dev_copy_config }; + +void +dev_build(void) +{ + proto_build(&proto_device); +} diff --git a/nest/rt-fib.c b/nest/rt-fib.c index a7f70371..801561da 100644 --- a/nest/rt-fib.c +++ b/nest/rt-fib.c @@ -55,7 +55,7 @@ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "lib/string.h" /* @@ -331,7 +331,7 @@ fib_get(struct fib *f, const net_addr *a) memset(b, 0, f->node_offset); if (f->init) - f->init(b); + f->init(f, b); if (f->entries++ > f->entries_max) fib_rehash(f, HASH_HI_STEP); @@ -475,7 +475,7 @@ fib_delete(struct fib *f, void *E) } if (f->fib_slab) - sl_free(f->fib_slab, E); + sl_free(E); else mb_free(E); diff --git a/nest/rt-show.c b/nest/rt-show.c index 65b59af4..dc88047a 100644 --- a/nest/rt-show.c +++ b/nest/rt-show.c @@ -10,123 +10,92 @@ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/cli.h" #include "nest/iface.h" #include "filter/filter.h" +#include "filter/data.h" #include "sysdep/unix/krt.h" static void -rt_show_table(struct cli *c, struct rt_show_data *d) +rt_show_table(struct rt_show_data *d) { + struct cli *c = d->cli; + /* No table blocks in 'show route count' */ if (d->stats == 2) return; if (d->last_table) cli_printf(c, -1007, ""); - cli_printf(c, -1007, "Table %s:", d->tab->table->name); + cli_printf(c, -1007, "Table %s:", + d->tab->name); d->last_table = d->tab; } -static inline struct krt_proto * -rt_show_get_kernel(struct rt_show_data *d) -{ - struct proto_config *krt = d->tab->table->config->krt_attached; - return krt ? (struct krt_proto *) krt->proto : NULL; -} - static void rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, int primary) { byte from[IPA_MAX_TEXT_LENGTH+8]; byte tm[TM_DATETIME_BUFFER_SIZE], info[256]; - rta *a = e->attrs; - int sync_error = d->kernel ? krt_get_sync_error(d->kernel, e) : 0; + ea_list *a = e->attrs; + int sync_error = d->tab->kernel ? krt_get_sync_error(d->tab->kernel, e) : 0; void (*get_route_info)(struct rte *, byte *buf); - struct nexthop *nh; + eattr *nhea = net_type_match(e->net, NB_DEST) ? + ea_find(a, &ea_gen_nexthop) : NULL; + struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhad ? (NEXTHOP_IS_REACHABLE(nhad) ? RTD_UNICAST : nhad->dest) : RTD_NONE; + int flowspec_valid = net_is_flow(e->net) ? rt_get_flowspec_valid(e) : FLOWSPEC_UNKNOWN; tm_format_time(tm, &config->tf_route, e->lastmod); - if (ipa_nonzero(a->from) && !ipa_equal(a->from, a->nh.gw)) - bsprintf(from, " from %I", a->from); + ip_addr a_from = ea_get_ip(a, &ea_gen_from, IPA_NONE); + if (ipa_nonzero(a_from) && (!nhad || !ipa_equal(a_from, nhad->nh.gw))) + bsprintf(from, " from %I", a_from); else from[0] = 0; /* Need to normalize the extended attributes */ - if (d->verbose && !rta_is_cached(a) && a->eattrs) - ea_normalize(a->eattrs); + if (d->verbose && !rta_is_cached(a) && a) + a = ea_normalize(a, 0); get_route_info = e->src->owner->class ? e->src->owner->class->get_route_info : NULL; if (get_route_info) get_route_info(e, info); else - bsprintf(info, " (%d)", a->pref); + bsprintf(info, " (%d)", rt_get_preference(e)); if (d->last_table != d->tab) - rt_show_table(c, d); - - cli_printf(c, -1007, "%-20s %s [%s %s%s]%s%s", ia, rta_dest_name(a->dest), - e->src->owner->name, tm, from, primary ? (sync_error ? " !" : " *") : "", info); - - if (a->dest == RTD_UNICAST) - for (nh = &(a->nh); nh; nh = nh->next) - { - char mpls[MPLS_MAX_LABEL_STACK*12 + 5], *lsp = mpls; - char *onlink = (nh->flags & RNF_ONLINK) ? " onlink" : ""; - char weight[16] = ""; - - if (nh->labels) - { - lsp += bsprintf(lsp, " mpls %d", nh->label[0]); - for (int i=1;i<nh->labels; i++) - lsp += bsprintf(lsp, "/%d", nh->label[i]); - } - *lsp = '\0'; + rt_show_table(d); - if (a->nh.next) - bsprintf(weight, " weight %d", nh->weight + 1); + eattr *heea; + struct hostentry_adata *had = NULL; + if (!net_is_flow(e->net) && (dest == RTD_NONE) && (heea = ea_find(a, &ea_gen_hostentry))) + had = (struct hostentry_adata *) heea->u.ptr; - if (ipa_nonzero(nh->gw)) - cli_printf(c, -1007, "\tvia %I on %s%s%s%s", - nh->gw, nh->iface->name, mpls, onlink, weight); - else - cli_printf(c, -1007, "\tdev %s%s%s", - nh->iface->name, mpls, onlink, weight); - } + cli_printf(c, -1007, "%-20s %s [%s %s%s]%s%s", ia, + net_is_flow(e->net) ? flowspec_valid_name(flowspec_valid) : had ? "recursive" : rta_dest_name(dest), + e->src->owner->name, tm, from, primary ? (sync_error ? " !" : " *") : "", info); if (d->verbose) { - cli_printf(c, -1008, "\tInternal route ID: %uL %uG %uS", e->src->private_id, e->src->global_id, e->stale_cycle); - rta_show(c, a); + ea_show_list(c, a); + cli_printf(c, -1008, "\tInternal route handling values: %uL %uG %uS id %u", + e->src->private_id, e->src->global_id, e->stale_cycle, e->id); + } + else if (dest == RTD_UNICAST) + ea_show_nexthop_list(c, nhad); + else if (had) + { + char hetext[256]; + ea_show_hostentry(&had->ad, hetext, sizeof hetext); + cli_printf(c, -1007, "\t%s", hetext); } -} - -static uint -rte_feed_count(net *n) -{ - uint count = 0; - for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) - count++; - return count; -} - -static void -rte_feed_obtain(net *n, rte **feed, uint count) -{ - uint i = 0; - for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) - { - ASSERT_DIE(i < count); - feed[i++] = &e->rte; - } - ASSERT_DIE(i == count); } static void -rt_show_net(struct cli *c, net *n, struct rt_show_data *d) +rt_show_net(struct rt_show_data *d, const net_addr *n, rte **feed, uint count) { + struct cli *c = d->cli; byte ia[NET_MAX_TEXT_LENGTH+1]; struct channel *ec = d->tab->export_channel; @@ -135,13 +104,12 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) ASSUME(!d->export_mode || ec); int first = 1; + int first_show = 1; int pass = 0; - bsnprintf(ia, sizeof(ia), "%N", n->n.addr); - - for (struct rte_storage *er = n->routes; er; er = er->next) + for (uint i = 0; i < count; i++) { - if (rte_is_filtered(&er->rte) != d->filtered) + if (!d->tab->prefilter && (rte_is_filtered(feed[i]) != d->filtered)) continue; d->rt_counter++; @@ -151,7 +119,12 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) if (pass) continue; - struct rte e = er->rte; + struct rte e = *feed[i]; + if (d->tab->prefilter) + if (e.sender != d->tab->prefilter->in_req.hook) + continue; + else while (e.attrs->next) + e.attrs = e.attrs->next; /* Export channel is down, do not try to export routes to it */ if (ec && !ec->out_req.hook) @@ -169,13 +142,7 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) { /* Special case for merged export */ pass = 1; - uint count = rte_feed_count(n); - if (!count) - goto skip; - - rte **feed = alloca(count * sizeof(rte *)); - rte_feed_obtain(n, feed, count); - rte *em = rt_export_merged(ec, feed, count, c->show_pool, 1); + rte *em = rt_export_merged(ec, feed, count, tmp_linpool, 1); if (em) e = *em; @@ -201,7 +168,7 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) * command may change the export filter and do not update routes. */ int do_export = (ic > 0) || - (f_run(ec->out_filter, &e, c->show_pool, FF_SILENT) <= F_ACCEPT); + (f_run(ec->out_filter, &e, FF_SILENT) <= F_ACCEPT); if (do_export != (d->export_mode == RSEM_EXPORT)) goto skip; @@ -214,130 +181,192 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) if (d->show_protocol && (&d->show_protocol->sources != e.src->owner)) goto skip; - if (f_run(d->filter, &e, c->show_pool, 0) > F_ACCEPT) + if (f_run(d->filter, &e, 0) > F_ACCEPT) goto skip; if (d->stats < 2) - rt_show_rte(c, ia, &e, d, (n->routes == er)); + { + if (first_show) + net_format(n, ia, sizeof(ia)); + else + ia[0] = 0; + + rt_show_rte(c, ia, &e, d, !d->tab->prefilter && !i); + first_show = 0; + } d->show_counter++; - ia[0] = 0; skip: - lp_flush(c->show_pool); - if (d->primary_only) break; } + + if ((d->show_counter - d->show_counter_last_flush) > 64) + { + d->show_counter_last_flush = d->show_counter; + cli_write_trigger(d->cli); + } +} + +static void +rt_show_net_export_bulk(struct rt_export_request *req, const net_addr *n, + struct rt_pending_export *rpe UNUSED, rte **feed, uint count) +{ + struct rt_show_data *d = SKIP_BACK(struct rt_show_data, req, req); + return rt_show_net(d, n, feed, count); } static void +rt_show_export_stopped_cleanup(struct rt_export_request *req) +{ + struct rt_show_data *d = SKIP_BACK(struct rt_show_data, req, req); + + /* The hook is now invalid */ + req->hook = NULL; + + /* And free the CLI (deferred) */ + rfree(d->cli->pool); +} + +static int rt_show_cleanup(struct cli *c) { struct rt_show_data *d = c->rover; - struct rt_show_data_rtable *tab; - /* Unlink the iterator */ - if (d->table_open) - RT_LOCKED(d->tab->table, t) - fit_get(&t->fib, &d->fit); + /* Cancel the feed */ + if (d->req.hook) + { + rt_stop_export(&d->req, rt_show_export_stopped_cleanup); + return 1; + } + else + return 0; +} + +static void rt_show_export_stopped(struct rt_export_request *req); - /* Unlock referenced tables */ - WALK_LIST(tab, d->tables) - RT_LOCKED(tab->table, t) - rt_unlock_table(t); +static void +rt_show_log_state_change(struct rt_export_request *req, u8 state) +{ + if (state == TES_READY) + rt_stop_export(req, rt_show_export_stopped); } static void -rt_show_cont(struct cli *c) +rt_show_dump_req(struct rt_export_request *req) { - struct rt_show_data *d = c->rover; -#ifdef DEBUGGING - unsigned max = 4; -#else - unsigned max = 64; -#endif + debug(" CLI Show Route Feed %p\n", req); +} + +static void +rt_show_done(struct rt_show_data *d) +{ + /* No more action */ + d->cli->cleanup = NULL; + d->cli->cont = NULL; + d->cli->rover = NULL; + + /* Write pending messages */ + cli_write_trigger(d->cli); +} + +static void +rt_show_cont(struct rt_show_data *d) +{ + struct cli *c = d->cli; if (d->running_on_config && (d->running_on_config != config)) { cli_printf(c, 8004, "Stopped due to reconfiguration"); - goto done; + return rt_show_done(d); } - rtable_private *t = RT_LOCK(d->tab->table); + d->req = (struct rt_export_request) { + .addr = d->addr, + .name = "CLI Show Route", + .list = &global_work_list, + .export_bulk = rt_show_net_export_bulk, + .dump_req = rt_show_dump_req, + .log_state_change = rt_show_log_state_change, + .addr_mode = d->addr_mode, + }; - struct fib *fib = &t->fib; - struct fib_iterator *it = &d->fit; + d->table_counter++; - if (!d->table_open) - { - FIB_ITERATE_INIT(&d->fit, fib); - d->table_open = 1; - d->table_counter++; - d->kernel = rt_show_get_kernel(d); + d->show_counter_last = d->show_counter; + d->rt_counter_last = d->rt_counter; + d->net_counter_last = d->net_counter; - d->show_counter_last = d->show_counter; - d->rt_counter_last = d->rt_counter; - d->net_counter_last = d->net_counter; + if (d->tables_defined_by & RSD_TDB_SET) + rt_show_table(d); - if (d->tables_defined_by & RSD_TDB_SET) - rt_show_table(c, d); - } + rt_request_export_other(d->tab->table, &d->req); +} - FIB_ITERATE_START(fib, it, net, n) - { - if (!max--) - { - FIB_ITERATE_PUT(it); - RT_UNLOCK(d->tab->table); - return; - } - rt_show_net(c, n, d); - } - FIB_ITERATE_END; +static void +rt_show_export_stopped(struct rt_export_request *req) +{ + struct rt_show_data *d = SKIP_BACK(struct rt_show_data, req, req); + + /* The hook is now invalid */ + req->hook = NULL; if (d->stats) { if (d->last_table != d->tab) - rt_show_table(c, d); + rt_show_table(d); - cli_printf(c, -1007, "%d of %d routes for %d networks in table %s", + cli_printf(d->cli, -1007, "%d of %d routes for %d networks in table %s", d->show_counter - d->show_counter_last, d->rt_counter - d->rt_counter_last, - d->net_counter - d->net_counter_last, d->tab->table->name); + d->net_counter - d->net_counter_last, d->tab->name); } - RT_UNLOCK(d->tab->table); - - d->kernel = NULL; - d->table_open = 0; d->tab = NODE_NEXT(d->tab); if (NODE_VALID(d->tab)) - return; + return rt_show_cont(d); + /* Printout total stats */ if (d->stats && (d->table_counter > 1)) { - if (d->last_table) cli_printf(c, -1007, ""); - cli_printf(c, 14, "Total: %d of %d routes for %d networks in %d tables", + if (d->last_table) cli_printf(d->cli, -1007, ""); + cli_printf(d->cli, 14, "Total: %d of %d routes for %d networks in %d tables", d->show_counter, d->rt_counter, d->net_counter, d->table_counter); } + else if (!d->rt_counter && ((d->addr_mode == TE_ADDR_EQUAL) || (d->addr_mode == TE_ADDR_FOR))) + cli_printf(d->cli, 8001, "Network not found"); else - cli_printf(c, 0, ""); + cli_printf(d->cli, 0, ""); -done: - rt_show_cleanup(c); - c->cont = c->cleanup = NULL; + /* No more route showing */ + rt_show_done(d); } struct rt_show_data_rtable * -rt_show_add_table(struct rt_show_data *d, rtable *t) +rt_show_add_exporter(struct rt_show_data *d, struct rt_exporter *t, const char *name) { struct rt_show_data_rtable *tab = cfg_allocz(sizeof(struct rt_show_data_rtable)); tab->table = t; + tab->name = name; add_tail(&(d->tables), &(tab->n)); return tab; } +struct rt_show_data_rtable * +rt_show_add_table(struct rt_show_data *d, rtable *t) +{ + struct rt_show_data_rtable *rsdr; + RT_LOCKED(t, tp) + rsdr = rt_show_add_exporter(d, &tp->exporter.e, t->name); + + struct proto_config *krt = t->config->krt_attached; + if (krt) + rsdr->kernel = (struct krt_proto *) krt->proto; + + return rsdr; +} + static inline void rt_show_get_default_tables(struct rt_show_data *d) { @@ -373,8 +402,8 @@ rt_show_get_default_tables(struct rt_show_data *d) } for (int i=1; i<NET_MAX; i++) - if (config->def_tables[i] && config->def_tables[i]->table) - rt_show_add_table(d, config->def_tables[i]->table); + if (config->def_tables[i] && config->def_tables[i]->table && config->def_tables[i]->table->table) + rt_show_add_table(d, config->def_tables[i]->table->table); } static inline void @@ -391,17 +420,18 @@ rt_show_prepare_tables(struct rt_show_data *d) /* Ensure there is defined export_channel for each table */ if (d->export_mode) { + rtable *rt = SKIP_BACK(rtable, priv.exporter.e, tab->table); if (!tab->export_channel && d->export_channel && - (tab->table == d->export_channel->table)) + (rt == d->export_channel->table)) tab->export_channel = d->export_channel; if (!tab->export_channel && d->export_protocol) - tab->export_channel = proto_find_channel_by_table(d->export_protocol, tab->table); + tab->export_channel = proto_find_channel_by_table(d->export_protocol, rt); if (!tab->export_channel) { if (d->tables_defined_by & RSD_TDB_NMN) - cf_error("No export channel for table %s", tab->table->name); + cf_error("No export channel for table %s", tab->name); rem_node(&(tab->n)); continue; @@ -412,7 +442,7 @@ rt_show_prepare_tables(struct rt_show_data *d) if (d->addr && (tab->table->addr_type != d->addr->type)) { if (d->tables_defined_by & RSD_TDB_NMN) - cf_error("Incompatible type of prefix/ip for table %s", tab->table->name); + cf_error("Incompatible type of prefix/ip for table %s", tab->name); rem_node(&(tab->n)); continue; @@ -424,53 +454,29 @@ rt_show_prepare_tables(struct rt_show_data *d) cf_error("No valid tables"); } +static void +rt_show_dummy_cont(struct cli *c UNUSED) +{ + /* Explicitly do nothing to prevent CLI from trying to parse another command. */ +} + void rt_show(struct rt_show_data *d) { - struct rt_show_data_rtable *tab; - net *n; - /* Filtered routes are neither exported nor have sensible ordering */ if (d->filtered && (d->export_mode || d->primary_only)) cf_error("Incompatible show route options"); rt_show_prepare_tables(d); - if (!d->addr) - { - WALK_LIST(tab, d->tables) - RT_LOCKED(tab->table, t) - rt_lock_table(t); - - /* There is at least one table */ - d->tab = HEAD(d->tables); - this_cli->cont = rt_show_cont; - this_cli->cleanup = rt_show_cleanup; - this_cli->rover = d; - } - else - { - WALK_LIST(tab, d->tables) - { - d->tab = tab; - d->kernel = rt_show_get_kernel(d); - - RT_LOCK(tab->table); + if (EMPTY_LIST(d->tables)) + cf_error("No suitable tables found"); - if (d->show_for) - n = net_route(RT_PRIV(tab->table), d->addr); - else - n = net_find(RT_PRIV(tab->table), d->addr); + d->tab = HEAD(d->tables); - if (n) - rt_show_net(this_cli, n, d); + this_cli->cleanup = rt_show_cleanup; + this_cli->rover = d; + this_cli->cont = rt_show_dummy_cont; - RT_UNLOCK(tab->table); - } - - if (d->rt_counter) - cli_msg(0, ""); - else - cli_msg(8001, "Network not found"); - } + rt_show_cont(d); } diff --git a/nest/rt-table.c b/nest/rt-table.c index d09abbef..dad5d0cc 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -26,12 +26,70 @@ * (see the route attribute module for a precise explanation) holding the * remaining route attributes which are expected to be shared by multiple * routes in order to conserve memory. + * + * There are several mechanisms that allow automatic update of routes in one + * routing table (dst) as a result of changes in another routing table (src). + * They handle issues of recursive next hop resolving, flowspec validation and + * RPKI validation. + * + * The first such mechanism is handling of recursive next hops. A route in the + * dst table has an indirect next hop address, which is resolved through a route + * in the src table (which may also be the same table) to get an immediate next + * hop. This is implemented using structure &hostcache attached to the src + * table, which contains &hostentry structures for each tracked next hop + * address. These structures are linked from recursive routes in dst tables, + * possibly multiple routes sharing one hostentry (as many routes may have the + * same indirect next hop). There is also a trie in the hostcache, which matches + * all prefixes that may influence resolving of tracked next hops. + * + * When a best route changes in the src table, the hostcache is notified using + * an auxiliary export request, which checks using the trie whether the + * change is relevant and if it is, then it schedules asynchronous hostcache + * recomputation. The recomputation is done by rt_update_hostcache() (called + * as an event of src table), it walks through all hostentries and resolves + * them (by rt_update_hostentry()). It also updates the trie. If a change in + * hostentry resolution was found, then it schedules asynchronous nexthop + * recomputation of associated dst table. That is done by rt_next_hop_update() + * (called from rt_event() of dst table), it iterates over all routes in the dst + * table and re-examines their hostentries for changes. Note that in contrast to + * hostcache update, next hop update can be interrupted by main loop. These two + * full-table walks (over hostcache and dst table) are necessary due to absence + * of direct lookups (route -> affected nexthop, nexthop -> its route). + * + * The second mechanism is for flowspec validation, where validity of flowspec + * routes depends of resolving their network prefixes in IP routing tables. This + * is similar to the recursive next hop mechanism, but simpler as there are no + * intermediate hostcache and hostentries (because flows are less likely to + * share common net prefix than routes sharing a common next hop). Every dst + * table has its own export request in every src table. Each dst table has its + * own trie of prefixes that may influence validation of flowspec routes in it + * (flowspec_trie). + * + * When a best route changes in the src table, the notification mechanism is + * invoked by the export request which checks its dst table's trie to see + * whether the change is relevant, and if so, an asynchronous re-validation of + * flowspec routes in the dst table is scheduled. That is also done by function + * rt_next_hop_update(), like nexthop recomputation above. It iterates over all + * flowspec routes and re-validates them. It also recalculates the trie. + * + * Note that in contrast to the hostcache update, here the trie is recalculated + * during the rt_next_hop_update(), which may be interleaved with IP route + * updates. The trie is flushed at the beginning of recalculation, which means + * that such updates may use partial trie to see if they are relevant. But it + * works anyway! Either affected flowspec was already re-validated and added to + * the trie, then IP route change would match the trie and trigger a next round + * of re-validation, or it was not yet re-validated and added to the trie, but + * will be re-validated later in this round anyway. + * + * The third mechanism is used for RPKI re-validation of IP routes and it is the + * simplest. It is also an auxiliary export request belonging to the + * appropriate channel, triggering its reload/refeed timer after a settle time. */ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "lib/resource.h" @@ -44,12 +102,21 @@ #include "lib/hash.h" #include "lib/string.h" #include "lib/alloca.h" +#include "lib/flowspec.h" +#include "lib/idm.h" + +#ifdef CONFIG_BGP +#include "proto/bgp/bgp.h" +#endif #include <stdatomic.h> pool *rt_table_pool; list routing_tables; +list deleted_routing_tables; + +struct rt_cork rt_cork; /* Data structures for export journal */ #define RT_PENDING_EXPORT_ITEMS (page_size - sizeof(struct rt_export_block)) / sizeof(struct rt_pending_export) @@ -61,16 +128,28 @@ struct rt_export_block { struct rt_pending_export export[]; }; -static void rt_free_hostcache(rtable_private *tab); -static void rt_notify_hostcache(rtable_private *tab, net *net); +static void rt_free_hostcache(struct rtable_private *tab); static void rt_update_hostcache(void *tab); -static void rt_next_hop_update(void *tab); -static inline void rt_prune_table(void *tab); -static inline void rt_schedule_notify(rtable_private *tab); -static void rt_feed_channel(void *); - -static inline void rt_export_used(rtable_private *tab); -static void rt_export_cleanup(void *tab); +static void rt_next_hop_update(struct rtable_private *tab); +static void rt_nhu_uncork(void *_tab); +static inline void rt_next_hop_resolve_rte(rte *r); +static inline void rt_flowspec_resolve_rte(rte *r, struct channel *c); +static inline void rt_prune_table(struct rtable_private *tab); +static void rt_kick_prune_timer(struct rtable_private *tab); +static void rt_feed_by_fib(void *); +static void rt_feed_by_trie(void *); +static void rt_feed_equal(void *); +static void rt_feed_for(void *); +static void rt_check_cork_low(struct rtable_private *tab); +static void rt_check_cork_high(struct rtable_private *tab); +static void rt_cork_release_hook(void *); +static void rt_shutdown(void *); +static void rt_delete(void *); + +static void rt_export_used(struct rt_table_exporter *, const char *, const char *); +static void rt_export_cleanup(struct rtable_private *tab); + +static int rte_same(rte *x, rte *y); const char *rt_import_state_name_array[TIS_MAX] = { [TIS_DOWN] = "DOWN", @@ -105,54 +184,193 @@ const char *rt_export_state_name(u8 state) return rt_export_state_name_array[state]; } -struct event_cork rt_cork; +static struct hostentry *rt_get_hostentry(struct rtable_private *tab, ip_addr a, ip_addr ll, rtable *dep); -static inline void -rte_update_lock(struct channel *c) +static inline rtable *rt_priv_to_pub(struct rtable_private *tab) { return RT_PUB(tab); } +static inline rtable *rt_pub_to_pub(rtable *tab) { return tab; } +#define RT_ANY_TO_PUB(tab) _Generic((tab),rtable*:rt_pub_to_pub,struct rtable_private*:rt_priv_to_pub)((tab)) + +#define rt_trace(tab, level, fmt, args...) do {\ + rtable *t = RT_ANY_TO_PUB((tab)); \ + if (t->config->debug & (level)) \ + log(L_TRACE "%s: " fmt, t->name, ##args); \ +} while (0) + +static void +net_init_with_trie(struct fib *f, void *N) { - c->rte_update_nest_cnt++; + struct rtable_private *tab = SKIP_BACK(struct rtable_private, fib, f); + net *n = N; + + if (tab->trie) + trie_add_prefix(tab->trie, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen); + + if (tab->trie_new) + trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen); } -static inline void -rte_update_unlock(struct channel *c) +static inline net * +net_route_ip4_trie(struct rtable_private *t, const net_addr_ip4 *n0) +{ + TRIE_WALK_TO_ROOT_IP4(t->trie, n0, n) + { + net *r; + if (r = net_find_valid(t, (net_addr *) &n)) + return r; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; +} + +static inline net * +net_route_vpn4_trie(struct rtable_private *t, const net_addr_vpn4 *n0) +{ + TRIE_WALK_TO_ROOT_IP4(t->trie, (const net_addr_ip4 *) n0, px) + { + net_addr_vpn4 n = NET_ADDR_VPN4(px.prefix, px.pxlen, n0->rd); + + net *r; + if (r = net_find_valid(t, (net_addr *) &n)) + return r; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; +} + +static inline net * +net_route_ip6_trie(struct rtable_private *t, const net_addr_ip6 *n0) +{ + TRIE_WALK_TO_ROOT_IP6(t->trie, n0, n) + { + net *r; + if (r = net_find_valid(t, (net_addr *) &n)) + return r; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; +} + +static inline net * +net_route_vpn6_trie(struct rtable_private *t, const net_addr_vpn6 *n0) { - if (!--c->rte_update_nest_cnt) - lp_flush(c->rte_update_pool); + TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px) + { + net_addr_vpn6 n = NET_ADDR_VPN6(px.prefix, px.pxlen, n0->rd); + + net *r; + if (r = net_find_valid(t, (net_addr *) &n)) + return r; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; } -/* Like fib_route(), but skips empty net entries */ static inline void * -net_route_ip4(rtable_private *t, net_addr_ip4 *n) +net_route_ip6_sadr_trie(struct rtable_private *t, const net_addr_ip6_sadr *n0) +{ + TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px) + { + net_addr_ip6_sadr n = NET_ADDR_IP6_SADR(px.prefix, px.pxlen, n0->src_prefix, n0->src_pxlen); + net *best = NULL; + int best_pxlen = 0; + + /* We need to do dst first matching. Since sadr addresses are hashed on dst + prefix only, find the hash table chain and go through it to find the + match with the longest matching src prefix. */ + for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next) + { + net_addr_ip6_sadr *a = (void *) fn->addr; + + if (net_equal_dst_ip6_sadr(&n, a) && + net_in_net_src_ip6_sadr(&n, a) && + (a->src_pxlen >= best_pxlen)) + { + best = fib_node_to_user(&t->fib, fn); + best_pxlen = a->src_pxlen; + } + } + + if (best) + return best; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; +} + +static inline net * +net_route_ip4_fib(struct rtable_private *t, const net_addr_ip4 *n0) { + net_addr_ip4 n; + net_copy_ip4(&n, n0); + net *r; + while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0)) + { + n.pxlen--; + ip4_clrbit(&n.prefix, n.pxlen); + } + + return r; +} - while (r = net_find_valid(t, (net_addr *) n), (!r) && (n->pxlen > 0)) +static inline net * +net_route_vpn4_fib(struct rtable_private *t, const net_addr_vpn4 *n0) +{ + net_addr_vpn4 n; + net_copy_vpn4(&n, n0); + + net *r; + while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0)) { - n->pxlen--; - ip4_clrbit(&n->prefix, n->pxlen); + n.pxlen--; + ip4_clrbit(&n.prefix, n.pxlen); } return r; } -static inline void * -net_route_ip6(rtable_private *t, net_addr_ip6 *n) +static inline net * +net_route_ip6_fib(struct rtable_private *t, const net_addr_ip6 *n0) { + net_addr_ip6 n; + net_copy_ip6(&n, n0); + net *r; + while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0)) + { + n.pxlen--; + ip6_clrbit(&n.prefix, n.pxlen); + } + + return r; +} + +static inline net * +net_route_vpn6_fib(struct rtable_private *t, const net_addr_vpn6 *n0) +{ + net_addr_vpn6 n; + net_copy_vpn6(&n, n0); - while (r = net_find_valid(t, (net_addr *) n), (!r) && (n->pxlen > 0)) + net *r; + while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0)) { - n->pxlen--; - ip6_clrbit(&n->prefix, n->pxlen); + n.pxlen--; + ip6_clrbit(&n.prefix, n.pxlen); } return r; } static inline void * -net_route_ip6_sadr(rtable_private *t, net_addr_ip6_sadr *n) +net_route_ip6_sadr_fib(struct rtable_private *t, const net_addr_ip6_sadr *n0) { - struct fib_node *fn; + net_addr_ip6_sadr n; + net_copy_ip6_sadr(&n, n0); while (1) { @@ -161,13 +379,13 @@ net_route_ip6_sadr(rtable_private *t, net_addr_ip6_sadr *n) /* We need to do dst first matching. Since sadr addresses are hashed on dst prefix only, find the hash table chain and go through it to find the - match with the smallest matching src prefix. */ - for (fn = fib_get_chain(&t->fib, (net_addr *) n); fn; fn = fn->next) + match with the longest matching src prefix. */ + for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next) { net_addr_ip6_sadr *a = (void *) fn->addr; - if (net_equal_dst_ip6_sadr(n, a) && - net_in_net_src_ip6_sadr(n, a) && + if (net_equal_dst_ip6_sadr(&n, a) && + net_in_net_src_ip6_sadr(&n, a) && (a->src_pxlen >= best_pxlen)) { best = fib_node_to_user(&t->fib, fn); @@ -178,38 +396,52 @@ net_route_ip6_sadr(rtable_private *t, net_addr_ip6_sadr *n) if (best) return best; - if (!n->dst_pxlen) + if (!n.dst_pxlen) break; - n->dst_pxlen--; - ip6_clrbit(&n->dst_prefix, n->dst_pxlen); + n.dst_pxlen--; + ip6_clrbit(&n.dst_prefix, n.dst_pxlen); } return NULL; } -void * -net_route(rtable_private *tab, const net_addr *n) +net * +net_route(struct rtable_private *tab, const net_addr *n) { ASSERT(tab->addr_type == n->type); - net_addr *n0 = alloca(n->length); - net_copy(n0, n); - switch (n->type) { case NET_IP4: + if (tab->trie) + return net_route_ip4_trie(tab, (net_addr_ip4 *) n); + else + return net_route_ip4_fib (tab, (net_addr_ip4 *) n); + case NET_VPN4: - case NET_ROA4: - return net_route_ip4(tab, (net_addr_ip4 *) n0); + if (tab->trie) + return net_route_vpn4_trie(tab, (net_addr_vpn4 *) n); + else + return net_route_vpn4_fib (tab, (net_addr_vpn4 *) n); case NET_IP6: + if (tab->trie) + return net_route_ip6_trie(tab, (net_addr_ip6 *) n); + else + return net_route_ip6_fib (tab, (net_addr_ip6 *) n); + case NET_VPN6: - case NET_ROA6: - return net_route_ip6(tab, (net_addr_ip6 *) n0); + if (tab->trie) + return net_route_vpn6_trie(tab, (net_addr_vpn6 *) n); + else + return net_route_vpn6_fib (tab, (net_addr_vpn6 *) n); case NET_IP6_SADR: - return net_route_ip6_sadr(tab, (net_addr_ip6_sadr *) n0); + if (tab->trie) + return net_route_ip6_sadr_trie(tab, (net_addr_ip6_sadr *) n); + else + return net_route_ip6_sadr_fib (tab, (net_addr_ip6_sadr *) n); default: return NULL; @@ -218,15 +450,40 @@ net_route(rtable_private *tab, const net_addr *n) static int -net_roa_check_ip4(rtable *t, const net_addr_ip4 *px, u32 asn) +net_roa_check_ip4_trie(struct rtable_private *tab, const net_addr_ip4 *px, u32 asn) +{ + int anything = 0; + + TRIE_WALK_TO_ROOT_IP4(tab->trie, px, px0) + { + net_addr_roa4 roa0 = NET_ADDR_ROA4(px0.prefix, px0.pxlen, 0, 0); + + struct fib_node *fn; + for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next) + { + net_addr_roa4 *roa = (void *) fn->addr; + net *r = fib_node_to_user(&tab->fib, fn); + + if (net_equal_prefix_roa4(roa, &roa0) && r->routes && rte_is_valid(&r->routes->rte)) + { + anything = 1; + if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) + return ROA_VALID; + } + } + } + TRIE_WALK_TO_ROOT_END; + + return anything ? ROA_INVALID : ROA_UNKNOWN; +} + +static int +net_roa_check_ip4_fib(struct rtable_private *tab, const net_addr_ip4 *px, u32 asn) { struct net_addr_roa4 n = NET_ADDR_ROA4(px->prefix, px->pxlen, 0, 0); struct fib_node *fn; int anything = 0; - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); - while (1) { for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next) @@ -238,10 +495,7 @@ net_roa_check_ip4(rtable *t, const net_addr_ip4 *px, u32 asn) { anything = 1; if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) - { - RT_UNLOCK(tab); return ROA_VALID; - } } } @@ -252,20 +506,44 @@ net_roa_check_ip4(rtable *t, const net_addr_ip4 *px, u32 asn) ip4_clrbit(&n.prefix, n.pxlen); } - RT_UNLOCK(tab); return anything ? ROA_INVALID : ROA_UNKNOWN; } static int -net_roa_check_ip6(rtable *t, const net_addr_ip6 *px, u32 asn) +net_roa_check_ip6_trie(struct rtable_private *tab, const net_addr_ip6 *px, u32 asn) +{ + int anything = 0; + + TRIE_WALK_TO_ROOT_IP6(tab->trie, px, px0) + { + net_addr_roa6 roa0 = NET_ADDR_ROA6(px0.prefix, px0.pxlen, 0, 0); + + struct fib_node *fn; + for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next) + { + net_addr_roa6 *roa = (void *) fn->addr; + net *r = fib_node_to_user(&tab->fib, fn); + + if (net_equal_prefix_roa6(roa, &roa0) && r->routes && rte_is_valid(&r->routes->rte)) + { + anything = 1; + if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) + return ROA_VALID; + } + } + } + TRIE_WALK_TO_ROOT_END; + + return anything ? ROA_INVALID : ROA_UNKNOWN; +} + +static int +net_roa_check_ip6_fib(struct rtable_private *tab, const net_addr_ip6 *px, u32 asn) { struct net_addr_roa6 n = NET_ADDR_ROA6(px->prefix, px->pxlen, 0, 0); struct fib_node *fn; int anything = 0; - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); - while (1) { for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next) @@ -277,10 +555,7 @@ net_roa_check_ip6(rtable *t, const net_addr_ip6 *px, u32 asn) { anything = 1; if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) - { - RT_UNLOCK(tab); return ROA_VALID; - } } } @@ -291,7 +566,6 @@ net_roa_check_ip6(rtable *t, const net_addr_ip6 *px, u32 asn) ip6_clrbit(&n.prefix, n.pxlen); } - RT_UNLOCK(tab); return anything ? ROA_INVALID : ROA_UNKNOWN; } @@ -311,14 +585,30 @@ net_roa_check_ip6(rtable *t, const net_addr_ip6 *px, u32 asn) * must have type NET_IP4 or NET_IP6, respectively. */ int -net_roa_check(rtable *tab, const net_addr *n, u32 asn) +net_roa_check(rtable *tp, const net_addr *n, u32 asn) { - if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4)) - return net_roa_check_ip4(tab, (const net_addr_ip4 *) n, asn); - else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6)) - return net_roa_check_ip6(tab, (const net_addr_ip6 *) n, asn); - else - return ROA_UNKNOWN; /* Should not happen */ + int out = ROA_UNKNOWN; + + RT_LOCKED(tp, tab) + { + if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4)) + { + if (tab->trie) + out = net_roa_check_ip4_trie(tab, (const net_addr_ip4 *) n, asn); + else + out = net_roa_check_ip4_fib (tab, (const net_addr_ip4 *) n, asn); + } + else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6)) + { + if (tab->trie) + out = net_roa_check_ip6_trie(tab, (const net_addr_ip6 *) n, asn); + else + out = net_roa_check_ip6_fib (tab, (const net_addr_ip6 *) n, asn); + } + else + out = ROA_UNKNOWN; /* Should not happen */ + } + return out; } /** @@ -342,7 +632,7 @@ rte_find(net *net, struct rte_src *src) struct rte_storage * -rte_store(const rte *r, net *net, rtable_private *tab) +rte_store(const rte *r, net *net, struct rtable_private *tab) { struct rte_storage *e = sl_alloc(tab->rte_slab); @@ -351,10 +641,10 @@ rte_store(const rte *r, net *net, rtable_private *tab) rt_lock_source(e->rte.src); - if (e->rte.attrs->cached) + if (ea_is_cached(e->rte.attrs)) e->rte.attrs = rta_clone(e->rte.attrs); else - e->rte.attrs = rta_lookup(e->rte.attrs); + e->rte.attrs = rta_lookup(e->rte.attrs, 1); return e; } @@ -368,11 +658,11 @@ rte_store(const rte *r, net *net, rtable_private *tab) */ void -rte_free(struct rte_storage *e, rtable_private *tab) +rte_free(struct rte_storage *e) { rt_unlock_source(e->rte.src); rta_free(e->rte.attrs); - sl_free(tab->rte_slab, e); + sl_free(e); } static int /* Actually better or at least as good as */ @@ -385,9 +675,12 @@ rte_better(rte *new, rte *old) if (!rte_is_valid(new)) return 0; - if (new->attrs->pref > old->attrs->pref) + u32 np = rt_get_preference(new); + u32 op = rt_get_preference(old); + + if (np > op) return 1; - if (new->attrs->pref < old->attrs->pref) + if (np < op) return 0; if (new->src->owner->class != old->src->owner->class) { @@ -411,7 +704,7 @@ rte_mergable(rte *pri, rte *sec) if (!rte_is_valid(pri) || !rte_is_valid(sec)) return 0; - if (pri->attrs->pref != sec->attrs->pref) + if (rt_get_preference(pri) != rt_get_preference(sec)) return 0; if (pri->src->owner->class != sec->src->owner->class) @@ -426,11 +719,10 @@ rte_mergable(rte *pri, rte *sec) static void rte_trace(const char *name, const rte *e, int dir, const char *msg) { - log(L_TRACE "%s %c %s %N src %uL %uG %uS id %u %s%s", + log(L_TRACE "%s %c %s %N src %uL %uG %uS id %u %s", name, dir, msg, e->net, e->src->private_id, e->src->global_id, e->stale_cycle, e->id, - rta_dest_name(e->attrs->dest), - rte_is_filtered(e) ? " (filtered)" : ""); + rta_dest_name(rte_dest(e))); } static inline void @@ -469,8 +761,8 @@ rte_feed_count(net *n) { uint count = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) - count++; + count++; + return count; } @@ -479,16 +771,16 @@ rte_feed_obtain(net *n, struct rte **feed, uint count) { uint i = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) { ASSERT_DIE(i < count); feed[i++] = &e->rte; } + ASSERT_DIE(i == count); } static rte * -export_filter_(struct channel *c, rte *rt, linpool *pool, int silent) +export_filter(struct channel *c, rte *rt, int silent) { struct proto *p = c->proto; const struct filter *filter = c->out_filter; @@ -518,7 +810,7 @@ export_filter_(struct channel *c, rte *rt, linpool *pool, int silent) } v = filter && ((filter == FILTER_REJECT) || - (f_run(filter, rt, pool, + (f_run(filter, rt, (silent ? FF_SILENT : 0)) > F_ACCEPT)); if (v) { @@ -544,17 +836,10 @@ reject_noset: return NULL; } -static inline rte * -export_filter(struct channel *c, rte *rt, int silent) -{ - return export_filter_(c, rt, c->rte_update_pool, silent); -} - -void do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte *old); - static void do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) { + struct proto *p = c->proto; struct channel_export_stats *stats = &c->export_stats; if (c->refeeding && new) @@ -571,30 +856,16 @@ do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) if (!new && old) CHANNEL_LIMIT_POP(c, OUT); - /* Store route export state */ - if (old) - bmap_clear(&c->export_map, old->id); - if (new) - bmap_set(&c->export_map, new->id); - - /* Apply export table */ - if (c->out_table) - rte_import(&c->out_table->push, net, new, old ? old->src : new->src); + stats->updates_accepted++; else - do_rt_notify_direct(c, net, new, old); -} + stats->withdraws_accepted++; -void -do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte *old) -{ - struct proto *p = c->proto; - struct channel_export_stats *stats = &c->export_stats; + if (old) + bmap_clear(&c->export_map, old->id); if (new) - stats->updates_accepted++; - else - stats->withdraws_accepted++; + bmap_set(&c->export_map, new->id); if (p->debug & D_ROUTES) { @@ -612,6 +883,16 @@ do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte static void rt_notify_basic(struct channel *c, const net_addr *net, rte *new, rte *old) { + if (new && old && rte_same(new, old)) + { + if ((new->id != old->id) && bmap_test(&c->export_map, old->id)) + { + bmap_set(&c->export_map, new->id); + bmap_clear(&c->export_map, old->id); + } + return; + } + if (new) new = export_filter(c, new, 0); @@ -635,13 +916,11 @@ channel_rpe_mark_seen(struct rt_export_request *req, struct rt_pending_export *r } void -rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe, +rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *first, struct rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); - rte nb0, *new_best = NULL; const rte *old_best = NULL; @@ -681,7 +960,7 @@ rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_p done: /* Check obsolete routes for previously exported */ - while (rpe) + RPE_WALK(first, rpe, NULL) { channel_rpe_mark_seen(req, rpe); if (rpe->old) @@ -692,7 +971,6 @@ done: old_best = &rpe->old->rte; } } - rpe = rpe_next(rpe, NULL); } /* Nothing to export */ @@ -700,15 +978,6 @@ done: do_rt_notify(c, n, new_best, old_best); else DBG("rt_notify_accepted: nothing to export\n"); - - rte_update_unlock(c); -} - - -static struct nexthop * -nexthop_merge_rta(struct nexthop *nhs, rta *a, linpool *pool, int max) -{ - return nexthop_merge(nhs, &(a->nh), 1, 0, max, pool); } rte * @@ -717,7 +986,7 @@ rt_export_merged(struct channel *c, struct rte **feed, uint count, linpool *pool _Thread_local static rte rloc; // struct proto *p = c->proto; - struct nexthop *nhs = NULL; + struct nexthop_adata *nhs = NULL; rte *best0 = feed[0]; rte *best = NULL; @@ -729,7 +998,7 @@ rt_export_merged(struct channel *c, struct rte **feed, uint count, linpool *pool return NULL; rloc = *best0; - best = export_filter_(c, &rloc, pool, silent); + best = export_filter(c, &rloc, silent); if (!best) /* Best route doesn't pass the filter */ @@ -745,35 +1014,40 @@ rt_export_merged(struct channel *c, struct rte **feed, uint count, linpool *pool continue; rte tmp0 = *feed[i]; - rte *tmp = export_filter_(c, &tmp0, pool, 1); + rte *tmp = export_filter(c, &tmp0, 1); if (!tmp || !rte_is_reachable(tmp)) continue; - nhs = nexthop_merge_rta(nhs, tmp->attrs, pool, c->merge_limit); + eattr *nhea = ea_find(tmp->attrs, &ea_gen_nexthop); + ASSERT_DIE(nhea); + + if (nhs) + nhs = nexthop_merge(nhs, (struct nexthop_adata *) nhea->u.ptr, c->merge_limit, pool); + else + nhs = (struct nexthop_adata *) nhea->u.ptr; } if (nhs) { - nhs = nexthop_merge_rta(nhs, best->attrs, pool, c->merge_limit); + eattr *nhea = ea_find(best->attrs, &ea_gen_nexthop); + ASSERT_DIE(nhea); - if (nhs->next) - { - best->attrs = rta_cow(best->attrs, pool); - nexthop_link(best->attrs, nhs); - } + nhs = nexthop_merge(nhs, (struct nexthop_adata *) nhea->u.ptr, c->merge_limit, pool); + + ea_set_attr(&best->attrs, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhs->ad)); } return best; } void -rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe, +rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *first, struct rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); // struct proto *p = c->proto; #if 0 /* TODO: Find whether this check is possible when processing multiple changes at once. */ @@ -795,7 +1069,7 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen } /* Check obsolete routes for previously exported */ - while (rpe) + RPE_WALK(first, rpe, NULL) { channel_rpe_mark_seen(req, rpe); if (rpe->old) @@ -806,80 +1080,72 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen old_best = &rpe->old->rte; } } - rpe = rpe_next(rpe, NULL); } /* Prepare new merged route */ - rte *new_merged = count ? rt_export_merged(c, feed, count, c->rte_update_pool, 0) : NULL; + rte *new_merged = count ? rt_export_merged(c, feed, count, tmp_linpool, 0) : NULL; if (new_merged || old_best) do_rt_notify(c, n, new_merged, old_best); - - rte_update_unlock(c); } void -rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); - rte *old = RTES_OR_NULL(rpe->old_best); - struct rte_storage *new_best = rpe->new_best; + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; - while (rpe) + RPE_WALK(first, rpe, NULL) { channel_rpe_mark_seen(req, rpe); new_best = rpe->new_best; - rpe = rpe_next(rpe, NULL); } - if (&new_best->rte != old) - { - rte n0, *new = RTES_CLONE(new_best, &n0); - rt_notify_basic(c, net, new, old); - } - - rte_update_unlock(c); + rte n0 = RTE_COPY_VALID(new_best); + if (n0.src || o) + rt_notify_basic(c, net, n0.src ? &n0 : NULL, o); } void -rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); - struct rte_src *src = rpe->new ? rpe->new->rte.src : rpe->old->rte.src; - rte *old = RTES_OR_NULL(rpe->old); - struct rte_storage *new_any = rpe->new; - while (rpe) + rte *n = RTE_VALID_OR_NULL(first->new); + rte *o = RTE_VALID_OR_NULL(first->old); + + if (!n && !o) { - channel_rpe_mark_seen(req, rpe); - new_any = rpe->new; - rpe = rpe_next(rpe, src); + channel_rpe_mark_seen(req, first); + return; } - if (&new_any->rte != old) + struct rte_src *src = n ? n->src : o->src; + struct rte_storage *new_latest = first->new; + + RPE_WALK(first, rpe, src) { - rte n0, *new = RTES_CLONE(new_any, &n0); - rt_notify_basic(c, net, new, old); + channel_rpe_mark_seen(req, rpe); + new_latest = rpe->new; } - rte_update_unlock(c); + rte n0 = RTE_COPY_VALID(new_latest); + if (n0.src || o) + rt_notify_basic(c, net, n0.src ? &n0 : NULL, o); } void rt_feed_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); for (uint i=0; i<count; i++) - { - rte n0 = *feed[i]; - rt_notify_basic(c, net, &n0, NULL); - } - - rte_update_unlock(c); + if (rte_is_valid(feed[i])) + { + rte n0 = *feed[i]; + rt_notify_basic(c, net, &n0, NULL); + } } void @@ -909,14 +1175,38 @@ rpe_next(struct rt_pending_export *rpe, struct rte_src *src) } static struct rt_pending_export * rt_next_export_fast(struct rt_pending_export *last); -static void -rte_export(struct rt_export_hook *hook, struct rt_pending_export *rpe) +static int +rte_export(struct rt_table_export_hook *th, struct rt_pending_export *rpe) { + rtable *tab = RT_PUB(SKIP_BACK(struct rtable_private, exporter, th->table)); + struct rt_export_hook *hook = &th->h; if (bmap_test(&hook->seq_map, rpe->seq)) - goto seen; + goto ignore; /* Seen already */ const net_addr *n = rpe->new_best ? rpe->new_best->rte.net : rpe->old_best->rte.net; + switch (hook->req->addr_mode) + { + case TE_ADDR_NONE: + break; + + case TE_ADDR_IN: + if (!net_in_netX(n, hook->req->addr)) + goto ignore; + break; + + case TE_ADDR_EQUAL: + if (!net_equal(n, hook->req->addr)) + goto ignore; + break; + + case TE_ADDR_FOR: + bug("Continuos export of best prefix match not implemented yet."); + + default: + bug("Strange table export address mode: %d", hook->req->addr_mode); + } + if (rpe->new) hook->stats.updates_received++; else @@ -926,8 +1216,8 @@ rte_export(struct rt_export_hook *hook, struct rt_pending_export *rpe) hook->req->export_one(hook->req, n, rpe); else if (hook->req->export_bulk) { - RT_LOCK(hook->table); net *net = SKIP_BACK(struct network, n.addr, (net_addr (*)[0]) n); + RT_LOCK(tab); uint count = rte_feed_count(net); rte **feed = NULL; if (count) @@ -935,27 +1225,24 @@ rte_export(struct rt_export_hook *hook, struct rt_pending_export *rpe) feed = alloca(count * sizeof(rte *)); rte_feed_obtain(net, feed, count); } - RT_UNLOCK(hook->table); + RT_UNLOCK(tab); hook->req->export_bulk(hook->req, n, rpe, feed, count); } else bug("Export request must always provide an export method"); -seen: +ignore: /* Get the next export if exists */ - hook->rpe_next = rt_next_export_fast(rpe); + th->rpe_next = rt_next_export_fast(rpe); /* The last block may be available to free */ - if (PAGE_HEAD(hook->rpe_next) != PAGE_HEAD(rpe)) - { - RT_LOCK(hook->table); - rt_export_used(RT_PRIV(hook->table)); - RT_UNLOCK(hook->table); - } + int used = (PAGE_HEAD(th->rpe_next) != PAGE_HEAD(rpe)); /* Releasing this export for cleanup routine */ DBG("store hook=%p last_export=%p seq=%lu\n", hook, rpe, rpe->seq); - atomic_store_explicit(&hook->last_export, rpe, memory_order_release); + atomic_store_explicit(&th->last_export, rpe, memory_order_release); + + return used; } /** @@ -990,47 +1277,28 @@ seen: * done outside of scope of rte_announce(). */ static void -rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_storage *old, +rte_announce(struct rtable_private *tab, net *net, struct rte_storage *new, struct rte_storage *old, struct rte_storage *new_best, struct rte_storage *old_best) { - if (!new_best || !rte_is_valid(&new_best->rte)) - new_best = NULL; - - if (!old_best || !rte_is_valid(&old_best->rte)) - old_best = NULL; - - if (!new || !rte_is_valid(&new->rte)) - new = NULL; - - if (old && !rte_is_valid(&old->rte)) - { - /* Filtered old route isn't announced, should be freed immediately. */ - rte_free(old, tab); - old = NULL; - } + int new_best_valid = rte_is_valid(RTE_OR_NULL(new_best)); + int old_best_valid = rte_is_valid(RTE_OR_NULL(old_best)); if ((new == old) && (new_best == old_best)) return; - if (new_best != old_best) - { - if (new_best) - new_best->rte.sender->stats.pref++; - if (old_best) - old_best->rte.sender->stats.pref--; - - if (tab->hostcache) - rt_notify_hostcache(tab, net); - } + if (new_best_valid) + new_best->rte.sender->stats.pref++; + if (old_best_valid) + old_best->rte.sender->stats.pref--; - if (EMPTY_LIST(tab->exports) && EMPTY_LIST(tab->pending_exports)) + if (EMPTY_LIST(tab->exporter.e.hooks) && EMPTY_LIST(tab->exporter.pending)) { /* No export hook and no pending exports to cleanup. We may free the route immediately. */ if (!old) return; hmap_clear(&tab->id_map, old->rte.id); - rte_free(old, tab); + rte_free(old); return; } @@ -1038,9 +1306,9 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ struct rt_export_block *rpeb = NULL, *rpebsnl = NULL; u32 end = 0; - if (!EMPTY_LIST(tab->pending_exports)) + if (!EMPTY_LIST(tab->exporter.pending)) { - rpeb = TAIL(tab->pending_exports); + rpeb = TAIL(tab->exporter.pending); end = atomic_load_explicit(&rpeb->end, memory_order_relaxed); if (end >= RT_PENDING_EXPORT_ITEMS) { @@ -1054,9 +1322,9 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ if (!rpeb) { - rpeb = alloc_page(tab->rp); + rpeb = alloc_page(); *rpeb = (struct rt_export_block) {}; - add_tail(&tab->pending_exports, &rpeb->n); + add_tail(&tab->exporter.pending, &rpeb->n); } /* Fill the pending export */ @@ -1066,10 +1334,14 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ .new_best = new_best, .old = old, .old_best = old_best, - .seq = tab->next_export_seq++, + .seq = tab->exporter.next_seq++, }; - DBG("rte_announce: table=%s net=%N new=%p from %p old=%p from %p new_best=%p old_best=%p seq=%lu\n", tab->name, net->n.addr, new, new ? new->sender : NULL, old, old ? old->sender : NULL, new_best, old_best, rpe->seq); + DBGL("rte_announce: table=%s net=%N new=%p id %u from %s old=%p id %u from %s new_best=%p id %u old_best=%p id %u seq=%lu", + tab->name, net->n.addr, + new, new ? new->rte.id : 0, new ? new->rte.sender->req->name : NULL, + old, old ? old->rte.id : 0, old ? old->rte.sender->req->name : NULL, + new_best, old_best, rpe->seq); ASSERT_DIE(atomic_fetch_add_explicit(&rpeb->end, 1, memory_order_release) == end); @@ -1088,7 +1360,7 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ &net->last->next, &rpenull, rpe, memory_order_relaxed, memory_order_relaxed)); - + } net->last = rpe; @@ -1096,19 +1368,10 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ if (!net->first) net->first = rpe; - if (tab->first_export == NULL) - tab->first_export = rpe; + if (tab->exporter.first == NULL) + tab->exporter.first = rpe; - if (!EMPTY_LIST(tab->exports) && - (tab->first_export->seq + tab->config->cork_limit <= tab->next_export_seq) && - !tab->cork_active) - { - if (config->table_debug) - log(L_TRACE "%s: cork activated", tab->name); - - ev_cork(&rt_cork); - tab->cork_active = 1; - } + rt_check_cork_high(tab); } static struct rt_pending_export * @@ -1137,8 +1400,10 @@ rt_next_export_fast(struct rt_pending_export *last) } static struct rt_pending_export * -rt_next_export(struct rt_export_hook *hook, rtable_private *tab) +rt_next_export(struct rt_table_export_hook *hook, struct rt_table_exporter *tab) { + ASSERT_DIE(RT_IS_LOCKED(SKIP_BACK(struct rtable_private, exporter, tab))); + /* As the table is locked, it is safe to reload the last export pointer */ struct rt_pending_export *last = atomic_load_explicit(&hook->last_export, memory_order_acquire); @@ -1148,62 +1413,75 @@ rt_next_export(struct rt_export_hook *hook, rtable_private *tab) /* No, therefore we must process the table's first pending export */ else - return tab->first_export; + return tab->first; } static inline void rt_send_export_event(struct rt_export_hook *hook) { - ev_send(hook->req->list, hook->event); + ev_send(hook->req->list, &hook->event); } static void -rt_announce_exports(void *data) +rt_announce_exports(struct settle *s) { - rtable_private *tab = data; - ASSERT_DIE(birdloop_inside(tab->loop)); - - rt_schedule_notify(tab); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, export_settle, s)), tab) + if (!EMPTY_LIST(tab->exporter.pending)) + { + struct rt_export_hook *c; node *n; + WALK_LIST2(c, n, tab->exporter.e.hooks, n) + { + if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_READY) + continue; - struct rt_export_hook *c; node *n; - WALK_LIST2(c, n, tab->exports, n) - { - if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_READY) - continue; + rt_send_export_event(c); + } + } +} - rt_send_export_event(c); - } +static void +rt_kick_export_settle(struct rtable_private *tab) +{ + tab->export_settle.cf = tab->rr_counter ? tab->config->export_rr_settle : tab->config->export_settle; + settle_kick(&tab->export_settle, tab->loop); } static void -rt_import_announce_exports(void *data) +rt_import_announce_exports(void *_hook) { - struct rt_import_hook *hook = data; - RT_LOCKED(hook->table, tab) + struct rt_import_hook *hook = _hook; + if (hook->import_state == TIS_CLEARED) { - if (hook->import_state == TIS_CLEARED) + void (*stopped)(struct rt_import_request *) = hook->stopped; + struct rt_import_request *req = hook->req; + + RT_LOCKED(hook->table, tab) { - rfree(hook->export_announce_event); + req->hook = NULL; - ev_send(hook->stopped->list, hook->stopped); + rt_trace(tab, D_EVENTS, "Hook %s stopped", req->name); rem_node(&hook->n); mb_free(hook); rt_unlock_table(tab); } - else - ev_send_loop(tab->loop, tab->announce_event); + + stopped(req); + return; } + + rt_trace(hook->table, D_EVENTS, "Announcing exports after imports from %s", hook->req->name); + birdloop_flag(hook->table->loop, RTF_EXPORT); } static struct rt_pending_export * -rt_last_export(rtable_private *tab) +rt_last_export(struct rt_table_exporter *tab) { struct rt_pending_export *rpe = NULL; - if (!EMPTY_LIST(tab->pending_exports)) + if (!EMPTY_LIST(tab->pending)) { /* We'll continue processing exports from this export on */ - struct rt_export_block *reb = TAIL(tab->pending_exports); + struct rt_export_block *reb = TAIL(tab->pending); ASSERT_DIE(reb->end); rpe = &reb->export[reb->end - 1]; } @@ -1216,35 +1494,42 @@ rt_last_export(rtable_private *tab) static void rt_export_hook(void *_data) { - struct rt_export_hook *c = _data; + struct rt_table_export_hook *c = _data; + rtable *tab = SKIP_BACK(rtable, priv.exporter, c->table); - ASSERT_DIE(atomic_load_explicit(&c->export_state, memory_order_relaxed) == TES_READY); + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_READY); if (!c->rpe_next) { - RT_LOCK(c->table); - c->rpe_next = rt_next_export(c, RT_PRIV(c->table)); + RT_LOCK(tab); + c->rpe_next = rt_next_export(c, c->table); if (!c->rpe_next) { - rt_export_used(RT_PRIV(c->table)); - RT_UNLOCK(c->table); + rt_export_used(c->table, c->h.req->name, "done exporting"); + RT_UNLOCK(tab); return; } - RT_UNLOCK(c->table); + RT_UNLOCK(tab); } + int used = 0; + /* Process the export */ for (uint i=0; i<RT_EXPORT_BULK; i++) { - rte_export(c, c->rpe_next); + used += rte_export(c, c->rpe_next); if (!c->rpe_next) break; } - rt_send_export_event(c); + if (used) + RT_LOCKED(tab, _) + rt_export_used(c->table, c->h.req->name, "finished export bulk"); + + rt_send_export_event(&c->h); } @@ -1271,16 +1556,29 @@ rte_validate(struct channel *ch, rte *e) return 0; } - if (net_type_match(n, NB_DEST) == !e->attrs->dest) + if (net_type_match(n, NB_DEST)) { - log(L_WARN "Ignoring route %N with invalid dest %d received via %s", - n, e->attrs->dest, ch->proto->name); - return 0; - } + eattr *nhea = ea_find(e->attrs, &ea_gen_nexthop); + int dest = nhea_dest(nhea); + + if (dest == RTD_NONE) + { + log(L_WARN "Ignoring route %N with no destination received via %s", + n, ch->proto->name); + return 0; + } - if ((e->attrs->dest == RTD_UNICAST) && !nexthop_is_sorted(&(e->attrs->nh))) + if ((dest == RTD_UNICAST) && + !nexthop_is_sorted((struct nexthop_adata *) nhea->u.ptr)) + { + log(L_WARN "Ignoring unsorted multipath route %N received via %s", + n, ch->proto->name); + return 0; + } + } + else if (ea_find(e->attrs, &ea_gen_nexthop)) { - log(L_WARN "Ignoring unsorted multipath route %N received via %s", + log(L_WARN "Ignoring route %N having a nexthop attribute received via %s", n, ch->proto->name); return 0; } @@ -1301,8 +1599,8 @@ rte_same(rte *x, rte *y) static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); } -static void -rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte *new, struct rte_src *src) +static int +rte_recalculate(struct rtable_private *table, struct rt_import_hook *c, net *net, rte *new, struct rte_src *src) { struct rt_import_request *req = c->req; struct rt_import_stats *stats = &c->stats; @@ -1310,9 +1608,13 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * rte *old_best = old_best_stored ? &old_best_stored->rte : NULL; rte *old = NULL; - /* Set the stale cycle unless already set */ - if (new && !(new->flags & REF_USE_STALE)) - new->stale_cycle = c->stale_set; + /* If the new route is identical to the old one, we find the attributes in + * cache and clone these with no performance drop. OTOH, if we were to lookup + * the attributes, such a route definitely hasn't been anywhere yet, + * therefore it's definitely worth the time. */ + struct rte_storage *new_stored = NULL; + if (new) + new = &(new_stored = rte_store(new, net, table))->rte; /* Find and remove original route from the same protocol */ struct rte_storage **before_old = rte_find(net, src); @@ -1336,7 +1638,7 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * c->table->name, net->n.addr, old->src->owner->name, old->src->private_id, old->src->global_id); } - if (new && rte_same(old, new)) + if (new && rte_same(old, &new_stored->rte)) { /* No changes, ignore the new route and refresh the old one */ old->stale_cycle = new->stale_cycle; @@ -1346,6 +1648,10 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * stats->updates_ignored++; rt_rte_trace_in(D_ROUTES, req, new, "ignored"); } + + /* We need to free the already stored route here before returning */ + rte_free(new_stored); + return 0; } *before_old = (*before_old)->next; @@ -1355,11 +1661,16 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * if (!old && !new) { stats->withdraws_ignored++; - return; + return 0; } - if (req->preimport) - new = req->preimport(req, new, old); + /* If rejected by import limit, we need to pretend there is no route */ + if (req->preimport && (req->preimport(req, new, old) == 0)) + { + rte_free(new_stored); + new_stored = NULL; + new = NULL; + } int new_ok = rte_is_ok(new); int old_ok = rte_is_ok(old); @@ -1374,8 +1685,6 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * if (old_ok || new_ok) table->last_rt_change = current_time(); - struct rte_storage *new_stored = new ? rte_store(new, net, table) : NULL; - if (table->config->sorted) { /* If routes are sorted, just insert new route to appropriate position */ @@ -1471,117 +1780,81 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * hmap_set(&table->id_map, new_stored->rte.id); } - _Bool nb = (new_stored == net->routes); - _Bool ob = (old_best == old); - /* Log the route change */ - if (new_ok && old_ok) + if (new_ok) + rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, new_stored == net->routes ? "added [best]" : "added"); + else if (old_ok) { - const char *best_indicator[2][2] = { { "updated", "updated [-best]" }, { "updated [+best]", "updated [best]" } }; - rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, best_indicator[nb][ob]); + if (old != old_best) + rt_rte_trace_in(D_ROUTES, req, old, "removed"); + else if (net->routes && rte_is_ok(&net->routes->rte)) + rt_rte_trace_in(D_ROUTES, req, old, "removed [replaced]"); + else + rt_rte_trace_in(D_ROUTES, req, old, "removed [sole]"); } - else if (new_ok) - rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, - (!net->routes->next || !rte_is_ok(&net->routes->next->rte)) ? "added [sole]" : - nb ? "added [best]" : "added"); - else if (old_ok) - rt_rte_trace_in(D_ROUTES, req, old, - (!net->routes || !rte_is_ok(&net->routes->rte)) ? "removed [sole]" : - ob ? "removed [best]" : "removed"); + else + if (req->trace_routes & D_ROUTES) + log(L_TRACE "%s > ignored %N %s->%s", req->name, net->n.addr, old ? "filtered" : "none", new ? "filtered" : "none"); /* Propagate the route change */ rte_announce(table, net, new_stored, old_stored, net->routes, old_best_stored); - ev_send(req->list, c->export_announce_event); - - if (!net->routes && - (table->gc_counter++ >= table->config->gc_max_ops) && - (table->gc_time + table->config->gc_min_time <= current_time())) - rt_schedule_prune(table); - -#if 0 - /* Enable and reimplement these callbacks if anybody wants to use them */ - if (old_ok && p->rte_remove) - p->rte_remove(net, old); - if (new_ok && p->rte_insert) - p->rte_insert(net, &new_stored->rte); -#endif - + return 1; } -rte * +int channel_preimport(struct rt_import_request *req, rte *new, rte *old) { struct channel *c = SKIP_BACK(struct channel, in_req, req); - if (!c->in_table) - { - if (new && !old) - if (CHANNEL_LIMIT_PUSH(c, RX)) - return NULL; + if (new && !old) + if (CHANNEL_LIMIT_PUSH(c, RX)) + return 0; - if (!new && old) - CHANNEL_LIMIT_POP(c, RX); - } + if (!new && old) + CHANNEL_LIMIT_POP(c, RX); int new_in = new && !rte_is_filtered(new); int old_in = old && !rte_is_filtered(old); if (new_in && !old_in) if (CHANNEL_LIMIT_PUSH(c, IN)) - if (c->in_keep_filtered) + if (c->in_keep & RIK_REJECTED) { new->flags |= REF_FILTERED; - return new; + return 1; } else - return NULL; + return 0; if (!new_in && old_in) CHANNEL_LIMIT_POP(c, IN); - return new; -} - -rte * -channel_in_preimport(struct rt_import_request *req, rte *new, rte *old) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, push, req); - - if (new && !old) - if (CHANNEL_LIMIT_PUSH(cat->c, RX)) - return NULL; - - if (!new && old) - CHANNEL_LIMIT_POP(cat->c, RX); - - return new; + return 1; } -void rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); - void rte_update(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) { if (!c->in_req.hook) + { + log(L_WARN "%s.%s: Called rte_update without import hook", c->proto->name, c->name); return; + } ASSERT(c->channel_state == CS_UP); - if (c->in_table) - rte_import(&c->in_table->push, n, new, src); - else - rte_update_direct(c, n, new, src); -} + /* The import reloader requires prefilter routes to be the first layer */ + if (new && (c->in_keep & RIK_PREFILTER)) + if (ea_is_cached(new->attrs) && !new->attrs->next) + new->attrs = ea_clone(new->attrs); + else + new->attrs = ea_lookup(new->attrs, 0); -void -rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) -{ const struct filter *filter = c->in_filter; struct channel_import_stats *stats = &c->import_stats; - rte_update_lock(c); if (new) { new->net = n; @@ -1589,30 +1862,49 @@ rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src int fr; stats->updates_received++; - if (!rte_validate(c, new)) - { - channel_rte_trace_in(D_FILTERS, c, new, "invalid"); - stats->updates_invalid++; - new = NULL; - } - else if ((filter == FILTER_REJECT) || - ((fr = f_run(filter, new, c->rte_update_pool, 0)) > F_ACCEPT)) + if ((filter == FILTER_REJECT) || + ((fr = f_run(filter, new, 0)) > F_ACCEPT)) { stats->updates_filtered++; channel_rte_trace_in(D_FILTERS, c, new, "filtered out"); - if (c->in_keep_filtered) + if (c->in_keep & RIK_REJECTED) new->flags |= REF_FILTERED; else new = NULL; } + + if (new) + if (net_is_flow(n)) + rt_flowspec_resolve_rte(new, c); + else + rt_next_hop_resolve_rte(new); + + if (new && !rte_validate(c, new)) + { + channel_rte_trace_in(D_FILTERS, c, new, "invalid"); + stats->updates_invalid++; + new = NULL; + } + } else stats->withdraws_received++; rte_import(&c->in_req, n, new, src); - rte_update_unlock(c); + /* Now the route attributes are kept by the in-table cached version + * and we may drop the local handle */ + if (new && (c->in_keep & RIK_PREFILTER)) + { + /* There may be some updates on top of the original attribute block */ + ea_list *a = new->attrs; + while (a->next) + a = a->next; + + ea_free(a); + } + } void @@ -1620,84 +1912,99 @@ rte_import(struct rt_import_request *req, const net_addr *n, rte *new, struct rt { struct rt_import_hook *hook = req->hook; if (!hook) + { + log(L_WARN "%s: Called rte_import without import hook", req->name); return; + } - RT_LOCK(hook->table); - rtable_private *tab = RT_PRIV(hook->table); - - net *nn; - if (new) + RT_LOCKED(hook->table, tab) + { + net *nn; + if (new) { /* Use the actual struct network, not the dummy one */ nn = net_get(tab, n); new->net = nn->n.addr; new->sender = hook; + + /* Set the stale cycle */ + new->stale_cycle = hook->stale_set; } - else if (!(nn = net_find(tab, n))) + else if (!(nn = net_find(tab, n))) { req->hook->stats.withdraws_ignored++; - RT_UNLOCK(tab); - return; + if (req->trace_routes & D_ROUTES) + log(L_TRACE "%s > ignored %N withdraw", req->name, n); + RT_RETURN(tab); } - /* And recalculate the best route */ - rte_recalculate(tab, hook, nn, new, src); - RT_UNLOCK(tab); + /* Recalculate the best route */ + if (rte_recalculate(tab, hook, nn, new, src)) + ev_send(req->list, &hook->announce_event); + } } /* Check rtable for best route to given net whether it would be exported do p */ int -rt_examine(rtable_private *t, net_addr *a, struct channel *c, const struct filter *filter) +rt_examine(rtable *tp, net_addr *a, struct channel *c, const struct filter *filter) { - net *n = net_find(t, a); + rte rt = {}; - if (!n || !n->routes) - return 0; - - rte rt = n->routes->rte; + RT_LOCKED(tp, t) + { + net *n = net_find(t, a); + if (n) + rt = RTE_COPY_VALID(n->routes); + } - if (!rte_is_valid(&rt)) + if (!rt.src) return 0; - rte_update_lock(c); - - /* Rest is stripped down export_filter() */ int v = c->proto->preexport ? c->proto->preexport(c, &rt) : 0; if (v == RIC_PROCESS) - v = (f_run(filter, &rt, c->rte_update_pool, FF_SILENT) <= F_ACCEPT); - - rte_update_unlock(c); + v = (f_run(filter, &rt, FF_SILENT) <= F_ACCEPT); return v > 0; } static void -rt_export_stopped(void *data) +rt_table_export_done(void *hh) { - struct rt_export_hook *hook = data; + struct rt_table_export_hook *hook = hh; + struct rt_export_request *req = hook->h.req; + void (*stopped)(struct rt_export_request *) = hook->h.stopped; + rtable *t = SKIP_BACK(rtable, priv.exporter, hook->table); - RT_LOCKED(hook->table, tab) + RT_LOCKED(t, tab) { + DBG("Export hook %p in table %s finished uc=%u\n", hook, tab->name, tab->use_count); + /* Drop pending exports */ - rt_export_used(tab); + rt_export_used(&tab->exporter, hook->h.req->name, "stopped"); - /* Unlist */ - rem_node(&hook->n); + /* Do the common code; this frees the hook */ + rt_export_stopped(&hook->h); } /* Report the channel as stopped. */ - hook->stopped(hook->req); + CALL(stopped, req); - RT_LOCKED(hook->table, tab) - { - /* Free the hook together with its coroutine. */ - rfree(hook->pool); - rt_unlock_table(tab); - - DBG("Export hook %p in table %s finished uc=%u\n", hook, tab->name, tab->use_count); - } + /* Unlock the table; this may free it */ + rt_unlock_table(t); } +void +rt_export_stopped(struct rt_export_hook *hook) +{ + /* Unlink from the request */ + hook->req->hook = NULL; + + /* Unlist */ + rem_node(&hook->n); + + /* Free the hook itself together with its pool */ + rfree(hook->pool); +} static inline void rt_set_import_state(struct rt_import_hook *hook, u8 state) @@ -1705,121 +2012,288 @@ rt_set_import_state(struct rt_import_hook *hook, u8 state) hook->last_state_change = current_time(); hook->import_state = state; - if (hook->req->log_state_change) - hook->req->log_state_change(hook->req, state); + CALL(hook->req->log_state_change, hook->req, state); } -static inline void +void rt_set_export_state(struct rt_export_hook *hook, u8 state) { hook->last_state_change = current_time(); - atomic_store_explicit(&hook->export_state, state, memory_order_release); + u8 old = atomic_exchange_explicit(&hook->export_state, state, memory_order_release); - if (hook->req->log_state_change) - hook->req->log_state_change(hook->req, state); + if (old != state) + CALL(hook->req->log_state_change, hook->req, state); } void rt_request_import(rtable *t, struct rt_import_request *req) { - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); - rt_lock_table(tab); - - struct rt_import_hook *hook = req->hook = mb_allocz(tab->rp, sizeof(struct rt_import_hook)); - - DBG("Lock table %s for import %p req=%p uc=%u\n", tab->name, hook, req, tab->use_count); + RT_LOCKED(t, tab) + { + rt_lock_table(tab); - hook->req = req; - hook->table = t; + struct rt_import_hook *hook = req->hook = mb_allocz(tab->rp, sizeof(struct rt_import_hook)); - hook->export_announce_event = ev_new_init(tab->rp, rt_import_announce_exports, hook); + hook->announce_event = (event) { .hook = rt_import_announce_exports, .data = hook }; - if (!hook->stale_set) - hook->stale_set = hook->stale_valid = hook->stale_pruning = hook->stale_pruned = 1; + DBG("Lock table %s for import %p req=%p uc=%u\n", tab->name, hook, req, tab->use_count); - rt_set_import_state(hook, TIS_UP); + hook->req = req; + hook->table = t; - hook->n = (node) {}; - add_tail(&tab->imports, &hook->n); - - RT_UNLOCK(t); + rt_set_import_state(hook, TIS_UP); + add_tail(&tab->imports, &hook->n); + } } void -rt_stop_import(struct rt_import_request *req, event *stopped) +rt_stop_import(struct rt_import_request *req, void (*stopped)(struct rt_import_request *)) { ASSERT_DIE(req->hook); struct rt_import_hook *hook = req->hook; - rtable_private *tab = RT_LOCK(hook->table); + RT_LOCKED(hook->table, tab) + { + rt_schedule_prune(tab); + rt_set_import_state(hook, TIS_STOP); + hook->stopped = stopped; - rt_schedule_prune(tab); + if (hook->stale_set != hook->stale_pruned) + tab->rr_counter -= (hook->stale_set - hook->stale_pruned - 1); + else + tab->rr_counter++; - rt_set_import_state(hook, TIS_STOP); - hook->stopped = stopped; + hook->stale_set = hook->stale_pruned = hook->stale_pruning = hook->stale_valid = 0; + } +} - if (hook->stale_set < hook->stale_valid) - if (!--tab->rr_count) - rt_schedule_notify(tab); +static void rt_table_export_start_feed(struct rtable_private *tab, struct rt_table_export_hook *hook); +static void +rt_table_export_uncork(void *_hook) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); - RT_UNLOCK(tab); + struct rt_table_export_hook *hook = _hook; + struct birdloop *loop = hook->h.req->list->loop; + + if (loop != &main_birdloop) + birdloop_enter(loop); + + u8 state; + switch (state = atomic_load_explicit(&hook->h.export_state, memory_order_relaxed)) + { + case TES_HUNGRY: + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, hook->table)), tab) + if ((state = atomic_load_explicit(&hook->h.export_state, memory_order_relaxed)) == TES_HUNGRY) + rt_table_export_start_feed(tab, hook); + if (state != TES_STOP) + break; + /* fall through */ + case TES_STOP: + rt_stop_export_common(&hook->h); + break; + default: + bug("Uncorking a table export in a strange state: %u", state); + } + + if (loop != &main_birdloop) + birdloop_leave(loop); } -void -rt_request_export(rtable *t, struct rt_export_request *req) +static void +rt_table_export_start_locked(struct rtable_private *tab, struct rt_export_request *req) { - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); + struct rt_exporter *re = &tab->exporter.e; rt_lock_table(tab); - pool *p = rp_new(tab->rp, "Export hook"); - struct rt_export_hook *hook = req->hook = mb_allocz(p, sizeof(struct rt_export_hook)); - hook->pool = p; - - hook->req = req; - hook->table = t; + req->hook = rt_alloc_export(re, sizeof(struct rt_table_export_hook)); + req->hook->req = req; + + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, req->hook); + hook->h.event = (event) { + .hook = rt_table_export_uncork, + .data = hook, + }; + + if (rt_cork_check(&hook->h.event)) + rt_set_export_state(&hook->h, TES_HUNGRY); + else + rt_table_export_start_feed(tab, hook); +} + +static void +rt_table_export_start_feed(struct rtable_private *tab, struct rt_table_export_hook *hook) +{ + struct rt_exporter *re = &tab->exporter.e; + struct rt_export_request *req = hook->h.req; /* stats zeroed by mb_allocz */ + switch (req->addr_mode) + { + case TE_ADDR_IN: + if (tab->trie && net_val_match(tab->addr_type, NB_IP)) + { + hook->walk_state = mb_allocz(hook->h.pool, sizeof (struct f_trie_walk_state)); + hook->walk_lock = rt_lock_trie(tab); + trie_walk_init(hook->walk_state, tab->trie, req->addr); + hook->h.event.hook = rt_feed_by_trie; + hook->walk_last.type = 0; + break; + } + /* fall through */ + case TE_ADDR_NONE: + FIB_ITERATE_INIT(&hook->feed_fit, &tab->fib); + hook->h.event.hook = rt_feed_by_fib; + break; - bmap_init(&hook->seq_map, p, 1024); + case TE_ADDR_EQUAL: + hook->h.event.hook = rt_feed_equal; + break; - rt_set_export_state(hook, TES_HUNGRY); + case TE_ADDR_FOR: + hook->h.event.hook = rt_feed_for; + break; - hook->n = (node) {}; - add_tail(&tab->exports, &hook->n); + default: + bug("Requested an unknown export address mode"); + } DBG("New export hook %p req %p in table %s uc=%u\n", hook, req, tab->name, tab->use_count); - hook->event = ev_new_init(p, rt_feed_channel, hook); - RT_UNLOCK(t); + struct rt_pending_export *rpe = rt_last_export(hook->table); + DBG("store hook=%p last_export=%p seq=%lu\n", hook, rpe, rpe ? rpe->seq : 0); + atomic_store_explicit(&hook->last_export, rpe, memory_order_relaxed); + + rt_init_export(re, req->hook); +} + +static void +rt_table_export_start(struct rt_exporter *re, struct rt_export_request *req) +{ + RT_LOCKED(SKIP_BACK(rtable, priv.exporter.e, re), tab) + rt_table_export_start_locked(tab, req); +} + +void rt_request_export(rtable *t, struct rt_export_request *req) +{ + RT_LOCKED(t, tab) + rt_table_export_start_locked(tab, req); /* Is locked inside */ +} + +void +rt_request_export_other(struct rt_exporter *re, struct rt_export_request *req) +{ + return re->class->start(re, req); +} + +struct rt_export_hook * +rt_alloc_export(struct rt_exporter *re, uint size) +{ + pool *p = rp_new(re->rp, "Export hook"); + struct rt_export_hook *hook = mb_allocz(p, size); + + hook->pool = p; + hook->table = re; + + hook->n = (node) {}; + add_tail(&re->hooks, &hook->n); + + return hook; +} +void +rt_init_export(struct rt_exporter *re UNUSED, struct rt_export_hook *hook) +{ + hook->event.data = hook; + + bmap_init(&hook->seq_map, hook->pool, 1024); + + /* Regular export */ + rt_set_export_state(hook, TES_FEEDING); rt_send_export_event(hook); } +static int +rt_table_export_stop_locked(struct rt_export_hook *hh) +{ + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, hh); + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter, hook->table); + + switch (atomic_load_explicit(&hh->export_state, memory_order_relaxed)) + { + case TES_HUNGRY: + rt_trace(tab, D_EVENTS, "Stopping export hook %s must wait for uncorking; %p", hook->h.req->name, hook->h.n.next); + return 0; + case TES_FEEDING: + switch (hh->req->addr_mode) + { + case TE_ADDR_IN: + if (hook->walk_lock) + { + rt_unlock_trie(tab, hook->walk_lock); + hook->walk_lock = NULL; + mb_free(hook->walk_state); + hook->walk_state = NULL; + break; + } + /* fall through */ + case TE_ADDR_NONE: + fit_get(&tab->fib, &hook->feed_fit); + break; + } + + } + + rt_trace(tab, D_EVENTS, "Stopping export hook %s right now", hook->h.req->name); + return 1; +} + +static void +rt_table_export_stop(struct rt_export_hook *hh) +{ + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, hh); + int ok = 0; + rtable *t = SKIP_BACK(rtable, priv.exporter, hook->table); + if (RT_IS_LOCKED(t)) + ok = rt_table_export_stop_locked(hh); + else + RT_LOCKED(t, tab) + ok = rt_table_export_stop_locked(hh); + + if (ok) + rt_stop_export_common(hh); + else + rt_set_export_state(&hook->h, TES_STOP); +} + void rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_request *)) { + ASSERT_DIE(birdloop_inside(req->list->loop)); ASSERT_DIE(req->hook); struct rt_export_hook *hook = req->hook; - RT_LOCK(hook->table); - rtable_private *tab = RT_PRIV(hook->table); + /* Set the stopped callback */ + hook->stopped = stopped; - /* Stop feeding */ - ev_postpone(hook->event); + /* Run the stop code */ + if (hook->table->class->stop) + hook->table->class->stop(hook); + else + rt_stop_export_common(hook); +} - if (atomic_load_explicit(&hook->export_state, memory_order_relaxed) == TES_FEEDING) - fit_get(&tab->fib, &hook->feed_fit); +void +rt_stop_export_common(struct rt_export_hook *hook) +{ + /* Update export state */ + rt_set_export_state(hook, TES_STOP); - hook->event->hook = rt_export_stopped; - hook->stopped = stopped; + /* Reset the event as the stopped event */ + hook->event.hook = hook->table->class->done; + /* Run the stopped event */ rt_send_export_event(hook); - - RT_UNLOCK(hook->table); - - rt_set_export_state(hook, TES_STOP); } /** @@ -1832,48 +2306,50 @@ rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_r * routes to the routing table (by rte_update()). After that, all protocol * routes (more precisely routes with @c as @sender) not sent during the * refresh cycle but still in the table from the past are pruned. This is - * implemented by setting rte->stale_cycle to req->stale_set in rte_update() - * and then dropping all routes with old stale_cycle values in table prune loop. */ + * implemented by marking all related routes as stale by REF_STALE flag in + * rt_refresh_begin(), then marking all related stale routes with REF_DISCARD + * flag in rt_refresh_end() and then removing such routes in the prune loop. + */ void rt_refresh_begin(struct rt_import_request *req) { struct rt_import_hook *hook = req->hook; ASSERT_DIE(hook); - - RT_LOCK(hook->table); - rtable_private *tab = RT_PRIV(hook->table); - ASSERT_DIE(hook->stale_set == hook->stale_valid); + RT_LOCKED(hook->table, tab) + { + /* If the pruning routine is too slow */ if ((hook->stale_pruned < hook->stale_valid) && (hook->stale_pruned + 128 < hook->stale_valid) || (hook->stale_pruned > hook->stale_valid) && (hook->stale_pruned > hook->stale_valid + 128)) { - log(L_WARN "Route refresh flood in table %s", tab->name); + log(L_WARN "Route refresh flood in table %s", hook->table->name); FIB_WALK(&tab->fib, net, n) { - for (struct rte_storage *e = n->routes; e; e = e->next) - if (e->rte.sender == req->hook) - e->rte.stale_cycle = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + if (e->rte.sender == req->hook) + e->rte.stale_cycle = 0; } FIB_WALK_END; + tab->rr_counter -= (hook->stale_set - hook->stale_pruned - 1); hook->stale_set = 1; hook->stale_valid = 0; hook->stale_pruned = 0; } + /* Setting a new value of the stale modifier */ else if (!++hook->stale_set) { /* Let's reserve the stale_cycle zero value for always-invalid routes */ hook->stale_set = 1; hook->stale_valid = 0; + tab->rr_counter++; } - tab->rr_count++; - if (req->trace_routes & D_STATES) log(L_TRACE "%s: route refresh begin [%u]", req->name, hook->stale_set); - RT_UNLOCK(tab); + } } /** @@ -1890,19 +2366,16 @@ rt_refresh_end(struct rt_import_request *req) struct rt_import_hook *hook = req->hook; ASSERT_DIE(hook); - rtable_private *tab = RT_LOCK(hook->table); - hook->stale_valid++; - ASSERT_DIE(hook->stale_set == hook->stale_valid); - - rt_schedule_prune(tab); - - if (req->trace_routes & D_STATES) - log(L_TRACE "%s: route refresh end [%u]", req->name, hook->stale_valid); + RT_LOCKED(hook->table, tab) + { + hook->stale_valid++; + ASSERT_DIE(hook->stale_set == hook->stale_valid); - if (!--tab->rr_count) - rt_schedule_notify(tab); + rt_schedule_prune(tab); - RT_UNLOCK(tab); + if (req->trace_routes & D_STATES) + log(L_TRACE "%s: route refresh end [%u]", req->name, hook->stale_valid); + } } /** @@ -1916,7 +2389,7 @@ rte_dump(struct rte_storage *e) { debug("%-1N ", e->rte.net); debug("PF=%02x ", e->rte.pflags); - rta_dump(e->rte.attrs); + ea_dump(e->rte.attrs); debug("\n"); } @@ -1927,11 +2400,12 @@ rte_dump(struct rte_storage *e) * This function dumps contents of a given routing table to debug output. */ void -rt_dump(rtable *tab) +rt_dump(rtable *tp) { - RT_LOCK(tab); - rtable_private *t = RT_PRIV(tab); - debug("Dump of routing table <%s>%s\n", t->name, t->delete_event ? " (deleted)" : ""); + RT_LOCKED(tp, t) + { + + debug("Dump of routing table <%s>%s\n", t->name, t->deleted ? " (deleted)" : ""); #ifdef DEBUGGING fib_check(&t->fib); #endif @@ -1942,7 +2416,8 @@ rt_dump(rtable *tab) } FIB_WALK_END; debug("\n"); - RT_UNLOCK(tab); + + } } /** @@ -1958,16 +2433,20 @@ rt_dump_all(void) WALK_LIST2(t, n, routing_tables, n) rt_dump(t); + + WALK_LIST2(t, n, deleted_routing_tables, n) + rt_dump(t); } void -rt_dump_hooks(rtable *t) +rt_dump_hooks(rtable *tp) { - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); - debug("Dump of hooks in routing table <%s>%s\n", tab->name, tab->delete_event ? " (deleted)" : ""); - debug(" nhu_state=%u hcu_scheduled=%u use_count=%d rt_count=%u\n", - atomic_load(&tab->nhu_state), ev_active(tab->hcu_event), tab->use_count, tab->rt_count); + RT_LOCKED(tp, tab) + { + + debug("Dump of hooks in routing table <%s>%s\n", tab->name, tab->deleted ? " (deleted)" : ""); + debug(" nhu_state=%u use_count=%d rt_count=%u\n", + tab->nhu_state, tab->use_count, tab->rt_count); debug(" last_rt_change=%t gc_time=%t gc_counter=%d prune_state=%u\n", tab->last_rt_change, tab->gc_time, tab->gc_counter, tab->prune_state); @@ -1981,16 +2460,18 @@ rt_dump_hooks(rtable *t) ih->last_state_change, ih->import_state, ih->stopped); } - struct rt_export_hook *eh; - WALK_LIST(eh, tab->exports) + struct rt_table_export_hook *eh; + WALK_LIST(eh, tab->exporter.e.hooks) { - eh->req->dump_req(eh->req); + eh->h.req->dump_req(eh->h.req); debug(" Export hook %p requested by %p:" " refeed_pending=%u last_state_change=%t export_state=%u\n", - eh, eh->req, eh->refeed_pending, eh->last_state_change, atomic_load_explicit(&eh->export_state, memory_order_relaxed)); + eh, eh->h.req, eh->refeed_pending, eh->h.last_state_change, + atomic_load_explicit(&eh->h.export_state, memory_order_relaxed)); } debug("\n"); - RT_UNLOCK(t); + + } } void @@ -2003,146 +2484,271 @@ rt_dump_hooks_all(void) WALK_LIST2(t, n, routing_tables, n) rt_dump_hooks(t); + + WALK_LIST2(t, n, deleted_routing_tables, n) + rt_dump_hooks(t); } static inline void -rt_schedule_nhu(rtable *tab) +rt_schedule_nhu(struct rtable_private *tab) { - atomic_fetch_or_explicit(&tab->nhu_state, NHU_SCHEDULED, memory_order_acq_rel); - ev_send_loop(tab->loop, tab->nhu_event); - - /* state change: - * NHU_CLEAN -> NHU_SCHEDULED - * NHU_RUNNING -> NHU_DIRTY - */ + if (tab->nhu_corked) + { + if (!(tab->nhu_corked & NHU_SCHEDULED)) + tab->nhu_corked |= NHU_SCHEDULED; + } + else if (!(tab->nhu_state & NHU_SCHEDULED)) + { + rt_trace(tab, D_EVENTS, "Scheduling NHU"); + + /* state change: + * NHU_CLEAN -> NHU_SCHEDULED + * NHU_RUNNING -> NHU_DIRTY + */ + if ((tab->nhu_state |= NHU_SCHEDULED) == NHU_SCHEDULED) + birdloop_flag(tab->loop, RTF_NHU); + } } void -rt_schedule_prune(rtable_private *tab) +rt_schedule_prune(struct rtable_private *tab) { if (tab->prune_state == 0) - ev_send_loop(tab->loop, tab->prune_event); + birdloop_flag(tab->loop, RTF_CLEANUP); /* state change 0->1, 2->3 */ tab->prune_state |= 1; } -void -rt_export_used(rtable_private *tab) +static void +rt_export_used(struct rt_table_exporter *e, const char *who, const char *why) { - if (config->table_debug) - log(L_TRACE "%s: Export cleanup requested", tab->name); + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter, e); + ASSERT_DIE(RT_IS_LOCKED(tab)); + + rt_trace(tab, D_EVENTS, "Export cleanup requested by %s %s", who, why); + + if (tab->export_used) + return; - ev_send_loop(tab->loop, tab->ec_event); + tab->export_used = 1; + birdloop_flag(tab->loop, RTF_CLEANUP); } -static inline btime -rt_settled_time(rtable_private *tab) +static void +rt_flag_handler(struct birdloop_flag_handler *fh, u32 flags) { - ASSUME(tab->base_settle_time != 0); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, fh, fh)), tab) + { + ASSERT_DIE(birdloop_inside(tab->loop)); + rt_lock_table(tab); + + if (flags & RTF_NHU) + rt_next_hop_update(tab); - btime min_settle_time = tab->rr_count ? tab->config->min_rr_settle_time : tab->config->min_settle_time; - btime max_settle_time = tab->rr_count ? tab->config->max_rr_settle_time : tab->config->max_settle_time; + if (flags & RTF_EXPORT) + rt_kick_export_settle(tab); - DBG("settled time computed from %t %t %t %t as %t / %t, now is %t\n", - tab->name, tab->last_rt_change, min_settle_time, - tab->base_settle_time, max_settle_time, - tab->last_rt_change + min_settle_time, - tab->base_settle_time + max_settle_time, current_time()); + if (flags & RTF_CLEANUP) + { + if (tab->export_used) + rt_export_cleanup(tab); + + if (tab->prune_state) + rt_prune_table(tab); + } - return MIN(tab->last_rt_change + min_settle_time, - tab->base_settle_time + max_settle_time); + rt_unlock_table(tab); + } } static void -rt_settle_timer(timer *t) +rt_prune_timer(timer *t) { - rtable_private *tab = t->data; - ASSERT_DIE(birdloop_inside(tab->loop)); + RT_LOCKED((rtable *) t->data, tab) + if (tab->gc_counter >= tab->config->gc_threshold) + rt_schedule_prune(tab); +} - if (!tab->base_settle_time) +static void +rt_kick_prune_timer(struct rtable_private *tab) +{ + /* Return if prune is already scheduled */ + if (tm_active(tab->prune_timer) || (tab->prune_state & 1)) return; - btime settled_time = rt_settled_time(tab); - if (current_time() < settled_time) + /* Randomize GC period to +/- 50% */ + btime gc_period = tab->config->gc_period; + gc_period = (gc_period / 2) + (random_u32() % (uint) gc_period); + tm_start_in(tab->prune_timer, gc_period, tab->loop); +} + + +static void +rt_flowspec_export_one(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) +{ + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rtable *dst_pub = ln->dst; + ASSUME(rt_is_flow(dst_pub)); + struct rtable_private *dst = RT_LOCK(dst_pub); + + /* No need to inspect it further if recalculation is already scheduled */ + if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY) + || !trie_match_net(dst->flowspec_trie, net)) { - tm_set_in(tab->settle_timer, settled_time, tab->loop); + RT_UNLOCK(dst_pub); + rpe_mark_seen_all(req->hook, first, NULL); return; } - /* Settled */ - tab->base_settle_time = 0; + /* This net may affect some flowspecs, check the actual change */ + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; + + RPE_WALK(first, rpe, NULL) + { + rpe_mark_seen(req->hook, rpe); + new_best = rpe->new_best; + } + + /* Yes, something has actually changed. Schedule the update. */ + if (o != RTE_VALID_OR_NULL(new_best)) + rt_schedule_nhu(dst); - struct rt_subscription *s; - WALK_LIST(s, tab->subscribers) - ev_send(s->event->list, s->event); + RT_UNLOCK(dst_pub); } static void -rt_kick_settle_timer(rtable_private *tab) +rt_flowspec_dump_req(struct rt_export_request *req) { - tab->base_settle_time = current_time(); - - if (!tab->settle_timer) - tab->settle_timer = tm_new_init(tab->rp, rt_settle_timer, tab, 0, 0); - - if (!tm_active(tab->settle_timer)) - tm_set_in(tab->settle_timer, rt_settled_time(tab), tab->loop); + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + debug(" Flowspec link for table %s (%p)\n", ln->dst->name, req); } -static inline void -rt_schedule_notify(rtable_private *tab) +static void +rt_flowspec_log_state_change(struct rt_export_request *req, u8 state) { - if (EMPTY_LIST(tab->subscribers)) - return; + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rt_trace(ln->dst, D_STATES, "Flowspec link from %s export state changed to %s", + ln->src->name, rt_export_state_name(state)); +} - if (tab->base_settle_time) - return; +static struct rt_flowspec_link * +rt_flowspec_find_link(struct rtable_private *src, rtable *dst) +{ + struct rt_table_export_hook *hook; node *n; + WALK_LIST2(hook, n, src->exporter.e.hooks, h.n) + switch (atomic_load_explicit(&hook->h.export_state, memory_order_acquire)) + { + case TES_HUNGRY: + case TES_FEEDING: + case TES_READY: + if (hook->h.req->export_one == rt_flowspec_export_one) + { + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, hook->h.req); + if (ln->dst == dst) + return ln; + } + } - rt_kick_settle_timer(tab); + return NULL; } void -rt_subscribe(rtable *t, struct rt_subscription *s) +rt_flowspec_link(rtable *src_pub, rtable *dst_pub) { - s->tab = t; - RT_LOCKED(t, tab) + ASSERT(rt_is_ip(src_pub)); + ASSERT(rt_is_flow(dst_pub)); + + int lock_dst = 0; + + RT_LOCKED(src_pub, src) { - rt_lock_table(tab); - DBG("rt_subscribe(%s)\n", tab->name); - add_tail(&tab->subscribers, &s->n); + struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst_pub); + + if (!ln) + { + pool *p = src->rp; + ln = mb_allocz(p, sizeof(struct rt_flowspec_link)); + ln->src = src_pub; + ln->dst = dst_pub; + ln->req = (struct rt_export_request) { + .name = mb_sprintf(p, "%s.flowspec.notifier", dst_pub->name), + .list = &global_work_list, + .trace_routes = src->config->debug, + .dump_req = rt_flowspec_dump_req, + .log_state_change = rt_flowspec_log_state_change, + .export_one = rt_flowspec_export_one, + }; + + rt_table_export_start_locked(src, &ln->req); + + lock_dst = 1; + } + + ln->uc++; } + + if (lock_dst) + rt_lock_table(dst_pub); +} + +static void +rt_flowspec_link_stopped(struct rt_export_request *req) +{ + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rtable *dst = ln->dst; + + mb_free(ln); + rt_unlock_table(dst); } void -rt_unsubscribe(struct rt_subscription *s) +rt_flowspec_unlink(rtable *src, rtable *dst) { - RT_LOCKED(s->tab, tab) + struct rt_flowspec_link *ln; + RT_LOCKED(src, t) { - rem_node(&s->n); - if (EMPTY_LIST(tab->subscribers) && tm_active(tab->settle_timer)) - tm_stop(tab->settle_timer); - rt_unlock_table(tab); + ln = rt_flowspec_find_link(t, dst); + + ASSERT(ln && (ln->uc > 0)); + + if (!--ln->uc) + rt_stop_export(&ln->req, rt_flowspec_link_stopped); } } static void +rt_flowspec_reset_trie(struct rtable_private *tab) +{ + linpool *lp = tab->flowspec_trie->lp; + int ipv4 = tab->flowspec_trie->ipv4; + + lp_flush(lp); + tab->flowspec_trie = f_new_trie(lp, 0); + tab->flowspec_trie->ipv4 = ipv4; +} + +static void rt_free(resource *_r) { - rtable_private *r = (rtable_private *) _r; + struct rtable_private *r = SKIP_BACK(struct rtable_private, r, _r); + + DOMAIN_FREE(rtable, r->lock); DBG("Deleting routing table %s\n", r->name); ASSERT_DIE(r->use_count == 0); - ASSERT_DIE(r->rt_count == 0); - ASSERT_DIE(!r->cork_active); - ASSERT_DIE(EMPTY_LIST(r->imports)); - ASSERT_DIE(EMPTY_LIST(r->exports)); + + r->config->table = NULL; + rem_node(&r->n); + + if (r->hostcache) + rt_free_hostcache(r); /* Freed automagically by the resource pool fib_free(&r->fib); hmap_free(&r->id_map); rfree(r->rt_event); - rfree(r->settle_timer); mb_free(r); */ } @@ -2150,30 +2756,44 @@ rt_free(resource *_r) static void rt_res_dump(resource *_r) { - RT_LOCKED((rtable *) _r, r) + struct rtable_private *r = SKIP_BACK(struct rtable_private, r, _r); + debug("name \"%s\", addr_type=%s, rt_count=%u, use_count=%d\n", r->name, net_label[r->addr_type], r->rt_count, r->use_count); } static struct resclass rt_class = { .name = "Routing table", - .size = sizeof(rtable_private), + .size = sizeof(rtable), .free = rt_free, .dump = rt_res_dump, .lookup = NULL, .memsize = NULL, }; +static const struct rt_exporter_class rt_table_exporter_class = { + .start = rt_table_export_start, + .stop = rt_table_export_stop, + .done = rt_table_export_done, +}; + +void +rt_exporter_init(struct rt_exporter *e) +{ + init_list(&e->hooks); +} + +static struct idm rtable_idm; +uint rtable_max_id = 0; + rtable * rt_setup(pool *pp, struct rtable_config *cf) { - int ns = strlen("Routing table ") + strlen(cf->name) + 1; - void *nb = mb_alloc(pp, ns); - ASSERT_DIE(ns - 1 == bsnprintf(nb, ns, "Routing table %s", cf->name)); + ASSERT_DIE(birdloop_inside(&main_birdloop)); - pool *p = rp_new(pp, nb); + pool *p = rp_newf(pp, "Routing table %s", cf->name); - rtable_private *t = ralloc(p, &rt_class); + struct rtable_private *t = ralloc(p, &rt_class); t->rp = p; t->rte_slab = sl_new(p, sizeof(struct rte_storage)); @@ -2181,38 +2801,64 @@ rt_setup(pool *pp, struct rtable_config *cf) t->name = cf->name; t->config = cf; t->addr_type = cf->addr_type; + t->id = idm_alloc(&rtable_idm); + if (t->id >= rtable_max_id) + rtable_max_id = t->id + 1; + + t->lock = DOMAIN_NEW(rtable, t->name); fib_init(&t->fib, p, t->addr_type, sizeof(net), OFFSETOF(net, n), 0, NULL); + if (cf->trie_used) + { + t->trie = f_new_trie(lp_new_default(p), 0); + t->trie->ipv4 = net_val_match(t->addr_type, NB_IP4 | NB_VPN4 | NB_ROA4); + + t->fib.init = net_init_with_trie; + } + init_list(&t->imports); - init_list(&t->exports); hmap_init(&t->id_map, p, 1024); hmap_set(&t->id_map, 0); - init_list(&t->pending_exports); - init_list(&t->subscribers); + t->fh = (struct birdloop_flag_handler) { .hook = rt_flag_handler, }; + t->nhu_uncork_event = ev_new_init(p, rt_nhu_uncork, t); + t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0); + t->last_rt_change = t->gc_time = current_time(); - t->loop = birdloop_new(p, DOMAIN_ORDER(rtable), nb); + t->export_settle = SETTLE_INIT(&cf->export_settle, rt_announce_exports, NULL); - t->announce_event = ev_new_init(p, rt_announce_exports, t); - t->ec_event = ev_new_init(p, rt_export_cleanup, t); - t->prune_event = ev_new_init(p, rt_prune_table, t); - t->hcu_event = ev_new_init(p, rt_update_hostcache, t); - t->nhu_event = ev_new_init(p, rt_next_hop_update, t); + t->exporter = (struct rt_table_exporter) { + .e = { + .class = &rt_table_exporter_class, + .addr_type = t->addr_type, + .rp = t->rp, + }, + .next_seq = 1, + }; - t->nhu_event->cork = &rt_cork; - t->prune_event->cork = &rt_cork; + rt_exporter_init(&t->exporter.e); - t->last_rt_change = t->gc_time = current_time(); - t->next_export_seq = 1; + init_list(&t->exporter.pending); + + t->cork_threshold = cf->cork_threshold; t->rl_pipe = (struct tbf) TBF_DEFAULT_LOG_LIMITS; - t->nhu_lp = lp_new_default(p); + if (rt_is_flow(RT_PUB(t))) + { + t->flowspec_trie = f_new_trie(lp_new_default(p), 0); + t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4); + } - mb_move(nb, p); - return (rtable *) t; + /* Start the service thread */ + t->loop = birdloop_new(p, DOMAIN_ORDER(service), mb_sprintf(p, "Routing tahle %s", t->name)); + birdloop_enter(t->loop); + birdloop_flag_set_handler(t->loop, &t->fh); + birdloop_leave(t->loop); + + return RT_PUB(t); } /** @@ -2227,9 +2873,13 @@ rt_init(void) rta_init(); rt_table_pool = rp_new(&root_pool, "Routing tables"); init_list(&routing_tables); - ev_init_cork(&rt_cork, "Route Table Cork"); + init_list(&deleted_routing_tables); + ev_init_list(&rt_cork.queue, &main_birdloop, "Route cork release"); + rt_cork.run = (event) { .hook = rt_cork_release_hook }; + idm_init(&rtable_idm, rt_table_pool, 256); } + /** * rt_prune_table - prune a routing table * @@ -2245,18 +2895,15 @@ rt_init(void) * iteration. */ static void -rt_prune_table(void *data) +rt_prune_table(struct rtable_private *tab) { - rtable_private *tab = data; - ASSERT_DIE(birdloop_inside(tab->loop)); - struct fib_iterator *fit = &tab->prune_fit; - int limit = 512; + int limit = 2000; struct rt_import_hook *ih; node *n, *x; - DBG("Pruning route table %s\n", tab->name); + rt_trace(tab, D_STATES, "Pruning"); #ifdef DEBUGGING fib_check(&tab->fib); #endif @@ -2264,8 +2911,6 @@ rt_prune_table(void *data) if (tab->prune_state == 0) return; - rt_lock_table(tab); - if (tab->prune_state == 1) { /* Mark channels to flush */ @@ -2282,28 +2927,36 @@ rt_prune_table(void *data) FIB_ITERATE_INIT(fit, &tab->fib); tab->prune_state = 2; + + tab->gc_counter = 0; + tab->gc_time = current_time(); + + if (tab->prune_trie) + { + /* Init prefix trie pruning */ + tab->trie_new = f_new_trie(lp_new_default(tab->rp), 0); + tab->trie_new->ipv4 = tab->trie->ipv4; + } } again: FIB_ITERATE_START(&tab->fib, fit, net, n) { rescan: + if (limit <= 0) + { + FIB_ITERATE_PUT(fit); + birdloop_flag(tab->loop, RTF_CLEANUP); + return; + } + for (struct rte_storage *e=n->routes; e; e=e->next) { struct rt_import_hook *s = e->rte.sender; - if ((s->import_state == TIS_FLUSHING) || (e->rte.stale_cycle < s->stale_valid) || (e->rte.stale_cycle > s->stale_set)) { - if (limit <= 0) - { - FIB_ITERATE_PUT(fit); - ev_send_loop(tab->loop, tab->prune_event); - rt_unlock_table(tab); - return; - } - rte_recalculate(tab, e->rte.sender, n, NULL, e->rte.src); limit--; @@ -2317,19 +2970,56 @@ again: fib_delete(&tab->fib, n); goto again; } + + if (tab->trie_new) + { + trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen); + limit--; + } } FIB_ITERATE_END; + rt_trace(tab, D_EVENTS, "Prune done, scheduling export timer"); + rt_kick_export_settle(tab); + #ifdef DEBUGGING fib_check(&tab->fib); #endif - tab->gc_counter = 0; - tab->gc_time = current_time(); - /* state change 2->0, 3->1 */ if (tab->prune_state &= 1) - ev_send_loop(tab->loop, tab->prune_event); + birdloop_flag(tab->loop, RTF_CLEANUP); + + if (tab->trie_new) + { + /* Finish prefix trie pruning */ + + if (!tab->trie_lock_count) + { + rfree(tab->trie->lp); + } + else + { + ASSERT(!tab->trie_old); + tab->trie_old = tab->trie; + tab->trie_old_lock_count = tab->trie_lock_count; + tab->trie_lock_count = 0; + } + + tab->trie = tab->trie_new; + tab->trie_new = NULL; + tab->prune_trie = 0; + } + else + { + /* Schedule prefix trie pruning */ + if (tab->trie && !tab->trie_old && (tab->trie->prefix_count > (2 * tab->fib.entries))) + { + /* state change 0->1, 2->3 */ + tab->prune_state |= 1; + tab->prune_trie = 1; + } + } uint flushed_channels = 0; @@ -2337,45 +3027,53 @@ again: WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) if (ih->import_state == TIS_FLUSHING) { - ih->flush_seq = tab->next_export_seq; + ih->flush_seq = tab->exporter.next_seq; rt_set_import_state(ih, TIS_WAITING); flushed_channels++; + tab->rr_counter--; } else if (ih->stale_pruning != ih->stale_pruned) { + tab->rr_counter -= (ih->stale_pruned - ih->stale_pruning); ih->stale_pruned = ih->stale_pruning; - if (ih->req->trace_routes & D_STATES) log(L_TRACE "%s: table prune after refresh end [%u]", ih->req->name, ih->stale_pruned); } /* In some cases, we may want to directly proceed to export cleanup */ - if (EMPTY_LIST(tab->exports) && flushed_channels) + if (EMPTY_LIST(tab->exporter.e.hooks) && flushed_channels) rt_export_cleanup(tab); - - rt_unlock_table(tab); } static void -rt_export_cleanup(void *data) +rt_export_cleanup(struct rtable_private *tab) { - rtable_private *tab = data; - ASSERT_DIE(birdloop_inside(tab->loop)); + tab->export_used = 0; u64 min_seq = ~((u64) 0); struct rt_pending_export *last_export_to_free = NULL; - struct rt_pending_export *first_export = tab->first_export; + struct rt_pending_export *first = tab->exporter.first; + int want_prune = 0; - struct rt_export_hook *eh; + struct rt_table_export_hook *eh; node *n; - WALK_LIST2(eh, n, tab->exports, n) + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) { - switch (atomic_load_explicit(&eh->export_state, memory_order_acquire)) + switch (atomic_load_explicit(&eh->h.export_state, memory_order_acquire)) { - case TES_DOWN: - case TES_HUNGRY: + /* Export cleanup while feeding isn't implemented */ + case TES_FEEDING: + goto done; + + /* States not interfering with export cleanup */ + case TES_DOWN: /* This should not happen at all */ + log(L_WARN "%s: Export cleanup found hook %s in explicit state TES_DOWN", tab->name, eh->h.req->name); + /* fall through */ + case TES_HUNGRY: /* Feeding waiting for uncork */ + case TES_STOP: /* No more export will happen on this hook */ continue; + /* Regular export */ case TES_READY: { struct rt_pending_export *last = atomic_load_explicit(&eh->last_export, memory_order_acquire); @@ -2392,23 +3090,20 @@ rt_export_cleanup(void *data) } default: - /* It's only safe to cleanup when the export state is idle or regular. No feeding or stopping allowed. */ - goto done; + bug("%s: Strange export state of hook %s: %d", tab->name, eh->h.req->name, atomic_load_explicit(&eh->h.export_state, memory_order_relaxed)); } } - tab->first_export = last_export_to_free ? rt_next_export_fast(last_export_to_free) : NULL; + tab->exporter.first = last_export_to_free ? rt_next_export_fast(last_export_to_free) : NULL; - if (config->table_debug) - log(L_TRACE "%s: Export cleanup, old first_export seq %lu, new %lu, min_seq %ld", - tab->name, - first_export ? first_export->seq : 0, - tab->first_export ? tab->first_export->seq : 0, + rt_trace(tab, D_STATES, "Export cleanup, old exporter.first seq %lu, new %lu, min_seq %ld", + first ? first->seq : 0, + tab->exporter.first ? tab->exporter.first->seq : 0, min_seq); - WALK_LIST2(eh, n, tab->exports, n) + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) { - if (atomic_load_explicit(&eh->export_state, memory_order_acquire) != TES_READY) + if (atomic_load_explicit(&eh->h.export_state, memory_order_acquire) != TES_READY) continue; struct rt_pending_export *last = atomic_load_explicit(&eh->last_export, memory_order_acquire); @@ -2424,45 +3119,47 @@ rt_export_cleanup(void *data) } } - while (first_export && (first_export->seq <= min_seq)) + while (first && (first->seq <= min_seq)) { - ASSERT_DIE(first_export->new || first_export->old); + ASSERT_DIE(first->new || first->old); - const net_addr *n = first_export->new ? - first_export->new->rte.net : - first_export->old->rte.net; + const net_addr *n = first->new ? + first->new->rte.net : + first->old->rte.net; net *net = SKIP_BACK(struct network, n.addr, (net_addr (*)[0]) n); - ASSERT_DIE(net->first == first_export); - - if (first_export == net->last) + ASSERT_DIE(net->first == first); + + if (first == net->last) /* The only export here */ net->last = net->first = NULL; else /* First is now the next one */ - net->first = atomic_load_explicit(&first_export->next, memory_order_relaxed); + net->first = atomic_load_explicit(&first->next, memory_order_relaxed); + + want_prune += !net->routes && !net->first; /* For now, the old route may be finally freed */ - if (first_export->old) + if (first->old) { - rt_rte_trace_in(D_ROUTES, first_export->old->rte.sender->req, &first_export->old->rte, "freed"); - hmap_clear(&tab->id_map, first_export->old->rte.id); - rte_free(first_export->old, tab); + rt_rte_trace_in(D_ROUTES, first->old->rte.sender->req, &first->old->rte, "freed"); + hmap_clear(&tab->id_map, first->old->rte.id); + rte_free(first->old); } #ifdef LOCAL_DEBUG - memset(first_export, 0xbd, sizeof(struct rt_pending_export)); + memset(first, 0xbd, sizeof(struct rt_pending_export)); #endif - struct rt_export_block *reb = HEAD(tab->pending_exports); - ASSERT_DIE(reb == PAGE_HEAD(first_export)); + struct rt_export_block *reb = HEAD(tab->exporter.pending); + ASSERT_DIE(reb == PAGE_HEAD(first)); - u32 pos = (first_export - &reb->export[0]); + u32 pos = (first - &reb->export[0]); u32 end = atomic_load_explicit(&reb->end, memory_order_relaxed); ASSERT_DIE(pos < end); struct rt_pending_export *next = NULL; - + if (++pos < end) next = &reb->export[pos]; else @@ -2473,66 +3170,160 @@ rt_export_cleanup(void *data) memset(reb, 0xbe, page_size); #endif - free_page(tab->rp, reb); + free_page(reb); - if (EMPTY_LIST(tab->pending_exports)) + if (EMPTY_LIST(tab->exporter.pending)) { - if (config->table_debug) - log(L_TRACE "%s: Resetting export seq", tab->name); + rt_trace(tab, D_EVENTS, "Resetting export seq"); node *n; - WALK_LIST2(eh, n, tab->exports, n) + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) { - if (atomic_load_explicit(&eh->export_state, memory_order_acquire) != TES_READY) + if (atomic_load_explicit(&eh->h.export_state, memory_order_acquire) != TES_READY) continue; ASSERT_DIE(atomic_load_explicit(&eh->last_export, memory_order_acquire) == NULL); - bmap_reset(&eh->seq_map, 1024); + bmap_reset(&eh->h.seq_map, 1024); } - tab->next_export_seq = 1; + tab->exporter.next_seq = 1; } else { - reb = HEAD(tab->pending_exports); + reb = HEAD(tab->exporter.pending); next = &reb->export[0]; } } - first_export = next; + first = next; } + rt_check_cork_low(tab); + done:; struct rt_import_hook *ih; node *x; WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) if (ih->import_state == TIS_WAITING) - if (!first_export || (first_export->seq >= ih->flush_seq)) + if (!first || (first->seq >= ih->flush_seq)) { ih->import_state = TIS_CLEARED; - ev_send(ih->req->list, ih->export_announce_event); + ev_send(ih->req->list, &ih->announce_event); } - if (EMPTY_LIST(tab->pending_exports) && ev_active(tab->announce_event)) - ev_postpone(tab->announce_event); + if ((tab->gc_counter += want_prune) >= tab->config->gc_threshold) + rt_kick_prune_timer(tab); + + if (tab->export_used) + birdloop_flag(tab->loop, RTF_CLEANUP); + + if (EMPTY_LIST(tab->exporter.pending)) + settle_cancel(&tab->export_settle); +} + +static void +rt_cork_release_hook(void *data UNUSED) +{ + do synchronize_rcu(); + while ( + !atomic_load_explicit(&rt_cork.active, memory_order_acquire) && + ev_run_list(&rt_cork.queue) + ); +} + +/** + * rt_lock_trie - lock a prefix trie of a routing table + * @tab: routing table with prefix trie to be locked + * + * The prune loop may rebuild the prefix trie and invalidate f_trie_walk_state + * structures. Therefore, asynchronous walks should lock the prefix trie using + * this function. That allows the prune loop to rebuild the trie, but postpones + * its freeing until all walks are done (unlocked by rt_unlock_trie()). + * + * Return a current trie that will be locked, the value should be passed back to + * rt_unlock_trie() for unlocking. + * + */ +struct f_trie * +rt_lock_trie(struct rtable_private *tab) +{ + ASSERT(tab->trie); + + tab->trie_lock_count++; + return tab->trie; +} + +/** + * rt_unlock_trie - unlock a prefix trie of a routing table + * @tab: routing table with prefix trie to be locked + * @trie: value returned by matching rt_lock_trie() + * + * Done for trie locked by rt_lock_trie() after walk over the trie is done. + * It may free the trie and schedule next trie pruning. + */ +void +rt_unlock_trie(struct rtable_private *tab, struct f_trie *trie) +{ + ASSERT(trie); - /* If reduced to at most one export block pending */ - if (tab->cork_active && - ((!tab->first_export) || (tab->first_export->seq + 128 > tab->next_export_seq))) + if (trie == tab->trie) { - tab->cork_active = 0; - ev_uncork(&rt_cork); - if (config->table_debug) - log(L_TRACE "%s: cork released", tab->name); + /* Unlock the current prefix trie */ + ASSERT(tab->trie_lock_count); + tab->trie_lock_count--; } + else if (trie == tab->trie_old) + { + /* Unlock the old prefix trie */ + ASSERT(tab->trie_old_lock_count); + tab->trie_old_lock_count--; + + /* Free old prefix trie that is no longer needed */ + if (!tab->trie_old_lock_count) + { + rfree(tab->trie_old->lp); + tab->trie_old = NULL; + + /* Kick prefix trie pruning that was postponed */ + if (tab->trie && (tab->trie->prefix_count > (2 * tab->fib.entries))) + { + tab->prune_trie = 1; + rt_schedule_prune(tab); + } + } + } + else + log(L_BUG "Invalid arg to rt_unlock_trie()"); } + void rt_preconfig(struct config *c) { init_list(&c->tables); - rt_new_table(cf_get_symbol("master4"), NET_IP4); - rt_new_table(cf_get_symbol("master6"), NET_IP6); + c->def_tables[NET_IP4] = cf_define_symbol(cf_get_symbol("master4"), SYM_TABLE, table, NULL); + c->def_tables[NET_IP6] = cf_define_symbol(cf_get_symbol("master6"), SYM_TABLE, table, NULL); +} + +void +rt_postconfig(struct config *c) +{ + uint num_tables = list_length(&c->tables); + btime def_gc_period = 400 MS * num_tables; + def_gc_period = MAX(def_gc_period, 10 S); + def_gc_period = MIN(def_gc_period, 600 S); + + struct rtable_config *rc; + WALK_LIST(rc, c->tables) + if (rc->gc_period == (uint) -1) + rc->gc_period = (uint) def_gc_period; + + for (uint net_type = 0; net_type < NET_MAX; net_type++) + if (c->def_tables[net_type] && !c->def_tables[net_type]->table) + { + c->def_tables[net_type]->class = SYM_VOID; + c->def_tables[net_type] = NULL; + } } @@ -2541,180 +3332,453 @@ rt_preconfig(struct config *c) * triggered by rt_schedule_nhu(). */ -static inline int -rta_next_hop_outdated(rta *a) +void +ea_set_hostentry(ea_list **to, rtable *dep, rtable *src, ip_addr gw, ip_addr ll, u32 lnum, u32 labels[lnum]) { - struct hostentry *he = a->hostentry; - - if (!he) - return 0; - - if (!he->src) - return a->dest != RTD_UNREACHABLE; - - return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) || - (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh)); + struct { + struct adata ad; + struct hostentry *he; + u32 labels[lnum]; + } *head = (void *) tmp_alloc_adata(sizeof *head - sizeof(struct adata)); + + RT_LOCKED(src, tab) + head->he = rt_get_hostentry(tab, gw, ll, dep); + memcpy(head->labels, labels, lnum * sizeof(u32)); + + ea_set_attr(to, EA_LITERAL_DIRECT_ADATA( + &ea_gen_hostentry, 0, &head->ad)); } -void -rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls, linpool *lp) + +static void +rta_apply_hostentry(ea_list **to, struct hostentry_adata *head) { - a->hostentry = he; - a->dest = he->dest; - a->igp_metric = he->igp_metric; + struct hostentry *he = head->he; + u32 *labels = head->labels; + u32 lnum = (u32 *) (head->ad.data + head->ad.length) - labels; + + ea_set_attr_u32(to, &ea_gen_igp_metric, 0, he->igp_metric); - if (a->dest != RTD_UNICAST) + if (!he->src) { - /* No nexthop */ -no_nexthop: - a->nh = (struct nexthop) {}; - if (mls) - { /* Store the label stack for later changes */ - a->nh.labels_orig = a->nh.labels = mls->len; - memcpy(a->nh.label, mls->stack, mls->len * sizeof(u32)); - } + ea_set_dest(to, 0, RTD_UNREACHABLE); return; } - if (((!mls) || (!mls->len)) && he->nexthop_linkable) + eattr *he_nh_ea = ea_find(he->src, &ea_gen_nexthop); + ASSERT_DIE(he_nh_ea); + + struct nexthop_adata *nhad = (struct nexthop_adata *) he_nh_ea->u.ptr; + int idest = nhea_dest(he_nh_ea); + + if ((idest != RTD_UNICAST) || + !lnum && he->nexthop_linkable) { /* Just link the nexthop chain, no label append happens. */ - memcpy(&(a->nh), &(he->src->nh), nexthop_size(&(he->src->nh))); + ea_copy_attr(to, he->src, &ea_gen_nexthop); return; } - struct nexthop *nhp = NULL, *nhr = NULL; - int skip_nexthop = 0; + uint total_size = OFFSETOF(struct nexthop_adata, nh); - for (struct nexthop *nh = &(he->src->nh); nh; nh = nh->next) + NEXTHOP_WALK(nh, nhad) { - if (skip_nexthop) - skip_nexthop--; - else + if (nh->labels + lnum > MPLS_MAX_LABEL_STACK) { - nhr = nhp; - nhp = (nhp ? (nhp->next = lp_alloc(lp, NEXTHOP_MAX_SIZE)) : &(a->nh)); + log(L_WARN "Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)", + nh->labels, lnum, nh->labels + lnum, MPLS_MAX_LABEL_STACK); + continue; } - memset(nhp, 0, NEXTHOP_MAX_SIZE); - nhp->iface = nh->iface; - nhp->weight = nh->weight; + total_size += NEXTHOP_SIZE_CNT(nh->labels + lnum); + } - if (mls) - { - nhp->labels = nh->labels + mls->len; - nhp->labels_orig = mls->len; - if (nhp->labels <= MPLS_MAX_LABEL_STACK) - { - memcpy(nhp->label, nh->label, nh->labels * sizeof(u32)); /* First the hostentry labels */ - memcpy(&(nhp->label[nh->labels]), mls->stack, mls->len * sizeof(u32)); /* Then the bottom labels */ - } - else - { - log(L_WARN "Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)", - nh->labels, mls->len, nhp->labels, MPLS_MAX_LABEL_STACK); - skip_nexthop++; - continue; - } - } - else if (nh->labels) + if (total_size == OFFSETOF(struct nexthop_adata, nh)) + { + log(L_WARN "No valid nexthop remaining, setting route unreachable"); + + struct nexthop_adata nha = { + .ad.length = NEXTHOP_DEST_SIZE, + .dest = RTD_UNREACHABLE, + }; + + ea_set_attr_data(to, &ea_gen_nexthop, 0, &nha.ad.data, nha.ad.length); + return; + } + + struct nexthop_adata *new = (struct nexthop_adata *) tmp_alloc_adata(total_size); + struct nexthop *dest = &new->nh; + + NEXTHOP_WALK(nh, nhad) + { + if (nh->labels + lnum > MPLS_MAX_LABEL_STACK) + continue; + + memcpy(dest, nh, NEXTHOP_SIZE(nh)); + if (lnum) { - nhp->labels = nh->labels; - nhp->labels_orig = 0; - memcpy(nhp->label, nh->label, nh->labels * sizeof(u32)); + memcpy(&(dest->label[dest->labels]), labels, lnum * sizeof labels[0]); + dest->labels += lnum; } if (ipa_nonzero(nh->gw)) - { - nhp->gw = nh->gw; /* Router nexthop */ - nhp->flags |= (nh->flags & RNF_ONLINK); - } + /* Router nexthop */ + dest->flags = (dest->flags & RNF_ONLINK); else if (!(nh->iface->flags & IF_MULTIACCESS) || (nh->iface->flags & IF_LOOPBACK)) - nhp->gw = IPA_NONE; /* PtP link - no need for nexthop */ + dest->gw = IPA_NONE; /* PtP link - no need for nexthop */ else if (ipa_nonzero(he->link)) - nhp->gw = he->link; /* Device nexthop with link-local address known */ + dest->gw = he->link; /* Device nexthop with link-local address known */ else - nhp->gw = he->addr; /* Device nexthop with link-local address unknown */ + dest->gw = he->addr; /* Device nexthop with link-local address unknown */ + + dest = NEXTHOP_NEXT(dest); } - if (skip_nexthop) - if (nhr) - nhr->next = NULL; - else + /* Fix final length */ + new->ad.length = (void *) dest - (void *) new->ad.data; + ea_set_attr(to, EA_LITERAL_DIRECT_ADATA( + &ea_gen_nexthop, 0, &new->ad)); +} + +static inline struct hostentry_adata * +rta_next_hop_outdated(ea_list *a) +{ + /* First retrieve the hostentry */ + eattr *heea = ea_find(a, &ea_gen_hostentry); + if (!heea) + return NULL; + + struct hostentry_adata *head = (struct hostentry_adata *) heea->u.ptr; + + /* If no nexthop is present, we have to create one */ + eattr *a_nh_ea = ea_find(a, &ea_gen_nexthop); + if (!a_nh_ea) + return head; + + struct nexthop_adata *nhad = (struct nexthop_adata *) a_nh_ea->u.ptr; + + /* Shortcut for unresolvable hostentry */ + if (!head->he->src) + return NEXTHOP_IS_REACHABLE(nhad) ? head : NULL; + + /* Comparing our nexthop with the hostentry nexthop */ + eattr *he_nh_ea = ea_find(head->he->src, &ea_gen_nexthop); + + return ( + (ea_get_int(a, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN) != head->he->igp_metric) || + (!head->he->nexthop_linkable) || + (!he_nh_ea != !a_nh_ea) || + (he_nh_ea && a_nh_ea && !adata_same(he_nh_ea->u.ptr, a_nh_ea->u.ptr))) + ? head : NULL; +} + +static inline int +rt_next_hop_update_rte(rte *old, rte *new) +{ + struct hostentry_adata *head = rta_next_hop_outdated(old->attrs); + if (!head) + return 0; + + *new = *old; + rta_apply_hostentry(&new->attrs, head); + return 1; +} + +static inline void +rt_next_hop_resolve_rte(rte *r) +{ + eattr *heea = ea_find(r->attrs, &ea_gen_hostentry); + if (!heea) + return; + + struct hostentry_adata *head = (struct hostentry_adata *) heea->u.ptr; + + rta_apply_hostentry(&r->attrs, head); +} + +#ifdef CONFIG_BGP + +static inline int +net_flow_has_dst_prefix(const net_addr *n) +{ + ASSUME(net_is_flow(n)); + + if (n->pxlen) + return 1; + + if (n->type == NET_FLOW4) + { + const net_addr_flow4 *n4 = (void *) n; + return (n4->length > sizeof(net_addr_flow4)) && (n4->data[0] == FLOW_TYPE_DST_PREFIX); + } + else + { + const net_addr_flow6 *n6 = (void *) n; + return (n6->length > sizeof(net_addr_flow6)) && (n6->data[0] == FLOW_TYPE_DST_PREFIX); + } +} + +static inline int +rta_as_path_is_empty(ea_list *a) +{ + eattr *e = ea_find(a, "bgp_path"); + return !e || (as_path_getlen(e->u.ptr) == 0); +} + +static inline u32 +rta_get_first_asn(ea_list *a) +{ + eattr *e = ea_find(a, "bgp_path"); + u32 asn; + + return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0; +} + +static inline enum flowspec_valid +rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, ea_list *a, int interior) +{ + ASSERT(rt_is_ip(tab_ip)); + ASSERT(rt_is_flow(tab_flow)); + + /* RFC 8955 6. a) Flowspec has defined dst prefix */ + if (!net_flow_has_dst_prefix(n)) + return FLOWSPEC_INVALID; + + /* RFC 9117 4.1. Accept AS_PATH is empty (fr */ + if (interior && rta_as_path_is_empty(a)) + return FLOWSPEC_VALID; + + + /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */ + + /* Find flowspec dst prefix */ + net_addr dst; + if (n->type == NET_FLOW4) + net_fill_ip4(&dst, net4_prefix(n), net4_pxlen(n)); + else + net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n)); + + rte rb = {}; + net_addr_union nau; + RT_LOCKED(tab_ip, tip) + { + ASSERT(tip->trie); + /* Find best-match BGP unicast route for flowspec dst prefix */ + net *nb = net_route(tip, &dst); + if (nb) + { + rb = RTE_COPY_VALID(nb->routes); + rta_clone(rb.attrs); + net_copy(&nau.n, nb->n.addr); + rb.net = &nau.n; + } + } + + /* Register prefix to trie for tracking further changes */ + int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH; + RT_LOCKED(tab_flow, tfl) + trie_add_prefix(tfl->flowspec_trie, &dst, (rb.net ? rb.net->pxlen : 0), max_pxlen); + + /* No best-match BGP route -> no flowspec */ + if (!rb.attrs || (rt_get_source_attr(&rb) != RTS_BGP)) + return FLOWSPEC_INVALID; + + /* Find ORIGINATOR_ID values */ + u32 orig_a = ea_get_int(a, "bgp_originator_id", 0); + u32 orig_b = ea_get_int(rb.attrs, "bgp_originator_id", 0); + + /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */ + if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal( + ea_get_ip(a, &ea_gen_from, IPA_NONE), + ea_get_ip(rb.attrs, &ea_gen_from, IPA_NONE) + ))) + return FLOWSPEC_INVALID; + + + /* Find ASN of the best-match route, for use in next checks */ + u32 asn_b = rta_get_first_asn(rb.attrs); + if (!asn_b) + return FLOWSPEC_INVALID; + + /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */ + if (!interior && (rta_get_first_asn(a) != asn_b)) + return FLOWSPEC_INVALID; + + /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */ + RT_LOCKED(tab_ip, tip) + { + TRIE_WALK(tip->trie, subnet, &dst) { - a->dest = RTD_UNREACHABLE; - log(L_WARN "No valid nexthop remaining, setting route unreachable"); - goto no_nexthop; + net *nc = net_find_valid(tip, &subnet); + if (!nc) + continue; + + const rte *rc = &nc->routes->rte; + if (rt_get_source_attr(rc) != RTS_BGP) + RT_RETURN(tip, FLOWSPEC_INVALID); + + if (rta_get_first_asn(rc->attrs) != asn_b) + RT_RETURN(tip, FLOWSPEC_INVALID); } + TRIE_WALK_END; + } + + return FLOWSPEC_VALID; } -static inline struct rte_storage * -rt_next_hop_update_rte(rtable_private *tab, net *n, rte *old) +#endif /* CONFIG_BGP */ + +static int +rt_flowspec_update_rte(rtable *tab, rte *r, rte *new) { - rta *a = alloca(RTA_MAX_SIZE); - memcpy(a, old->attrs, rta_size(old->attrs)); +#ifdef CONFIG_BGP + if (r->generation || (rt_get_source_attr(r) != RTS_BGP)) + return 0; + + struct bgp_channel *bc = (struct bgp_channel *) SKIP_BACK(struct channel, in_req, r->sender->req); + if (!bc->base_table) + return 0; - mpls_label_stack mls = { .len = a->nh.labels_orig }; - memcpy(mls.stack, &a->nh.label[a->nh.labels - mls.len], mls.len * sizeof(u32)); + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, p, bc->c.proto); - rta_apply_hostentry(a, old->attrs->hostentry, &mls, tab->nhu_lp); - a->cached = 0; + enum flowspec_valid old = rt_get_flowspec_valid(r), + valid = rt_flowspec_check(bc->base_table, tab, r->net, r->attrs, p->is_interior); - rte e0 = *old; - e0.attrs = a; + if (old == valid) + return 0; - return rte_store(&e0, n, tab); + *new = *r; + ea_set_attr_u32(&new->attrs, &ea_gen_flowspec_valid, 0, valid); + return 1; +#else + return 0; +#endif +} + +static inline void +rt_flowspec_resolve_rte(rte *r, struct channel *c) +{ +#ifdef CONFIG_BGP + enum flowspec_valid valid, old = rt_get_flowspec_valid(r); + struct bgp_channel *bc = (struct bgp_channel *) c; + + if ( (rt_get_source_attr(r) == RTS_BGP) + && (c->channel == &channel_bgp) + && (bc->base_table)) + { + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, p, bc->c.proto); + valid = rt_flowspec_check( + bc->base_table, + c->in_req.hook->table, + r->net, r->attrs, p->is_interior); + } + else + valid = FLOWSPEC_UNKNOWN; + + if (valid == old) + return; + + if (valid == FLOWSPEC_UNKNOWN) + ea_unset_attr(&r->attrs, 0, &ea_gen_flowspec_valid); + else + ea_set_attr_u32(&r->attrs, &ea_gen_flowspec_valid, 0, valid); +#endif } static inline int -rt_next_hop_update_net(rtable_private *tab, net *n) +rt_next_hop_update_net(struct rtable_private *tab, net *n) { - struct rte_storage *new; - int count = 0; + uint count = 0; + int is_flow = net_is_flow(n->n.addr); struct rte_storage *old_best = n->routes; if (!old_best) return 0; for (struct rte_storage *e, **k = &n->routes; e = *k; k = &e->next) - if (rta_next_hop_outdated(e->rte.attrs)) - count++; + count++; if (!count) return 0; struct rte_multiupdate { - struct rte_storage *old, *new; - } *updates = alloca(sizeof(struct rte_multiupdate) * count); + struct rte_storage *old, *new_stored; + rte new; + } *updates = tmp_allocz(sizeof(struct rte_multiupdate) * (count+1)); - int pos = 0; + struct rt_pending_export *last_pending = n->last; + + uint pos = 0; for (struct rte_storage *e, **k = &n->routes; e = *k; k = &e->next) - if (rta_next_hop_outdated(e->rte.attrs)) - { - struct rte_storage *new = rt_next_hop_update_rte(tab, n, &e->rte); + updates[pos++].old = e; + + /* This is an exceptional place where table can be unlocked while keeping its data: + * the reason why this is safe is that NHU must be always run from the same + * thread as cleanup routines, therefore the only real problem may arise when + * some importer does a change on this particular net (destination) while NHU + * is being computed. Statistically, this should almost never happen. In such + * case, we just drop all the computed changes and do it once again. + * */ + RT_UNLOCK(tab); - /* Call a pre-comparison hook */ - /* Not really an efficient way to compute this */ - if (e->rte.src->owner->rte_recalculate) - e->rte.src->owner->rte_recalculate(tab, n, &new->rte, &e->rte, &old_best->rte); + uint mod = 0; + if (is_flow) + for (uint i = 0; i < pos; i++) + mod += rt_flowspec_update_rte(RT_PUB(tab), &updates[i].old->rte, &updates[i].new); - updates[pos++] = (struct rte_multiupdate) { - .old = e, - .new = new, - }; + else + for (uint i = 0; i < pos; i++) + mod += rt_next_hop_update_rte(&updates[i].old->rte, &updates[i].new); + + RT_LOCK(RT_PUB(tab)); + + if (!mod) + return 0; - /* Replace the route in the list */ - new->next = e->next; - *k = e = new; + /* Something has changed inbetween, retry NHU. */ + if (last_pending != n->last) + return rt_next_hop_update_net(tab, n); + /* Now we reconstruct the original linked list */ + struct rte_storage **nptr = &n->routes; + for (uint i = 0; i < pos; i++) + { + updates[i].old->next = NULL; + + struct rte_storage *put; + if (updates[i].new.attrs) + put = updates[i].new_stored = rte_store(&updates[i].new, n, tab); + else + put = updates[i].old; + + *nptr = put; + nptr = &put->next; + } + *nptr = NULL; + + /* Call the pre-comparison hooks */ + for (uint i = 0; i < pos; i++) + if (updates[i].new_stored) + { /* Get a new ID for the route */ - new->rte.lastmod = current_time(); - new->rte.id = hmap_first_zero(&tab->id_map); - hmap_set(&tab->id_map, new->rte.id); + updates[i].new_stored->rte.lastmod = current_time(); + updates[i].new_stored->rte.id = hmap_first_zero(&tab->id_map); + hmap_set(&tab->id_map, updates[i].new_stored->rte.id); - lp_flush(tab->nhu_lp); + /* Call a pre-comparison hook */ + /* Not really an efficient way to compute this */ + if (updates[i].old->rte.src->owner->rte_recalculate) + updates[i].old->rte.src->owner->rte_recalculate(tab, n, &updates[i].new_stored->rte, &updates[i].old->rte, &old_best->rte); } - ASSERT_DIE(pos == count); +#if DEBUGGING + { + uint t = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + t++; + ASSERT_DIE(t == pos); + ASSERT_DIE(pos == count); + } +#endif /* Find the new best route */ struct rte_storage **new_best = NULL; @@ -2725,7 +3789,7 @@ rt_next_hop_update_net(rtable_private *tab, net *n) } /* Relink the new best route to the first position */ - new = *new_best; + struct rte_storage *new = *new_best; if (new != n->routes) { *new_best = new->next; @@ -2733,95 +3797,166 @@ rt_next_hop_update_net(rtable_private *tab, net *n) n->routes = new; } + uint total = 0; /* Announce the changes */ - for (int i=0; i<count; i++) + for (uint i=0; i<count; i++) { - _Bool nb = (new == updates[i].new), ob = (old_best == updates[i].old); + if (!updates[i].new_stored) + continue; + + _Bool nb = (new->rte.src == updates[i].new.src), ob = (i == 0); const char *best_indicator[2][2] = { { "autoupdated", "autoupdated [-best]" }, { "autoupdated [+best]", "autoupdated [best]" } }; - rt_rte_trace_in(D_ROUTES, updates[i].new->rte.sender->req, &updates[i].new->rte, best_indicator[nb][ob]); - rte_announce(tab, n, updates[i].new, updates[i].old, new, old_best); + rt_rte_trace_in(D_ROUTES, updates[i].new.sender->req, &updates[i].new, best_indicator[nb][ob]); + rte_announce(tab, n, updates[i].new_stored, updates[i].old, new, old_best); + + total++; } - return count; + return total; } static void -rt_next_hop_update(void *data) +rt_nhu_uncork(void *_tab) +{ + RT_LOCKED((rtable *) _tab, tab) + { + ASSERT_DIE(tab->nhu_corked); + ASSERT_DIE(tab->nhu_state == 0); + + /* Reset the state */ + tab->nhu_state = tab->nhu_corked; + tab->nhu_corked = 0; + rt_trace(tab, D_STATES, "Next hop updater uncorked"); + + birdloop_flag(tab->loop, RTF_NHU); + } +} + +static void +rt_next_hop_update(struct rtable_private *tab) { - rtable_private *tab = data; ASSERT_DIE(birdloop_inside(tab->loop)); - struct fib_iterator *fit = &tab->nhu_fit; - int max_feed = 32; + if (tab->nhu_corked) + return; - if (atomic_load_explicit(&tab->nhu_state, memory_order_acquire) == NHU_CLEAN) + if (!tab->nhu_state) return; - rt_lock_table(tab); + /* Check corkedness */ + if (rt_cork_check(tab->nhu_uncork_event)) + { + rt_trace(tab, D_STATES, "Next hop updater corked"); + if ((tab->nhu_state & NHU_RUNNING) + && !EMPTY_LIST(tab->exporter.pending)) + rt_kick_export_settle(tab); - if (atomic_load_explicit(&tab->nhu_state, memory_order_acquire) == NHU_SCHEDULED) - { - FIB_ITERATE_INIT(fit, &tab->fib); - ASSERT_DIE(atomic_exchange_explicit(&tab->nhu_state, NHU_RUNNING, memory_order_acq_rel) == NHU_SCHEDULED); - } + tab->nhu_corked = tab->nhu_state; + tab->nhu_state = 0; + return; + } + + struct fib_iterator *fit = &tab->nhu_fit; + int max_feed = 32; + + /* Initialize a new run */ + if (tab->nhu_state == NHU_SCHEDULED) + { + FIB_ITERATE_INIT(fit, &tab->fib); + tab->nhu_state = NHU_RUNNING; + if (tab->flowspec_trie) + rt_flowspec_reset_trie(tab); + } + + /* Walk the fib one net after another */ FIB_ITERATE_START(&tab->fib, fit, net, n) { if (max_feed <= 0) { FIB_ITERATE_PUT(fit); - ev_send_loop(tab->loop, tab->nhu_event); - rt_unlock_table(tab); + birdloop_flag(tab->loop, RTF_NHU); return; } + lp_state lps; + lp_save(tmp_linpool, &lps); max_feed -= rt_next_hop_update_net(tab, n); + lp_restore(tmp_linpool, &lps); } FIB_ITERATE_END; + /* Finished NHU, cleanup */ + rt_trace(tab, D_EVENTS, "NHU done, scheduling export timer"); + rt_kick_export_settle(tab); + /* State change: * NHU_DIRTY -> NHU_SCHEDULED * NHU_RUNNING -> NHU_CLEAN */ - if (atomic_fetch_and_explicit(&tab->nhu_state, NHU_SCHEDULED, memory_order_acq_rel) != NHU_RUNNING) - ev_send_loop(tab->loop, tab->nhu_event); + if ((tab->nhu_state &= NHU_SCHEDULED) == NHU_SCHEDULED) + birdloop_flag(tab->loop, RTF_NHU); +} - ev_send_loop(tab->loop, tab->announce_event); +void +rt_new_default_table(struct symbol *s) +{ + for (uint addr_type = 0; addr_type < NET_MAX; addr_type++) + if (s == new_config->def_tables[addr_type]) + { + s->table = rt_new_table(s, addr_type); + return; + } - rt_unlock_table(tab); + bug("Requested an unknown new default table: %s", s->name); } +struct rtable_config * +rt_get_default_table(struct config *cf, uint addr_type) +{ + struct symbol *ts = cf->def_tables[addr_type]; + if (!ts) + return NULL; + + if (!ts->table) + rt_new_default_table(ts); + + return ts->table; +} struct rtable_config * rt_new_table(struct symbol *s, uint addr_type) { - /* Hack that allows to 'redefine' the master table */ - if ((s->class == SYM_TABLE) && - (s->table == new_config->def_tables[addr_type]) && - ((addr_type == NET_IP4) || (addr_type == NET_IP6))) - return s->table; - struct rtable_config *c = cfg_allocz(sizeof(struct rtable_config)); - cf_define_symbol(s, SYM_TABLE, table, c); + if (s == new_config->def_tables[addr_type]) + s->table = c; + else + cf_define_symbol(s, SYM_TABLE, table, c); + c->name = s->name; c->addr_type = addr_type; - c->gc_max_ops = 1000; - c->gc_min_time = 5; - c->min_settle_time = 1 S; - c->max_settle_time = 20 S; - c->min_rr_settle_time = 30 S; - c->max_rr_settle_time = 90 S; - c->cork_limit = 4 * page_size / sizeof(struct rt_pending_export); - c->config = new_config; + c->gc_threshold = 1000; + c->gc_period = (uint) -1; /* set in rt_postconfig() */ + c->cork_threshold.low = 128; + c->cork_threshold.high = 512; + c->export_settle = (struct settle_config) { + .min = 1 MS, + .max = 100 MS, + }; + c->export_rr_settle = (struct settle_config) { + .min = 100 MS, + .max = 3 S, + }; + c->debug = new_config->table_debug; add_tail(&new_config->tables, &c->n); /* First table of each type is kept as default */ if (! new_config->def_tables[addr_type]) - new_config->def_tables[addr_type] = c; + new_config->def_tables[addr_type] = s; return c; } @@ -2835,23 +3970,12 @@ rt_new_table(struct symbol *s, uint addr_type) * configuration. */ void -rt_lock_table(rtable_private *r) +rt_lock_table_priv(struct rtable_private *r, const char *file, uint line) { + rt_trace(r, D_STATES, "Locked at %s:%d", file, line); r->use_count++; } -static void -rt_loop_stopped(void *data) -{ - rtable_private *r = data; - birdloop_free(r->loop); - r->loop = NULL; - r->prune_event->list = r->ec_event->list = NULL; - r->nhu_event->list = r->hcu_event->list = NULL; - r->announce_event->list = NULL; - ev_send(r->delete_event->list, r->delete_event); -} - /** * rt_unlock_table - unlock a routing table * @r: routing table to be unlocked @@ -2861,40 +3985,106 @@ rt_loop_stopped(void *data) * for deletion by configuration changes. */ void -rt_unlock_table(rtable_private *r) +rt_unlock_table_priv(struct rtable_private *r, const char *file, uint line) { - if (!--r->use_count && r->delete_event && - !r->prune_state && !atomic_load_explicit(&r->nhu_state, memory_order_acquire)) - /* Delete the routing table by freeing its pool */ - birdloop_stop_self(r->loop, rt_loop_stopped, r); + rt_trace(r, D_STATES, "Unlocked at %s:%d", file, line); + if (!--r->use_count && r->deleted) + /* Stop the service thread to finish this up */ + ev_send(&global_event_list, ev_new_init(r->rp, rt_shutdown, r)); } -static struct rtable_config * -rt_find_table_config(struct config *cf, char *name) +static void +rt_shutdown(void *tab_) { - struct symbol *sym = cf_find_symbol(cf, name); - return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL; + struct rtable_private *r = tab_; + birdloop_stop(r->loop, rt_delete, r); } static void -rt_done(void *data) +rt_delete(void *tab_) { - rtable_private *t = data; - ASSERT_DIE(t->loop == NULL); + birdloop_enter(&main_birdloop); + + /* We assume that nobody holds the table reference now as use_count is zero. + * Anyway the last holder may still hold the lock. Therefore we lock and + * unlock it the last time to be sure that nobody is there. */ + struct rtable_private *tab = RT_LOCK((rtable *) tab_); + struct config *conf = tab->deleted; - struct rtable_config *tc = t->config; - struct config *c = tc->config; + RT_UNLOCK(RT_PUB(tab)); - tc->table = NULL; - rem_node(&t->n); + rfree(tab->rp); + config_del_obstacle(conf); - if (t->hostcache) - rt_free_hostcache(t); + birdloop_leave(&main_birdloop); +} + + +static void +rt_check_cork_low(struct rtable_private *tab) +{ + if (!tab->cork_active) + return; + + if (tab->deleted || !tab->exporter.first || (tab->exporter.first->seq + tab->cork_threshold.low > tab->exporter.next_seq)) + { + tab->cork_active = 0; + rt_cork_release(); - rfree(t->delete_event); - rfree(t->rp); + rt_trace(tab, D_STATES, "Uncorked"); + } +} - config_del_obstacle(c); +static void +rt_check_cork_high(struct rtable_private *tab) +{ + if (!tab->deleted && !tab->cork_active && tab->exporter.first && (tab->exporter.first->seq + tab->cork_threshold.high <= tab->exporter.next_seq)) + { + tab->cork_active = 1; + rt_cork_acquire(); + + rt_trace(tab, D_STATES, "Corked"); + } +} + + +static int +rt_reconfigure(struct rtable_private *tab, struct rtable_config *new, struct rtable_config *old) +{ + if ((new->addr_type != old->addr_type) || + (new->sorted != old->sorted) || + (new->trie_used != old->trie_used)) + return 0; + + DBG("\t%s: same\n", new->name); + new->table = RT_PUB(tab); + tab->name = new->name; + tab->config = new; + + if (tab->hostcache) + tab->hostcache->req.trace_routes = new->debug; + + struct rt_table_export_hook *hook; node *n; + WALK_LIST2(hook, n, tab->exporter.e.hooks, h.n) + if (hook->h.req->export_one == rt_flowspec_export_one) + hook->h.req->trace_routes = new->debug; + + tab->cork_threshold = new->cork_threshold; + + if (new->cork_threshold.high != old->cork_threshold.high) + rt_check_cork_high(tab); + + if (new->cork_threshold.low != old->cork_threshold.low) + rt_check_cork_low(tab); + + return 1; +} + +static struct rtable_config * +rt_find_table_config(struct config *cf, char *name) +{ + struct symbol *sym = cf_find_symbol(cf, name); + return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL; } /** @@ -2919,31 +4109,37 @@ rt_commit(struct config *new, struct config *old) { WALK_LIST(o, old->tables) { - RT_LOCK(o->table); - rtable_private *ot = RT_PRIV(o->table); - if (!ot->delete_event) - { - r = rt_find_table_config(new, o->name); - if (r && (r->addr_type == o->addr_type) && !new->shutdown) - { - DBG("\t%s: same\n", o->name); - r->table = (rtable *) ot; - ot->name = r->name; - ot->config = r; - if (o->sorted != r->sorted) - log(L_WARN "Reconfiguration of rtable sorted flag not implemented"); - } - else - { - DBG("\t%s: deleted\n", o->name); - rt_lock_table(ot); - ot->delete_event = ev_new_init(&root_pool, rt_done, ot); - ot->delete_event->list = &global_event_list; - config_add_obstacle(old); - rt_unlock_table(ot); - } - } - RT_UNLOCK(o->table); + struct rtable_private *tab = RT_LOCK(o->table); + + if (tab->deleted) + { + RT_UNLOCK(tab); + continue; + } + + r = rt_find_table_config(new, o->name); + if (r && !new->shutdown && rt_reconfigure(tab, r, o)) + { + RT_UNLOCK(tab); + continue; + } + + DBG("\t%s: deleted\n", o->name); + tab->deleted = old; + config_add_obstacle(old); + rt_lock_table(tab); + + if (tab->hostcache) + { + rt_stop_export(&tab->hostcache->req, NULL); + if (ev_get_list(&tab->hostcache->update) == &rt_cork.queue) + ev_postpone(&tab->hostcache->update); + } + + rt_check_cork_low(tab); + rt_unlock_table(tab); + + RT_UNLOCK(tab); } } @@ -2957,8 +4153,86 @@ rt_commit(struct config *new, struct config *old) DBG("\tdone\n"); } +static void +rt_feed_done(struct rt_export_hook *c) +{ + c->event.hook = rt_export_hook; + + rt_set_export_state(c, TES_READY); + + rt_send_export_event(c); +} + +#define MAX_FEED_BLOCK 1024 +typedef struct { + uint cnt, pos; + union { + struct rt_pending_export *rpe; + struct { + rte **feed; + uint *start; + }; + }; +} rt_feed_block; + +static int +rt_prepare_feed(struct rt_table_export_hook *c, net *n, rt_feed_block *b) +{ + if (n->routes) + { + if (c->h.req->export_bulk) + { + uint cnt = rte_feed_count(n); + if (b->cnt && (b->cnt + cnt > MAX_FEED_BLOCK)) + return 0; + + if (!b->cnt) + { + b->feed = tmp_alloc(sizeof(rte *) * MAX(MAX_FEED_BLOCK, cnt)); + b->start = tmp_alloc(sizeof(uint) * ((cnt >= MAX_FEED_BLOCK) ? 2 : (MAX_FEED_BLOCK + 2 - cnt))); + } + + rte_feed_obtain(n, &b->feed[b->cnt], cnt); + b->start[b->pos++] = b->cnt; + b->cnt += cnt; + } + else if (b->pos == MAX_FEED_BLOCK) + return 0; + else + { + if (!b->pos) + b->rpe = tmp_alloc(sizeof(struct rt_pending_export) * MAX_FEED_BLOCK); + + b->rpe[b->pos++] = (struct rt_pending_export) { .new = n->routes, .new_best = n->routes }; + } + } + + rpe_mark_seen_all(&c->h, n->first, NULL); + return 1; +} + +static void +rt_process_feed(struct rt_table_export_hook *c, rt_feed_block *b) +{ + if (!b->pos) + return; + + if (c->h.req->export_bulk) + { + b->start[b->pos] = b->cnt; + for (uint p = 0; p < b->pos; p++) + { + rte **feed = &b->feed[b->start[p]]; + c->h.req->export_bulk(c->h.req, feed[0]->net, NULL, feed, b->start[p+1] - b->start[p]); + } + } + else + for (uint p = 0; p < b->pos; p++) + c->h.req->export_one(c->h.req, b->rpe[p].new->rte.net, &b->rpe[p]); +} + /** - * rt_feed_channel - advertise all routes to a channel + * rt_feed_by_fib - advertise all routes to a channel by walking a fib * @c: channel to be fed * * This function performs one pass of advertisement of routes to a channel that @@ -2967,119 +4241,148 @@ rt_commit(struct config *new, struct config *old) * order not to monopolize CPU time.) */ static void -rt_feed_channel(void *data) +rt_feed_by_fib(void *data) { - struct rt_export_hook *c = data; - + struct rt_table_export_hook *c = data; struct fib_iterator *fit = &c->feed_fit; - int max_feed = 256; - - rtable_private *tab; - if (c->export_state == TES_HUNGRY) - { - rt_set_export_state(c, TES_FEEDING); + rt_feed_block block = {}; - tab = RT_LOCK(c->table); - - struct rt_pending_export *rpe = rt_last_export(tab); - DBG("store hook=%p last_export=%p seq=%lu\n", c, rpe, rpe ? rpe->seq : 0); - atomic_store_explicit(&c->last_export, rpe, memory_order_relaxed); - - FIB_ITERATE_INIT(&c->feed_fit, &tab->fib); - } - else - tab = RT_LOCK(c->table); + ASSERT(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); - ASSERT_DIE(c->export_state == TES_FEEDING); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { -redo: FIB_ITERATE_START(&tab->fib, fit, net, n) { - if (max_feed <= 0) + if ((c->h.req->addr_mode == TE_ADDR_NONE) || net_in_netX(n->n.addr, c->h.req->addr)) + { + if (!rt_prepare_feed(c, n, &block)) { FIB_ITERATE_PUT(fit); - rt_send_export_event(c); - - RT_UNLOCK(c->table); + RT_UNLOCK(tab); + rt_process_feed(c, &block); + rt_send_export_event(&c->h); return; } - - if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_FEEDING) - { - RT_UNLOCK(c->table); - return; } + } + FIB_ITERATE_END; + } - if (!n->routes || !rte_is_valid(&n->routes->rte)) - ; /* if no route, do nothing */ - else if (c->req->export_bulk) - { - uint count = rte_feed_count(n); - if (count) - { - rte **feed = alloca(count * sizeof(rte *)); - rte_feed_obtain(n, feed, count); + rt_process_feed(c, &block); + rt_feed_done(&c->h); +} - struct rt_pending_export *rpe_last, *rpe_first = n->first; - for (struct rt_pending_export *rpe = rpe_first; rpe; rpe = rpe_next(rpe, NULL)) - rpe_last = rpe; +static void +rt_feed_by_trie(void *data) +{ + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; - FIB_ITERATE_PUT_NEXT(fit, &tab->fib); - RT_UNLOCK(c->table); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { - c->req->export_bulk(c->req, n->n.addr, NULL, feed, count); + ASSERT_DIE(c->walk_state); + struct f_trie_walk_state *ws = c->walk_state; - RT_LOCK(c->table); + ASSERT(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); - for (struct rt_pending_export *rpe = rpe_first; rpe; rpe = rpe_next(rpe, NULL)) - { - rpe_mark_seen(c, rpe); - if (rpe == rpe_last) - break; - ASSERT_DIE(rpe->seq < rpe_last->seq); - } + do { + if (!c->walk_last.type) + continue; - max_feed -= count; + net *n = net_find(tab, &c->walk_last); + if (!n) + continue; - goto redo; - } - } - else if (c->req->export_one) - { - struct rt_pending_export rpe = { .new = n->routes, .new_best = n->routes }; + if (!rt_prepare_feed(c, n, &block)) + { + RT_UNLOCK(tab); + rt_process_feed(c, &block); + rt_send_export_event(&c->h); + return; + } + } + while (trie_walk_next(ws, &c->walk_last)); - struct rt_pending_export *rpe_last, *rpe_first = n->first; - for (struct rt_pending_export *rpe = rpe_first; rpe; rpe = rpe_next(rpe, NULL)) - rpe_last = rpe; + rt_unlock_trie(tab, c->walk_lock); + c->walk_lock = NULL; - FIB_ITERATE_PUT_NEXT(fit, &tab->fib); - RT_UNLOCK(c->table); + mb_free(c->walk_state); + c->walk_state = NULL; - c->req->export_one(c->req, n->n.addr, &rpe); + c->walk_last.type = 0; - RT_LOCK(c->table); - for (struct rt_pending_export *rpe = rpe_first; rpe; rpe = rpe_next(rpe, NULL)) - { - rpe_mark_seen(c, rpe); - if (rpe == rpe_last) - break; - ASSERT_DIE(rpe->seq < rpe_last->seq); - } + } - max_feed--; - goto redo; - } - else - bug("Export request must always provide an export method"); - } - FIB_ITERATE_END; + rt_process_feed(c, &block); + rt_feed_done(&c->h); +} - c->event->hook = rt_export_hook; - rt_send_export_event(c); +static void +rt_feed_equal(void *data) +{ + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; + net *n; - RT_UNLOCK(c->table); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); + ASSERT_DIE(c->h.req->addr_mode == TE_ADDR_EQUAL); - rt_set_export_state(c, TES_READY); + if (n = net_find(tab, c->h.req->addr)) + ASSERT_DIE(rt_prepare_feed(c, n, &block)); + } + + if (n) + rt_process_feed(c, &block); + + rt_feed_done(&c->h); +} + +static void +rt_feed_for(void *data) +{ + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; + net *n; + + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); + ASSERT_DIE(c->h.req->addr_mode == TE_ADDR_FOR); + + if (n = net_route(tab, c->h.req->addr)) + ASSERT_DIE(rt_prepare_feed(c, n, &block)); + } + + if (n) + rt_process_feed(c, &block); + + rt_feed_done(&c->h); +} + + +/* + * Import table + */ + +void channel_reload_export_bulk(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) +{ + struct channel *c = SKIP_BACK(struct channel, reload_req, req); + + for (uint i=0; i<count; i++) + if (feed[i]->sender == c->in_req.hook) + { + /* Strip the later attribute layers */ + rte new = *feed[i]; + while (new.attrs->next) + new.attrs = new.attrs->next; + + /* And reload the route */ + rte_update(c, net, &new, new.src); + } } @@ -3178,7 +4481,7 @@ hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he) rem_node(&he->ln); hc_remove(hc, he); - sl_free(hc->slab, he); + sl_free(he); hc->hash_items--; if (hc->hash_items < hc->hash_min) @@ -3186,7 +4489,56 @@ hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he) } static void -rt_init_hostcache(rtable_private *tab) +hc_notify_dump_req(struct rt_export_request *req) +{ + debug(" Table %s (%p)\n", req->name, req); +} + +static void +hc_notify_log_state_change(struct rt_export_request *req, u8 state) +{ + struct hostcache *hc = SKIP_BACK(struct hostcache, req, req); + rt_trace((rtable *) hc->update.data, D_STATES, "HCU Export state changed to %s", rt_export_state_name(state)); +} + +static void +hc_notify_export_one(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) +{ + struct hostcache *hc = SKIP_BACK(struct hostcache, req, req); + + /* No interest in this update, mark seen only */ + int interested = 1; + RT_LOCKED((rtable *) hc->update.data, tab) + if (ev_active(&hc->update) || !trie_match_net(hc->trie, net)) + { + rpe_mark_seen_all(req->hook, first, NULL); + interested = 0; + } + + if (!interested) + return; + + /* This net may affect some hostentries, check the actual change */ + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; + + RPE_WALK(first, rpe, NULL) + { + rpe_mark_seen(req->hook, rpe); + new_best = rpe->new_best; + } + + /* Yes, something has actually changed. Do the hostcache update. */ + if (o != RTE_VALID_OR_NULL(new_best)) + RT_LOCKED((rtable *) hc->update.data, tab) + if ((atomic_load_explicit(&req->hook->export_state, memory_order_acquire) == TES_READY) + && !ev_active(&hc->update)) + ev_send_loop(tab->loop, &hc->update); +} + + +static void +rt_init_hostcache(struct rtable_private *tab) { struct hostcache *hc = mb_allocz(tab->rp, sizeof(struct hostcache)); init_list(&hc->hostentries); @@ -3195,14 +4547,30 @@ rt_init_hostcache(rtable_private *tab) hc_alloc_table(hc, tab->rp, HC_DEF_ORDER); hc->slab = sl_new(tab->rp, sizeof(struct hostentry)); - hc->lp = lp_new(tab->rp, LP_GOOD_SIZE(1024)); + hc->lp = lp_new(tab->rp); hc->trie = f_new_trie(hc->lp, 0); + hc->update = (event) { + .hook = rt_update_hostcache, + .data = tab, + }; + + hc->req = (struct rt_export_request) { + .name = mb_sprintf(tab->rp, "%s.hcu.notifier", tab->name), + .list = &global_work_list, + .trace_routes = tab->config->debug, + .dump_req = hc_notify_dump_req, + .log_state_change = hc_notify_log_state_change, + .export_one = hc_notify_export_one, + }; + + rt_table_export_start_locked(tab, &hc->req); + tab->hostcache = hc; } static void -rt_free_hostcache(rtable_private *tab) +rt_free_hostcache(struct rtable_private *tab) { struct hostcache *hc = tab->hostcache; @@ -3224,16 +4592,6 @@ rt_free_hostcache(rtable_private *tab) */ } -static void -rt_notify_hostcache(rtable_private *tab, net *net) -{ - if (ev_active(tab->hcu_event)) - return; - - if (trie_match_net(tab->hostcache->trie, net->n.addr)) - ev_send_loop(tab->loop, tab->hcu_event); -} - static int if_local_addr(ip_addr a, struct iface *i) { @@ -3247,14 +4605,14 @@ if_local_addr(ip_addr a, struct iface *i) } u32 -rt_get_igp_metric(rte *rt) +rt_get_igp_metric(const rte *rt) { - eattr *ea = ea_find(rt->attrs->eattrs, EA_GEN_IGP_METRIC); + eattr *ea = ea_find(rt->attrs, "igp_metric"); if (ea) return ea->u.data; - if (rt->attrs->source == RTS_DEVICE) + if (rt_get_source_attr(rt) == RTS_DEVICE) return 0; if (rt->src->owner->class->rte_igp_metric) @@ -3264,15 +4622,14 @@ rt_get_igp_metric(rte *rt) } static int -rt_update_hostentry(rtable_private *tab, struct hostentry *he) +rt_update_hostentry(struct rtable_private *tab, struct hostentry *he) { - rta *old_src = he->src; + ea_list *old_src = he->src; int direct = 0; int pxlen = 0; /* Reset the hostentry */ he->src = NULL; - he->dest = RTD_UNREACHABLE; he->nexthop_linkable = 0; he->igp_metric = 0; @@ -3282,11 +4639,13 @@ rt_update_hostentry(rtable_private *tab, struct hostentry *he) if (n) { struct rte_storage *e = n->routes; - rta *a = e->rte.attrs; - word pref = a->pref; + ea_list *a = e->rte.attrs; + u32 pref = rt_get_preference(&e->rte); for (struct rte_storage *ee = n->routes; ee; ee = ee->next) - if ((ee->rte.attrs->pref >= pref) && ee->rte.attrs->hostentry) + if (rte_is_valid(&ee->rte) && + (rt_get_preference(&ee->rte) >= pref) && + ea_find(ee->rte.attrs, &ea_gen_hostentry)) { /* Recursive route should not depend on another recursive route */ log(L_WARN "Next hop address %I resolvable through recursive route for %N", @@ -3296,9 +4655,12 @@ rt_update_hostentry(rtable_private *tab, struct hostentry *he) pxlen = n->n.addr->pxlen; - if (a->dest == RTD_UNICAST) - { - for (struct nexthop *nh = &(a->nh); nh; nh = nh->next) + eattr *nhea = ea_find(a, &ea_gen_nexthop); + ASSERT_DIE(nhea); + struct nexthop_adata *nhad = (void *) nhea->u.ptr; + + if (NEXTHOP_IS_REACHABLE(nhad)) + NEXTHOP_WALK(nh, nhad) if (ipa_zero(nh->gw)) { if (if_local_addr(he->addr, nh->iface)) @@ -3311,10 +4673,8 @@ rt_update_hostentry(rtable_private *tab, struct hostentry *he) direct++; } - } he->src = rta_clone(a); - he->dest = a->dest; he->nexthop_linkable = !direct; he->igp_metric = rt_get_igp_metric(&e->rte); } @@ -3330,10 +4690,26 @@ done: static void rt_update_hostcache(void *data) { - rtable_private *tab = data; - ASSERT_DIE(birdloop_inside(tab->loop)); + rtable **nhu_pending; + + RT_LOCKED((rtable *) data, tab) + { struct hostcache *hc = tab->hostcache; + + /* Shutdown shortcut */ + if (!hc->req.hook) + RT_RETURN(tab); + + if (rt_cork_check(&hc->update)) + { + rt_trace(tab, D_STATES, "Hostcache update corked"); + RT_RETURN(tab); + } + + /* Destination schedule map */ + nhu_pending = tmp_allocz(sizeof(rtable *) * rtable_max_id); + struct hostentry *he; node *n, *x; @@ -3351,17 +4727,21 @@ rt_update_hostcache(void *data) } if (rt_update_hostentry(tab, he)) - rt_schedule_nhu(he->tab); + nhu_pending[he->tab->id] = he->tab; } + } + + for (uint i=0; i<rtable_max_id; i++) + if (nhu_pending[i]) + RT_LOCKED(nhu_pending[i], dst) + rt_schedule_nhu(dst); } -struct hostentry * -rt_get_hostentry(rtable *t, ip_addr a, ip_addr ll, rtable *dep) +static struct hostentry * +rt_get_hostentry(struct rtable_private *tab, ip_addr a, ip_addr ll, rtable *dep) { struct hostentry *he; - rtable_private *tab = RT_LOCK(t); - if (!tab->hostcache) rt_init_hostcache(tab); @@ -3369,13 +4749,10 @@ rt_get_hostentry(rtable *t, ip_addr a, ip_addr ll, rtable *dep) struct hostcache *hc = tab->hostcache; for (he = hc->hash_table[k >> hc->hash_shift]; he != NULL; he = he->next) if (ipa_equal(he->addr, a) && (he->tab == dep)) - goto done; + return he; he = hc_new_hostentry(hc, tab->rp, a, ipa_zero(ll) ? a : ll, dep, k); rt_update_hostentry(tab, he); - -done: - RT_UNLOCK(t); return he; } diff --git a/nest/rt.h b/nest/rt.h new file mode 100644 index 00000000..6ee2ce9b --- /dev/null +++ b/nest/rt.h @@ -0,0 +1,682 @@ +/* + * BIRD Internet Routing Daemon -- Routing Table + * + * (c) 1998--2000 Martin Mares <mj@ucw.cz> + * (c) 2019--2021 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_NEST_RT_H_ +#define _BIRD_NEST_RT_H_ + +#include "lib/lists.h" +#include "lib/bitmap.h" +#include "lib/resource.h" +#include "lib/net.h" +#include "lib/type.h" +#include "lib/fib.h" +#include "lib/route.h" +#include "lib/event.h" +#include "lib/rcu.h" +#include "lib/io-loop.h" +#include "lib/settle.h" + +#include <stdatomic.h> + +struct ea_list; +struct protocol; +struct proto; +struct channel; +struct rte_src; +struct hostcache; +struct symbol; +struct timer; +struct filter; +struct f_trie; +struct f_trie_walk_state; +struct cli; + +struct rt_cork_threshold { + u64 low, high; +}; + +/* + * Master Routing Tables. Generally speaking, each of them contains a FIB + * with each entry pointing to a list of route entries representing routes + * to given network (with the selected one at the head). + * + * Each of the RTE's contains variable data (the preference and protocol-dependent + * metrics) and a pointer to a route attribute block common for many routes). + * + * It's guaranteed that there is at most one RTE for every (prefix,proto) pair. + */ + +struct rtable_config { + node n; + char *name; + union rtable *table; + struct proto_config *krt_attached; /* Kernel syncer attached to this table */ + uint addr_type; /* Type of address data stored in table (NET_*) */ + uint gc_threshold; /* Maximum number of operations before GC is run */ + uint gc_period; /* Approximate time between two consecutive GC runs */ + byte sorted; /* Routes of network are sorted according to rte_better() */ + byte trie_used; /* Rtable has attached trie */ + byte debug; /* Whether to log */ + struct rt_cork_threshold cork_threshold; /* Cork threshold values */ + struct settle_config export_settle; /* Export announcement settler */ + struct settle_config export_rr_settle;/* Export announcement settler config valid when any + route refresh is running */ +}; + +struct rt_export_hook; +struct rt_export_request; +struct rt_exporter; + +struct rt_exporter_class { + void (*start)(struct rt_exporter *, struct rt_export_request *); + void (*stop)(struct rt_export_hook *); + void (*done)(void *_rt_export_hook); +}; + +struct rt_exporter { + const struct rt_exporter_class *class; + pool *rp; + list hooks; /* Registered route export hooks */ + uint addr_type; /* Type of address data exported (NET_*) */ +}; + +struct rt_table_exporter { + struct rt_exporter e; + list pending; /* List of packed struct rt_pending_export */ + + struct rt_pending_export *first; /* First export to announce */ + u64 next_seq; /* The next export will have this ID */ +}; + +extern uint rtable_max_id; + +DEFINE_DOMAIN(rtable); + +/* The public part of rtable structure */ +#define RTABLE_PUBLIC \ + resource r; \ + node n; /* Node in list of all tables */ \ + char *name; /* Name of this table */ \ + uint addr_type; /* Type of address data stored in table (NET_*) */ \ + uint id; /* Integer table ID for fast lookup */ \ + DOMAIN(rtable) lock; /* Lock to take to access the private parts */ \ + struct rtable_config *config; /* Configuration of this table */ \ + struct birdloop *loop; /* Service thread */ \ + +/* The complete rtable structure */ +struct rtable_private { + /* Once more the public part */ + RTABLE_PUBLIC; + + /* Here the private items not to be accessed without locking */ + pool *rp; /* Resource pool to allocate everything from, including itself */ + struct slab *rte_slab; /* Slab to allocate route objects */ + struct fib fib; + struct f_trie *trie; /* Trie of prefixes defined in fib */ + int use_count; /* Number of protocols using this table */ + u32 rt_count; /* Number of routes in the table */ + + list imports; /* Registered route importers */ + struct rt_table_exporter exporter; /* Exporter API structure */ + + struct hmap id_map; + struct hostcache *hostcache; + struct config *deleted; /* Table doesn't exist in current configuration, + * delete as soon as use_count becomes 0 and remove + * obstacle from this routing table. + */ + struct event *nhu_uncork_event; /* Helper event to schedule NHU on uncork */ + struct settle export_settle; /* Export batching settle timer */ + struct timer *prune_timer; /* Timer for periodic pruning / GC */ + struct birdloop_flag_handler fh; /* Handler for simple events */ + btime last_rt_change; /* Last time when route changed */ + btime gc_time; /* Time of last GC */ + uint gc_counter; /* Number of operations since last GC */ + uint rr_counter; /* Number of currently running route refreshes, + in fact sum of (stale_set - stale_pruned) over all importers + + one for each TIS_FLUSHING importer */ + byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ + byte prune_trie; /* Prune prefix trie during next table prune */ + byte nhu_state; /* Next Hop Update state */ + byte nhu_corked; /* Next Hop Update is corked with this state */ + byte export_used; /* Pending Export pruning is scheduled */ + byte cork_active; /* Cork has been activated */ + struct rt_cork_threshold cork_threshold; /* Threshold for table cork */ + struct fib_iterator prune_fit; /* Rtable prune FIB iterator */ + struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ + struct f_trie *trie_new; /* New prefix trie defined during pruning */ + struct f_trie *trie_old; /* Old prefix trie waiting to be freed */ + u32 trie_lock_count; /* Prefix trie locked by walks */ + u32 trie_old_lock_count; /* Old prefix trie locked by walks */ + struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */ + + struct f_trie *flowspec_trie; /* Trie for evaluation of flowspec notifications */ +}; + +/* The final union private-public rtable structure */ +typedef union rtable { + struct { + RTABLE_PUBLIC; + }; + struct rtable_private priv; +} rtable; + +#define RT_IS_LOCKED(tab) DOMAIN_IS_LOCKED(rtable, (tab)->lock) + +#define RT_LOCK(tab) ({ LOCK_DOMAIN(rtable, (tab)->lock); &(tab)->priv; }) +#define RT_UNLOCK(tab) UNLOCK_DOMAIN(rtable, (tab)->lock) +#define RT_PRIV(tab) ({ ASSERT_DIE(RT_IS_LOCKED((tab))); &(tab)->priv; }) +#define RT_PUB(tab) SKIP_BACK(rtable, priv, tab) + +#define RT_LOCKED(tpub, tpriv) for (struct rtable_private *tpriv = RT_LOCK(tpub); tpriv; RT_UNLOCK(tpriv), (tpriv = NULL)) +#define RT_RETURN(tpriv, ...) do { RT_UNLOCK(tpriv); return __VA_ARGS__; } while (0) + +#define RT_PRIV_SAME(tpriv, tpub) (&(tpub)->priv == (tpriv)) + +/* Flags for birdloop_flag() */ +#define RTF_CLEANUP 1 +#define RTF_NHU 2 +#define RTF_EXPORT 4 + +extern struct rt_cork { + _Atomic uint active; + event_list queue; + event run; +} rt_cork; + +static inline void rt_cork_acquire(void) +{ + atomic_fetch_add_explicit(&rt_cork.active, 1, memory_order_acq_rel); +} + +static inline void rt_cork_release(void) +{ + if (atomic_fetch_sub_explicit(&rt_cork.active, 1, memory_order_acq_rel) == 1) + { + synchronize_rcu(); + ev_send(&global_work_list, &rt_cork.run); + } +} + +static inline int rt_cork_check(event *e) +{ + rcu_read_lock(); + + int corked = (atomic_load_explicit(&rt_cork.active, memory_order_acquire) > 0); + if (corked) + ev_send(&rt_cork.queue, e); + + rcu_read_unlock(); + + return corked; +} + + +typedef struct network { + struct rte_storage *routes; /* Available routes for this network */ + struct rt_pending_export *first, *last; + struct fib_node n; /* FIB flags reserved for kernel syncer */ +} net; + +struct rte_storage { + struct rte_storage *next; /* Next in chain */ + struct rte rte; /* Route data */ +}; + +#define RTE_COPY(r) ((r) ? (r)->rte : (rte) {}) +#define RTE_COPY_VALID(r) (((r) && (rte_is_valid(&(r)->rte))) ? (r)->rte : (rte) {}) +#define RTE_OR_NULL(r) ((r) ? &((r)->rte) : NULL) +#define RTE_VALID_OR_NULL(r) (((r) && (rte_is_valid(&(r)->rte))) ? &((r)->rte) : NULL) + +/* Table-channel connections */ + +struct rt_import_request { + struct rt_import_hook *hook; /* The table part of importer */ + char *name; + u8 trace_routes; + + event_list *list; /* Where to schedule announce events */ + + void (*dump_req)(struct rt_import_request *req); + void (*log_state_change)(struct rt_import_request *req, u8 state); + /* Preimport is called when the @new route is just-to-be inserted, replacing @old. + * Return a route (may be different or modified in-place) to continue or NULL to withdraw. */ + int (*preimport)(struct rt_import_request *req, struct rte *new, struct rte *old); +}; + +struct rt_import_hook { + node n; + rtable *table; /* The connected table */ + struct rt_import_request *req; /* The requestor */ + + struct rt_import_stats { + /* Import - from protocol to core */ + u32 pref; /* Number of routes selected as best in the (adjacent) routing table */ + u32 updates_ignored; /* Number of route updates rejected as already in route table */ + u32 updates_accepted; /* Number of route updates accepted and imported */ + u32 withdraws_ignored; /* Number of route withdraws rejected as already not in route table */ + u32 withdraws_accepted; /* Number of route withdraws accepted and processed */ + } stats; + + u64 flush_seq; /* Table export seq when the channel announced flushing */ + btime last_state_change; /* Time of last state transition */ + + u8 import_state; /* IS_* */ + u8 stale_set; /* Set this stale_cycle to imported routes */ + u8 stale_valid; /* Routes with this stale_cycle and bigger are considered valid */ + u8 stale_pruned; /* Last prune finished when this value was set at stale_valid */ + u8 stale_pruning; /* Last prune started when this value was set at stale_valid */ + + void (*stopped)(struct rt_import_request *); /* Stored callback when import is stopped */ + event announce_event; /* This event announces table updates */ +}; + +struct rt_pending_export { + struct rt_pending_export * _Atomic next; /* Next export for the same destination */ + struct rte_storage *new, *new_best, *old, *old_best; + u64 seq; /* Sequential ID (table-local) of the pending export */ +}; + +struct rt_export_request { + struct rt_export_hook *hook; /* Table part of the export */ + char *name; + const net_addr *addr; /* Network prefilter address */ + u8 trace_routes; + u8 addr_mode; /* Network prefilter mode (TE_ADDR_*) */ + + event_list *list; /* Where to schedule export events */ + + /* There are two methods of export. You can either request feeding every single change + * or feeding the whole route feed. In case of regular export, &export_one is preferred. + * Anyway, when feeding, &export_bulk is preferred, falling back to &export_one. + * Thus, for RA_OPTIMAL, &export_one is only set, + * for RA_MERGED and RA_ACCEPTED, &export_bulk is only set + * and for RA_ANY, both are set to accomodate for feeding all routes but receiving single changes + */ + void (*export_one)(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); + void (*export_bulk)(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe, rte **feed, uint count); + + void (*dump_req)(struct rt_export_request *req); + void (*log_state_change)(struct rt_export_request *req, u8); +}; + +struct rt_export_hook { + node n; + struct rt_exporter *table; /* The connected table */ + + pool *pool; + + struct rt_export_request *req; /* The requestor */ + + struct rt_export_stats { + /* Export - from core to protocol */ + u32 updates_received; /* Number of route updates received */ + u32 withdraws_received; /* Number of route withdraws received */ + } stats; + + btime last_state_change; /* Time of last state transition */ + + _Atomic u8 export_state; /* Route export state (TES_*, see below) */ + struct event event; /* Event running all the export operations */ + + struct bmap seq_map; /* Keep track which exports were already procesed */ + + void (*stopped)(struct rt_export_request *); /* Stored callback when export is stopped */ +}; + +struct rt_table_export_hook { + union { + struct rt_export_hook h; + struct { /* Overriding the parent structure beginning */ + node _n; + struct rt_table_exporter *table; + }; + }; + + union { + struct fib_iterator feed_fit; /* Routing table iterator used during feeding */ + struct { + struct f_trie_walk_state *walk_state; /* Iterator over networks in trie */ + struct f_trie *walk_lock; /* Locked trie for walking */ + union { /* Last net visited but not processed */ + net_addr walk_last; + net_addr_ip4 walk_last_ip4; + net_addr_ip6 walk_last_ip6; + }; + }; + }; + + struct rt_pending_export *_Atomic last_export;/* Last export processed */ + struct rt_pending_export *rpe_next; /* Next pending export to process */ + + u8 refeed_pending; /* Refeeding and another refeed is scheduled */ + u8 feed_type; /* Which feeding method is used (TFT_*, see below) */ + +}; + +#define TIS_DOWN 0 +#define TIS_UP 1 +#define TIS_STOP 2 +#define TIS_FLUSHING 3 +#define TIS_WAITING 4 +#define TIS_CLEARED 5 +#define TIS_MAX 6 + +#define TES_DOWN 0 +#define TES_HUNGRY 1 +#define TES_FEEDING 2 +#define TES_READY 3 +#define TES_STOP 4 +#define TES_MAX 5 + +/* Value of addr_mode */ +#define TE_ADDR_NONE 0 /* No address matching */ +#define TE_ADDR_EQUAL 1 /* Exact query - show route <addr> */ +#define TE_ADDR_FOR 2 /* Longest prefix match - show route for <addr> */ +#define TE_ADDR_IN 3 /* Interval query - show route in <addr> */ + + +#define TFT_FIB 1 +#define TFT_TRIE 2 +#define TFT_HASH 3 + +void rt_request_import(rtable *tab, struct rt_import_request *req); +void rt_request_export(rtable *tab, struct rt_export_request *req); +void rt_request_export_other(struct rt_exporter *tab, struct rt_export_request *req); + +void rt_export_once(struct rt_exporter *tab, struct rt_export_request *req); + +void rt_stop_import(struct rt_import_request *, void (*stopped)(struct rt_import_request *)); +void rt_stop_export(struct rt_export_request *, void (*stopped)(struct rt_export_request *)); + +const char *rt_import_state_name(u8 state); +const char *rt_export_state_name(u8 state); + +static inline u8 rt_import_get_state(struct rt_import_hook *ih) { return ih ? ih->import_state : TIS_DOWN; } +static inline u8 rt_export_get_state(struct rt_export_hook *eh) { return eh ? eh->export_state : TES_DOWN; } + +void rt_set_export_state(struct rt_export_hook *hook, u8 state); + +void rte_import(struct rt_import_request *req, const net_addr *net, rte *new, struct rte_src *src); + +/* + * For table export processing + */ + +/* Get next rpe. If src is given, it must match. */ +struct rt_pending_export *rpe_next(struct rt_pending_export *rpe, struct rte_src *src); + +/* Walk all rpe's */ +#define RPE_WALK(first, it, src) \ + for (struct rt_pending_export *it = (first); it; it = rpe_next(it, (src))) + +/* Mark the pending export processed */ +void rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); + +#define rpe_mark_seen_all(hook, first, src) \ + RPE_WALK((first), _rpe, (src)) rpe_mark_seen((hook), _rpe) + +/* Get pending export seen status */ +int rpe_get_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); + +/* + * For rt_export_hook and rt_exporter inheritance + */ + +void rt_init_export(struct rt_exporter *re, struct rt_export_hook *hook); +struct rt_export_hook *rt_alloc_export(struct rt_exporter *re, uint size); +void rt_stop_export_common(struct rt_export_hook *hook); +void rt_export_stopped(struct rt_export_hook *hook); +void rt_exporter_init(struct rt_exporter *re); + +/* Types of route announcement, also used as flags */ +#define RA_UNDEF 0 /* Undefined RA type */ +#define RA_OPTIMAL 1 /* Announcement of optimal route change */ +#define RA_ACCEPTED 2 /* Announcement of first accepted route */ +#define RA_ANY 3 /* Announcement of any route change */ +#define RA_MERGED 4 /* Announcement of optimal route merged with next ones */ + +/* Return value of preexport() callback */ +#define RIC_ACCEPT 1 /* Accepted by protocol */ +#define RIC_PROCESS 0 /* Process it through import filter */ +#define RIC_REJECT -1 /* Rejected by protocol */ +#define RIC_DROP -2 /* Silently dropped by protocol */ + +/* + * Next hop update data structures + */ + +#define NHU_CLEAN 0 +#define NHU_SCHEDULED 1 +#define NHU_RUNNING 2 +#define NHU_DIRTY 3 + +struct hostentry { + node ln; + ip_addr addr; /* IP address of host, part of key */ + ip_addr link; /* (link-local) IP address of host, used as gw + if host is directly attached */ + rtable *tab; /* Dependent table, part of key */ + struct hostentry *next; /* Next in hash chain */ + unsigned hash_key; /* Hash key */ + unsigned uc; /* Use count */ + ea_list *src; /* Source attributes */ + byte nexthop_linkable; /* Nexthop list is completely non-device */ + u32 igp_metric; /* Chosen route IGP metric */ +}; + +struct hostcache { + slab *slab; /* Slab holding all hostentries */ + struct hostentry **hash_table; /* Hash table for hostentries */ + unsigned hash_order, hash_shift; + unsigned hash_max, hash_min; + unsigned hash_items; + linpool *lp; /* Linpool for trie */ + struct f_trie *trie; /* Trie of prefixes that might affect hostentries */ + list hostentries; /* List of all hostentries */ + event update; + struct rt_export_request req; /* Notifier */ +}; + +struct rt_flowspec_link { + rtable *src; + rtable *dst; + u32 uc; + struct rt_export_request req; +}; + +#define rte_update channel_rte_import +/** + * rte_update - enter a new update to a routing table + * @c: channel doing the update + * @net: network address + * @rte: a &rte representing the new route + * @src: old route source identifier + * + * This function imports a new route to the appropriate table (via the channel). + * Table keys are @net (obligatory) and @rte->attrs->src. + * Both the @net and @rte pointers can be local. + * + * The route attributes (@rte->attrs) are obligatory. They can be also allocated + * locally. Anyway, if you use an already-cached attribute object, you shall + * call rta_clone() on that object yourself. (This semantics may change in future.) + * + * If the route attributes are local, you may set @rte->attrs->src to NULL, then + * the protocol's default route source will be supplied. + * + * When rte_update() gets a route, it automatically validates it. This includes + * checking for validity of the given network and next hop addresses and also + * checking for host-scope or link-scope routes. Then the import filters are + * processed and if accepted, the route is passed to route table recalculation. + * + * The accepted routes are then inserted into the table, replacing the old route + * for the same @net identified by @src. Then the route is announced + * to all the channels connected to the table using the standard export mechanism. + * Setting @rte to NULL makes this a withdraw, otherwise @rte->src must be the same + * as @src. + * + * All memory used for temporary allocations is taken from a special linpool + * @rte_update_pool and freed when rte_update() finishes. + */ +void rte_update(struct channel *c, const net_addr *net, struct rte *rte, struct rte_src *src); + +extern list routing_tables; +struct config; + +void rt_init(void); +void rt_preconfig(struct config *); +void rt_postconfig(struct config *); +void rt_commit(struct config *new, struct config *old); +void rt_lock_table_priv(struct rtable_private *, const char *file, uint line); +void rt_unlock_table_priv(struct rtable_private *, const char *file, uint line); +static inline void rt_lock_table_pub(rtable *t, const char *file, uint line) +{ RT_LOCKED(t, tt) rt_lock_table_priv(tt, file, line); } +static inline void rt_unlock_table_pub(rtable *t, const char *file, uint line) +{ RT_LOCKED(t, tt) rt_unlock_table_priv(tt, file, line); } + +#define rt_lock_table(t) _Generic((t), rtable *: rt_lock_table_pub, \ + struct rtable_private *: rt_lock_table_priv)((t), __FILE__, __LINE__) +#define rt_unlock_table(t) _Generic((t), rtable *: rt_unlock_table_pub, \ + struct rtable_private *: rt_unlock_table_priv)((t), __FILE__, __LINE__) + +struct f_trie * rt_lock_trie(struct rtable_private *tab); +void rt_unlock_trie(struct rtable_private *tab, struct f_trie *trie); +void rt_flowspec_link(rtable *src, rtable *dst); +void rt_flowspec_unlink(rtable *src, rtable *dst); +rtable *rt_setup(pool *, struct rtable_config *); + +static inline net *net_find(struct rtable_private *tab, const net_addr *addr) { return (net *) fib_find(&tab->fib, addr); } +static inline net *net_find_valid(struct rtable_private *tab, const net_addr *addr) +{ net *n = net_find(tab, addr); return (n && n->routes && rte_is_valid(&n->routes->rte)) ? n : NULL; } +static inline net *net_get(struct rtable_private *tab, const net_addr *addr) { return (net *) fib_get(&tab->fib, addr); } +net *net_route(struct rtable_private *tab, const net_addr *n); +int rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter); +rte *rt_export_merged(struct channel *c, rte ** feed, uint count, linpool *pool, int silent); +void rt_refresh_begin(struct rt_import_request *); +void rt_refresh_end(struct rt_import_request *); +void rt_modify_stale(rtable *t, struct rt_import_request *); +void rt_schedule_prune(struct rtable_private *t); +void rte_dump(struct rte_storage *); +void rt_dump(rtable *); +void rt_dump_all(void); +void rt_dump_hooks(rtable *); +void rt_dump_hooks_all(void); +int rt_reload_channel(struct channel *c); +void rt_reload_channel_abort(struct channel *c); +void rt_refeed_channel(struct channel *c); +void rt_prune_sync(rtable *t, int all); +struct rtable_config *rt_new_table(struct symbol *s, uint addr_type); +void rt_new_default_table(struct symbol *s); +struct rtable_config *rt_get_default_table(struct config *cf, uint addr_type); + +static inline int rt_is_ip(rtable *tab) +{ return (tab->addr_type == NET_IP4) || (tab->addr_type == NET_IP6); } + +static inline int rt_is_vpn(rtable *tab) +{ return (tab->addr_type == NET_VPN4) || (tab->addr_type == NET_VPN6); } + +static inline int rt_is_roa(rtable *tab) +{ return (tab->addr_type == NET_ROA4) || (tab->addr_type == NET_ROA6); } + +static inline int rt_is_flow(rtable *tab) +{ return (tab->addr_type == NET_FLOW4) || (tab->addr_type == NET_FLOW6); } + + +/* Default limit for ECMP next hops, defined in sysdep code */ +extern const int rt_default_ecmp; + +struct rt_show_data_rtable { + node n; + const char *name; + struct rt_exporter *table; + struct channel *export_channel; + struct channel *prefilter; + struct krt_proto *kernel; +}; + +struct rt_show_data { + struct cli *cli; /* Pointer back to the CLI */ + net_addr *addr; + list tables; + struct rt_show_data_rtable *tab; /* Iterator over table list */ + struct rt_show_data_rtable *last_table; /* Last table in output */ + struct rt_export_request req; /* Export request in use */ + int verbose, tables_defined_by; + const struct filter *filter; + struct proto *show_protocol; + struct proto *export_protocol; + struct channel *export_channel; + struct config *running_on_config; + struct rt_export_hook *kernel_export_hook; + int export_mode, addr_mode, primary_only, filtered, stats; + + int net_counter, rt_counter, show_counter, table_counter; + int net_counter_last, rt_counter_last, show_counter_last; + int show_counter_last_flush; +}; + +void rt_show(struct rt_show_data *); +struct rt_show_data_rtable * rt_show_add_exporter(struct rt_show_data *d, struct rt_exporter *t, const char *name); +struct rt_show_data_rtable * rt_show_add_table(struct rt_show_data *d, rtable *t); + +/* Value of table definition mode in struct rt_show_data */ +#define RSD_TDB_DEFAULT 0 /* no table specified */ +#define RSD_TDB_INDIRECT 0 /* show route ... protocol P ... */ +#define RSD_TDB_ALL RSD_TDB_SET /* show route ... table all ... */ +#define RSD_TDB_DIRECT RSD_TDB_SET | RSD_TDB_NMN /* show route ... table X table Y ... */ + +#define RSD_TDB_SET 0x1 /* internal: show empty tables */ +#define RSD_TDB_NMN 0x2 /* internal: need matching net */ + +/* Value of export_mode in struct rt_show_data */ +#define RSEM_NONE 0 /* Export mode not used */ +#define RSEM_PREEXPORT 1 /* Routes ready for export, before filtering */ +#define RSEM_EXPORT 2 /* Routes accepted by export filter */ +#define RSEM_NOEXPORT 3 /* Routes rejected by export filter */ +#define RSEM_EXPORTED 4 /* Routes marked in export map */ + +/* Host entry: Resolve hook for recursive nexthops */ +extern struct ea_class ea_gen_hostentry; +struct hostentry_adata { + adata ad; + struct hostentry *he; + u32 labels[0]; +}; + +void +ea_set_hostentry(ea_list **to, rtable *dep, rtable *tab, ip_addr gw, ip_addr ll, u32 lnum, u32 labels[lnum]); + +void ea_show_hostentry(const struct adata *ad, byte *buf, uint size); +void ea_show_nexthop_list(struct cli *c, struct nexthop_adata *nhad); + +/* + * Default protocol preferences + */ + +#define DEF_PREF_DIRECT 240 /* Directly connected */ +#define DEF_PREF_STATIC 200 /* Static route */ +#define DEF_PREF_OSPF 150 /* OSPF intra-area, inter-area and type 1 external routes */ +#define DEF_PREF_BABEL 130 /* Babel */ +#define DEF_PREF_RIP 120 /* RIP */ +#define DEF_PREF_BGP 100 /* BGP */ +#define DEF_PREF_RPKI 100 /* RPKI */ +#define DEF_PREF_INHERITED 10 /* Routes inherited from other routing daemons */ +#define DEF_PREF_UNKNOWN 0 /* Routes with no preference set */ + +/* + * Route Origin Authorization + */ + +#define ROA_UNKNOWN 0 +#define ROA_VALID 1 +#define ROA_INVALID 2 + +int net_roa_check(rtable *tab, const net_addr *n, u32 asn); + +#endif diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 40e85a16..4d024e3a 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -37,6 +37,7 @@ #include <stdlib.h> #include "babel.h" +#include "lib/macro.h" #define LOG_PKT_AUTH(msg, args...) \ log_rl(&p->log_pkt_tbf, L_AUTH "%s: " msg, p->p.name, args) @@ -58,12 +59,14 @@ static void babel_update_cost(struct babel_neighbor *n); static inline void babel_kick_timer(struct babel_proto *p); static inline void babel_iface_kick_timer(struct babel_iface *ifa); +static struct ea_class ea_babel_metric, ea_babel_router_id, ea_babel_seqno; + /* * Functions to maintain data structures */ static void -babel_init_entry(void *E) +babel_init_entry(struct fib *f UNUSED, void *E) { struct babel_entry *e = E; @@ -119,7 +122,7 @@ babel_get_source(struct babel_proto *p, struct babel_entry *e, u64 router_id) } static void -babel_expire_sources(struct babel_proto *p, struct babel_entry *e) +babel_expire_sources(struct babel_proto *p UNUSED, struct babel_entry *e) { struct babel_source *n, *nx; btime now_ = current_time(); @@ -129,7 +132,7 @@ babel_expire_sources(struct babel_proto *p, struct babel_entry *e) if (n->expires && n->expires <= now_) { rem_node(NODE n); - sl_free(p->source_slab, n); + sl_free(n); } } } @@ -174,7 +177,7 @@ babel_retract_route(struct babel_proto *p, struct babel_route *r) } static void -babel_flush_route(struct babel_proto *p, struct babel_route *r) +babel_flush_route(struct babel_proto *p UNUSED, struct babel_route *r) { DBG("Babel: Flush route %N router_id %lR neigh %I\n", r->e->n.addr, r->router_id, r->neigh->addr); @@ -185,7 +188,7 @@ babel_flush_route(struct babel_proto *p, struct babel_route *r) if (r->e->selected == r) r->e->selected = NULL; - sl_free(p->route_slab, r); + sl_free(r); } static void @@ -312,7 +315,9 @@ babel_add_seqno_request(struct babel_proto *p, struct babel_entry *e, /* Found older */ rem_node(NODE sr); - rem_node(&sr->nbr_node); + + if (sr->nbr) + rem_node(&sr->nbr_node); goto found; } @@ -336,13 +341,13 @@ found: } static void -babel_remove_seqno_request(struct babel_proto *p, struct babel_seqno_request *sr) +babel_remove_seqno_request(struct babel_proto *p UNUSED, struct babel_seqno_request *sr) { if (sr->nbr) rem_node(&sr->nbr_node); rem_node(NODE sr); - sl_free(p->seqno_slab, sr); + sl_free(sr); } static int @@ -452,10 +457,7 @@ babel_flush_neighbor(struct babel_proto *p, struct babel_neighbor *nbr) struct babel_seqno_request *sr; WALK_LIST_FIRST2(sr, nbr_node, nbr->requests) - { - sr->nbr = NULL; - rem_node(&sr->nbr_node); - } + babel_remove_seqno_request(p, sr); nbr->ifa = NULL; rem_node(NODE nbr); @@ -640,37 +642,14 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) if (r) { - rta a0 = { - .source = RTS_BABEL, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, - .pref = c->preference, - .from = r->neigh->addr, - .nh.gw = r->next_hop, - .nh.iface = r->neigh->ifa->iface, - .eattrs = alloca(sizeof(ea_list) + 3*sizeof(eattr)), - }; - - *a0.eattrs = (ea_list) { .count = 3 }; - a0.eattrs->attrs[0] = (eattr) { - .id = EA_BABEL_METRIC, - .type = EAF_TYPE_INT, - .u.data = r->metric, - }; - - struct adata *ad = alloca(sizeof(struct adata) + sizeof(u64)); - ad->length = sizeof(u64); - memcpy(ad->data, &(r->router_id), sizeof(u64)); - a0.eattrs->attrs[1] = (eattr) { - .id = EA_BABEL_ROUTER_ID, - .type = EAF_TYPE_OPAQUE, - .u.ptr = ad, - }; - - a0.eattrs->attrs[2] = (eattr) { - .id = EA_BABEL_SEQNO, - .type = EAF_TYPE_INT, - .u.data = r->seqno, + struct nexthop_adata nhad = { + .nh = { + .gw = r->next_hop, + .iface = r->neigh->ifa->iface, + }, + .ad = { + .length = sizeof nhad - sizeof nhad.ad, + }, }; /* @@ -679,10 +658,26 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) * have routing work. */ if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) - a0.nh.flags = RNF_ONLINK; + nhad.nh.flags = RNF_ONLINK; + + struct { + ea_list l; + eattr a[7]; + } eattrs = { + .l.count = ARRAY_SIZE(eattrs.a), + .a = { + EA_LITERAL_EMBEDDED(&ea_gen_preference, 0, c->preference), + EA_LITERAL_STORE_ADATA(&ea_gen_from, 0, &r->neigh->addr, sizeof(r->neigh->addr)), + EA_LITERAL_EMBEDDED(&ea_gen_source, 0, RTS_BABEL), + EA_LITERAL_STORE_ADATA(&ea_gen_nexthop, 0, nhad.ad.data, nhad.ad.length), + EA_LITERAL_EMBEDDED(&ea_babel_metric, 0, r->metric), + EA_LITERAL_STORE_ADATA(&ea_babel_router_id, 0, &r->router_id, sizeof(r->router_id)), + EA_LITERAL_EMBEDDED(&ea_babel_seqno, 0, r->seqno), + } + }; rte e0 = { - .attrs = &a0, + .attrs = &eattrs.l, .src = p->p.main_source, }; @@ -692,15 +687,14 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) else if (e->valid && (e->router_id != p->router_id)) { /* Unreachable */ - rta a0 = { - .source = RTS_BABEL, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNREACHABLE, - .pref = 1, - }; + ea_list *ea = NULL; + + ea_set_attr_u32(&ea, &ea_gen_preference, 0, 1); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_BABEL); + ea_set_dest(&ea, 0, RTD_UNREACHABLE); rte e0 = { - .attrs = &a0, + .attrs = ea, .src = p->p.main_source, }; @@ -862,14 +856,14 @@ babel_send_ihus(struct babel_iface *ifa) } static void -babel_send_hello(struct babel_iface *ifa) +babel_send_hello(struct babel_iface *ifa, uint interval) { struct babel_proto *p = ifa->proto; union babel_msg msg = {}; msg.type = BABEL_TLV_HELLO; msg.hello.seqno = ifa->hello_seqno++; - msg.hello.interval = ifa->cf->hello_interval; + msg.hello.interval = interval ?: ifa->cf->hello_interval; TRACE(D_PACKETS, "Sending hello on %s with seqno %d interval %t", ifa->ifname, msg.hello.seqno, (btime) msg.hello.interval); @@ -1577,7 +1571,7 @@ babel_iface_timer(timer *t) if (now_ >= ifa->next_hello) { - babel_send_hello(ifa); + babel_send_hello(ifa, 0); ifa->next_hello += hello_period * (1 + (now_ - ifa->next_hello) / hello_period); } @@ -1624,7 +1618,7 @@ babel_iface_start(struct babel_iface *ifa) tm_start(ifa->timer, 100 MS); ifa->up = 1; - babel_send_hello(ifa); + babel_send_hello(ifa, 0); babel_send_wildcard_retraction(ifa); babel_send_wildcard_request(ifa); babel_send_update(ifa, 0); /* Full update */ @@ -1919,7 +1913,7 @@ babel_reconfigure_ifaces(struct babel_proto *p, struct babel_config *cf) struct babel_iface *ifa = babel_find_iface(p, iface); struct babel_iface_config *ic = (void *) iface_patt_find(&cf->iface_list, iface, NULL); - if (ic && iface_is_valid(p, iface)) + if (ic && !iface_is_valid(p, iface)) ic = NULL; if (ifa && ic) @@ -2031,38 +2025,41 @@ static void babel_get_route_info(rte *rte, byte *buf) { u64 rid = 0; - eattr *e = ea_find(rte->attrs->eattrs, EA_BABEL_ROUTER_ID); + eattr *e = ea_find(rte->attrs, &ea_babel_router_id); if (e) memcpy(&rid, e->u.ptr->data, sizeof(u64)); - buf += bsprintf(buf, " (%d/%d) [%lR]", rte->attrs->pref, - ea_get_int(rte->attrs->eattrs, EA_BABEL_METRIC, BABEL_INFINITY), rid); + buf += bsprintf(buf, " (%d/%d) [%lR]", + rt_get_preference(rte), + ea_get_int(rte->attrs, &ea_babel_metric, BABEL_INFINITY), rid); } -static int -babel_get_attr(const eattr *a, byte *buf, int buflen UNUSED) +static void +babel_router_id_format(const eattr *a, byte *buf, uint len) { - switch (a->id) - { - case EA_BABEL_SEQNO: - return GA_FULL; + u64 rid = 0; + memcpy(&rid, a->u.ptr->data, sizeof(u64)); + bsnprintf(buf, len, "%lR", rid); +} - case EA_BABEL_METRIC: - bsprintf(buf, "metric: %d", a->u.data); - return GA_FULL; +static struct ea_class ea_babel_metric = { + .name = "babel_metric", + .type = T_INT, +}; - case EA_BABEL_ROUTER_ID: - { - u64 rid = 0; - memcpy(&rid, a->u.ptr->data, sizeof(u64)); - bsprintf(buf, "router_id: %lR", rid); - return GA_FULL; - } +static struct ea_class ea_babel_router_id = { + .name = "babel_router_id", + .type = T_OPAQUE, + .readonly = 1, + .format = babel_router_id_format, +}; + +static struct ea_class ea_babel_seqno = { + .name = "babel_seqno", + .type = T_INT, + .readonly = 1, +}; - default: - return GA_UNKNOWN; - } -} void babel_show_interfaces(struct proto *P, const char *iff) @@ -2260,11 +2257,15 @@ babel_kick_timer(struct babel_proto *p) static int -babel_preexport(struct channel *c, struct rte *new) +babel_preexport(struct channel *C, struct rte *new) { - struct rta *a = new->attrs; + if (new->src->owner != &C->proto->sources) + return 0; + /* Reject our own unreachable routes */ - if ((a->dest == RTD_UNREACHABLE) && (new->src->owner == &c->proto->sources)) + eattr *ea = ea_find(new->attrs, &ea_gen_nexthop); + struct nexthop_adata *nhad = (void *) ea->u.ptr; + if (!NEXTHOP_IS_REACHABLE(nhad)) return -1; return 0; @@ -2285,13 +2286,13 @@ babel_rt_notify(struct proto *P, struct channel *c UNUSED, const net_addr *net, { /* Update */ uint rt_seqno; - uint rt_metric = ea_get_int(new->attrs->eattrs, EA_BABEL_METRIC, 0); + uint rt_metric = ea_get_int(new->attrs, &ea_babel_metric, 0); u64 rt_router_id = 0; if (new->src->owner == &P->sources) { - rt_seqno = ea_find(new->attrs->eattrs, EA_BABEL_SEQNO)->u.data; - eattr *e = ea_find(new->attrs->eattrs, EA_BABEL_ROUTER_ID); + rt_seqno = ea_get_int(new->attrs, &ea_babel_seqno, 0); + eattr *e = ea_find(new->attrs, &ea_babel_router_id); if (e) memcpy(&rt_router_id, e->u.ptr->data, sizeof(u64)); } @@ -2342,16 +2343,16 @@ babel_rt_notify(struct proto *P, struct channel *c UNUSED, const net_addr *net, static int babel_rte_better(struct rte *new, struct rte *old) { - uint new_metric = ea_find(new->attrs->eattrs, EA_BABEL_SEQNO)->u.data; - uint old_metric = ea_find(old->attrs->eattrs, EA_BABEL_SEQNO)->u.data; + uint new_metric = ea_get_int(new->attrs, &ea_babel_metric, BABEL_INFINITY); + uint old_metric = ea_get_int(old->attrs, &ea_babel_metric, BABEL_INFINITY); return new_metric < old_metric; } static u32 -babel_rte_igp_metric(struct rte *rt) +babel_rte_igp_metric(const rte *rt) { - return ea_get_int(rt->attrs->eattrs, EA_BABEL_METRIC, BABEL_INFINITY); + return ea_get_int(rt->attrs, &ea_babel_metric, BABEL_INFINITY); } @@ -2441,6 +2442,11 @@ babel_iface_shutdown(struct babel_iface *ifa) { if (ifa->sk) { + /* + * Retract all our routes and lower the hello interval so peers' neighbour + * state expires quickly + */ + babel_send_hello(ifa, BABEL_MIN_INTERVAL); babel_send_wildcard_retraction(ifa); babel_send_queue(ifa); } @@ -2488,7 +2494,6 @@ babel_reconfigure(struct proto *P, struct proto_config *CF) struct protocol proto_babel = { .name = "Babel", .template = "babel%d", - .class = PROTOCOL_BABEL, .preference = DEF_PREF_BABEL, .channel_mask = NB_IP | NB_IP6_SADR, .proto_size = sizeof(struct babel_proto), @@ -2499,5 +2504,16 @@ struct protocol proto_babel = { .start = babel_start, .shutdown = babel_shutdown, .reconfigure = babel_reconfigure, - .get_attr = babel_get_attr }; + +void +babel_build(void) +{ + proto_build(&proto_babel); + + EA_REGISTER_ALL( + &ea_babel_metric, + &ea_babel_router_id, + &ea_babel_seqno + ); +} diff --git a/proto/babel/babel.h b/proto/babel/babel.h index 8b6da3c8..a980d1da 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -16,7 +16,7 @@ #include "nest/bird.h" #include "nest/cli.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/locks.h" #include "nest/password.h" @@ -26,10 +26,6 @@ #include "lib/string.h" #include "lib/timer.h" -#define EA_BABEL_METRIC EA_CODE(PROTOCOL_BABEL, 0) -#define EA_BABEL_ROUTER_ID EA_CODE(PROTOCOL_BABEL, 1) -#define EA_BABEL_SEQNO EA_CODE(PROTOCOL_BABEL, 2) - #define BABEL_MAGIC 42 #define BABEL_VERSION 2 #define BABEL_PORT 6696 diff --git a/proto/babel/config.Y b/proto/babel/config.Y index 05210fa4..a4350eed 100644 --- a/proto/babel/config.Y +++ b/proto/babel/config.Y @@ -24,7 +24,7 @@ CF_DECLS CF_KEYWORDS(BABEL, INTERFACE, METRIC, RXCOST, HELLO, UPDATE, INTERVAL, PORT, TYPE, WIRED, WIRELESS, RX, TX, BUFFER, PRIORITY, LENGTH, CHECK, LINK, - NEXT, HOP, IPV4, IPV6, BABEL_METRIC, SHOW, INTERFACES, NEIGHBORS, + NEXT, HOP, IPV4, IPV6, SHOW, INTERFACES, NEIGHBORS, ENTRIES, RANDOMIZE, ROUTER, ID, AUTHENTICATION, NONE, MAC, PERMISSIVE) CF_GRAMMAR @@ -163,8 +163,6 @@ babel_iface_opt_list: babel_iface: babel_iface_start iface_patt_list_nopx babel_iface_opt_list babel_iface_finish; -dynamic_attr: BABEL_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_BABEL_METRIC); } ; - CF_CLI_HELP(SHOW BABEL, ..., [[Show information about Babel protocol]]); CF_CLI(SHOW BABEL INTERFACES, optproto opttext, [<name>] [\"<interface>\"], [[Show information about Babel interfaces]]) diff --git a/proto/babel/packets.c b/proto/babel/packets.c index f13410e2..d4acc170 100644 --- a/proto/babel/packets.c +++ b/proto/babel/packets.c @@ -1318,7 +1318,6 @@ babel_send_to(struct babel_iface *ifa, ip_addr dest) static uint babel_write_queue(struct babel_iface *ifa, list *queue) { - struct babel_proto *p = ifa->proto; struct babel_write_state state = { .next_hop_ip6 = ifa->addr }; if (EMPTY_LIST(*queue)) @@ -1346,7 +1345,7 @@ babel_write_queue(struct babel_iface *ifa, list *queue) pos += len; rem_node(NODE msg); - sl_free(p->msg_slab, msg); + sl_free(msg); } pos += babel_auth_add_tlvs(ifa, (struct babel_tlv *) pos, end - pos); @@ -1507,13 +1506,13 @@ babel_process_packet(struct babel_iface *ifa, else if (res == PARSE_IGNORE) { DBG("Babel: Ignoring TLV of type %d\n", tlv->type); - sl_free(p->msg_slab, msg); + sl_free(msg); } else /* PARSE_ERROR */ { LOG_PKT("Bad TLV from %I via %s type %d pos %d - parse error", saddr, ifa->iface->name, tlv->type, (int) ((byte *)tlv - (byte *)pkt)); - sl_free(p->msg_slab, msg); + sl_free(msg); break; } } @@ -1525,7 +1524,7 @@ babel_process_packet(struct babel_iface *ifa, if (tlv_data[msg->msg.type].handle_tlv) tlv_data[msg->msg.type].handle_tlv(&msg->msg, ifa); rem_node(NODE msg); - sl_free(p->msg_slab, msg); + sl_free(msg); } } @@ -2011,7 +2010,7 @@ babel_auth_sign(struct babel_iface *ifa, ip_addr dest) } DBG("Added MAC signatures (%d bytes) on ifa %s for dest %I\n", - tot_len, ifa->ifname, dest); + pos - (pkt + len), ifa->ifname, dest); return pos - (pkt + len); } diff --git a/proto/bfd/bfd.c b/proto/bfd/bfd.c index dd3488d4..25ff19ac 100644 --- a/proto/bfd/bfd.c +++ b/proto/bfd/bfd.c @@ -82,7 +82,7 @@ * BFD thread to the main thread. This is done in an asynchronous way, sesions * with pending notifications are linked (in the BFD thread) to @notify_list in * &bfd_proto, and then bfd_notify_hook() in the main thread is activated using - * bfd_notify_kick() and a pipe. The hook then processes scheduled sessions and + * a standard event sending code. The hook then processes scheduled sessions and * calls hooks from associated BFD requests. This @notify_list (and state fields * in structure &bfd_session) is protected by a spinlock in &bfd_proto and * functions bfd_lock_sessions() / bfd_unlock_sessions(). @@ -113,7 +113,6 @@ #define HASH_IP_EQ(a1,n1,a2,n2) ipa_equal(a1, a2) && n1 == n2 #define HASH_IP_FN(a,n) ipa_hash(a) ^ u32_hash(n) -DEFINE_DOMAIN(rtable); #define BFD_LOCK LOCK_DOMAIN(rtable, bfd_global.lock) #define BFD_UNLOCK UNLOCK_DOMAIN(rtable, bfd_global.lock) @@ -129,7 +128,6 @@ const char *bfd_state_names[] = { "AdminDown", "Down", "Init", "Up" }; static void bfd_session_set_min_tx(struct bfd_session *s, u32 val); static struct bfd_iface *bfd_get_iface(struct bfd_proto *p, ip_addr local, struct iface *iface); static void bfd_free_iface(struct bfd_iface *ifa); -static inline void bfd_notify_kick(struct bfd_proto *p); /* @@ -178,7 +176,7 @@ bfd_session_update_state(struct bfd_session *s, uint state, uint diag) bfd_session_set_min_tx(s, s->cf.idle_tx_int); if (notify) - bfd_notify_kick(p); + ev_send(&global_event_list, &p->notify_event); } static void @@ -512,7 +510,7 @@ bfd_remove_session_locked(struct bfd_proto *p, struct bfd_session *s) TRACE(D_EVENTS, "Session to %I removed", s->addr); - sl_free(p->session_slab, s); + sl_free(s); } static void @@ -590,6 +588,9 @@ bfd_get_iface(struct bfd_proto *p, ip_addr local, struct iface *iface) ifa->sk = bfd_open_tx_sk(p, local, iface); ifa->uc = 1; + if (cf->strict_bind) + ifa->rx = bfd_open_rx_sk_bound(p, local, iface); + add_tail(&p->iface_list, &ifa->n); return ifa; @@ -607,6 +608,12 @@ bfd_free_iface(struct bfd_iface *ifa) rfree(ifa->sk); } + if (ifa->rx) + { + sk_stop(ifa->rx); + rfree(ifa->rx); + } + rem_node(&ifa->n); mb_free(ifa); } @@ -944,21 +951,15 @@ bfd_reconfigure_neighbors(struct bfd_proto *p, struct bfd_config *new) /* This core notify code should be replaced after main loop transition to birdloop */ -int pipe(int pipefd[2]); -void pipe_drain(int fd); -void pipe_kick(int fd); - -static int -bfd_notify_hook(sock *sk, uint len UNUSED) +static void +bfd_notify_hook(void *data) { - struct bfd_proto *p = sk->data; + struct bfd_proto *p = data; struct bfd_session *s; list tmp_list; u8 state, diag; node *n, *nn; - pipe_drain(sk->fd); - bfd_lock_sessions(p); init_list(&tmp_list); add_tail_list(&tmp_list, &p->notify_list); @@ -982,68 +983,12 @@ bfd_notify_hook(sock *sk, uint len UNUSED) if (EMPTY_LIST(s->request_list)) bfd_remove_session(p, s); } - - return 0; } -static inline void -bfd_notify_kick(struct bfd_proto *p) -{ - pipe_kick(p->notify_ws->fd); -} - -static void -bfd_noterr_hook(sock *sk, int err) -{ - struct bfd_proto *p = sk->data; - log(L_ERR "%s: Notify socket error: %m", p->p.name, err); -} - -static void -bfd_notify_init(struct bfd_proto *p) -{ - int pfds[2]; - sock *sk; - - int rv = pipe(pfds); - if (rv < 0) - die("pipe: %m"); - - sk = sk_new(p->p.pool); - sk->type = SK_MAGIC; - sk->rx_hook = bfd_notify_hook; - sk->err_hook = bfd_noterr_hook; - sk->fd = pfds[0]; - sk->data = p; - if (sk_open(sk) < 0) - die("bfd: sk_open failed"); - p->notify_rs = sk; - - /* The write sock is not added to any event loop */ - sk = sk_new(p->p.pool); - sk->type = SK_MAGIC; - sk->fd = pfds[1]; - sk->data = p; - sk->flags = SKF_THREAD; - if (sk_open(sk) < 0) - die("bfd: sk_open failed"); - p->notify_ws = sk; -} - - /* * BFD protocol glue */ -void -bfd_init_all(void) -{ - bfd_global.lock = DOMAIN_NEW(rtable, "BFD Global"); - init_list(&bfd_global.wait_list); - init_list(&bfd_global.pickup_list); - init_list(&bfd_global.proto_list); -} - static struct proto * bfd_init(struct proto_config *c) { @@ -1071,21 +1016,27 @@ bfd_start(struct proto *P) init_list(&p->iface_list); init_list(&p->notify_list); - bfd_notify_init(p); + p->notify_event = (event) { + .hook = bfd_notify_hook, + .data = p, + }; add_tail(&bfd_global.proto_list, &p->bfd_node); - if (cf->accept_ipv4 && cf->accept_direct) - p->rx4_1 = bfd_open_rx_sk(p, 0, SK_IPV4); + if (!cf->strict_bind) + { + if (cf->accept_ipv4 && cf->accept_direct) + p->rx4_1 = bfd_open_rx_sk(p, 0, SK_IPV4); - if (cf->accept_ipv4 && cf->accept_multihop) - p->rx4_m = bfd_open_rx_sk(p, 1, SK_IPV4); + if (cf->accept_ipv4 && cf->accept_multihop) + p->rx4_m = bfd_open_rx_sk(p, 1, SK_IPV4); - if (cf->accept_ipv6 && cf->accept_direct) - p->rx6_1 = bfd_open_rx_sk(p, 0, SK_IPV6); + if (cf->accept_ipv6 && cf->accept_direct) + p->rx6_1 = bfd_open_rx_sk(p, 0, SK_IPV6); - if (cf->accept_ipv6 && cf->accept_multihop) - p->rx6_m = bfd_open_rx_sk(p, 1, SK_IPV6); + if (cf->accept_ipv6 && cf->accept_multihop) + p->rx6_m = bfd_open_rx_sk(p, 1, SK_IPV6); + } bfd_take_requests(p); @@ -1130,7 +1081,8 @@ bfd_reconfigure(struct proto *P, struct proto_config *c) if ((new->accept_ipv4 != old->accept_ipv4) || (new->accept_ipv6 != old->accept_ipv6) || (new->accept_direct != old->accept_direct) || - (new->accept_multihop != old->accept_multihop)) + (new->accept_multihop != old->accept_multihop) || + (new->strict_bind != old->strict_bind)) return 0; birdloop_mask_wakeups(p->p.loop); @@ -1205,7 +1157,6 @@ bfd_show_sessions(struct proto *P) struct protocol proto_bfd = { .name = "BFD", .template = "bfd%d", - .class = PROTOCOL_BFD, .proto_size = sizeof(struct bfd_proto), .config_size = sizeof(struct bfd_config), .init = bfd_init, @@ -1214,3 +1165,14 @@ struct protocol proto_bfd = { .reconfigure = bfd_reconfigure, .copy_config = bfd_copy_config, }; + +void +bfd_build(void) +{ + proto_build(&proto_bfd); + + bfd_global.lock = DOMAIN_NEW(rtable, "BFD Global"); + init_list(&bfd_global.wait_list); + init_list(&bfd_global.pickup_list); + init_list(&bfd_global.proto_list); +} diff --git a/proto/bfd/bfd.h b/proto/bfd/bfd.h index 8430064b..9a8e20c6 100644 --- a/proto/bfd/bfd.h +++ b/proto/bfd/bfd.h @@ -13,7 +13,7 @@ #include "nest/cli.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/password.h" #include "conf/conf.h" #include "lib/hash.h" @@ -47,6 +47,7 @@ struct bfd_config u8 accept_ipv6; u8 accept_direct; u8 accept_multihop; + u8 strict_bind; }; struct bfd_iface_config @@ -98,8 +99,7 @@ struct bfd_proto HASH(struct bfd_session) session_hash_id; HASH(struct bfd_session) session_hash_ip; - sock *notify_rs; - sock *notify_ws; + event notify_event; list notify_list; sock *rx4_1; @@ -118,6 +118,7 @@ struct bfd_iface struct bfd_proto *bfd; sock *sk; + sock *rx; u32 uc; u8 changed; }; @@ -223,6 +224,7 @@ void bfd_show_sessions(struct proto *P); /* packets.c */ void bfd_send_ctl(struct bfd_proto *p, struct bfd_session *s, int final); sock * bfd_open_rx_sk(struct bfd_proto *p, int multihop, int inet_version); +sock * bfd_open_rx_sk_bound(struct bfd_proto *p, ip_addr local, struct iface *ifa); sock * bfd_open_tx_sk(struct bfd_proto *p, ip_addr local, struct iface *ifa); diff --git a/proto/bfd/config.Y b/proto/bfd/config.Y index ed5479fb..0d6e33fa 100644 --- a/proto/bfd/config.Y +++ b/proto/bfd/config.Y @@ -23,7 +23,8 @@ CF_DECLS CF_KEYWORDS(BFD, MIN, IDLE, RX, TX, INTERVAL, MULTIPLIER, PASSIVE, INTERFACE, MULTIHOP, NEIGHBOR, DEV, LOCAL, AUTHENTICATION, - NONE, SIMPLE, METICULOUS, KEYED, MD5, SHA1, IPV4, IPV6, DIRECT) + NONE, SIMPLE, METICULOUS, KEYED, MD5, SHA1, IPV4, IPV6, DIRECT, + STRICT, BIND) %type <iface> bfd_neigh_iface %type <a> bfd_neigh_local @@ -49,6 +50,7 @@ bfd_proto_item: | INTERFACE bfd_iface | MULTIHOP bfd_multihop | NEIGHBOR bfd_neighbor + | STRICT BIND bool { BFD_CFG->strict_bind = $3; } ; bfd_proto_opts: diff --git a/proto/bfd/packets.c b/proto/bfd/packets.c index 37d77f37..6f0b4eaf 100644 --- a/proto/bfd/packets.c +++ b/proto/bfd/packets.c @@ -366,7 +366,9 @@ bfd_rx_hook(sock *sk, uint len) if (ps > BFD_STATE_DOWN) DROP("invalid init state", ps); - uint ifindex = (sk->sport == BFD_CONTROL_PORT) ? sk->lifindex : 0; + uint ifindex = (sk->sport == BFD_CONTROL_PORT) ? + (sk->iface ? sk->iface->index : sk->lifindex) : + 0; s = bfd_find_session_by_addr(p, sk->faddr, ifindex); /* FIXME: better session matching and message */ @@ -439,6 +441,38 @@ bfd_open_rx_sk(struct bfd_proto *p, int multihop, int af) } sock * +bfd_open_rx_sk_bound(struct bfd_proto *p, ip_addr local, struct iface *ifa) +{ + sock *sk = sk_new(p->tpool); + sk->type = SK_UDP; + sk->saddr = local; + sk->sport = ifa ? BFD_CONTROL_PORT : BFD_MULTI_CTL_PORT; + sk->iface = ifa; + sk->vrf = p->p.vrf; + sk->data = p; + + sk->rbsize = BFD_MAX_LEN; + sk->rx_hook = bfd_rx_hook; + sk->err_hook = bfd_err_hook; + + /* TODO: configurable ToS and priority */ + sk->tos = IP_PREC_INTERNET_CONTROL; + sk->priority = sk_priority_control; + sk->flags = SKF_THREAD | SKF_BIND | (ifa ? SKF_TTL_RX : 0); + + if (sk_open(sk) < 0) + goto err; + + sk_start(sk); + return sk; + + err: + sk_log_error(sk, p->p.name); + rfree(sk); + return NULL; +} + +sock * bfd_open_tx_sk(struct bfd_proto *p, ip_addr local, struct iface *ifa) { sock *sk = sk_new(p->p.pool); diff --git a/proto/bgp/Makefile b/proto/bgp/Makefile index 00aaef5e..f6a38678 100644 --- a/proto/bgp/Makefile +++ b/proto/bgp/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 1080db77..bfdd9ac5 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -15,12 +15,13 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "conf/conf.h" #include "lib/resource.h" #include "lib/string.h" #include "lib/unaligned.h" +#include "lib/macro.h" #include "bgp.h" @@ -45,9 +46,9 @@ * * export - Hook that validates and normalizes attribute during export phase. * Receives eattr, may modify it (e.g., sort community lists for canonical - * representation), UNSET() it (e.g., skip empty lists), or WITHDRAW() it if - * necessary. May assume that eattr has value valid w.r.t. its type, but may be - * invalid w.r.t. BGP constraints. Optional. + * representation), UNSET() it (e.g., skip empty lists), or REJECT() the route + * if necessary. May assume that eattr has value valid w.r.t. its type, but may + * be invalid w.r.t. BGP constraints. Optional. * * encode - Hook that converts internal representation to external one during * packet writing. Receives eattr and puts it in the buffer (including attribute @@ -64,37 +65,72 @@ * format - Optional hook that converts eattr to textual representation. */ - -struct bgp_attr_desc { - const char *name; - uint type; - uint flags; - void (*export)(struct bgp_export_state *s, eattr *a); - int (*encode)(struct bgp_write_state *s, eattr *a, byte *buf, uint size); - void (*decode)(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to); - void (*format)(const eattr *ea, byte *buf, uint size); +union bgp_attr_desc { + struct ea_class class; + struct { + EA_CLASS_INSIDE; + uint flags; + void (*export)(struct bgp_export_state *s, eattr *a); + int (*encode)(struct bgp_write_state *s, eattr *a, byte *buf, uint size); + void (*decode)(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to); + }; }; -static const struct bgp_attr_desc bgp_attr_table[]; +static union bgp_attr_desc bgp_attr_table[]; +static inline const union bgp_attr_desc *bgp_find_attr_desc(eattr *a) +{ + const struct ea_class *class = ea_class_find(a->id); -static inline int bgp_attr_known(uint code); + if ((class < &bgp_attr_table[0].class) || (class >= &bgp_attr_table[BGP_ATTR_MAX].class)) + return NULL; -eattr * -bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintptr_t val) + return (const union bgp_attr_desc *) class; +} + +#define BGP_EA_ID(code) (bgp_attr_table[code].id) +#define EA_BGP_ID(code) (((union bgp_attr_desc *) ea_class_find(code)) - bgp_attr_table) + +void bgp_set_attr_u32(ea_list **to, uint code, uint flags, u32 val) +{ + const union bgp_attr_desc *desc = &bgp_attr_table[code]; + + ea_set_attr(to, EA_LITERAL_EMBEDDED( + &desc->class, + flags & ~BAF_EXT_LEN, + val + )); +} + +void bgp_set_attr_ptr(ea_list **to, uint code, uint flags, const struct adata *ad) { - ASSERT(bgp_attr_known(code)); + const union bgp_attr_desc *desc = &bgp_attr_table[code]; - return ea_set_attr( - attrs, - pool, - EA_CODE(PROTOCOL_BGP, code), - flags & ~BAF_EXT_LEN, - bgp_attr_table[code].type, - val - ); + ea_set_attr(to, EA_LITERAL_DIRECT_ADATA( + &desc->class, + flags & ~BAF_EXT_LEN, + ad + )); } +void +bgp_set_attr_data(ea_list **to, uint code, uint flags, void *data, uint len) +{ + const union bgp_attr_desc *desc = &bgp_attr_table[code]; + ea_set_attr(to, EA_LITERAL_STORE_ADATA( + &desc->class, + flags & ~BAF_EXT_LEN, + data, + len + )); +} + +void +bgp_unset_attr(ea_list **to, uint code) +{ + const union bgp_attr_desc *desc = &bgp_attr_table[code]; + ea_unset_attr(to, 0, &desc->class); +} #define REPORT(msg, args...) \ ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); }) @@ -106,7 +142,10 @@ bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintp ({ REPORT(msg, ## args); s->err_withdraw = 1; return; }) #define UNSET(a) \ - ({ a->type = EAF_TYPE_UNDEF; return; }) + ({ a->undef = 1; return; }) + +#define REJECT(msg, args...) \ + ({ log(L_ERR "%s: " msg, s->proto->p.name, ## args); s->err_reject = 1; return; }) #define NEW_BGP "Discarding %s attribute received from AS4-aware neighbor" #define BAD_EBGP "Discarding %s attribute received from EBGP neighbor" @@ -148,7 +187,7 @@ bgp_encode_u8(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) if (size < (3+1)) return -1; - bgp_put_attr_hdr3(buf, EA_ID(a->id), a->flags, 1); + bgp_put_attr_hdr3(buf, EA_BGP_ID(a->id), a->flags, 1); buf[3] = a->u.data; return 3+1; @@ -160,7 +199,7 @@ bgp_encode_u32(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) if (size < (3+4)) return -1; - bgp_put_attr_hdr3(buf, EA_ID(a->id), a->flags, 4); + bgp_put_attr_hdr3(buf, EA_BGP_ID(a->id), a->flags, 4); put_u32(buf+3, a->u.data); return 3+4; @@ -174,7 +213,7 @@ bgp_encode_u32s(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size if (size < (4+len)) return -1; - uint hdr = bgp_put_attr_hdr(buf, EA_ID(a->id), a->flags, len); + uint hdr = bgp_put_attr_hdr(buf, EA_BGP_ID(a->id), a->flags, len); put_u32s(buf + hdr, (u32 *) a->u.ptr->data, len / 4); return hdr + len; @@ -195,7 +234,7 @@ bgp_put_attr(byte *buf, uint size, uint code, uint flags, const byte *data, uint static int bgp_encode_raw(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) { - return bgp_put_attr(buf, size, EA_ID(a->id), a->flags, a->u.ptr->data, a->u.ptr->length); + return bgp_put_attr(buf, size, EA_BGP_ID(a->id), a->flags, a->u.ptr->data, a->u.ptr->length); } @@ -333,26 +372,26 @@ bgp_aigp_set_metric(struct linpool *pool, const struct adata *ad, u64 metric) } int -bgp_total_aigp_metric_(struct rta *a, u64 *metric, const struct adata **ad) +bgp_total_aigp_metric_(const rte *e, u64 *metric, const struct adata **ad) { - eattr *ea = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AIGP)); - if (!ea) + eattr *a = ea_find(e->attrs, BGP_EA_ID(BA_AIGP)); + if (!a) return 0; - const byte *b = bgp_aigp_get_tlv(ea->u.ptr, BGP_AIGP_METRIC); + const byte *b = bgp_aigp_get_tlv(a->u.ptr, BGP_AIGP_METRIC); if (!b) return 0; u64 aigp = get_u64(b + 3); - u64 step = a->igp_metric; + u64 step = rt_get_igp_metric(e); - if (!rta_resolvable(a) || (step >= IGP_METRIC_UNKNOWN)) + if (!rte_resolvable(e) || (step >= IGP_METRIC_UNKNOWN)) step = BGP_AIGP_MAX; if (!step) step = 1; - *ad = ea->u.ptr; + *ad = a->u.ptr; *metric = aigp + step; if (*metric < aigp) *metric = BGP_AIGP_MAX; @@ -363,7 +402,7 @@ bgp_total_aigp_metric_(struct rta *a, u64 *metric, const struct adata **ad) static inline int bgp_init_aigp_metric(rte *e, u64 *metric, const struct adata **ad) { - if (e->attrs->source == RTS_BGP) + if (rt_get_source_attr(e) == RTS_BGP) return 0; *metric = rt_get_igp_metric(e); @@ -372,9 +411,9 @@ bgp_init_aigp_metric(rte *e, u64 *metric, const struct adata **ad) } u32 -bgp_rte_igp_metric(struct rte *rt) +bgp_rte_igp_metric(const rte *rt) { - u64 metric = bgp_total_aigp_metric(rt->attrs); + u64 metric = bgp_total_aigp_metric(rt); return (u32) MIN(metric, (u64) IGP_METRIC_UNKNOWN); } @@ -387,7 +426,7 @@ static void bgp_export_origin(struct bgp_export_state *s, eattr *a) { if (a->u.data > 2) - WITHDRAW(BAD_VALUE, "ORIGIN", a->u.data); + REJECT(BAD_VALUE, "ORIGIN", a->u.data); } static void @@ -399,7 +438,7 @@ bgp_decode_origin(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte if (data[0] > 2) WITHDRAW(BAD_VALUE, "ORIGIN", data[0]); - bgp_set_attr_u32(to, s->pool, BA_ORIGIN, flags, data[0]); + bgp_set_attr_u32(to, BA_ORIGIN, flags, data[0]); } static void @@ -467,7 +506,7 @@ bgp_decode_as_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte !bgp_as_path_first_as_equal(data, len, p->remote_as)) WITHDRAW("Malformed AS_PATH attribute - %s", "First AS differs from neigbor AS"); - bgp_set_attr_data(to, s->pool, BA_AS_PATH, flags, data, len); + bgp_set_attr_data(to, BA_AS_PATH, flags, data, len); } @@ -539,7 +578,7 @@ bgp_decode_med(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *da WITHDRAW(BAD_LENGTH, "MULTI_EXIT_DISC", len); u32 val = get_u32(data); - bgp_set_attr_u32(to, s->pool, BA_MULTI_EXIT_DISC, flags, val); + bgp_set_attr_u32(to, BA_MULTI_EXIT_DISC, flags, val); } @@ -560,7 +599,7 @@ bgp_decode_local_pref(struct bgp_parse_state *s, uint code UNUSED, uint flags, b WITHDRAW(BAD_LENGTH, "LOCAL_PREF", len); u32 val = get_u32(data); - bgp_set_attr_u32(to, s->pool, BA_LOCAL_PREF, flags, val); + bgp_set_attr_u32(to, BA_LOCAL_PREF, flags, val); } @@ -570,7 +609,7 @@ bgp_decode_atomic_aggr(struct bgp_parse_state *s, uint code UNUSED, uint flags, if (len != 0) DISCARD(BAD_LENGTH, "ATOMIC_AGGR", len); - bgp_set_attr_data(to, s->pool, BA_ATOMIC_AGGR, flags, NULL, 0); + bgp_set_attr_data(to, BA_ATOMIC_AGGR, flags, NULL, 0); } static int @@ -604,7 +643,7 @@ bgp_decode_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flags, b len = aggregator_16to32(data, src); } - bgp_set_attr_data(to, s->pool, BA_AGGREGATOR, flags, data, len); + bgp_set_attr_data(to, BA_AGGREGATOR, flags, data, len); } static void @@ -633,7 +672,7 @@ bgp_decode_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, by struct adata *ad = lp_alloc_adata(s->pool, len); get_u32s(data, (u32 *) ad->data, len / 4); - bgp_set_attr_ptr(to, s->pool, BA_COMMUNITY, flags, ad); + bgp_set_attr_ptr(to, BA_COMMUNITY, flags, ad); } @@ -654,7 +693,7 @@ bgp_decode_originator_id(struct bgp_parse_state *s, uint code UNUSED, uint flags WITHDRAW(BAD_LENGTH, "ORIGINATOR_ID", len); u32 val = get_u32(data); - bgp_set_attr_u32(to, s->pool, BA_ORIGINATOR_ID, flags, val); + bgp_set_attr_u32(to, BA_ORIGINATOR_ID, flags, val); } @@ -679,7 +718,7 @@ bgp_decode_cluster_list(struct bgp_parse_state *s, uint code UNUSED, uint flags, struct adata *ad = lp_alloc_adata(s->pool, len); get_u32s(data, (u32 *) ad->data, len / 4); - bgp_set_attr_ptr(to, s->pool, BA_CLUSTER_LIST, flags, ad); + bgp_set_attr_ptr(to, BA_CLUSTER_LIST, flags, ad); } static void @@ -798,7 +837,7 @@ bgp_decode_ext_community(struct bgp_parse_state *s, uint code UNUSED, uint flags struct adata *ad = lp_alloc_adata(s->pool, len); get_u32s(data, (u32 *) ad->data, len / 4); - bgp_set_attr_ptr(to, s->pool, BA_EXT_COMMUNITY, flags, ad); + bgp_set_attr_ptr(to, BA_EXT_COMMUNITY, flags, ad); } @@ -811,7 +850,7 @@ bgp_decode_as4_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flag if (len != 8) DISCARD(BAD_LENGTH, "AS4_AGGREGATOR", len); - bgp_set_attr_data(to, s->pool, BA_AS4_AGGREGATOR, flags, data, len); + bgp_set_attr_data(to, BA_AS4_AGGREGATOR, flags, data, len); } static void @@ -841,7 +880,7 @@ bgp_decode_as4_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byt a = as_path_strip_confed(s->pool, a); } - bgp_set_attr_ptr(to, s->pool, BA_AS4_PATH, flags, a); + bgp_set_attr_ptr(to, BA_AS4_PATH, flags, a); } @@ -865,7 +904,7 @@ bgp_decode_aigp(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *d if (!bgp_aigp_valid(data, len, err, sizeof(err))) DISCARD("Malformed AIGP attribute - %s", err); - bgp_set_attr_data(to, s->pool, BA_AIGP, flags, data, len); + bgp_set_attr_data(to, BA_AIGP, flags, data, len); } static void @@ -897,9 +936,21 @@ bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint fla struct adata *ad = lp_alloc_adata(s->pool, len); get_u32s(data, (u32 *) ad->data, len / 4); - bgp_set_attr_ptr(to, s->pool, BA_LARGE_COMMUNITY, flags, ad); + bgp_set_attr_ptr(to, BA_LARGE_COMMUNITY, flags, ad); } + +static void +bgp_decode_otc(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to) +{ + if (len != 4) + WITHDRAW(BAD_LENGTH, "OTC", len); + + u32 val = get_u32(data); + bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, flags, val); +} + + static void bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a) { @@ -909,20 +960,20 @@ bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a) /* Perhaps we should just ignore it? */ if (!s->mpls) - WITHDRAW("Unexpected MPLS stack"); + REJECT("Unexpected MPLS stack"); /* Empty MPLS stack is not allowed */ if (!lnum) - WITHDRAW("Malformed MPLS stack - empty"); + REJECT("Malformed MPLS stack - empty"); /* This is ugly, but we must ensure that labels fit into NLRI field */ if ((24*lnum + (net_is_vpn(n) ? 64 : 0) + net_pxlen(n)) > 255) - WITHDRAW("Malformed MPLS stack - too many labels (%u)", lnum); + REJECT("Malformed MPLS stack - too many labels (%u)", lnum); for (uint i = 0; i < lnum; i++) { if (labels[i] > 0xfffff) - WITHDRAW("Malformed MPLS stack - invalid label (%u)", labels[i]); + REJECT("Malformed MPLS stack - invalid label (%u)", labels[i]); /* TODO: Check for special-purpose label values? */ } @@ -970,10 +1021,29 @@ bgp_format_mpls_label_stack(const eattr *a, byte *buf, uint size) } static inline void -bgp_decode_unknown(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to) +bgp_export_unknown(struct bgp_export_state *s UNUSED, eattr *a) +{ + if (!(a->flags & BAF_TRANSITIVE)) + UNSET(a); + + a->flags |= BAF_PARTIAL; +} + +static inline void +bgp_decode_unknown(struct bgp_parse_state *s UNUSED, uint code, uint flags, byte *data, uint len, ea_list **to) { + if (!(flags & BAF_OPTIONAL)) + WITHDRAW("Unknown attribute (code %u) - conflicting flags (%02x)", code, flags); + /* Cannot use bgp_set_attr_data() as it works on known attributes only */ - ea_set_attr_data(to, s->pool, EA_CODE(PROTOCOL_BGP, code), flags, EAF_TYPE_OPAQUE, data, len); + ea_set_attr_data(to, &bgp_attr_table[code].class, flags, data, len); +} + +static inline void +bgp_format_unknown(const eattr *a, byte *buf, uint size) +{ + if (a->flags & BAF_TRANSITIVE) + bsnprintf(buf, size, "(transitive)"); } @@ -981,10 +1051,10 @@ bgp_decode_unknown(struct bgp_parse_state *s, uint code, uint flags, byte *data, * Attribute table */ -static const struct bgp_attr_desc bgp_attr_table[] = { +static union bgp_attr_desc bgp_attr_table[BGP_ATTR_MAX] = { [BA_ORIGIN] = { - .name = "origin", - .type = EAF_TYPE_INT, + .name = "bgp_origin", + .type = T_ENUM_BGP_ORIGIN, .flags = BAF_TRANSITIVE, .export = bgp_export_origin, .encode = bgp_encode_u8, @@ -992,69 +1062,69 @@ static const struct bgp_attr_desc bgp_attr_table[] = { .format = bgp_format_origin, }, [BA_AS_PATH] = { - .name = "as_path", - .type = EAF_TYPE_AS_PATH, + .name = "bgp_path", + .type = T_PATH, .flags = BAF_TRANSITIVE, .encode = bgp_encode_as_path, .decode = bgp_decode_as_path, }, [BA_NEXT_HOP] = { - .name = "next_hop", - .type = EAF_TYPE_IP_ADDRESS, + .name = "bgp_next_hop", + .type = T_IP, .flags = BAF_TRANSITIVE, .encode = bgp_encode_next_hop, .decode = bgp_decode_next_hop, .format = bgp_format_next_hop, }, [BA_MULTI_EXIT_DISC] = { - .name = "med", - .type = EAF_TYPE_INT, + .name = "bgp_med", + .type = T_INT, .flags = BAF_OPTIONAL, .encode = bgp_encode_u32, .decode = bgp_decode_med, }, [BA_LOCAL_PREF] = { - .name = "local_pref", - .type = EAF_TYPE_INT, + .name = "bgp_local_pref", + .type = T_INT, .flags = BAF_TRANSITIVE, .export = bgp_export_local_pref, .encode = bgp_encode_u32, .decode = bgp_decode_local_pref, }, [BA_ATOMIC_AGGR] = { - .name = "atomic_aggr", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_atomic_aggr", + .type = T_OPAQUE, .flags = BAF_TRANSITIVE, .encode = bgp_encode_raw, .decode = bgp_decode_atomic_aggr, }, [BA_AGGREGATOR] = { - .name = "aggregator", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_aggregator", + .type = T_OPAQUE, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .encode = bgp_encode_aggregator, .decode = bgp_decode_aggregator, .format = bgp_format_aggregator, }, [BA_COMMUNITY] = { - .name = "community", - .type = EAF_TYPE_INT_SET, + .name = "bgp_community", + .type = T_CLIST, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .export = bgp_export_community, .encode = bgp_encode_u32s, .decode = bgp_decode_community, }, [BA_ORIGINATOR_ID] = { - .name = "originator_id", - .type = EAF_TYPE_ROUTER_ID, + .name = "bgp_originator_id", + .type = T_QUAD, .flags = BAF_OPTIONAL, .export = bgp_export_originator_id, .encode = bgp_encode_u32, .decode = bgp_decode_originator_id, }, [BA_CLUSTER_LIST] = { - .name = "cluster_list", - .type = EAF_TYPE_INT_SET, + .name = "bgp_cluster_list", + .type = T_CLIST, .flags = BAF_OPTIONAL, .export = bgp_export_cluster_list, .encode = bgp_encode_u32s, @@ -1062,43 +1132,47 @@ static const struct bgp_attr_desc bgp_attr_table[] = { .format = bgp_format_cluster_list, }, [BA_MP_REACH_NLRI] = { - .name = "mp_reach_nlri", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_mp_reach_nlri", + .type = T_OPAQUE, + .hidden = 1, .flags = BAF_OPTIONAL, .decode = bgp_decode_mp_reach_nlri, }, [BA_MP_UNREACH_NLRI] = { - .name = "mp_unreach_nlri", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_mp_unreach_nlri", + .type = T_OPAQUE, + .hidden = 1, .flags = BAF_OPTIONAL, .decode = bgp_decode_mp_unreach_nlri, }, [BA_EXT_COMMUNITY] = { - .name = "ext_community", - .type = EAF_TYPE_EC_SET, + .name = "bgp_ext_community", + .type = T_ECLIST, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .export = bgp_export_ext_community, .encode = bgp_encode_u32s, .decode = bgp_decode_ext_community, }, [BA_AS4_PATH] = { - .name = "as4_path", - .type = EAF_TYPE_AS_PATH, + .name = "bgp_as4_path", + .type = T_PATH, + .hidden = 1, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .encode = bgp_encode_raw, .decode = bgp_decode_as4_path, }, [BA_AS4_AGGREGATOR] = { - .name = "as4_aggregator", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_as4_aggregator", + .type = T_OPAQUE, + .hidden = 1, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .encode = bgp_encode_raw, .decode = bgp_decode_as4_aggregator, .format = bgp_format_aggregator, }, [BA_AIGP] = { - .name = "aigp", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_aigp", + .type = T_OPAQUE, .flags = BAF_OPTIONAL | BAF_DECODE_FLAGS, .export = bgp_export_aigp, .encode = bgp_encode_raw, @@ -1106,16 +1180,24 @@ static const struct bgp_attr_desc bgp_attr_table[] = { .format = bgp_format_aigp, }, [BA_LARGE_COMMUNITY] = { - .name = "large_community", - .type = EAF_TYPE_LC_SET, + .name = "bgp_large_community", + .type = T_LCLIST, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .export = bgp_export_large_community, .encode = bgp_encode_u32s, .decode = bgp_decode_large_community, }, + [BA_ONLY_TO_CUSTOMER] = { + .name = "otc", + .type = T_INT, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_u32, + .decode = bgp_decode_otc, + }, [BA_MPLS_LABEL_STACK] = { - .name = "mpls_label_stack", - .type = EAF_TYPE_INT_SET, + .name = "bgp_mpls_label_stack", + .type = T_CLIST, + .readonly = 1, .export = bgp_export_mpls_label_stack, .encode = bgp_encode_mpls_label_stack, .decode = bgp_decode_mpls_label_stack, @@ -1123,12 +1205,32 @@ static const struct bgp_attr_desc bgp_attr_table[] = { }, }; -static inline int -bgp_attr_known(uint code) +eattr * +bgp_find_attr(ea_list *attrs, uint code) { - return (code < ARRAY_SIZE(bgp_attr_table)) && bgp_attr_table[code].name; + return ea_find(attrs, BGP_EA_ID(code)); } +void +bgp_register_attrs(void) +{ + for (uint i=0; i<ARRAY_SIZE(bgp_attr_table); i++) + { + if (!bgp_attr_table[i].name) + bgp_attr_table[i] = (union bgp_attr_desc) { + .name = mb_sprintf(&root_pool, "bgp_unknown_0x%02x", i), + .type = T_OPAQUE, + .flags = BAF_OPTIONAL, + .readonly = 1, + .export = bgp_export_unknown, + .encode = bgp_encode_raw, + .decode = bgp_decode_unknown, + .format = bgp_format_unknown, + }; + + ea_register_init(&bgp_attr_table[i].class); + } +} /* * Attribute export @@ -1137,38 +1239,24 @@ bgp_attr_known(uint code) static inline void bgp_export_attr(struct bgp_export_state *s, eattr *a, ea_list *to) { - if (EA_PROTO(a->id) != PROTOCOL_BGP) + const union bgp_attr_desc *desc = bgp_find_attr_desc(a); + if (!desc) return; - uint code = EA_ID(a->id); - - if (bgp_attr_known(code)) - { - const struct bgp_attr_desc *desc = &bgp_attr_table[code]; - - /* The flags might have been zero if the attr was added by filters */ - a->flags = (a->flags & BAF_PARTIAL) | desc->flags; + /* The flags might have been zero if the attr was added locally */ + a->flags = (a->flags & BAF_PARTIAL) | desc->flags; - /* Set partial bit if new opt-trans attribute is attached to non-local route */ - if ((s->src != NULL) && (a->type & EAF_ORIGINATED) && - (a->flags & BAF_OPTIONAL) && (a->flags & BAF_TRANSITIVE)) - a->flags |= BAF_PARTIAL; + /* Set partial bit if new opt-trans attribute is attached to non-local route */ + if ((s->src != NULL) && (a->originated) && + (a->flags & BAF_OPTIONAL) && (a->flags & BAF_TRANSITIVE)) + a->flags |= BAF_PARTIAL; - /* Call specific hook */ - CALL(desc->export, s, a); + /* Call specific hook */ + CALL(desc->export, s, a); - /* Attribute might become undefined in hook */ - if ((a->type & EAF_TYPE_MASK) == EAF_TYPE_UNDEF) - return; - } - else - { - /* Don't re-export unknown non-transitive attributes */ - if (!(a->flags & BAF_TRANSITIVE)) - return; - - a->flags |= BAF_PARTIAL; - } + /* Attribute might become undefined in hook */ + if (a->undef) + return; /* Append updated attribute */ to->attrs[to->count++] = *a; @@ -1188,12 +1276,11 @@ bgp_export_attr(struct bgp_export_state *s, eattr *a, ea_list *to) * Result: one sorted attribute list segment, or NULL if attributes are unsuitable. */ static inline ea_list * -bgp_export_attrs(struct bgp_export_state *s, ea_list *attrs) +bgp_export_attrs(struct bgp_export_state *s, ea_list *a) { /* Merge the attribute list */ - ea_list *new = lp_alloc(s->pool, ea_scan(attrs)); - ea_merge(attrs, new); - ea_sort(new); + ea_list *new = ea_normalize(a, 0); + ASSERT_DIE(new); uint i, count; count = new->count; @@ -1203,7 +1290,7 @@ bgp_export_attrs(struct bgp_export_state *s, ea_list *attrs) for (i = 0; i < count; i++) bgp_export_attr(s, &new->attrs[i], new); - if (s->err_withdraw) + if (s->err_reject) return NULL; return new; @@ -1217,14 +1304,9 @@ bgp_export_attrs(struct bgp_export_state *s, ea_list *attrs) static inline int bgp_encode_attr(struct bgp_write_state *s, eattr *a, byte *buf, uint size) { - ASSERT(EA_PROTO(a->id) == PROTOCOL_BGP); - - uint code = EA_ID(a->id); - - if (bgp_attr_known(code)) - return bgp_attr_table[code].encode(s, a, buf, size); - else - return bgp_encode_raw(s, a, buf, size); + const union bgp_attr_desc *desc = bgp_find_attr_desc(a); + ASSERT_DIE(desc); + return desc->encode(s, a, buf, size); } /** @@ -1289,7 +1371,7 @@ bgp_cluster_list_loopy(struct bgp_proto *p, ea_list *attrs) } static inline void -bgp_decode_attr(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to) +bgp_decode_attr(struct bgp_parse_state *s, byte code, byte flags, byte *data, uint len, ea_list **to) { /* Handle duplicate attributes; RFC 7606 3 (g) */ if (BIT32_TEST(s->attrs_seen, code)) @@ -1301,24 +1383,15 @@ bgp_decode_attr(struct bgp_parse_state *s, uint code, uint flags, byte *data, ui } BIT32_SET(s->attrs_seen, code); - if (bgp_attr_known(code)) - { - const struct bgp_attr_desc *desc = &bgp_attr_table[code]; - - /* Handle conflicting flags; RFC 7606 3 (c) */ - if (((flags ^ desc->flags) & (BAF_OPTIONAL | BAF_TRANSITIVE)) && - !(desc->flags & BAF_DECODE_FLAGS)) - WITHDRAW("Malformed %s attribute - conflicting flags (%02x)", desc->name, flags); + ASSERT_DIE(bgp_attr_table[code].id); + const union bgp_attr_desc *desc = &bgp_attr_table[code]; - desc->decode(s, code, flags, data, len, to); - } - else /* Unknown attribute */ - { - if (!(flags & BAF_OPTIONAL)) - WITHDRAW("Unknown attribute (code %u) - conflicting flags (%02x)", code, flags); + /* Handle conflicting flags; RFC 7606 3 (c) */ + if (((flags ^ desc->flags) & (BAF_OPTIONAL | BAF_TRANSITIVE)) && + !(desc->flags & BAF_DECODE_FLAGS)) + WITHDRAW("Malformed %s attribute - conflicting flags (%02x, expected %02x)", desc->name, flags, desc->flags); - bgp_decode_unknown(s, code, flags, data, len, to); - } + desc->decode(s, code, flags, data, len, to); } /** @@ -1336,7 +1409,8 @@ bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len) { struct bgp_proto *p = s->proto; ea_list *attrs = NULL; - uint code, flags, alen; + uint alen; + byte code, flags; byte *pos = data; /* Parse the attributes */ @@ -1401,23 +1475,23 @@ bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len) /* Reject routes with our ASN in AS_PATH attribute */ if (bgp_as_path_loopy(p, attrs, p->local_as)) - goto withdraw; + goto loop; /* Reject routes with our Confederation ID in AS_PATH attribute; RFC 5065 4.0 */ if ((p->public_as != p->local_as) && bgp_as_path_loopy(p, attrs, p->public_as)) - goto withdraw; + goto loop; /* Reject routes with our Router ID in ORIGINATOR_ID attribute; RFC 4456 8 */ if (p->is_internal && bgp_originator_id_loopy(p, attrs)) - goto withdraw; + goto loop; /* Reject routes with our Cluster ID in CLUSTER_LIST attribute; RFC 4456 8 */ if (p->rr_client && bgp_cluster_list_loopy(p, attrs)) - goto withdraw; + goto loop; /* If there is no local preference, define one */ if (!BIT32_TEST(s->attrs_seen, BA_LOCAL_PREF)) - bgp_set_attr_u32(&attrs, s->pool, BA_LOCAL_PREF, 0, p->cf->default_local_pref); + bgp_set_attr_u32(&attrs, BA_LOCAL_PREF, 0, p->cf->default_local_pref); return attrs; @@ -1434,16 +1508,43 @@ withdraw: s->err_withdraw = 1; return NULL; + +loop: + /* Loops are handled as withdraws, but ignored silently. Do not set err_withdraw. */ + return NULL; } void -bgp_finish_attrs(struct bgp_parse_state *s, rta *a) +bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to) { /* AIGP test here instead of in bgp_decode_aigp() - we need to know channel */ if (BIT32_TEST(s->attrs_seen, BA_AIGP) && !s->channel->cf->aigp) { REPORT("Discarding AIGP attribute received on non-AIGP session"); - bgp_unset_attr(&a->eattrs, s->pool, BA_AIGP); + bgp_unset_attr(to, BA_AIGP); + } + + /* Handle OTC ingress procedure, RFC 9234 */ + if (bgp_channel_is_role_applicable(s->channel)) + { + struct bgp_proto *p = s->proto; + eattr *e = bgp_find_attr(*to, BA_ONLY_TO_CUSTOMER); + + /* Reject routes from downstream if they are leaked */ + if (e && (p->cf->local_role == BGP_ROLE_PROVIDER || + p->cf->local_role == BGP_ROLE_RS_SERVER)) + WITHDRAW("Route leak detected - OTC attribute from downstream"); + + /* Reject routes from peers if they are leaked */ + if (e && (p->cf->local_role == BGP_ROLE_PEER) && (e->u.data != p->cf->remote_as)) + WITHDRAW("Route leak detected - OTC attribute with mismatched ASN (%u)", + (uint) e->u.data); + + /* Mark routes from upstream if it did not happened before */ + if (!e && (p->cf->local_role == BGP_ROLE_CUSTOMER || + p->cf->local_role == BGP_ROLE_PEER || + p->cf->local_role == BGP_ROLE_RS_CLIENT)) + bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, 0, p->cf->remote_as); } } @@ -1458,13 +1559,13 @@ bgp_finish_attrs(struct bgp_parse_state *s, rta *a) #define RBH_FN(a,h) h #define RBH_REHASH bgp_rbh_rehash -#define RBH_PARAMS /8, *2, 2, 2, 8, 20 +#define RBH_PARAMS /8, *2, 2, 2, 12, 20 HASH_DEFINE_REHASH_FN(RBH, struct bgp_bucket) -void -bgp_init_bucket_table(struct bgp_channel *c) +static void +bgp_init_bucket_table(struct bgp_pending_tx *c) { HASH_INIT(c->bucket_hash, c->pool, 8); @@ -1472,24 +1573,8 @@ bgp_init_bucket_table(struct bgp_channel *c) c->withdraw_bucket = NULL; } -void -bgp_free_bucket_table(struct bgp_channel *c) -{ - HASH_FREE(c->bucket_hash); - - struct bgp_bucket *b; - WALK_LIST_FIRST(b, c->bucket_queue) - { - rem_node(&b->send_node); - mb_free(b); - } - - mb_free(c->withdraw_bucket); - c->withdraw_bucket = NULL; -} - static struct bgp_bucket * -bgp_get_bucket(struct bgp_channel *c, ea_list *new) +bgp_get_bucket(struct bgp_pending_tx *c, ea_list *new) { /* Hash and lookup */ u32 hash = ea_hash(new); @@ -1498,55 +1583,27 @@ bgp_get_bucket(struct bgp_channel *c, ea_list *new) if (b) return b; - uint ea_size = sizeof(ea_list) + new->count * sizeof(eattr); - uint ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN); - uint size = sizeof(struct bgp_bucket) + ea_size_aligned; - uint i; - byte *dest; - - /* Gather total size of non-inline attributes */ - for (i = 0; i < new->count; i++) - { - eattr *a = &new->attrs[i]; + /* Scan the list for total size */ + uint ea_size = BIRD_CPU_ALIGN(ea_list_size(new)); + uint size = sizeof(struct bgp_bucket) + ea_size; - if (!(a->type & EAF_EMBEDDED)) - size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN); - } - - /* Create the bucket */ + /* Allocate the bucket */ b = mb_alloc(c->pool, size); *b = (struct bgp_bucket) { }; init_list(&b->prefixes); b->hash = hash; - /* Copy list of extended attributes */ - memcpy(b->eattrs, new, ea_size); - dest = ((byte *) b->eattrs) + ea_size_aligned; - - /* Copy values of non-inline attributes */ - for (i = 0; i < new->count; i++) - { - eattr *a = &b->eattrs->attrs[i]; - - if (!(a->type & EAF_EMBEDDED)) - { - const struct adata *oa = a->u.ptr; - struct adata *na = (struct adata *) dest; - memcpy(na, oa, sizeof(struct adata) + oa->length); - a->u.ptr = na; - dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN); - } - } + /* Copy the ea_list */ + ea_list_copy(b->eattrs, new, ea_size); - /* Insert the bucket to send queue and bucket hash */ - add_tail(&c->bucket_queue, &b->send_node); + /* Insert the bucket to bucket hash */ HASH_INSERT2(c->bucket_hash, RBH, c->pool, b); return b; } static struct bgp_bucket * -bgp_get_withdraw_bucket(struct bgp_channel *c) +bgp_get_withdraw_bucket(struct bgp_pending_tx *c) { if (!c->withdraw_bucket) { @@ -1557,25 +1614,45 @@ bgp_get_withdraw_bucket(struct bgp_channel *c) return c->withdraw_bucket; } -void -bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b) +static void +bgp_free_bucket(struct bgp_pending_tx *c, struct bgp_bucket *b) { - rem_node(&b->send_node); HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b); mb_free(b); } +int +bgp_done_bucket(struct bgp_channel *bc, struct bgp_bucket *b) +{ + struct bgp_pending_tx *c = bc->ptx; + + /* Won't free the withdraw bucket */ + if (b == c->withdraw_bucket) + return 0; + + if (EMPTY_LIST(b->prefixes)) + rem_node(&b->send_node); + + if (b->px_uc || !EMPTY_LIST(b->prefixes)) + return 0; + + bgp_free_bucket(c, b); + return 1; +} + void -bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_defer_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { + struct bgp_pending_tx *c = bc->ptx; rem_node(&b->send_node); add_tail(&c->bucket_queue, &b->send_node); } void -bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_withdraw_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { - struct bgp_proto *p = (void *) c->c.proto; + struct bgp_proto *p = (void *) bc->c.proto; + struct bgp_pending_tx *c = bc->ptx; struct bgp_bucket *wb = bgp_get_withdraw_bucket(c); log(L_ERR "%s: Attribute list too long", p->p.name); @@ -1584,8 +1661,8 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) struct bgp_prefix *px = HEAD(b->prefixes); log(L_ERR "%s: - withdrawing %N", p->p.name, &px->net); - rem_node(&px->buck_node); - add_tail(&wb->prefixes, &px->buck_node); + rem_node(&px->buck_node_xx); + add_tail(&wb->prefixes, &px->buck_node_xx); } } @@ -1596,44 +1673,36 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) #define PXH_KEY(px) px->net, px->path_id, px->hash #define PXH_NEXT(px) px->next -#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && i1 == i2 && net_equal(n1, n2) +#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2) #define PXH_FN(n,i,h) h #define PXH_REHASH bgp_pxh_rehash -#define PXH_PARAMS /8, *2, 2, 2, 8, 24 +#define PXH_PARAMS /8, *2, 2, 2, 12, 24 HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix) -void -bgp_init_prefix_table(struct bgp_channel *c) +static void +bgp_init_prefix_table(struct bgp_channel *bc) { + struct bgp_pending_tx *c = bc->ptx; HASH_INIT(c->prefix_hash, c->pool, 8); - uint alen = net_addr_length[c->c.net_type]; + uint alen = net_addr_length[bc->c.net_type]; c->prefix_slab = alen ? sl_new(c->pool, sizeof(struct bgp_prefix) + alen) : NULL; } -void -bgp_free_prefix_table(struct bgp_channel *c) -{ - HASH_FREE(c->prefix_hash); - - rfree(c->prefix_slab); - c->prefix_slab = NULL; -} - static struct bgp_prefix * -bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id) +bgp_get_prefix(struct bgp_pending_tx *c, const net_addr *net, struct rte_src *src, int add_path_tx) { - u32 hash = net_hash(net) ^ u32_hash(path_id); - struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id, hash); + u32 path_id = src->global_id; + u32 path_id_hash = add_path_tx ? path_id : 0; + /* We must use a different hash function than the rtable */ + u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id_hash)); + struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id_hash, hash); if (px) - { - rem_node(&px->buck_node); return px; - } if (c->prefix_slab) px = sl_alloc(c->prefix_slab); @@ -1644,34 +1713,317 @@ bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id) px->hash = hash; px->path_id = path_id; net_copy(px->net, net); + rt_lock_source(src); HASH_INSERT2(c->prefix_hash, PXH, c->pool, px); return px; } -void -bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px) +static void bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px); + +static inline int +bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *b) +{ +#define IS_WITHDRAW_BUCKET(b) ((b) == c->ptx->withdraw_bucket) +#define BPX_TRACE(what) do { \ + if (c->c.debug & D_ROUTES) log(L_TRACE "%s.%s < %s %N %uG %s", \ + c->c.proto->name, c->c.name, what, \ + px->net, px->path_id, IS_WITHDRAW_BUCKET(b) ? "withdraw" : "update"); } while (0) + px->lastmod = current_time(); + + /* Already queued for the same bucket */ + if (px->cur == b) + { + BPX_TRACE("already queued"); + return 0; + } + + /* Unqueue from the old bucket */ + if (px->cur) + { + rem_node(&px->buck_node_xx); + bgp_done_bucket(c, px->cur); + } + + /* The new bucket is the same as we sent before */ + if ((px->last == b) || c->c.out_table && !px->last && IS_WITHDRAW_BUCKET(b)) + { + if (px->cur) + BPX_TRACE("reverted"); + else + BPX_TRACE("already sent"); + + /* Well, we haven't sent anything yet */ + if (!px->last) + bgp_free_prefix(c->ptx, px); + + px->cur = NULL; + return 0; + } + + /* Enqueue the bucket if it has been empty */ + if (!IS_WITHDRAW_BUCKET(b) && EMPTY_LIST(b->prefixes)) + add_tail(&c->ptx->bucket_queue, &b->send_node); + + /* Enqueue to the new bucket and indicate the change */ + add_tail(&b->prefixes, &px->buck_node_xx); + px->cur = b; + + BPX_TRACE("queued"); + return 1; + +#undef BPX_TRACE +} + +static void +bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px) { - rem_node(&px->buck_node); HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px); + rt_unlock_source(rt_find_source_global(px->path_id)); + if (c->prefix_slab) - sl_free(c->prefix_slab, px); + sl_free(px); else mb_free(px); } +void +bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck) +{ + /* Cleanup: We're called from bucket senders. */ + ASSERT_DIE(px->cur == buck); + rem_node(&px->buck_node_xx); + + /* We may want to store the updates */ + if (c->c.out_table) + { + /* Nothing to be sent right now */ + px->cur = NULL; + + /* Unref the previous sent version */ + if (px->last) + px->last->px_uc--; + + /* Ref the current sent version */ + if (!IS_WITHDRAW_BUCKET(buck)) + { + px->last = buck; + px->last->px_uc++; + return; + } + + /* Prefixes belonging to the withdraw bucket are freed always */ + } + + bgp_free_prefix(c->ptx, px); +} + +static void +bgp_pending_tx_rfree(resource *r) +{ + struct bgp_pending_tx *ptx = SKIP_BACK(struct bgp_pending_tx, r, r); + + HASH_WALK(ptx->prefix_hash, next, n) + rt_unlock_source(rt_find_source_global(n->path_id)); + HASH_WALK_END; +} + +static void bgp_pending_tx_dump(resource *r UNUSED) { debug("\n"); } + +static struct resclass bgp_pending_tx_class = { + .name = "BGP Pending TX", + .size = sizeof(struct bgp_pending_tx), + .free = bgp_pending_tx_rfree, + .dump = bgp_pending_tx_dump, +}; + +void +bgp_init_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(!c->ptx); + + pool *p = rp_new(c->pool, "BGP Pending TX"); + c->ptx = ralloc(p, &bgp_pending_tx_class); + c->ptx->pool = p; + + bgp_init_bucket_table(c->ptx); + bgp_init_prefix_table(c); +} + +void +bgp_free_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(c->ptx); + ASSERT_DIE(c->ptx->pool); + + rfree(c->ptx->pool); + c->ptx = NULL; +} + + +/* + * Prefix hash table exporter + */ + +struct bgp_out_export_hook { + struct rt_export_hook h; + u32 hash_iter; /* Iterator over hash */ +}; + +static void +bgp_out_table_feed(void *data) +{ + struct bgp_out_export_hook *hook = data; + struct bgp_channel *bc = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->h.table); + struct bgp_pending_tx *c = bc->ptx; + + int max = 512; + + const net_addr *neq = (hook->h.req->addr_mode == TE_ADDR_EQUAL) ? hook->h.req->addr : NULL; + const net_addr *cand = NULL; + + do { + HASH_WALK_ITER(c->prefix_hash, PXH, n, hook->hash_iter) + { + switch (hook->h.req->addr_mode) + { + case TE_ADDR_IN: + if (!net_in_netX(n->net, hook->h.req->addr)) + continue; + /* fall through */ + case TE_ADDR_NONE: + /* Splitting only for multi-net exports */ + if (--max <= 0) + HASH_WALK_ITER_PUT; + break; + + case TE_ADDR_FOR: + if (!neq) + { + if (net_in_netX(hook->h.req->addr, n->net) && (!cand || (n->net->length > cand->length))) + cand = n->net; + continue; + } + /* fall through */ + case TE_ADDR_EQUAL: + if (!net_equal(n->net, neq)) + continue; + break; + } + + struct bgp_bucket *buck = n->cur ?: n->last; + ea_list *ea = NULL; + if (buck == c->withdraw_bucket) + ea_set_dest(&ea, 0, RTD_UNREACHABLE); + else + { + ea = buck->eattrs; + eattr *eanh = bgp_find_attr(ea, BA_NEXT_HOP); + ASSERT_DIE(eanh); + const ip_addr *nh = (const void *) eanh->u.ptr->data; + + struct nexthop_adata nhad = { + .ad = { .length = sizeof (struct nexthop_adata) - sizeof (struct adata), }, + .nh = { .gw = nh[0], }, + }; + + ea_set_attr(&ea, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, tmp_copy_adata(&nhad.ad))); + } + + struct rte_storage es = { + .rte = { + .attrs = ea, + .net = n->net, + .src = rt_find_source_global(n->path_id), + .sender = NULL, + .lastmod = n->lastmod, + .flags = n->cur ? REF_PENDING : 0, + }, + }; + + struct rt_pending_export rpe = { + .new = &es, .new_best = &es, + }; + + if (hook->h.req->export_bulk) + { + rte *feed = &es.rte; + hook->h.req->export_bulk(hook->h.req, n->net, &rpe, &feed, 1); + } + else if (hook->h.req->export_one) + hook->h.req->export_one(hook->h.req, n->net, &rpe); + else + bug("No export method in export request"); + } + HASH_WALK_ITER_END; + + neq = cand; + cand = NULL; + } while (neq); + + if (hook->hash_iter) + ev_schedule_work(&hook->h.event); + else + rt_set_export_state(&hook->h, TES_READY); +} + +static void +bgp_out_table_export_start(struct rt_exporter *re, struct rt_export_request *req) +{ + req->hook = rt_alloc_export(re, sizeof(struct bgp_out_export_hook)); + req->hook->req = req; + + struct bgp_out_export_hook *hook = SKIP_BACK(struct bgp_out_export_hook, h, req->hook); + + hook->h.event.hook = bgp_out_table_feed; + rt_init_export(re, req->hook); +} + +static void +bgp_out_table_export_done(void *data) +{ + struct bgp_out_export_hook *hook = data; + struct rt_export_request *req = hook->h.req; + void (*stopped)(struct rt_export_request *) = hook->h.stopped; + + rt_export_stopped(&hook->h); + CALL(stopped, req); +} + +static const struct rt_exporter_class bgp_out_table_export_class = { + .start = bgp_out_table_export_start, + .done = bgp_out_table_export_done, +}; + +void +bgp_setup_out_table(struct bgp_channel *c) +{ + ASSERT_DIE(c->c.out_table == NULL); + + c->prefix_exporter = (struct rt_exporter) { + .class = &bgp_out_table_export_class, + .addr_type = c->c.table->addr_type, + .rp = c->c.proto->pool, + }; + + rt_exporter_init(&c->prefix_exporter); + + c->c.out_table = &c->prefix_exporter; +} + /* * BGP protocol glue */ int -bgp_preexport(struct channel *c, rte *e) +bgp_preexport(struct channel *C, rte *e) { - struct bgp_proto *p = (struct bgp_proto *) (c->proto); + struct bgp_proto *p = (struct bgp_proto *) C->proto; struct bgp_proto *src = bgp_rte_proto(e); + struct bgp_channel *c = (struct bgp_channel *) C; /* Reject our routes */ if (src == p) @@ -1681,6 +2033,22 @@ bgp_preexport(struct channel *c, rte *e) if (src == NULL) return 0; + /* Reject flowspec that failed validation */ + if (net_is_flow(e->net)) + switch (rt_get_flowspec_valid(e)) + { + case FLOWSPEC_VALID: + break; + case FLOWSPEC_INVALID: + return -1; + case FLOWSPEC_UNKNOWN: + ASSUME((rt_get_source_attr(e) != RTS_BGP) || + !((struct bgp_channel *) SKIP_BACK(struct channel, in_req, e->sender->req))->base_table); + break; + case FLOWSPEC__MAX: + bug("This never happens."); + } + /* IBGP route reflection, RFC 4456 */ if (p->is_internal && src->is_internal && (p->local_as == src->local_as)) { @@ -1690,16 +2058,16 @@ bgp_preexport(struct channel *c, rte *e) /* Generally, this should be handled when path is received, but we check it also here as rr_cluster_id may be undefined or different in src. */ - if (p->rr_cluster_id && bgp_cluster_list_loopy(p, e->attrs->eattrs)) + if (p->rr_cluster_id && bgp_cluster_list_loopy(p, e->attrs)) return -1; } /* Handle well-known communities, RFC 1997 */ - struct eattr *com; + struct eattr *a; if (p->cf->interpret_communities && - (com = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)))) + (a = bgp_find_attr(e->attrs, BA_COMMUNITY))) { - const struct adata *d = com->u.ptr; + const struct adata *d = a->u.ptr; /* Do not export anywhere */ if (int_set_contains(d, BGP_COMM_NO_ADVERTISE)) @@ -1718,6 +2086,16 @@ bgp_preexport(struct channel *c, rte *e) return -1; } + /* Do not export routes marked with OTC to upstream, RFC 9234 */ + if (bgp_channel_is_role_applicable(c)) + { + a = bgp_find_attr(e->attrs, BA_ONLY_TO_CUSTOMER); + if (a && (p->cf->local_role==BGP_ROLE_CUSTOMER || + p->cf->local_role==BGP_ROLE_PEER || + p->cf->local_role==BGP_ROLE_RS_CLIENT)) + return -1; + } + return 0; } @@ -1732,7 +2110,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at /* ORIGIN attribute - mandatory, attach if missing */ if (! bgp_find_attr(attrs0, BA_ORIGIN)) - bgp_set_attr_u32(&attrs, pool, BA_ORIGIN, 0, src ? ORIGIN_INCOMPLETE : ORIGIN_IGP); + bgp_set_attr_u32(&attrs, BA_ORIGIN, 0, src ? ORIGIN_INCOMPLETE : ORIGIN_IGP); /* AS_PATH attribute - mandatory */ a = bgp_find_attr(attrs0, BA_AS_PATH); @@ -1747,24 +2125,24 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at { /* IBGP or route server -> just ensure there is one */ if (!a) - bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, &null_adata); + bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, &null_adata); } else if (p->is_interior) { /* Confederation -> prepend ASN as AS_CONFED_SEQUENCE */ ad = as_path_prepend2(pool, ad, AS_PATH_CONFED_SEQUENCE, p->public_as); - bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, ad); + bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, ad); } else /* Regular EBGP (no RS, no confederation) */ { /* Regular EBGP -> prepend ASN as regular sequence */ ad = as_path_prepend2(pool, ad, AS_PATH_SEQUENCE, p->public_as); - bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, ad); + bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, ad); /* MULTI_EXIT_DESC attribute - accept only if set in export filter */ a = bgp_find_attr(attrs0, BA_MULTI_EXIT_DISC); - if (a && !(a->type & EAF_FRESH)) - bgp_unset_attr(&attrs, pool, BA_MULTI_EXIT_DISC); + if (a && !(a->fresh)) + bgp_unset_attr(&attrs, BA_MULTI_EXIT_DISC); } /* NEXT_HOP attribute - delegated to AF-specific hook */ @@ -1773,16 +2151,16 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at /* LOCAL_PREF attribute - required for IBGP, attach if missing */ if (p->is_interior && ! bgp_find_attr(attrs0, BA_LOCAL_PREF)) - bgp_set_attr_u32(&attrs, pool, BA_LOCAL_PREF, 0, p->cf->default_local_pref); + bgp_set_attr_u32(&attrs, BA_LOCAL_PREF, 0, p->cf->default_local_pref); /* AIGP attribute - accumulate local metric or originate new one */ u64 metric; if (s.local_next_hop && - (bgp_total_aigp_metric_(e->attrs, &metric, &ad) || + (bgp_total_aigp_metric_(e, &metric, &ad) || (c->cf->aigp_originate && bgp_init_aigp_metric(e, &metric, &ad)))) { ad = bgp_aigp_set_metric(pool, ad, metric); - bgp_set_attr_ptr(&attrs, pool, BA_AIGP, 0, ad); + bgp_set_attr_ptr(&attrs, BA_AIGP, 0, ad); } /* IBGP route reflection, RFC 4456 */ @@ -1790,7 +2168,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at { /* ORIGINATOR_ID attribute - attach if not already set */ if (! bgp_find_attr(attrs0, BA_ORIGINATOR_ID)) - bgp_set_attr_u32(&attrs, pool, BA_ORIGINATOR_ID, 0, src->remote_id); + bgp_set_attr_u32(&attrs, BA_ORIGINATOR_ID, 0, src->remote_id); /* CLUSTER_LIST attribute - prepend cluster ID */ a = bgp_find_attr(attrs0, BA_CLUSTER_LIST); @@ -1805,7 +2183,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at ad = int_set_prepend(pool, ad, p->rr_cluster_id); /* Should be at least one prepended cluster ID */ - bgp_set_attr_ptr(&attrs, pool, BA_CLUSTER_LIST, 0, ad); + bgp_set_attr_ptr(&attrs, BA_CLUSTER_LIST, 0, ad); } /* AS4_* transition attributes, RFC 6793 4.2.2 */ @@ -1814,18 +2192,28 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at a = bgp_find_attr(attrs, BA_AS_PATH); if (a && as_path_contains_as4(a->u.ptr)) { - bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, as_path_to_old(pool, a->u.ptr)); - bgp_set_attr_ptr(&attrs, pool, BA_AS4_PATH, 0, as_path_strip_confed(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, as_path_to_old(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, BA_AS4_PATH, 0, as_path_strip_confed(pool, a->u.ptr)); } a = bgp_find_attr(attrs, BA_AGGREGATOR); if (a && aggregator_contains_as4(a->u.ptr)) { - bgp_set_attr_ptr(&attrs, pool, BA_AGGREGATOR, 0, aggregator_to_old(pool, a->u.ptr)); - bgp_set_attr_ptr(&attrs, pool, BA_AS4_AGGREGATOR, 0, a->u.ptr); + bgp_set_attr_ptr(&attrs, BA_AGGREGATOR, 0, aggregator_to_old(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, BA_AS4_AGGREGATOR, 0, a->u.ptr); } } + /* Mark routes for downstream with OTC, RFC 9234 */ + if (bgp_channel_is_role_applicable(c)) + { + a = bgp_find_attr(attrs, BA_ONLY_TO_CUSTOMER); + if (!a && (p->cf->local_role == BGP_ROLE_PROVIDER || + p->cf->local_role == BGP_ROLE_PEER || + p->cf->local_role == BGP_ROLE_RS_SERVER)) + bgp_set_attr_u32(&attrs, BA_ONLY_TO_CUSTOMER, 0, p->public_as); + } + /* * Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above * conditions. Presence and validity of quasi-mandatory NEXT_HOP attribute @@ -1842,36 +2230,35 @@ bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, c struct bgp_proto *p = (void *) P; struct bgp_channel *c = (void *) C; struct bgp_bucket *buck; - struct bgp_prefix *px; - u32 path; + struct rte_src *path; if (new) { - struct ea_list *attrs = bgp_update_attrs(p, c, new, new->attrs->eattrs, bgp_linpool2); + struct ea_list *attrs = bgp_update_attrs(p, c, new, new->attrs, tmp_linpool); - /* If attributes are invalid, we fail back to withdraw */ - buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c); - path = new->src->global_id; + /* Error during attribute processing */ + if (!attrs) + log(L_ERR "%s: Invalid route %N withdrawn", p->p.name, n); - lp_flush(bgp_linpool2); + /* If attributes are invalid, we fail back to withdraw */ + buck = attrs ? bgp_get_bucket(c->ptx, attrs) : bgp_get_withdraw_bucket(c->ptx); + path = new->src; } else { - buck = bgp_get_withdraw_bucket(c); - path = old->src->global_id; + buck = bgp_get_withdraw_bucket(c->ptx); + path = old->src; } - px = bgp_get_prefix(c, n, c->add_path_tx ? path : 0); - add_tail(&buck->prefixes, &px->buck_node); - - bgp_schedule_packet(p->conn, c, PKT_UPDATE); + if (bgp_update_prefix(c, bgp_get_prefix(c->ptx, n, path, c->add_path_tx), buck)) + bgp_schedule_packet(p->conn, c, PKT_UPDATE); } static inline u32 bgp_get_neighbor(rte *r) { - eattr *e = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + eattr *e = ea_find(r->attrs, BGP_EA_ID(BA_AS_PATH)); u32 as; if (e && as_path_get_first_regular(e->u.ptr, &as)) @@ -1892,7 +2279,7 @@ rte_stale(rte *r) return 0; /* If staleness is unknown, compute and cache it */ - eattr *a = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)); + eattr *a = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY)); if (a && int_set_contains(a->u.ptr, BGP_COMM_LLGR_STALE)) { r->pflags |= BGP_REF_STALE; @@ -1922,8 +2309,8 @@ bgp_rte_better(rte *new, rte *old) return 1; /* RFC 4271 9.1.2.1. Route resolvability test */ - n = rta_resolvable(new->attrs); - o = rta_resolvable(old->attrs); + n = rte_resolvable(new); + o = rte_resolvable(old); if (n > o) return 1; if (n < o) @@ -1938,8 +2325,8 @@ bgp_rte_better(rte *new, rte *old) return 1; /* Start with local preferences */ - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); + x = ea_find(new->attrs, BGP_EA_ID(BA_LOCAL_PREF)); + y = ea_find(old->attrs, BGP_EA_ID(BA_LOCAL_PREF)); n = x ? x->u.data : new_bgp->cf->default_local_pref; o = y ? y->u.data : old_bgp->cf->default_local_pref; if (n > o) @@ -1948,8 +2335,8 @@ bgp_rte_better(rte *new, rte *old) return 0; /* RFC 7311 4.1 - Apply AIGP metric */ - u64 n2 = bgp_total_aigp_metric(new->attrs); - u64 o2 = bgp_total_aigp_metric(old->attrs); + u64 n2 = bgp_total_aigp_metric(new); + u64 o2 = bgp_total_aigp_metric(old); if (n2 < o2) return 1; if (n2 > o2) @@ -1958,8 +2345,8 @@ bgp_rte_better(rte *new, rte *old) /* RFC 4271 9.1.2.2. a) Use AS path lengths */ if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths) { - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + x = ea_find(new->attrs, BGP_EA_ID(BA_AS_PATH)); + y = ea_find(old->attrs, BGP_EA_ID(BA_AS_PATH)); n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; if (n < o) @@ -1969,8 +2356,8 @@ bgp_rte_better(rte *new, rte *old) } /* RFC 4271 9.1.2.2. b) Use origins */ - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); + x = ea_find(new->attrs, BGP_EA_ID(BA_ORIGIN)); + y = ea_find(old->attrs, BGP_EA_ID(BA_ORIGIN)); n = x ? x->u.data : ORIGIN_INCOMPLETE; o = y ? y->u.data : ORIGIN_INCOMPLETE; if (n < o) @@ -1992,8 +2379,8 @@ bgp_rte_better(rte *new, rte *old) if (new_bgp->cf->med_metric || old_bgp->cf->med_metric || (bgp_get_neighbor(new) == bgp_get_neighbor(old))) { - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); + x = ea_find(new->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC)); + y = ea_find(old->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC)); n = x ? x->u.data : new_bgp->cf->default_med; o = y ? y->u.data : old_bgp->cf->default_med; if (n < o) @@ -2009,8 +2396,8 @@ bgp_rte_better(rte *new, rte *old) return 1; /* RFC 4271 9.1.2.2. e) Compare IGP metrics */ - n = new_bgp->cf->igp_metric ? new->attrs->igp_metric : 0; - o = old_bgp->cf->igp_metric ? old->attrs->igp_metric : 0; + n = new_bgp->cf->igp_metric ? rt_get_igp_metric(new) : 0; + o = old_bgp->cf->igp_metric ? rt_get_igp_metric(old) : 0; if (n < o) return 1; if (n > o) @@ -2018,8 +2405,8 @@ bgp_rte_better(rte *new, rte *old) /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */ /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighbor ID */ - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID)); + x = ea_find(new->attrs, BGP_EA_ID(BA_ORIGINATOR_ID)); + y = ea_find(old->attrs, BGP_EA_ID(BA_ORIGINATOR_ID)); n = x ? x->u.data : new_bgp->remote_id; o = y ? y->u.data : old_bgp->remote_id; @@ -2036,8 +2423,8 @@ bgp_rte_better(rte *new, rte *old) return 0; /* RFC 4456 9. b) Compare cluster list lengths */ - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_CLUSTER_LIST)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_CLUSTER_LIST)); + x = ea_find(new->attrs, BGP_EA_ID(BA_CLUSTER_LIST)); + y = ea_find(old->attrs, BGP_EA_ID(BA_CLUSTER_LIST)); n = x ? int_set_get_size(x->u.ptr) : 0; o = y ? int_set_get_size(y->u.ptr) : 0; if (n < o) @@ -2059,17 +2446,20 @@ bgp_rte_mergable(rte *pri, rte *sec) u32 p, s; /* Skip suppressed routes (see bgp_rte_recalculate()) */ - /* LLGR draft - depreference stale routes */ - if (pri->pflags != sec->pflags) + if ((pri->pflags ^ sec->pflags) & BGP_REF_SUPPRESSED) return 0; /* RFC 4271 9.1.2.1. Route resolvability test */ - if (rta_resolvable(pri->attrs) != rta_resolvable(sec->attrs)) + if (rte_resolvable(pri) != rte_resolvable(sec)) + return 0; + + /* LLGR draft - depreference stale routes */ + if (rte_stale(pri) != rte_stale(sec)) return 0; /* Start with local preferences */ - x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); - y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); + x = ea_find(pri->attrs, BGP_EA_ID(BA_LOCAL_PREF)); + y = ea_find(sec->attrs, BGP_EA_ID(BA_LOCAL_PREF)); p = x ? x->u.data : pri_bgp->cf->default_local_pref; s = y ? y->u.data : sec_bgp->cf->default_local_pref; if (p != s) @@ -2078,8 +2468,8 @@ bgp_rte_mergable(rte *pri, rte *sec) /* RFC 4271 9.1.2.2. a) Use AS path lengths */ if (pri_bgp->cf->compare_path_lengths || sec_bgp->cf->compare_path_lengths) { - x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); - y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + x = ea_find(pri->attrs, BGP_EA_ID(BA_AS_PATH)); + y = ea_find(sec->attrs, BGP_EA_ID(BA_AS_PATH)); p = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; s = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; @@ -2091,8 +2481,8 @@ bgp_rte_mergable(rte *pri, rte *sec) } /* RFC 4271 9.1.2.2. b) Use origins */ - x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); - y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); + x = ea_find(pri->attrs, BGP_EA_ID(BA_ORIGIN)); + y = ea_find(sec->attrs, BGP_EA_ID(BA_ORIGIN)); p = x ? x->u.data : ORIGIN_INCOMPLETE; s = y ? y->u.data : ORIGIN_INCOMPLETE; if (p != s) @@ -2102,8 +2492,8 @@ bgp_rte_mergable(rte *pri, rte *sec) if (pri_bgp->cf->med_metric || sec_bgp->cf->med_metric || (bgp_get_neighbor(pri) == bgp_get_neighbor(sec))) { - x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); - y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); + x = ea_find(pri->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC)); + y = ea_find(sec->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC)); p = x ? x->u.data : pri_bgp->cf->default_med; s = y ? y->u.data : sec_bgp->cf->default_med; if (p != s) @@ -2115,8 +2505,8 @@ bgp_rte_mergable(rte *pri, rte *sec) return 0; /* RFC 4271 9.1.2.2. e) Compare IGP metrics */ - p = pri_bgp->cf->igp_metric ? pri->attrs->igp_metric : 0; - s = sec_bgp->cf->igp_metric ? sec->attrs->igp_metric : 0; + p = pri_bgp->cf->igp_metric ? rt_get_igp_metric(pri) : 0; + s = sec_bgp->cf->igp_metric ? rt_get_igp_metric(sec) : 0; if (p != s) return 0; @@ -2129,7 +2519,7 @@ bgp_rte_mergable(rte *pri, rte *sec) static inline int same_group(rte *r, u32 lpref, u32 lasn) { - return (r->attrs->pref == lpref) && (bgp_get_neighbor(r) == lasn); + return (rt_get_preference(r) == lpref) && (bgp_get_neighbor(r) == lasn); } static inline int @@ -2140,10 +2530,10 @@ use_deterministic_med(struct rte_storage *r) } int -bgp_rte_recalculate(rtable_private *table, net *net, rte *new, rte *old, rte *old_best) +bgp_rte_recalculate(struct rtable_private *table, net *net, rte *new, rte *old, rte *old_best) { rte *key = new ? new : old; - u32 lpref = key->attrs->pref; + u32 lpref = rt_get_preference(key); u32 lasn = bgp_get_neighbor(key); int old_suppressed = old ? !!(old->pflags & BGP_REF_SUPPRESSED) : 0; @@ -2210,7 +2600,7 @@ bgp_rte_recalculate(rtable_private *table, net *net, rte *new, rte *old, rte *ol /* The default case - find a new best-in-group route */ rte *r = new; /* new may not be in the list */ - for (struct rte_storage *s = net->routes; rte_is_valid(&s->rte); s = s->next) + for (struct rte_storage *s = net->routes; rte_is_valid(RTE_OR_NULL(s)); s = s->next) if (use_deterministic_med(s) && same_group(&s->rte, lpref, lasn)) { s->rte.pflags |= BGP_REF_SUPPRESSED; @@ -2227,7 +2617,7 @@ bgp_rte_recalculate(rtable_private *table, net *net, rte *new, rte *old, rte *ol new->pflags &= ~BGP_REF_SUPPRESSED; /* Found all existing routes mergable with best-in-group */ - for (struct rte_storage *s = net->routes; rte_is_valid(&s->rte); s = s->next) + for (struct rte_storage *s = net->routes; rte_is_valid(RTE_OR_NULL(s)); s = s->next) if (use_deterministic_med(s) && same_group(&s->rte, lpref, lasn)) if ((&s->rte != r) && bgp_rte_mergable(r, &s->rte)) s->rte.pflags &= ~BGP_REF_SUPPRESSED; @@ -2269,40 +2659,53 @@ void bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) { struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + struct rt_import_hook *irh = c->c.in_req.hook; - do { - rte *r = feed[--count]; - if (r->sender != c->c.in_req.hook) - continue; + /* Find our routes among others */ + for (uint i=0; i<count; i++) + { + rte *r = feed[i]; - /* A new route, do not mark as stale */ - if (r->stale_cycle == c->c.in_req.hook->stale_set) + if ( + !rte_is_valid(r) || /* Not a valid route */ + (r->sender != irh) || /* Not our route */ + (r->stale_cycle == irh->stale_set)) /* A new route, do not mark as stale */ continue; - eattr *ea = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)); + eattr *ea = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY)); const struct adata *ad = ea ? ea->u.ptr : NULL; uint flags = ea ? ea->flags : BAF_PARTIAL; - rte e0 = *r; - e0.flags |= REF_USE_STALE; - + /* LLGR not allowed, withdraw the route */ if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR)) + { rte_import(&c->c.in_req, n, NULL, r->src); + continue; + } - else if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) - rte_import(&c->c.in_req, n, &e0, r->src); - - else { - rta *a = e0.attrs = rta_do_cow(r->attrs, bgp_linpool); + /* Route already marked as LLGR, do nothing */ + if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) + continue; - bgp_set_attr_ptr(&(a->eattrs), bgp_linpool, BA_COMMUNITY, flags, - int_set_add(bgp_linpool, ad, BGP_COMM_LLGR_STALE)); - e0.pflags |= BGP_REF_STALE; + /* Store the tmp_linpool state to aggresively save memory */ + struct lp_state tmpp; + lp_save(tmp_linpool, &tmpp); - rte_import(&c->c.in_req, n, &e0, r->src); - lp_flush(bgp_linpool); - } - } while (count); + /* Mark the route as LLGR */ + rte e0 = *r; + bgp_set_attr_ptr(&e0.attrs, BA_COMMUNITY, flags, int_set_add(tmp_linpool, ad, BGP_COMM_LLGR_STALE)); + e0.pflags &= ~BGP_REF_NOT_STALE; + e0.pflags |= BGP_REF_STALE; + + /* We need to update the route but keep it stale. */ + ASSERT_DIE(irh->stale_set == irh->stale_valid + 1); + irh->stale_set--; + rte_import(&c->c.in_req, n, &e0, r->src); + irh->stale_set++; + + /* Restore the memory state */ + lp_restore(tmp_linpool, &tmpp); + } } @@ -2318,8 +2721,8 @@ bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool) eattr *a4 = bgp_find_attr(*attrs, BA_AS4_AGGREGATOR); /* First, unset AS4_* attributes */ - if (p4) bgp_unset_attr(attrs, pool, BA_AS4_PATH); - if (a4) bgp_unset_attr(attrs, pool, BA_AS4_AGGREGATOR); + if (p4) bgp_unset_attr(attrs, BA_AS4_PATH); + if (a4) bgp_unset_attr(attrs, BA_AS4_AGGREGATOR); /* Handle AGGREGATOR attribute */ if (a2 && a4) @@ -2352,60 +2755,37 @@ bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool) } } -int -bgp_get_attr(const eattr *a, byte *buf, int buflen) -{ - uint i = EA_ID(a->id); - const struct bgp_attr_desc *d; - int len; - - if (bgp_attr_known(i)) - { - d = &bgp_attr_table[i]; - len = bsprintf(buf, "%s", d->name); - buf += len; - if (d->format) - { - *buf++ = ':'; - *buf++ = ' '; - d->format(a, buf, buflen - len - 2); - return GA_FULL; - } - return GA_NAME; - } - - bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? " [t]" : ""); - return GA_NAME; -} - void bgp_get_route_info(rte *e, byte *buf) { - eattr *p = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); - eattr *o = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); + eattr *p = ea_find(e->attrs, BGP_EA_ID(BA_AS_PATH)); + eattr *o = ea_find(e->attrs, BGP_EA_ID(BA_ORIGIN)); u32 origas; - buf += bsprintf(buf, " (%d", e->attrs->pref); + buf += bsprintf(buf, " (%d", rt_get_preference(e)); - if (e->pflags & BGP_REF_SUPPRESSED) - buf += bsprintf(buf, "-"); + if (!net_is_flow(e->net)) + { + if (e->pflags & BGP_REF_SUPPRESSED) + buf += bsprintf(buf, "-"); - if (rte_stale(e)) - buf += bsprintf(buf, "s"); + if (rte_stale(e)) + buf += bsprintf(buf, "s"); - u64 metric = bgp_total_aigp_metric(e->attrs); - if (metric < BGP_AIGP_MAX) - { - buf += bsprintf(buf, "/%lu", metric); - } - else if (e->attrs->igp_metric) - { - if (!rta_resolvable(e->attrs)) - buf += bsprintf(buf, "/-"); - else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN) - buf += bsprintf(buf, "/?"); - else - buf += bsprintf(buf, "/%d", e->attrs->igp_metric); + u64 metric = bgp_total_aigp_metric(e); + if (metric < BGP_AIGP_MAX) + { + buf += bsprintf(buf, "/%lu", metric); + } + else if (metric = rt_get_igp_metric(e)) + { + if (!rte_resolvable(e)) + buf += bsprintf(buf, "/-"); + else if (metric >= IGP_METRIC_UNKNOWN) + buf += bsprintf(buf, "/?"); + else + buf += bsprintf(buf, "/%d", metric); + } } buf += bsprintf(buf, ") ["); diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index b825f778..8bacebfc 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -101,6 +101,8 @@ * RFC 8203 - BGP Administrative Shutdown Communication * RFC 8212 - Default EBGP Route Propagation Behavior without Policies * RFC 8654 - Extended Message Support for BGP + * RFC 9117 - Revised Validation Procedure for BGP Flow Specifications + * RFC 9234 - Route Leak Prevention and Detection Using Roles * draft-ietf-idr-ext-opt-param-07 * draft-uttaro-idr-bgp-persistence-04 * draft-walton-bgp-hostname-capability-02 @@ -113,7 +115,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "nest/locks.h" #include "conf/conf.h" @@ -125,9 +127,7 @@ #include "bgp.h" -struct linpool *bgp_linpool; /* Global temporary pool */ -struct linpool *bgp_linpool2; /* Global temporary pool for bgp_rt_notify() */ -static list bgp_sockets; /* Global list of listening sockets */ +static list STATIC_LIST_INIT(bgp_sockets); /* Global list of listening sockets */ static void bgp_connect(struct bgp_proto *p); @@ -141,13 +141,7 @@ static int bgp_incoming_connection(sock *sk, uint dummy UNUSED); static void bgp_listen_sock_err(sock *sk UNUSED, int err); static void bgp_graceful_restart_feed(struct bgp_channel *c); -static inline void channel_refresh_end_reload(struct channel *c) -{ - channel_refresh_end(c); - if (c->in_table) - channel_request_reload(c); -} /** * bgp_open - open a BGP instance @@ -166,16 +160,17 @@ bgp_open(struct bgp_proto *p) ip_addr addr = p->cf->strict_bind ? p->cf->local_ip : (p->ipv4 ? IPA_NONE4 : IPA_NONE6); uint port = p->cf->local_port; - - /* FIXME: Add some global init? */ - if (!bgp_linpool) - init_list(&bgp_sockets); + uint flags = p->cf->free_bind ? SKF_FREEBIND : 0; + uint flag_mask = SKF_FREEBIND; /* We assume that cf->iface is defined iff cf->local_ip is link-local */ WALK_LIST(bs, bgp_sockets) - if (ipa_equal(bs->sk->saddr, addr) && (bs->sk->sport == port) && - (bs->sk->iface == ifa) && (bs->sk->vrf == p->p.vrf)) + if (ipa_equal(bs->sk->saddr, addr) && + (bs->sk->sport == port) && + (bs->sk->iface == ifa) && + (bs->sk->vrf == p->p.vrf) && + ((bs->sk->flags & flag_mask) == flags)) { bs->uc++; p->sock = bs; @@ -189,7 +184,7 @@ bgp_open(struct bgp_proto *p) sk->sport = port; sk->iface = ifa; sk->vrf = p->p.vrf; - sk->flags = 0; + sk->flags = flags; sk->tos = IP_PREC_INTERNET_CONTROL; sk->rbsize = BGP_RX_BUFFER_SIZE; sk->tbsize = BGP_TX_BUFFER_SIZE; @@ -207,12 +202,6 @@ bgp_open(struct bgp_proto *p) add_tail(&bgp_sockets, &bs->n); - if (!bgp_linpool) - { - bgp_linpool = lp_new_default(proto_pool); - bgp_linpool2 = lp_new_default(proto_pool); - } - return 0; err: @@ -241,15 +230,6 @@ bgp_close(struct bgp_proto *p) rfree(bs->sk); rem_node(&bs->n); mb_free(bs); - - if (!EMPTY_LIST(bgp_sockets)) - return; - - rfree(bgp_linpool); - bgp_linpool = NULL; - - rfree(bgp_linpool2); - bgp_linpool2 = NULL; } static inline int @@ -397,6 +377,7 @@ bgp_close_conn(struct bgp_conn *conn) conn->keepalive_timer = NULL; rfree(conn->hold_timer); conn->hold_timer = NULL; + rfree(conn->tx_ev); conn->tx_ev = NULL; rfree(conn->sk); @@ -535,8 +516,15 @@ void bgp_stop(struct bgp_proto *p, int subcode, byte *data, uint len) { proto_notify_state(&p->p, PS_STOP); + p->uncork_ev->data = NULL; bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len); bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len); + + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + if (c->ptx) + bgp_free_pending_tx(c); + ev_schedule(p->event); } @@ -584,7 +572,6 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) p->link_addr = p->neigh->iface->llv6->ip; conn->sk->fast_rx = 0; - conn->sk->cork = &rt_cork; p->conn = conn; p->last_error_class = 0; @@ -785,16 +772,16 @@ bgp_handle_graceful_restart(struct bgp_proto *p) { case BGP_GRS_NONE: c->gr_active = BGP_GRS_ACTIVE; - channel_refresh_begin(&c->c); + rt_refresh_begin(&c->c.in_req); break; case BGP_GRS_ACTIVE: - channel_refresh_end(&c->c); - channel_refresh_begin(&c->c); + rt_refresh_end(&c->c.in_req); + rt_refresh_begin(&c->c.in_req); break; case BGP_GRS_LLGR: - channel_refresh_begin(&c->c); + rt_refresh_begin(&c->c.in_req); bgp_graceful_restart_feed(c); break; } @@ -802,15 +789,13 @@ bgp_handle_graceful_restart(struct bgp_proto *p) else { /* Just flush the routes */ - channel_refresh_begin(&c->c); - channel_refresh_end(&c->c); + rt_refresh_begin(&c->c.in_req); + rt_refresh_end(&c->c.in_req); } /* Reset bucket and prefix tables */ - bgp_free_bucket_table(c); - bgp_free_prefix_table(c); - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); + bgp_free_pending_tx(c); + bgp_init_pending_tx(c); c->packets_to_send = 0; } @@ -866,6 +851,8 @@ bgp_graceful_restart_feed(struct bgp_channel *c) } + + /** * bgp_graceful_restart_done - finish active BGP graceful restart * @c: BGP channel @@ -888,11 +875,8 @@ bgp_graceful_restart_done(struct bgp_channel *c) if (!p->gr_active_num) BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); - if (c->stale_feed.hook) - rt_stop_export(&c->stale_feed, bgp_graceful_restart_feed_done); - tm_stop(c->stale_timer); - channel_refresh_end_reload(&c->c); + rt_refresh_end(&c->c.in_req); } /** @@ -972,10 +956,10 @@ bgp_refresh_begin(struct bgp_channel *c) { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } if (c->load_state == BFS_REFRESHING) - channel_refresh_end(&c->c); + rt_refresh_end(&c->c.in_req); c->load_state = BFS_REFRESHING; - channel_refresh_begin(&c->c); + rt_refresh_begin(&c->c.in_req); } /** @@ -996,7 +980,7 @@ bgp_refresh_end(struct bgp_channel *c) { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } c->load_state = BFS_NONE; - channel_refresh_end_reload(&c->c); + rt_refresh_end(&c->c.in_req); } @@ -1463,8 +1447,7 @@ bgp_reload_routes(struct channel *C) struct bgp_proto *p = (void *) C->proto; struct bgp_channel *c = (void *) C; - ASSERT(p->conn && (p->route_refresh)); - + ASSERT(p->conn && p->route_refresh); bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); } @@ -1481,6 +1464,12 @@ bgp_feed_begin(struct channel *C, int initial) if (initial && p->cf->gr_mode) c->feed_state = BFS_LOADING; + if (!initial && C->out_table) + { + c->feed_out_table = 1; + return; + } + /* It is refeed and both sides support enhanced route refresh */ if (!initial && p->enhanced_refresh) { @@ -1499,6 +1488,12 @@ bgp_feed_end(struct channel *C) struct bgp_proto *p = (void *) C->proto; struct bgp_channel *c = (void *) C; + if (c->feed_out_table) + { + c->feed_out_table = 0; + return; + } + /* This should not happen */ if (!p->conn) return; @@ -1599,6 +1594,8 @@ bgp_start(struct proto *P) p->last_rx_update = 0; p->event = ev_new_init(p->p.pool, bgp_decision, p); + p->uncork_ev = ev_new_init(p->p.pool, bgp_uncork, p); + p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0); p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0); @@ -1793,6 +1790,9 @@ bgp_channel_init(struct channel *C, struct channel_config *CF) if (cf->igp_table_ip6) c->igp_table_ip6 = cf->igp_table_ip6->table; + + if (cf->base_table) + c->base_table = cf->base_table->table; } static int @@ -1803,22 +1803,26 @@ bgp_channel_start(struct channel *C) ip_addr src = p->local_ip; if (c->igp_table_ip4) - RT_LOCKED(c->igp_table_ip4, t) - rt_lock_table(t); + rt_lock_table(c->igp_table_ip4); if (c->igp_table_ip6) - RT_LOCKED(c->igp_table_ip6, t) - rt_lock_table(t); + rt_lock_table(c->igp_table_ip6); + + if (c->base_table) + { + rt_lock_table(c->base_table); + rt_flowspec_link(c->base_table, c->c.table); + } c->pool = p->p.pool; // XXXX - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); if (c->cf->import_table) - channel_setup_in_table(C, 0); + channel_setup_in_table(C); if (c->cf->export_table) - channel_setup_out_table(C); + bgp_setup_out_table(c); + + bgp_init_pending_tx(c); c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0); @@ -1889,12 +1893,16 @@ bgp_channel_cleanup(struct channel *C) struct bgp_channel *c = (void *) C; if (c->igp_table_ip4) - RT_LOCKED(c->igp_table_ip4, t) - rt_unlock_table(t); + rt_unlock_table(c->igp_table_ip4); if (c->igp_table_ip6) - RT_LOCKED(c->igp_table_ip6, t) - rt_unlock_table(t); + rt_unlock_table(c->igp_table_ip6); + + if (c->base_table) + { + rt_flowspec_unlink(c->base_table, c->c.table); + rt_unlock_table(c->base_table); + } c->index = 0; @@ -1937,12 +1945,31 @@ bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 return cc2->c.table; /* Last, try default table of given type */ - if (tab = cf->c.global->def_tables[type]) + if (tab = rt_get_default_table(cf->c.global, type)) return tab; cf_error("Undefined IGP table"); } +static struct rtable_config * +bgp_default_base_table(struct bgp_config *cf, struct bgp_channel_config *cc) +{ + /* Expected table type */ + u32 type = (cc->afi == BGP_AF_FLOW4) ? NET_IP4 : NET_IP6; + + /* First, try appropriate IP channel */ + u32 afi2 = BGP_AF(BGP_AFI(cc->afi), BGP_SAFI_UNICAST); + struct bgp_channel_config *cc2 = bgp_find_channel_config(cf, afi2); + if (cc2 && (cc2->c.table->addr_type == type)) + return cc2->c.table; + + /* Last, try default table of given type */ + struct rtable_config *tab = rt_get_default_table(cf->c.global, type); + if (tab) + return tab; + + cf_error("Undefined base table"); +} void bgp_postconfig(struct proto_config *CF) @@ -2005,6 +2032,15 @@ bgp_postconfig(struct proto_config *CF) if (internal && cf->rs_client) cf_error("Only external neighbor can be RS client"); + if (internal && (cf->local_role != BGP_ROLE_UNDEFINED)) + cf_error("Local role cannot be set on IBGP sessions"); + + if (interior && (cf->local_role != BGP_ROLE_UNDEFINED)) + log(L_WARN "BGP roles are not recommended to be used within AS confederations"); + + if (cf->require_roles && (cf->local_role == BGP_ROLE_UNDEFINED)) + cf_error("Local role must be set if roles are required"); + if (!cf->confederation && cf->confederation_member) cf_error("Confederation ID must be set for member sessions"); @@ -2087,6 +2123,14 @@ bgp_postconfig(struct proto_config *CF) cf_error("Mismatched IGP table type"); } + /* Default value of base table */ + if ((BGP_SAFI(cc->afi) == BGP_SAFI_FLOW) && cc->validate && !cc->base_table) + cc->base_table = bgp_default_base_table(cf, cc); + + if (cc->base_table && !cc->base_table->trie_used) + cf_error("Flowspec validation requires base table (%s) with trie", + cc->base_table->name); + if (cf->multihop && (cc->gw_mode == GW_DIRECT)) cf_error("Multihop BGP cannot use direct gateway mode"); @@ -2155,7 +2199,7 @@ bgp_reconfigure(struct proto *P, struct proto_config *CF) return same; } -#define IGP_TABLE(cf, sym) ((cf)->igp_table_##sym ? (cf)->igp_table_##sym ->table : NULL ) +#define TABLE(cf, NAME) ((cf)->NAME ? (cf)->NAME->table : NULL ) static int bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *import_changed, int *export_changed) @@ -2166,6 +2210,7 @@ bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *impor struct bgp_channel_config *old = c->cf; if ((new->secondary != old->secondary) || + (new->validate != old->validate) || (new->gr_able != old->gr_able) || (new->llgr_able != old->llgr_able) || (new->llgr_time != old->llgr_time) || @@ -2173,8 +2218,9 @@ bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *impor (new->add_path != old->add_path) || (new->import_table != old->import_table) || (new->export_table != old->export_table) || - (IGP_TABLE(new, ip4) != IGP_TABLE(old, ip4)) || - (IGP_TABLE(new, ip6) != IGP_TABLE(old, ip6))) + (TABLE(new, igp_table_ip4) != TABLE(old, igp_table_ip4)) || + (TABLE(new, igp_table_ip6) != TABLE(old, igp_table_ip6)) || + (TABLE(new, base_table) != TABLE(old, base_table))) return 0; if (new->mandatory && !old->mandatory && (C->channel_state != CS_UP)) @@ -2185,7 +2231,7 @@ bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *impor (new->cost != old->cost)) { /* import_changed itself does not force ROUTE_REFRESH when import_table is active */ - if (c->c.in_table && (c->c.channel_state == CS_UP)) + if ((c->c.in_keep & RIK_PREFILTER) && (c->c.channel_state == CS_UP)) bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); *import_changed = 1; @@ -2357,6 +2403,15 @@ bgp_show_afis(int code, char *s, u32 *afis, uint count) cli_msg(code, b.start); } +static const char * +bgp_format_role_name(u8 role) +{ + static const char *bgp_role_names[] = { "provider", "rs_server", "rs_client", "customer", "peer" }; + if (role == BGP_ROLE_UNDEFINED) return "undefined"; + if (role < ARRAY_SIZE(bgp_role_names)) return bgp_role_names[role]; + return "?"; +} + static void bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) { @@ -2485,6 +2540,9 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) if (caps->hostname) cli_msg(-1006, " Hostname: %s", caps->hostname); + + if (caps->role != BGP_ROLE_UNDEFINED) + cli_msg(-1006, " Role: %s", bgp_format_role_name(caps->role)); } static void @@ -2499,6 +2557,9 @@ bgp_show_proto_info(struct proto *P) else cli_msg(-1006, " Neighbor address: %I%J", p->remote_ip, p->cf->iface); + if ((p->conn == &p->outgoing_conn) && (p->cf->remote_port != BGP_PORT)) + cli_msg(-1006, " Neighbor port: %u", p->cf->remote_port); + cli_msg(-1006, " Neighbor AS: %u", p->remote_as); cli_msg(-1006, " Local AS: %u", p->cf->local_as); @@ -2588,6 +2649,9 @@ bgp_show_proto_info(struct proto *P) if (c->igp_table_ip6) cli_msg(-1006, " IGP IPv6 table: %s", c->igp_table_ip6->name); + + if (c->base_table) + cli_msg(-1006, " Base table: %s", c->base_table->name); } } } @@ -2605,7 +2669,6 @@ struct channel_class channel_bgp = { struct protocol proto_bgp = { .name = "BGP", .template = "bgp%d", - .class = PROTOCOL_BGP, .preference = DEF_PREF_BGP, .channel_mask = NB_IP | NB_VPN | NB_FLOW, .proto_size = sizeof(struct bgp_proto), @@ -2617,6 +2680,11 @@ struct protocol proto_bgp = { .reconfigure = bgp_reconfigure, .copy_config = bgp_copy_config, .get_status = bgp_get_status, - .get_attr = bgp_get_attr, .show_proto_info = bgp_show_proto_info }; + +void bgp_build(void) +{ + proto_build(&proto_bgp); + bgp_register_attrs(); +} diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 60f93bce..b76ffe99 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -14,13 +14,12 @@ #include <stdint.h> #include <setjmp.h> #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/bfd.h" //#include "lib/lists.h" #include "lib/hash.h" #include "lib/socket.h" -struct linpool; struct eattr; @@ -68,10 +67,10 @@ struct bgp_af_desc { u8 no_igp; const char *name; uint (*encode_nlri)(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size); - void (*decode_nlri)(struct bgp_parse_state *s, byte *pos, uint len, rta *a); + void (*decode_nlri)(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a); void (*update_next_hop)(struct bgp_export_state *s, eattr *nh, ea_list **to); uint (*encode_next_hop)(struct bgp_write_state *s, eattr *nh, byte *buf, uint size); - void (*decode_next_hop)(struct bgp_parse_state *s, byte *pos, uint len, rta *a); + void (*decode_next_hop)(struct bgp_parse_state *s, byte *pos, uint len, ea_list **to); }; @@ -86,6 +85,7 @@ struct bgp_config { int peer_type; /* Internal or external BGP (BGP_PT_*, optional) */ int multihop; /* Number of hops if multihop */ int strict_bind; /* Bind listening socket to local address */ + int free_bind; /* Bind listening socket with SKF_FREEBIND */ int ttl_security; /* Enable TTL security [RFC 5082] */ int compare_path_lengths; /* Use path lengths when selecting best route */ int med_metric; /* Compare MULTI_EXIT_DISC even between routes from differen ASes */ @@ -113,6 +113,8 @@ struct bgp_config { int gr_mode; /* Graceful restart mode (BGP_GR_*) */ int llgr_mode; /* Long-lived graceful restart mode (BGP_LLGR_*) */ int setkey; /* Set MD5 password to system SA/SP database */ + u8 local_role; /* Set peering role with neighbor [RFC 9234] */ + int require_roles; /* Require configured roles on both sides */ /* Times below are in seconds */ unsigned gr_time; /* Graceful restart timeout */ unsigned llgr_time; /* Long-lived graceful restart stale time */ @@ -146,6 +148,7 @@ struct bgp_channel_config { u8 mandatory; /* Channel is mandatory in capability negotiation */ u8 gw_mode; /* How we compute route gateway from next_hop attr, see GW_* */ u8 secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */ + u8 validate; /* Validate Flowspec per RFC 8955 (6) */ u8 gr_able; /* Allow full graceful restart for the channel */ u8 llgr_able; /* Allow full long-lived GR for the channel */ uint llgr_time; /* Long-lived graceful restart stale time */ @@ -155,15 +158,23 @@ struct bgp_channel_config { u8 aigp_originate; /* AIGP is originated automatically */ u32 cost; /* IGP cost for direct next hops */ u8 import_table; /* Use c.in_table as Adj-RIB-In */ - u8 export_table; /* Use c.out_table as Adj-RIB-Out */ + u8 export_table; /* Keep Adj-RIB-Out and export it */ struct rtable_config *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ struct rtable_config *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ + struct rtable_config *base_table; /* Base table for Flowspec validation */ }; #define BGP_PT_INTERNAL 1 #define BGP_PT_EXTERNAL 2 +#define BGP_ROLE_UNDEFINED 255 +#define BGP_ROLE_PROVIDER 0 +#define BGP_ROLE_RS_SERVER 1 +#define BGP_ROLE_RS_CLIENT 2 +#define BGP_ROLE_CUSTOMER 3 +#define BGP_ROLE_PEER 4 + #define NH_NO 0 #define NH_ALL 1 #define NH_IBGP 2 @@ -224,6 +235,7 @@ struct bgp_caps { u8 ext_messages; /* Extended message length, RFC draft */ u8 route_refresh; /* Route refresh capability, RFC 2918 */ u8 enhanced_refresh; /* Enhanced route refresh, RFC 7313 */ + u8 role; /* BGP role capability, RFC 9234 */ u8 gr_aware; /* Graceful restart capability, RFC 4724 */ u8 gr_flags; /* Graceful restart flags */ @@ -317,6 +329,7 @@ struct bgp_proto { struct bgp_socket *sock; /* Shared listening socket */ struct bfd_request *bfd_req; /* BFD request, if BFD is used */ struct birdsock *postponed_sk; /* Postponed incoming socket for dynamic BGP */ + event *uncork_ev; /* Uncork event in case of congestion */ struct bgp_stats stats; /* BGP statistics */ btime last_established; /* Last time of enter/leave of established state */ btime last_rx_update; /* Last time of RX update */ @@ -344,15 +357,12 @@ struct bgp_channel { rtable *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ rtable *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ + rtable *base_table; /* Base table for Flowspec validation */ /* Rest are zeroed when down */ pool *pool; - HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ - struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ - list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ - - HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ - slab *prefix_slab; /* Slab holding prefix nodes */ + struct bgp_pending_tx *ptx; /* Routes waiting to be sent */ + struct rt_exporter prefix_exporter; /* Table-like exporter for ptx */ ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */ ip_addr link_addr; /* Link-local version of next_hop_addr */ @@ -373,11 +383,16 @@ struct bgp_channel { u8 feed_state; /* Feed state (TX) for EoR, RR packets, see BFS_* */ u8 load_state; /* Load state (RX) for EoR, RR packets, see BFS_* */ + + u8 feed_out_table; /* Refeed into out_table */ }; struct bgp_prefix { - node buck_node; /* Node in per-bucket list */ + node buck_node_xx; /* Node in per-bucket list */ struct bgp_prefix *next; /* Node in prefix hash table */ + struct bgp_bucket *last; /* Last bucket sent with this prefix */ + struct bgp_bucket *cur; /* Current bucket (cur == last) if no update is required */ + btime lastmod; /* Last modification of this prefix */ u32 hash; u32 path_id; net_addr net[0]; @@ -386,11 +401,24 @@ struct bgp_prefix { struct bgp_bucket { node send_node; /* Node in send queue */ struct bgp_bucket *next; /* Node in bucket hash table */ - list prefixes; /* Prefixes in this bucket (struct bgp_prefix) */ + list prefixes; /* Prefixes to send in this bucket (struct bgp_prefix) */ u32 hash; /* Hash over extended attributes */ + u32 px_uc; /* How many prefixes are linking this bucket */ ea_list eattrs[0]; /* Per-bucket extended attributes */ }; +struct bgp_pending_tx { + resource r; + pool *pool; + + HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ + struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ + list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ + + HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ + slab *prefix_slab; /* Slab holding prefix nodes */ +}; + struct bgp_export_state { struct bgp_proto *proto; struct bgp_channel *channel; @@ -401,7 +429,7 @@ struct bgp_export_state { int mpls; u32 attrs_seen[1]; - uint err_withdraw; + uint err_reject; uint local_next_hop; }; @@ -427,6 +455,7 @@ struct bgp_parse_state { int as4_session; int add_path; int mpls; + int reach_nlri_step; u32 attrs_seen[256/32]; @@ -453,13 +482,12 @@ struct bgp_parse_state { uint err_subcode; jmp_buf err_jmpbuf; - struct hostentry *hostentry; adata *mpls_labels; /* Cached state for bgp_rte_update() */ u32 last_id; struct rte_src *last_src; - rta *cached_rta; + ea_list *cached_ea; }; #define BGP_PORT 179 @@ -484,6 +512,12 @@ static inline int bgp_cc_is_ipv4(struct bgp_channel_config *c) static inline int bgp_cc_is_ipv6(struct bgp_channel_config *c) { return BGP_AFI(c->afi) == BGP_AFI_IPV6; } +static inline int bgp_channel_is_role_applicable(struct bgp_channel *c) +{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); } + +static inline int bgp_cc_is_role_applicable(struct bgp_channel_config *c) +{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); } + static inline uint bgp_max_packet_length(struct bgp_conn *conn) { return conn->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; } @@ -494,9 +528,6 @@ bgp_parse_error(struct bgp_parse_state *s, uint subcode) longjmp(s->err_jmpbuf, 1); } -extern struct linpool *bgp_linpool; -extern struct linpool *bgp_linpool2; - void bgp_start_timer(timer *t, uint value); void bgp_check_config(struct bgp_config *c); @@ -518,9 +549,14 @@ struct rte_source *bgp_find_source(struct bgp_proto *p, u32 path_id); struct rte_source *bgp_get_source(struct bgp_proto *p, u32 path_id); static inline int -rta_resolvable(rta *a) +rte_resolvable(const rte *rt) { - return a->dest == RTD_UNICAST; + eattr *nhea = ea_find(rt->attrs, &ea_gen_nexthop); + if (!nhea) + return 0; + + struct nexthop_adata *nhad = (void *) nhea->u.ptr; + return NEXTHOP_IS_REACHABLE(nhad) || (nhad->dest != RTD_UNREACHABLE); } extern struct rte_owner_class bgp_rte_owner_class; @@ -539,61 +575,39 @@ extern struct rte_owner_class bgp_rte_owner_class; /* attrs.c */ -static inline eattr * -bgp_find_attr(ea_list *attrs, uint code) -{ - return ea_find(attrs, EA_CODE(PROTOCOL_BGP, code)); -} - eattr * -bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintptr_t val); - -static inline void -bgp_set_attr_u32(ea_list **to, struct linpool *pool, uint code, uint flags, u32 val) -{ bgp_set_attr(to, pool, code, flags, (uintptr_t) val); } - -static inline void -bgp_set_attr_ptr(ea_list **to, struct linpool *pool, uint code, uint flags, const struct adata *val) -{ bgp_set_attr(to, pool, code, flags, (uintptr_t) val); } +bgp_find_attr(ea_list *attrs, uint code); -static inline void -bgp_set_attr_data(ea_list **to, struct linpool *pool, uint code, uint flags, void *data, uint len) -{ - struct adata *a = lp_alloc_adata(pool, len); - bmemcpy(a->data, data, len); - bgp_set_attr(to, pool, code, flags, (uintptr_t) a); -} - -static inline void -bgp_unset_attr(ea_list **to, struct linpool *pool, uint code) -{ eattr *e = bgp_set_attr(to, pool, code, 0, 0); e->type = EAF_TYPE_UNDEF; } +void bgp_set_attr_u32(ea_list **to, uint code, uint flags, u32 val); +void bgp_set_attr_ptr(ea_list **to, uint code, uint flags, const struct adata *ad); +void bgp_set_attr_data(ea_list **to, uint code, uint flags, void *data, uint len); +void bgp_unset_attr(ea_list **to, uint code); int bgp_encode_mp_reach_mrt(struct bgp_write_state *s, eattr *a, byte *buf, uint size); int bgp_encode_attrs(struct bgp_write_state *s, ea_list *attrs, byte *buf, byte *end); ea_list * bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len); -void bgp_finish_attrs(struct bgp_parse_state *s, rta *a); +void bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to); + +void bgp_setup_out_table(struct bgp_channel *c); + +void bgp_init_pending_tx(struct bgp_channel *c); +void bgp_free_pending_tx(struct bgp_channel *c); -void bgp_init_bucket_table(struct bgp_channel *c); -void bgp_free_bucket_table(struct bgp_channel *c); -void bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b); -void bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b); void bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b); +int bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b); -void bgp_init_prefix_table(struct bgp_channel *c); -void bgp_free_prefix_table(struct bgp_channel *c); -void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *bp); +void bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck); int bgp_rte_better(struct rte *, struct rte *); int bgp_rte_mergable(rte *pri, rte *sec); -int bgp_rte_recalculate(rtable_private *table, net *net, rte *new, rte *old, rte *old_best); -void bgp_rte_modify_stale(struct rt_export_request *, const net_addr *, struct rt_pending_export *, rte **, uint); -u32 bgp_rte_igp_metric(struct rte *); +int bgp_rte_recalculate(struct rtable_private *table, net *net, rte *new, rte *old, rte *old_best); +void bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe UNUSED, rte **feed, uint count); +u32 bgp_rte_igp_metric(const rte *); void bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, const rte *old); int bgp_preexport(struct channel *, struct rte *); -int bgp_get_attr(const struct eattr *e, byte *buf, int buflen); void bgp_get_route_info(struct rte *, byte *); -int bgp_total_aigp_metric_(rta *a, u64 *metric, const struct adata **ad); +int bgp_total_aigp_metric_(const rte *e, u64 *metric, const struct adata **ad); static inline struct bgp_proto *bgp_rte_proto(struct rte *rte) { @@ -605,15 +619,17 @@ static inline struct bgp_proto *bgp_rte_proto(struct rte *rte) #define BGP_AIGP_MAX U64(0xffffffffffffffff) static inline u64 -bgp_total_aigp_metric(rta *a) +bgp_total_aigp_metric(const rte *e) { u64 metric = BGP_AIGP_MAX; const struct adata *ad; - bgp_total_aigp_metric_(a, &metric, &ad); + bgp_total_aigp_metric_(e, &metric, &ad); return metric; } +void bgp_register_attrs(void); + /* packets.c */ @@ -625,6 +641,7 @@ void bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) void bgp_kick_tx(void *vconn); void bgp_tx(struct birdsock *sk); int bgp_rx(struct birdsock *sk, uint size); +void bgp_uncork(void *vp); const char * bgp_error_dsc(unsigned code, unsigned subcode); void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len); @@ -650,26 +667,32 @@ void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to); #define BAF_DECODE_FLAGS 0x0100 /* Private flag - attribute flags are handled by the decode hook */ -#define BA_ORIGIN 0x01 /* RFC 4271 */ /* WM */ -#define BA_AS_PATH 0x02 /* WM */ -#define BA_NEXT_HOP 0x03 /* WM */ -#define BA_MULTI_EXIT_DISC 0x04 /* ON */ -#define BA_LOCAL_PREF 0x05 /* WD */ -#define BA_ATOMIC_AGGR 0x06 /* WD */ -#define BA_AGGREGATOR 0x07 /* OT */ -#define BA_COMMUNITY 0x08 /* RFC 1997 */ /* OT */ -#define BA_ORIGINATOR_ID 0x09 /* RFC 4456 */ /* ON */ -#define BA_CLUSTER_LIST 0x0a /* RFC 4456 */ /* ON */ -#define BA_MP_REACH_NLRI 0x0e /* RFC 4760 */ -#define BA_MP_UNREACH_NLRI 0x0f /* RFC 4760 */ -#define BA_EXT_COMMUNITY 0x10 /* RFC 4360 */ -#define BA_AS4_PATH 0x11 /* RFC 6793 */ -#define BA_AS4_AGGREGATOR 0x12 /* RFC 6793 */ -#define BA_AIGP 0x1a /* RFC 7311 */ -#define BA_LARGE_COMMUNITY 0x20 /* RFC 8092 */ +enum bgp_attr_id { + BA_ORIGIN = 0x01, /* RFC 4271 */ /* WM */ + BA_AS_PATH = 0x02, /* WM */ + BA_NEXT_HOP = 0x03, /* WM */ + BA_MULTI_EXIT_DISC = 0x04, /* ON */ + BA_LOCAL_PREF = 0x05, /* WD */ + BA_ATOMIC_AGGR = 0x06, /* WD */ + BA_AGGREGATOR = 0x07, /* OT */ + BA_COMMUNITY = 0x08, /* RFC 1997 */ /* OT */ + BA_ORIGINATOR_ID = 0x09, /* RFC 4456 */ /* ON */ + BA_CLUSTER_LIST = 0x0a, /* RFC 4456 */ /* ON */ + BA_MP_REACH_NLRI = 0x0e, /* RFC 4760 */ + BA_MP_UNREACH_NLRI = 0x0f, /* RFC 4760 */ + BA_EXT_COMMUNITY = 0x10, /* RFC 4360 */ + BA_AS4_PATH = 0x11, /* RFC 6793 */ + BA_AS4_AGGREGATOR = 0x12, /* RFC 6793 */ + BA_AIGP = 0x1a, /* RFC 7311 */ + BA_LARGE_COMMUNITY = 0x20, /* RFC 8092 */ +#define BA_ONLY_TO_CUSTOMER 0x23 /* RFC 9234 */ /* Bird's private internal BGP attributes */ -#define BA_MPLS_LABEL_STACK 0xfe /* MPLS label stack transfer attribute */ + BA_MPLS_LABEL_STACK = 0x100, /* MPLS label stack transfer attribute */ + +/* Maximum */ + BGP_ATTR_MAX, +}; /* BGP connection states */ diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 2dfbdca9..9f0d2306 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -19,19 +19,19 @@ CF_DECLS CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE, MULTIHOP, STARTUP, VIA, NEXT, HOP, SELF, DEFAULT, PATH, METRIC, ERROR, - START, DELAY, FORGET, WAIT, ENABLE, DISABLE, AFTER, BGP_PATH, - BGP_LOCAL_PREF, BGP_MED, BGP_ORIGIN, BGP_NEXT_HOP, BGP_ATOMIC_AGGR, - BGP_AGGREGATOR, BGP_COMMUNITY, BGP_EXT_COMMUNITY, BGP_LARGE_COMMUNITY, + START, DELAY, FORGET, WAIT, ENABLE, DISABLE, AFTER, + BGP_LOCAL_PREF, BGP_MED, SOURCE, ADDRESS, PASSWORD, RR, RS, CLIENT, CLUSTER, ID, AS4, ADVERTISE, IPV4, CAPABILITIES, LIMIT, PASSIVE, PREFER, OLDER, MISSING, LLADDR, - DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, - BGP_CLUSTER_LIST, IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, + DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, + IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC, SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX, GRACEFUL, RESTART, AWARE, CHECK, LINK, PORT, EXTENDED, MESSAGES, SETKEY, STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG, LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS, - DYNAMIC, RANGE, NAME, DIGITS, BGP_AIGP, AIGP, ORIGINATE, COST, ENFORCE, - FIRST) + DYNAMIC, RANGE, NAME, DIGITS, AIGP, ORIGINATE, COST, ENFORCE, + FIRST, FREE, VALIDATE, BASE, ROLE, ROLES, PEER, PROVIDER, CUSTOMER, + RS_SERVER, RS_CLIENT, REQUIRE, BGP_OTC) %type <i> bgp_nh %type <i32> bgp_afi @@ -40,10 +40,12 @@ CF_KEYWORDS(CEASE, PREFIX, LIMIT, HIT, ADMINISTRATIVE, SHUTDOWN, RESET, PEER, CONFIGURATION, CHANGE, DECONFIGURED, CONNECTION, REJECTED, COLLISION, OUT, OF, RESOURCES) -%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag +%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag bgp_role_name CF_GRAMMAR +toksym: BGP_MED | BGP_LOCAL_PREF | SOURCE ; + proto: bgp_proto '}' ; bgp_proto_start: proto_start BGP { @@ -72,6 +74,7 @@ bgp_proto_start: proto_start BGP { BGP_CFG->llgr_mode = -1; BGP_CFG->llgr_time = 3600; BGP_CFG->setkey = 1; + BGP_CFG->local_role = BGP_ROLE_UNDEFINED; BGP_CFG->dynamic_name = "dynbgp"; BGP_CFG->check_link = -1; } @@ -114,6 +117,14 @@ bgp_cease_flag: | OUT OF RESOURCES { $$ = 1 << 8; } ; +bgp_role_name: + PEER { $$ = BGP_ROLE_PEER; } + | PROVIDER { $$ = BGP_ROLE_PROVIDER; } + | CUSTOMER { $$ = BGP_ROLE_CUSTOMER; } + | RS_SERVER { $$ = BGP_ROLE_RS_SERVER; } + | RS_CLIENT { $$ = BGP_ROLE_RS_CLIENT; } + ; + bgp_proto: bgp_proto_start proto_name '{' | bgp_proto proto_item ';' @@ -155,6 +166,7 @@ bgp_proto: } | bgp_proto DYNAMIC NAME DIGITS expr ';' { BGP_CFG->dynamic_name_digits = $5; if ($5>10) cf_error("Dynamic name digits must be at most 10"); } | bgp_proto STRICT BIND bool ';' { BGP_CFG->strict_bind = $4; } + | bgp_proto FREE BIND bool ';' { BGP_CFG->free_bind = $4; } | bgp_proto PATH METRIC bool ';' { BGP_CFG->compare_path_lengths = $4; } | bgp_proto MED METRIC bool ';' { BGP_CFG->med_metric = $4; } | bgp_proto IGP METRIC bool ';' { BGP_CFG->igp_metric = $4; } @@ -196,6 +208,8 @@ bgp_proto: | bgp_proto BFD GRACEFUL ';' { init_bfd_opts(&BGP_CFG->bfd); BGP_CFG->bfd->mode = BGP_BFD_GRACEFUL; } | bgp_proto BFD { open_bfd_opts(&BGP_CFG->bfd); } bfd_opts { close_bfd_opts(); } ';' | bgp_proto ENFORCE FIRST AS bool ';' { BGP_CFG->enforce_first_as = $5; } + | bgp_proto LOCAL ROLE bgp_role_name ';' { BGP_CFG->local_role = $4; } + | bgp_proto REQUIRE ROLES bool ';' { BGP_CFG->require_roles = $4; } ; bgp_afi: @@ -255,6 +269,11 @@ bgp_channel_item: | GATEWAY DIRECT { BGP_CC->gw_mode = GW_DIRECT; } | GATEWAY RECURSIVE { BGP_CC->gw_mode = GW_RECURSIVE; } | SECONDARY bool { BGP_CC->secondary = $2; } + | VALIDATE bool { + BGP_CC->validate = $2; + if (BGP_SAFI(BGP_CC->afi) != BGP_SAFI_FLOW) + cf_error("Validate option limited to flowspec channels"); + } | GRACEFUL RESTART bool { BGP_CC->gr_able = $3; } | LONG LIVED GRACEFUL RESTART bool { BGP_CC->llgr_able = $5; } | LONG LIVED STALE TIME expr { BGP_CC->llgr_time = $5; } @@ -278,6 +297,16 @@ bgp_channel_item: else cf_error("Mismatched IGP table type"); } + | BASE TABLE rtable { + if (BGP_SAFI(BGP_CC->afi) != BGP_SAFI_FLOW) + cf_error("Base table option limited to flowspec channels"); + + if (((BGP_CC->afi == BGP_AF_FLOW4) && ($3->addr_type == NET_IP4)) || + ((BGP_CC->afi == BGP_AF_FLOW6) && ($3->addr_type == NET_IP6))) + BGP_CC->base_table = $3; + else + cf_error("Mismatched base table type"); + } ; bgp_channel_opts: @@ -300,36 +329,6 @@ bgp_channel_end: bgp_proto_channel: bgp_channel_start bgp_channel_opt_list bgp_channel_end; - -dynamic_attr: BGP_ORIGIN - { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_ENUM_BGP_ORIGIN, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); } ; -dynamic_attr: BGP_PATH - { $$ = f_new_dynamic_attr(EAF_TYPE_AS_PATH, T_PATH, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); } ; -dynamic_attr: BGP_NEXT_HOP - { $$ = f_new_dynamic_attr(EAF_TYPE_IP_ADDRESS, T_IP, EA_CODE(PROTOCOL_BGP, BA_NEXT_HOP)); } ; -dynamic_attr: BGP_MED - { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); } ; -dynamic_attr: BGP_LOCAL_PREF - { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); } ; -dynamic_attr: BGP_ATOMIC_AGGR - { $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_ATOMIC_AGGR)); } ; -dynamic_attr: BGP_AGGREGATOR - { $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_AGGREGATOR)); } ; -dynamic_attr: BGP_COMMUNITY - { $$ = f_new_dynamic_attr(EAF_TYPE_INT_SET, T_CLIST, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)); } ; -dynamic_attr: BGP_ORIGINATOR_ID - { $$ = f_new_dynamic_attr(EAF_TYPE_ROUTER_ID, T_QUAD, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID)); } ; -dynamic_attr: BGP_CLUSTER_LIST - { $$ = f_new_dynamic_attr(EAF_TYPE_INT_SET, T_CLIST, EA_CODE(PROTOCOL_BGP, BA_CLUSTER_LIST)); } ; -dynamic_attr: BGP_EXT_COMMUNITY - { $$ = f_new_dynamic_attr(EAF_TYPE_EC_SET, T_ECLIST, EA_CODE(PROTOCOL_BGP, BA_EXT_COMMUNITY)); } ; -dynamic_attr: BGP_AIGP - { $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_AIGP)); } ; -dynamic_attr: BGP_LARGE_COMMUNITY - { $$ = f_new_dynamic_attr(EAF_TYPE_LC_SET, T_LCLIST, EA_CODE(PROTOCOL_BGP, BA_LARGE_COMMUNITY)); } ; - - - CF_ENUM(T_ENUM_BGP_ORIGIN, ORIGIN_, IGP, EGP, INCOMPLETE) CF_CODE diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index ea9adb4c..d4d2d0b0 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -15,8 +15,8 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "proto/mrt/mrt.h" #include "conf/conf.h" #include "lib/unaligned.h" @@ -238,6 +238,7 @@ bgp_prepare_capabilities(struct bgp_conn *conn) caps->ext_messages = p->cf->enable_extended_messages; caps->route_refresh = p->cf->enable_refresh; caps->enhanced_refresh = p->cf->enable_refresh; + caps->role = p->cf->local_role; if (caps->as4_support) caps->as4_number = p->public_as; @@ -350,6 +351,13 @@ bgp_write_capabilities(struct bgp_conn *conn, byte *buf) *buf++ = 0; /* Capability data length */ } + if (caps->role != BGP_ROLE_UNDEFINED) + { + *buf++ = 9; /* Capability 9: Announce chosen BGP role */ + *buf++ = 1; /* Capability data length */ + *buf++ = caps->role; + } + if (caps->gr_aware) { *buf++ = 64; /* Capability 64: Support for graceful restart */ @@ -449,11 +457,15 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) struct bgp_proto *p = conn->bgp; struct bgp_caps *caps; struct bgp_af_caps *ac; + uint err_subcode = 0; int i, cl; u32 af; if (!conn->remote_caps) + { caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + sizeof(struct bgp_af_caps)); + caps->role = BGP_ROLE_UNDEFINED; + } else { caps = conn->remote_caps; @@ -513,6 +525,21 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) caps->ext_messages = 1; break; + case 9: /* BGP role capability, RFC 9234 */ + if (cl != 1) + goto err; + + /* Reserved value */ + if (pos[2] == BGP_ROLE_UNDEFINED) + { err_subcode = 11; goto err; } + + /* Multiple inconsistent values */ + if ((caps->role != BGP_ROLE_UNDEFINED) && (caps->role != pos[2])) + { err_subcode = 11; goto err; } + + caps->role = pos[2]; + break; + case 64: /* Graceful restart capability, RFC 4724 */ if (cl % 4 != 2) goto err; @@ -638,7 +665,7 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) err: mb_free(caps); - bgp_error(conn, 2, 0, NULL, 0); + bgp_error(conn, 2, err_subcode, NULL, 0); return -1; } @@ -854,6 +881,22 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) conn->received_as = asn; } + /* RFC 9234 4.2 - check role agreement */ + u8 local_role = p->cf->local_role; + u8 neigh_role = caps->role; + + if ((local_role != BGP_ROLE_UNDEFINED) && + (neigh_role != BGP_ROLE_UNDEFINED) && + !((local_role == BGP_ROLE_PEER && neigh_role == BGP_ROLE_PEER) || + (local_role == BGP_ROLE_CUSTOMER && neigh_role == BGP_ROLE_PROVIDER) || + (local_role == BGP_ROLE_PROVIDER && neigh_role == BGP_ROLE_CUSTOMER) || + (local_role == BGP_ROLE_RS_CLIENT && neigh_role == BGP_ROLE_RS_SERVER) || + (local_role == BGP_ROLE_RS_SERVER && neigh_role == BGP_ROLE_RS_CLIENT))) + { bgp_error(conn, 2, 11, NULL, 0); return; } + + if ((p->cf->require_roles) && (neigh_role == BGP_ROLE_UNDEFINED)) + { bgp_error(conn, 2, 11, NULL, 0); return; } + /* Check the other connection */ other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; switch (other->state) @@ -932,14 +975,18 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) #define WITHDRAW(msg, args...) \ ({ REPORT(msg, ## args); s->err_withdraw = 1; return; }) +#define REJECT(msg, args...) \ + ({ log(L_ERR "%s: " msg, s->proto->p.name, ## args); s->err_reject = 1; return; }) + #define BAD_AFI "Unexpected AF <%u/%u> in UPDATE" #define BAD_NEXT_HOP "Invalid NEXT_HOP attribute" #define NO_NEXT_HOP "Missing NEXT_HOP attribute" #define NO_LABEL_STACK "Missing MPLS stack" +#define MISMATCHED_AF " - mismatched address family (%I for %s)" static void -bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll) +bgp_apply_next_hop(struct bgp_parse_state *s, ea_list **to, ip_addr gw, ip_addr ll) { struct bgp_proto *p = s->proto; struct bgp_channel *c = s->channel; @@ -949,67 +996,86 @@ bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll) neighbor *nbr = NULL; /* GW_DIRECT -> single_hop -> p->neigh != NULL */ - if (ipa_nonzero(gw)) + if (ipa_nonzero2(gw)) nbr = neigh_find(&p->p, gw, NULL, 0); else if (ipa_nonzero(ll)) nbr = neigh_find(&p->p, ll, p->neigh->iface, 0); + else + WITHDRAW(BAD_NEXT_HOP " - zero address"); + + if (!nbr) + WITHDRAW(BAD_NEXT_HOP " - address %I not directly reachable", ipa_nonzero(gw) ? gw : ll); - if (!nbr || (nbr->scope == SCOPE_HOST)) - WITHDRAW(BAD_NEXT_HOP); + if (nbr->scope == SCOPE_HOST) + WITHDRAW(BAD_NEXT_HOP " - address %I is local", nbr->addr); - a->dest = RTD_UNICAST; - a->nh.gw = nbr->addr; - a->nh.iface = nbr->iface; - a->igp_metric = c->cf->cost; + ea_set_attr_u32(to, &ea_gen_igp_metric, 0, c->cf->cost); + + struct nexthop_adata nhad = { + .nh = { + .gw = nbr->addr, + .iface = nbr->iface, + }, + .ad = { + .length = sizeof nhad - sizeof nhad.ad, + }, + }; + ea_set_attr_data(to, &ea_gen_nexthop, 0, nhad.ad.data, nhad.ad.length); } else /* GW_RECURSIVE */ { - if (ipa_zero(gw)) - WITHDRAW(BAD_NEXT_HOP); + if (ipa_zero2(gw)) + WITHDRAW(BAD_NEXT_HOP " - zero address"); rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6; - s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table); - - if (!s->mpls) - rta_apply_hostentry(a, s->hostentry, NULL, s->pool); - - /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */ + if (s->mpls) + { + u32 labels[BGP_MPLS_MAX]; + ea_set_hostentry(to, c->c.table, tab, gw, ll, BGP_MPLS_MAX, labels); + } + else + ea_set_hostentry(to, c->c.table, tab, gw, ll, 0, NULL); } } static void -bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum) +bgp_apply_mpls_labels(struct bgp_parse_state *s, ea_list **to, u32 lnum, u32 labels[lnum]) { if (lnum > MPLS_MAX_LABEL_STACK) { REPORT("Too many MPLS labels ($u)", lnum); - a->dest = RTD_UNREACHABLE; - a->hostentry = NULL; - a->nh = (struct nexthop) { }; + ea_set_dest(to, 0, RTD_UNREACHABLE); return; } /* Handle implicit NULL as empty MPLS stack */ if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL)) - lnum = 0; + lnum = s->mpls_labels->length = 0; if (s->channel->cf->gw_mode == GW_DIRECT) { - a->nh.labels = lnum; - memcpy(a->nh.label, labels, 4*lnum); + eattr *e = ea_find(*to, &ea_gen_nexthop); + struct { + struct nexthop_adata nhad; + u32 labels[MPLS_MAX_LABEL_STACK]; + } nh; + + memcpy(&nh.nhad, e->u.ptr, sizeof(struct adata) + e->u.ptr->length); + nh.nhad.nh.labels = lnum; + memcpy(nh.labels, labels, lnum * sizeof(u32)); + nh.nhad.ad.length = sizeof nh.nhad + lnum * sizeof(u32); } else /* GW_RECURSIVE */ { - mpls_label_stack ms; - - ms.len = lnum; - memcpy(ms.stack, labels, 4*lnum); - rta_apply_hostentry(a, s->hostentry, &ms, s->pool); + eattr *e = ea_find(*to, &ea_gen_hostentry); + ASSERT_DIE(e); + struct hostentry_adata *head = (void *) e->u.ptr; + memcpy(&head->labels, labels, lnum * sizeof(u32)); + head->ad.length = (void *)(&head->labels[lnum]) - (void *) head->ad.data; } } - static int bgp_match_src(struct bgp_export_state *s, int mode) { @@ -1039,7 +1105,7 @@ bgp_use_next_hop(struct bgp_export_state *s, eattr *a) return 1; /* Keep it when explicitly set in export filter */ - if (a->type & EAF_FRESH) + if (a->fresh) return 1; /* Check for non-matching AF */ @@ -1056,31 +1122,41 @@ bgp_use_next_hop(struct bgp_export_state *s, eattr *a) return p->neigh && (p->neigh->iface == ifa); } -static inline int +static inline struct nexthop * bgp_use_gateway(struct bgp_export_state *s) { struct bgp_proto *p = s->proto; struct bgp_channel *c = s->channel; - rta *ra = s->route->attrs; + ea_list *ra = s->route->attrs; /* Handle next hop self option - also applies to gateway */ if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self)) - return 0; + return NULL; + + eattr *nhea = ea_find(ra, &ea_gen_nexthop); + if (!nhea) + return NULL; /* We need one valid global gateway */ - if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw)) - return 0; + struct nexthop_adata *nhad = (struct nexthop_adata *) nhea->u.ptr; + if (!NEXTHOP_IS_REACHABLE(nhad) || + !NEXTHOP_ONE(nhad) || ipa_zero(nhad->nh.gw) || + ipa_is_link_local(nhad->nh.gw)) + return NULL; /* Check for non-matching AF */ - if ((ipa_is_ip4(ra->nh.gw) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop) - return 0; + if ((ipa_is_ip4(nhad->nh.gw) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop) + return NULL; /* Use it when exported to internal peers */ if (p->is_interior) - return 1; + return &nhad->nh; /* Use it when forwarded to single-hop BGP peer on on the same iface */ - return p->neigh && (p->neigh->iface == ra->nh.iface); + if (p->neigh && (p->neigh->iface == nhad->nh.iface)) + return &nhad->nh; + + return NULL; } static void @@ -1088,31 +1164,31 @@ bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to) { if (!a || !bgp_use_next_hop(s, a)) { - if (bgp_use_gateway(s)) + struct nexthop *nhloc; + if (nhloc = bgp_use_gateway(s)) { - rta *ra = s->route->attrs; - ip_addr nh[1] = { ra->nh.gw }; - bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16); + ip_addr nh[1] = { nhloc->gw }; + bgp_set_attr_data(to, BA_NEXT_HOP, 0, nh, 16); if (s->mpls) { u32 implicit_null = BGP_MPLS_NULL; - u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null; - uint lnum = ra->nh.labels ? ra->nh.labels : 1; - bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4); + u32 *labels = nhloc->labels ? nhloc->label : &implicit_null; + uint lnum = nhloc->labels ? nhloc->labels : 1; + bgp_set_attr_data(to, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4); } } else { ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr }; - bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16); + bgp_set_attr_data(to, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16); s->local_next_hop = 1; /* TODO: Use local MPLS assigned label */ if (s->mpls) { u32 implicit_null = BGP_MPLS_NULL; - bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, &implicit_null, 4); + bgp_set_attr_data(to, BA_MPLS_LABEL_STACK, 0, &implicit_null, 4); } } } @@ -1120,28 +1196,28 @@ bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to) /* Check if next hop is valid */ a = bgp_find_attr(*to, BA_NEXT_HOP); if (!a) - WITHDRAW(NO_NEXT_HOP); + REJECT(NO_NEXT_HOP); ip_addr *nh = (void *) a->u.ptr->data; ip_addr peer = s->proto->remote_ip; uint len = a->u.ptr->length; /* Forbid zero next hop */ - if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1]))) - WITHDRAW(BAD_NEXT_HOP); + if (ipa_zero2(nh[0]) && ((len != 32) || ipa_zero(nh[1]))) + REJECT(BAD_NEXT_HOP " - zero address"); /* Forbid next hop equal to neighbor IP */ if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1]))) - WITHDRAW(BAD_NEXT_HOP); + REJECT(BAD_NEXT_HOP " - neighbor address %I", peer); /* Forbid next hop with non-matching AF */ if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) && !s->channel->ext_next_hop) - WITHDRAW(BAD_NEXT_HOP); + REJECT(BAD_NEXT_HOP MISMATCHED_AF, nh[0], s->channel->desc->name); /* Just check if MPLS stack */ if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK)) - WITHDRAW(NO_LABEL_STACK); + REJECT(NO_LABEL_STACK); } static uint @@ -1175,7 +1251,7 @@ bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size } static void -bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a) +bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, ea_list **to) { struct bgp_channel *c = s->channel; struct adata *ad = lp_alloc_adata(s->pool, 32); @@ -1212,12 +1288,12 @@ bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a) ad->length = 16; if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop) - WITHDRAW(BAD_NEXT_HOP); + WITHDRAW(BAD_NEXT_HOP MISMATCHED_AF, nh[0], c->desc->name); // XXXX validate next hop - bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad); - bgp_apply_next_hop(s, a, nh[0], nh[1]); + bgp_set_attr_ptr(to, BA_NEXT_HOP, 0, ad); + bgp_apply_next_hop(s, to, nh[0], nh[1]); } static uint @@ -1255,7 +1331,7 @@ bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint siz } static void -bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a) +bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, ea_list **to) { struct bgp_channel *c = s->channel; struct adata *ad = lp_alloc_adata(s->pool, 32); @@ -1293,12 +1369,12 @@ bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a) bgp_parse_error(s, 9); if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop) - WITHDRAW(BAD_NEXT_HOP); + WITHDRAW(BAD_NEXT_HOP MISMATCHED_AF, nh[0], c->desc->name); // XXXX validate next hop - bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad); - bgp_apply_next_hop(s, a, nh[0], nh[1]); + bgp_set_attr_ptr(to, BA_NEXT_HOP, 0, ad); + bgp_apply_next_hop(s, to, nh[0], nh[1]); } @@ -1310,7 +1386,7 @@ bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte } static void -bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED) +bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, ea_list **to UNUSED) { /* * Although we expect no next hop and RFC 7606 7.11 states that attribute @@ -1322,11 +1398,11 @@ bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, ui } static void -bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to) +bgp_update_next_hop_none(struct bgp_export_state *s UNUSED, eattr *a, ea_list **to) { /* NEXT_HOP shall not pass */ if (a) - bgp_unset_attr(to, s->pool, BA_NEXT_HOP); + bgp_unset_attr(to, BA_NEXT_HOP); } @@ -1335,7 +1411,7 @@ bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to) */ static void -bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0) +bgp_rte_update(struct bgp_parse_state *s, const net_addr *n, u32 path_id, ea_list *a0) { if (path_id != s->last_id) { @@ -1344,28 +1420,27 @@ bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0) s->last_src = rt_get_source(&s->proto->p, path_id); s->last_id = path_id; - rta_free(s->cached_rta); - s->cached_rta = NULL; + ea_free(s->cached_ea); + s->cached_ea = NULL; } if (!a0) { + /* Route update was changed to withdraw */ + if (s->err_withdraw && s->reach_nlri_step) + REPORT("Invalid route %N withdrawn", n); + /* Route withdraw */ rte_update(&s->channel->c, n, NULL, s->last_src); return; } /* Prepare cached route attributes */ - if (s->cached_rta == NULL) - { - /* Workaround for rta_lookup() breaking eattrs */ - ea_list *ea = a0->eattrs; - s->cached_rta = rta_lookup(a0); - a0->eattrs = ea; - } + if (s->cached_ea == NULL) + s->cached_ea = ea_lookup(a0, 0); rte e0 = { - .attrs = s->cached_rta, + .attrs = s->cached_ea, .src = s->last_src, }; @@ -1392,9 +1467,10 @@ bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, const adata *mpls, byte } static void -bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a) +bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, ea_list **to) { - u32 labels[BGP_MPLS_MAX], label; + u32 labels[BGP_MPLS_MAX]; + u32 label; uint lnum = 0; do { @@ -1408,31 +1484,20 @@ bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *p /* RFC 8277 2.4 - withdraw does not have variable-size MPLS stack but fixed-size 24-bit Compatibility field, which MUST be ignored */ - if (!a && !s->err_withdraw) + if (!s->reach_nlri_step) return; } while (!(label & BGP_MPLS_BOS)); - if (!a) + if (!*to) return; - /* Attach MPLS attribute unless we already have one */ - if (!s->mpls_labels) - { - s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX); - bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels); - } - - /* Overwrite data in the attribute */ - s->mpls_labels->length = 4*lnum; - memcpy(s->mpls_labels->data, labels, 4*lnum); - /* Update next hop entry in rta */ - bgp_apply_mpls_labels(s, a, labels, lnum); + bgp_apply_mpls_labels(s, to, lnum, labels); /* Attributes were changed, invalidate cached entry */ - rta_free(s->cached_rta); - s->cached_rta = NULL; + rta_free(s->cached_ea); + s->cached_ea = NULL; return; } @@ -1468,14 +1533,14 @@ bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1501,7 +1566,7 @@ bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) /* Decode MPLS labels */ if (s->mpls) - bgp_decode_mpls_labels(s, &pos, &len, &l, a); + bgp_decode_mpls_labels(s, &pos, &len, &l, &a); if (l > IP4_MAX_PREFIX_LENGTH) bgp_parse_error(s, 10); @@ -1553,14 +1618,14 @@ bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1586,7 +1651,7 @@ bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) /* Decode MPLS labels */ if (s->mpls) - bgp_decode_mpls_labels(s, &pos, &len, &l, a); + bgp_decode_mpls_labels(s, &pos, &len, &l, &a); if (l > IP6_MAX_PREFIX_LENGTH) bgp_parse_error(s, 10); @@ -1641,14 +1706,14 @@ bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *b memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1674,7 +1739,7 @@ bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) /* Decode MPLS labels */ if (s->mpls) - bgp_decode_mpls_labels(s, &pos, &len, &l, a); + bgp_decode_mpls_labels(s, &pos, &len, &l, &a); /* Decode route distinguisher */ if (l < 64) @@ -1738,14 +1803,14 @@ bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *b memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1771,7 +1836,7 @@ bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) /* Decode MPLS labels */ if (s->mpls) - bgp_decode_mpls_labels(s, &pos, &len, &l, a); + bgp_decode_mpls_labels(s, &pos, &len, &l, &a); /* Decode route distinguisher */ if (l < 64) @@ -1825,14 +1890,14 @@ bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte * memcpy(pos, net->data, flen); ADVANCE(pos, size, flen); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1913,14 +1978,14 @@ bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte * memcpy(pos, net->data, flen); ADVANCE(pos, size, flen); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -2147,6 +2212,8 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu * var IPv4 Network Layer Reachability Information */ + ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck); + int lr, la; la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH); @@ -2168,6 +2235,8 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu static byte * bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { + ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck); + /* * 2 B IPv4 Withdrawn Routes Length (zero) * --- IPv4 Withdrawn Routes NLRI (unused) @@ -2291,11 +2360,14 @@ bgp_create_update(struct bgp_channel *c, byte *buf) again: ; + struct lp_state tmpp; + lp_save(tmp_linpool, &tmpp); + /* Initialize write state */ struct bgp_write_state s = { .proto = p, .channel = c, - .pool = bgp_linpool, + .pool = tmp_linpool, .mp_reach = (c->afi != BGP_AF_IPV4) || c->ext_next_hop, .as4_session = p->as4_session, .add_path = c->add_path_tx, @@ -2303,7 +2375,7 @@ again: ; }; /* Try unreachable bucket */ - if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) + if ((buck = c->ptx->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) { res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ? bgp_create_ip_unreach(&s, buck, buf, end): @@ -2313,14 +2385,14 @@ again: ; } /* Try reachable buckets */ - if (!EMPTY_LIST(c->bucket_queue)) + if (!EMPTY_LIST(c->ptx->bucket_queue)) { - buck = HEAD(c->bucket_queue); + buck = HEAD(c->ptx->bucket_queue); /* Cleanup empty buckets */ - if (EMPTY_LIST(buck->prefixes)) + if (bgp_done_bucket(c, buck)) { - bgp_free_bucket(c, buck); + lp_restore(tmp_linpool, &tmpp); goto again; } @@ -2328,13 +2400,13 @@ again: ; bgp_create_ip_reach(&s, buck, buf, end): bgp_create_mp_reach(&s, buck, buf, end); - if (EMPTY_LIST(buck->prefixes)) - bgp_free_bucket(c, buck); - else - bgp_defer_bucket(c, buck); + bgp_done_bucket(c, buck); if (!res) + { + lp_restore(tmp_linpool, &tmpp); goto again; + } goto done; } @@ -2345,7 +2417,7 @@ again: ; done: BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE"); p->stats.tx_updates++; - lp_flush(s.pool); + lp_restore(tmp_linpool, &tmpp); return res; } @@ -2412,7 +2484,6 @@ static inline void bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len) { struct bgp_channel *c = bgp_get_channel(s->proto, afi); - rta *a = NULL; if (!c) DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi)); @@ -2434,26 +2505,22 @@ bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_lis if (ea) { - a = allocz(RTA_MAX_SIZE); + ea_set_attr_data(&ea, &ea_gen_from, 0, &s->proto->remote_ip, sizeof(ip_addr)); + ea_set_attr_u32(&ea, &ea_gen_preference, 0, c->c.preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_BGP); - a->source = RTS_BGP; - a->scope = SCOPE_UNIVERSE; - a->from = s->proto->remote_ip; - a->eattrs = ea; - a->pref = c->c.preference; - - c->desc->decode_next_hop(s, nh, nh_len, a); - bgp_finish_attrs(s, a); + c->desc->decode_next_hop(s, nh, nh_len, &ea); + bgp_finish_attrs(s, &ea); /* Handle withdraw during next hop decoding */ if (s->err_withdraw) - a = NULL; + ea = NULL; } - c->desc->decode_nlri(s, nlri, len, a); + c->desc->decode_nlri(s, nlri, len, ea); - rta_free(s->cached_rta); - s->cached_rta = NULL; + rta_free(s->cached_ea); + s->cached_ea = NULL; rt_unlock_source(s->last_src); } @@ -2477,10 +2544,13 @@ bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) bgp_start_timer(conn->hold_timer, conn->hold_time); + struct lp_state tmpp; + lp_save(tmp_linpool, &tmpp); + /* Initialize parse state */ struct bgp_parse_state s = { .proto = p, - .pool = bgp_linpool, + .pool = tmp_linpool, .as4_session = p->as4_session, }; @@ -2546,6 +2616,8 @@ bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) if (s.mp_unreach_len) bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0); + s.reach_nlri_step = 1; + if (s.ip_reach_len) bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len, ea, s.ip_next_hop_data, s.ip_next_hop_len); @@ -2555,8 +2627,8 @@ bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) ea, s.mp_next_hop_data, s.mp_next_hop_len); done: - rta_free(s.cached_rta); - lp_flush(s.pool); + rta_free(s.cached_ea); + lp_restore(tmp_linpool, &tmpp); return; } @@ -2879,7 +2951,11 @@ bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) { ASSERT(conn->sk); - DBG("BGP: Scheduling packet type %d\n", type); + struct bgp_proto *p = conn->bgp; + if (c) + BGP_TRACE(D_PACKETS, "Scheduling packet type %d for channel %s", type, c->c.name); + else + BGP_TRACE(D_PACKETS, "Scheduling packet type %d", type); if (c) { @@ -2944,6 +3020,7 @@ static struct { { 2, 6, "Unacceptable hold time" }, { 2, 7, "Required capability missing" }, /* [RFC5492] */ { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */ + { 2,11, "Role mismatch" }, /* From Open Policy, RFC 9234 */ { 3, 0, "Invalid UPDATE message" }, { 3, 1, "Malformed attribute list" }, { 3, 2, "Unrecognized well-known attribute" }, @@ -3147,6 +3224,21 @@ bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len) } } +void +bgp_uncork(void *vp) +{ + struct bgp_proto *p = vp; + + if (p && p->conn && (p->conn->state == BS_ESTABLISHED) && !p->conn->sk->rx_hook) + { + struct birdsock *sk = p->conn->sk; + ASSERT_DIE(sk->rpos > sk->rbuf); + sk->rx_hook = bgp_rx; + bgp_rx(sk, sk->rpos - sk->rbuf); + BGP_TRACE(D_PACKETS, "Uncorked"); + } +} + /** * bgp_rx - handle received data * @sk: socket @@ -3161,6 +3253,7 @@ int bgp_rx(sock *sk, uint size) { struct bgp_conn *conn = sk->data; + struct bgp_proto *p = conn->bgp; byte *pkt_start = sk->rbuf; byte *end = pkt_start + size; uint i, len; @@ -3170,6 +3263,12 @@ bgp_rx(sock *sk, uint size) { if ((conn->state == BS_CLOSE) || (conn->sk != sk)) return 0; + if ((conn->state == BS_ESTABLISHED) && rt_cork_check(conn->bgp->uncork_ev)) + { + sk->rx_hook = NULL; + BGP_TRACE(D_PACKETS, "Corked"); + return 0; + } for(i=0; i<16; i++) if (pkt_start[i] != 0xff) { diff --git a/proto/mrt/Makefile b/proto/mrt/Makefile index 925fb102..8cd44ac1 100644 --- a/proto/mrt/Makefile +++ b/proto/mrt/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/mrt/mrt.c b/proto/mrt/mrt.c index b40592d2..f07f9ca2 100644 --- a/proto/mrt/mrt.c +++ b/proto/mrt/mrt.c @@ -243,21 +243,15 @@ mrt_next_table(struct mrt_table_dump_state *s) rtable *tab = mrt_next_table_(s->table, s->table_ptr, s->table_expr); if (s->table) - { - RT_LOCK(s->table); - rt_unlock_table(RT_PRIV(s->table)); - RT_UNLOCK(s->table); - } + RT_LOCKED(s->table, tab) + rt_unlock_table(tab); s->table = tab; s->ipv4 = tab ? (tab->addr_type == NET_IP4) : 0; if (s->table) - { - RT_LOCK(s->table); - rt_lock_table(RT_PRIV(s->table)); - RT_UNLOCK(s->table); - } + RT_LOCKED(s->table, tab) + rt_lock_table(tab); return s->table; } @@ -431,7 +425,7 @@ mrt_rib_table_header(struct mrt_table_dump_state *s, net_addr *n) static void mrt_rib_table_entry_bgp_attrs(struct mrt_table_dump_state *s, rte *r) { - struct ea_list *eattrs = r->attrs->eattrs; + struct ea_list *eattrs = r->attrs; buffer *b = &s->buf; if (!eattrs) @@ -439,7 +433,7 @@ mrt_rib_table_entry_bgp_attrs(struct mrt_table_dump_state *s, rte *r) /* Attribute list must be normalized for bgp_encode_attrs() */ if (!rta_is_cached(r->attrs)) - ea_normalize(eattrs); + eattrs = ea_normalize(eattrs, 0); mrt_buffer_need(b, MRT_ATTR_BUFFER_SIZE); byte *pos = b->pos; @@ -533,7 +527,7 @@ mrt_rib_table_dump(struct mrt_table_dump_state *s, net *n, int add_path) } rte e = rt->rte; - if (f_run(s->filter, &e, s->linpool, 0) <= F_ACCEPT) + if (f_run(s->filter, &e, 0) <= F_ACCEPT) mrt_rib_table_entry(s, &e); lp_flush(s->linpool); @@ -565,8 +559,8 @@ mrt_table_dump_init(pool *pp) struct mrt_table_dump_state *s = mb_allocz(pool, sizeof(struct mrt_table_dump_state)); s->pool = pool; - s->linpool = lp_new(pool, 4080); - s->peer_lp = lp_new(pool, 4080); + s->linpool = lp_new(pool); + s->peer_lp = lp_new(pool); mrt_buffer_init(&s->buf, pool, 2 * MRT_ATTR_BUFFER_SIZE); /* We lock the current config as we may reference it indirectly by filter */ @@ -582,22 +576,17 @@ static void mrt_table_dump_free(struct mrt_table_dump_state *s) { if (s->table) - { - RT_LOCK(s->table); - - if (s->table_open) - FIB_ITERATE_UNLINK(&s->fit, &RT_PRIV(s->table)->fib); + RT_LOCKED(s->table, tab) + { + if (s->table_open) + FIB_ITERATE_UNLINK(&s->fit, &tab->fib); - rt_unlock_table(RT_PRIV(s->table)); - RT_UNLOCK(s->table); - } + rt_unlock_table(tab); + } if (s->table_ptr) - { - RT_LOCK(s->table_ptr); - rt_unlock_table(RT_PRIV(s->table_ptr)); - RT_UNLOCK(s->table_ptr); - } + RT_LOCKED(s->table_ptr, tab) + rt_unlock_table(tab); config_del_obstacle(s->config); @@ -613,14 +602,8 @@ mrt_table_dump_step(struct mrt_table_dump_state *s) s->max = 2048; s->bws = &bws; - rtable_private *tab; - if (s->table_open) - { - RT_LOCK(s->table); - tab = RT_PRIV(s->table); goto step; - } while (mrt_next_table(s)) { @@ -629,8 +612,9 @@ mrt_table_dump_step(struct mrt_table_dump_state *s) mrt_peer_table_dump(s); - RT_LOCK(s->table); - tab = RT_PRIV(s->table); + RT_LOCKED(s->table, tab) + { + FIB_ITERATE_INIT(&s->fit, &tab->fib); s->table_open = 1; @@ -640,8 +624,7 @@ mrt_table_dump_step(struct mrt_table_dump_state *s) if (s->max < 0) { FIB_ITERATE_PUT(&s->fit); - RT_UNLOCK(s->table); - return 0; + RT_RETURN(tab, 0); } /* With Always ADD_PATH option, we jump directly to second phase */ @@ -656,11 +639,12 @@ mrt_table_dump_step(struct mrt_table_dump_state *s) FIB_ITERATE_END; s->table_open = 0; + } + mrt_close_file(s); mrt_peer_table_flush(s); } - RT_UNLOCK(s->table); return 1; } @@ -688,11 +672,8 @@ mrt_timer(timer *t) s->always_add_path = cf->always_add_path; if (s->table_ptr) - { - RT_LOCK(s->table_ptr); - rt_lock_table(RT_PRIV(s->table_ptr)); - RT_UNLOCK(s->table_ptr); - } + RT_LOCKED(s->table_ptr, tab) + rt_lock_table(tab); p->table_dump = s; ev_schedule(p->event); @@ -734,14 +715,17 @@ mrt_dump_cont(struct cli *c) cli_printf(c, 0, ""); mrt_table_dump_free(c->rover); - c->cont = c->cleanup = c->rover = NULL; + c->cont = NULL; + c->cleanup = NULL; + c->rover = NULL; } -static void +static int mrt_dump_cleanup(struct cli *c) { mrt_table_dump_free(c->rover); c->rover = NULL; + return 0; } void @@ -765,11 +749,8 @@ mrt_dump_cmd(struct mrt_dump_data *d) s->filename = d->filename; if (s->table_ptr) - { - RT_LOCK(s->table_ptr); - rt_lock_table(RT_PRIV(s->table_ptr)); - RT_UNLOCK(s->table_ptr); - } + RT_LOCKED(s->table_ptr, tab) + rt_lock_table(tab); this_cli->cont = mrt_dump_cont; this_cli->cleanup = mrt_dump_cleanup; @@ -939,7 +920,6 @@ mrt_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUSE struct protocol proto_mrt = { .name = "MRT", .template = "mrt%d", - .class = PROTOCOL_MRT, .proto_size = sizeof(struct mrt_proto), .config_size = sizeof(struct mrt_config), .init = mrt_init, @@ -948,3 +928,9 @@ struct protocol proto_mrt = { .reconfigure = mrt_reconfigure, .copy_config = mrt_copy_config, }; + +void +mrt_build(void) +{ + proto_build(&proto_mrt); +} diff --git a/proto/mrt/mrt.h b/proto/mrt/mrt.h index 04865089..f535a391 100644 --- a/proto/mrt/mrt.h +++ b/proto/mrt/mrt.h @@ -13,7 +13,7 @@ #include "nest/bird.h" #include "nest/protocol.h" #include "lib/lists.h" -#include "nest/route.h" +#include "nest/rt.h" #include "lib/event.h" #include "lib/hash.h" diff --git a/proto/ospf/Makefile b/proto/ospf/Makefile index 39e74f71..015f394a 100644 --- a/proto/ospf/Makefile +++ b/proto/ospf/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/ospf/config.Y b/proto/ospf/config.Y index 4b7d5a36..bc3df8db 100644 --- a/proto/ospf/config.Y +++ b/proto/ospf/config.Y @@ -190,7 +190,7 @@ ospf_check_auth(void) CF_DECLS -CF_KEYWORDS(OSPF, V2, V3, OSPF_METRIC1, OSPF_METRIC2, OSPF_TAG, OSPF_ROUTER_ID) +CF_KEYWORDS(OSPF, V2, V3) CF_KEYWORDS(AREA, NEIGHBORS, RFC1583COMPAT, STUB, TICK, COST, COST2, RETRANSMIT) CF_KEYWORDS(HELLO, TRANSMIT, PRIORITY, DEAD, TYPE, BROADCAST, BCAST, DEFAULT) CF_KEYWORDS(NONBROADCAST, NBMA, POINTOPOINT, PTP, POINTOMULTIPOINT, PTMP) @@ -505,11 +505,6 @@ ospf_iface: ospf_iface_start ospf_iface_patt_list ospf_iface_opt_list { ospf_iface_finish(); } ; -dynamic_attr: OSPF_METRIC1 { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_OSPF_METRIC1); } ; -dynamic_attr: OSPF_METRIC2 { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_OSPF_METRIC2); } ; -dynamic_attr: OSPF_TAG { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_OSPF_TAG); } ; -dynamic_attr: OSPF_ROUTER_ID { $$ = f_new_dynamic_attr(EAF_TYPE_ROUTER_ID, T_QUAD, EA_OSPF_ROUTER_ID); } ; - CF_CLI_HELP(SHOW OSPF, ..., [[Show information about OSPF protocol]]); CF_CLI(SHOW OSPF, optproto, [<name>], [[Show information about OSPF protocol]]) { PROTO_WALK_CMD($3, &proto_ospf, p) ospf_sh(p); }; diff --git a/proto/ospf/ospf.c b/proto/ospf/ospf.c index 16774df6..4e29f960 100644 --- a/proto/ospf/ospf.c +++ b/proto/ospf/ospf.c @@ -106,11 +106,12 @@ #include <stdlib.h> #include "ospf.h" +#include "lib/macro.h" static int ospf_preexport(struct channel *C, rte *new); static void ospf_reload_routes(struct channel *C); static int ospf_rte_better(struct rte *new, struct rte *old); -static u32 ospf_rte_igp_metric(struct rte *rt); +static u32 ospf_rte_igp_metric(const rte *rt); static void ospf_disp(timer *timer); @@ -299,7 +300,7 @@ ospf_start(struct proto *P) p->lsab_size = 256; p->lsab_used = 0; p->lsab = mb_alloc(P->pool, p->lsab_size); - p->nhpool = lp_new(P->pool, 12*sizeof(struct nexthop)); + p->nhpool = lp_new(P->pool); init_list(&(p->iface_list)); init_list(&(p->area_list)); fib_init(&p->rtf, P->pool, ospf_get_af(p), sizeof(ort), OFFSETOF(ort, fn), 0, NULL); @@ -386,23 +387,26 @@ ospf_init(struct proto_config *CF) static int ospf_rte_better(struct rte *new, struct rte *old) { - u32 new_metric1 = ea_get_int(new->attrs->eattrs, EA_OSPF_METRIC1, LSINFINITY); + u32 new_metric1 = ea_get_int(new->attrs, &ea_ospf_metric1, LSINFINITY); if (new_metric1 == LSINFINITY) return 0; - if(new->attrs->source < old->attrs->source) return 1; - if(new->attrs->source > old->attrs->source) return 0; + u32 ns = rt_get_source_attr(new); + u32 os = rt_get_source_attr(old); - if(new->attrs->source == RTS_OSPF_EXT2) + if (ns < os) return 1; + if (ns > os) return 0; + + if (ns == RTS_OSPF_EXT2) { - u32 old_metric2 = ea_get_int(old->attrs->eattrs, EA_OSPF_METRIC2, LSINFINITY); - u32 new_metric2 = ea_get_int(new->attrs->eattrs, EA_OSPF_METRIC2, LSINFINITY); - if(new_metric2 < old_metric2) return 1; - if(new_metric2 > old_metric2) return 0; + u32 old_metric2 = ea_get_int(old->attrs, &ea_ospf_metric2, LSINFINITY); + u32 new_metric2 = ea_get_int(new->attrs, &ea_ospf_metric2, LSINFINITY); + if (new_metric2 < old_metric2) return 1; + if (new_metric2 > old_metric2) return 0; } - u32 old_metric1 = ea_get_int(old->attrs->eattrs, EA_OSPF_METRIC1, LSINFINITY); + u32 old_metric1 = ea_get_int(old->attrs, &ea_ospf_metric1, LSINFINITY); if (new_metric1 < old_metric1) return 1; @@ -410,12 +414,12 @@ ospf_rte_better(struct rte *new, struct rte *old) } static u32 -ospf_rte_igp_metric(struct rte *rt) +ospf_rte_igp_metric(const rte *rt) { - if (rt->attrs->source == RTS_OSPF_EXT2) + if (rt_get_source_attr(rt) == RTS_OSPF_EXT2) return IGP_METRIC_UNKNOWN; - return ea_get_int(rt->attrs->eattrs, EA_OSPF_METRIC1, LSINFINITY); + return ea_get_int(rt->attrs, &ea_ospf_metric1, LSINFINITY); } void @@ -482,13 +486,13 @@ ospf_disp(timer * timer) * import to the filters. */ static int -ospf_preexport(struct channel *c, rte *e) +ospf_preexport(struct channel *C, rte *e) { - struct ospf_proto *p = (struct ospf_proto *) c->proto; + struct ospf_proto *p = (struct ospf_proto *) C->proto; struct ospf_area *oa = ospf_main_area(p); /* Reject our own routes */ - if (e->sender == c->in_req.hook) + if (e->sender == C->in_req.hook) return -1; /* Do not export routes to stub areas */ @@ -531,7 +535,7 @@ ospf_shutdown(struct proto *P) /* Cleanup locked rta entries */ FIB_WALK(&p->rtf, ort, nf) { - rta_free(nf->old_rta); + ea_free(nf->old_ea); } FIB_WALK_END; @@ -570,7 +574,8 @@ ospf_get_route_info(rte * rte, byte * buf) { char *type = "<bug>"; - switch (rte->attrs->source) + uint source = rt_get_source_attr(rte); + switch (source) { case RTS_OSPF: type = "I"; @@ -587,42 +592,26 @@ ospf_get_route_info(rte * rte, byte * buf) } buf += bsprintf(buf, " %s", type); - buf += bsprintf(buf, " (%d/%d", rte->attrs->pref, ea_get_int(rte->attrs->eattrs, EA_OSPF_METRIC1, LSINFINITY)); - if (rte->attrs->source == RTS_OSPF_EXT2) - buf += bsprintf(buf, "/%d", ea_get_int(rte->attrs->eattrs, EA_OSPF_METRIC2, LSINFINITY)); + buf += bsprintf(buf, " (%d/%d", rt_get_preference(rte), ea_get_int(rte->attrs, &ea_ospf_metric1, LSINFINITY)); + if (source == RTS_OSPF_EXT2) + buf += bsprintf(buf, "/%d", ea_get_int(rte->attrs, &ea_ospf_metric2, LSINFINITY)); buf += bsprintf(buf, ")"); - if (rte->attrs->source == RTS_OSPF_EXT1 || rte->attrs->source == RTS_OSPF_EXT2) + if (source == RTS_OSPF_EXT1 || source == RTS_OSPF_EXT2) { - eattr *ea = ea_find(rte->attrs->eattrs, EA_OSPF_TAG); + eattr *ea = ea_find(rte->attrs, &ea_ospf_tag); if (ea && (ea->u.data > 0)) buf += bsprintf(buf, " [%x]", ea->u.data); } - eattr *ea = ea_find(rte->attrs->eattrs, EA_OSPF_ROUTER_ID); + eattr *ea = ea_find(rte->attrs, &ea_ospf_router_id); if (ea) buf += bsprintf(buf, " [%R]", ea->u.data); } -static int -ospf_get_attr(const eattr * a, byte * buf, int buflen UNUSED) +static void +ospf_tag_format(const eattr * a, byte * buf, uint buflen) { - switch (a->id) - { - case EA_OSPF_METRIC1: - bsprintf(buf, "metric1"); - return GA_NAME; - case EA_OSPF_METRIC2: - bsprintf(buf, "metric2"); - return GA_NAME; - case EA_OSPF_TAG: - bsprintf(buf, "tag: 0x%08x", a->u.data); - return GA_FULL; - case EA_OSPF_ROUTER_ID: - bsprintf(buf, "router_id"); - return GA_NAME; - default: - return GA_UNKNOWN; - } + bsnprintf(buf, buflen, "0x%08x", a->u.data); } static void @@ -1526,7 +1515,6 @@ struct rte_owner_class ospf_rte_owner_class = { struct protocol proto_ospf = { .name = "OSPF", .template = "ospf%d", - .class = PROTOCOL_OSPF, .preference = DEF_PREF_OSPF, .channel_mask = NB_IP, .proto_size = sizeof(struct ospf_proto), @@ -1537,5 +1525,38 @@ struct protocol proto_ospf = { .shutdown = ospf_shutdown, .reconfigure = ospf_reconfigure, .get_status = ospf_get_status, - .get_attr = ospf_get_attr, }; + +struct ea_class ea_ospf_metric1 = { + .name = "ospf_metric1", + .type = T_INT, +}; + +struct ea_class ea_ospf_metric2 = { + .name = "ospf_metric2", + .type = T_INT, +}; + +struct ea_class ea_ospf_tag = { + .name = "ospf_tag", + .type = T_INT, + .format = ospf_tag_format, +}; + +struct ea_class ea_ospf_router_id = { + .name = "ospf_router_id", + .type = T_QUAD, +}; + +void +ospf_build(void) +{ + proto_build(&proto_ospf); + + EA_REGISTER_ALL( + &ea_ospf_metric1, + &ea_ospf_metric2, + &ea_ospf_tag, + &ea_ospf_router_id + ); +} diff --git a/proto/ospf/ospf.h b/proto/ospf/ospf.h index a5f83e79..3477ba5a 100644 --- a/proto/ospf/ospf.h +++ b/proto/ospf/ospf.h @@ -22,7 +22,7 @@ #include "lib/resource.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "nest/locks.h" #include "nest/bfd.h" @@ -939,12 +939,7 @@ struct lsadb_show_data { u32 router; /* Advertising router, 0 -> all */ }; - -#define EA_OSPF_METRIC1 EA_CODE(PROTOCOL_OSPF, 0) -#define EA_OSPF_METRIC2 EA_CODE(PROTOCOL_OSPF, 1) -#define EA_OSPF_TAG EA_CODE(PROTOCOL_OSPF, 2) -#define EA_OSPF_ROUTER_ID EA_CODE(PROTOCOL_OSPF, 3) - +extern struct ea_class ea_ospf_metric1, ea_ospf_metric2, ea_ospf_tag, ea_ospf_router_id; /* * For regular networks, neighbor address must match network prefix. diff --git a/proto/ospf/rt.c b/proto/ospf/rt.c index 3e208023..69c2907d 100644 --- a/proto/ospf/rt.c +++ b/proto/ospf/rt.c @@ -28,24 +28,30 @@ nh_is_vlink(struct nexthop *nhs) static inline int unresolved_vlink(ort *ort) { - return ort->n.nhs && nh_is_vlink(ort->n.nhs); + return ort->n.nhs && nh_is_vlink(&ort->n.nhs->nh); } -static inline struct nexthop * +static inline struct nexthop_adata * new_nexthop(struct ospf_proto *p, ip_addr gw, struct iface *iface, byte weight) { - struct nexthop *nh = lp_allocz(p->nhpool, sizeof(struct nexthop)); - nh->gw = gw; - nh->iface = iface; - nh->weight = weight; - return nh; + struct nexthop_adata *nhad = lp_alloc(p->nhpool, sizeof(struct nexthop_adata)); + *nhad = (struct nexthop_adata) { + .ad = { .length = sizeof *nhad - sizeof nhad->ad, }, + .nh = { + .gw = gw, + .iface = iface, + .weight = weight, + }, + }; + + return nhad; } /* Returns true if there are device nexthops in n */ static inline int -has_device_nexthops(const struct nexthop *n) +has_device_nexthops(struct nexthop_adata *nhad) { - for (; n; n = n->next) + NEXTHOP_WALK(n, nhad) if (ipa_zero(n->gw)) return 1; @@ -53,38 +59,22 @@ has_device_nexthops(const struct nexthop *n) } /* Replace device nexthops with nexthops to gw */ -static struct nexthop * -fix_device_nexthops(struct ospf_proto *p, const struct nexthop *n, ip_addr gw) +static struct nexthop_adata * +fix_device_nexthops(struct ospf_proto *p, struct nexthop_adata *old, ip_addr gw) { - struct nexthop *root1 = NULL; - struct nexthop *root2 = NULL; - struct nexthop **nn1 = &root1; - struct nexthop **nn2 = &root2; - if (!p->ecmp) - return new_nexthop(p, gw, n->iface, n->weight); - - /* This is a bit tricky. We cannot just copy the list and update n->gw, - because the list should stay sorted, so we create two lists, one with new - gateways and one with old ones, and then merge them. */ - - for (; n; n = n->next) { - struct nexthop *nn = new_nexthop(p, ipa_zero(n->gw) ? gw : n->gw, n->iface, n->weight); + struct nexthop_adata *new = (struct nexthop_adata *) lp_store_adata(p->nhpool, old->ad.data, old->ad.length); + new->nh.gw = gw; + return new; + } + struct nexthop_adata *tmp = (struct nexthop_adata *) tmp_copy_adata(&old->ad); + NEXTHOP_WALK(n, tmp) if (ipa_zero(n->gw)) - { - *nn1 = nn; - nn1 = &(nn->next); - } - else - { - *nn2 = nn; - nn2 = &(nn->next); - } - } + n->gw = gw; - return nexthop_merge(root1, root2, 1, 1, p->ecmp, p->nhpool); + return nexthop_sort(tmp, p->nhpool); } @@ -169,9 +159,9 @@ orta_compare(const struct ospf_proto *p, const orta *new, const orta *old) return -1; if (!new->nhs) return 1; - if (nh_is_vlink(new->nhs)) + if (nh_is_vlink(&new->nhs->nh)) return -1; - if (nh_is_vlink(old->nhs)) + if (nh_is_vlink(&old->nhs->nh)) return 1; @@ -279,11 +269,7 @@ ort_merge(struct ospf_proto *p, ort *o, const orta *new) orta *old = &o->n; if (old->nhs != new->nhs) - { - old->nhs = nexthop_merge(old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse, - p->ecmp, p->nhpool); - old->nhs_reuse = 1; - } + old->nhs = nexthop_merge(old->nhs, new->nhs, p->ecmp, p->nhpool); if (old->rid < new->rid) old->rid = new->rid; @@ -295,11 +281,7 @@ ort_merge_ext(struct ospf_proto *p, ort *o, const orta *new) orta *old = &o->n; if (old->nhs != new->nhs) - { - old->nhs = nexthop_merge(old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse, - p->ecmp, p->nhpool); - old->nhs_reuse = 1; - } + old->nhs = nexthop_merge(old->nhs, new->nhs, p->ecmp, p->nhpool); if (old->tag != new->tag) old->tag = 0; @@ -1165,7 +1147,7 @@ ospf_check_vlinks(struct ospf_proto *p) if (tmp && (tmp->color == INSPF) && ipa_nonzero(tmp->lb) && tmp->nhs) { - struct ospf_iface *nhi = ospf_iface_find(p, tmp->nhs->iface); + struct ospf_iface *nhi = ospf_iface_find(p, tmp->nhs->nh.iface); if ((ifa->state != OSPF_IS_PTP) || (ifa->vifa != nhi) @@ -1579,10 +1561,7 @@ ospf_ext_spf(struct ospf_proto *p) /* Replace device nexthops with nexthops to forwarding address from LSA */ if (has_device_nexthops(nfa.nhs)) - { nfa.nhs = fix_device_nexthops(p, nfa.nhs, rt.fwaddr); - nfa.nhs_reuse = 1; - } } if (rt.ebit) @@ -1726,10 +1705,10 @@ ospf_rt_spf(struct ospf_proto *p) static inline int -inherit_nexthops(struct nexthop *pn) +inherit_nexthops(struct nexthop_adata *pn) { /* Proper nexthops (with defined GW) or dummy vlink nexthops (without iface) */ - return pn && (ipa_nonzero(pn->gw) || !pn->iface); + return pn && (ipa_nonzero(pn->nh.gw) || !pn->nh.iface); } static inline ip_addr @@ -1744,12 +1723,12 @@ link_lsa_lladdr(struct ospf_proto *p, struct top_hash_entry *en) return ospf_is_ip4(p) ? ipa_from_ip4(ospf3_6to4(ll)) : ipa_from_ip6(ll); } -static struct nexthop * +static struct nexthop_adata * calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry *par, int pos, uint data, uint lif, uint nif) { struct ospf_proto *p = oa->po; - struct nexthop *pn = par->nhs; + struct nexthop_adata *pn = par->nhs; struct top_hash_entry *link = NULL; struct ospf_iface *ifa = NULL; ip_addr nh = IPA_NONE; @@ -1827,10 +1806,10 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, return NULL; } - struct nexthop *nhs = new_nexthop(p, nh, ifa->iface, ifa->ecmp_weight); + struct nexthop_adata *nhs = new_nexthop(p, nh, ifa->iface, ifa->ecmp_weight); if (ifa->addr->flags & IA_HOST) - nhs->flags = RNF_ONLINK; + nhs->nh.flags = RNF_ONLINK; return nhs; } @@ -1851,7 +1830,7 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, if (ipa_zero(en->lb)) goto bad; - return new_nexthop(p, en->lb, pn->iface, pn->weight); + return new_nexthop(p, en->lb, pn->nh.iface, pn->nh.weight); } else /* OSPFv3 */ { @@ -1859,7 +1838,7 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, * Next-hop is taken from lladdr field of Link-LSA, en->lb_id * is computed in link_back(). */ - link = ospf_hash_find(p->gr, pn->iface->index, en->lb_id, rid, LSA_T_LINK); + link = ospf_hash_find(p->gr, pn->nh.iface->index, en->lb_id, rid, LSA_T_LINK); if (!link) return NULL; @@ -1867,7 +1846,7 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, if (ipa_zero(nh)) return NULL; - return new_nexthop(p, nh, pn->iface, pn->weight); + return new_nexthop(p, nh, pn->nh.iface, pn->nh.weight); } } @@ -1914,7 +1893,7 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry if (!link_back(oa, en, par, lif, nif)) return; - struct nexthop *nhs = calc_next_hop(oa, en, par, pos, data, lif, nif); + struct nexthop_adata *nhs = calc_next_hop(oa, en, par, pos, data, lif, nif); if (!nhs) { log(L_WARN "%s: Cannot find next hop for LSA (Type: %04x, Id: %R, Rt: %R)", @@ -1923,7 +1902,7 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry } /* If en->dist > 0, we know that en->color == CANDIDATE and en->nhs is defined. */ - if ((dist == en->dist) && !nh_is_vlink(en->nhs)) + if ((dist == en->dist) && !nh_is_vlink(&en->nhs->nh)) { /* * For multipath, we should merge nexthops. We merge regular nexthops only. @@ -1947,13 +1926,11 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry */ /* Keep old ones */ - if (!p->ecmp || nh_is_vlink(nhs) || (nhs == en->nhs)) + if (!p->ecmp || nh_is_vlink(&nhs->nh) || (nhs == en->nhs)) return; /* Merge old and new */ - int new_reuse = (par->nhs != nhs); - en->nhs = nexthop_merge(en->nhs, nhs, en->nhs_reuse, new_reuse, p->ecmp, p->nhpool); - en->nhs_reuse = 1; + en->nhs = nexthop_merge(en->nhs, nhs, p->ecmp, p->nhpool); return; } @@ -1967,7 +1944,6 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry en->nhs = nhs; en->dist = dist; en->color = CANDIDATE; - en->nhs_reuse = (par->nhs != nhs); prev = NULL; @@ -2001,14 +1977,34 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry } static inline int -ort_changed(ort *nf, rta *nr) +ort_changed(ort *nf, ea_list *nr) { - rta *or = nf->old_rta; - return !or || + ea_list *or = nf->old_ea; + + if (!or || (nf->n.metric1 != nf->old_metric1) || (nf->n.metric2 != nf->old_metric2) || - (nf->n.tag != nf->old_tag) || (nf->n.rid != nf->old_rid) || - (nr->source != or->source) || (nr->dest != or->dest) || - !nexthop_same(&(nr->nh), &(or->nh)); + (nf->n.tag != nf->old_tag) || (nf->n.rid != nf->old_rid)) + return 1; + + eattr *nhea_n = ea_find(nr, &ea_gen_nexthop); + eattr *nhea_o = ea_find(or, &ea_gen_nexthop); + if (!nhea_n != !nhea_o) + return 1; + + if (nhea_n && nhea_o) + { + struct nexthop_adata *nhad_n = (struct nexthop_adata *) nhea_n->u.ptr; + struct nexthop_adata *nhad_o = (struct nexthop_adata *) nhea_o->u.ptr; + + if (!nexthop_same(nhad_n, nhad_o)) + return 1; + } + + if ( ea_get_int(nr, &ea_gen_source, 0) + != ea_get_int(or, &ea_gen_source, 0)) + return 1; + + return 0; } static void @@ -2030,10 +2026,9 @@ again1: FIB_ITERATE_START(fib, &fit, ort, nf) { /* Sanity check of next-hop addresses, failure should not happen */ - if (nf->n.type) + if (nf->n.type && nf->n.nhs) { - struct nexthop *nh; - for (nh = nf->n.nhs; nh; nh = nh->next) + NEXTHOP_WALK(nh, nf->n.nhs) if (ipa_nonzero(nh->gw)) { neighbor *nbr = neigh_find(&p->p, nh->gw, nh->iface, @@ -2052,69 +2047,67 @@ again1: if (nf->n.type) /* Add the route */ { - rta a0 = { - .source = nf->n.type, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, - .nh = *(nf->n.nhs), - .pref = p->p.main_channel->preference, - }; - - if (reload || ort_changed(nf, &a0)) - { - a0.eattrs = alloca(sizeof(ea_list) + 4 * sizeof(eattr)); - memset(a0.eattrs, 0, sizeof(ea_list)); + struct { + ea_list l; + eattr a[7]; + } eattrs; + eattrs.l = (ea_list) {}; + + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_gen_preference, 0, p->p.main_channel->preference); + + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_gen_source, 0, nf->n.type); + + eattrs.a[eattrs.l.count++] = + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nf->n.nhs->ad); + + if (reload || ort_changed(nf, &eattrs.l)) + { nf->old_metric1 = nf->n.metric1; nf->old_metric2 = nf->n.metric2; nf->old_tag = nf->n.tag; nf->old_rid = nf->n.rid; - a0.eattrs->attrs[a0.eattrs->count++] = (eattr) { - .id = EA_OSPF_METRIC1, - .type = EAF_TYPE_INT, - .u.data = nf->n.metric1, - }; + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_ospf_metric1, 0, nf->n.metric1); if (nf->n.type == RTS_OSPF_EXT2) - a0.eattrs->attrs[a0.eattrs->count++] = (eattr) { - .id = EA_OSPF_METRIC2, - .type = EAF_TYPE_INT, - .u.data = nf->n.metric2, - }; + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_ospf_metric2, 0, nf->n.metric2); if ((nf->n.type == RTS_OSPF_EXT1) || (nf->n.type == RTS_OSPF_EXT2)) - a0.eattrs->attrs[a0.eattrs->count++] = (eattr) { - .id = EA_OSPF_TAG, - .type = EAF_TYPE_INT, - .u.data = nf->n.tag, - }; - - a0.eattrs->attrs[a0.eattrs->count++] = (eattr) { - .id = EA_OSPF_ROUTER_ID, - .type = EAF_TYPE_ROUTER_ID, - .u.data = nf->n.rid, - }; + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_ospf_tag, 0, nf->n.tag); + + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_ospf_router_id, 0, nf->n.rid); - rta_free(nf->old_rta); - nf->old_rta = rta_lookup(&a0); + ASSERT_DIE(ARRAY_SIZE(eattrs.a) >= eattrs.l.count); + + ea_list *eal = ea_lookup(&eattrs.l, 0); + ea_free(nf->old_ea); + nf->old_ea = eal; rte e0 = { - .attrs = nf->old_rta, + .attrs = eal, .src = p->p.main_source, }; + /* DBG("Mod rte type %d - %N via %I on iface %s, met %d\n", a0.source, nf->fn.addr, a0.gw, a0.iface ? a0.iface->name : "(none)", nf->n.metric1); + */ rte_update(p->p.main_channel, nf->fn.addr, &e0, p->p.main_source); } } - else if (nf->old_rta) + else if (nf->old_ea) { /* Remove the route */ - rta_free(nf->old_rta); - nf->old_rta = NULL; + rta_free(nf->old_ea); + nf->old_ea = NULL; rte_update(p->p.main_channel, nf->fn.addr, NULL, p->p.main_source); } diff --git a/proto/ospf/rt.h b/proto/ospf/rt.h index 094e125b..88eefef9 100644 --- a/proto/ospf/rt.h +++ b/proto/ospf/rt.h @@ -18,8 +18,6 @@ typedef struct orta { u8 type; /* RTS_OSPF_* */ - u8 nhs_reuse; /* Whether nhs nodes can be reused during merging. - See a note in rt.c:add_cand() */ u32 options; /* * For ORT_ROUTER routes, options field are router-LSA style @@ -53,7 +51,7 @@ typedef struct orta struct ospf_area *oa; struct ospf_area *voa; /* Used when route is replaced in ospf_rt_sum_tr(), NULL otherwise */ - struct nexthop *nhs; /* Next hops computed during SPF */ + struct nexthop_adata *nhs; /* Next hops computed during SPF */ struct top_hash_entry *en; /* LSA responsible for this orta */ } orta; @@ -80,7 +78,7 @@ typedef struct ort */ orta n; u32 old_metric1, old_metric2, old_tag, old_rid; - rta *old_rta; + ea_list *old_ea; u32 lsa_id; u8 external_rte; u8 area_net; diff --git a/proto/ospf/topology.c b/proto/ospf/topology.c index bb88d20a..85bce03d 100644 --- a/proto/ospf/topology.c +++ b/proto/ospf/topology.c @@ -1337,9 +1337,9 @@ ospf_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *n, rt ASSERT(p->asbr); /* Get route attributes */ - rta *a = new->attrs; - eattr *m1a = ea_find(a->eattrs, EA_OSPF_METRIC1); - eattr *m2a = ea_find(a->eattrs, EA_OSPF_METRIC2); + ea_list *a = new->attrs; + eattr *m1a = ea_find(a, &ea_ospf_metric1); + eattr *m2a = ea_find(a, &ea_ospf_metric2); uint m1 = m1a ? m1a->u.data : 0; uint m2 = m2a ? m2a->u.data : 10000; @@ -1363,11 +1363,14 @@ ospf_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *n, rt uint ebit = m2a || !m1a; uint metric = ebit ? m2 : m1; - uint tag = ea_get_int(a->eattrs, EA_OSPF_TAG, 0); + uint tag = ea_get_int(a, &ea_ospf_tag, 0); ip_addr fwd = IPA_NONE; - if ((a->dest == RTD_UNICAST) && use_gw_for_fwaddr(p, a->nh.gw, a->nh.iface)) - fwd = a->nh.gw; + eattr *nhea = ea_find(a, &ea_gen_nexthop); + struct nexthop_adata *nhad = (struct nexthop_adata *) nhea->u.ptr; + if (NEXTHOP_IS_REACHABLE(nhad)) + if (use_gw_for_fwaddr(p, nhad->nh.gw, nhad->nh.iface)) + fwd = nhad->nh.gw; /* NSSA-LSA with P-bit set must have non-zero forwarding address */ if (oa && ipa_zero(fwd)) @@ -2135,7 +2138,7 @@ ospf_hash_delete(struct top_graph *f, struct top_hash_entry *e) if (*ee == e) { *ee = e->next; - sl_free(f->hash_slab, e); + sl_free(e); if (f->hash_entries-- < f->hash_entries_min) ospf_top_rehash(f, -HASH_LO_STEP); return; diff --git a/proto/ospf/topology.h b/proto/ospf/topology.h index c36d0b50..3c92b431 100644 --- a/proto/ospf/topology.h +++ b/proto/ospf/topology.h @@ -28,7 +28,7 @@ struct top_hash_entry u16 next_lsa_opts; /* For postponed LSA origination */ btime inst_time; /* Time of installation into DB */ struct ort *nf; /* Reference fibnode for sum and ext LSAs, NULL for otherwise */ - struct nexthop *nhs; /* Computed nexthops - valid only in ospf_rt_spf() */ + struct nexthop_adata *nhs; /* Computed nexthops - valid only in ospf_rt_spf() */ ip_addr lb; /* In OSPFv2, link back address. In OSPFv3, any global address in the area useful for vlinks */ u32 lb_id; /* Interface ID of link back iface (for bcast or NBMA networks) */ u32 dist; /* Distance from the root */ @@ -39,8 +39,6 @@ struct top_hash_entry #define CANDIDATE 1 #define INSPF 2 u8 mode; /* LSA generated during RT calculation (LSA_RTCALC or LSA_STALE)*/ - u8 nhs_reuse; /* Whether nhs nodes can be reused during merging. - See a note in rt.c:add_cand() */ }; diff --git a/proto/perf/perf.c b/proto/perf/perf.c index aa688d88..9adafe5a 100644 --- a/proto/perf/perf.c +++ b/proto/perf/perf.c @@ -18,7 +18,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "conf/conf.h" #include "filter/filter.h" @@ -85,7 +85,7 @@ random_net_ip4(void) } struct perf_random_routes { - struct rta *a; + ea_list *a; net_addr net; }; @@ -142,17 +142,21 @@ perf_loop(void *data) *((net_addr_ip4 *) &(p->data[i].net)) = random_net_ip4(); if (!p->attrs_per_rte || !(i % p->attrs_per_rte)) { - struct rta a0 = { - .source = RTS_PERF, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, - .pref = p->p.main_channel->preference, + ea_list *ea = NULL; + + ea_set_attr_u32(&ea, &ea_gen_preference, 0, p->p.main_channel->preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_PERF); + + struct nexthop_adata nhad = { .nh.iface = p->ifa->iface, .nh.gw = gw, .nh.weight = 1, }; - p->data[i].a = rta_lookup(&a0); + ea_set_attr_data(&ea, &ea_gen_nexthop, 0, + &nhad.ad.data, sizeof nhad - sizeof nhad.ad); + + p->data[i].a = rta_lookup(ea, 0); } else p->data[i].a = rta_clone(p->data[i-1].a); @@ -198,9 +202,9 @@ perf_loop(void *data) p->exp++; } - RT_LOCK(P->main_channel->table); - rt_schedule_prune(RT_PRIV(P->main_channel->table)); - RT_UNLOCK(P->main_channel->table); + RT_LOCKED(P->main_channel->table, tab) + rt_schedule_prune(tab); + ev_schedule(p->loop); } @@ -307,7 +311,6 @@ perf_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUS struct protocol proto_perf = { .name = "Perf", .template = "perf%d", - .class = PROTOCOL_PERF, .channel_mask = NB_IP, .proto_size = sizeof(struct perf_proto), .config_size = sizeof(struct perf_config), @@ -316,3 +319,9 @@ struct protocol proto_perf = { .reconfigure = perf_reconfigure, .copy_config = perf_copy_config, }; + +void +perf_build(void) +{ + proto_build(&proto_perf); +} diff --git a/proto/pipe/Makefile b/proto/pipe/Makefile index 5093da98..0d68db4c 100644 --- a/proto/pipe/Makefile +++ b/proto/pipe/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/pipe/config.Y b/proto/pipe/config.Y index fc08445f..444de127 100644 --- a/proto/pipe/config.Y +++ b/proto/pipe/config.Y @@ -42,6 +42,12 @@ pipe_proto: pipe_proto_start '{' | pipe_proto proto_item ';' | pipe_proto channel_item_ ';' + | pipe_proto IMPORT IN net_any imexport ';' { + if (this_channel->net_type && ($4->type != this_channel->net_type)) + cf_error("Incompatible export prefilter type"); + PIPE_CFG->in_subprefix = $4; + this_channel->in_filter = $5; + } | pipe_proto PEER TABLE rtable ';' { PIPE_CFG->peer = $4; } | pipe_proto MAX GENERATION expr ';' { if (($4 < 1) || ($4 > 254)) cf_error("Max generation must be in range 1..254, got %u", $4); diff --git a/proto/pipe/pipe.c b/proto/pipe/pipe.c index a30da0e2..0b0d9151 100644 --- a/proto/pipe/pipe.c +++ b/proto/pipe/pipe.c @@ -35,7 +35,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "conf/conf.h" #include "filter/filter.h" @@ -52,24 +52,28 @@ pipe_rt_notify(struct proto *P, struct channel *src_ch, const net_addr *n, rte * { struct pipe_proto *p = (void *) P; struct channel *dst = (src_ch == p->pri) ? p->sec : p->pri; + uint *flags = (src_ch == p->pri) ? &p->sec_flags : &p->pri_flags; if (!new && !old) return; + /* Start the route refresh if requested to */ + if (*flags & PIPE_FL_RR_BEGIN_PENDING) + { + *flags &= ~PIPE_FL_RR_BEGIN_PENDING; + rt_refresh_begin(&dst->in_req); + } + if (new) { - rta *a = alloca(rta_size(new->attrs)); - memcpy(a, new->attrs, rta_size(new->attrs)); - - a->cached = 0; - a->hostentry = NULL; - rte e0 = { - .attrs = a, + .attrs = new->attrs, .src = new->src, .generation = new->generation + 1, }; + ea_unset_attr(&e0.attrs, 0, &ea_gen_hostentry); + rte_update(dst, n, &e0, new->src); } else @@ -77,12 +81,12 @@ pipe_rt_notify(struct proto *P, struct channel *src_ch, const net_addr *n, rte * } static int -pipe_preexport(struct channel *c, rte *e) +pipe_preexport(struct channel *C, rte *e) { - struct pipe_proto *p = (void *) c->proto; + struct pipe_proto *p = (void *) C->proto; /* Avoid direct loopbacks */ - if (e->sender == c->in_req.hook) + if (e->sender == C->in_req.hook) return -1; /* Indirection check */ @@ -90,8 +94,8 @@ pipe_preexport(struct channel *c, rte *e) if (e->generation >= max_generation) { log_rl(&p->rl_gen, L_ERR "Route overpiped (%u hops of %u configured in %s) in table %s: %N %s/%u:%u", - e->generation, max_generation, c->proto->name, - c->table->name, e->net, e->src->owner->name, e->src->private_id, e->src->global_id); + e->generation, max_generation, C->proto->name, + C->table->name, e->net, e->src->owner->name, e->src->private_id, e->src->global_id); return -1; } @@ -108,6 +112,32 @@ pipe_reload_routes(struct channel *C) channel_request_feeding((C == p->pri) ? p->sec : p->pri); } +static void +pipe_feed_begin(struct channel *C, int initial UNUSED) +{ + struct pipe_proto *p = (void *) C->proto; + uint *flags = (C == p->pri) ? &p->sec_flags : &p->pri_flags; + + *flags |= PIPE_FL_RR_BEGIN_PENDING; +} + +static void +pipe_feed_end(struct channel *C) +{ + struct pipe_proto *p = (void *) C->proto; + struct channel *dst = (C == p->pri) ? p->sec : p->pri; + uint *flags = (C == p->pri) ? &p->sec_flags : &p->pri_flags; + + /* If not even started, start the RR now */ + if (*flags & PIPE_FL_RR_BEGIN_PENDING) + { + *flags &= ~PIPE_FL_RR_BEGIN_PENDING; + rt_refresh_begin(&dst->in_req); + } + + /* Finish RR always */ + rt_refresh_end(&dst->in_req); +} static void pipe_postconfig(struct proto_config *CF) @@ -127,10 +157,16 @@ pipe_postconfig(struct proto_config *CF) if (cc->table->addr_type != cf->peer->addr_type) cf_error("Primary table and peer table must have the same type"); + if (cc->out_subprefix && (cc->table->addr_type != cc->out_subprefix->type)) + cf_error("Export subprefix must match table type"); + + if (cf->in_subprefix && (cc->table->addr_type != cf->in_subprefix->type)) + cf_error("Import subprefix must match table type"); + if (cc->rx_limit.action) cf_error("Pipe protocol does not support receive limits"); - if (cc->in_keep_filtered) + if (cc->in_keep) cf_error("Pipe protocol prohibits keeping filtered routes"); cc->debug = cf->c.debug; @@ -146,6 +182,7 @@ pipe_configure_channels(struct pipe_proto *p, struct pipe_config *cf) .channel = cc->channel, .table = cc->table, .out_filter = cc->out_filter, + .out_subprefix = cc->out_subprefix, .in_limit = cc->in_limit, .ra_mode = RA_ANY, .debug = cc->debug, @@ -157,6 +194,7 @@ pipe_configure_channels(struct pipe_proto *p, struct pipe_config *cf) .channel = cc->channel, .table = cf->peer, .out_filter = cc->in_filter, + .out_subprefix = cf->in_subprefix, .in_limit = cc->out_limit, .ra_mode = RA_ANY, .debug = cc->debug, @@ -178,6 +216,8 @@ pipe_init(struct proto_config *CF) P->rt_notify = pipe_rt_notify; P->preexport = pipe_preexport; P->reload_routes = pipe_reload_routes; + P->feed_begin = pipe_feed_begin; + P->feed_end = pipe_feed_end; p->rl_gen = (struct tbf) TBF_DEFAULT_LOG_LIMITS; @@ -299,7 +339,6 @@ pipe_update_debug(struct proto *P) struct protocol proto_pipe = { .name = "Pipe", .template = "pipe%d", - .class = PROTOCOL_PIPE, .proto_size = sizeof(struct pipe_proto), .config_size = sizeof(struct pipe_config), .postconfig = pipe_postconfig, @@ -309,3 +348,9 @@ struct protocol proto_pipe = { .get_status = pipe_get_status, .show_proto_info = pipe_show_proto_info }; + +void +pipe_build(void) +{ + proto_build(&proto_pipe); +} diff --git a/proto/pipe/pipe.h b/proto/pipe/pipe.h index 60c857eb..501b8565 100644 --- a/proto/pipe/pipe.h +++ b/proto/pipe/pipe.h @@ -12,6 +12,7 @@ struct pipe_config { struct proto_config c; struct rtable_config *peer; /* Table we're connected to */ + const net_addr *in_subprefix; u8 max_generation; }; @@ -19,7 +20,11 @@ struct pipe_proto { struct proto p; struct channel *pri; struct channel *sec; + uint pri_flags; + uint sec_flags; struct tbf rl_gen; }; +#define PIPE_FL_RR_BEGIN_PENDING 1 /* Route refresh should start with the first route notified */ + #endif diff --git a/proto/radv/Makefile b/proto/radv/Makefile index 05317eff..5c56fbf3 100644 --- a/proto/radv/Makefile +++ b/proto/radv/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/radv/config.Y b/proto/radv/config.Y index 8d4a3ab9..fb68d2e5 100644 --- a/proto/radv/config.Y +++ b/proto/radv/config.Y @@ -33,7 +33,7 @@ CF_KEYWORDS(RADV, PREFIX, INTERFACE, MIN, MAX, RA, DELAY, INTERVAL, SOLICITED, RETRANS, TIMER, CURRENT, HOP, LIMIT, DEFAULT, VALID, PREFERRED, MULT, LIFETIME, SKIP, ONLINK, AUTONOMOUS, RDNSS, DNSSL, NS, DOMAIN, LOCAL, TRIGGER, SENSITIVE, PREFERENCE, LOW, MEDIUM, HIGH, PROPAGATE, ROUTE, - ROUTES, RA_PREFERENCE, RA_LIFETIME) + ROUTES) CF_ENUM(T_ENUM_RA_PREFERENCE, RA_PREF_, LOW, MEDIUM, HIGH) @@ -336,9 +336,6 @@ radv_sensitive: | SENSITIVE bool { $$ = $2; } ; -dynamic_attr: RA_PREFERENCE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_ENUM_RA_PREFERENCE, EA_RA_PREFERENCE); } ; -dynamic_attr: RA_LIFETIME { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_RA_LIFETIME); } ; - CF_CODE CF_END diff --git a/proto/radv/radv.c b/proto/radv/radv.c index d572c1b7..10d5e3ed 100644 --- a/proto/radv/radv.c +++ b/proto/radv/radv.c @@ -10,6 +10,7 @@ #include <stdlib.h> #include "radv.h" +#include "lib/macro.h" /** * DOC: Router Advertisements @@ -42,6 +43,8 @@ * RFC 6106 - DNS extensions (RDDNS, DNSSL) */ +static struct ea_class ea_radv_preference, ea_radv_lifetime; + static void radv_prune_prefixes(struct radv_iface *ifa); static void radv_prune_routes(struct radv_proto *p); @@ -391,10 +394,10 @@ radv_net_match_trigger(struct radv_config *cf, const net_addr *n) } int -radv_preexport(struct channel *c, rte *new) +radv_preexport(struct channel *C, rte *new) { // struct radv_proto *p = (struct radv_proto *) P; - struct radv_config *cf = (struct radv_config *) (c->proto->cf); + struct radv_config *cf = (struct radv_config *) (C->proto->cf); if (radv_net_match_trigger(cf, new->net)) return RIC_PROCESS; @@ -444,11 +447,11 @@ radv_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *n, rt { /* Update */ - ea = ea_find(new->attrs->eattrs, EA_RA_PREFERENCE); + ea = ea_find(new->attrs, &ea_radv_preference); uint preference = ea ? ea->u.data : RA_PREF_MEDIUM; uint preference_set = !!ea; - ea = ea_find(new->attrs->eattrs, EA_RA_LIFETIME); + ea = ea_find(new->attrs, &ea_radv_lifetime); uint lifetime = ea ? ea->u.data : 0; uint lifetime_set = !!ea; @@ -555,10 +558,7 @@ radv_check_active(struct radv_proto *p) return 1; struct channel *c = p->p.main_channel; - RT_LOCK(c->table); - int active = rt_examine(RT_PRIV(c->table), &cf->trigger, c, c->out_filter); - RT_UNLOCK(c->table); - return active; + return rt_examine(c->table, &cf->trigger, c, c->out_filter); } static void @@ -741,27 +741,26 @@ radv_pref_str(u32 pref) } } -/* The buffer has some minimal size */ -static int -radv_get_attr(const eattr *a, byte *buf, int buflen UNUSED) +static void +radv_preference_format(const eattr *a, byte *buf, uint buflen) { - switch (a->id) - { - case EA_RA_PREFERENCE: - bsprintf(buf, "preference: %s", radv_pref_str(a->u.data)); - return GA_FULL; - case EA_RA_LIFETIME: - bsprintf(buf, "lifetime"); - return GA_NAME; - default: - return GA_UNKNOWN; - } + bsnprintf(buf, buflen, "%s", radv_pref_str(a->u.data)); } +static struct ea_class ea_radv_preference = { + .name = "radv_preference", + .type = T_ENUM_RA_PREFERENCE, + .format = radv_preference_format, +}; + +static struct ea_class ea_radv_lifetime = { + .name = "radv_lifetime", + .type = T_INT, +}; + struct protocol proto_radv = { .name = "RAdv", .template = "radv%d", - .class = PROTOCOL_RADV, .channel_mask = NB_IP6, .proto_size = sizeof(struct radv_proto), .config_size = sizeof(struct radv_config), @@ -772,5 +771,15 @@ struct protocol proto_radv = { .reconfigure = radv_reconfigure, .copy_config = radv_copy_config, .get_status = radv_get_status, - .get_attr = radv_get_attr }; + +void +radv_build(void) +{ + proto_build(&proto_radv); + + EA_REGISTER_ALL( + &ea_radv_preference, + &ea_radv_lifetime + ); +} diff --git a/proto/radv/radv.h b/proto/radv/radv.h index 14d40f8a..c9219bda 100644 --- a/proto/radv/radv.h +++ b/proto/radv/radv.h @@ -19,7 +19,7 @@ #include "lib/resource.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "nest/locks.h" #include "conf/conf.h" @@ -195,10 +195,6 @@ struct radv_iface #define RA_PREF_HIGH 0x08 #define RA_PREF_MASK 0x18 -/* Attributes */ -#define EA_RA_PREFERENCE EA_CODE(PROTOCOL_RADV, 0) -#define EA_RA_LIFETIME EA_CODE(PROTOCOL_RADV, 1) - #ifdef LOCAL_DEBUG #define RADV_FORCE_DEBUG 1 #else diff --git a/proto/rip/Makefile b/proto/rip/Makefile index 7feabcd8..f4a6fa72 100644 --- a/proto/rip/Makefile +++ b/proto/rip/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/rip/config.Y b/proto/rip/config.Y index 28ee9609..3c0973b1 100644 --- a/proto/rip/config.Y +++ b/proto/rip/config.Y @@ -37,7 +37,7 @@ CF_KEYWORDS(RIP, NG, ECMP, LIMIT, WEIGHT, INFINITY, METRIC, UPDATE, TIMEOUT, PASSIVE, VERSION, SPLIT, HORIZON, POISON, REVERSE, CHECK, ZERO, TIME, BFD, AUTHENTICATION, NONE, PLAINTEXT, CRYPTOGRAPHIC, MD5, TTL, SECURITY, RX, TX, BUFFER, LENGTH, PRIORITY, ONLY, LINK, - DEMAND, CIRCUIT, RIP_METRIC, RIP_TAG) + DEMAND, CIRCUIT) %type <i> rip_variant rip_auth @@ -190,9 +190,6 @@ rip_iface: rip_iface_start iface_patt_list_nopx rip_iface_opt_list rip_iface_finish; -dynamic_attr: RIP_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_RIP_METRIC); } ; -dynamic_attr: RIP_TAG { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_RIP_TAG); } ; - CF_CLI_HELP(SHOW RIP, ..., [[Show information about RIP protocol]]); CF_CLI(SHOW RIP INTERFACES, optproto opttext, [<name>] [\"<interface>\"], [[Show information about RIP interfaces]]) diff --git a/proto/rip/rip.c b/proto/rip/rip.c index ee05f058..183fc265 100644 --- a/proto/rip/rip.c +++ b/proto/rip/rip.c @@ -78,6 +78,7 @@ #include <stdlib.h> #include "rip.h" +#include "lib/macro.h" static inline void rip_lock_neighbor(struct rip_neighbor *n); @@ -88,6 +89,7 @@ static inline void rip_iface_kick_timer(struct rip_iface *ifa); static void rip_iface_timer(timer *timer); static void rip_trigger_update(struct rip_proto *p); +static struct ea_class ea_rip_metric, ea_rip_tag, ea_rip_from; /* * RIP routes @@ -108,14 +110,14 @@ rip_add_rte(struct rip_proto *p, struct rip_rte **rp, struct rip_rte *src) } static inline void -rip_remove_rte(struct rip_proto *p, struct rip_rte **rp) +rip_remove_rte(struct rip_proto *p UNUSED, struct rip_rte **rp) { struct rip_rte *rt = *rp; rip_unlock_neighbor(rt->from); *rp = rt->next; - sl_free(p->rte_slab, rt); + sl_free(rt); } static inline int rip_same_rte(struct rip_rte *a, struct rip_rte *b) @@ -124,6 +126,11 @@ static inline int rip_same_rte(struct rip_rte *a, struct rip_rte *b) static inline int rip_valid_rte(struct rip_rte *rt) { return rt->from->ifa != NULL; } +struct rip_iface_adata { + struct adata ad; + struct iface *iface; +}; + /** * rip_announce_rte - announce route from RIP routing table to the core * @p: RIP instance @@ -144,71 +151,87 @@ rip_announce_rte(struct rip_proto *p, struct rip_entry *en) if (rt) { /* Update */ - rta a0 = { - .pref = p->p.main_channel->preference, - .source = RTS_RIP, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, + struct { + ea_list l; + eattr a[3]; + } ea_block = { + .l.count = ARRAY_SIZE(ea_block.a), + .a = { + EA_LITERAL_EMBEDDED(&ea_gen_preference, 0, p->p.main_channel->preference), + EA_LITERAL_EMBEDDED(&ea_gen_source, 0, RTS_RIP), + EA_LITERAL_EMBEDDED(&ea_rip_metric, 0, rt->metric), + }, }; - u8 rt_metric = rt->metric; + ea_list *ea = &ea_block.l; + u16 rt_tag = rt->tag; + struct iface *rt_from = NULL; if (p->ecmp) { /* ECMP route */ - struct nexthop *nhs = NULL; int num = 0; for (rt = en->routes; rt && (num < p->ecmp); rt = rt->next) + if (rip_valid_rte(rt)) + num++; + + struct nexthop_adata *nhad = (struct nexthop_adata *) tmp_alloc_adata((num+1) * sizeof(struct nexthop)); + struct nexthop *nh = &nhad->nh; + + for (rt = en->routes; rt && (num < p->ecmp); rt = rt->next) { if (!rip_valid_rte(rt)) - continue; + continue; - struct nexthop *nh = allocz(sizeof(struct nexthop)); + *nh = (struct nexthop) { + .gw = rt->next_hop, + .iface = rt->from->ifa->iface, + .weight = rt->from->ifa->cf->ecmp_weight, + }; - nh->gw = rt->next_hop; - nh->iface = rt->from->ifa->iface; - nh->weight = rt->from->ifa->cf->ecmp_weight; + if (!rt_from) + rt_from = rt->from->ifa->iface; - nexthop_insert(&nhs, nh); - num++; + nh = NEXTHOP_NEXT(nh); if (rt->tag != rt_tag) rt_tag = 0; } - a0.nh = *nhs; + nhad->ad.length = ((void *) nh - (void *) nhad->ad.data); + + ea_set_attr(&ea, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, + &(nexthop_sort(nhad, tmp_linpool)->ad))); } else { /* Unipath route */ - a0.from = rt->from->nbr->addr; - a0.nh.gw = rt->next_hop; - a0.nh.iface = rt->from->ifa->iface; + rt_from = rt->from->ifa->iface; + + struct nexthop_adata nhad = { + .nh.gw = rt->next_hop, + .nh.iface = rt->from->ifa->iface, + }; + + ea_set_attr_data(&ea, &ea_gen_nexthop, 0, + &nhad.ad.data, sizeof nhad - sizeof nhad.ad); + ea_set_attr_data(&ea, &ea_gen_from, 0, &rt->from->nbr->addr, sizeof(ip_addr)); } - a0.eattrs = alloca(sizeof(ea_list) + 3*sizeof(eattr)); - memset(a0.eattrs, 0, sizeof(ea_list)); /* Zero-ing only the ea_list header */ - a0.eattrs->count = 3; - a0.eattrs->attrs[0] = (eattr) { - .id = EA_RIP_METRIC, - .type = EAF_TYPE_INT, - .u.data = rt_metric, - }; - a0.eattrs->attrs[1] = (eattr) { - .id = EA_RIP_TAG, - .type = EAF_TYPE_INT, - .u.data = rt_tag, - }; - a0.eattrs->attrs[2] = (eattr) { - .id = EA_RIP_FROM, - .type = EAF_TYPE_PTR, - .u.data = (uintptr_t) a0.nh.iface, + ea_set_attr_u32(&ea, &ea_rip_tag, 0, rt_tag); + + struct rip_iface_adata riad = { + .ad = { .length = sizeof(struct rip_iface_adata) - sizeof(struct adata) }, + .iface = rt_from, }; + ea_set_attr(&ea, + EA_LITERAL_DIRECT_ADATA(&ea_rip_from, 0, &riad.ad)); rte e0 = { - .attrs = &a0, + .attrs = ea, .src = p->p.main_source, }; @@ -320,9 +343,10 @@ rip_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net, s if (new) { /* Update */ - u32 rt_tag = ea_get_int(new->attrs->eattrs, EA_RIP_TAG, 0); - u32 rt_metric = ea_get_int(new->attrs->eattrs, EA_RIP_METRIC, 1); - struct iface *rt_from = (struct iface *) ea_get_int(new->attrs->eattrs, EA_RIP_FROM, 0); + u32 rt_tag = ea_get_int(new->attrs, &ea_rip_tag, 0); + u32 rt_metric = ea_get_int(new->attrs, &ea_rip_metric, 1); + const eattr *rie = ea_find(new->attrs, &ea_rip_from); + struct iface *rt_from = rie ? ((struct rip_iface_adata *) rie->u.ptr)->iface : NULL; if (rt_metric > p->infinity) { @@ -354,8 +378,14 @@ rip_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net, s en->metric = rt_metric; en->tag = rt_tag; en->from = (new->src->owner == &P->sources) ? rt_from : NULL; - en->iface = new->attrs->nh.iface; - en->next_hop = new->attrs->nh.gw; + + eattr *nhea = ea_find(new->attrs, &ea_gen_nexthop); + if (nhea) + { + struct nexthop_adata *nhad = (struct nexthop_adata *) nhea->u.ptr; + en->iface = nhad->nh.iface; + en->next_hop = nhad->nh.gw; + } } else { @@ -1091,22 +1121,16 @@ rip_rte_proto(struct rte *rte) SKIP_BACK(struct rip_proto, p.sources, rte->src->owner) : NULL; } -static int -rip_rte_better(struct rte *new, struct rte *old) +static u32 +rip_rte_igp_metric(const rte *rt) { - ASSERT_DIE(new->src == old->src); - struct rip_proto *p = rip_rte_proto(new); - - u32 new_metric = ea_get_int(new->attrs->eattrs, EA_RIP_METRIC, p->infinity); - u32 old_metric = ea_get_int(old->attrs->eattrs, EA_RIP_METRIC, p->infinity); - - return new_metric < old_metric; + return ea_get_int(rt->attrs, &ea_rip_metric, IGP_METRIC_UNKNOWN); } -static u32 -rip_rte_igp_metric(struct rte *rt) +static int +rip_rte_better(struct rte *new, struct rte *old) { - return ea_get_int(rt->attrs->eattrs, EA_RIP_METRIC, IGP_METRIC_UNKNOWN); + return rip_rte_igp_metric(new) < rip_rte_igp_metric(old); } static void @@ -1206,32 +1230,37 @@ static void rip_get_route_info(rte *rte, byte *buf) { struct rip_proto *p = rip_rte_proto(rte); - u32 rt_metric = ea_get_int(rte->attrs->eattrs, EA_RIP_METRIC, p->infinity); - u32 rt_tag = ea_get_int(rte->attrs->eattrs, EA_RIP_TAG, 0); + u32 rt_metric = ea_get_int(rte->attrs, &ea_rip_metric, p->infinity); + u32 rt_tag = ea_get_int(rte->attrs, &ea_rip_tag, 0); - buf += bsprintf(buf, " (%d/%d)", rte->attrs->pref, rt_metric); + buf += bsprintf(buf, " (%d/%d)", rt_get_preference(rte), rt_metric); if (rt_tag) bsprintf(buf, " [%04x]", rt_tag); } -static int -rip_get_attr(const eattr *a, byte *buf, int buflen UNUSED) +static void +rip_tag_format(const eattr *a, byte *buf, uint buflen) { - switch (a->id) - { - case EA_RIP_METRIC: - bsprintf(buf, "metric: %d", a->u.data); - return GA_FULL; + bsnprintf(buf, buflen, "%04x", a->u.data); +} - case EA_RIP_TAG: - bsprintf(buf, "tag: %04x", a->u.data); - return GA_FULL; +static struct ea_class ea_rip_metric = { + .name = "rip_metric", + .type = T_INT, +}; - default: - return GA_UNKNOWN; - } -} +static struct ea_class ea_rip_tag = { + .name = "rip_tag", + .type = T_INT, + .format = rip_tag_format, +}; + +static struct ea_class ea_rip_from = { + .name = "rip_from", + .type = T_IFACE, + .readonly = 1, +}; void rip_show_interfaces(struct proto *P, const char *iff) @@ -1341,7 +1370,6 @@ static struct rte_owner_class rip_rte_owner_class = { struct protocol proto_rip = { .name = "RIP", .template = "rip%d", - .class = PROTOCOL_RIP, .preference = DEF_PREF_RIP, .channel_mask = NB_IP, .proto_size = sizeof(struct rip_proto), @@ -1352,5 +1380,16 @@ struct protocol proto_rip = { .start = rip_start, .shutdown = rip_shutdown, .reconfigure = rip_reconfigure, - .get_attr = rip_get_attr }; + +void +rip_build(void) +{ + proto_build(&proto_rip); + + EA_REGISTER_ALL( + &ea_rip_metric, + &ea_rip_tag, + &ea_rip_from + ); +} diff --git a/proto/rip/rip.h b/proto/rip/rip.h index f8713c4a..a01f8d3b 100644 --- a/proto/rip/rip.h +++ b/proto/rip/rip.h @@ -16,7 +16,7 @@ #include "nest/cli.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/password.h" #include "nest/locks.h" #include "nest/bfd.h" @@ -195,10 +195,6 @@ struct rip_rte #define RIP_ENTRY_VALID 1 /* Valid outgoing route */ #define RIP_ENTRY_STALE 2 /* Stale outgoing route, waiting for GC */ -#define EA_RIP_METRIC EA_CODE(PROTOCOL_RIP, 0) -#define EA_RIP_TAG EA_CODE(PROTOCOL_RIP, 1) -#define EA_RIP_FROM EA_CODE(PROTOCOL_RIP, 2) - static inline int rip_is_v2(struct rip_proto *p) { return p->rip2; } diff --git a/proto/rpki/Makefile b/proto/rpki/Makefile index eb09b7df..0f60b2a0 100644 --- a/proto/rpki/Makefile +++ b/proto/rpki/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/rpki/packets.c b/proto/rpki/packets.c index abe6abfc..d7895a22 100644 --- a/proto/rpki/packets.c +++ b/proto/rpki/packets.c @@ -729,6 +729,33 @@ rpki_handle_prefix_pdu(struct rpki_cache *cache, const struct pdu_header *pdu) net_addr_union addr = {}; rpki_prefix_pdu_2_net_addr(pdu, &addr); + if (type == IPV4_PREFIX) + { + if ((addr.roa4.pxlen > addr.roa4.max_pxlen) || + (addr.roa4.max_pxlen > IP4_MAX_PREFIX_LENGTH)) + { + RPKI_WARN(cache->p, "Received corrupt packet from RPKI cache server: invalid pxlen or max_pxlen"); + byte tmp[pdu->len]; + const struct pdu_header *hton_pdu = rpki_pdu_back_to_network_byte_order((void *) tmp, (const void *) pdu); + rpki_send_error_pdu(cache, CORRUPT_DATA, pdu->len, hton_pdu, "Corrupted PDU: invalid pxlen or max_pxlen"); + rpki_cache_change_state(cache, RPKI_CS_ERROR_FATAL); + return RPKI_ERROR; + } + } + else + { + if ((addr.roa6.pxlen > addr.roa6.max_pxlen) || + (addr.roa6.max_pxlen > IP6_MAX_PREFIX_LENGTH)) + { + RPKI_WARN(cache->p, "Received corrupt packet from RPKI cache server: invalid pxlen or max_pxlen"); + byte tmp[pdu->len]; + const struct pdu_header *hton_pdu = rpki_pdu_back_to_network_byte_order((void *) tmp, (const void *) pdu); + rpki_send_error_pdu(cache, CORRUPT_DATA, pdu->len, hton_pdu, "Corrupted PDU: invalid pxlen or max_pxlen"); + rpki_cache_change_state(cache, RPKI_CS_ERROR_FATAL); + return RPKI_ERROR; + } + } + if (cf->ignore_max_length) { if (type == IPV4_PREFIX) diff --git a/proto/rpki/rpki.c b/proto/rpki/rpki.c index cc86ab6a..7ec8d72f 100644 --- a/proto/rpki/rpki.c +++ b/proto/rpki/rpki.c @@ -121,14 +121,11 @@ rpki_table_add_roa(struct rpki_cache *cache, struct channel *channel, const net_ { struct rpki_proto *p = cache->p; - rta a0 = { - .pref = channel->preference, - .source = RTS_RPKI, - .scope = SCOPE_UNIVERSE, - .dest = RTD_NONE, - }; + ea_list *ea = NULL; + ea_set_attr_u32(&ea, &ea_gen_preference, 0, channel->preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_RPKI); - rte e0 = { .attrs = &a0, .src = p->p.main_source, }; + rte e0 = { .attrs = ea, .src = p->p.main_source, }; rte_update(channel, &pfxr->n, &e0, p->p.main_source); } @@ -870,16 +867,27 @@ rpki_show_proto_info(struct proto *P) if (cache) { const char *transport_name = "---"; + uint default_port = 0; switch (cf->tr_config.type) { #if HAVE_LIBSSH - case RPKI_TR_SSH: transport_name = "SSHv2"; break; + case RPKI_TR_SSH: + transport_name = "SSHv2"; + default_port = RPKI_SSH_PORT; + break; #endif - case RPKI_TR_TCP: transport_name = "Unprotected over TCP"; break; + case RPKI_TR_TCP: + transport_name = "Unprotected over TCP"; + default_port = RPKI_TCP_PORT; + break; }; cli_msg(-1006, " Cache server: %s", cf->hostname); + + if (cf->port != default_port) + cli_msg(-1006, " Cache port: %u", cf->port); + cli_msg(-1006, " Status: %s", rpki_cache_state_to_str(cache->state)); cli_msg(-1006, " Transport: %s", transport_name); cli_msg(-1006, " Protocol version: %u", cache->version); @@ -977,7 +985,6 @@ rpki_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUS struct protocol proto_rpki = { .name = "RPKI", .template = "rpki%d", - .class = PROTOCOL_RPKI, .preference = DEF_PREF_RPKI, .proto_size = sizeof(struct rpki_proto), .config_size = sizeof(struct rpki_config), @@ -991,3 +998,9 @@ struct protocol proto_rpki = { .reconfigure = rpki_reconfigure, .get_status = rpki_get_status, }; + +void +rpki_build(void) +{ + proto_build(&proto_rpki); +} diff --git a/proto/rpki/rpki.h b/proto/rpki/rpki.h index a70a2027..20253844 100644 --- a/proto/rpki/rpki.h +++ b/proto/rpki/rpki.h @@ -13,7 +13,7 @@ #define _BIRD_RPKI_H_ #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "lib/socket.h" #include "lib/ip.h" diff --git a/proto/rpki/transport.c b/proto/rpki/transport.c index b52495dc..4026fca4 100644 --- a/proto/rpki/transport.c +++ b/proto/rpki/transport.c @@ -86,6 +86,7 @@ rpki_tr_open(struct rpki_tr_sock *tr) sk->tbsize = RPKI_TX_BUFFER_SIZE; sk->tos = IP_PREC_INTERNET_CONTROL; sk->flags |= SKF_THREAD; + sk->vrf = cache->p->p.vrf; if (ipa_zero(sk->daddr) && sk->host) { diff --git a/proto/static/Makefile b/proto/static/Makefile index e38f9b74..de6e819b 100644 --- a/proto/static/Makefile +++ b/proto/static/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/static/config.Y b/proto/static/config.Y index 41e10dbf..9d26ee82 100644 --- a/proto/static/config.Y +++ b/proto/static/config.Y @@ -40,7 +40,7 @@ static_route_finish(void) if (net_type_match(this_srt->net, NB_DEST) == !this_srt->dest) cf_error("Unexpected or missing nexthop/type"); - this_srt->cmds = f_linearize(this_srt_cmds); + this_srt->cmds = f_linearize(this_srt_cmds, 0); } CF_DECLS diff --git a/proto/static/static.c b/proto/static/static.c index bd7f3f5b..42fd20b7 100644 --- a/proto/static/static.c +++ b/proto/static/static.c @@ -38,7 +38,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "conf/conf.h" #include "filter/filter.h" @@ -47,8 +47,6 @@ #include "static.h" -static linpool *static_lp; - static inline struct rte_src * static_get_source(struct static_proto *p, uint i) { return i ? rt_get_source(&p->p, i) : p->p.main_source; } @@ -59,68 +57,77 @@ static void static_announce_rte(struct static_proto *p, struct static_route *r) { struct rte_src *src; - rta *a = allocz(RTA_MAX_SIZE); - a->source = RTS_STATIC; - a->scope = SCOPE_UNIVERSE; - a->dest = r->dest; - a->pref = p->p.main_channel->preference; + ea_list *ea = NULL; + ea_set_attr_u32(&ea, &ea_gen_preference, 0, p->p.main_channel->preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_STATIC); if (r->dest == RTD_UNICAST) { - struct static_route *r2; - struct nexthop *nhs = NULL; + uint sz = 0; + for (struct static_route *r2 = r; r2; r2 = r2->mp_next) + if (r2->active) + sz += NEXTHOP_SIZE_CNT(r2->mls ? r2->mls->length / sizeof(u32) : 0); - for (r2 = r; r2; r2 = r2->mp_next) + if (!sz) + goto withdraw; + + struct nexthop_adata *nhad = allocz(sz + sizeof *nhad); + struct nexthop *nh = &nhad->nh; + + for (struct static_route *r2 = r; r2; r2 = r2->mp_next) { if (!r2->active) continue; - struct nexthop *nh = allocz(NEXTHOP_MAX_SIZE); - nh->gw = r2->via; - nh->iface = r2->neigh->iface; - nh->flags = r2->onlink ? RNF_ONLINK : 0; - nh->weight = r2->weight; + *nh = (struct nexthop) { + .gw = r2->via, + .iface = r2->neigh->iface, + .flags = r2->onlink ? RNF_ONLINK : 0, + .weight = r2->weight, + }; + if (r2->mls) { - nh->labels = r2->mls->len; - memcpy(nh->label, r2->mls->stack, r2->mls->len * sizeof(u32)); + nh->labels = r2->mls->length / sizeof(u32); + memcpy(nh->label, r2->mls->data, r2->mls->length); } - nexthop_insert(&nhs, nh); + nh = NEXTHOP_NEXT(nh); } - if (!nhs) - goto withdraw; - - nexthop_link(a, nhs); + ea_set_attr_data(&ea, &ea_gen_nexthop, 0, + nhad->ad.data, (void *) nh - (void *) nhad->ad.data); } - if (r->dest == RTDX_RECURSIVE) + else if (r->dest == RTDX_RECURSIVE) { rtable *tab = ipa_is_ip4(r->via) ? p->igp_table_ip4 : p->igp_table_ip6; - rta_set_recursive_next_hop(p->p.main_channel->table, a, tab, r->via, IPA_NONE, r->mls, static_lp); + u32 *labels = r->mls ? (void *) r->mls->data : NULL; + u32 lnum = r->mls ? r->mls->length / sizeof(u32) : 0; + + ea_set_hostentry(&ea, p->p.main_channel->table, tab, + r->via, IPA_NONE, lnum, labels); } + else if (r->dest) + ea_set_dest(&ea, 0, r->dest); + /* Already announced */ if (r->state == SRS_CLEAN) return; /* We skip rta_lookup() here */ src = static_get_source(p, r->index); - rte e0 = { .attrs = a, .src = src, .net = r->net, }, *e = &e0; + rte e0 = { .attrs = ea, .src = src, .net = r->net, }, *e = &e0; /* Evaluate the filter */ if (r->cmds) - f_eval_rte(r->cmds, e, static_lp); + f_eval_rte(r->cmds, e); rte_update(p->p.main_channel, r->net, e, src); static_free_source(src, r->index); r->state = SRS_CLEAN; - - if (r->cmds) - lp_flush(static_lp); - return; withdraw: @@ -322,31 +329,17 @@ static_same_dest(struct static_route *x, struct static_route *y) (x->weight != y->weight) || (x->use_bfd != y->use_bfd) || (!x->mls != !y->mls) || - ((x->mls) && (y->mls) && (x->mls->len != y->mls->len))) + ((x->mls) && (y->mls) && adata_same(x->mls, y->mls))) return 0; - - if (!x->mls) - continue; - - for (uint i = 0; i < x->mls->len; i++) - if (x->mls->stack[i] != y->mls->stack[i]) - return 0; } return !x && !y; case RTDX_RECURSIVE: if (!ipa_equal(x->via, y->via) || (!x->mls != !y->mls) || - ((x->mls) && (y->mls) && (x->mls->len != y->mls->len))) + ((x->mls) && (y->mls) && adata_same(x->mls, y->mls))) return 0; - if (!x->mls) - return 1; - - for (uint i = 0; i < x->mls->len; i++) - if (x->mls->stack[i] != y->mls->stack[i]) - return 0; - return 1; default: @@ -415,16 +408,16 @@ static_reload_routes(struct channel *C) static int static_rte_better(rte *new, rte *old) { - u32 n = ea_get_int(new->attrs->eattrs, EA_GEN_IGP_METRIC, IGP_METRIC_UNKNOWN); - u32 o = ea_get_int(old->attrs->eattrs, EA_GEN_IGP_METRIC, IGP_METRIC_UNKNOWN); + u32 n = ea_get_int(new->attrs, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN); + u32 o = ea_get_int(old->attrs, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN); return n < o; } static int static_rte_mergable(rte *pri, rte *sec) { - u32 a = ea_get_int(pri->attrs->eattrs, EA_GEN_IGP_METRIC, IGP_METRIC_UNKNOWN); - u32 b = ea_get_int(sec->attrs->eattrs, EA_GEN_IGP_METRIC, IGP_METRIC_UNKNOWN); + u32 a = ea_get_int(pri->attrs, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN); + u32 b = ea_get_int(sec->attrs, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN); return a == b; } @@ -443,11 +436,11 @@ static_postconfig(struct proto_config *CF) if (!cf->igp_table_ip4) cf->igp_table_ip4 = (cc->table->addr_type == NET_IP4) ? - cc->table : cf->c.global->def_tables[NET_IP4]; + cc->table : rt_get_default_table(cf->c.global, NET_IP4); if (!cf->igp_table_ip6) cf->igp_table_ip6 = (cc->table->addr_type == NET_IP6) ? - cc->table : cf->c.global->def_tables[NET_IP6]; + cc->table : rt_get_default_table(cf->c.global, NET_IP6); WALK_LIST(r, cf->routes) if (r->net && (r->net->type != CF->net_type)) @@ -487,16 +480,11 @@ static_start(struct proto *P) struct static_config *cf = (void *) P->cf; struct static_route *r; - if (!static_lp) - static_lp = lp_new(&root_pool, LP_GOOD_SIZE(1024)); - if (p->igp_table_ip4) - RT_LOCKED(p->igp_table_ip4, t) - rt_lock_table(t); + rt_lock_table(p->igp_table_ip4); if (p->igp_table_ip6) - RT_LOCKED(p->igp_table_ip6, t) - rt_lock_table(t); + rt_lock_table(p->igp_table_ip6); p->event = ev_new_init(p->p.pool, static_announce_marked, p); @@ -506,7 +494,12 @@ static_start(struct proto *P) proto_notify_state(P, PS_UP); WALK_LIST(r, cf->routes) + { + struct lp_state lps; + lp_save(tmp_linpool, &lps); static_add_rte(p, r); + lp_restore(tmp_linpool, &lps); + } return PS_UP; } @@ -523,12 +516,10 @@ static_shutdown(struct proto *P) static_reset_rte(p, r); if (p->igp_table_ip4) - RT_LOCKED(p->igp_table_ip4, t) - rt_unlock_table(t); + rt_unlock_table(p->igp_table_ip4); if (p->igp_table_ip6) - RT_LOCKED(p->igp_table_ip6, t) - rt_unlock_table(t); + rt_unlock_table(p->igp_table_ip6); return PS_DOWN; } @@ -720,11 +711,12 @@ static_copy_config(struct proto_config *dest, struct proto_config *src) static void static_get_route_info(rte *rte, byte *buf) { - eattr *a = ea_find(rte->attrs->eattrs, EA_GEN_IGP_METRIC); - if (a) - buf += bsprintf(buf, " (%d/%u)", rte->attrs->pref, a->u.data); + eattr *a = ea_find(rte->attrs, &ea_gen_igp_metric); + u32 pref = rt_get_preference(rte); + if (a && (a->u.data < IGP_METRIC_UNKNOWN)) + buf += bsprintf(buf, " (%d/%u)", pref, a->u.data); else - buf += bsprintf(buf, " (%d)", rte->attrs->pref); + buf += bsprintf(buf, " (%d)", pref); } static void @@ -783,7 +775,6 @@ static struct rte_owner_class static_rte_owner_class = { struct protocol proto_static = { .name = "Static", .template = "static%d", - .class = PROTOCOL_STATIC, .preference = DEF_PREF_STATIC, .channel_mask = NB_ANY, .proto_size = sizeof(struct static_proto), @@ -796,3 +787,9 @@ struct protocol proto_static = { .reconfigure = static_reconfigure, .copy_config = static_copy_config, }; + +void +static_build(void) +{ + proto_build(&proto_static); +} diff --git a/proto/static/static.h b/proto/static/static.h index fc91f71c..ea7ca33b 100644 --- a/proto/static/static.h +++ b/proto/static/static.h @@ -9,7 +9,7 @@ #ifndef _BIRD_STATIC_H_ #define _BIRD_STATIC_H_ -#include "nest/route.h" +#include "nest/rt.h" #include "nest/bfd.h" #include "lib/buffer.h" @@ -49,7 +49,7 @@ struct static_route { byte weight; /* Multipath next hop weight */ byte use_bfd; /* Configured to use BFD */ struct bfd_request *bfd_req; /* BFD request, if BFD is used */ - mpls_label_stack *mls; /* MPLS label stack; may be NULL */ + struct adata *mls; /* MPLS label stack; may be NULL */ }; /* diff --git a/sysdep/bsd/krt-sock.c b/sysdep/bsd/krt-sock.c index 6f788ac2..1c1bd50c 100644 --- a/sysdep/bsd/krt-sock.c +++ b/sysdep/bsd/krt-sock.c @@ -25,7 +25,7 @@ #include "nest/bird.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "sysdep/unix/unix.h" @@ -366,6 +366,30 @@ krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old) } } +/** + * krt_assume_onlink - check if routes on interface are considered onlink + * @iface: The interface of the next hop + * @ipv6: Switch to only consider IPv6 or IPv4 addresses. + * + * The BSD kernel does not support an onlink flag. If the interface has only + * host addresses configured, all routes should be considered as onlink and + * the function returns 1. + */ +static int +krt_assume_onlink(struct iface *iface, int ipv6) +{ + const u8 type = ipv6 ? NET_IP6 : NET_IP4; + + struct ifa *ifa; + WALK_LIST(ifa, iface->addrs) + { + if ((ifa->prefix.type == type) && !(ifa->flags & IA_HOST)) + return 0; + } + + return 1; +} + #define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0) static void @@ -494,10 +518,10 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan) net = net_get(p->p.main_channel->table, &ndst); rta a = { - .source = RTS_INHERIT, - .scope = SCOPE_UNIVERSE, }; + ea_set_attr_u32(&a->eattrs, &ea_gen_source, 0, RTS_INHERIT); + /* reject/blackhole routes have also set RTF_GATEWAY, we wil check them first. */ @@ -526,15 +550,21 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan) a.dest = RTD_UNICAST; if (flags & RTF_GATEWAY) { - neighbor *ng; a.nh.gw = igate; /* Clean up embedded interface ID returned in link-local address */ if (ipa_is_link_local(a.nh.gw)) _I0(a.nh.gw) = 0xfe800000; - ng = neigh_find(&p->p, a.nh.gw, a.nh.iface, 0); - if (!ng || (ng->scope == SCOPE_HOST)) + /* The BSD kernel does not support an onlink flag. We heuristically + set the onlink flag, if the iface has only host addresses. */ + if (krt_assume_onlink(a.nh.iface, ipv6)) + a.nh.flags |= RNF_ONLINK; + + neighbor *nbr; + nbr = neigh_find(&p->p, a.nh.gw, a.nh.iface, + (a.nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); + if (!nbr || (nbr->scope == SCOPE_HOST)) { /* Ignore routes with next-hop 127.0.0.1, host routes with such next-hop appear on OpenBSD for address aliases. */ @@ -550,15 +580,8 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan) done:; rte e0 = { .attrs = &a, .net = net, }; - ea_list *ea = alloca(sizeof(ea_list) + 1 * sizeof(eattr)); - *ea = (ea_list) { .count = 1, .next = e0.attrs->eattrs }; - e0.attrs->eattrs = ea; - - ea->attrs[0] = (eattr) { - .id = EA_KRT_SOURCE, - .type = EAF_TYPE_INT, - .u.data = src2, - }; + ea_set_attr(e0.attrs->eattrs, + EA_LITERAL_EMBEDDED(EA_KRT_SOURCE, T_INT, 0, src2)); if (scan) krt_got_route(p, &e0, src); diff --git a/sysdep/bsd/sysio.h b/sysdep/bsd/sysio.h index c757960a..f1887fb4 100644 --- a/sysdep/bsd/sysio.h +++ b/sysdep/bsd/sysio.h @@ -271,3 +271,9 @@ sk_set_priority(sock *s, int prio UNUSED) { ERR_MSG("Socket priority not supported"); } + +static inline int +sk_set_freebind(sock *s) +{ + ERR_MSG("Freebind is not supported"); +} diff --git a/sysdep/cf/README b/sysdep/cf/README index 9a7a4afa..68078bbe 100644 --- a/sysdep/cf/README +++ b/sysdep/cf/README @@ -4,7 +4,6 @@ Available configuration variables: CONFIG_AUTO_ROUTES Device routes are added automagically by the kernel CONFIG_SELF_CONSCIOUS We're able to recognize whether route was installed by us CONFIG_MULTIPLE_TABLES The kernel supports multiple routing tables -CONFIG_ALL_TABLES_AT_ONCE Kernel scanner wants to process all tables at once CONFIG_SINGLE_ROUTE There is only one route per network CONFIG_MC_PROPER_SRC Multicast packets have source address according to socket saddr field diff --git a/sysdep/cf/linux.h b/sysdep/cf/linux.h index 047d3764..c640bef4 100644 --- a/sysdep/cf/linux.h +++ b/sysdep/cf/linux.h @@ -9,7 +9,6 @@ #define CONFIG_AUTO_ROUTES #define CONFIG_SELF_CONSCIOUS #define CONFIG_MULTIPLE_TABLES -#define CONFIG_ALL_TABLES_AT_ONCE #define CONFIG_IP6_SADR_KERNEL #define CONFIG_MC_PROPER_SRC diff --git a/sysdep/config.h b/sysdep/config.h index 55be90f0..5cdadbb0 100644 --- a/sysdep/config.h +++ b/sysdep/config.h @@ -13,7 +13,7 @@ #ifdef GIT_LABEL #define BIRD_VERSION XSTR1(GIT_LABEL) #else -#define BIRD_VERSION "2.0.8" +#define BIRD_VERSION "2.0.10" #endif /* Include parameters determined by configure script */ diff --git a/sysdep/linux/krt-sys.h b/sysdep/linux/krt-sys.h index a8af4c95..aa90f6e4 100644 --- a/sysdep/linux/krt-sys.h +++ b/sysdep/linux/krt-sys.h @@ -34,41 +34,10 @@ static inline struct ifa * kif_get_primary_ip(struct iface *i UNUSED) { return N #define KRT_ALLOW_MERGE_PATHS 1 -#define EA_KRT_PREFSRC EA_CODE(PROTOCOL_KERNEL, 0x10) -#define EA_KRT_REALM EA_CODE(PROTOCOL_KERNEL, 0x11) -#define EA_KRT_SCOPE EA_CODE(PROTOCOL_KERNEL, 0x12) - - -#define KRT_METRICS_MAX 0x10 /* RTAX_QUICKACK+1 */ -#define KRT_METRICS_OFFSET 0x20 /* Offset of EA_KRT_* vs RTAX_* */ - -#define KRT_FEATURES_MAX 4 - -/* - * Following attributes are parts of RTA_METRICS kernel route attribute, their - * ids must be consistent with their RTAX_* constants (+ KRT_METRICS_OFFSET) - */ -#define EA_KRT_METRICS EA_CODE(PROTOCOL_KERNEL, 0x20) /* Dummy one */ -#define EA_KRT_LOCK EA_CODE(PROTOCOL_KERNEL, 0x21) -#define EA_KRT_MTU EA_CODE(PROTOCOL_KERNEL, 0x22) -#define EA_KRT_WINDOW EA_CODE(PROTOCOL_KERNEL, 0x23) -#define EA_KRT_RTT EA_CODE(PROTOCOL_KERNEL, 0x24) -#define EA_KRT_RTTVAR EA_CODE(PROTOCOL_KERNEL, 0x25) -#define EA_KRT_SSTRESH EA_CODE(PROTOCOL_KERNEL, 0x26) -#define EA_KRT_CWND EA_CODE(PROTOCOL_KERNEL, 0x27) -#define EA_KRT_ADVMSS EA_CODE(PROTOCOL_KERNEL, 0x28) -#define EA_KRT_REORDERING EA_CODE(PROTOCOL_KERNEL, 0x29) -#define EA_KRT_HOPLIMIT EA_CODE(PROTOCOL_KERNEL, 0x2a) -#define EA_KRT_INITCWND EA_CODE(PROTOCOL_KERNEL, 0x2b) -#define EA_KRT_FEATURES EA_CODE(PROTOCOL_KERNEL, 0x2c) -#define EA_KRT_RTO_MIN EA_CODE(PROTOCOL_KERNEL, 0x2d) -#define EA_KRT_INITRWND EA_CODE(PROTOCOL_KERNEL, 0x2e) -#define EA_KRT_QUICKACK EA_CODE(PROTOCOL_KERNEL, 0x2f) - - struct krt_params { u32 table_id; /* Kernel table ID we sync with */ u32 metric; /* Kernel metric used for all routes */ + uint netlink_rx_buffer; /* Rx buffer size for the netlink socket */ }; struct krt_state { diff --git a/sysdep/linux/netlink.Y b/sysdep/linux/netlink.Y index 7097f577..7ba8c7c9 100644 --- a/sysdep/linux/netlink.Y +++ b/sysdep/linux/netlink.Y @@ -10,9 +10,7 @@ CF_HDR CF_DECLS -CF_KEYWORDS(KERNEL, TABLE, METRIC, KRT_PREFSRC, KRT_REALM, KRT_SCOPE, KRT_MTU, KRT_WINDOW, - KRT_RTT, KRT_RTTVAR, KRT_SSTRESH, KRT_CWND, KRT_ADVMSS, KRT_REORDERING, - KRT_HOPLIMIT, KRT_INITCWND, KRT_RTO_MIN, KRT_INITRWND, KRT_QUICKACK, +CF_KEYWORDS(KERNEL, TABLE, METRIC, NETLINK, RX, BUFFER, KRT_LOCK_MTU, KRT_LOCK_WINDOW, KRT_LOCK_RTT, KRT_LOCK_RTTVAR, KRT_LOCK_SSTRESH, KRT_LOCK_CWND, KRT_LOCK_ADVMSS, KRT_LOCK_REORDERING, KRT_LOCK_HOPLIMIT, KRT_LOCK_RTO_MIN, KRT_FEATURE_ECN, KRT_FEATURE_ALLFRAG) @@ -24,41 +22,25 @@ kern_proto: kern_proto kern_sys_item ';' ; kern_sys_item: KERNEL TABLE expr { THIS_KRT->sys.table_id = $3; } | METRIC expr { THIS_KRT->sys.metric = $2; } + | NETLINK RX BUFFER expr { THIS_KRT->sys.netlink_rx_buffer = $4; } ; -dynamic_attr: KRT_PREFSRC { $$ = f_new_dynamic_attr(EAF_TYPE_IP_ADDRESS, T_IP, EA_KRT_PREFSRC); } ; -dynamic_attr: KRT_REALM { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_REALM); } ; -dynamic_attr: KRT_SCOPE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SCOPE); } ; - -dynamic_attr: KRT_MTU { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_MTU); } ; -dynamic_attr: KRT_WINDOW { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_WINDOW); } ; -dynamic_attr: KRT_RTT { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTT); } ; -dynamic_attr: KRT_RTTVAR { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTTVAR); } ; -dynamic_attr: KRT_SSTRESH { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SSTRESH); } ; -dynamic_attr: KRT_CWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_CWND); } ; -dynamic_attr: KRT_ADVMSS { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_ADVMSS); } ; -dynamic_attr: KRT_REORDERING { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_REORDERING); } ; -dynamic_attr: KRT_HOPLIMIT { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_HOPLIMIT); } ; -dynamic_attr: KRT_INITCWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_INITCWND); } ; -dynamic_attr: KRT_RTO_MIN { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTO_MIN); } ; -dynamic_attr: KRT_INITRWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_INITRWND); } ; -dynamic_attr: KRT_QUICKACK { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_QUICKACK); } ; - /* Bits of EA_KRT_LOCK, based on RTAX_* constants */ -dynamic_attr: KRT_LOCK_MTU { $$ = f_new_dynamic_attr_bit(2, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_WINDOW { $$ = f_new_dynamic_attr_bit(3, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_RTT { $$ = f_new_dynamic_attr_bit(4, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_RTTVAR { $$ = f_new_dynamic_attr_bit(5, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_SSTRESH { $$ = f_new_dynamic_attr_bit(6, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_CWND { $$ = f_new_dynamic_attr_bit(7, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_ADVMSS { $$ = f_new_dynamic_attr_bit(8, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_REORDERING { $$ = f_new_dynamic_attr_bit(9, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_HOPLIMIT { $$ = f_new_dynamic_attr_bit(10, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_RTO_MIN { $$ = f_new_dynamic_attr_bit(13, T_BOOL, EA_KRT_LOCK); } ; - -dynamic_attr: KRT_FEATURE_ECN { $$ = f_new_dynamic_attr_bit(0, T_BOOL, EA_KRT_FEATURES); } ; -dynamic_attr: KRT_FEATURE_ALLFRAG { $$ = f_new_dynamic_attr(3, T_BOOL, EA_KRT_FEATURES); } ; +attr_bit: KRT_LOCK_MTU { $$ = f_new_dynamic_attr_bit(2, "krt_lock"); } ; +attr_bit: KRT_LOCK_WINDOW { $$ = f_new_dynamic_attr_bit(3, "krt_lock"); } ; +attr_bit: KRT_LOCK_RTT { $$ = f_new_dynamic_attr_bit(4, "krt_lock"); } ; +attr_bit: KRT_LOCK_RTTVAR { $$ = f_new_dynamic_attr_bit(5, "krt_lock"); } ; +attr_bit: KRT_LOCK_SSTRESH { $$ = f_new_dynamic_attr_bit(6, "krt_lock"); } ; +attr_bit: KRT_LOCK_CWND { $$ = f_new_dynamic_attr_bit(7, "krt_lock"); } ; +attr_bit: KRT_LOCK_ADVMSS { $$ = f_new_dynamic_attr_bit(8, "krt_lock"); } ; +attr_bit: KRT_LOCK_REORDERING { $$ = f_new_dynamic_attr_bit(9, "krt_lock"); } ; +attr_bit: KRT_LOCK_HOPLIMIT { $$ = f_new_dynamic_attr_bit(10, "krt_lock"); } ; +attr_bit: KRT_LOCK_RTO_MIN { $$ = f_new_dynamic_attr_bit(13, "krt_lock"); } ; + +/* Bits of EA_KRT_FEATURES */ +attr_bit: KRT_FEATURE_ECN { $$ = f_new_dynamic_attr_bit(0, "krt_features"); } ; +attr_bit: KRT_FEATURE_ALLFRAG { $$ = f_new_dynamic_attr_bit(3, "krt_features"); } ; CF_CODE diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index bff2d579..3443957b 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -17,7 +17,7 @@ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "lib/alloca.h" @@ -26,6 +26,7 @@ #include "lib/socket.h" #include "lib/string.h" #include "lib/hash.h" +#include "lib/macro.h" #include "conf/conf.h" #include <asm/types.h> @@ -69,54 +70,118 @@ #define RTA_ENCAP 22 #endif +#ifndef NETLINK_GET_STRICT_CHK +#define NETLINK_GET_STRICT_CHK 12 +#endif + #define krt_ipv4(p) ((p)->af == AF_INET) -#define krt_ecmp6(p) ((p)->af == AF_INET6) const int rt_default_ecmp = 16; -/* - * Structure nl_parse_state keeps state of received route processing. Ideally, - * we could just independently parse received Netlink messages and immediately - * propagate received routes to the rest of BIRD, but older Linux kernel (before - * version 4.11) represents and announces IPv6 ECMP routes not as one route with - * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of - * routes with the same prefix. More recent kernels work as with IPv4. - * - * Therefore, BIRD keeps currently processed route in nl_parse_state structure - * and postpones its propagation until we expect it to be final; i.e., when - * non-matching route is received or when the scan ends. When another matching - * route is received, it is merged with the already processed route to form an - * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the - * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6 - * routes with RTA_MULTIPATH set are just considered non-matching. - * - * This is ignored for asynchronous notifications (every notification is handled - * as a separate route). It is not an issue for our routes, as we ignore such - * notifications anyways. But importing alien IPv6 ECMP routes does not work - * properly with older kernels. - * - * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes - * for the same prefix. - */ - struct nl_parse_state { + struct krt_proto *proto; struct linpool *pool; int scan; - int merge; - net_addr *net; - rta *attrs; - struct krt_proto *proto; - s8 new; - s8 krt_src; - u8 krt_type; - u8 krt_proto; - u32 krt_metric; + u32 rta_flow; +}; + +/* + * Netlink eattr definitions + */ + +#define KRT_METRICS_MAX ARRAY_SIZE(ea_krt_metrics) +#define KRT_FEATURES_MAX 4 + +static void krt_bitfield_format(const eattr *e, byte *buf, uint buflen); + +static struct ea_class + ea_krt_prefsrc = { + .name = "krt_prefsrc", + .type = T_IP, + }, + ea_krt_realm = { + .name = "krt_realm", + .type = T_INT, + }, + ea_krt_scope = { + .name = "krt_scope", + .type = T_INT, + }; + +static struct ea_class ea_krt_metrics[] = { + [RTAX_LOCK] = { + .name = "krt_lock", + .type = T_INT, + .format = krt_bitfield_format, + }, + [RTAX_FEATURES] = { + .name = "krt_features", + .type = T_INT, + .format = krt_bitfield_format, + }, +#define KRT_METRIC_INT(_rtax, _name) [_rtax] = { .name = _name, .type = T_INT } + KRT_METRIC_INT(RTAX_MTU, "krt_mtu"), + KRT_METRIC_INT(RTAX_WINDOW, "krt_window"), + KRT_METRIC_INT(RTAX_RTT, "krt_rtt"), + KRT_METRIC_INT(RTAX_RTTVAR, "krt_rttvar"), + KRT_METRIC_INT(RTAX_SSTHRESH, "krt_sstresh"), + KRT_METRIC_INT(RTAX_CWND, "krt_cwnd"), + KRT_METRIC_INT(RTAX_ADVMSS, "krt_advmss"), + KRT_METRIC_INT(RTAX_REORDERING, "krt_reordering"), + KRT_METRIC_INT(RTAX_HOPLIMIT, "krt_hoplimit"), + KRT_METRIC_INT(RTAX_INITCWND, "krt_initcwnd"), + KRT_METRIC_INT(RTAX_RTO_MIN, "krt_rto_min"), + KRT_METRIC_INT(RTAX_INITRWND, "krt_initrwnd"), + KRT_METRIC_INT(RTAX_QUICKACK, "krt_quickack"), +#undef KRT_METRIC_INT +}; - u32 rta_flow; /* Used during parsing */ +static const char *krt_metrics_names[KRT_METRICS_MAX] = { + NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss", + "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack" }; +static const char *krt_features_names[KRT_FEATURES_MAX] = { + "ecn", NULL, NULL, "allfrag" +}; + +static void +krt_bitfield_format(const eattr *a, byte *buf, uint buflen) +{ + if (a->id == ea_krt_metrics[RTAX_LOCK].id) + ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX); + else if (a->id == ea_krt_metrics[RTAX_FEATURES].id) + ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX); +} + +static void +nl_ea_register(void) +{ + EA_REGISTER_ALL( + &ea_krt_prefsrc, + &ea_krt_realm, + &ea_krt_scope + ); + + for (uint i = 0; i < KRT_METRICS_MAX; i++) + { + if (!ea_krt_metrics[i].name) + ea_krt_metrics[i] = (struct ea_class) { + .name = mb_sprintf(&root_pool, "krt_metric_%d", i), + .type = T_INT, + }; + + ea_register_init(&ea_krt_metrics[i]); + } + + for (uint i = 1; i < KRT_METRICS_MAX; i++) + ASSERT_DIE(ea_krt_metrics[i].id == ea_krt_metrics[0].id + i); +} + + + /* * Synchronous Netlink interface */ @@ -130,7 +195,7 @@ struct nl_sock uint last_size; }; -#define NL_RX_SIZE 8192 +#define NL_RX_SIZE 32768 #define NL_OP_DELETE 0 #define NL_OP_ADD (NLM_F_CREATE|NLM_F_EXCL) @@ -157,11 +222,51 @@ nl_open_sock(struct nl_sock *nl) } } +static int +nl_set_strict_dump(struct nl_sock *nl UNUSED, int strict UNUSED) +{ +#ifdef SOL_NETLINK + return setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +#else + return -1; +#endif +} + +static void +nl_set_rcvbuf(int fd, uint val) +{ + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &val, sizeof(val)) < 0) + log(L_WARN "KRT: Cannot set netlink rx buffer size to %u: %m", val); +} + +static uint +nl_cfg_rx_buffer_size(struct config *cfg) +{ + uint bufsize = 0; + + struct proto_config *pc; + WALK_LIST(pc, cfg->protos) + if ((pc->protocol == &proto_unix_kernel) && !pc->disabled) + bufsize = MAX(bufsize, ((struct krt_config *) pc)->sys.netlink_rx_buffer); + + return bufsize; +} + + static void nl_open(void) { + if ((nl_scan.fd >= 0) && (nl_req.fd >= 0)) + return; + nl_open_sock(&nl_scan); nl_open_sock(&nl_req); + + if (nl_set_strict_dump(&nl_scan, 1) < 0) + { + log(L_WARN "KRT: Netlink strict checking failed, will scan all tables at once"); + krt_use_shared_scan(); + } } static void @@ -180,20 +285,72 @@ nl_send(struct nl_sock *nl, struct nlmsghdr *nh) } static void -nl_request_dump(int af, int cmd) +nl_request_dump_link(void) { struct { struct nlmsghdr nh; - struct rtgenmsg g; + struct ifinfomsg ifi; } req = { - .nh.nlmsg_type = cmd, - .nh.nlmsg_len = sizeof(req), + .nh.nlmsg_type = RTM_GETLINK, + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, - .g.rtgen_family = af + .nh.nlmsg_seq = ++(nl_scan.seq), + .ifi.ifi_family = AF_UNSPEC, }; - nl_send(&nl_scan, &req.nh); + + send(nl_scan.fd, &req, sizeof(req), 0); + nl_scan.last_hdr = NULL; } +static void +nl_request_dump_addr(int af) +{ + struct { + struct nlmsghdr nh; + struct ifaddrmsg ifa; + } req = { + .nh.nlmsg_type = RTM_GETADDR, + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nh.nlmsg_seq = ++(nl_scan.seq), + .ifa.ifa_family = af, + }; + + send(nl_scan.fd, &req, sizeof(req), 0); + nl_scan.last_hdr = NULL; +} + +static void +nl_request_dump_route(int af, int table_id) +{ + struct { + struct nlmsghdr nh; + struct rtmsg rtm; + struct rtattr rta; + u32 table_id; + } req = { + .nh.nlmsg_type = RTM_GETROUTE, + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), + .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nh.nlmsg_seq = ++(nl_scan.seq), + .rtm.rtm_family = af, + }; + + if (table_id < 256) + req.rtm.rtm_table = table_id; + else + { + req.rta.rta_type = RTA_TABLE; + req.rta.rta_len = RTA_LENGTH(4); + req.table_id = table_id; + req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + req.rta.rta_len; + } + + send(nl_scan.fd, &req, req.nh.nlmsg_len, 0); + nl_scan.last_hdr = NULL; +} + + static struct nlmsghdr * nl_get_reply(struct nl_sock *nl) { @@ -651,12 +808,12 @@ nl_add_nexthop(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af UNUS } static void -nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, ea_list *eattrs) +nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop_adata *nhad, int af, ea_list *eattrs) { struct rtattr *a = nl_open_attr(h, bufsize, RTA_MULTIPATH); - eattr *flow = ea_find(eattrs, EA_KRT_REALM); + eattr *flow = ea_find(eattrs, &ea_krt_realm); - for (; nh; nh = nh->next) + NEXTHOP_WALK(nh, nhad) { struct rtnexthop *rtnh = nl_open_nexthop(h, bufsize); @@ -680,33 +837,49 @@ nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, e nl_close_attr(h, a); } -static struct nexthop * -nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr *ra, int af) +static struct nexthop_adata * +nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr *n, struct rtattr *ra, int af, int krt_src) { struct rtattr *a[BIRD_RTA_MAX]; - struct rtnexthop *nh = RTA_DATA(ra); - struct nexthop *rv, *first, **last; - unsigned len = RTA_PAYLOAD(ra); + struct rtnexthop *nh, *orig_nh = RTA_DATA(ra); + unsigned len, orig_len = RTA_PAYLOAD(ra); + uint cnt = 0; - first = NULL; - last = &first; + /* First count the nexthops */ + for (len = orig_len, nh = orig_nh; len; len -= NLMSG_ALIGN(nh->rtnh_len), nh = RTNH_NEXT(nh)) + { + /* Use RTNH_OK(nh,len) ?? */ + if ((len < sizeof(*nh)) || (len < nh->rtnh_len)) + goto err; - while (len) + if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) + ; + else + cnt++; + } + + struct nexthop_adata *nhad = lp_allocz(s->pool, cnt * NEXTHOP_MAX_SIZE + sizeof *nhad); + struct nexthop *rv = &nhad->nh; + + for (len = orig_len, nh = orig_nh; len; len -= NLMSG_ALIGN(nh->rtnh_len), nh = RTNH_NEXT(nh)) { /* Use RTNH_OK(nh,len) ?? */ if ((len < sizeof(*nh)) || (len < nh->rtnh_len)) - return NULL; + goto err; - if (nh->rtnh_flags & RTNH_F_DEAD) - goto next; + if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) + continue; - *last = rv = lp_allocz(s->pool, NEXTHOP_MAX_SIZE); - last = &(rv->next); + *rv = (struct nexthop) { + .weight = nh->rtnh_hops, + .iface = if_find_by_index(nh->rtnh_ifindex), + }; - rv->weight = nh->rtnh_hops; - rv->iface = if_find_by_index(nh->rtnh_ifindex); if (!rv->iface) - return NULL; + { + log(L_ERR "KRT: Received route %N with unknown ifindex %u", n, nh->rtnh_ifindex); + return NULL; + } /* Nonexistent RTNH_PAYLOAD ?? */ nl_attr_len = nh->rtnh_len - RTNH_LENGTH(0); @@ -714,18 +887,18 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr { case AF_INET: if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want4, a, sizeof(a))) - return NULL; + goto err; break; case AF_INET6: if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want6, a, sizeof(a))) - return NULL; + goto err; break; #ifdef HAVE_MPLS_KERNEL case AF_MPLS: if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want_mpls, a, sizeof(a))) - return NULL; + goto err; if (a[RTA_NEWDST]) rv->labels = rta_get_mpls(a[RTA_NEWDST], rv->label); @@ -734,7 +907,7 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr #endif default: - return NULL; + goto err; } if (a[RTA_GATEWAY]) @@ -757,14 +930,19 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr nbr = neigh_find(&p->p, rv->gw, rv->iface, (rv->flags & RNF_ONLINK) ? NEF_ONLINK : 0); if (!nbr || (nbr->scope == SCOPE_HOST)) - return NULL; + { + log(L_ERR "KRT: Received route %N with strange next-hop %I", n, rv->gw); + return NULL; + } } #ifdef HAVE_MPLS_KERNEL if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE]) { - if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS) { - log(L_WARN "KRT: Unknown encapsulation method %d in multipath", rta_get_u16(a[RTA_ENCAP_TYPE])); + if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS) + { + log(L_WARN "KRT: Received route %N with unknown encapsulation method %d", + n, rta_get_u16(a[RTA_ENCAP_TYPE])); return NULL; } @@ -775,16 +953,18 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr } #endif - next: - len -= NLMSG_ALIGN(nh->rtnh_len); - nh = RTNH_NEXT(nh); + rv = NEXTHOP_NEXT(rv); } + /* Store final length */ + nhad->ad.length = (void *) rv - (void *) nhad->ad.data; + /* Ensure nexthops are sorted to satisfy nest invariant */ - if (!nexthop_is_sorted(first)) - first = nexthop_sort(first); + return nexthop_is_sorted(nhad) ? nhad : nexthop_sort(nhad, s->pool); - return first; +err: + log(L_ERR "KRT: Received strange multipath route %N", n); + return NULL; } static void @@ -1139,7 +1319,7 @@ kif_do_scan(struct kif_proto *p UNUSED) if_start_update(); - nl_request_dump(AF_UNSPEC, RTM_GETLINK); + nl_request_dump_link(); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWLINK || h->nlmsg_type == RTM_DELLINK) nl_parse_link(h, 1); @@ -1166,14 +1346,14 @@ kif_do_scan(struct kif_proto *p UNUSED) } } - nl_request_dump(AF_INET, RTM_GETADDR); + nl_request_dump_addr(AF_INET); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR) nl_parse_addr(h, 1); else log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type); - nl_request_dump(AF_INET6, RTM_GETADDR); + nl_request_dump_addr(AF_INET6); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR) nl_parse_addr(h, 1); @@ -1208,11 +1388,16 @@ HASH_DEFINE_REHASH_FN(RTH, struct krt_proto) int krt_capable(rte *e) { - rta *a = e->attrs; + eattr *ea = ea_find(e->attrs, &ea_gen_nexthop); + if (!ea) + return 0; + + struct nexthop_adata *nhad = (void *) ea->u.ptr; + if (NEXTHOP_IS_REACHABLE(nhad)) + return 1; - switch (a->dest) + switch (nhad->dest) { - case RTD_UNICAST: case RTD_BLACKHOLE: case RTD_UNREACHABLE: case RTD_PROHIBIT: @@ -1224,21 +1409,24 @@ krt_capable(rte *e) } static inline int -nh_bufsize(struct nexthop *nh) +nh_bufsize(struct nexthop_adata *nhad) { int rv = 0; - for (; nh != NULL; nh = nh->next) + NEXTHOP_WALK(nh, nhad) rv += RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr))); return rv; } static int -nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nexthop *nh) +nl_send_route(struct krt_proto *p, const rte *e, int op) { eattr *ea; - rta *a = e->attrs; - ea_list *eattrs = a->eattrs; - int bufsize = 128 + KRT_METRICS_MAX*8 + nh_bufsize(&(a->nh)); + ea_list *eattrs = e->attrs; + eattr *nhea = ea_find(eattrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); + + int bufsize = 128 + KRT_METRICS_MAX*8 + (nh ? nh_bufsize(nh) : 0); u32 priority = 0; struct { @@ -1306,7 +1494,7 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho priority = 0; else if (KRT_CF->sys.metric) priority = KRT_CF->sys.metric; - else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC))) + else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, &ea_krt_metric))) priority = ea->u.data; if (priority) @@ -1314,20 +1502,22 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho /* For route delete, we do not specify remaining route attributes */ if (op == NL_OP_DELETE) - goto dest; + goto done; /* Default scope is LINK for device routes, UNIVERSE otherwise */ if (p->af == AF_MPLS) r->r.rtm_scope = RT_SCOPE_UNIVERSE; - else if (ea = ea_find(eattrs, EA_KRT_SCOPE)) + else if (ea = ea_find(eattrs, &ea_krt_scope)) r->r.rtm_scope = ea->u.data; + else if (dest == RTD_UNICAST && ipa_zero(nh->nh.gw)) + r->r.rtm_scope = RT_SCOPE_LINK; else - r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; + r->r.rtm_scope = RT_SCOPE_UNIVERSE; - if (ea = ea_find(eattrs, EA_KRT_PREFSRC)) + if (ea = ea_find(eattrs, &ea_krt_prefsrc)) nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data); - if (ea = ea_find(eattrs, EA_KRT_REALM)) + if (ea = ea_find(eattrs, &ea_krt_realm)) nl_add_attr_u32(&r->h, rsize, RTA_FLOW, ea->u.data); @@ -1335,9 +1525,9 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho metrics[0] = 0; struct ea_walk_state ews = { .eattrs = eattrs }; - while (ea = ea_walk(&ews, EA_KRT_METRICS, KRT_METRICS_MAX)) + while (ea = ea_walk(&ews, ea_krt_metrics[0].id, KRT_METRICS_MAX)) { - int id = ea->id - EA_KRT_METRICS; + int id = ea->id - ea_krt_metrics[0].id; metrics[0] |= 1 << id; metrics[id] = ea->u.data; } @@ -1345,20 +1535,18 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho if (metrics[0]) nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX); - -dest: switch (dest) { case RTD_UNICAST: r->r.rtm_type = RTN_UNICAST; - if (nh->next && !krt_ecmp6(p)) + if (!NEXTHOP_ONE(nh)) nl_add_multipath(&r->h, rsize, nh, p->af, eattrs); else { - nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->iface->index); - nl_add_nexthop(&r->h, rsize, nh, p->af); + nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->nh.iface->index); + nl_add_nexthop(&r->h, rsize, &nh->nh, p->af); - if (nh->flags & RNF_ONLINK) + if (nh->nh.flags & RNF_ONLINK) r->r.rtm_flags |= RTNH_F_ONLINK; } break; @@ -1377,82 +1565,56 @@ dest: bug("krt_capable inconsistent with nl_send_route"); } +done: /* Ignore missing for DELETE */ return nl_exchange(&r->h, (op == NL_OP_DELETE)); } static inline int -nl_add_rte(struct krt_proto *p, rte *e) +nl_allow_replace(struct krt_proto *p, rte *new) { - rta *a = e->attrs; - int err = 0; - - if (krt_ecmp6(p) && a->nh.next) - { - struct nexthop *nh = &(a->nh); - - err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, nh); - if (err < 0) - return err; - - for (nh = nh->next; nh; nh = nh->next) - err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, nh); - - return err; - } - - return nl_send_route(p, e, NL_OP_ADD, a->dest, &(a->nh)); -} - -static inline int -nl_delete_rte(struct krt_proto *p, const rte *e) -{ - int err = 0; + /* + * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for + * matching rtm_protocol, but that is OK when dedicated priority is used. + * + * For IPv6, the NL_OP_REPLACE is still broken even in Linux 4.19 LTS + * (although it seems to be fixed in Linux 5.10 LTS) for sequence: + * + * ip route add 2001:db8::/32 via fe80::1 dev eth0 + * ip route replace 2001:db8::/32 dev eth0 + * + * (it ends with two routes instead of replacing the first by the second one) + * + * Replacing with direct and special type (e.g. unreachable) routes does not + * work, but replacing with regular routes work reliably + */ - /* For IPv6, we just repeatedly request DELETE until we get error */ - do - err = nl_send_route(p, e, NL_OP_DELETE, RTD_NONE, NULL); - while (krt_ecmp6(p) && !err); + if (krt_ipv4(p)) + return 1; - return err; -} + eattr *nhea = ea_find(new->attrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); -static inline int -nl_replace_rte(struct krt_proto *p, rte *e) -{ - rta *a = e->attrs; - return nl_send_route(p, e, NL_OP_REPLACE, a->dest, &(a->nh)); + return (dest == RTD_UNICAST) && ipa_nonzero(nh->nh.gw); } - void krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const rte *old) { int err = 0; - /* - * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for - * matching rtm_protocol, but that is OK when dedicated priority is used. - * - * We do not use NL_OP_REPLACE for IPv6, as it has broken semantics for ECMP - * and with some kernel versions ECMP replace crashes kernel. Would need more - * testing and checks for kernel versions. - * - * For IPv6, we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the - * old route value, so we do not try to optimize IPv6 ECMP reconfigurations. - */ - - if (krt_ipv4(p) && old && new) + if (old && new && nl_allow_replace(p, new)) { - err = nl_replace_rte(p, new); + err = nl_send_route(p, new, NL_OP_REPLACE); } else { if (old) - nl_delete_rte(p, old); + nl_send_route(p, old, NL_OP_DELETE); if (new) - err = nl_add_rte(p, new); + err = nl_send_route(p, new, NL_OP_ADD); } if (new) @@ -1464,75 +1626,9 @@ krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const r } } -static int -nl_mergable_route(struct nl_parse_state *s, const net_addr *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family) -{ - /* Route merging is used for IPv6 scans */ - if (!s->scan || (rtm_family != AF_INET6)) - return 0; - - /* Saved and new route must have same network, proto/table, and priority */ - if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority)) - return 0; - - /* Both must be regular unicast routes */ - if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST)) - return 0; - - return 1; -} - -static void -nl_announce_route(struct nl_parse_state *s) -{ - rte e0 = { - .attrs = s->attrs, - .net = s->net, - }; - - ea_list *ea = alloca(sizeof(ea_list) + 2 * sizeof(eattr)); - *ea = (ea_list) { .count = 2, .next = e0.attrs->eattrs }; - e0.attrs->eattrs = ea; - - ea->attrs[0] = (eattr) { - .id = EA_KRT_SOURCE, - .type = EAF_TYPE_INT, - .u.data = s->krt_proto, - }; - ea->attrs[1] = (eattr) { - .id = EA_KRT_METRIC, - .type = EAF_TYPE_INT, - .u.data = s->krt_metric, - }; - - if (s->scan) - krt_got_route(s->proto, &e0, s->krt_src); - else - krt_got_route_async(s->proto, &e0, s->new, s->krt_src); - - s->net = NULL; - s->attrs = NULL; - s->proto = NULL; - lp_flush(s->pool); -} - -static inline void -nl_parse_begin(struct nl_parse_state *s, int scan) -{ - memset(s, 0, sizeof (struct nl_parse_state)); - s->pool = nl_linpool; - s->scan = scan; -} - -static inline void -nl_parse_end(struct nl_parse_state *s) -{ - if (s->net) - nl_announce_route(s); -} - -#define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0) +#define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0) +#define SKIP(ARG, ...) do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0) static void nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) @@ -1585,10 +1681,10 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) return; if (!a[RTA_DST]) - SKIP("MPLS route without RTA_DST"); + SKIP0("MPLS route without RTA_DST\n"); if (rta_get_mpls(a[RTA_DST], rta_mpls_stack) != 1) - SKIP("MPLS route with multi-label RTA_DST"); + SKIP0("MPLS route with multi-label RTA_DST\n"); net_fill_mpls(&dst, rta_mpls_stack[0]); break; @@ -1606,6 +1702,9 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) else table_id = i->rtm_table; + if (i->rtm_flags & RTM_F_CLONED) + SKIP("cloned\n"); + /* Do we know this table? */ p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id); if (!p) @@ -1662,83 +1761,112 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) net6_prefix(&src), net6_pxlen(&src)); } - if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type, i->rtm_family)) - nl_announce_route(s); - - rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE); - ra->source = RTS_INHERIT; - ra->scope = SCOPE_UNIVERSE; + ea_list *ra = NULL; + ea_set_attr_u32(&ra, &ea_gen_source, 0, RTS_INHERIT); + ea_set_attr_u32(&ra, &ea_krt_source, 0, i->rtm_protocol); + ea_set_attr_u32(&ra, &ea_krt_metric, 0, priority); if (a[RTA_FLOW]) s->rta_flow = rta_get_u32(a[RTA_FLOW]); else s->rta_flow = 0; + union { + struct { + struct adata ad; + struct nexthop nh; + u32 labels[MPLS_MAX_LABEL_STACK]; + }; + struct nexthop_adata nhad; + } nhad = {}; + switch (i->rtm_type) { case RTN_UNICAST: - ra->dest = RTD_UNICAST; - if (a[RTA_MULTIPATH]) { - struct nexthop *nh = nl_parse_multipath(s, p, a[RTA_MULTIPATH], i->rtm_family); + struct nexthop_adata *nh = nl_parse_multipath(s, p, net, a[RTA_MULTIPATH], i->rtm_family, krt_src); if (!nh) - { - log(L_ERR "KRT: Received strange multipath route %N", net); - return; - } + SKIP("strange RTA_MULTIPATH\n"); - nexthop_link(ra, nh); + ea_set_attr(&ra, EA_LITERAL_DIRECT_ADATA( + &ea_gen_nexthop, 0, &nh->ad)); break; } - if (i->rtm_flags & RTNH_F_DEAD) - return; + if ((i->rtm_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) + SKIP("ignore RTNH_F_DEAD\n"); - ra->nh.iface = if_find_by_index(oif); - if (!ra->nh.iface) + nhad.nh.iface = if_find_by_index(oif); + if (!nhad.nh.iface) { log(L_ERR "KRT: Received route %N with unknown ifindex %u", net, oif); return; } if (a[RTA_GATEWAY]) - ra->nh.gw = rta_get_ipa(a[RTA_GATEWAY]); + nhad.nh.gw = rta_get_ipa(a[RTA_GATEWAY]); #ifdef HAVE_MPLS_KERNEL if (a[RTA_VIA]) - ra->nh.gw = rta_get_via(a[RTA_VIA]); + nhad.nh.gw = rta_get_via(a[RTA_VIA]); #endif - if (ipa_nonzero(ra->nh.gw)) + if (ipa_nonzero(nhad.nh.gw)) { /* Silently skip strange 6to4 routes */ const net_addr_ip6 sit = NET_ADDR_IP6(IP6_NONE, 96); - if ((i->rtm_family == AF_INET6) && ipa_in_netX(ra->nh.gw, (net_addr *) &sit)) + if ((i->rtm_family == AF_INET6) && ipa_in_netX(nhad.nh.gw, (net_addr *) &sit)) return; if (i->rtm_flags & RTNH_F_ONLINK) - ra->nh.flags |= RNF_ONLINK; + nhad.nh.flags |= RNF_ONLINK; neighbor *nbr; - nbr = neigh_find(&p->p, ra->nh.gw, ra->nh.iface, - (ra->nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); + nbr = neigh_find(&p->p, nhad.nh.gw, nhad.nh.iface, + (nhad.nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); if (!nbr || (nbr->scope == SCOPE_HOST)) { - log(L_ERR "KRT: Received route %N with strange next-hop %I", net, ra->nh.gw); + log(L_ERR "KRT: Received route %N with strange next-hop %I", net, + nhad.nh.gw); return; } } +#ifdef HAVE_MPLS_KERNEL + if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !a[RTA_MULTIPATH]) + nhad.nh.labels = rta_get_mpls(a[RTA_NEWDST], nhad.nh.label); + + if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !a[RTA_MULTIPATH]) + { + switch (rta_get_u16(a[RTA_ENCAP_TYPE])) + { + case LWTUNNEL_ENCAP_MPLS: + { + struct rtattr *enca[BIRD_RTA_MAX]; + nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]); + nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca)); + nhad.nh.labels = rta_get_mpls(enca[RTA_DST], nhad.nh.label); + break; + } + default: + SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE])); + break; + } + } +#endif + + /* Finalize the nexthop */ + nhad.ad.length = (void *) NEXTHOP_NEXT(&nhad.nh) - (void *) nhad.ad.data; break; case RTN_BLACKHOLE: - ra->dest = RTD_BLACKHOLE; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_BLACKHOLE); break; case RTN_UNREACHABLE: - ra->dest = RTD_UNREACHABLE; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_UNREACHABLE); break; case RTN_PROHIBIT: - ra->dest = RTD_PROHIBIT; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_PROHIBIT); break; /* FIXME: What about RTN_THROW? */ default: @@ -1746,163 +1874,77 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) return; } -#ifdef HAVE_MPLS_KERNEL - if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !ra->nh.next) - ra->nh.labels = rta_get_mpls(a[RTA_NEWDST], ra->nh.label); - - if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !ra->nh.next) - { - switch (rta_get_u16(a[RTA_ENCAP_TYPE])) - { - case LWTUNNEL_ENCAP_MPLS: - { - struct rtattr *enca[BIRD_RTA_MAX]; - nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]); - nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca)); - ra->nh.labels = rta_get_mpls(enca[RTA_DST], ra->nh.label); - break; - } - default: - SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE])); - break; - } - } -#endif + if (nhad.ad.length) + ea_set_attr(&ra, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhad.ad)); if (i->rtm_scope != def_scope) - { - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = ra->eattrs; - ra->eattrs = ea; - ea->flags = EALF_SORTED; - ea->count = 1; - ea->attrs[0].id = EA_KRT_SCOPE; - ea->attrs[0].flags = 0; - ea->attrs[0].type = EAF_TYPE_INT; - ea->attrs[0].u.data = i->rtm_scope; - } + ea_set_attr(&ra, + EA_LITERAL_EMBEDDED(&ea_krt_scope, 0, i->rtm_scope)); if (a[RTA_PREFSRC]) - { - ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]); - - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = ra->eattrs; - ra->eattrs = ea; - ea->flags = EALF_SORTED; - ea->count = 1; - ea->attrs[0].id = EA_KRT_PREFSRC; - ea->attrs[0].flags = 0; - ea->attrs[0].type = EAF_TYPE_IP_ADDRESS; - - struct adata *ad = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps)); - ad->length = sizeof(ps); - memcpy(ad->data, &ps, sizeof(ps)); - - ea->attrs[0].u.ptr = ad; - } + { + ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]); + + ea_set_attr(&ra, + EA_LITERAL_STORE_ADATA(&ea_krt_prefsrc, 0, &ps, sizeof(ps))); + } /* Can be set per-route or per-nexthop */ if (s->rta_flow) - { - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = ra->eattrs; - ra->eattrs = ea; - ea->flags = EALF_SORTED; - ea->count = 1; - ea->attrs[0].id = EA_KRT_REALM; - ea->attrs[0].flags = 0; - ea->attrs[0].type = EAF_TYPE_INT; - ea->attrs[0].u.data = s->rta_flow; - } + ea_set_attr(&ra, + EA_LITERAL_EMBEDDED(&ea_krt_realm, 0, s->rta_flow)); if (a[RTA_METRICS]) { u32 metrics[KRT_METRICS_MAX]; - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr)); - int t, n = 0; - if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0) { log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net); return; } - for (t = 1; t < KRT_METRICS_MAX; t++) + for (uint t = 1; t < KRT_METRICS_MAX; t++) if (metrics[0] & (1 << t)) - { - ea->attrs[n].id = EA_CODE(PROTOCOL_KERNEL, KRT_METRICS_OFFSET + t); - ea->attrs[n].flags = 0; - ea->attrs[n].type = EAF_TYPE_INT; /* FIXME: Some are EAF_TYPE_BITFIELD */ - ea->attrs[n].u.data = metrics[t]; - n++; - } - - if (n > 0) - { - ea->next = ra->eattrs; - ea->flags = EALF_SORTED; - ea->count = n; - ra->eattrs = ea; - } + ea_set_attr(&ra, + EA_LITERAL_EMBEDDED(&ea_krt_metrics[t], 0, metrics[t])); } - /* - * Ideally, now we would send the received route to the rest of kernel code. - * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we - * postpone it and merge next hops until the end of the sequence. Note that - * when doing merging of next hops, we expect the new route to be unipath. - * Otherwise, we ignore additional next hops in nexthop_insert(). - */ + rte e0 = { + .net = net, + .attrs = ra, + }; - if (!s->net) - { - /* Store the new route */ - s->net = lp_alloc(s->pool, net->length); - net_copy(s->net, net); - - s->attrs = ra; - s->proto = p; - s->new = new; - s->krt_src = krt_src; - s->krt_type = i->rtm_type; - s->krt_proto = i->rtm_protocol; - s->krt_metric = priority; - } + if (s->scan) + krt_got_route(p, &e0, krt_src); else - { - /* Merge next hops with the stored route */ - rta *oa = s->attrs; + krt_got_route_async(p, &e0, new, krt_src); - struct nexthop *nhs = &oa->nh; - nexthop_insert(&nhs, &ra->nh); - - /* Perhaps new nexthop is inserted at the first position */ - if (nhs == &ra->nh) - { - /* Swap rtas */ - s->attrs = ra; - - /* Keep old eattrs */ - ra->eattrs = oa->eattrs; - } - } + lp_flush(s->pool); } void -krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */ +krt_do_scan(struct krt_proto *p) { - struct nlmsghdr *h; - struct nl_parse_state s; + struct nl_parse_state s = { + .proto = p, + .pool = nl_linpool, + .scan = 1, + }; - nl_parse_begin(&s, 1); - nl_request_dump(AF_UNSPEC, RTM_GETROUTE); + /* Table-specific scan or shared scan */ + if (p) + nl_request_dump_route(p->af, krt_table_id(p)); + else + nl_request_dump_route(AF_UNSPEC, 0); + + struct nlmsghdr *h; while (h = nl_get_scan()) + { if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) nl_parse_route(&s, h); else log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); - nl_parse_end(&s); + } } /* @@ -1911,20 +1953,24 @@ krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NUL static sock *nl_async_sk; /* BIRD socket for asynchronous notifications */ static byte *nl_async_rx_buffer; /* Receive buffer */ +static uint nl_async_bufsize; /* Kernel rx buffer size for the netlink socket */ +static struct config *nl_last_config; /* For tracking changes to nl_async_bufsize */ static void nl_async_msg(struct nlmsghdr *h) { - struct nl_parse_state s; + struct nl_parse_state s = { + .proto = NULL, + .pool = nl_linpool, + .scan = 0, + }; switch (h->nlmsg_type) { case RTM_NEWROUTE: case RTM_DELROUTE: DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type); - nl_parse_begin(&s, 0); nl_parse_route(&s, h); - nl_parse_end(&s); break; case RTM_NEWLINK: case RTM_DELLINK: @@ -2046,6 +2092,32 @@ nl_open_async(void) bug("Netlink: sk_open failed"); } +static void +nl_update_async_bufsize(void) +{ + /* No async socket */ + if (!nl_async_sk) + return; + + /* Already reconfigured */ + if (nl_last_config == config) + return; + + /* Update netlink buffer size */ + uint bufsize = nl_cfg_rx_buffer_size(config); + if (bufsize && (bufsize != nl_async_bufsize)) + { + /* Log message for reconfigurations only */ + if (nl_last_config) + log(L_INFO "KRT: Changing netlink rx buffer size to %u", bufsize); + + nl_set_rcvbuf(nl_async_sk->fd, bufsize); + nl_async_bufsize = bufsize; + } + + nl_last_config = config; +} + /* * Interface to the UNIX krt module @@ -2056,6 +2128,8 @@ krt_sys_io_init(void) { nl_linpool = lp_new_default(krt_pool); HASH_INIT(nl_table_map, krt_pool, 6); + + nl_ea_register(); } int @@ -2074,6 +2148,7 @@ krt_sys_start(struct krt_proto *p) nl_open(); nl_open_async(); + nl_update_async_bufsize(); return 1; } @@ -2081,12 +2156,16 @@ krt_sys_start(struct krt_proto *p) void krt_sys_shutdown(struct krt_proto *p) { + nl_update_async_bufsize(); + HASH_REMOVE2(nl_table_map, RTH, krt_pool, p); } int krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt_config *o) { + nl_update_async_bufsize(); + return (n->sys.table_id == o->sys.table_id) && (n->sys.metric == o->sys.metric); } @@ -2104,56 +2183,6 @@ krt_sys_copy_config(struct krt_config *d, struct krt_config *s) d->sys.metric = s->sys.metric; } -static const char *krt_metrics_names[KRT_METRICS_MAX] = { - NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss", - "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack" -}; - -static const char *krt_features_names[KRT_FEATURES_MAX] = { - "ecn", NULL, NULL, "allfrag" -}; - -int -krt_sys_get_attr(const eattr *a, byte *buf, int buflen UNUSED) -{ - switch (a->id) - { - case EA_KRT_PREFSRC: - bsprintf(buf, "prefsrc"); - return GA_NAME; - - case EA_KRT_REALM: - bsprintf(buf, "realm"); - return GA_NAME; - - case EA_KRT_SCOPE: - bsprintf(buf, "scope"); - return GA_NAME; - - case EA_KRT_LOCK: - buf += bsprintf(buf, "lock:"); - ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX); - return GA_FULL; - - case EA_KRT_FEATURES: - buf += bsprintf(buf, "features:"); - ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX); - return GA_FULL; - - default:; - int id = (int)EA_ID(a->id) - KRT_METRICS_OFFSET; - if (id > 0 && id < KRT_METRICS_MAX) - { - bsprintf(buf, "%s", krt_metrics_names[id]); - return GA_NAME; - } - - return GA_UNKNOWN; - } -} - - - void kif_sys_start(struct kif_proto *p UNUSED) { diff --git a/sysdep/linux/sysio.h b/sysdep/linux/sysio.h index e21ff487..f13eda7c 100644 --- a/sysdep/linux/sysio.h +++ b/sysdep/linux/sysio.h @@ -10,6 +10,10 @@ #define IPV6_MINHOPCOUNT 73 #endif +#ifndef IPV6_FREEBIND +#define IPV6_FREEBIND 78 +#endif + #ifndef TCP_MD5SIG_EXT #define TCP_MD5SIG_EXT 32 #endif @@ -266,3 +270,18 @@ sk_set_priority(sock *s, int prio) return 0; } +static inline int +sk_set_freebind(sock *s) +{ + int y = 1; + + if (sk_is_ipv4(s)) + if (setsockopt(s->fd, SOL_IP, IP_FREEBIND, &y, sizeof(y)) < 0) + ERR("IP_FREEBIND"); + + if (sk_is_ipv6(s)) + if (setsockopt(s->fd, SOL_IPV6, IPV6_FREEBIND, &y, sizeof(y)) < 0) + ERR("IPV6_FREEBIND"); + + return 0; +} diff --git a/sysdep/unix/Makefile b/sysdep/unix/Makefile index 07f454ab..6f6b0d26 100644 --- a/sysdep/unix/Makefile +++ b/sysdep/unix/Makefile @@ -1,4 +1,4 @@ -src := alloc.c io.c io-loop.c krt.c log.c main.c random.c coroutine.c +src := alloc.c io.c io-loop.c krt.c log.c main.c random.c domain.c obj := $(src-o-files) $(all-daemon) $(cf-local) diff --git a/sysdep/unix/alloc.c b/sysdep/unix/alloc.c index 4ae1a9db..847def30 100644 --- a/sysdep/unix/alloc.c +++ b/sysdep/unix/alloc.c @@ -8,130 +8,242 @@ #include "nest/bird.h" #include "lib/resource.h" +#include "lib/lists.h" +#include "lib/event.h" +#include "lib/rcu.h" +#include <errno.h> #include <stdlib.h> #include <unistd.h> -#include <stdatomic.h> -#include <errno.h> #ifdef HAVE_MMAP #include <sys/mman.h> #endif long page_size = 0; -_Bool alloc_multipage = 0; -static _Atomic int global_page_list_not_empty; -static list global_page_list; -static _Atomic int global_page_spinlock; +#ifdef HAVE_MMAP +#define KEEP_PAGES_MAX 512 +#define KEEP_PAGES_MIN 32 +#define KEEP_PAGES_MAX_LOCAL 16 +#define ALLOC_PAGES_AT_ONCE 8 -#define GLOBAL_PAGE_SPIN_LOCK for (int v = 0; !atomic_compare_exchange_weak_explicit(&global_page_spinlock, &v, 1, memory_order_acq_rel, memory_order_acquire); v = 0) -#define GLOBAL_PAGE_SPIN_UNLOCK do { int v = 1; ASSERT_DIE(atomic_compare_exchange_strong_explicit(&global_page_spinlock, &v, 0, memory_order_acq_rel, memory_order_acquire)); } while (0) +STATIC_ASSERT(KEEP_PAGES_MIN * 4 < KEEP_PAGES_MAX); +STATIC_ASSERT(ALLOC_PAGES_AT_ONCE < KEEP_PAGES_MAX_LOCAL); -#ifdef HAVE_MMAP static _Bool use_fake = 0; +static _Bool initialized = 0; + +#if DEBUGGING +struct free_page { + node unused[42]; + struct free_page * _Atomic next; +}; #else -static _Bool use_fake = 1; +struct free_page { + struct free_page * _Atomic next; +}; #endif -void resource_sys_init(void) +static struct free_page * _Atomic page_stack = NULL; +static _Thread_local struct free_page * local_page_stack = NULL; + +static void page_cleanup(void *); +static event page_cleanup_event = { .hook = page_cleanup, }; +#define SCHEDULE_CLEANUP do if (initialized && !shutting_down) ev_send(&global_event_list, &page_cleanup_event); while (0) + +_Atomic int pages_kept = 0; +_Atomic int pages_kept_locally = 0; +static int pages_kept_here = 0; + +static void * +alloc_sys_page(void) { -#ifdef HAVE_MMAP - init_list(&global_page_list); + void *ptr = mmap(NULL, page_size * ALLOC_PAGES_AT_ONCE, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (!(page_size = sysconf(_SC_PAGESIZE))) - die("System page size must be non-zero"); + if (ptr == MAP_FAILED) + bug("mmap(%lu) failed: %m", page_size); + + return ptr; +} + +extern int shutting_down; /* Shutdown requested. */ - if ((u64_popcount(page_size) > 1) || (page_size > 16384)) +#else // ! HAVE_MMAP +#define use_fake 1 #endif + +void * +alloc_page(void) +{ + if (use_fake) { - /* Too big or strange page, use the aligned allocator instead */ - page_size = 4096; - use_fake = 1; + void *ptr = NULL; + int err = posix_memalign(&ptr, page_size, page_size); + + if (err || !ptr) + bug("posix_memalign(%lu) failed", (long unsigned int) page_size); + + return ptr; + } + +#ifdef HAVE_MMAP + struct free_page *fp = local_page_stack; + if (fp) + { + local_page_stack = atomic_load_explicit(&fp->next, memory_order_acquire); + atomic_fetch_sub_explicit(&pages_kept_locally, 1, memory_order_relaxed); + pages_kept_here--; + return fp; + } + + rcu_read_lock(); + fp = atomic_load_explicit(&page_stack, memory_order_acquire); + while (fp && !atomic_compare_exchange_strong_explicit( + &page_stack, &fp, atomic_load_explicit(&fp->next, memory_order_acquire), + memory_order_acq_rel, memory_order_acquire)) + ; + rcu_read_unlock(); + + if (!fp) + { + void *ptr = alloc_sys_page(); + for (int i=1; i<ALLOC_PAGES_AT_ONCE; i++) + free_page(ptr + page_size * i); + return ptr; } + + atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed); + return fp; +#endif } -void * -alloc_sys_page(void) +void +free_page(void *ptr) { + if (use_fake) + { + free(ptr); + return; + } + #ifdef HAVE_MMAP - if (!use_fake) + struct free_page *fp = ptr; + if (shutting_down || (pages_kept_here < KEEP_PAGES_MAX_LOCAL)) { - if (atomic_load_explicit(&global_page_list_not_empty, memory_order_relaxed)) - { - GLOBAL_PAGE_SPIN_LOCK; - if (!EMPTY_LIST(global_page_list)) - { - node *ret = HEAD(global_page_list); - rem_node(ret); - if (EMPTY_LIST(global_page_list)) - atomic_store_explicit(&global_page_list_not_empty, 0, memory_order_relaxed); - GLOBAL_PAGE_SPIN_UNLOCK; - memset(ret, 0, sizeof(node)); - return (void *) ret; - } - GLOBAL_PAGE_SPIN_UNLOCK; - } - - if (alloc_multipage) - { - void *big = mmap(NULL, page_size * 2, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (big == MAP_FAILED) - bug("mmap(%lu) failed: %m", page_size); - - uintptr_t offset = ((uintptr_t) big) % page_size; - if (offset) - { - void *ret = big + page_size - offset; - munmap(big, page_size - offset); - munmap(ret + page_size, offset); - return ret; - } - else - { - munmap(big + page_size, page_size); - return big; - } - } - - void *ret = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (ret == MAP_FAILED) - bug("mmap(%lu) failed: %m", page_size); - - return ret; + atomic_store_explicit(&fp->next, local_page_stack, memory_order_relaxed); + atomic_fetch_add_explicit(&pages_kept_locally, 1, memory_order_relaxed); + pages_kept_here++; + return; } - else + + rcu_read_lock(); + struct free_page *next = atomic_load_explicit(&page_stack, memory_order_acquire); + + do atomic_store_explicit(&fp->next, next, memory_order_release); + while (!atomic_compare_exchange_strong_explicit( + &page_stack, &next, fp, + memory_order_acq_rel, memory_order_acquire)); + rcu_read_unlock(); + + if (atomic_fetch_add_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX) + SCHEDULE_CLEANUP; #endif +} + +void +flush_local_pages(void) +{ + if (use_fake || !local_page_stack || shutting_down) + return; + + struct free_page *last = local_page_stack, *next; + int check_count = 1; + while (next = atomic_load_explicit(&last->next, memory_order_acquire)) + { + check_count++; + last = next; + } + + ASSERT_DIE(check_count == pages_kept_here); + + rcu_read_lock(); + next = atomic_load_explicit(&page_stack, memory_order_acquire); + + do atomic_store_explicit(&last->next, next, memory_order_release); + while (!atomic_compare_exchange_strong_explicit( + &page_stack, &next, local_page_stack, + memory_order_acq_rel, memory_order_acquire)); + rcu_read_unlock(); + + local_page_stack = NULL; + pages_kept_here = 0; + + atomic_fetch_sub_explicit(&pages_kept_locally, check_count, memory_order_relaxed); + if (atomic_fetch_add_explicit(&pages_kept, check_count, memory_order_relaxed) >= KEEP_PAGES_MAX) + SCHEDULE_CLEANUP; +} + +#ifdef HAVE_MMAP +static void +page_cleanup(void *_ UNUSED) +{ + if (shutting_down) + return; + + struct free_page *stack = atomic_exchange_explicit(&page_stack, NULL, memory_order_acq_rel); + if (!stack) + return; + + synchronize_rcu(); + + do { + struct free_page *f = stack; + stack = atomic_load_explicit(&f->next, memory_order_acquire); + + if (munmap(f, page_size) == 0) + continue; + else if (errno != ENOMEM) + bug("munmap(%p) failed: %m", f); + else + free_page(f); + } + while ((atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX / 2) && stack); + + while (stack) { - void *ret = aligned_alloc(page_size, page_size); - if (!ret) - bug("aligned_alloc(%lu) failed", page_size); - return ret; + struct free_page *f = stack; + stack = atomic_load_explicit(&f->next, memory_order_acquire); + free_page(f); + + atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed); } } +#endif void -free_sys_page(void *ptr) +resource_sys_init(void) { #ifdef HAVE_MMAP - if (!use_fake) + if (!(page_size = sysconf(_SC_PAGESIZE))) + die("System page size must be non-zero"); + + if (u64_popcount(page_size) == 1) { - if (munmap(ptr, page_size) < 0) -#ifdef ENOMEM - if (errno == ENOMEM) - { - memset(ptr, 0, page_size); - - GLOBAL_PAGE_SPIN_LOCK; - add_tail(&global_page_list, (node *) ptr); - atomic_store_explicit(&global_page_list_not_empty, 1, memory_order_relaxed); - GLOBAL_PAGE_SPIN_UNLOCK; - } - else -#endif - bug("munmap(%p) failed: %m", ptr); + + for (int i = 0; i < (KEEP_PAGES_MIN * 2); i++) + free_page(alloc_page()); + + page_cleanup(NULL); + initialized = 1; + return; } - else + + /* Too big or strange page, use the aligned allocator instead */ + log(L_WARN "Got strange memory page size (%lu), using the aligned allocator instead", page_size); + use_fake = 1; #endif - free(ptr); + + page_size = 4096; + initialized = 1; } diff --git a/sysdep/unix/coroutine.c b/sysdep/unix/domain.c index 12ba55d8..0a5858a6 100644 --- a/sysdep/unix/coroutine.c +++ b/sysdep/unix/domain.c @@ -1,7 +1,6 @@ /* - * BIRD Coroutines + * BIRD Locking * - * (c) 2017 Martin Mares <mj@ucw.cz> * (c) 2020 Maria Matejka <mq@jmq.cz> * * Can be freely distributed and used under the terms of the GNU GPL. @@ -17,19 +16,11 @@ #include "lib/birdlib.h" #include "lib/locking.h" -#include "lib/coro.h" -#include "lib/rcu.h" #include "lib/resource.h" #include "lib/timer.h" #include "conf/conf.h" -#define CORO_STACK_SIZE 65536 - -/* - * Implementation of coroutines based on POSIX threads - */ - #include <errno.h> #include <fcntl.h> #include <poll.h> @@ -123,80 +114,3 @@ void do_unlock(struct domain_generic *dg, struct domain_generic **lsp) dg->prev = NULL; pthread_mutex_unlock(&dg->mutex); } - -/* Coroutines */ -struct coroutine { - resource r; - pthread_t id; - pthread_attr_t attr; - struct rcu_coro rcu; - void (*entry)(void *); - void *data; -}; - -static _Thread_local _Bool coro_cleaned_up = 0; - -static void coro_free(resource *r) -{ - struct coroutine *c = (void *) r; - rcu_coro_stop(&c->rcu); - ASSERT_DIE(pthread_equal(pthread_self(), c->id)); - pthread_attr_destroy(&c->attr); - coro_cleaned_up = 1; -} - -static struct resclass coro_class = { - .name = "Coroutine", - .size = sizeof(struct coroutine), - .free = coro_free, -}; - -_Thread_local struct coroutine *this_coro = NULL; - -static void *coro_entry(void *p) -{ - struct coroutine *c = p; - - ASSERT_DIE(c->entry); - - this_coro = c; - rcu_coro_start(&c->rcu); - - c->entry(c->data); - ASSERT_DIE(coro_cleaned_up); - - return NULL; -} - -struct coroutine *coro_run(pool *p, void (*entry)(void *), void *data) -{ - ASSERT_DIE(entry); - ASSERT_DIE(p); - - struct coroutine *c = ralloc(p, &coro_class); - - c->entry = entry; - c->data = data; - - int e = 0; - - if (e = pthread_attr_init(&c->attr)) - die("pthread_attr_init() failed: %M", e); - - if (e = pthread_attr_setstacksize(&c->attr, CORO_STACK_SIZE)) - die("pthread_attr_setstacksize(%u) failed: %M", CORO_STACK_SIZE, e); - - if (e = pthread_attr_setdetachstate(&c->attr, PTHREAD_CREATE_DETACHED)) - die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e); - - if (e = pthread_create(&c->id, &c->attr, coro_entry, c)) - die("pthread_create() failed: %M", e); - - return c; -} - -void -coro_yield(void) -{ - usleep(100); -} diff --git a/sysdep/unix/io-loop.c b/sysdep/unix/io-loop.c index c7cf4ad2..5ce2d350 100644 --- a/sysdep/unix/io-loop.c +++ b/sysdep/unix/io-loop.c @@ -17,7 +17,6 @@ #include "nest/bird.h" #include "lib/buffer.h" -#include "lib/coro.h" #include "lib/lists.h" #include "lib/resource.h" #include "lib/event.h" @@ -28,6 +27,8 @@ #include "sysdep/unix/io-loop.h" #include "conf/conf.h" +#define THREAD_STACK_SIZE 65536 /* To be lowered in near future */ + /* * Current thread context */ @@ -58,85 +59,144 @@ birdloop_inside(struct birdloop *loop) return 0; } +void +birdloop_flag(struct birdloop *loop, u32 flag) +{ + atomic_fetch_or_explicit(&loop->flags, flag, memory_order_acq_rel); + birdloop_ping(loop); +} + +void +birdloop_flag_set_handler(struct birdloop *loop, struct birdloop_flag_handler *fh) +{ + ASSERT_DIE(birdloop_inside(loop)); + loop->flag_handler = fh; +} + +static int +birdloop_process_flags(struct birdloop *loop) +{ + if (!loop->flag_handler) + return 0; + + u32 flags = atomic_exchange_explicit(&loop->flags, 0, memory_order_acq_rel); + loop->flag_handler->hook(loop->flag_handler, flags); + return !!flags; +} + +static int +birdloop_run_events(struct birdloop *loop) +{ + btime begin = current_time(); + while (current_time() - begin < 5 MS) + { + if (!ev_run_list(&loop->event_list)) + return 0; + + times_update(); + } + + return 1; +} + /* * Wakeup code for birdloop */ -static void -pipe_new(int *pfds) +void +pipe_new(struct pipe *p) { - int rv = pipe(pfds); + int rv = pipe(p->fd); if (rv < 0) die("pipe: %m"); - if (fcntl(pfds[0], F_SETFL, O_NONBLOCK) < 0) + if (fcntl(p->fd[0], F_SETFL, O_NONBLOCK) < 0) die("fcntl(O_NONBLOCK): %m"); - if (fcntl(pfds[1], F_SETFL, O_NONBLOCK) < 0) + if (fcntl(p->fd[1], F_SETFL, O_NONBLOCK) < 0) die("fcntl(O_NONBLOCK): %m"); } void -pipe_drain(int fd) +pipe_drain(struct pipe *p) { - char buf[64]; - int rv; - - try: - rv = read(fd, buf, 64); - if (rv < 0) - { - if (errno == EINTR) - goto try; - if (errno == EAGAIN) + while (1) { + char buf[64]; + int rv = read(p->fd[0], buf, sizeof(buf)); + if ((rv < 0) && (errno == EAGAIN)) return; - die("wakeup read: %m"); + + if (rv == 0) + bug("wakeup read eof"); + if ((rv < 0) && (errno != EINTR)) + bug("wakeup read: %m"); + } +} + +int +pipe_read_one(struct pipe *p) +{ + while (1) { + char v; + int rv = read(p->fd[0], &v, sizeof(v)); + if (rv == 1) + return 1; + if ((rv < 0) && (errno == EAGAIN)) + return 0; + if (rv > 1) + bug("wakeup read more bytes than expected: %d", rv); + if (rv == 0) + bug("wakeup read eof"); + if (errno != EINTR) + bug("wakeup read: %m"); } - if (rv == 64) - goto try; } void -pipe_kick(int fd) +pipe_kick(struct pipe *p) { - u64 v = 1; + char v = 1; int rv; - try: - rv = write(fd, &v, sizeof(u64)); - if (rv < 0) - { - if (errno == EINTR) - goto try; - if (errno == EAGAIN) + while (1) { + rv = write(p->fd[1], &v, sizeof(v)); + if ((rv >= 0) || (errno == EAGAIN)) return; - die("wakeup write: %m"); + if (errno != EINTR) + bug("wakeup write: %m"); } } +void +pipe_pollin(struct pipe *p, struct pollfd *pfd) +{ + pfd->fd = p->fd[0]; + pfd->events = POLLIN; + pfd->revents = 0; +} + static inline void wakeup_init(struct birdloop *loop) { - pipe_new(loop->wakeup_fds); + pipe_new(&loop->wakeup); } static inline void wakeup_drain(struct birdloop *loop) { - pipe_drain(loop->wakeup_fds[0]); + pipe_drain(&loop->wakeup); } static inline void wakeup_do_kick(struct birdloop *loop) { - pipe_kick(loop->wakeup_fds[1]); + pipe_kick(&loop->wakeup); } -void -birdloop_ping(struct birdloop *loop) +static inline void +birdloop_do_ping(struct birdloop *loop) { - u32 ping_sent = atomic_fetch_add_explicit(&loop->ping_sent, 1, memory_order_acq_rel); - if (ping_sent) + if (atomic_fetch_add_explicit(&loop->ping_sent, 1, memory_order_acq_rel)) return; if (loop == birdloop_wakeup_masked) @@ -145,6 +205,15 @@ birdloop_ping(struct birdloop *loop) wakeup_do_kick(loop); } +void +birdloop_ping(struct birdloop *loop) +{ + if (birdloop_inside(loop) && !loop->ping_pending) + loop->ping_pending++; + else + birdloop_do_ping(loop); +} + /* * Sockets @@ -183,6 +252,9 @@ sk_start(sock *s) static void sockets_remove(struct birdloop *loop, sock *s) { + if (!enlisted(&s->n)) + return; + rem_node(&s->n); loop->sock_num--; @@ -205,7 +277,7 @@ sk_stop(sock *s) } static inline uint sk_want_events(sock *s) -{ return ((s->rx_hook && !ev_corked(s->cork)) ? POLLIN : 0) | ((s->ttx != s->tpos) ? POLLOUT : 0); } +{ return (s->rx_hook ? POLLIN : 0) | ((s->ttx != s->tpos) ? POLLOUT : 0); } /* FIXME: this should be called from sock code @@ -250,9 +322,7 @@ sockets_prepare(struct birdloop *loop) /* Add internal wakeup fd */ *psk = NULL; - pfd->fd = loop->wakeup_fds[0]; - pfd->events = POLLIN; - pfd->revents = 0; + pipe_pollin(&loop->wakeup, pfd); loop->poll_changed = 0; } @@ -297,7 +367,7 @@ sockets_fire(struct birdloop *loop) continue; if (pfd->revents & POLLNVAL) - die("poll: invalid fd %d", pfd->fd); + bug("poll: invalid fd %d", pfd->fd); if (pfd->revents & POLLIN) while (e && *psk && (*psk)->rx_hook) @@ -336,7 +406,7 @@ birdloop_init(void) birdloop_enter_locked(&main_birdloop); } -static void birdloop_main(void *arg); +static void *birdloop_main(void *arg); struct birdloop * birdloop_new(pool *pp, uint order, const char *name) @@ -357,7 +427,19 @@ birdloop_new(pool *pp, uint order, const char *name) timers_init(&loop->time, p); sockets_init(loop); - loop->time.coro = coro_run(p, birdloop_main, loop); + int e = 0; + + if (e = pthread_attr_init(&loop->thread_attr)) + die("pthread_attr_init() failed: %M", e); + + if (e = pthread_attr_setstacksize(&loop->thread_attr, THREAD_STACK_SIZE)) + die("pthread_attr_setstacksize(%u) failed: %M", THREAD_STACK_SIZE, e); + + if (e = pthread_attr_setdetachstate(&loop->thread_attr, PTHREAD_CREATE_DETACHED)) + die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e); + + if (e = pthread_create(&loop->thread_id, &loop->thread_attr, birdloop_main, loop)) + die("pthread_create() failed: %M", e); birdloop_leave(loop); @@ -393,6 +475,11 @@ void birdloop_free(struct birdloop *loop) { ASSERT_DIE(loop->links == 0); + ASSERT_DIE(pthread_equal(pthread_self(), loop->thread_id)); + + rcu_birdloop_stop(&loop->rcu); + pthread_attr_destroy(&loop->thread_attr); + domain_free(loop->time.domain); rfree(loop->pool); } @@ -423,6 +510,13 @@ birdloop_leave_locked(struct birdloop *loop) /* Check the current context */ ASSERT_DIE(birdloop_current == loop); + /* Send pending pings */ + if (loop->ping_pending) + { + loop->ping_pending = 0; + birdloop_do_ping(loop); + } + /* Restore the old context */ birdloop_current = loop->prev_loop; } @@ -466,20 +560,24 @@ birdloop_unlink(struct birdloop *loop) loop->links--; } -static void +static void * birdloop_main(void *arg) { struct birdloop *loop = arg; timer *t; int rv, timeout; + rcu_birdloop_start(&loop->rcu); + btime loop_begin = current_time(); + tmp_init(loop->pool); + birdloop_enter(loop); while (1) { timers_fire(&loop->time, 0); - if (ev_run_list(&loop->event_list)) + if (birdloop_process_flags(loop) + birdloop_run_events(loop)) timeout = 0; else if (t = timers_first(&loop->time)) timeout = (tm_remains(t) TO_MS) + 1; @@ -488,6 +586,9 @@ birdloop_main(void *arg) if (loop->poll_changed) sockets_prepare(loop); + else + if ((timeout < 0) || (timeout > 5000)) + flush_local_pages(); btime duration = current_time() - loop_begin; if (duration > config->watchdog_warning) @@ -501,7 +602,7 @@ birdloop_main(void *arg) { if (errno == EINTR || errno == EAGAIN) goto try; - die("poll: %m"); + bug("poll: %m"); } birdloop_enter(loop); @@ -514,7 +615,7 @@ birdloop_main(void *arg) loop_begin = current_time(); - if (rv) + if (rv && !loop->poll_changed) sockets_fire(loop); atomic_exchange_explicit(&loop->ping_sent, 0, memory_order_acq_rel); @@ -523,13 +624,23 @@ birdloop_main(void *arg) /* Flush remaining events */ ASSERT_DIE(!ev_run_list(&loop->event_list)); - /* No timers allowed */ - ASSERT_DIE(timers_count(&loop->time) == 0); + /* Drop timers */ + while (t = timers_first(&loop->time)) + tm_stop(t); + + /* No sockets allowed */ ASSERT_DIE(EMPTY_LIST(loop->sock_list)); ASSERT_DIE(loop->sock_num == 0); birdloop_leave(loop); loop->stopped(loop->stop_data); -} + flush_local_pages(); + return NULL; +} +void +birdloop_yield(void) +{ + usleep(100); +} diff --git a/sysdep/unix/io-loop.h b/sysdep/unix/io-loop.h index 4024b6c5..29ca96d6 100644 --- a/sysdep/unix/io-loop.h +++ b/sysdep/unix/io-loop.h @@ -7,6 +7,18 @@ #ifndef _BIRD_SYSDEP_UNIX_IO_LOOP_H_ #define _BIRD_SYSDEP_UNIX_IO_LOOP_H_ +#include "lib/rcu.h" + +struct pipe +{ + int fd[2]; +}; + +void pipe_new(struct pipe *); +void pipe_pollin(struct pipe *, struct pollfd *); +void pipe_drain(struct pipe *); +void pipe_kick(struct pipe *); + struct birdloop { pool *pool; @@ -21,11 +33,20 @@ struct birdloop u8 poll_changed; u8 close_scheduled; + uint ping_pending; _Atomic u32 ping_sent; - int wakeup_fds[2]; + struct pipe wakeup; + + pthread_t thread_id; + pthread_attr_t thread_attr; + + struct rcu_birdloop rcu; uint links; + _Atomic u32 flags; + struct birdloop_flag_handler *flag_handler; + void (*stopped)(void *data); void *stop_data; diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index dd385c80..c91bd597 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -774,8 +774,7 @@ sk_ssh_free(sock *s) if (ssh->channel) { - if (ssh_channel_is_open(ssh->channel)) - ssh_channel_close(ssh->channel); + ssh_channel_close(ssh->channel); ssh_channel_free(ssh->channel); ssh->channel = NULL; } @@ -1152,34 +1151,45 @@ sk_ssh_connect(sock *s) { int server_identity_is_ok = 1; +#ifdef HAVE_SSH_OLD_SERVER_VALIDATION_API +#define ssh_session_is_known_server ssh_is_server_known +#define SSH_KNOWN_HOSTS_OK SSH_SERVER_KNOWN_OK +#define SSH_KNOWN_HOSTS_UNKNOWN SSH_SERVER_NOT_KNOWN +#define SSH_KNOWN_HOSTS_CHANGED SSH_SERVER_KNOWN_CHANGED +#define SSH_KNOWN_HOSTS_NOT_FOUND SSH_SERVER_FILE_NOT_FOUND +#define SSH_KNOWN_HOSTS_ERROR SSH_SERVER_ERROR +#define SSH_KNOWN_HOSTS_OTHER SSH_SERVER_FOUND_OTHER +#endif + /* Check server identity */ - switch (ssh_is_server_known(s->ssh->session)) + switch (ssh_session_is_known_server(s->ssh->session)) { #define LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s,msg,args...) log(L_WARN "SSH Identity %s@%s:%u: " msg, (s)->ssh->username, (s)->host, (s)->dport, ## args); - case SSH_SERVER_KNOWN_OK: + case SSH_KNOWN_HOSTS_OK: /* The server is known and has not changed. */ break; - case SSH_SERVER_NOT_KNOWN: + case SSH_KNOWN_HOSTS_UNKNOWN: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server is unknown, its public key was not found in the known host file %s", s->ssh->server_hostkey_path); + server_identity_is_ok = 0; break; - case SSH_SERVER_KNOWN_CHANGED: + case SSH_KNOWN_HOSTS_CHANGED: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server key has changed. Either you are under attack or the administrator changed the key."); server_identity_is_ok = 0; break; - case SSH_SERVER_FILE_NOT_FOUND: + case SSH_KNOWN_HOSTS_NOT_FOUND: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The known host file %s does not exist", s->ssh->server_hostkey_path); server_identity_is_ok = 0; break; - case SSH_SERVER_ERROR: + case SSH_KNOWN_HOSTS_ERROR: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "Some error happened"); server_identity_is_ok = 0; break; - case SSH_SERVER_FOUND_OTHER: + case SSH_KNOWN_HOSTS_OTHER: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server gave use a key of a type while we had an other type recorded. " \ "It is a possible attack."); server_identity_is_ok = 0; @@ -1435,6 +1445,10 @@ sk_open(sock *s) if (sk_set_high_port(s) < 0) log(L_WARN "Socket error: %s%#m", s->err); + if (s->flags & SKF_FREEBIND) + if (sk_set_freebind(s) < 0) + log(L_WARN "Socket error: %s%#m", s->err); + sockaddr_fill(&sa, s->af, bind_addr, s->iface, bind_port); if (bind(fd, &sa.sa, SA_LEN(sa)) < 0) ERR2("bind"); @@ -1879,8 +1893,8 @@ sk_read_ssh(sock *s) /* sk_read() and sk_write() are called from BFD's event loop */ -int -sk_read(sock *s, int revents) +static inline int +sk_read_noflush(sock *s, int revents) { switch (s->type) { @@ -1943,7 +1957,15 @@ sk_read(sock *s, int revents) } int -sk_write(sock *s) +sk_read(sock *s, int revents) +{ + int e = sk_read_noflush(s, revents); + tmp_flush(); + return e; +} + +static inline int +sk_write_noflush(sock *s) { switch (s->type) { @@ -1991,6 +2013,14 @@ sk_write(sock *s) } } +int +sk_write(sock *s) +{ + int e = sk_write_noflush(s); + tmp_flush(); + return e; +} + int sk_is_ipv4(sock *s) { return s->af == AF_INET; } @@ -2009,6 +2039,7 @@ sk_err(sock *s, int revents) } s->err_hook(s, se); + tmp_flush(); } void @@ -2058,8 +2089,8 @@ io_update_time(void) event_open->duration = last_io_time - event_open->timestamp; if (event_open->duration > config->latency_limit) - log(L_WARN "Event 0x%p 0x%p took %d ms", - event_open->hook, event_open->data, (int) (event_open->duration TO_MS)); + log(L_WARN "Event 0x%p 0x%p took %u.%03u ms", + event_open->hook, event_open->data, (uint) (event_open->duration TO_MS), (uint) (event_open->duration % 1000)); event_open = NULL; } @@ -2163,8 +2194,8 @@ watchdog_stop(void) btime duration = last_io_time - loop_time; if (duration > config->watchdog_warning) - log(L_WARN "I/O loop cycle took %d ms for %d events", - (int) (duration TO_MS), event_log_num); + log(L_WARN "I/O loop cycle took %u.%03u ms for %d events", + (uint) (duration TO_MS), (uint) (duration % 1000), event_log_num); } @@ -2192,8 +2223,6 @@ static int short_loops = 0; #define SHORT_LOOP_MAX 10 #define WORK_EVENTS_MAX 10 -void pipe_drain(int fd); - void io_loop(void) { @@ -2225,8 +2254,7 @@ io_loop(void) } /* A hack to reload main io_loop() when something has changed asynchronously. */ - pfd[0].fd = main_birdloop.wakeup_fds[0]; - pfd[0].events = POLLIN; + pipe_pollin(&main_birdloop.wakeup, &pfd[0]); nfds = 1; @@ -2234,7 +2262,7 @@ io_loop(void) { pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */ s = SKIP_BACK(sock, n, n); - if (s->rx_hook && !ev_corked(s->cork)) + if (s->rx_hook) { pfd[nfds].fd = s->fd; pfd[nfds].events |= POLLIN; @@ -2297,14 +2325,14 @@ io_loop(void) { if (errno == EINTR || errno == EAGAIN) continue; - die("poll: %m"); + bug("poll: %m"); } if (pout) { if (pfd[0].revents & POLLIN) { /* IO loop reload requested */ - pipe_drain(main_birdloop.wakeup_fds[0]); + pipe_drain(&main_birdloop.wakeup); atomic_exchange_explicit(&main_birdloop.ping_sent, 0, memory_order_acq_rel); continue; } diff --git a/sysdep/unix/krt.Y b/sysdep/unix/krt.Y index 95b54d65..4ce9a328 100644 --- a/sysdep/unix/krt.Y +++ b/sysdep/unix/krt.Y @@ -29,7 +29,7 @@ kif_set_preferred(ip_addr ip) CF_DECLS -CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, KRT_SOURCE, KRT_METRIC, MERGE, PATHS) +CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, MERGE, PATHS) CF_KEYWORDS(INTERFACE, PREFERRED) %type <i> kern_mp_limit @@ -122,9 +122,6 @@ kif_iface: kif_iface_start iface_patt_list_nopx kif_iface_opt_list; -dynamic_attr: KRT_SOURCE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SOURCE); } ; -dynamic_attr: KRT_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_METRIC); } ; - CF_CODE CF_END diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index 98c56391..b0033a65 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -53,7 +53,7 @@ #include "nest/bird.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "filter/filter.h" #include "conf/conf.h" @@ -232,7 +232,6 @@ kif_copy_config(struct proto_config *dest, struct proto_config *src) struct protocol proto_unix_iface = { .name = "Device", .template = "device%d", - .class = PROTOCOL_DEVICE, .proto_size = sizeof(struct kif_proto), .config_size = sizeof(struct kif_config), .preconfig = kif_preconfig, @@ -243,6 +242,13 @@ struct protocol proto_unix_iface = { .copy_config = kif_copy_config }; +void +kif_build(void) +{ + proto_build(&proto_unix_iface); +} + + /* * Tracing of routes */ @@ -280,30 +286,46 @@ static struct tbf rl_alien = TBF_DEFAULT_LOG_LIMITS; static inline u32 krt_metric(rte *a) { - eattr *ea = ea_find(a->attrs->eattrs, EA_KRT_METRIC); + eattr *ea = ea_find(a->attrs, &ea_krt_metric); return ea ? ea->u.data : 0; } static inline int -krt_rte_better(rte *a, rte *b) +krt_same_key(rte *a, rte *b) { - return (krt_metric(a) > krt_metric(b)); + return (krt_metric(a) == krt_metric(b)); +} + +static inline int +krt_uptodate(rte *a, rte *b) +{ + return (a->attrs == b->attrs); } /* Called when alien route is discovered during scan */ static void -krt_learn_rte(struct krt_proto *p, rte *e) +krt_learn_scan(struct krt_proto *p, rte *e) { - struct rte_src *src = e->src = rt_get_source(&p->p, krt_metric(e)); - rte_update(p->p.main_channel, e->net, e, e->src); - rt_unlock_source(src); + rte e0 = { + .attrs = e->attrs, + .src = rt_get_source(&p->p, krt_metric(e)), + }; + + ea_set_attr_u32(&e0.attrs, &ea_gen_preference, 0, p->p.main_channel->preference); + + rte_update(p->p.main_channel, e->net, &e0, e0.src); + rt_unlock_source(e0.src); } static void -krt_learn_init(struct krt_proto *p) +krt_learn_async(struct krt_proto *p, rte *e, int new) { - if (KRT_CF->learn) - channel_setup_in_table(p->p.main_channel, 1); + if (new) + return krt_learn_scan(p, e); + + struct rte_src *src = rt_get_source(&p->p, krt_metric(e)); + rte_update(p->p.main_channel, e->net, NULL, src); + rt_unlock_source(src); } #endif @@ -323,7 +345,7 @@ rte_feed_count(net *n) { uint count = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) + if (rte_is_valid(RTE_OR_NULL(e))) count++; return count; } @@ -333,7 +355,7 @@ rte_feed_obtain(net *n, rte **feed, uint count) { uint i = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) + if (rte_is_valid(RTE_OR_NULL(e))) { ASSERT_DIE(i < count); feed[i++] = &e->rte; @@ -344,6 +366,13 @@ rte_feed_obtain(net *n, rte **feed, uint count) static struct rte * krt_export_net(struct krt_proto *p, net *net) { + /* FIXME: Here we are calling filters in table-locked context when exporting + * to kernel. Here BIRD can crash if the user requested ROA check in kernel + * export filter. It doesn't make much sense to write the filters like this, + * therefore we may keep this unfinished piece of work here for later as it + * won't really affect anybody. */ + ASSERT_DIE(RT_IS_LOCKED(p->p.main_channel->table)); + struct channel *c = p->p.main_channel; const struct filter *filter = c->out_filter; @@ -372,7 +401,7 @@ krt_export_net(struct krt_proto *p, net *net) if (filter == FILTER_ACCEPT) goto accept; - if (f_run(filter, &rt, krt_filter_lp, FF_SILENT) > F_ACCEPT) + if (f_run(filter, &rt, FF_SILENT) > F_ACCEPT) goto reject; @@ -386,15 +415,12 @@ reject: static int krt_same_dest(rte *k, rte *e) { - rta *ka = k->attrs, *ea = e->attrs; - - if (ka->dest != ea->dest) - return 0; + ea_list *ka = k->attrs, *ea = e->attrs; - if (ka->dest == RTD_UNICAST) - return nexthop_same(&(ka->nh), &(ea->nh)); + eattr *nhea_k = ea_find(ka, &ea_gen_nexthop); + eattr *nhea_e = ea_find(ea, &ea_gen_nexthop); - return 1; + return (!nhea_k == !nhea_e) && adata_same(nhea_k->u.ptr, nhea_e->u.ptr); } /* @@ -419,7 +445,7 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src) case KRT_SRC_ALIEN: if (KRT_CF->learn) - krt_learn_rte(p, e); + krt_learn_scan(p, e); else krt_trace_in_rl(&rl_alien, p, e, "[alien] ignored"); return; @@ -427,7 +453,9 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src) #endif /* The rest is for KRT_SRC_BIRD (or KRT_SRC_UNKNOWN) */ - RT_LOCK(p->p.main_channel->table); + RT_LOCKED(p->p.main_channel->table, tab) + { + /* Deleting all routes if flush is requested */ if (p->flush_routes) goto delete; @@ -436,7 +464,7 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src) if (!p->ready) goto ignore; - net *net = net_find(RT_PRIV(p->p.main_channel->table), e->net); + net *net = net_find(tab, e->net); if (!net || !krt_is_installed(p, net)) goto delete; @@ -481,8 +509,9 @@ delete: krt_replace_rte(p, e->net, NULL, e); goto done; -done: - RT_UNLOCK(p->p.main_channel->table); +done:; + } + lp_flush(krt_filter_lp); } @@ -490,18 +519,13 @@ static void krt_init_scan(struct krt_proto *p) { bmap_reset(&p->seen_map, 1024); - -#ifdef KRT_ALLOW_LEARN - if (KRT_CF->learn) - channel_refresh_begin(p->p.main_channel); -#endif } static void krt_prune(struct krt_proto *p) { - RT_LOCK(p->p.main_channel->table); - rtable_private *t = RT_PRIV(p->p.main_channel->table); + RT_LOCKED(p->p.main_channel->table, t) + { KRT_TRACE(p, D_EVENTS, "Pruning table %s", t->name); FIB_WALK(&t->fib, net, n) @@ -521,15 +545,10 @@ krt_prune(struct krt_proto *p) } FIB_WALK_END; - RT_UNLOCK(p->p.main_channel->table); - -#ifdef KRT_ALLOW_LEARN - if (KRT_CF->learn) - channel_refresh_end(p->p.main_channel); -#endif - if (p->ready) p->initialized = 1; + + } } static void @@ -567,25 +586,24 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src) case KRT_SRC_ALIEN: if (KRT_CF->learn) { - krt_learn_rte(p, e); + krt_learn_async(p, e, new); return; } #endif } } + /* * Periodic scanning */ - -#ifdef CONFIG_ALL_TABLES_AT_ONCE - -static timer *krt_scan_timer; -static int krt_scan_count; +static timer *krt_scan_all_timer; +static int krt_scan_all_count; +static _Bool krt_scan_all_tables; static void -krt_scan(timer *t UNUSED) +krt_scan_all(timer *t UNUSED) { struct krt_proto *p; node *n; @@ -606,35 +624,42 @@ krt_scan(timer *t UNUSED) } static void -krt_scan_timer_start(struct krt_proto *p) +krt_scan_all_timer_start(struct krt_proto *p) { - if (!krt_scan_count) - krt_scan_timer = tm_new_init(krt_pool, krt_scan, NULL, KRT_CF->scan_time, 0); + if (!krt_scan_all_count) + krt_scan_all_timer = tm_new_init(krt_pool, krt_scan_all, NULL, KRT_CF->scan_time, 0); - krt_scan_count++; + krt_scan_all_count++; - tm_start(krt_scan_timer, 1 S); + tm_start(krt_scan_all_timer, 1 S); } static void -krt_scan_timer_stop(struct krt_proto *p UNUSED) +krt_scan_all_timer_stop(void) { - krt_scan_count--; + ASSERT(krt_scan_all_count > 0); - if (!krt_scan_count) + krt_scan_all_count--; + + if (!krt_scan_all_count) { - rfree(krt_scan_timer); - krt_scan_timer = NULL; + rfree(krt_scan_all_timer); + krt_scan_all_timer = NULL; } } static void -krt_scan_timer_kick(struct krt_proto *p UNUSED) +krt_scan_all_timer_kick(void) { - tm_start(krt_scan_timer, 0); + tm_start(krt_scan_all_timer, 0); +} + +void +krt_use_shared_scan(void) +{ + krt_scan_all_tables = 1; } -#else static void krt_scan(timer *t) @@ -652,35 +677,42 @@ krt_scan(timer *t) static void krt_scan_timer_start(struct krt_proto *p) { - p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0); - tm_start(p->scan_timer, 1 S); + if (krt_scan_all_tables) + krt_scan_all_timer_start(p); + else + { + p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0); + tm_start(p->scan_timer, 1 S); + } } static void krt_scan_timer_stop(struct krt_proto *p) { - tm_stop(p->scan_timer); + if (krt_scan_all_tables) + krt_scan_all_timer_stop(); + else + tm_stop(p->scan_timer); } static void krt_scan_timer_kick(struct krt_proto *p) { - tm_start(p->scan_timer, 0); + if (krt_scan_all_tables) + krt_scan_all_timer_kick(); + else + tm_start(p->scan_timer, 0); } -#endif - - - /* * Updates */ static int -krt_preexport(struct channel *c, rte *e) +krt_preexport(struct channel *C, rte *e) { - if (e->src->owner == &c->proto->sources) + if (e->src->owner == &C->proto->sources) return -1; if (!krt_capable(e)) @@ -754,6 +786,14 @@ krt_feed_end(struct channel *C) krt_scan_timer_kick(p); } +static int +krt_rte_better(rte *new, rte *old) +{ + u32 n = ea_get_int(new->attrs, &ea_krt_metric, IGP_METRIC_UNKNOWN); + u32 o = ea_get_int(old->attrs, &ea_krt_metric, IGP_METRIC_UNKNOWN); + + return (n < o); +} /* * Protocol glue @@ -780,11 +820,6 @@ krt_postconfig(struct proto_config *CF) if (! proto_cf_main_channel(CF)) cf_error("Channel not specified"); -#ifdef CONFIG_ALL_TABLES_AT_ONCE - if (krt_cf->scan_time != cf->scan_time) - cf_error("All kernel syncers must use the same table scan interval"); -#endif - struct channel_config *cc = proto_cf_main_channel(CF); struct rtable_config *tab = cc->table; if (tab->krt_attached) @@ -800,6 +835,10 @@ krt_postconfig(struct proto_config *CF) krt_sys_postconfig(cf); } +struct rte_owner_class krt_rte_owner_class = { + .rte_better = krt_rte_better, +}; + static struct proto * krt_init(struct proto_config *CF) { @@ -813,7 +852,8 @@ krt_init(struct proto_config *CF) p->p.if_notify = krt_if_notify; p->p.reload_routes = krt_reload_routes; p->p.feed_end = krt_feed_end; - p->p.rte_better = krt_rte_better; + + p->p.sources.class = &krt_rte_owner_class; krt_sys_init(p); return &p->p; @@ -839,10 +879,6 @@ krt_start(struct proto *P) bmap_init(&p->seen_map, p->p.pool, 1024); add_tail(&krt_proto_list, &p->krt_node); -#ifdef KRT_ALLOW_LEARN - krt_learn_init(p); -#endif - if (!krt_sys_start(p)) { rem_node(&p->krt_node); @@ -922,24 +958,15 @@ krt_copy_config(struct proto_config *dest, struct proto_config *src) krt_sys_copy_config(d, s); } -static int -krt_get_attr(const eattr *a, byte *buf, int buflen) -{ - switch (a->id) - { - case EA_KRT_SOURCE: - bsprintf(buf, "source"); - return GA_NAME; - - case EA_KRT_METRIC: - bsprintf(buf, "metric"); - return GA_NAME; - - default: - return krt_sys_get_attr(a, buf, buflen); - } -} +struct ea_class ea_krt_source = { + .name = "krt_source", + .type = T_INT, +}; +struct ea_class ea_krt_metric = { + .name = "krt_metric", + .type = T_INT, +}; #ifdef CONFIG_IP6_SADR_KERNEL #define MAYBE_IP6_SADR NB_IP6_SADR @@ -956,7 +983,6 @@ krt_get_attr(const eattr *a, byte *buf, int buflen) struct protocol proto_unix_kernel = { .name = "Kernel", .template = "kernel%d", - .class = PROTOCOL_KERNEL, .preference = DEF_PREF_INHERITED, .channel_mask = NB_IP | MAYBE_IP6_SADR | MAYBE_MPLS, .proto_size = sizeof(struct krt_proto), @@ -968,5 +994,15 @@ struct protocol proto_unix_kernel = { .shutdown = krt_shutdown, .reconfigure = krt_reconfigure, .copy_config = krt_copy_config, - .get_attr = krt_get_attr, }; + +void +krt_build(void) +{ + proto_build(&proto_unix_kernel); + + EA_REGISTER_ALL( + &ea_krt_source, + &ea_krt_metric, + ); +} diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index 968c5b16..9f7ebb4f 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -21,8 +21,7 @@ struct kif_proto; #define KRT_DEFAULT_ECMP_LIMIT 16 -#define EA_KRT_SOURCE EA_CODE(PROTOCOL_KERNEL, 0) -#define EA_KRT_METRIC EA_CODE(PROTOCOL_KERNEL, 1) +extern struct ea_class ea_krt_source, ea_krt_metric; #define KRT_REF_SEEN 0x1 /* Seen in table */ #define KRT_REF_BEST 0x2 /* Best in table */ @@ -51,10 +50,7 @@ struct krt_proto { struct proto p; struct krt_state sys; /* Sysdep state */ -#ifndef CONFIG_ALL_TABLES_AT_ONCE timer *scan_timer; -#endif - struct bmap sync_map; /* Keeps track which exported routes were successfully written to kernel */ struct bmap seen_map; /* Routes seen during last periodic scan */ node krt_node; /* Node in krt_proto_list */ @@ -76,6 +72,7 @@ extern pool *krt_pool; struct proto_config * kif_init_config(int class); void kif_request_scan(void); +void krt_use_shared_scan(void); void krt_got_route(struct krt_proto *p, struct rte *e, s8 src); void krt_got_route_async(struct krt_proto *p, struct rte *e, int new, s8 src); diff --git a/sysdep/unix/log.c b/sysdep/unix/log.c index f48588b6..185231e8 100644 --- a/sysdep/unix/log.c +++ b/sysdep/unix/log.c @@ -36,10 +36,10 @@ static FILE *dbgf; static list *current_log_list; static char *current_syslog_name; /* NULL -> syslog closed */ -static _Atomic uint max_coro_id = ATOMIC_VAR_INIT(1); -static _Thread_local uint this_coro_id; +static _Atomic uint max_thread_id = ATOMIC_VAR_INIT(1); +static _Thread_local uint this_thread_id; -#define THIS_CORO_ID (this_coro_id ?: (this_coro_id = atomic_fetch_add_explicit(&max_coro_id, 1, memory_order_acq_rel))) +#define THIS_THREAD_ID (this_thread_id ?: (this_thread_id = atomic_fetch_add_explicit(&max_thread_id, 1, memory_order_acq_rel))) #include <pthread.h> @@ -183,7 +183,7 @@ log_commit(int class, buffer *buf) l->pos += msg_len; } - fprintf(l->fh, "%s [%04x] <%s> ", tbuf, THIS_CORO_ID, class_names[class]); + fprintf(l->fh, "%s [%04x] <%s> ", tbuf, THIS_THREAD_ID, class_names[class]); } fputs(buf->start, l->fh); fputc('\n', l->fh); @@ -329,7 +329,7 @@ debug(const char *msg, ...) sec = dbg_time.tv_sec - dbg_time_start.tv_sec - 1; } - int n = bsnprintf(pos, max, "%u.%09u: [%04x] ", sec, nsec, THIS_CORO_ID); + int n = bsnprintf(pos, max, "%u.%09u: [%04x] ", sec, nsec, THIS_THREAD_ID); pos += n; max -= n; diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index 5da27cb6..bf9f2be0 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -31,7 +31,7 @@ #include "lib/locking.h" #include "lib/timer.h" #include "lib/string.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "nest/cli.h" @@ -57,7 +57,7 @@ async_dump(void) // XXXX tm_dump_all(); if_dump_all(); neigh_dump_all(); - rta_dump_all(); + ea_dump_all(); rt_dump_all(); protos_dump_all(); @@ -117,7 +117,7 @@ add_num_const(char *name, int val, const char *file, const uint line) struct f_val *v = cfg_alloc(sizeof(struct f_val)); *v = (struct f_val) { .type = T_INT, .val.i = val }; struct symbol *sym = cf_get_symbol(name); - if (sym->class && (sym->scope == conf_this_scope)) + if (sym->class && cf_symbol_is_local(sym)) cf_error("Error reading value for %s from %s:%d: already defined", name, file, line); cf_define_symbol(sym, SYM_CONSTANT | T_INT, val, v); @@ -683,7 +683,7 @@ signal_init(void) * Parsing of command-line arguments */ -static char *opt_list = "B:c:dD:ps:P:u:g:flRh"; +static char *opt_list = "bc:dD:ps:P:u:g:flRh"; int parse_and_exit; char *bird_name; static char *use_user; @@ -704,7 +704,6 @@ display_help(void) fprintf(stderr, "\n" "Options: \n" - " -B <block-size> Use 2^this number as memory allocation block size (default: 12)\n" " -c <config-file> Use given configuration file instead of\n" " " PATH_CONFIG_FILE "\n" " -d Enable debug messages and run bird in foreground\n" @@ -791,15 +790,12 @@ get_gid(const char *s) return gr->gr_gid; } -extern _Bool alloc_multipage; - static void parse_args(int argc, char **argv) { int config_changed = 0; int socket_changed = 0; int c; - int bp; bird_name = get_bird_name(argv[0], "bird"); if (argc == 2) @@ -812,29 +808,6 @@ parse_args(int argc, char **argv) while ((c = getopt(argc, argv, opt_list)) >= 0) switch (c) { - case 'B': - bp = atoi(optarg); - if (bp < 1) - { - fprintf(stderr, "Strange block size power %d\n\n", bp); - display_usage(); - exit(1); - } - - if ((1 << bp) < page_size) - { - fprintf(stderr, "Requested block size %ld is lesser than page size %ld\n\n", (1L<<bp), page_size); - display_usage(); - exit(1); - } - - if ((1L << bp) > page_size) - { - alloc_multipage = 1; - page_size = (1L << bp); - } - - break; case 'c': config_name = optarg; config_changed = 1; @@ -889,8 +862,6 @@ parse_args(int argc, char **argv) } } -void resource_sys_init(void); - /* * Hic Est main() */ @@ -904,7 +875,6 @@ main(int argc, char **argv) #endif times_update(); - resource_sys_init(); parse_args(argc, argv); log_switch(1, NULL, NULL); @@ -915,8 +885,8 @@ main(int argc, char **argv) resource_init(); birdloop_init(); olock_init(); - io_init(); rt_init(); + io_init(); if_init(); // roa_init(); config_init(); @@ -940,8 +910,6 @@ main(int argc, char **argv) open_pid_file(); protos_build(); - proto_build(&proto_unix_kernel); - proto_build(&proto_unix_iface); struct config *conf = read_config(); diff --git a/test/birdtest.c b/test/birdtest.c index c6a09684..5e3de1c5 100644 --- a/test/birdtest.c +++ b/test/birdtest.c @@ -20,6 +20,8 @@ #include "test/birdtest.h" #include "lib/string.h" +#include "lib/event.h" +#include "lib/io-loop.h" #ifdef HAVE_EXECINFO_H #include <execinfo.h> @@ -58,14 +60,14 @@ u64 bt_random_state[] = { 0x53d9772877c1b647, 0xab8ce3eb466de6c5, 0xad02844c8a8e865f, 0xe8cc78080295065d }; -void resource_sys_init(void); - void bt_init(int argc, char *argv[]) { int c; - resource_sys_init(); + /* We have no interest in stdin */ + close(0); + initstate(BT_RANDOM_SEED, (char *) bt_random_state, sizeof(bt_random_state)); bt_verbose = 0; @@ -122,6 +124,11 @@ bt_init(int argc, char *argv[]) clock_gettime(CLOCK_MONOTONIC, &bt_begin); bt_suite_case_begin = bt_suite_begin = bt_begin; + the_bird_lock(); + resource_init(); + ev_init_list(&global_event_list, &main_birdloop, "Global event list in unit tests"); + ev_init_list(&global_work_list, &main_birdloop, "Global work list in unit tests"); + birdloop_init(); return; usage: @@ -175,6 +182,8 @@ int bt_run_test_fn(int (*fn)(const void *), const void *fn_arg, int timeout) if (!bt_suite_result) result = 0; + tmp_flush(); + return result; } @@ -312,6 +321,12 @@ bt_log_suite_case_result(int result, const char *fmt, ...) } } +void +bt_reset_suite_case_timer(void) +{ + clock_gettime(CLOCK_MONOTONIC, &bt_suite_case_begin); +} + int bt_test_suite_base(int (*fn)(const void *), const char *id, const void *fn_arg, int forked, int timeout, const char *dsc, ...) { @@ -504,6 +519,15 @@ bt_fmt_ipa(char *buf, size_t size, const void *data) bsnprintf(buf, size, "(null)"); } +void +bt_format_net(char *buf, size_t size, const void *data) +{ + if (data) + bsnprintf(buf, size, "%N", (const net_addr *) data); + else + bsnprintf(buf, size, "(null)"); +} + int bt_is_char(byte c) { diff --git a/test/birdtest.h b/test/birdtest.h index caec529b..b8978b3e 100644 --- a/test/birdtest.h +++ b/test/birdtest.h @@ -32,6 +32,7 @@ extern const char *bt_test_id; void bt_init(int argc, char *argv[]); int bt_exit_value(void); +void bt_reset_suite_case_timer(void); int bt_test_suite_base(int (*test_fn)(const void *), const char *test_id, const void *test_fn_argument, int forked, int timeout, const char *dsc, ...); static inline u64 bt_random(void) { return ((u64) random() & 0xffffffff) | ((u64) random() << 32); } @@ -39,7 +40,7 @@ static inline u64 bt_random(void) void bt_log_suite_result(int result, const char *fmt, ...); void bt_log_suite_case_result(int result, const char *fmt, ...); -#define BT_TIMEOUT 5 /* Default timeout in seconds */ +#define BT_TIMEOUT 20 /* Default timeout in seconds */ #define BT_FORKING 1 /* Forking is enabled in default */ #define BT_RANDOM_SEED 0x5097d2bb @@ -165,6 +166,8 @@ struct bt_batch { void bt_fmt_str(char *buf, size_t size, const void *data); void bt_fmt_unsigned(char *buf, size_t size, const void *data); void bt_fmt_ipa(char *buf, size_t size, const void *data); +void bt_format_net(char *buf, size_t size, const void *data); + int bt_assert_batch__(struct bt_batch *opts); int bt_is_char(byte c); diff --git a/test/bt-utils.c b/test/bt-utils.c index 90815e77..36e44da4 100644 --- a/test/bt-utils.c +++ b/test/bt-utils.c @@ -14,7 +14,7 @@ #include "test/bt-utils.h" #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "sysdep/unix/unix.h" @@ -58,30 +58,21 @@ void resource_sys_init(void); void bt_bird_init(void) { - resource_sys_init(); if(bt_verbose) log_init_debug(""); log_switch(bt_verbose != 0, NULL, NULL); - the_bird_lock(); - resource_init(); olock_init(); - birdloop_init(); - io_init(); rt_init(); + io_init(); if_init(); config_init(); protos_build(); - proto_build(&proto_unix_kernel); - proto_build(&proto_unix_iface); } void bt_bird_cleanup(void) { - for (int i = 0; i < PROTOCOL__MAX; i++) - class_to_protocol[i] = NULL; - config = new_config = NULL; the_bird_unlock(); } diff --git a/tools/gendist b/tools/gendist index 2ac59030..2dc42ba9 100755 --- a/tools/gendist +++ b/tools/gendist @@ -2,6 +2,7 @@ # # Generate BIRD Distribution Archive # (c) 2000--2004 Martin Mares <mj@ucw.cz> +# (c) 2005--2022 Ondrej Filip <feela@network.cz> # VERSION=`grep 'BIRD_VERSION \"' sysdep/config.h | sed '/BIRD_VERSION/!d;s/^.*"\(.*\)"$/\1/'` @@ -33,8 +34,6 @@ rm -rf `find $T/$REL -name CVS -o -name tmp` $T/$REL/{misc,rfc,doc/slides,doc/sl rm -rf $T/$REL $T/$DREL echo -n "OK? " read OK -echo Uploading to Atrey... -scp $T/$REL.tar.gz $T/$DREL.tar.gz atrey.karlin.mff.cuni.cz:~ftp/pub/bird/ echo Uploading to Trubka... scp $T/$REL.tar.gz $T/$DREL.tar.gz bird.network.cz:~ftp/pub/bird/ echo Done. diff --git a/tools/linuxdoc b/tools/linuxdoc index 51110e79..58f5cbc4 100755 --- a/tools/linuxdoc +++ b/tools/linuxdoc @@ -25,8 +25,13 @@ use FindBin; $prefix = "/usr"; $isoentities_prefix = "/usr"; $DataDir = "$FindBin::Bin/../doc/sbase"; -$AuxBinDir = "/usr/lib/linuxdoc-tools"; - +if (-d "/usr/lib/linuxdoc-tools") +{ + $AuxBinDir = "/usr/lib/linuxdoc-tools"; +} else +{ + $AuxBinDir = "/usr/bin"; +} use lib "$FindBin::Bin/linuxdoc-tools"; # --------------------------------------------------------------------- |