diff options
189 files changed, 17024 insertions, 11826 deletions
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index fefb6fa6..1c8aa869 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -5,9 +5,10 @@ variables: DOCKER_CMD: docker --config="$HOME/.docker/$CI_JOB_ID/" IMG_BASE: registry.nic.cz/labs/bird TOOLS_DIR: /var/lib/gitlab-runner/bird-tools + STAYRTR_BINARY: /var/lib/gitlab-runner/stayrtr-0.1-108-g8d18a41-linux-x86_64 stages: - - image +# - image - build - pkg - test @@ -32,165 +33,12 @@ stages: # That's Docker in Docker - dind -docker_debian-9-amd64: - variables: - IMG_NAME: "debian-9-amd64" - <<: *docker_build - -docker_debian-9-i386: - variables: - IMG_NAME: "debian-9-i386" - <<: *docker_build - -docker_debian-10-amd64: - variables: - IMG_NAME: "debian-10-amd64" - <<: *docker_build - -docker_debian-10-i386: - variables: - IMG_NAME: "debian-10-i386" - <<: *docker_build - -docker_debian-11-amd64: - variables: - IMG_NAME: "debian-11-amd64" - <<: *docker_build - -# GPG error -#docker_debian-11-i386: -# variables: -# IMG_NAME: "debian-11-i386" -# <<: *docker_build - -docker_debian-testing-amd64: - variables: - IMG_NAME: "debian-testing-amd64" - <<: *docker_build - -# GPG error -#docker_debian-testing-i386: +# Docker build example +#docker_debian-11-amd64: # variables: -# IMG_NAME: "debian-testing-i386" +# IMG_NAME: "debian-11-amd64" # <<: *docker_build -docker_fedora-25-amd64: - variables: - IMG_NAME: "fedora-25-amd64" - <<: *docker_build - -docker_fedora-26-amd64: - variables: - IMG_NAME: "fedora-26-amd64" - <<: *docker_build - -docker_fedora-27-amd64: - variables: - IMG_NAME: "fedora-27-amd64" - <<: *docker_build - -docker_fedora-28-amd64: - variables: - IMG_NAME: "fedora-28-amd64" - <<: *docker_build - -docker_fedora-29-amd64: - variables: - IMG_NAME: "fedora-29-amd64" - <<: *docker_build - -docker_fedora-30-amd64: - variables: - IMG_NAME: "fedora-30-amd64" - <<: *docker_build - -docker_fedora-31-amd64: - variables: - IMG_NAME: "fedora-31-amd64" - <<: *docker_build - -docker_fedora-32-amd64: - variables: - IMG_NAME: "fedora-32-amd64" - <<: *docker_build - -docker_fedora-33-amd64: - variables: - IMG_NAME: "fedora-33-amd64" - <<: *docker_build - -docker_fedora-34-amd64: - variables: - IMG_NAME: "fedora-34-amd64" - <<: *docker_build - -docker_centos-8-amd64: - variables: - IMG_NAME: "centos-8-amd64" - <<: *docker_build - -docker_ubuntu-16_04-amd64: - variables: - IMG_NAME: "ubuntu-16.04-amd64" - <<: *docker_build - -docker_ubuntu-18_04-amd64: - variables: - IMG_NAME: "ubuntu-18.04-amd64" - <<: *docker_build - -docker_ubuntu-20_04-amd64: - variables: - IMG_NAME: "ubuntu-20.04-amd64" - <<: *docker_build - -docker_ubuntu-21_10-amd64: - variables: - IMG_NAME: "ubuntu-21.10-amd64" - <<: *docker_build - -# GPG error -#docker_ubuntu-21_04-amd64: -# variables: -# IMG_NAME: "ubuntu-21.04-amd64" -# <<: *docker_build - -docker_opensuse-15.0-amd64: - variables: - IMG_NAME: "opensuse-15.0-amd64" - <<: *docker_build - -docker_opensuse-15.1-amd64: - variables: - IMG_NAME: "opensuse-15.1-amd64" - <<: *docker_build - -docker_opensuse-15.2-amd64: - variables: - IMG_NAME: "opensuse-15.2-amd64" - <<: *docker_build - -docker_opensuse-15.3-amd64: - variables: - IMG_NAME: "opensuse-15.3-amd64" - <<: *docker_build - -# TODO We want to copy these BSDs to our own virtual machines, to make sure -# someone doesn't update them by accident. -.freebsd-11-i386: &freebsd-11-i386_env - tags: - - freebsd - - i386 - #only: - #- master - #- triggers - #- tags - -.freebsd-11-amd64: &freebsd-11-amd64_env - tags: - - freebsd - - amd64 - .build: &build-base stage: build @@ -212,6 +60,14 @@ docker_opensuse-15.3-amd64: - linux - amd64 +build-debian-8-amd64: + <<: *build-linux + image: registry.nic.cz/labs/bird:debian-8-amd64 + +build-debian-8-i386: + <<: *build-linux + image: registry.nic.cz/labs/bird:debian-8-i386 + build-debian-9-amd64: <<: *build-linux image: registry.nic.cz/labs/bird:debian-9-amd64 @@ -324,17 +180,17 @@ build-opensuse-15.3-amd64: <<: *build-linux image: registry.nic.cz/labs/bird:opensuse-15.3-amd64 -build-freebsd-11-amd64: - <<: *build-base - tags: - - freebsd - - amd64 +#build-freebsd-11-amd64: +# <<: *build-base +# tags: +# - freebsd +# - amd64 -build-freebsd-11-i386: - <<: *build-base - tags: - - freebsd - - i386 +#build-freebsd-11-i386: +# <<: *build-base +# tags: +# - freebsd +# - i386 .pkg-deb: &pkg-deb @@ -369,12 +225,22 @@ build-freebsd-11-i386: - pkg/pkgs/* # Dpkg error: PATH is not set +#pkg-debian-8-amd64: +# <<: *pkg-deb +# needs: [build-debian-8-amd64] +# image: registry.nic.cz/labs/bird:debian-8-amd64 + +# Dpkg error: PATH is not set +#pkg-debian-8-i386: +# <<: *pkg-deb +# needs: [build-debian-8-i386] +# image: registry.nic.cz/labs/bird:debian-8-i386 + pkg-debian-9-amd64: <<: *pkg-deb needs: [build-debian-9-amd64] image: registry.nic.cz/labs/bird:debian-9-amd64 -# Dpkg error: PATH is not set pkg-debian-9-i386: <<: *pkg-deb needs: [build-debian-9-i386] @@ -418,7 +284,7 @@ pkg-fedora-33-amd64: pkg-fedora-34-amd64: <<: *pkg-rpm needs: [build-fedora-34-amd64] - image: registry.nic.cz/labs/bird:fedora-34-amd64 + image: registry.labs.nic.cz/labs/bird:fedora-34-amd64 pkg-centos-8-amd64: <<: *pkg-rpm-wa @@ -476,6 +342,7 @@ build-birdlab: - sudo git clean -fx - git pull --ff-only - mv $DIR/bird $DIR/birdc netlab/common + - ln -s $STAYRTR_BINARY netlab/common/stayrtr .test: &test-base stage: test @@ -486,7 +353,7 @@ build-birdlab: script: - cd $TOOLS_DIR/netlab - sudo ./stop - - sudo ./runtest -m check $TEST_NAME + - sudo ./runtest -s v3 -m check $TEST_NAME test-ospf-base: <<: *test-base @@ -558,6 +425,16 @@ test-bgp-merged: variables: TEST_NAME: cf-bgp-merged +test-bgp-flowspec: + <<: *test-base + variables: + TEST_NAME: cf-bgp-flowspec + +test-bgp-rs-multitab: + <<: *test-base + variables: + TEST_NAME: cf-bgp-rs-multitab + test-ebgp-loop: <<: *test-base variables: @@ -568,12 +445,32 @@ test-ebgp-star: variables: TEST_NAME: cf-ebgp-star +test-ebgp-role: + <<: *test-base + variables: + TEST_NAME: cf-ebgp-role + +test-ebgp-graceful: + <<: *test-base + variables: + TEST_NAME: cf-ebgp-graceful + +test-ebgp-import-limit: + <<: *test-base + variables: + TEST_NAME: cf-ebgp-import-limit + test-ibgp-loop: <<: *test-base variables: TEST_NAME: cf-ibgp-loop -test-ibgp-star: +test-ibgp-loop-big: + <<: *test-base + variables: + TEST_NAME: cf-ibgp-loop-big + +test-ibgp-flat: <<: *test-base variables: TEST_NAME: cf-ibgp-flat @@ -592,3 +489,49 @@ test-rip-base: <<: *test-base variables: TEST_NAME: cf-rip-base + +test-kernel-learn: + <<: *test-base + variables: + TEST_NAME: cf-kernel-learn + + +.build-birdlab-base: &build-birdlab-base + stage: build + script: + - autoreconf + - ./configure + - gmake + - gmake check + +build-birdlab-debian-11: + <<: *build-birdlab-base + tags: + - birdlab-debian-11 + - amd64 + +build-birdlab-centos-08: + <<: *build-birdlab-base + tags: + - birdlab-centos-08 + - amd64 + +build-birdlab-fedora-37: + <<: *build-birdlab-base + tags: + - birdlab-fedora-37 + - amd64 + +build-birdlab-freebsd-13: + <<: *build-birdlab-base + tags: + - birdlab-freebsd-13 + - amd64 + +build-birdlab-openbsd-71: + <<: *build-birdlab-base + variables: + AUTOCONF_VERSION: "2.71" + tags: + - birdlab-openbsd-71 + - amd64 diff --git a/Makefile.in b/Makefile.in index e0ff4a1d..839efe24 100644 --- a/Makefile.in +++ b/Makefile.in @@ -26,6 +26,7 @@ INSTALL_DATA=@INSTALL_DATA@ client=$(addprefix $(exedir)/,@CLIENT@) daemon=$(exedir)/bird protocols=@protocols@ +PROTO_BUILD := $(protocols) dev kif krt prefix=@prefix@ exec_prefix=@exec_prefix@ @@ -160,10 +161,11 @@ $(client) $(daemon): $(Q)$(CC) $(LDFLAGS) -o $@ $^ $(LIBS) $(objdir)/sysdep/paths.h: Makefile - echo >$@ "/* Generated by Makefile, don't edit manually! */" - echo >>$@ "#define PATH_CONFIG_FILE \"@CONFIG_FILE@\"" - echo >>$@ "#define PATH_CONTROL_SOCKET \"@CONTROL_SOCKET@\"" - if test -n "@iproutedir@" ; then echo >>$@ "#define PATH_IPROUTE_DIR \"@iproutedir@\"" ; fi + $(E)echo GEN $@ + $(Q)echo >$@ "/* Generated by Makefile, don't edit manually! */" + $(Q)echo >>$@ "#define PATH_CONFIG_FILE \"@CONFIG_FILE@\"" + $(Q)echo >>$@ "#define PATH_CONTROL_SOCKET \"@CONTROL_SOCKET@\"" + $(Q)if test -n "@iproutedir@" ; then echo >>$@ "#define PATH_IPROUTE_DIR \"@iproutedir@\"" ; fi # Unit tests rules @@ -218,7 +220,7 @@ install-docs: # Cleanup clean:: - rm -f $(objdir)/sysdep/paths.h + rm -f $(objdir)/sysdep/paths.h $(objdir)/nest/proto-build.c rm -f $(addprefix $(exedir)/,bird birdc birdcl) find $(objdir) -name "*.[od]" -exec rm -f '{}' '+' @@ -1,3 +1,89 @@ +Version 2.0.12 (2023-01-23) + o Filter: New 'onlink' route attribute + o Compile-time option to use 4-way tries instead of 16-way ones + o BSD: Support for kernel route metric and other improvements + o Important bugfixes + +Version 2.0.11 (2022-11-12) + o BGP roles (RFC 9234) + o BGP: Keepalive time scaling + o BGP: New 'min hold time' and 'min keepalive time' options + o BGP: New 'next hop prefer global' option + o Filter: For loops and direct recursion + o Filter: Mixed declarations of local variables + o Filter: Improved static type checks + o Filter: Literal [] for empty set + o Linux: Netlink KRT improvements + o BSD: Experimental support for Netlink API + o Memory management improvements + o Many bugfixes + + Notes: + + In contrast to prior versions, configured keepalive time in BGP now scales + with negotiated hold time to maintain proportion between the keepalive time + and the hold time. + + The Linux KRT was updated to use the recent API for IPv6 ECMP routes instead + of the legacy one. Consequently, the Linux versions older than 4.11 are no + longer supported, at least for IPv6 ECMP routes. Also, routing table scanning + now runs separately for each table to avoid congestion. + + There is a minor change in recursive next hop processing. Previously, + recursive next hop must be resolved through a non-recursive route, now it must + be resolved through a prefix where both the best route and all routes with the + same preference (as the best route) are non-recursive. The old behavior might + lead in some corner cases to an infinite loop of recursive next hop resolution + due to a priority inversion. + + There is a minor change in the 'configure undo' command, it is no longer + available after failed reconfiguration, as the old configuration is already + released. + + +Version 2.0.10 (2022-06-16) + o BGP performance improvements + o BFD: New 'strict bind' option + o RPKI: VRF support + o Allow use of 240.0.0.0/4 as a private range + o BIRD client uses exit status to report errors + o Important bugfixes + +Version 2.0.9 (2022-02-09) + o BGP: Flowspec validation procedure + o Babel: MAC authentication support + o Routing table configuration blocks + o Optional prefix trie in routing table for faster LPM/interval queries + o CLI: New 'show route in <prefix>' command + o Filter: Faster (16-way) prefix sets + o Filter: MPLS label route attribute + o Filter: Operators to pick community components + o Filter: Operators to find minimum and maximum element of lists + o BGP: New 'free bind' option + o BGP: Log route updates that were changed to withdraws + o BGP: Improved 'invalid next hop' error reporting + o OSPF: Allow ifaces with host address as unnumbered PtP or PtMP ifaces + o OSPF: All packets on PtP networks should be sent to AllSPFRouters address + o Scripts for apkg-powered upstream packaging for deb and rpm + o Support for Blake2s and Blake2b hash functions + o Security keys / passwords can be entered in hexadecimal digits + o Memory statistics split into Effective and Overhead + o Linux: New option 'netlink rx buffer' to specify netlink socket buffer size + o BSD: Assume onlink flag on ifaces with only host addresses + o Many bugfixes + + Notes: + + For OSPF on PtP network, BIRD now sends all packets to multicast AllSPFRouters + address (as required in RFC 2328 8.1). This likely breaks setups with multiple + neighbors on a network configured as PtP, which worked in previous versions. + Such links should be configured as PtMP. + + Since Linux 5.3, netlink socket can be flooded by route cache entries during + route table scan. This version mitigates that issue by using strict netlink + filtering. + + Version 3.0-alpha0 (2022-02-07) o Removal of fixed protocol-specific route attributes o Asynchronous route export @@ -13,6 +99,7 @@ Version 3.0-alpha0 (2022-02-07) o Lots of refactoring o Bugfixes and improvements as they came along + Version 2.0.8 (2021-03-18) o Automatic channel reloads based on RPKI changes o Multiple static routes with the same network @@ -2,6 +2,32 @@ dnl ** Additional Autoconf tests for BIRD configure script dnl ** (c) 1999 Martin Mares <mj@ucw.cz> dnl ** (c) 2021 Maria Matejka <mq@jmq.cz> +AC_DEFUN([BIRD_CHECK_POINTER_ALIGNMENT], +[ + AC_CACHE_CHECK( + [how pointers are aligned], + [bird_cv_pointer_alignment], + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM( + [ + _Static_assert(_Alignof(void *) == 8, "bad"); + ], [] + ) + ], + [bird_cv_pointer_alignment=8], + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM( + [ + _Static_assert(_Alignof(void *) == 4, "bad"); + ], [] + ) + ], + [bird_cv_pointer_alignment=4], + [bird_cv_pointer_alignment=unknown] + )) + ) +]) + AC_DEFUN([BIRD_CHECK_THREAD_LOCAL], [ AC_CACHE_CHECK( diff --git a/bird-gdb.py b/bird-gdb.py index e11f9757..c53a3b55 100644 --- a/bird-gdb.py +++ b/bird-gdb.py @@ -1,6 +1,3 @@ -import itertools -import functools - class BIRDPrinter: def __init__(self, val): self.val = val @@ -29,7 +26,6 @@ class BIRDFValPrinter(BIRDPrinter): "T_ENUM_RTS": "i", "T_ENUM_BGP_ORIGIN": "i", "T_ENUM_SCOPE": "i", - "T_ENUM_RTC": "i", "T_ENUM_RTD": "i", "T_ENUM_ROA": "i", "T_ENUM_NETTYPE": "i", @@ -201,20 +197,12 @@ class BIRDList: if str(self.tail_node["prev"].address) == '0x0': raise Exception("List tail is NULL") - def __iter__(self): + def walk(self, do): cur = self.head while cur.dereference() != self.tail_node: - yield cur + do(cur) cur = cur.dereference()["next"] - def __len__(self): - return sum([1 for _ in self]) - - def __getitem__(self, key): - for item in itertools.islice(self, key): - return item - - raise KeyError("Not enough elements in list") class BIRDListLength(gdb.Function): """Returns length of the list, as in @@ -222,8 +210,13 @@ class BIRDListLength(gdb.Function): def __init__(self): super(BIRDListLength, self).__init__("list_length") + def count(self, _): + self.cnt += 1 + def invoke(self, l): - return len(BIRDList(l)) + self.cnt = 0 + BIRDList(l).walk(self.count) + return self.cnt BIRDListLength() @@ -232,11 +225,30 @@ class BIRDListItem(gdb.Function): def __init__(self): super(BIRDListItem, self).__init__("list_item") + class BLException(Exception): + def __init__(self, node, msg): + Exception.__init__(self, msg) + self.node = node + + def count(self, node): + if self.cnt == self.pos: + raise self.BLException(node, "Node found") + + self.cnt += 1 + def invoke(self, l, n, t=None, item="n"): - if t is None: - return BIRDList(l)[n] - else: - return BIRD.skip_back(t, item, BIRDList(l)[n]) + self.cnt = 0 + self.pos = n + bl = BIRDList(l) + try: + bl.walk(self.count) + except self.BLException as e: + if t is None: + return e.node + else: + return BIRD.skip_back(t, item, e.node) + + raise Exception("List too short") BIRDListItem() @@ -290,26 +302,19 @@ class BIRDLinPoolResource(BIRDResource): self.lptype = gdb.lookup_type("struct linpool") self.val = val.cast(self.lptype) self.info = None - self.std = self.ChunkList(self.val["first"]) - self.large = self.ChunkList(self.val["first_large"]) - - class ChunkList: - def __init__(self, val): - self.val = val - - def __iter__(self): - chunk = self.val - while str(chunk) != 0x0: - yield chunk - chunk = chunk.dereference()["next"] - def __len__(self): - return sum([1 for _ in self]) + def count_chunk(self, which): + cnt = 0 + chunk = self.val[which] + while str(chunk) != '0x0': + cnt += 1 + chunk = chunk.dereference()["next"] + return cnt def parse(self): self.info = { - "std_chunks": len(self.std), - "large_chunks": len(self.large), + "std_chunks": self.count_chunk("first"), + "large_chunks": self.count_chunk("first_large"), } def memsize(self): @@ -334,13 +339,14 @@ class BIRDSlabResource(BIRDResource): self.val = val.cast(self.slabtype) self.info = None + def count_heads_item(self, item): + self.hcnt += 1 + self.used += item.dereference().cast(self.slheadtype)["num_full"] + def count_heads(self, which): self.hcnt = 0 self.used = 0 - for item in BIRDList(self.val[which + "_heads"]): - self.hcnt += 1 - self.used += item.dereference().cast(self.slheadtype)["num_full"] - + BIRDList(self.val[which + "_heads"]).walk(self.count_heads_item) self.info[which + "_heads"] = self.hcnt self.info[which + "_used"] = self.used return (self.hcnt, self.used) @@ -375,36 +381,25 @@ class BIRDSlabResource(BIRDResource): f", {self.val['objs_per_slab']} objects of size {self.val['obj_size']} per head" -class BIRDIOLoopResource(BIRDResource): - def __init__(self, val): - self.iolooptype = gdb.lookup_type("struct birdloop") - self.val = val.cast(self.iolooptype) - self.pages = self.val["pages"] - self.page_size = gdb.lookup_symbol("page_size")[0].value() - - def memsize(self): - return BIRDResourceSize(0, self.iolooptype.sizeof, self.pages['cnt'] * self.page_size) - - def __str__(self): - return f"IO Loop {self.val.address} containing {self.pages['cnt']} free pages (min {self.pages['min']} max {self.pages['max']}), cleanup event {self.pages['cleanup'].dereference().address}: " + \ - ", ".join([ str(p) for p in BIRDList(self.pages["list"])]) - - class BIRDPoolResource(BIRDResource): def __init__(self, val): self.pooltype = gdb.lookup_type("struct pool") self.resptrtype = gdb.lookup_type("struct resource").pointer() + self.page_size = gdb.lookup_symbol("page_size")[0].value() self.val = val.cast(self.pooltype) - self.inside = BIRDList(self.val["inside"]) + self.items = None - def __iter__(self): - for val in self.inside: - yield BIRDNewResource(val.cast(self.resptrtype).dereference()) + def parse_inside(self, val): + self.items.append(BIRDNewResource(val.cast(self.resptrtype).dereference())) - def __len__(self): - return len(self.inside) + def parse(self): + self.items = [] + BIRDList(self.val["inside"]).walk(self.parse_inside) def memsize(self): + if self.items is None: + self.parse() + sum = BIRDResourceSize(0, self.pooltype.sizeof, 0) # for i in self.items: # sum += i.memsize() @@ -412,17 +407,19 @@ class BIRDPoolResource(BIRDResource): return sum def __str__(self): + if self.items is None: + self.parse() + # for i in self.items: # print(i) - return f"Resource pool {self.val.address} \"{self.val['name'].string()}\" containing {len(self)} items" + return f"Resource pool {self.val.address} \"{self.val['name'].string()}\" containing {len(self.items)} items" BIRDResourceMap = { "mbl_memsize": BIRDMBResource, "pool_memsize": BIRDPoolResource, "lp_memsize": BIRDLinPoolResource, "slab_memsize": BIRDSlabResource, - "birdloop_memsize": BIRDIOLoopResource, } def BIRDNewResource(res): @@ -442,13 +439,14 @@ class BIRDResourcePrinter(BIRDPrinter): def __init__(self, val): super(BIRDResourcePrinter, self).__init__(val) self.resource = BIRDNewResource(val) + self.resource.parse() self.resourcetype = gdb.lookup_type("struct resource") if type(self.resource) == BIRDPoolResource: self.children = self.pool_children def pool_children(self): - return iter([ ("\n", i.val.cast(self.resourcetype)) for i in self.resource ]) + return iter([ ("\n", i.val.cast(self.resourcetype)) for i in self.resource.items ]) def to_string(self): return f"[ {str(self.resource.memsize())} ] {str(self.resource)}" diff --git a/client/client.c b/client/client.c index 97cf6639..397db73d 100644 --- a/client/client.c +++ b/client/client.c @@ -50,6 +50,7 @@ static byte *server_read_pos = server_read_buf; int init = 1; /* During intial sequence */ int busy = 1; /* Executing BIRD command */ int interactive; /* Whether stdin is terminal */ +int last_code; /* Last return code */ static int num_lines, skip_input; int term_lns, term_cls; @@ -152,7 +153,7 @@ submit_init_command(char *cmd_raw) if (!cmd) { cleanup(); - exit(0); + exit(1); } submit_server_command(cmd); @@ -196,7 +197,7 @@ init_commands(void) { /* Initial command is finished and we want to exit */ cleanup(); - exit(0); + exit((last_code < 8000) ? 0 : 1); } input_init(); @@ -283,6 +284,8 @@ server_got_reply(char *x) if (code) PRINTF(len, "%s\n", verbose ? x : x+5); + last_code = code; + if (x[4] == ' ') { busy = 0; diff --git a/conf/cf-lex.l b/conf/cf-lex.l index c9d2f5a5..7ce457fe 100644 --- a/conf/cf-lex.l +++ b/conf/cf-lex.l @@ -42,7 +42,7 @@ #define PARSER 1 #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "filter/filter.h" #include "filter/f-inst.h" @@ -77,19 +77,22 @@ static uint cf_hash(const byte *c); #define SYM_NEXT(n) n->next #define SYM_EQ(a,s1,b,s2) !strcmp(a,b) && s1 == s2 #define SYM_FN(k,s) cf_hash(k) -#define SYM_ORDER 6 /* Initial */ +#define SYM_ORDER 4 /* Initial */ #define SYM_REHASH sym_rehash -#define SYM_PARAMS /8, *1, 2, 2, 6, 20 +#define SYM_PARAMS /8, *1, 2, 2, 4, 20 HASH_DEFINE_REHASH_FN(SYM, struct symbol) HASH(struct keyword) kw_hash; - +HASH(struct ea_class) ea_name_hash; struct sym_scope *conf_this_scope; +static struct sym_scope global_root_scope__init = { .active = 1, }; +struct sym_scope *global_root_scope = &global_root_scope__init; + linpool *cfg_mem; int (*cf_read_hook)(byte *buf, unsigned int max, int fd); @@ -347,7 +350,7 @@ else: { return DDOT; } -[={}:;,.()+*/%<>~\[\]?!\|-] { +[={}:;,.()+*/%<>~\[\]?!\|&-] { return yytext[0]; } @@ -574,6 +577,8 @@ check_eof(void) return 0; } +static inline void cf_swap_soft_scope(void); + static struct symbol * cf_new_symbol(const byte *c) { @@ -583,45 +588,64 @@ cf_new_symbol(const byte *c) if (l > SYM_MAX_LEN) cf_error("Symbol too long"); + cf_swap_soft_scope(); + s = cfg_allocz(sizeof(struct symbol) + l + 1); *s = (struct symbol) { .scope = conf_this_scope, .class = SYM_VOID, }; strcpy(s->name, c); - if (!new_config->sym_hash.data) - HASH_INIT(new_config->sym_hash, new_config->pool, SYM_ORDER); + if (!conf_this_scope->hash.data) + HASH_INIT(conf_this_scope->hash, new_config->pool, SYM_ORDER); + + HASH_INSERT2(conf_this_scope->hash, SYM, new_config->pool, s); + + if (conf_this_scope == new_config->root_scope) + add_tail(&(new_config->symbols), &(s->n)); + + return s; +} + +static struct symbol * +cf_root_symbol(const byte *c) +{ + uint l = strlen(c); + if (l > SYM_MAX_LEN) + bug("Root symbol %s too long", c); - HASH_INSERT2(new_config->sym_hash, SYM, new_config->pool, s); + struct symbol *s = mb_alloc(&root_pool, sizeof(struct symbol) + l + 1); + *s = (struct symbol) { .scope = global_root_scope, .class = SYM_VOID, }; + memcpy(s->name, c, l+1); - add_tail(&(new_config->symbols), &(s->n)); + if (!global_root_scope->hash.data) + HASH_INIT(global_root_scope->hash, &root_pool, SYM_ORDER); + HASH_INSERT2(global_root_scope->hash, SYM, &root_pool, s); return s; } + /** - * cf_find_symbol - find a symbol by name - * @cfg: specificed config + * cf_find_symbol_scope - find a symbol by name + * @scope: config scope * @c: symbol name * - * This functions searches the symbol table in the config @cfg for a symbol of - * given name. First it examines the current scope, then the second recent one + * This functions searches the symbol table in the scope @scope for a symbol of + * given name. First it examines the current scope, then the underlying one * and so on until it either finds the symbol and returns a pointer to its * &symbol structure or reaches the end of the scope chain and returns %NULL to * signify no match. */ struct symbol * -cf_find_symbol(const struct config *cfg, const byte *c) +cf_find_symbol_scope(const struct sym_scope *scope, const byte *c) { struct symbol *s; - if (cfg->sym_hash.data && - (s = HASH_FIND(cfg->sym_hash, SYM, c, 1))) - return s; - - /* In CLI command parsing, fallback points to the current config, otherwise it is NULL. */ - if (cfg->fallback && - cfg->fallback->sym_hash.data && - (s = HASH_FIND(cfg->fallback->sym_hash, SYM, c, 1))) - return s; + /* Find the symbol here or anywhere below */ + while (scope) + if (scope->hash.data && (s = HASH_FIND(scope->hash, SYM, c, 1))) + return s; + else + scope = scope->next; return NULL; } @@ -638,7 +662,7 @@ cf_find_symbol(const struct config *cfg, const byte *c) struct symbol * cf_get_symbol(const byte *c) { - return cf_find_symbol(new_config, c) ?: cf_new_symbol(c); + return cf_find_symbol_scope(conf_this_scope, c) ?: cf_new_symbol(c); } /** @@ -654,10 +678,10 @@ cf_localize_symbol(struct symbol *sym) /* If the symbol type is void, it has been recently allocated just in this scope. */ if (!sym->class) return sym; - + /* If the scope is the current, it is already defined in this scope. */ - if (sym->scope == conf_this_scope) - cf_error("Symbol already defined"); + if (cf_symbol_is_local(sym)) + cf_error("Symbol '%s' already defined", sym->name); /* Not allocated here yet, doing it now. */ return cf_new_symbol(sym->name); @@ -689,10 +713,7 @@ cf_lex_symbol(const char *data) struct symbol *sym = cf_get_symbol(data); cf_lval.s = sym; - if (sym->class != SYM_VOID) - return CF_SYM_KNOWN; - - /* Is it a keyword? */ + /* Is it a keyword? Prefer the keyword. */ struct keyword *k = HASH_FIND(kw_hash, KW, data); if (k) { @@ -705,21 +726,51 @@ cf_lex_symbol(const char *data) } } - /* OK, undefined symbol */ - cf_lval.s = sym; - return CF_SYM_UNDEFINED; + /* OK, only a symbol. */ + if (sym->class == SYM_VOID) + return CF_SYM_UNDEFINED; + else + return CF_SYM_KNOWN; } static void cf_lex_init_kh(void) { - HASH_INIT(kw_hash, &root_pool, KW_ORDER); + HASH_INIT(kw_hash, config_pool, KW_ORDER); struct keyword *k; for (k=keyword_list; k->name; k++) HASH_INSERT(kw_hash, KW, k); } +void +ea_lex_register(struct ea_class *def) +{ + struct symbol *sym = cf_root_symbol(def->name); + sym->class = SYM_ATTRIBUTE; + sym->attribute = def; + def->sym = sym; +} + +void +ea_lex_unregister(struct ea_class *def) +{ + struct symbol *sym = def->sym; + HASH_REMOVE2(global_root_scope->hash, SYM, &root_pool, sym); + mb_free(sym); + def->sym = NULL; +} + +struct ea_class * +ea_class_find_by_name(const char *name) +{ + struct symbol *sym = cf_find_symbol(global_root_scope, name); + if (!sym || (sym->class != SYM_ATTRIBUTE)) + return NULL; + else + return sym->attribute; +} + /** * cf_lex_init - initialize the lexer * @is_cli: true if we're going to parse CLI command, false for configuration @@ -753,6 +804,11 @@ cf_lex_init(int is_cli, struct config *c) c->root_scope = cfg_allocz(sizeof(struct sym_scope)); conf_this_scope = c->root_scope; conf_this_scope->active = 1; + + if (is_cli) + conf_this_scope->next = config->root_scope; + else + conf_this_scope->next = global_root_scope; } /** @@ -787,6 +843,8 @@ cf_push_scope(struct symbol *sym) void cf_pop_scope(void) { + ASSERT(!conf_this_scope->soft_scopes); + conf_this_scope->active = 0; conf_this_scope = conf_this_scope->next; @@ -794,6 +852,52 @@ cf_pop_scope(void) } /** + * cf_push_soft_scope - enter new soft scope + * + * If we want to enter a new anonymous scope that most likely will not contain + * any symbols, we can use cf_push_soft_scope() insteas of cf_push_scope(). + * Such scope will be converted to a regular scope on first use. + */ +void +cf_push_soft_scope(void) +{ + if (conf_this_scope->soft_scopes < 0xfe) + conf_this_scope->soft_scopes++; + else + cf_push_block_scope(); +} + +/** + * cf_pop_soft_scope - leave a soft scope + * + * Leave a soft scope entered by cf_push_soft_scope(). + */ +void +cf_pop_soft_scope(void) +{ + if (conf_this_scope->soft_scopes) + conf_this_scope->soft_scopes--; + else + cf_pop_block_scope(); +} + +/** + * cf_swap_soft_scope - convert soft scope to regular scope + * + * Soft scopes cannot hold symbols, so they must be converted to regular scopes + * on first use. It is done automatically by cf_new_symbol(). + */ +static inline void +cf_swap_soft_scope(void) +{ + if (conf_this_scope->soft_scopes) + { + conf_this_scope->soft_scopes--; + cf_push_block_scope(); + } +} + +/** * cf_symbol_class_name - get name of a symbol class * @sym: symbol * diff --git a/conf/conf.c b/conf/conf.c index e8f1559a..daac85c1 100644 --- a/conf/conf.c +++ b/conf/conf.c @@ -46,7 +46,7 @@ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "lib/resource.h" @@ -61,6 +61,7 @@ static jmp_buf conf_jmpbuf; struct config *config, *new_config; +pool *config_pool; static struct config *old_config; /* Old configuration */ static struct config *future_config; /* New config held here if recon requested during recon */ @@ -89,7 +90,7 @@ int undo_available; /* Undo was not requested from last reconfiguration */ struct config * config_alloc(const char *name) { - pool *p = rp_new(&root_pool, &main_birdloop, "Config"); + pool *p = rp_new(config_pool, "Config"); linpool *l = lp_new_default(p); struct config *c = lp_allocz(l, sizeof(struct config)); @@ -140,6 +141,7 @@ config_parse(struct config *c) protos_preconfig(c); rt_preconfig(c); cf_parse(); + rt_postconfig(c); if (EMPTY_LIST(c->protos)) cf_error("No protocol is specified in the config file"); @@ -168,7 +170,6 @@ int cli_parse(struct config *c) { int done = 0; - c->fallback = config; new_config = c; cfg_mem = c->mem; if (setjmp(conf_jmpbuf)) @@ -179,7 +180,6 @@ cli_parse(struct config *c) done = 1; cleanup: - c->fallback = NULL; new_config = NULL; cfg_mem = NULL; return done; @@ -195,8 +195,33 @@ cleanup: void config_free(struct config *c) { - if (c) - rp_free(c->pool, &root_pool); + if (!c) + return; + + ASSERT(!c->obstacle_count); + + rfree(c->pool); +} + +/** + * config_free_old - free stored old configuration + * + * This function frees the old configuration (%old_config) that is saved for the + * purpose of undo. It is useful before parsing a new config when reconfig is + * requested, to avoid keeping three (perhaps memory-heavy) configs together. + * Configuration is not freed when it is still active during reconfiguration. + */ +void +config_free_old(void) +{ + if (!old_config || old_config->obstacle_count) + return; + + tm_stop(config_timer); + undo_available = 0; + + config_free(old_config); + old_config = NULL; } void @@ -490,10 +515,12 @@ config_timeout(timer *t UNUSED) void config_init(void) { - config_event = ev_new(&root_pool); + config_pool = rp_new(&root_pool, "Configurations"); + + config_event = ev_new(config_pool); config_event->hook = config_done; - config_timer = tm_new(&root_pool); + config_timer = tm_new(config_pool); config_timer->hook = config_timeout; } @@ -522,7 +549,6 @@ order_shutdown(int gr) init_list(&c->tables); init_list(&c->symbols); memset(c->def_tables, 0, sizeof(c->def_tables)); - HASH_INIT(c->sym_hash, c->pool, 4); c->shutdown = 1; c->gr_down = gr; diff --git a/conf/conf.h b/conf/conf.h index 4f6aa6eb..b30914c1 100644 --- a/conf/conf.h +++ b/conf/conf.h @@ -16,7 +16,6 @@ #include "lib/timer.h" /* Configuration structure */ - struct config { pool *pool; /* Pool the configuration is stored in */ linpool *mem; /* Linear pool containing configuration data */ @@ -28,7 +27,7 @@ struct config { int mrtdump_file; /* Configured MRTDump file (sysdep, fd in unix) */ const char *syslog_name; /* Name used for syslog (NULL -> no syslog) */ - struct rtable_config *def_tables[NET_MAX]; /* Default routing tables for each network */ + struct symbol *def_tables[NET_MAX]; /* Default routing tables for each network */ struct iface_patt *router_id_from; /* Configured list of router ID iface patterns */ u32 router_id; /* Our Router ID */ @@ -55,8 +54,8 @@ struct config { char *err_file_name; /* File name containing error */ char *file_name; /* Name of main configuration file */ int file_fd; /* File descriptor of main configuration file */ - HASH(struct symbol) sym_hash; /* Lexer: symbol hash table */ - struct config *fallback; /* Link to regular config for CLI parsing */ + int thread_count; /* How many worker threads to prefork */ + struct sym_scope *root_scope; /* Scope for root symbols */ int obstacle_count; /* Number of items blocking freeing of this config */ int shutdown; /* This is a pseudo-config for daemon shutdown */ @@ -72,6 +71,7 @@ struct config *config_alloc(const char *name); int config_parse(struct config *); int cli_parse(struct config *); void config_free(struct config *); +void config_free_old(void); int config_commit(struct config *, int type, uint timeout); int config_confirm(void); int config_undo(void); @@ -98,7 +98,7 @@ void order_shutdown(int gr); /* Pools */ - +extern pool *config_pool; extern linpool *cfg_mem; #define cfg_alloc(size) lp_alloc(cfg_mem, size) @@ -123,7 +123,7 @@ struct symbol { const struct f_line *function; /* For SYM_FUNCTION */ const struct filter *filter; /* For SYM_FILTER */ struct rtable_config *table; /* For SYM_TABLE */ - struct f_dynamic_attr *attribute; /* For SYM_ATTRIBUTE */ + struct ea_class *attribute; /* For SYM_ATTRIBUTE */ struct f_val *val; /* For SYM_CONSTANT */ uint offset; /* For SYM_VARIABLE */ }; @@ -134,10 +134,17 @@ struct symbol { struct sym_scope { struct sym_scope *next; /* Next on scope stack */ struct symbol *name; /* Name of this scope */ + + HASH(struct symbol) hash; /* Local symbol hash */ + uint slots; /* Variable slots */ - int active; /* Currently entered */ + byte active; /* Currently entered */ + byte block; /* No independent stack frame */ + byte soft_scopes; /* Number of soft scopes above */ }; +extern struct sym_scope *global_root_scope; + struct bytestring { size_t length; byte data[]; @@ -186,12 +193,22 @@ int cf_lex(void); void cf_lex_init(int is_cli, struct config *c); void cf_lex_unwind(void); -struct symbol *cf_find_symbol(const struct config *cfg, const byte *c); +struct symbol *cf_find_symbol_scope(const struct sym_scope *scope, const byte *c); +static inline struct symbol *cf_find_symbol_cfg(const struct config *cfg, const byte *c) +{ return cf_find_symbol_scope(cfg->root_scope, c); } + +#define cf_find_symbol(where, what) _Generic(*(where), \ + struct config: cf_find_symbol_cfg, \ + struct sym_scope: cf_find_symbol_scope \ + )((where), (what)) struct symbol *cf_get_symbol(const byte *c); struct symbol *cf_default_name(char *template, int *counter); struct symbol *cf_localize_symbol(struct symbol *sym); +static inline int cf_symbol_is_local(struct symbol *sym) +{ return (sym->scope == conf_this_scope) && !conf_this_scope->soft_scopes; } + /** * cf_define_symbol - define meaning of a symbol * @sym: symbol to be defined @@ -215,6 +232,15 @@ struct symbol *cf_localize_symbol(struct symbol *sym); void cf_push_scope(struct symbol *); void cf_pop_scope(void); +void cf_push_soft_scope(void); +void cf_pop_soft_scope(void); + +static inline void cf_push_block_scope(void) +{ cf_push_scope(NULL); conf_this_scope->block = 1; } + +static inline void cf_pop_block_scope(void) +{ ASSERT(conf_this_scope->block); cf_pop_scope(); } + char *cf_symbol_class_name(struct symbol *sym); /* Parser */ diff --git a/conf/confbase.Y b/conf/confbase.Y index 6985783b..8e5da9e3 100644 --- a/conf/confbase.Y +++ b/conf/confbase.Y @@ -14,11 +14,12 @@ CF_HDR #include "conf/conf.h" #include "lib/resource.h" #include "lib/socket.h" +#include "lib/settle.h" #include "lib/timer.h" #include "lib/string.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/bfd.h" #include "nest/cli.h" #include "filter/filter.h" @@ -71,10 +72,12 @@ CF_DECLS } xp; enum filter_return fret; enum ec_subtype ecs; - struct f_dynamic_attr fda; + struct ea_class *ea_class; struct f_static_attr fsa; + struct f_attr_bit fab; struct f_lval flv; struct f_line *fl; + struct f_arg *fa; const struct filter *f; struct f_tree *e; struct f_trie *trie; @@ -91,7 +94,8 @@ CF_DECLS struct proto_spec ps; struct channel_limit cl; struct timeformat *tf; - mpls_label_stack *mls; + struct settle_config settle; + struct adata *ad; struct bytestring *bs; } @@ -109,17 +113,19 @@ CF_DECLS %type <i> expr bool pxlen4 %type <time> expr_us time +%type <settle> settle %type <a> ipa -%type <net> net_ip4_ net_ip6_ net_ip6 net_ip_ net_ip net_or_ipa +%type <net> net_ip4_ net_ip4 net_ip6_ net_ip6 net_ip_ net_ip net_or_ipa %type <net_ptr> net_ net_any net_vpn4_ net_vpn6_ net_vpn_ net_roa4_ net_roa6_ net_roa_ net_ip6_sadr_ net_mpls_ -%type <mls> label_stack_start label_stack +%type <ad> label_stack_start label_stack %type <t> text opttext -%type <s> symbol +%type <s> symbol symbol_known toksym %nonassoc PREFIX_DUMMY %left AND OR %nonassoc '=' '<' '>' '~' GEQ LEQ NEQ NMA PO PC +%left '|' '&' %left '+' '-' %left '*' '/' '%' %left '!' @@ -151,16 +157,16 @@ conf: definition ; definition: DEFINE symbol '=' term ';' { - struct f_val *val = cfg_allocz(sizeof(struct f_val)); - if (f_eval(f_linearize($4), cfg_mem, val) > F_RETURN) cf_error("Runtime error"); - cf_define_symbol($2, SYM_CONSTANT | val->type, val, val); + struct f_val val; + if (f_eval(f_linearize($4, 1), &val) > F_RETURN) cf_error("Runtime error"); + cf_define_symbol($2, SYM_CONSTANT | val.type, val, lp_val_copy(cfg_mem, &val)); } ; expr: NUM - | '(' term ')' { $$ = f_eval_int(f_linearize($2)); } - | CF_SYM_KNOWN { + | '(' term ')' { $$ = f_eval_int(f_linearize($2, 1)); } + | symbol_known { if ($1->class != (SYM_CONSTANT | T_INT)) cf_error("Number constant expected"); $$ = SYM_VAL($1).i; } ; @@ -171,7 +177,9 @@ expr_us: | expr US { $$ = $1 US_; } ; -symbol: CF_SYM_UNDEFINED | CF_SYM_KNOWN ; +toksym: FROM | PREFERENCE ; +symbol: CF_SYM_UNDEFINED | CF_SYM_KNOWN | toksym ; +symbol_known: CF_SYM_KNOWN | toksym ; /* Switches */ @@ -303,6 +311,15 @@ net_: /* Networks - regular */ +net_ip4: + net_ip4_ + | CF_SYM_KNOWN { + if (($1->class != (SYM_CONSTANT | T_NET)) || (SYM_VAL($1).net->type != NET_IP4)) + cf_error("IPv4 network constant expected"); + $$ = * SYM_VAL($1).net; + } + ; + net_ip6: net_ip6_ | CF_SYM_KNOWN { @@ -347,17 +364,19 @@ net_or_ipa: label_stack_start: NUM { - $$ = cfg_allocz(sizeof(mpls_label_stack)); - $$->len = 1; - $$->stack[0] = $1; + $$ = cfg_allocz(ADATA_SIZE(MPLS_MAX_LABEL_STACK * sizeof(u32))); + $$->length = sizeof(u32); + *((u32 *)$$->data) = $1; }; label_stack: label_stack_start | label_stack '/' NUM { - if ($1->len >= MPLS_MAX_LABEL_STACK) + if ($1->length >= MPLS_MAX_LABEL_STACK * sizeof(u32)) cf_error("Too many labels in stack"); - $1->stack[$1->len++] = $3; + + *((u32 *)($$->data + $1->length)) = $3; + $1->length += sizeof(u32); $$ = $1; } ; @@ -370,6 +389,13 @@ time: } ; +/* Settle timer configuration */ +settle: expr_us expr_us { + if ($1 > $2) cf_error("Minimum settle time %t is bigger than maximum settle time %t", $1, $2); + $$.min = $1; + $$.max = $2; +}; + text: TEXT | CF_SYM_KNOWN { diff --git a/conf/flowspec.Y b/conf/flowspec.Y index 56a7c5dc..102fed45 100644 --- a/conf/flowspec.Y +++ b/conf/flowspec.Y @@ -142,7 +142,7 @@ flow_frag_opts: ; flow4_item: - flow_srcdst net_ip { + flow_srcdst net_ip4 { flow_builder_set_type(this_flow, $1); flow_builder4_add_pfx(this_flow, (net_addr_ip4 *) &($2)); } @@ -180,7 +180,7 @@ flow6_opts: flow_builder_init: { if (this_flow == NULL) - this_flow = flow_builder_init(&root_pool); + this_flow = flow_builder_init(config_pool); /* FIXME: This should be allocated from tmp in future */ else flow_builder_clear(this_flow); }; diff --git a/conf/gen_keywords.m4 b/conf/gen_keywords.m4 index 0c1dc545..53226e4d 100644 --- a/conf/gen_keywords.m4 +++ b/conf/gen_keywords.m4 @@ -26,8 +26,7 @@ m4_define(CF_DEFINES, `m4_divert(-1)') m4_define(CF_handle_kw, `m4_divert(1){ "m4_translit($1,[[A-Z]],[[a-z]])", $1, NULL }, m4_divert(-1)') m4_define(CF_keywd, `m4_ifdef([[CF_tok_$1]],,[[m4_define([[CF_tok_$1]],1)CF_handle_kw($1)]])') -m4_define(CF_KEYWORDS, `m4_define([[CF_toks]],[[]])CF_iterate([[CF_keywd]], [[$@]])m4_ifelse(CF_toks,,,%token[[]]CF_toks -)DNL') +m4_define(CF_KEYWORDS, `CF_iterate([[CF_keywd]], [[$@]])DNL') # CLI commands generate keywords as well m4_define(CF_CLI, `CF_KEYWORDS(m4_translit($1, [[ ]], [[,]])) diff --git a/conf/gen_parser.m4 b/conf/gen_parser.m4 index 5b378a93..af4b1455 100644 --- a/conf/gen_parser.m4 +++ b/conf/gen_parser.m4 @@ -31,7 +31,7 @@ m4_define(CF_iterate, `m4_define([[CF_iter]], m4_defn([[$1]]))CF_itera($2)') # Keywords act as untyped %token m4_define(CF_keywd, `m4_ifdef([[CF_tok_$1]],,[[m4_define([[CF_tok_$1]],1)m4_define([[CF_toks]],CF_toks $1)]])') -m4_define(CF_KEYWORDS, `m4_define([[CF_toks]],[[]])CF_iterate([[CF_keywd]], [[$@]])m4_ifelse(CF_toks,,,%token[[]]CF_toks +m4_define(CF_KEYWORDS, `m4_define([[CF_toks]],[[]])CF_iterate([[CF_keywd]], [[$@]])m4_ifelse(CF_toks,,,%token<s>[[]]CF_toks )DNL') # CLI commands diff --git a/configure.ac b/configure.ac index 5c0cf002..bf5c6df4 100644 --- a/configure.ac +++ b/configure.ac @@ -36,6 +36,12 @@ AC_ARG_ENABLE([memcheck], [enable_memcheck=yes] ) +AC_ARG_ENABLE([compact-tries], + [AS_HELP_STRING([--enable-compact-tries], [use 4-way tries instead of 16-way ones @<:@no@:>@])], + [], + [enable_compact_tries=no] +) + AC_ARG_ENABLE([libssh], [AS_HELP_STRING([--enable-libssh], [enable LibSSH support in RPKI @<:@try@:>@])], [], @@ -226,6 +232,8 @@ else ;; openbsd*) sysdesc=bsd + CPPFLAGS="$CPPFLAGS -I/usr/local/include" + LDFLAGS="$LDFLAGS -L/usr/local/lib" ;; dragonfly*) sysdesc=bsd @@ -276,6 +284,12 @@ if test "$enable_libssh" != no ; then enable_libssh=no fi fi + + AC_CHECK_LIB([ssh], [ssh_session_is_known_server], [ssh_old_server_validation_api=no], [ssh_old_server_validation_api=yes]) + + if test "$ssh_old_server_validation_api" = yes; then + AC_DEFINE([HAVE_SSH_OLD_SERVER_VALIDATION_API], [1], [Define to 1 if ssh_session_is_known_server isn't defined]) + fi fi if test "$enable_mpls_kernel" != no ; then @@ -292,7 +306,8 @@ if test "$enable_mpls_kernel" != no ; then fi fi -all_protocols="bfd babel bgp mrt ospf perf pipe radv rip rpki static" +# temporarily removed "mrt" from all_protocols to speed up 3.0-alpha1 release +all_protocols="bfd babel bgp ospf perf pipe radv rip rpki static" all_protocols=`echo $all_protocols | sed 's/ /,/g'` if test "$with_protocols" = all ; then @@ -337,7 +352,7 @@ case $sysdesc in ;; esac -AC_CHECK_HEADERS_ONCE([alloca.h syslog.h]) +AC_CHECK_HEADERS_ONCE([alloca.h syslog.h stdatomic.h]) AC_CHECK_HEADER([sys/mman.h], [AC_DEFINE([HAVE_MMAP], [1], [Define to 1 if mmap() is available.])], have_mman=no) AC_CHECK_FUNC([aligned_alloc], [AC_DEFINE([HAVE_ALIGNED_ALLOC], [1], [Define to 1 if aligned_alloc() is available.])], have_aligned_alloc=no) AC_CHECK_MEMBERS([struct sockaddr.sa_len], [], [], [#include <sys/socket.h>]) @@ -353,6 +368,13 @@ AC_C_BIGENDIAN( [AC_MSG_ERROR([Cannot determine CPU endianity.])] ) +BIRD_CHECK_POINTER_ALIGNMENT +if test "$bird_cv_pointer_alignment" = "unknown" ; then + AC_MSG_ERROR([Couldn't determine pointer alignment]) +else + AC_DEFINE_UNQUOTED([CPU_POINTER_ALIGNMENT], [$bird_cv_pointer_alignment], [Pointer alignment for macro usage]) +fi + BIRD_CHECK_ANDROID_GLOB if test "$bird_cv_lib_glob" = no ; then AC_MSG_ERROR([glob.h not found.]) @@ -400,6 +422,10 @@ if test "$enable_debug" = yes ; then fi fi +if test "$enable_compact_tries" = yes ; then + AC_DEFINE([ENABLE_COMPACT_TRIES], [1], [Define to 1 if you want 4-way tries instead of 16-way ones.]) +fi + CLIENT=birdcl CLIENT_LIBS= if test "$enable_client" = yes ; then @@ -460,6 +486,7 @@ AC_MSG_RESULT([ Object directory: $objdir]) AC_MSG_RESULT([ Iproute2 directory: $iproutedir]) AC_MSG_RESULT([ System configuration: $sysdesc]) AC_MSG_RESULT([ Debugging: $enable_debug]) +AC_MSG_RESULT([ Compact tries: $enable_compact_tries]) AC_MSG_RESULT([ Routing protocols: $protocols]) AC_MSG_RESULT([ LibSSH support in RPKI: $enable_libssh]) AC_MSG_RESULT([ Kernel MPLS support: $enable_mpls_kernel]) diff --git a/distro/pkg/rpm/bird.service b/distro/pkg/rpm/bird.service index fa203c78..26bcb8a0 100644 --- a/distro/pkg/rpm/bird.service +++ b/distro/pkg/rpm/bird.service @@ -5,8 +5,9 @@ After=network.target [Service] Type=simple +ExecStartPre=/usr/sbin/bird -p ExecStart=/usr/sbin/bird -f -u bird -g bird -ExecReload=/bin/kill -HUP $MAINPID +ExecReload=/usr/sbin/birdc configure Restart=on-failure [Install] diff --git a/doc/bird.sgml b/doc/bird.sgml index 5cf09bfe..3f6acaed 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -14,7 +14,7 @@ configuration - something in config which is not keyword. (set-fill-column 80) - Copyright 1999,2000 Pavel Machek <pavel@ucw.cz>, distribute under GPL version 2 or later. + Copyright 1999 - 2022 CZ.NIC, z.s.p.o , distribute under GPL version 2 or later. --> @@ -144,13 +144,6 @@ BIRD executable by configuring out routing protocols you don't use, and <p>You can pass several command-line options to bird: <descrip> - <tag><label id="argv-block">-B <m/exp/</tag> - allocate memory using 2^<cf/exp/ byte sized blocks; - if you're expecting high memory load, raise this to - reduce number of allocated memory pages. For a million routes - in one table, the recommended setting is 18. - Default is your system page size, typically 12 for 4096 bytes. - <tag><label id="argv-config">-c <m/config name/</tag> use given configuration file instead of <it/prefix/<file>/etc/bird.conf</file>. @@ -258,16 +251,9 @@ The global best route selection algorithm is (roughly) as follows: </itemize> <p><label id="dsc-table-sorted">Usually, a routing table just chooses a selected -route from a list of entries for one network. But if the <cf/sorted/ option is -activated, these lists of entries are kept completely sorted (according to -preference or some protocol-dependent metric). This is needed for some features -of some protocols (e.g. <cf/secondary/ option of BGP protocol, which allows to -accept not just a selected route, but the first route (in the sorted list) that -is accepted by filters), but it is incompatible with some other features (e.g. -<cf/deterministic med/ option of BGP protocol, which activates a way of choosing -selected route that cannot be described using comparison and ordering). Minor -advantage is that routes are shown sorted in <cf/show route/, minor disadvantage -is that it is slightly more computationally expensive. +route from a list of entries for one network. Optionally, these lists of entries +are kept completely sorted (according to preference or some protocol-dependent +metric). See <ref id="rtable-sorted" name="sorted"> table option for details. <sect>Routes and network types <label id="routes"> @@ -519,6 +505,11 @@ include "tablename.conf";; See <ref id="channel-debug" name="debug"> in the channel section. Default: off. + <tag><label id="opt-debug-tables">debug tables all|off|{ states|routes|filters|events [, <m/.../] }</tag> + Set global defaults of table debugging options. + See <ref id="rtable-debug" name="debug"> in the table section. + Default: off. + <tag><label id="opt-debug-commands">debug commands <m/number/</tag> Control logging of client connections (0 for no logging, 1 for logging of connects and disconnects, 2 and higher for logging of all client @@ -541,6 +532,12 @@ include "tablename.conf";; killed by abort signal. The timeout has effective granularity of seconds, zero means disabled. Default: disabled (0). + <tag><label id="opt-threads">threads <m/number/</tag> + Set how many worker threads should BIRD spawn. Tests show that every + thread can utilize one complete CPU core, therefore you probably want to + keep at least one free core. The maximum feasible thread count heavily + depends on the actual workload and must be determined by testing or estimation. Default: 1 + <tag><label id="opt-mrtdump">mrtdump "<m/filename/"</tag> Set MRTdump file name. This option must be specified to allow MRTdump feature. Default: no dump file. @@ -634,18 +631,107 @@ include "tablename.conf";; <cf/protocol/ times, and the <cf/iso long ms/ format for <cf/base/ and <cf/log/ times. - <tag><label id="opt-table"><m/nettype/ table <m/name/ [sorted]</tag> - Create a new routing table. The default routing tables <cf/master4/ and - <cf/master6/ are created implicitly, other routing tables have to be - added by this command. Option <cf/sorted/ can be used to enable sorting - of routes, see <ref id="dsc-table-sorted" name="sorted table"> - description for details. + <tag><label id="opt-table"><m/nettype/ table <m/name/ [ { <m/option/; [<m/.../] } ]</tag> + Define a new routing table. The default routing tables <cf/master4/ and + <cf/master6/ are defined implicitly, other routing tables have to be + defined by this option. See the <ref id="rtable-opts" + name="routing table configuration section"> for routing table options. <tag><label id="opt-eval">eval <m/expr/</tag> Evaluates given filter expression. It is used by the developers for testing of filters. </descrip> +<sect>Routing table options +<label id="rtable-opts"> + +<p>Most routing tables do not need any options and are defined without an option +block, but there are still some options to tweak routing table behavior. Note +that implicit tables (<cf/master4/ and <cf/master6/) can be redefined in order +to set options. + +<descrip> + <tag><label id="rtable-sorted">sorted <m/switch/</tag> + Usually, a routing table just chooses the selected (best) route from a + list of routes for each network, while keeping remaining routes unsorted. + If enabled, these lists of routes are kept completely sorted (according + to preference or some protocol-dependent metric). + + This is needed for some protocol features (e.g. <cf/secondary/ option of + BGP protocol, which allows to accept not just a selected route, but the + first route (in the sorted list) that is accepted by filters), but it is + incompatible with some other features (e.g. <cf/deterministic med/ + option of BGP protocol, which activates a way of choosing selected route + that cannot be described using comparison and ordering). Minor advantage + is that routes are shown sorted in <cf/show route/, minor disadvantage + is that it is slightly more computationally expensive. Default: off. + + <tag><label id="rtable-trie">trie <m/switch/</tag> + BIRD routing tables are implemented with hash tables, which is efficient + for exact-match lookups, but inconvenient for longest-match lookups or + interval lookups (finding superprefix or subprefixes). This option + activates additional trie structure that is used to accelerate these + lookups, while using the hash table for exact-match lookups. + + This has advantage for <ref id="rpki" name="RPKI"> (on ROA tables), + for <ref id="bgp-gateway" name="recursive next-hops"> (on IGP tables), + and is required for <ref id="bgp-validate" name="flowspec validation"> + (on base IP tables). Another advantage is that interval results (like + from <cf/show route in .../ command) are lexicographically sorted. The + disadvantage is that trie-enabled routing tables require more memory, + which may be an issue especially in multi-table setups. Default: off. + + <tag><label id="rtable-gc-threshold">gc threshold <m/number/</tag> + Specify a minimum amount of removed networks that triggers a garbage + collection (GC) cycle. Default: 1000. + + <tag><label id="rtable-gc-period">gc period <m/time/</tag> + Specify a period of time between consecutive GC cycles. When there is a + significant amount of route withdraws, GC cycles are executed repeatedly + with given period time (with some random factor). When there is just + small amount of changes, GC cycles are not executed. In extensive route + server setups, running GC on hundreds of full BGP routing tables can + take significant amount of time, therefore they should use higher GC + periods. Default: adaptive, based on number of routing tables in the + configuration. From 10 s (with <= 25 routing tables) up to 600 s (with + >= 1500 routing tables). + + <tag><label id="rtable-cork-threshold">cork threshold <m/number/ <m/number/</tag> + Too many pending exports may lead to memory bloating. In such cases, + BIRD tries to relieve the memory pressure by pausing some routines until + the queue sizes get low enough. This option allows the user to set the + thresholds; first value is the low threshold (when to resume), the + second one is the high threshold (when to pause). The higher is the + threshold, the more memory can get used. In most cases, the defaults + should work for you. Default: 1024 8192. + + <tag><label id="rtable-export-settle-time">export settle time <m/time/ <m/time/</tag> + Minimum and maximum settle times, respectively, for export announcements. + When multiple routes are changing, this mechanism waits for the changes + to settle before waking up sleeping export threads but if the changes are coming + steadily, BIRD isn't waiting forever; at most the maximum time. + Default values: <cf/1 ms 100 ms/. You have to always provide both values. + + <tag><label id="rtable-route-refresh-export-settle-time">route refresh export settle time <m/time/ <m/time/</tag> + Minimum and maximum settle times, respectively, for export announcements + (the same as above), valid when any channel is currently doing a route refresh. + This serves a purpose of even more aggresive change bundling, knowing that there + is some active process generating changes in a fast pace. If you don't want + this feature, set this to the same values as <ref id="rtable-export-settle-time" name="export settle time">. + Default values: <cf/100 ms 3 s/. + + <tag><label id="rtable-debug">debug all|off|{ states|routes|events [, <m/.../] }</tag> + Set table debugging options. Each table can write some trace messages + into log with category <cf/trace/. You can request <cf/all/ trace messages + or select some types: <cf/states/ for table state changes and auxiliary + processes, <cf/routes/ for auxiliary route notifications (next hop update, + flowspec revalidation) and <cf/events/ for more detailed auxiliary routine + debug. See also <ref id="channel-debug" name="channel debugging option">. + Default: off. + +</descrip> + + <sect>Protocol options <label id="protocol-opts"> @@ -873,10 +959,12 @@ inherited from templates can be updated by new definitions. <cf/none/ is for dropping all routes. Default: <cf/all/ (except for EBGP). - <tag><label id="proto-export">export <m/filter/</tag> + <tag><label id="proto-export">export [ in <m/prefix/ ] <m/filter/</tag> This is similar to the <cf>import</cf> keyword, except that it works in - the direction from the routing table to the protocol. Default: <cf/none/ - (except for EBGP). + the direction from the routing table to the protocol. If <cf/in/ keyword is used, + only routes inside the given prefix are exported. Other routes are completely + ignored (e.g. no logging and no statistics). + Default: <cf/none/ (except for EBGP). <tag><label id="proto-import-keep-filtered">import keep filtered <m/switch/</tag> Usually, if an import filter rejects a route, the route is forgotten. @@ -898,6 +986,16 @@ inherited from templates can be updated by new definitions. <ref id="bgp-export-table" name="export table"> (for respective direction). Default: on. + <tag><label id="rtable-min-settle-time">roa settle time <m/time/ <m/time/</tag> + Minimum and maximum settle times, respectively, for ROA table changes. + The automatic reload is triggered after the minimum time after the last + ROA table change has been received but not later than the maximum time after + first unprocessed ROA table change. Therefore with default values, the + automatic reload happens 1 second after the ROA table stops updating, yet if it + were to be later than 20 seconds after the ROA table starts updating, + the automatic reload is triggered anyway. Default values: <cf/1 s 20 s/. + You have to always provide both values. + <tag><label id="proto-import-limit">import limit [<m/number/ | off ] [action warn | block | restart | disable]</tag> Specify an import route limit (a maximum number of routes imported from the protocol) and optionally the action to be taken when the limit is @@ -1057,14 +1155,16 @@ This argument can be omitted if there exists only a single instance. Show the list of symbols defined in the configuration (names of protocols, routing tables etc.). - <tag><label id="cli-show-route">show route [[for] <m/prefix/|<m/IP/] [table (<m/t/ | all)] [filter <m/f/|where <m/c/] [(export|preexport|noexport) <m/p/] [protocol <m/p/] [(stats|count)] [<m/options/]</tag> + <tag><label id="cli-show-route">show route [[(for|in)] <m/prefix/|for <m/IP/] [table (<m/t/|all)] [(import|export) table <m/p/.<m/c/] [filter <m/f/|where <m/cond/] [(export|preexport|noexport) <m/p/] [protocol <m/p/] [(stats|count)] [<m/options/]</tag> Show contents of specified routing tables, that is routes, their metrics and (in case the <cf/all/ switch is given) all their attributes. <p>You can specify a <m/prefix/ if you want to print routes for a specific network. If you use <cf>for <m/prefix or IP/</cf>, you'll get the entry which will be used for forwarding of packets to the given - destination. By default, all routes for each network are printed with + destination. Finally, if you use <cf>in <m/prefix/</cf>, you get all + prefixes covered by the given prefix. + By default, all routes for each network are printed with the selected one at the top, unless <cf/primary/ is given in which case only the selected route is shown. @@ -1077,6 +1177,11 @@ This argument can be omitted if there exists only a single instance. Last, the set of default tables is used: <cf/master4/, <cf/master6/ and each first table of any other network type. + <p>There are internal tables when <cf/(import|export) table/ options + are used for some channels. They can be selected explicitly with + <cf/(import|export) table/ switch, specifying protocol <m/p/ and + channel name <m/c/. + <p>You can also ask for printing only routes processed and accepted by a given filter (<cf>filter <m/name/</cf> or <cf>filter { <m/filter/ } </cf> or matching a given condition (<cf>where <m/condition/</cf>). @@ -1112,6 +1217,11 @@ This argument can be omitted if there exists only a single instance. restarted otherwise. Changes in filters usually lead to restart of affected protocols. + The previous configuration is saved and the user can switch back to it + with <ref id="cli-configure-undo" name="configure undo"> command. The + old saved configuration is released (even if the reconfiguration attempt + fails due to e.g. a syntax error). + If <cf/soft/ option is used, changes in filters does not cause BIRD to restart affected protocols, therefore already accepted routes (according to old filters) would be still propagated, but new routes would be @@ -1204,8 +1314,8 @@ this: <code> filter not_too_far -int var; { + int var; if defined( rip_metric ) then var = rip_metric; else { @@ -1234,9 +1344,9 @@ local variables. Recursion is not allowed. Function definitions look like this: <code> function name () -int local_variable; { - local_variable = 5; + int local_variable; + int another_variable = 5; } function with_parameters (int parameter) @@ -1245,16 +1355,19 @@ function with_parameters (int parameter) } </code> -<p>Unlike in C, variables are declared after the <cf/function/ line, but before -the first <cf/{/. You can't declare variables in nested blocks. Functions are -called like in C: <cf>name(); with_parameters(5);</cf>. Function may return -values using the <cf>return <m/[expr]/</cf> command. Returning a value exits -from current function (this is similar to C). +<p>Like in C programming language, variables are declared inside function body, +either at the beginning, or mixed with other statements. Declarations may +contain initialization. You can also declare variables in nested blocks, such +variables have scope restricted to such block. There is a deprecated syntax to +declare variables after the <cf/function/ line, but before the first <cf/{/. +Functions are called like in C: <cf>name(); with_parameters(5);</cf>. Function +may return values using the <cf>return <m/[expr]/</cf> command. Returning a +value exits from current function (this is similar to C). -<p>Filters are defined in a way similar to functions except they can't have +<p>Filters are defined in a way similar to functions except they cannot have explicit parameters. They get a route table entry as an implicit parameter, it is also passed automatically to any functions called. The filter must terminate -with either <cf/accept/ or <cf/reject/ statement. If there's a runtime error in +with either <cf/accept/ or <cf/reject/ statement. If there is a runtime error in filter, the route is rejected. <p>A nice trick to debug filters is to use <cf>show route filter <m/name/</cf> @@ -1624,7 +1737,8 @@ prefix and an ASN as arguments. <sect>Control structures <label id="control-structures"> -<p>Filters support two control structures: conditions and case switches. +<p>Filters support several control structures: conditions, for loops and case +switches. <p>Syntax of a condition is: <cf>if <M>boolean expression</M> then <m/commandT/; else <m/commandF/;</cf> and you can use <cf>{ <m/command1/; <m/command2/; @@ -1632,6 +1746,14 @@ else <m/commandF/;</cf> and you can use <cf>{ <m/command1/; <m/command2/; omitted. If the <cf><m>boolean expression</m></cf> is true, <m/commandT/ is executed, otherwise <m/commandF/ is executed. +<p>For loops allow to iterate over elements in compound data like BGP paths or +community lists. The syntax is: <cf>for [ <m/type/ ] <m/variable/ in <m/expr/ +do <m/command/;</cf> and you can also use compound command like in conditions. +The expression is evaluated to a compound data, then for each element from such +data the command is executed with the item assigned to the variable. A variable +may be an existing one (when just name is used) or a locally defined (when type +and name is used). In both cases, it must have the same type as elements. + <p>The <cf>case</cf> is similar to case from Pascal. Syntax is <cf>case <m/expr/ { else: | <m/num_or_prefix [ .. num_or_prefix]/: <m/statement/ ; [ ... ] }</cf>. The expression after <cf>case</cf> can be of any type which can be @@ -1644,16 +1766,21 @@ neither of the <cf/:/ clauses, the statements after <cf/else:/ are executed. <p>Here is example that uses <cf/if/ and <cf/case/ structures: <code> +if 1234 = i then printn "."; else { + print "not 1234"; + print "You need {} around multiple commands"; +} + +for int asn in bgp_path do { + printn "ASN: ", asn; + if asn < 65536 then print " (2B)"; else print " (4B)"; +} + case arg1 { 2: print "two"; print "I can do more commands without {}"; 3 .. 5: print "three to five"; else: print "something else"; } - -if 1234 = i then printn "."; else { - print "not 1234"; - print "You need {} around multiple commands"; -} </code> @@ -1682,17 +1809,8 @@ Common route attributes are: primary key of the routing table. Read-only. (See the <ref id="routes" name="chapter about routes">.) - <tag><label id="rta-scope"><m/enum/ scope</tag> - The scope of the route. Possible values: <cf/SCOPE_HOST/ for routes - local to this host, <cf/SCOPE_LINK/ for those specific for a physical - link, <cf/SCOPE_SITE/ and <cf/SCOPE_ORGANIZATION/ for private routes and - <cf/SCOPE_UNIVERSE/ for globally visible routes. This attribute is not - interpreted by BIRD and can be used to mark routes in filters. The - default value for new routes is <cf/SCOPE_UNIVERSE/. - <tag><label id="rta-preference"><m/int/ preference</tag> - Preference of the route. Valid values are 0-65535. (See the chapter - about routing tables.) + Preference of the route. <tag><label id="rta-from"><m/ip/ from</tag> The router which the route has originated from. @@ -1794,7 +1912,7 @@ protocol sections. <label id="babel-intro"> <p>The Babel protocol -(<rfc id="6126">) is a loop-avoiding distance-vector routing protocol that is +(<rfc id="8966">) is a loop-avoiding distance-vector routing protocol that is robust and efficient both in ordinary wired networks and in wireless mesh networks. Babel is conceptually very simple in its operation and "just works" in its default configuration, though some configuration is possible and in some @@ -1838,6 +1956,7 @@ protocol babel [<name>] { check link <switch>; next hop ipv4 <address>; next hop ipv6 <address>; + extended next hop <switch>; authentication none|mac [permissive]; password "<text>"; password "<text>" { @@ -1942,6 +2061,11 @@ protocol babel [<name>] { source for Babel packets will be used. In normal operation, it should not be necessary to set this option. + <tag><label id="babel-extended-next-hop">extended next hop <m/switch/</tag> + If enabled, BIRD will accept and emit IPv4 routes with an IPv6 next + hop when IPv4 addresses are absent from the interface as described in + <rfc id="9229">. Default: yes. + <tag><label id="babel-authentication">authentication none|mac [permissive]</tag> Selects authentication method to be used. <cf/none/ means that packets are not authenticated at all, <cf/mac/ means MAC authentication is @@ -2112,6 +2236,13 @@ protocol bfd [<name>] { to configure separate BFD protocol instances for IPv4 and for IPv6 sessions. + <tag><label id="bfd-strict-bind">strict bind <m/switch/</tag> + Specify whether each BFD interface should use a separate listening + socket bound to its local address, or just use a shared listening socket + accepting all addresses. Binding to a specific address could be useful + in cases like running multiple BIRD instances on a machine, each + handling a different set of interfaces. Default: disabled. + <tag><label id="bfd-iface">interface <m/pattern/ [, <m/.../] { <m/options/ }</tag> Interface definitions allow to specify options for sessions associated with such interfaces and also may contain interface specific options. @@ -2296,6 +2427,10 @@ avoid routing loops. <item> <rfc id="8092"> - BGP Large Communities Attribute <item> <rfc id="8203"> - BGP Administrative Shutdown Communication <item> <rfc id="8212"> - Default EBGP Route Propagation Behavior without Policies +<item> <rfc id="8654"> - Extended Message Support for BGP +<item> <rfc id="9072"> - Extended Optional Parameters Length for BGP OPEN Message +<item> <rfc id="9117"> - Revised Validation Procedure for BGP Flow Specifications +<item> <rfc id="9234"> - Route Leak Prevention and Detection Using Roles </itemize> <sect1>Route selection rules @@ -2515,12 +2650,19 @@ using the following configuration parameters: keeps MED attribute). Default: disabled. <tag><label id="bgp-allow-local-pref">allow bgp_local_pref <m/switch/</tag> - A standard BGP implementation do not send the Local Preference attribute - to eBGP neighbors and ignore this attribute if received from eBGP + Standard BGP implementations do not send the Local Preference attribute + to EBGP neighbors and ignore this attribute if received from EBGP neighbors, as per <rfc id="4271">. When this option is enabled on an - eBGP session, this attribute will be sent to and accepted from the peer, + EBGP session, this attribute will be sent to and accepted from the peer, which is useful for example if you have a setup like in <rfc id="7938">. - The option does not affect iBGP sessions. Default: off. + The option does not affect IBGP sessions. Default: off. + + <tag><label id="bgp-allow-med">allow bgp_med <m/switch/</tag> + Standard BGP implementations do not propagate the MULTI_EXIT_DESC + attribute unless it is configured locally. When this option is enabled + on an EBGP session, this attribute will be sent to the peer regardless, + which is useful for example if you have a setup like in <rfc id="7938">. + The option does not affect IBGP sessions. Default: off. <tag><label id="bgp-allow-local-as">allow local as [<m/number/]</tag> BGP prevents routing loops by rejecting received routes with the local @@ -2658,9 +2800,16 @@ using the following configuration parameters: <tag><label id="bgp-hold-time">hold time <m/number/</tag> Time in seconds to wait for a Keepalive message from the other side - before considering the connection stale. Default: depends on agreement - with the neighboring router, we prefer 240 seconds if the other side is - willing to accept it. + before considering the connection stale. The effective value is + negotiated during session establishment and it is a minimum of this + configured value and the value proposed by the peer. The zero value has + a special meaning, signifying that no keepalives are used. Default: 240 + seconds. + + <tag><label id="bgp-min-hold-time">min hold time <m/number/</tag> + Minimum value of the hold time that is accepted during session negotiation. + If the peer proposes a lower value, the session is rejected with error. + Default: none. <tag><label id="bgp-startup-hold-time">startup hold time <m/number/</tag> Value of the hold timer used before the routers have a chance to exchange @@ -2668,8 +2817,15 @@ using the following configuration parameters: <tag><label id="bgp-keepalive-time">keepalive time <m/number/</tag> Delay in seconds between sending of two consecutive Keepalive messages. + The effective value depends on the negotiated hold time, as it is scaled + to maintain proportion between the keepalive time and the hold time. Default: One third of the hold time. + <tag><label id="bgp-min-keepalive-time">min keepalive time <m/number/</tag> + Minimum value of the keepalive time that is accepted during session + negotiation. If the proposed hold time would lead to a lower value of + the keepalive time, the session is rejected with error. Default: none. + <tag><label id="bgp-connect-delay-time">connect delay time <m/number/</tag> Delay in seconds between protocol startup and the first attempt to connect. Default: 5 seconds. @@ -2680,7 +2836,7 @@ using the following configuration parameters: <tag><label id="bgp-error-wait-time">error wait time <m/number/,<m/number/</tag> Minimum and maximum delay in seconds between a protocol failure (either - local or reported by the peer) and automatic restart. Doesn't apply + local or reported by the peer) and automatic restart. Doesn not apply when <cf/disable after error/ is configured. If consecutive errors happen, the delay is increased exponentially until it reaches the maximum. Default: 60, 300. @@ -2736,6 +2892,29 @@ using the following configuration parameters: protocol itself (for example, if a route is received through eBGP and therefore does not have such attribute). Default: 100 (0 in pre-1.2.0 versions of BIRD). + + <tag><label id="bgp-local-role">local role <m/role-name/</tag> + BGP roles are a mechanism for route leak prevention and automatic route + filtering based on common BGP topology relationships. They are defined + in <rfc id="9234">. Instead of manually configuring filters and + communities, automatic filtering is done with the help of the OTC + attribute - a flag for routes that should be sent only to customers. + The same attribute is also used to automatically detect and filter route + leaks created by third parties. + + This option is valid for EBGP sessions, but it is not recommended to be + used within AS confederations (which would require manual filtering of + <cf/bgp_otc/ attribute on confederation boundaries). + + Possible <cf><m/role-name/</cf> values are: <cf/provider/, + <cf/rs_server/, <cf/rs_client/, <cf/customer/ and <cf/peer/. + Default: No local role assigned. + + <tag><label id="bgp-require-roles">require roles <m/switch/</tag> + If this option is set, the BGP roles must be defined on both sides, + otherwise the session will not be established. This behavior is defined + in <rfc id="9234"> as "strict mode" and is used to enforce corresponding + configuration at your conterpart side. Default: disabled. </descrip> <sect1>Channel configuration @@ -2806,6 +2985,20 @@ be used in explicit configuration. BGP session (if acceptable), or the preferred address of an associated interface. + <tag><label id="bgp-next-hop-prefer">next hop prefer global</tag> + Prefer global IPv6 address to link-local IPv6 address for immediate next + hops of received routes. For IPv6 routes, the Next Hop attribute may + contain both a global IP address and a link-local IP address. For IBGP + sessions, the global IP address is resolved (<ref id="bgp-gateway" + name="gateway recursive">) through an IGP routing table + (<ref id="bgp-igp-table" name="igp table">) to get an immediate next + hop. If the resulting IGP route is a direct route (i.e., the next hop is + a direct neighbor), then the link-local IP address from the Next Hop + attribute is used as the immediate next hop. This option change it to + use the global IP address instead. Note that even with this option + enabled a route may end with a link-local immediate next hop when the + IGP route has one. Default: disabled. + <tag><label id="bgp-gateway">gateway direct|recursive</tag> For received routes, their <cf/gw/ (immediate next hop) attribute is computed from received <cf/bgp_next_hop/ attribute. This option @@ -2858,6 +3051,31 @@ be used in explicit configuration. explicitly (to conserve memory). This option requires that the connected routing table is <ref id="dsc-table-sorted" name="sorted">. Default: off. + <tag><label id="bgp-validate">validate <m/switch/</tag> + Apply flowspec validation procedure as described in <rfc id="8955"> + section 6 and <rfc id="9117">. The Validation procedure enforces that + only routers in the forwarding path for a network can originate flowspec + rules for that network. The validation procedure should be used for EBGP + to prevent injection of malicious flowspec rules from outside, but it + should also be used for IBGP to ensure that selected flowspec rules are + consistent with selected IP routes. The validation procedure uses an IP + routing table (<ref id="bgp-base-table" name="base table">, see below) + against which flowspec rules are validated. This option is limited to + flowspec channels. Default: off (for compatibility reasons). + + Note that currently the flowspec validation does not work reliably + together with <ref id="bgp-import-table" name="import table"> option + enabled on flowspec channels. + + <tag><label id="bgp-base-table">base table <m/name/</tag> + Specifies an IP table used for the flowspec validation procedure. The + table must have enabled <cf/trie/ option, otherwise the validation + procedure would not work. The type of the table must be <cf/ipv4/ for + <cf/flow4/ channels and <cf/ipv6/ for <cf/flow6/ channels. This option + is limited to flowspec channels. Default: the main table of the + <cf/ipv4/ / <cf/ipv6/ channel of the same BGP instance, or the + <cf/master4/ / <cf/master6/ table if there is no such channel. + <tag><label id="bgp-extended-next-hop">extended next hop <m/switch/</tag> BGP expects that announced next hops have the same address family as associated network prefixes. This option provides an extension to use @@ -3018,6 +3236,11 @@ some of them (marked with `<tt/O/') are optional. This attribute contains accumulated IGP metric, which is a total distance to the destination through multiple autonomous systems. Currently, the attribute is not accessible from filters. + + <tag><label id="bgp-otc">int bgp_otc [O]</tag> + This attribute is defined in <rfc id="9234">. OTC is a flag that marks + routes that should be sent only to customers. If <ref id="bgp-role" + name="local Role"> is configured it set automatically. </descrip> <sect1>Example @@ -5185,7 +5408,7 @@ Note that for negated matches, value must be either zero or equal to bitmask <cf>port 1..1023,1194,3306</cf>). <tag><label id="flow-dport">dport <m/numbers-match/</tag> - Set a mating destination port numbers (e.g. <cf>dport 49151</cf>). + Set a matching destination port numbers (e.g. <cf>dport 49151</cf>). <tag><label id="flow-sport">sport <m/numbers-match/</tag> Set a matching source port numbers (e.g. <cf>sport = 0</cf>). diff --git a/doc/prog-head.sgml b/doc/prog-head.sgml index 43ee4ef6..4daca3a3 100644 --- a/doc/prog-head.sgml +++ b/doc/prog-head.sgml @@ -12,9 +12,9 @@ <title>BIRD Programmer's Documentation <author> Ondrej Filip <it/<feela@network.cz>/, -Maria Matejka <it/<mq@jmq.cz>/, Martin Mares <it/<mj@ucw.cz>/, Ondrej Zajicek <it/<santiago@crfreenet.org>/ +Maria Matejka <it/<mq@jmq.cz>/, </author> <abstract> diff --git a/doc/reply_codes b/doc/reply_codes index 02f4e656..bcbbbb03 100644 --- a/doc/reply_codes +++ b/doc/reply_codes @@ -61,6 +61,7 @@ Reply codes of BIRD command-line interface 1023 Show Babel interfaces 1024 Show Babel neighbors 1025 Show Babel entries +1026 Show threads 8000 Reply too long 8001 Route not found diff --git a/doc/roadmap.md b/doc/roadmap.md new file mode 100644 index 00000000..f6b9d475 --- /dev/null +++ b/doc/roadmap.md @@ -0,0 +1,307 @@ +# Project roadmap + +## Planned for 2023 + +### SNMP AgentX plugin for BIRD status export +Allow for easier status monitoring. + +### BGP Monitoring Protocol (BMP) +BGP Monitoring Protocol (RFC 7854) is a protocol between a BGP speaker and +a monitoring node, which is notified about route updates and neighbor state +changes of the BGP speaker. + +### Better coverage of automatic tests +Functionality tests should cover more possible configurations and +combinations. Integration tests should run automatically between different OS +versions and HW architectures. Experimental support for performance regression tests. + +### Release 3.0-alpha1 +Missing: MRT, merging + +### Show BFD sessions details +CLI command showing detailed information about BFD sessions state + +### Review and merge Babel extended next hop patches (RFC 9229) +Babel extension to allow IPv4 routes with IPv6 next hop. Patch on mailing list. + +### Consolidate protocol statistics +Consolidate protocol statistics, make them useful for SNMP plugin and implement +'show XX stats' command. + +### TCP-AO if it appears in Linux and BSD upstream +Resolve whether we should or shouldn't control the kernel key management. +Design and implement our side for both Linux and BSD. + +### Conditional routes (v3) +Filters should be extended to allow conditional expressions based on a number of +matching routes in a routing table. This would allow to specify aggregate routes +using a static protocol and conditions like 'if there is at least 1000 routes +from this BGP protocol, accept this default route'. This feature comes handy +when a router needs to detect whether its BGP upstream is alive and working. +Based of number of routes received, the router can then announce or retract a +default route to OSPF, making multi-exit network routing simpler and more +effective. + +### Aggregating routes +Requested by customer: aggregating multiple routes by a common set of attributes. + +Implementation choice: the user specifies + + EXPORT filter before aggregation AGGREGATE ON list of expressions to compare MERGE what to do with the remaining attributes + +Example usage: + +* aggregating information from multiple internal BGP routes into one external +* creating a multipath route from multiple BGP routes (currently done by MERGE PATHS) +* (in future) computing a minimal route set for kernel to make forwarding faster instead of writing the received full BGP set there + +### PREF64 option in RA (RFC 8781) +Inform hosts about prefix used to synthesize NAT64 addresses. Requested in list: +http://trubka.network.cz/pipermail/bird-users/2022-November/016401.html + +### Logging via UDP +Got a patch, probably never merged. May be useful. +http://trubka.network.cz/pipermail/bird-users/2022-January/015893.html + +### BGP Tunnel Encapsulation Attribute (RFC 9012) +Packets sent to BGP next hop may be encapsulated using various tunnel +technologies. Useful for L3VPN. + +### BGP AS Cones and ASPA support +Extend the RPKI protocol with AS Cones and ASPA loading. Implement AS Cones +and ASPA validation routines. There may be some pending patches from QRator. + +### DHCPv6 relay agent +DHCPv6 relay agents (RFC 8415, RFC 8987) forward DHCPv6 messages between clients and +servers. They also ensure that prefixes delegated by DHCPv6-PD are routable, +i.e. they should generate routes for these prefixes. + +### Nexthop attributes and ECMP filtering +Currently we have route attributes, but with ECMP routes it is necessary to +store per-nexthop data (like weight or encapsulation). We also do not have +proper way to manipulate with multiple nexthops from filters. Attributes should +be extended to allow per-nexthop ones and filters should be extended to allow +access multiple nexthops and their attributes. + +### Performance accounting +Extended internal statistics about time spent in different modules of BIRD. If +the route server admin checks why it takes 15 minutes to converge, this should +give some basic info about performance. [MM: Internally needed by 3.0, already in progress] + +### MPLS support +Finalize and merge improved MPLS infrastructure (including MPLS label allocator +and supporting code), improve its reconfiguration support and support for +segment routing. + +### BGP Segment Routing Extension (RFC 8669) +Receive and announce Segment Identifiers (SIDs) for BGP next hops. + +## Backlog for following years + +*The order of these items is not significant.* + +### Flowspec attribute filtering +Flowspec routes have many parameters, but these are not accessible from filters. +Filters should be extended to access all these attributes, but first it is +necessary to cleanup attribute handling in filters. + +### BGP Optimal Route Reflection (RFC 9107) +Implement BGP best route selection on route reflectors to adhere to POV of +client, not RR. Also requested by somebody, don't remember who and when. + +### OSPF Traffic engineering extensions (RFC 3630) +Requested in list. May include lots of other RFC's as we have neglected this +feature for a long time. +http://trubka.network.cz/pipermail/bird-users/2022-January/015911.html + +### IPv6 preference in documentation (?) +Address world's reluctance of legacy IPv4 deprecation by updating the +documentation in such a way that IPv6 is preferred and first seen. + +### BGP local prefix leak prevention (?) +Reject local prefixes on eBGP sessions by default to prevent leaks to public Internet. +Unless explicitly enabled by config, of course. + +### Re-bogonization of 240/4 legacy range (?) +We shouldn't believe that every operator does the +filtering right and they could simply rely on pre-2.0.10 behavior which +filtered this out by default. + +### IPv4 multicast +Basic infrastructure for IPv4 multicast routing, including nettypes for +multicast routes and multicast requests, multicast kernel protocol and IGMPv2 +protocol. + +### PIM-BIDIR +Bidirectional PIM (RFC 5015) is a multicast routing protocol, variant of PIM-SM. +It uses bidirectional shared trees rooted in Rendezvous Point (RP) to connect +sources and receivers. + +There is an old branch containing this. We should have merged this years ago. + +### Improved VRF support +BIRD has working VRF support, but it needs improvements. VRF entities should be +first-class objects with explicit configuration, with a set of properties and +default values (like default routing tables, or router ID) for associated +protocols. Default kernel table ID should be autodetected. There should be +better handling of VRF route leaking - when a route is propagated between VRFs, +its nexthop should reflects that. Setup of VRFs in OS is out of scope. + +### Linux kernel nexthop abstraction +Netlink allows setting nexthops as objects and using them in routes. It should +be much faster than conventional route update. + +### Protocol attributes for filtering +Filters can access route attributes, but sometimes it could be useful to access +attributes of associated protocol (like neighbor-as or neighbor-ip for BGP +protocol). But it would require to have internal object model (below) first, +as we do not want to implement it independently for each protocol attribute. + +### Mutable static routes +Extension to the static protocol that would allow to add/remove/change static +routes from CLI. + +### Multipipe +Pipe-like protocol: When a route is exported to this protocol, it runs its +filter extended with capability to announce any number of new routes to any +table from one filter run. Its primary purpose is to allow user-specified +route aggregation and other non-linear operations. + +### BGP minimum route advertisement interval (MRAI) +BGP specifies minimum interval between route advertisements for the same +network. This is not implemented in BIRD. It should be implemented for 3.0 to +avoid unnecessary re-routing spikes. + +### OSPF unnumbered interfaces +The OSPFv2 protocol allows interfaces that do not have proper IP range but have +peer IP addresses (like PtP links). It should be extended to also allow true +unnumbered interfaces with no addresses (by using an IP address from some +loopback device). This would require to have stricter separation between IP +addresses and interfaces in OSPFv2. + +### OSPF Segment Routing Extension (RFC 8665) +MPLS label distribution using segment routing and simple OSPF extension. + +### MPLS Label Distribution Protocol (LDP) +Label Distribution Protocol (RFC 5036) is a protocol for establishing +label-switched paths and distributing of MPLS labels between MPLS routers. +These paths and labels are based on existing unlabeled routing information. + +### IPv6 multicast +Basic infrastructure for IPv6 multicast routing, including nettypes for +multicast routes and multicast requests, multicast kernel protocol and MLDv1 +protocol. Most of these (with the exception of MLDv1) is just a variant of +IPv4 multicast. + +### IGMP/MLD multicast proxy +A simple IGMP/MLD multicast proxy, which sends IGMP/MLD requests on a configured +uplink interface based on received requests on downlink interfaces, and updates +associated multicast routes. + +### Source-specific multicast (SSM) +Infrastructure for multicasts should be extended to handle source-specific +multicasts. Extend multicast nettypes to include source addresses, handle them +in multicast kernel protocols and implement IGMPv3/MLDv2 protocols. + +### PIM-SSM +PIM-SSM is a source-specific multicast routing protocol, a subset of PIM-SM +protocol (RFC 7761). It is restricted to source-specific multicasts, which +eliminates many problematic parts of PIM-SM. + +### Seamless BFD +New version of BFD negotiation defined in RFC 7880-7886 enables faster +continuity tests by dissemination discriminators by the governing protocols. + +### OSPF Graceful Link Shutdown +To enable seamless maintenance of single links, OSPF can advertise such a link +getting down in advance, allowing to re-route. Defined in RFC 8379. + +## Long-term + +### Internal object model +We need to define explicit internal object model, where existing objects +(protocols, channels, tables, routes, interfaces ...) and their properties are +described in a way that allows introspection sufficient for implementing +features (control protocol, CLI, filter access, perhaps reconfiguration) in a +generic manner. + +### Generic configuration model +Configuration options are implicitly defined by the configuration parsing code. +We need to define explicit configuration model independent of the parsing code +and generic parsing code using that model. This will allow uniform validation of +configuration properties, generic access to configuration from control protocol +and possibly independent configuration backends (like one for Netconf). + +### New control protocol +BIRD should have a well-documented machine readable protocol. Requirements for +such protocol are: + +* Generic machine readable abstract-tree representation (like CBOR) +* Both request/reply and subscribe/notify access patterns +* Access objects and properties using internal object model +* In-band introspection based on internal object model + +From Maria's notes: + +* CBOR-based protocol for both control and route exports +* Python3 library with example implementation of CLI +* (maybe) Ansible modules +* RFC 9164: CBOR tags for IP addresses and prefices +* RFC 9254: YANG-CBOR mapping +* RFC 9277: Stable storage of CBOR (files) + +## Perhaps + +### IS-IS +IS-IS routing protocol is a nice-to-have alternative to OSPF. + +### BGPsec +BGPsec (RFC 8205) is a new path security extension to BGP. + +### PIM-SM +PIM-SM (RFC 7761) is a prevailing multicast routing protocol, but more +complicated than planned PIM-BIDIR and PIM-SSM. + +### Netconf +Network Configuration Protocol (RFC 6241) is a XML/JSON protocol for +configuration management of network devices. It would benefit from generic +configuration model (above). + +### NetConf overlay +Machine-friendly config file editor daemon (standalone) with standard NetConf +interface on one side and BIRD config file + reconfiguration requests on the +other side. Python3 seems to be better choice than C for this kind of work. + +### Backend for 802.11r +Let's assume a bunch of boxes, all having some public wifi APs and some (secure) uplinks. +Design and implement an automatic backbone protocol to allow for simple almost-zeroconf +setup of e.g. a conference room or train / bus public wifi or even a local home network, +all with hostapd seamlessly transferring clients between APs via 802.11r. +Possible collab with Turris. + +### BFD Multipoint Connectivity +Checking whether multiple "receivers" can communicate with a single "sender". +Possibly useful after merging PIM-BIDIR and implementing other PIMs. RFC 8562-8563. + +### BGP Link State extension +BGP-LS allows to transport information about network topology across BGP links. +This should help e.g. to run traffic-engineering between more confederated ASs. +Also needed to implement Seamless BFD on BGP: RFC 9247 + +### Locator/ID Separation Protocol +LISP intends to break up addressing to Routing Locators and Endpoint +Identifiers. This may help multihoming networks in future. RFC 9299-9306. + +### Backend for IPv6 Multihoming without BGP +Implement and configure BIRD in such a way that local nodes are seamlessly +connected to the Internet via multiple upstreams, using Network Prefix +Translation and other techniques. Possible collab with Turris. + +## Minor + +* RFC 8510: OSPF LLS Extension for Local Interface ID Advertisement +* RFC 8538: BGP Graceful Restart Hard Reset +* RFC 8326: BGP Graceful Session Shutdown Community auto-apply +* RFC 8962: Become part of the IETF Protocol Police +* RFC 9072: Extended Optional Parameters Length for BGP OPEN Message +* RFC 9339: OSPF Reverse Metric diff --git a/filter/config.Y b/filter/config.Y index 15b77761..e6264a83 100644 --- a/filter/config.Y +++ b/filter/config.Y @@ -22,6 +22,46 @@ static inline u32 pair_b(u32 p) { return p & 0xFFFF; } #define f_generate_complex(fi_code, da, arg) \ f_new_inst(FI_EA_SET, f_new_inst(fi_code, f_new_inst(FI_EA_GET, da), arg), da) +#define f_generate_complex_sym(fi_code, sym, arg) ({ \ + if (sym->class != SYM_ATTRIBUTE) \ + cf_error("Can't empty %s: not an attribute", sym->name); \ + f_generate_complex(fi_code, sym->attribute, arg); \ +}) + +#define f_generate_complex_default(fi_code, da, arg, def) \ + f_new_inst(FI_EA_SET, f_new_inst(fi_code, f_new_inst(FI_DEFAULT, f_new_inst(FI_EA_GET, da), f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = def })), arg), da) + + +static int +f_new_var(struct sym_scope *s) +{ + /* + * - A variable is an offset on vstack from vbase. + * - Vbase is set on filter start / function call. + * - Scopes contain (non-frame) block scopes inside filter/function scope + * - Each scope knows number of vars in that scope + * - Offset is therefore a sum of 'slots' up to filter/function scope + * - New variables are added on top of vstk, so intermediate values cannot + * be there during FI_VAR_INIT. I.e. no 'var' inside 'term'. + * - Also, each f_line must always have its scope, otherwise a variable may + * be defined but not initialized if relevant f_line is not executed. + */ + + int offset = s->slots++; + + while (s->block) + { + s = s->next; + ASSERT(s); + offset += s->slots; + } + + if (offset >= 0xff) + cf_error("Too many variables, at most 255 allowed"); + + return offset; +} + /* * Sets and their items are during parsing handled as lists, linked * through left ptr. The first item in a list also contains a pointer @@ -161,28 +201,32 @@ f_new_lc_item(u32 f1, u32 t1, u32 f2, u32 t2, u32 f3, u32 t3) } static inline struct f_inst * -f_generate_empty(struct f_dynamic_attr dyn) +f_generate_empty(const struct symbol *sym) { - struct f_val empty; + if (sym->class != SYM_ATTRIBUTE) + cf_error("Can't empty %s: not an attribute", sym->name); - switch (dyn.type & EAF_TYPE_MASK) { - case EAF_TYPE_AS_PATH: - empty = f_const_empty_path; - break; - case EAF_TYPE_INT_SET: - empty = f_const_empty_clist; - break; - case EAF_TYPE_EC_SET: - empty = f_const_empty_eclist; - break; - case EAF_TYPE_LC_SET: - empty = f_const_empty_lclist; - break; - default: - cf_error("Can't empty that attribute"); - } + const struct ea_class *def = sym->attribute; + const struct f_val *empty = f_get_empty(def->type); + if (!empty) + cf_error("Can't empty attribute %s", def->name); + + return f_new_inst(FI_EA_SET, f_new_inst(FI_CONSTANT, *empty), def); +} - return f_new_inst(FI_EA_SET, f_new_inst(FI_CONSTANT, empty), dyn); +static inline struct f_inst * +f_implicit_roa_check(struct rtable_config *tab) +{ + const struct ea_class *def = ea_class_find("bgp_path"); + if (!def) + cf_error("Fatal: Couldn't find BGP path attribute definition."); + + struct f_static_attr fsa = f_new_static_attr(T_NET, SA_NET, 1); + + return f_new_inst(FI_ROA_CHECK, + f_new_inst(FI_RTA_GET, fsa), + f_new_inst(FI_AS_PATH_LAST, f_new_inst(FI_EA_GET, def)), + tab); } /* @@ -262,7 +306,7 @@ assert_assign(struct f_lval *lval, struct f_inst *expr, const char *start, const checker = f_new_inst(FI_EQ, expr, getter); setter->next = checker; - + return assert_done(setter, start, end); } @@ -273,16 +317,16 @@ CF_KEYWORDS(FUNCTION, PRINT, PRINTN, UNSET, RETURN, INT, BOOL, IP, TYPE, PREFIX, RD, PAIR, QUAD, EC, LC, SET, STRING, BGPMASK, BGPPATH, CLIST, ECLIST, LCLIST, IF, THEN, ELSE, CASE, + FOR, IN, DO, TRUE, FALSE, RT, RO, UNKNOWN, GENERIC, - FROM, GW, NET, MASK, PROTO, SOURCE, SCOPE, DEST, IFNAME, IFINDEX, WEIGHT, GW_MPLS, - PREFERENCE, + FROM, GW, NET, MASK, PROTO, SCOPE, DEST, IFNAME, IFINDEX, WEIGHT, GW_MPLS, ROA_CHECK, ASN, SRC, DST, IS_V4, IS_V6, LEN, MAXLEN, DATA, DATA1, DATA2, DEFINED, - ADD, DELETE, CONTAINS, RESET, - PREPEND, FIRST, LAST, LAST_NONAGGREGATED, MATCH, + ADD, DELETE, RESET, + PREPEND, FIRST, LAST, LAST_NONAGGREGATED, MIN, MAX, EMPTY, FILTER, WHERE, EVAL, ATTRIBUTE, @@ -292,21 +336,22 @@ CF_KEYWORDS(FUNCTION, PRINT, PRINTN, UNSET, RETURN, %nonassoc ELSE %type <xp> cmds_int cmd_prep -%type <x> term block cmd cmds constant constructor print_list var_list function_call symbol_value bgp_path_expr bgp_path bgp_path_tail -%type <fda> dynamic_attr +%type <x> term cmd cmd_var cmds cmds_scoped constant constructor print_list var var_init var_list function_call symbol_value bgp_path_expr bgp_path bgp_path_tail %type <fsa> static_attr %type <f> filter where_filter %type <fl> filter_body function_body %type <flv> lvalue -%type <i> type function_args function_vars +%type <i> type function_vars +%type <fa> function_argsn function_args %type <ecs> ec_kind -%type <fret> break_command +%type <fret> break_command %type <i32> cnum %type <e> pair_item ec_item lc_item set_item switch_item set_items switch_items switch_body %type <trie> fprefix_set %type <v> set_atom switch_atom fipa %type <px> fprefix %type <t> get_cf_position +%type <s> for_var CF_GRAMMAR @@ -330,17 +375,24 @@ filter_def: conf: filter_eval ; filter_eval: - EVAL term { f_eval_int(f_linearize($2)); } + EVAL term { f_eval_int(f_linearize($2, 1)); } ; conf: custom_attr ; custom_attr: ATTRIBUTE type symbol ';' { - cf_define_symbol($3, SYM_ATTRIBUTE, attribute, ca_lookup(new_config->pool, $3->name, $2)->fda); + if (($3->class == SYM_ATTRIBUTE) && ($3->scope == new_config->root_scope)) + cf_error("Duplicate attribute %s definition", $3->name); + + cf_define_symbol($3, SYM_ATTRIBUTE, attribute, + ea_register_alloc(new_config->pool, (struct ea_class) { + .name = $3->name, + .type = $2, + })->class); }; conf: bt_test_suite ; bt_test_suite: - BT_TEST_SUITE '(' CF_SYM_KNOWN ',' text ')' { + BT_TEST_SUITE '(' symbol_known ',' text ')' { cf_assert_symbol($3, SYM_FUNCTION); struct f_bt_test_suite *t = cfg_allocz(sizeof(struct f_bt_test_suite)); t->fn = $3->function; @@ -353,7 +405,7 @@ bt_test_suite: conf: bt_test_same ; bt_test_same: - BT_TEST_SAME '(' CF_SYM_KNOWN ',' CF_SYM_KNOWN ',' NUM ')' { + BT_TEST_SAME '(' symbol_known ',' symbol_known ',' NUM ')' { cf_assert_symbol($3, SYM_FUNCTION); cf_assert_symbol($5, SYM_FUNCTION); struct f_bt_test_suite *t = cfg_allocz(sizeof(struct f_bt_test_suite)); @@ -405,25 +457,28 @@ type: ; function_argsn: - /* EMPTY */ + /* EMPTY */ { $$ = NULL; } | function_argsn type symbol ';' { if ($3->scope->slots >= 0xfe) cf_error("Too many declarations, at most 255 allowed"); - cf_define_symbol($3, SYM_VARIABLE | $2, offset, $3->scope->slots++); + $$ = cfg_alloc(sizeof(struct f_arg)); + $$->arg = cf_define_symbol($3, SYM_VARIABLE | $2, offset, sym_->scope->slots++); + $$->next = $1; } ; function_args: - '(' ')' { $$ = 0; } + '(' ')' { $$ = NULL; } | '(' function_argsn type symbol ')' { - cf_define_symbol($4, SYM_VARIABLE | $3, offset, $4->scope->slots++); - $$ = $4->scope->slots; + $$ = cfg_alloc(sizeof(struct f_arg)); + $$->arg = cf_define_symbol($4, SYM_VARIABLE | $3, offset, sym_->scope->slots++); + $$->next = $2; } ; function_vars: /* EMPTY */ { $$ = 0; } | function_vars type symbol ';' { - cf_define_symbol($3, SYM_VARIABLE | $2, offset, $3->scope->slots++); + cf_define_symbol($3, SYM_VARIABLE | $2, offset, f_new_var(sym_->scope)); $$ = $1 + 1; } ; @@ -431,14 +486,16 @@ function_vars: filter_body: function_body ; filter: - CF_SYM_KNOWN { + symbol_known { cf_assert_symbol($1, SYM_FILTER); $$ = $1->filter; } - | filter_body { + | { cf_push_scope(NULL); } filter_body { struct filter *f = cfg_alloc(sizeof(struct filter)); - *f = (struct filter) { .root = $1 }; + *f = (struct filter) { .root = $2 }; $$ = f; + + cf_pop_scope(); } ; @@ -451,20 +508,35 @@ where_filter: function_body: function_vars '{' cmds '}' { - $$ = f_linearize($3); + $$ = f_linearize($3, 0); $$->vars = $1; } ; conf: function_def ; function_def: - FUNCTION symbol { DBG( "Beginning of function %s\n", $2->name ); + FUNCTION symbol { + DBG( "Beginning of function %s\n", $2->name ); $2 = cf_define_symbol($2, SYM_FUNCTION, function, NULL); cf_push_scope($2); - } function_args function_body { - DBG("Definition of function %s with %u args and %u local vars.\n", $2->name, $4, $5->vars); - $5->args = $4; - $2->function = $5; + } function_args { + /* Make dummy f_line for storing function prototype */ + struct f_line *dummy = cfg_allocz(sizeof(struct f_line)); + $2->function = dummy; + + /* Revert the args */ + while ($4) { + struct f_arg *tmp = $4; + $4 = $4->next; + + tmp->next = dummy->arg_list; + dummy->arg_list = tmp; + dummy->args++; + } + } function_body { + $6->args = $2->function->args; + $6->arg_list = $2->function->arg_list; + $2->function = $6; cf_pop_scope(); } ; @@ -475,7 +547,11 @@ cmds: /* EMPTY */ { $$ = NULL; } | cmds_int { $$ = $1.begin; } ; -cmd_prep: cmd { +cmds_scoped: { cf_push_soft_scope(); } cmds { cf_pop_soft_scope(); $$ = $2; } ; + +cmd_var: var | cmd ; + +cmd_prep: cmd_var { $$.begin = $$.end = $1; if ($1) while ($$.end->next) @@ -497,15 +573,6 @@ cmds_int: cmd_prep } ; -block: - cmd { - $$=$1; - } - | '{' cmds '}' { - $$=$2; - } - ; - /* * Complex types, their bison value is struct f_val */ @@ -529,10 +596,10 @@ set_atom: | VPN_RD { $$.type = T_RD; $$.val.ec = $1; } | ENUM { $$.type = pair_a($1); $$.val.i = pair_b($1); } | '(' term ')' { - if (f_eval(f_linearize($2), cfg_mem, &($$)) > F_RETURN) cf_error("Runtime error"); + if (f_eval(f_linearize($2, 1), &($$)) > F_RETURN) cf_error("Runtime error"); if (!f_valid_set_type($$.type)) cf_error("Set-incompatible type"); } - | CF_SYM_KNOWN { + | symbol_known { cf_assert_symbol($1, SYM_CONSTANT); if (!f_valid_set_type(SYM_TYPE($1))) cf_error("%s: set-incompatible type", $1->name); $$ = *$1->val; @@ -541,13 +608,13 @@ set_atom: switch_atom: NUM { $$.type = T_INT; $$.val.i = $1; } - | '(' term ')' { $$.type = T_INT; $$.val.i = f_eval_int(f_linearize($2)); } + | '(' term ')' { $$.type = T_INT; $$.val.i = f_eval_int(f_linearize($2, 1)); } | fipa { $$ = $1; } | ENUM { $$.type = pair_a($1); $$.val.i = pair_b($1); } ; cnum: - term { $$ = f_eval_int(f_linearize($1)); } + term { $$ = f_eval_int(f_linearize($1, 1)); } pair_item: '(' cnum ',' cnum ')' { $$ = f_new_pair_item($2, $2, $4, $4); } @@ -631,19 +698,18 @@ fprefix_set: ; switch_body: /* EMPTY */ { $$ = NULL; } - | switch_body switch_items ':' cmds { + | switch_body switch_items ':' cmds_scoped { /* Fill data fields */ struct f_tree *t; - struct f_line *line = f_linearize($4); for (t = $2; t; t = t->left) - t->data = line; + t->data = $4; $$ = f_merge_items($1, $2); } - | switch_body ELSECOL cmds { + | switch_body ELSECOL cmds_scoped { struct f_tree *t = f_new_tree(); t->from.type = t->to.type = T_VOID; t->right = t; - t->data = f_linearize($3); + t->data = $3; $$ = f_merge_items($1, t); } ; @@ -660,6 +726,7 @@ bgp_path: bgp_path_tail: NUM bgp_path_tail { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_PATH_MASK_ITEM, .val.pmi = { .asn = $1, .kind = PM_ASN, }, }); $$->next = $2; } | NUM DDOT NUM bgp_path_tail { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_PATH_MASK_ITEM, .val.pmi = { .from = $1, .to = $3, .kind = PM_ASN_RANGE }, }); $$->next = $4; } + | '[' ']' bgp_path_tail { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_PATH_MASK_ITEM, .val.pmi = { .set = NULL, .kind = PM_ASN_SET }, }); $$->next = $3; } | '[' set_items ']' bgp_path_tail { if ($2->from.type != T_INT) cf_error("Only integer sets allowed in path mask"); $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_PATH_MASK_ITEM, .val.pmi = { .set = build_tree($2), .kind = PM_ASN_SET }, }); $$->next = $4; @@ -679,6 +746,7 @@ constant: | fipa { $$ = f_new_inst(FI_CONSTANT, $1); } | VPN_RD { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_RD, .val.ec = $1, }); } | net_ { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_NET, .val.net = $1, }); } + | '[' ']' { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_SET, .val.t = NULL, }); } | '[' set_items ']' { DBG( "We've got a set here..." ); $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_SET, .val.t = build_tree($2), }); @@ -702,31 +770,26 @@ var_list: /* EMPTY */ { $$ = NULL; } | var_list ',' term { $$ = $3; $$->next = $1; } function_call: - CF_SYM_KNOWN '(' var_list ')' { + symbol_known '(' var_list ')' + { if ($1->class != SYM_FUNCTION) cf_error("You can't call something which is not a function. Really."); - struct f_inst *fc = f_new_inst(FI_CALL, $1); - uint args = 0; + /* Revert the var_list */ + struct f_inst *args = NULL; while ($3) { - args++; - struct f_inst *tmp = $3->next; - $3->next = fc; + struct f_inst *tmp = $3; + $3 = $3->next; - fc = $3; - $3 = tmp; + tmp->next = args; + args = tmp; } - if (args != $1->function->args) - cf_error("Function call '%s' got %u arguments, need %u arguments.", - $1->name, args, $1->function->args); - - $$ = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_VOID }); - $$->next = fc; + $$ = f_new_inst(FI_CALL, args, $1); } ; -symbol_value: CF_SYM_KNOWN +symbol_value: symbol_known { switch ($1->class) { case SYM_CONSTANT_RANGE: @@ -736,7 +799,7 @@ symbol_value: CF_SYM_KNOWN $$ = f_new_inst(FI_VAR_GET, $1); break; case SYM_ATTRIBUTE: - $$ = f_new_inst(FI_EA_GET, *$1->attribute); + $$ = f_new_inst(FI_EA_GET, $1->attribute); break; default: cf_error("Can't get value of symbol %s", $1->name); @@ -745,17 +808,13 @@ symbol_value: CF_SYM_KNOWN ; static_attr: - FROM { $$ = f_new_static_attr(T_IP, SA_FROM, 0); } - | GW { $$ = f_new_static_attr(T_IP, SA_GW, 0); } + GW { $$ = f_new_static_attr(T_IP, SA_GW, 0); } | NET { $$ = f_new_static_attr(T_NET, SA_NET, 1); } | PROTO { $$ = f_new_static_attr(T_STRING, SA_PROTO, 1); } - | SOURCE { $$ = f_new_static_attr(T_ENUM_RTS, SA_SOURCE, 1); } - | SCOPE { $$ = f_new_static_attr(T_ENUM_SCOPE, SA_SCOPE, 0); } | DEST { $$ = f_new_static_attr(T_ENUM_RTD, SA_DEST, 0); } | IFNAME { $$ = f_new_static_attr(T_STRING, SA_IFNAME, 0); } | IFINDEX { $$ = f_new_static_attr(T_INT, SA_IFINDEX, 1); } | WEIGHT { $$ = f_new_static_attr(T_INT, SA_WEIGHT, 0); } - | PREFERENCE { $$ = f_new_static_attr(T_INT, SA_PREF, 0); } | GW_MPLS { $$ = f_new_static_attr(T_INT, SA_GW_MPLS, 0); } ; @@ -765,6 +824,8 @@ term: | term '-' term { $$ = f_new_inst(FI_SUBTRACT, $1, $3); } | term '*' term { $$ = f_new_inst(FI_MULTIPLY, $1, $3); } | term '/' term { $$ = f_new_inst(FI_DIVIDE, $1, $3); } + | term '&' term { $$ = f_new_inst(FI_BITAND, $1, $3); } + | term '|' term { $$ = f_new_inst(FI_BITOR, $1, $3); } | term AND term { $$ = f_new_inst(FI_AND, $1, $3); } | term OR term { $$ = f_new_inst(FI_OR, $1, $3); } | term '=' term { $$ = f_new_inst(FI_EQ, $1, $3); } @@ -784,8 +845,6 @@ term: | static_attr { $$ = f_new_inst(FI_RTA_GET, $1); } - | dynamic_attr { $$ = f_new_inst(FI_EA_GET, $1); } - | term '.' IS_V4 { $$ = f_new_inst(FI_IS_V4, $1); } | term '.' TYPE { $$ = f_new_inst(FI_TYPE, $1); } | term '.' IP { $$ = f_new_inst(FI_IP, $1); } @@ -822,13 +881,11 @@ term: | DELETE '(' term ',' term ')' { $$ = f_new_inst(FI_CLIST_DEL, $3, $5); } | FILTER '(' term ',' term ')' { $$ = f_new_inst(FI_CLIST_FILTER, $3, $5); } - | ROA_CHECK '(' rtable ')' { $$ = f_new_inst(FI_ROA_CHECK_IMPLICIT, $3); } - | ROA_CHECK '(' rtable ',' term ',' term ')' { $$ = f_new_inst(FI_ROA_CHECK_EXPLICIT, $5, $7, $3); } + | ROA_CHECK '(' rtable ')' { $$ = f_implicit_roa_check($3); } + | ROA_CHECK '(' rtable ',' term ',' term ')' { $$ = f_new_inst(FI_ROA_CHECK, $5, $7, $3); } | FORMAT '(' term ')' { $$ = f_new_inst(FI_FORMAT, $3); } -/* | term '.' LEN { $$->code = P('P','l'); } */ - | function_call ; @@ -848,20 +905,53 @@ print_list: /* EMPTY */ { $$ = NULL; } } ; +var_init: + /* empty */ { $$ = f_new_inst(FI_CONSTANT, (struct f_val) { }); } + | '=' term { $$ = $2; } + ; + +var: + type symbol var_init ';' { + struct symbol *sym = cf_define_symbol($2, SYM_VARIABLE | $1, offset, f_new_var(sym_->scope)); + $$ = f_new_inst(FI_VAR_INIT, $3, sym); + } + +for_var: + type symbol { $$ = cf_define_symbol($2, SYM_VARIABLE | $1, offset, f_new_var(sym_->scope)); } + | CF_SYM_KNOWN { $$ = $1; cf_assert_symbol($1, SYM_VARIABLE); } + ; + cmd: - IF term THEN block { + '{' cmds_scoped '}' { + $$ = $2; + } + | IF term THEN cmd { $$ = f_new_inst(FI_CONDITION, $2, $4, NULL); } - | IF term THEN block ELSE block { + | IF term THEN cmd ELSE cmd { $$ = f_new_inst(FI_CONDITION, $2, $4, $6); } - | CF_SYM_KNOWN '=' term ';' { + | FOR { + /* Reserve space for walk data on stack */ + cf_push_block_scope(); + conf_this_scope->slots += 2; + } for_var IN + /* Parse term in the parent scope */ + { conf_this_scope->active = 0; } term { conf_this_scope->active = 1; } + DO cmd { + cf_pop_block_scope(); + $$ = f_new_inst(FI_FOR_INIT, $6, $3); + $$->next = f_new_inst(FI_FOR_NEXT, $3, $9); + } + | symbol_known '=' term ';' { switch ($1->class) { case SYM_VARIABLE_RANGE: $$ = f_new_inst(FI_VAR_SET, $3, $1); break; case SYM_ATTRIBUTE: - $$ = f_new_inst(FI_EA_SET, $3, *$1->attribute); + if ($1->attribute->readonly) + cf_error("Attribute %s is read-only", $1->attribute->name); + $$ = f_new_inst(FI_EA_SET, $3, $1->attribute); break; default: cf_error("Can't assign to symbol %s", $1->name); @@ -871,16 +961,17 @@ cmd: DBG( "Ook, we'll return the value\n" ); $$ = f_new_inst(FI_RETURN, $2); } - | dynamic_attr '=' term ';' { - $$ = f_new_inst(FI_EA_SET, $3, $1); - } | static_attr '=' term ';' { if ($1.readonly) cf_error( "This static attribute is read-only."); $$ = f_new_inst(FI_RTA_SET, $3, $1); } - | UNSET '(' dynamic_attr ')' ';' { - $$ = f_new_inst(FI_EA_UNSET, $3); + | UNSET '(' symbol_known ')' ';' { + if ($3->class != SYM_ATTRIBUTE) + cf_error("Can't unset %s", $3->name); + if ($3->attribute->readonly) + cf_error("Attribute %s is read-only", $3->attribute->name); + $$ = f_new_inst(FI_EA_UNSET, $3->attribute); } | break_command print_list ';' { struct f_inst *breaker = f_new_inst(FI_DIE, $1); @@ -900,16 +991,16 @@ cmd: | PRINTN print_list ';' { $$ = f_new_inst(FI_PRINT, $2); } - | function_call ';' { $$ = f_new_inst(FI_DROP_RESULT, $1); } + | function_call ';' { $$ = f_new_inst(FI_DROP_RESULT, $1); } | CASE term '{' switch_body '}' { - $$ = f_new_inst(FI_SWITCH, $2, build_tree($4)); + $$ = f_new_inst(FI_SWITCH, $2, $4); } - | dynamic_attr '.' EMPTY ';' { $$ = f_generate_empty($1); } - | dynamic_attr '.' PREPEND '(' term ')' ';' { $$ = f_generate_complex( FI_PATH_PREPEND, $1, $5 ); } - | dynamic_attr '.' ADD '(' term ')' ';' { $$ = f_generate_complex( FI_CLIST_ADD, $1, $5 ); } - | dynamic_attr '.' DELETE '(' term ')' ';' { $$ = f_generate_complex( FI_CLIST_DEL, $1, $5 ); } - | dynamic_attr '.' FILTER '(' term ')' ';' { $$ = f_generate_complex( FI_CLIST_FILTER, $1, $5 ); } + | symbol_known '.' EMPTY ';' { $$ = f_generate_empty($1); } + | symbol_known '.' PREPEND '(' term ')' ';' { $$ = f_generate_complex_sym( FI_PATH_PREPEND, $1, $5 ); } + | symbol_known '.' ADD '(' term ')' ';' { $$ = f_generate_complex_sym( FI_CLIST_ADD, $1, $5 ); } + | symbol_known '.' DELETE '(' term ')' ';' { $$ = f_generate_complex_sym( FI_CLIST_DEL, $1, $5 ); } + | symbol_known '.' FILTER '(' term ')' ';' { $$ = f_generate_complex_sym( FI_CLIST_FILTER, $1, $5 ); } | BT_ASSERT '(' get_cf_position term get_cf_position ')' ';' { $$ = assert_done($4, $3 + 1, $5 - 1); } | BT_CHECK_ASSIGN '(' get_cf_position lvalue get_cf_position ',' term ')' ';' { $$ = assert_assign(&$4, $7, $3 + 1, $5 - 1); } ; @@ -920,8 +1011,17 @@ get_cf_position: }; lvalue: - CF_SYM_KNOWN { cf_assert_symbol($1, SYM_VARIABLE); $$ = (struct f_lval) { .type = F_LVAL_VARIABLE, .sym = $1 }; } + symbol_known { + switch ($1->class) { + case SYM_VARIABLE_RANGE: + $$ = (struct f_lval) { .type = F_LVAL_VARIABLE, .sym = $1 }; + break; + case SYM_ATTRIBUTE: + $$ = (struct f_lval) { .type = F_LVAL_EA, .da = $1->attribute }; + break; + } + } | static_attr { $$ = (struct f_lval) { .type = F_LVAL_SA, .sa = $1 }; } - | dynamic_attr { $$ = (struct f_lval) { .type = F_LVAL_EA, .da = $1 }; }; + ; CF_END diff --git a/filter/data.c b/filter/data.c index 56c1fb17..f104d2f8 100644 --- a/filter/data.c +++ b/filter/data.c @@ -16,10 +16,10 @@ #include "lib/unaligned.h" #include "lib/net.h" #include "lib/ip.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/attrs.h" +#include "lib/attrs.h" #include "conf/conf.h" #include "filter/filter.h" #include "filter/f-inst.h" @@ -27,6 +27,8 @@ static const char * const f_type_str[] = { [T_VOID] = "void", + [T_OPAQUE] = "opaque byte string", + [T_IFACE] = "interface", [T_INT] = "int", [T_BOOL] = "bool", @@ -36,7 +38,6 @@ static const char * const f_type_str[] = { [T_ENUM_RTS] = "enum rts", [T_ENUM_BGP_ORIGIN] = "enum bgp_origin", [T_ENUM_SCOPE] = "enum scope", - [T_ENUM_RTC] = "enum rtc", [T_ENUM_RTD] = "enum rtd", [T_ENUM_ROA] = "enum roa", [T_ENUM_NETTYPE] = "enum nettype", @@ -47,6 +48,7 @@ static const char * const f_type_str[] = { [T_NET] = "prefix", [T_STRING] = "string", [T_PATH_MASK] = "bgpmask", + [T_PATH_MASK_ITEM] = "bgpmask item", [T_PATH] = "bgppath", [T_CLIST] = "clist", [T_EC] = "ec", @@ -54,24 +56,22 @@ static const char * const f_type_str[] = { [T_LC] = "lc", [T_LCLIST] = "lclist", [T_RD] = "rd", + + [T_SET] = "set", + [T_PREFIX_SET] = "prefix set", }; const char * -f_type_name(enum f_type t) +f_type_name(btype t) { - if (t < ARRAY_SIZE(f_type_str)) - return f_type_str[t] ?: "?"; - - if ((t == T_SET) || (t == T_PREFIX_SET)) - return "set"; - - return "?"; + return (t < ARRAY_SIZE(f_type_str)) ? (f_type_str[t] ?: "?") : "?"; } -enum f_type -f_type_element_type(enum f_type t) +btype +f_type_element_type(btype t) { switch(t) { + case T_PATH: return T_INT; case T_CLIST: return T_PAIR; case T_ECLIST: return T_EC; case T_LCLIST: return T_LC; @@ -79,6 +79,8 @@ f_type_element_type(enum f_type t) }; } +const struct f_trie f_const_empty_trie = { .ipv4 = -1, }; + const struct f_val f_const_empty_path = { .type = T_PATH, .val.ad = &null_adata, @@ -91,16 +93,11 @@ const struct f_val f_const_empty_path = { }, f_const_empty_lclist = { .type = T_LCLIST, .val.ad = &null_adata, +}, f_const_empty_prefix_set = { + .type = T_PREFIX_SET, + .val.ti = &f_const_empty_trie, }; -static struct adata * -adata_empty(struct linpool *pool, int l) -{ - struct adata *res = lp_alloc(pool, sizeof(struct adata) + l); - res->length = l; - return res; -} - static void pm_format(const struct f_path_mask *p, buffer *buf) { @@ -187,7 +184,7 @@ val_compare(const struct f_val *v1, const struct f_val *v2) if (val_is_ip4(v1) && (v2->type == T_QUAD)) return uint_cmp(ipa_to_u32(v1->val.ip), v2->val.i); - debug( "Types do not match in val_compare\n" ); + DBG( "Types do not match in val_compare\n" ); return F_CMP_ERROR; } @@ -301,6 +298,12 @@ val_same(const struct f_val *v1, const struct f_val *v2) int clist_set_type(const struct f_tree *set, struct f_val *v) { + if (!set) + { + v->type = T_VOID; + return 1; + } + switch (set->from.type) { case T_PAIR: @@ -324,7 +327,7 @@ clist_set_type(const struct f_tree *set, struct f_val *v) } } -static int +int clist_match_set(const struct adata *clist, const struct f_tree *set) { if (!clist) @@ -345,7 +348,7 @@ clist_match_set(const struct adata *clist, const struct f_tree *set) return 0; } -static int +int eclist_match_set(const struct adata *list, const struct f_tree *set) { if (!list) @@ -369,7 +372,7 @@ eclist_match_set(const struct adata *list, const struct f_tree *set) return 0; } -static int +int lclist_match_set(const struct adata *list, const struct f_tree *set) { if (!list) @@ -423,7 +426,7 @@ clist_filter(struct linpool *pool, const struct adata *list, const struct f_val if (nl == list->length) return list; - struct adata *res = adata_empty(pool, nl); + struct adata *res = lp_alloc_adata(pool, nl); memcpy(res->data, tmp, nl); return res; } @@ -457,7 +460,7 @@ eclist_filter(struct linpool *pool, const struct adata *list, const struct f_val if (nl == list->length) return list; - struct adata *res = adata_empty(pool, nl); + struct adata *res = lp_alloc_adata(pool, nl); memcpy(res->data, tmp, nl); return res; } @@ -489,7 +492,7 @@ lclist_filter(struct linpool *pool, const struct adata *list, const struct f_val if (nl == list->length) return list; - struct adata *res = adata_empty(pool, nl); + struct adata *res = lp_alloc_adata(pool, nl); memcpy(res->data, tmp, nl); return res; } @@ -537,6 +540,9 @@ val_in_range(const struct f_val *v1, const struct f_val *v2) if (v2->type != T_SET) return F_CMP_ERROR; + if (!v2->val.t) + return 0; + /* With integrated Quad<->IP implicit conversion */ if ((v1->type == v2->val.t->from.type) || ((v1->type == T_QUAD) && val_is_ip4(&(v2->val.t->from)) && val_is_ip4(&(v2->val.t->to)))) @@ -610,3 +616,75 @@ val_dump(const struct f_val *v) { return val_dump_buffer; } + +struct f_val * +lp_val_copy(struct linpool *lp, const struct f_val *v) +{ + switch (v->type) + { + case T_VOID: + case T_BOOL: + case T_INT: + case T_IP: + case T_PAIR: + case T_QUAD: + case T_EC: + case T_LC: + case T_RD: + case T_ENUM: + case T_PATH_MASK_ITEM: + /* These aren't embedded but there is no need to copy them */ + case T_SET: + case T_PREFIX_SET: + case T_PATH_MASK: + case T_IFACE: + { + struct f_val *out = lp_alloc(lp, sizeof(*out)); + *out = *v; + return out; + } + + case T_NET: + { + struct { + struct f_val val; + net_addr net[0]; + } *out = lp_alloc(lp, sizeof(*out) + v->val.net->length); + out->val = *v; + out->val.val.net = out->net; + net_copy(out->net, v->val.net); + return &out->val; + } + + case T_STRING: + { + uint len = strlen(v->val.s); + struct { + struct f_val val; + char buf[0]; + } *out = lp_alloc(lp, sizeof(*out) + len + 1); + out->val = *v; + out->val.val.s = out->buf; + memcpy(out->buf, v->val.s, len+1); + return &out->val; + } + + case T_PATH: + case T_CLIST: + case T_ECLIST: + case T_LCLIST: + { + struct { + struct f_val val; + struct adata ad; + } *out = lp_alloc(lp, sizeof(*out) + v->val.ad->length); + out->val = *v; + out->val.val.ad = &out->ad; + memcpy(&out->ad, v->val.ad, v->val.ad->length); + return &out->val; + } + + default: + bug("Unknown type in value copy: %d", v->type); + } +} diff --git a/filter/data.h b/filter/data.h index 6d4ca333..b5db4aec 100644 --- a/filter/data.h +++ b/filter/data.h @@ -11,110 +11,37 @@ #define _BIRD_FILTER_DATA_H_ #include "nest/bird.h" - -/* Type numbers must be in 0..0xff range */ -#define T_MASK 0xff - -/* Internal types */ -enum f_type { -/* Nothing. Simply nothing. */ - T_VOID = 0, - -/* User visible types, which fit in int */ - T_INT = 0x10, - T_BOOL = 0x11, - T_PAIR = 0x12, /* Notice that pair is stored as integer: first << 16 | second */ - T_QUAD = 0x13, - -/* Put enumerational types in 0x30..0x3f range */ - T_ENUM_LO = 0x30, - T_ENUM_HI = 0x3f, - - T_ENUM_RTS = 0x30, - T_ENUM_BGP_ORIGIN = 0x31, - T_ENUM_SCOPE = 0x32, - T_ENUM_RTC = 0x33, - T_ENUM_RTD = 0x34, - T_ENUM_ROA = 0x35, - T_ENUM_NETTYPE = 0x36, - T_ENUM_RA_PREFERENCE = 0x37, - T_ENUM_AF = 0x38, - -/* new enums go here */ - T_ENUM_EMPTY = 0x3f, /* Special hack for atomic_aggr */ - -#define T_ENUM T_ENUM_LO ... T_ENUM_HI - -/* Bigger ones */ - T_IP = 0x20, - T_NET = 0x21, - T_STRING = 0x22, - T_PATH_MASK = 0x23, /* mask for BGP path */ - T_PATH = 0x24, /* BGP path */ - T_CLIST = 0x25, /* Community list */ - T_EC = 0x26, /* Extended community value, u64 */ - T_ECLIST = 0x27, /* Extended community list */ - T_LC = 0x28, /* Large community value, lcomm */ - T_LCLIST = 0x29, /* Large community list */ - T_RD = 0x2a, /* Route distinguisher for VPN addresses */ - T_PATH_MASK_ITEM = 0x2b, /* Path mask item for path mask constructors */ - - T_SET = 0x80, - T_PREFIX_SET = 0x81, -} PACKED; +#include "lib/type.h" /* Filter value; size of this affects filter memory consumption */ struct f_val { - enum f_type type; /* T_* */ - union { - uint i; - u64 ec; - lcomm lc; - ip_addr ip; - const net_addr *net; - const char *s; - const struct f_tree *t; - const struct f_trie *ti; - const struct adata *ad; - const struct f_path_mask *path_mask; - struct f_path_mask_item pmi; - } val; + btype type; /* T_* */ + union bval_long val; }; -/* Dynamic attribute definition (eattrs) */ -struct f_dynamic_attr { - u8 type; /* EA type (EAF_*) */ - u8 bit; /* For bitfield accessors */ - enum f_type f_type; /* Filter type */ - uint ea_code; /* EA code */ -}; +#define fputip(a) ({ ip_addr *ax = falloc(sizeof(*ax)); *ax = (a); ax; }) enum f_sa_code { - SA_FROM = 1, - SA_GW, + SA_GW = 1, SA_NET, SA_PROTO, - SA_SOURCE, - SA_SCOPE, SA_DEST, SA_IFNAME, SA_IFINDEX, SA_WEIGHT, - SA_PREF, SA_GW_MPLS, } PACKED; /* Static attribute definition (members of struct rta) */ struct f_static_attr { - enum f_type f_type; /* Filter type */ + btype type; /* Data type */ enum f_sa_code sa_code; /* Static attribute id */ - int readonly:1; /* Don't allow writing */ + int readonly:1; /* Don't allow writing */ }; /* Filter l-value type */ enum f_lval_type { F_LVAL_VARIABLE, - F_LVAL_PREFERENCE, F_LVAL_SA, F_LVAL_EA, }; @@ -124,7 +51,7 @@ struct f_lval { enum f_lval_type type; union { struct symbol *sym; - struct f_dynamic_attr da; + const struct ea_class *da; struct f_static_attr sa; }; }; @@ -141,18 +68,30 @@ struct f_tree { void *data; }; +#ifdef ENABLE_COMPACT_TRIES +/* Compact 4-way tries */ +#define TRIE_STEP 2 +#define TRIE_STACK_LENGTH 65 +#else +/* Faster 16-way tries */ +#define TRIE_STEP 4 +#define TRIE_STACK_LENGTH 33 +#endif + struct f_trie_node4 { ip4_addr addr, mask, accept; - uint plen; - struct f_trie_node4 *c[2]; + u16 plen; + u16 local; + struct f_trie_node4 *c[1 << TRIE_STEP]; }; struct f_trie_node6 { ip6_addr addr, mask, accept; - uint plen; - struct f_trie_node6 *c[2]; + u16 plen; + u16 local; + struct f_trie_node6 *c[1 << TRIE_STEP]; }; struct f_trie_node @@ -169,27 +108,101 @@ struct f_trie u8 zero; s8 ipv4; /* -1 for undefined / empty */ u16 data_size; /* Additional data for each trie node */ + u32 prefix_count; /* Works only for restricted tries (pxlen == l == h) */ struct f_trie_node root; /* Root trie node */ }; +struct f_trie_walk_state +{ + u8 ipv4; + u8 accept_length; /* Current inter-node prefix position */ + u8 start_pos; /* Initial prefix position in stack[0] */ + u8 local_pos; /* Current intra-node prefix position */ + u8 stack_pos; /* Current node in stack below */ + const struct f_trie_node *stack[TRIE_STACK_LENGTH]; +}; + struct f_tree *f_new_tree(void); struct f_tree *build_tree(struct f_tree *); const struct f_tree *find_tree(const struct f_tree *t, const struct f_val *val); +const struct f_tree *find_tree_linear(const struct f_tree *t, const struct f_val *val); int same_tree(const struct f_tree *t0, const struct f_tree *t2); +int tree_node_count(const struct f_tree *t); void tree_format(const struct f_tree *t, buffer *buf); void tree_walk(const struct f_tree *t, void (*hook)(const struct f_tree *, void *), void *data); struct f_trie *f_new_trie(linpool *lp, uint data_size); void *trie_add_prefix(struct f_trie *t, const net_addr *n, uint l, uint h); int trie_match_net(const struct f_trie *t, const net_addr *n); +int trie_match_longest_ip4(const struct f_trie *t, const net_addr_ip4 *net, net_addr_ip4 *dst, ip4_addr *found0); +int trie_match_longest_ip6(const struct f_trie *t, const net_addr_ip6 *net, net_addr_ip6 *dst, ip6_addr *found0); +void trie_walk_init(struct f_trie_walk_state *s, const struct f_trie *t, const net_addr *from); +int trie_walk_next(struct f_trie_walk_state *s, net_addr *net); int trie_same(const struct f_trie *t1, const struct f_trie *t2); void trie_format(const struct f_trie *t, buffer *buf); +static inline int +trie_match_next_longest_ip4(net_addr_ip4 *n, ip4_addr *found) +{ + while (n->pxlen) + { + n->pxlen--; + ip4_clrbit(&n->prefix, n->pxlen); + + if (ip4_getbit(*found, n->pxlen)) + return 1; + } + + return 0; +} + +static inline int +trie_match_next_longest_ip6(net_addr_ip6 *n, ip6_addr *found) +{ + while (n->pxlen) + { + n->pxlen--; + ip6_clrbit(&n->prefix, n->pxlen); + + if (ip6_getbit(*found, n->pxlen)) + return 1; + } + + return 0; +} + + +#define TRIE_WALK_TO_ROOT_IP4(trie, net, dst) ({ \ + net_addr_ip4 dst; \ + ip4_addr _found; \ + for (int _n = trie_match_longest_ip4(trie, net, &dst, &_found); \ + _n; \ + _n = trie_match_next_longest_ip4(&dst, &_found)) + +#define TRIE_WALK_TO_ROOT_IP6(trie, net, dst) ({ \ + net_addr_ip6 dst; \ + ip6_addr _found; \ + for (int _n = trie_match_longest_ip6(trie, net, &dst, &_found); \ + _n; \ + _n = trie_match_next_longest_ip6(&dst, &_found)) + +#define TRIE_WALK_TO_ROOT_END }) + + +#define TRIE_WALK(trie, net, from) ({ \ + net_addr net; \ + struct f_trie_walk_state tws_; \ + trie_walk_init(&tws_, trie, from); \ + while (trie_walk_next(&tws_, &net)) + +#define TRIE_WALK_END }) + + #define F_CMP_ERROR 999 -const char *f_type_name(enum f_type t); +const char *f_type_name(btype t); -enum f_type f_type_element_type(enum f_type t); +enum btype f_type_element_type(btype t); int val_same(const struct f_val *v1, const struct f_val *v2); int val_compare(const struct f_val *v1, const struct f_val *v2); @@ -197,15 +210,23 @@ void val_format(const struct f_val *v, buffer *buf); char *val_format_str(struct linpool *lp, const struct f_val *v); const char *val_dump(const struct f_val *v); +struct f_val *lp_val_copy(struct linpool *lp, const struct f_val *v); + static inline int val_is_ip4(const struct f_val *v) { return (v->type == T_IP) && ipa_is_ip4(v->val.ip); } int val_in_range(const struct f_val *v1, const struct f_val *v2); int clist_set_type(const struct f_tree *set, struct f_val *v); static inline int eclist_set_type(const struct f_tree *set) -{ return set->from.type == T_EC; } +{ return !set || set->from.type == T_EC; } static inline int lclist_set_type(const struct f_tree *set) -{ return set->from.type == T_LC; } +{ return !set || set->from.type == T_LC; } +static inline int path_set_type(const struct f_tree *set) +{ return !set || set->from.type == T_INT; } + +int clist_match_set(const struct adata *clist, const struct f_tree *set); +int eclist_match_set(const struct adata *list, const struct f_tree *set); +int lclist_match_set(const struct adata *list, const struct f_tree *set); const struct adata *clist_filter(struct linpool *pool, const struct adata *list, const struct f_val *set, int pos); const struct adata *eclist_filter(struct linpool *pool, const struct adata *list, const struct f_val *set, int pos); @@ -221,8 +242,18 @@ undef_value(struct f_val v) (v.val.ad == &null_adata); } -extern const struct f_val f_const_empty_path, f_const_empty_clist, f_const_empty_eclist, f_const_empty_lclist; +extern const struct f_val f_const_empty_path, f_const_empty_clist, f_const_empty_eclist, f_const_empty_lclist, f_const_empty_prefix_set; +static inline const struct f_val *f_get_empty(btype t) +{ + switch (t) { + case T_PATH: return &f_const_empty_path; + case T_CLIST: return &f_const_empty_clist; + case T_ECLIST: return &f_const_empty_eclist; + case T_LCLIST: return &f_const_empty_lclist; + default: return NULL; + } +} -enum filter_return f_eval(const struct f_line *expr, struct linpool *tmp_pool, struct f_val *pres); +enum filter_return f_eval(const struct f_line *expr, struct f_val *pres); #endif diff --git a/filter/decl.m4 b/filter/decl.m4 index 44537aaa..3179a331 100644 --- a/filter/decl.m4 +++ b/filter/decl.m4 @@ -94,7 +94,7 @@ FID_DUMP_BODY()m4_dnl debug("%s" $4 "\n", INDENT, $5); ]]) FID_INTERPRET_EXEC()m4_dnl -const $1 $2 = whati->$2 +$1 $2 = whati->$2 FID_INTERPRET_BODY') # Instruction arguments are needed only until linearization is done. @@ -191,6 +191,12 @@ if (f$1->type && f$2->type && (f$1->type != f$2->type) && cf_error("Arguments $1 and $2 of %s must be of the same type", f_instruction_name(what->fi_code)); FID_INTERPRET_BODY()') +m4_define(ARG_PREFER_SAME_TYPE, ` +FID_NEW_BODY()m4_dnl +if (f$1->type && f$2->type && (f$1->type != f$2->type)) + (void) (f_const_promotion(f$2, f$1->type) || f_const_promotion(f$1, f$2->type)); +FID_INTERPRET_BODY()') + # Executing another filter line. This replaces the recursion # that was needed in the former implementation. m4_define(LINEX, `FID_INTERPRET_EXEC()LINEX_($1)FID_INTERPRET_NEW()return $1 FID_INTERPRET_BODY()') @@ -216,7 +222,7 @@ whati->f$1 = f$1; FID_DUMP_BODY()m4_dnl f_dump_line(item->fl$1, indent + 1); FID_LINEARIZE_BODY()m4_dnl -item->fl$1 = f_linearize(whati->f$1); +item->fl$1 = f_linearize(whati->f$1, $2); FID_SAME_BODY()m4_dnl if (!f_same(f1->fl$1, f2->fl$1)) return 0; FID_ITERATE_BODY()m4_dnl @@ -244,9 +250,13 @@ m4_define(ERROR, # This macro specifies result type and makes there are no conflicting definitions m4_define(RESULT_TYPE, `m4_ifdef([[INST_RESULT_TYPE]], - [[m4_ifelse(INST_RESULT_TYPE,$1,,[[ERROR([[Multiple type definitons]])]])]], + [[m4_ifelse(INST_RESULT_TYPE,$1,,[[ERROR([[Multiple type definitions in]] INST_NAME)]])]], [[m4_define(INST_RESULT_TYPE,$1) RESULT_TYPE_($1)]])') +m4_define(RESULT_TYPE_CHECK, + `m4_ifelse(INST_OUTVAL,0,, + [[m4_ifdef([[INST_RESULT_TYPE]],,[[ERROR([[Missing type definition in]] INST_NAME)]])]])') + m4_define(RESULT_TYPE_, ` FID_NEW_BODY()m4_dnl what->type = $1; @@ -256,7 +266,7 @@ FID_INTERPRET_BODY()') m4_define(SYMBOL, `FID_MEMBER(struct symbol *, sym, [[strcmp(f1->sym->name, f2->sym->name) || (f1->sym->class != f2->sym->class)]], "symbol %s", item->sym->name)') m4_define(RTC, `FID_MEMBER(struct rtable_config *, rtc, [[strcmp(f1->rtc->name, f2->rtc->name)]], "route table %s", item->rtc->name)') m4_define(STATIC_ATTR, `FID_MEMBER(struct f_static_attr, sa, f1->sa.sa_code != f2->sa.sa_code,,)') -m4_define(DYNAMIC_ATTR, `FID_MEMBER(struct f_dynamic_attr, da, f1->da.ea_code != f2->da.ea_code,,)') +m4_define(DYNAMIC_ATTR, `FID_MEMBER(const struct ea_class *, da, f1->da != f2->da,,)') m4_define(ACCESS_RTE, `FID_HIC(,[[do { if (!fs->rte) runtime("No route to access"); } while (0)]],NEVER_CONSTANT())') # 2) Code wrapping @@ -300,6 +310,7 @@ m4_define(FID_ITERATE, `FID_ZONE(10, Iteration)') # This macro does all the code wrapping. See inline comments. m4_define(INST_FLUSH, `m4_ifdef([[INST_NAME]], [[ +RESULT_TYPE_CHECK()m4_dnl Check for defined RESULT_TYPE() FID_ENUM()m4_dnl Contents of enum fi_code { ... } INST_NAME(), FID_ENUM_STR()m4_dnl Contents of const char * indexed by enum fi_code @@ -375,6 +386,7 @@ case INST_NAME(): { #undef whati #undef item dest->items[pos].fi_code = what->fi_code; + dest->items[pos].flags = what->flags; dest->items[pos].lineno = what->lineno; break; } @@ -402,6 +414,7 @@ m4_define(INST, `m4_dnl This macro is called on beginning of each instruction INST_FLUSH()m4_dnl First, old data is flushed m4_define([[INST_NAME]], [[$1]])m4_dnl Then we store instruction name, m4_define([[INST_INVAL]], [[$2]])m4_dnl instruction input value count, +m4_define([[INST_OUTVAL]], [[$3]])m4_dnl instruction output value count, m4_undefine([[INST_NEVER_CONSTANT]])m4_dnl reset NEVER_CONSTANT trigger, m4_undefine([[INST_RESULT_TYPE]])m4_dnl and reset RESULT_TYPE value. FID_INTERPRET_BODY()m4_dnl By default, every code is interpreter code. @@ -474,7 +487,7 @@ f_instruction_name_(enum f_instruction_code fi) static inline struct f_inst * fi_new(enum f_instruction_code fi_code) { - struct f_inst *what = cfg_allocz(sizeof(struct f_inst)); + struct f_inst *what = tmp_allocz(sizeof(struct f_inst)); what->lineno = ifs->lino; what->size = 1; what->fi_code = fi_code; @@ -490,7 +503,7 @@ fi_constant(struct f_inst *what, struct f_val val) } static int -f_const_promotion(struct f_inst *arg, enum f_type want) +f_const_promotion(struct f_inst *arg, btype want) { if (arg->fi_code != FI_CONSTANT) return 0; @@ -505,6 +518,11 @@ f_const_promotion(struct f_inst *arg, enum f_type want) return 1; } + else if ((c->type == T_SET) && (!c->val.t) && (want == T_PREFIX_SET)) { + *c = f_const_empty_prefix_set; + return 1; + } + return 0; } @@ -560,7 +578,7 @@ FID_WR_PUT(8) } struct f_line * -f_linearize_concat(const struct f_inst * const inst[], uint count) +f_linearize_concat(const struct f_inst * const inst[], uint count, uint results) { uint len = 0; for (uint i=0; i<count; i++) @@ -572,6 +590,8 @@ f_linearize_concat(const struct f_inst * const inst[], uint count) for (uint i=0; i<count; i++) out->len = linearize(out, inst[i], out->len); + out->results = results; + #ifdef LOCAL_DEBUG f_dump_line(out, 0); #endif @@ -640,7 +660,8 @@ FID_WR_PUT(4)m4_dnl struct f_inst { struct f_inst *next; /* Next instruction */ enum f_instruction_code fi_code; /* Instruction code */ - enum f_type type; /* Type of returned value, if known */ + enum f_instruction_flags flags; /* Flags, instruction-specific */ + btype type; /* Type of returned value, if known */ int size; /* How many instructions are underneath */ int lineno; /* Line number */ union { diff --git a/filter/f-inst.c b/filter/f-inst.c index b4795357..caffc2b8 100644 --- a/filter/f-inst.c +++ b/filter/f-inst.c @@ -62,14 +62,14 @@ * m4_dnl INST(FI_NOP, in, out) { enum value, input args, output args * m4_dnl ARG(num, type); argument, its id (in data fields) and type accessible by v1, v2, v3 * m4_dnl ARG_ANY(num); argument with no type check accessible by v1, v2, v3 + * m4_dnl ARG_TYPE(num, type); just declare the type of argument * m4_dnl VARARG; variable-length argument list; accessible by vv(i) and whati->varcount - * m4_dnl LINE(num, unused); this argument has to be converted to its own f_line + * m4_dnl LINE(num, out); this argument has to be converted to its own f_line * m4_dnl SYMBOL; symbol handed from config * m4_dnl STATIC_ATTR; static attribute definition * m4_dnl DYNAMIC_ATTR; dynamic attribute definition * m4_dnl RTC; route table config * m4_dnl ACCESS_RTE; this instruction needs route - * m4_dnl ACCESS_EATTRS; this instruction needs extended attributes * * m4_dnl FID_MEMBER( custom instruction member * m4_dnl C type, for storage in structs @@ -80,10 +80,17 @@ * m4_dnl ) * * m4_dnl RESULT(type, union-field, value); putting this on value stack + * m4_dnl RESULT_(type, union-field, value); like RESULT(), but do not declare the type * m4_dnl RESULT_VAL(value-struct); pass the struct f_val directly + * m4_dnl RESULT_TYPE(type); just declare the type of result value * m4_dnl RESULT_VOID; return undef * m4_dnl } * + * Note that runtime arguments m4_dnl (ARG*, VARARG) must be defined before + * parse-time arguments m4_dnl (LINE, SYMBOL, ...). During linearization, + * first ones move position in f_line by linearizing arguments first, while + * second ones store data to the current position. + * * Also note that the { ... } blocks are not respected by M4 at all. * If you get weird unmatched-brace-pair errors, check what it generated and why. * What is really considered as one instruction is not the { ... } block @@ -91,6 +98,24 @@ * * Other code is just copied into the interpreter part. * + * The filter language uses a simple type system, where values have types + * (constants T_*) and also terms (instructions) are statically typed. Our + * static typing is partial (some terms do not declare types of arguments + * or results), therefore it can detect most but not all type errors and + * therefore we still have runtime type checks. + * + * m4_dnl Types of arguments are declared by macros ARG() and ARG_TYPE(), + * m4_dnl types of results are declared by RESULT() and RESULT_TYPE(). + * m4_dnl Macros ARG_ANY(), RESULT_() and RESULT_VAL() do not declare types + * m4_dnl themselves, but can be combined with ARG_TYPE() / RESULT_TYPE(). + * + * m4_dnl Note that types should be declared only once. If there are + * m4_dnl multiple RESULT() macros in an instruction definition, they must + * m4_dnl use the exact same expression for type, or they should be replaced + * m4_dnl by multiple RESULT_() macros and a common RESULT_TYPE() macro. + * m4_dnl See e.g. FI_EA_GET or FI_MIN instructions. + * + * * If you are satisfied with this, you don't need to read the following * detailed description of what is really done with the instruction definitions. * @@ -211,11 +236,40 @@ * m4_dnl fpool -> the current linpool * m4_dnl NEVER_CONSTANT-> don't generate pre-interpretation code at all * m4_dnl ACCESS_RTE -> check that route is available, also NEVER_CONSTANT - * m4_dnl ACCESS_EATTRS -> pre-cache the eattrs; use only with ACCESS_RTE - * m4_dnl f_rta_cow(fs) -> function to call before any change to route should be done * * m4_dnl If you are stymied, see FI_CALL or FI_CONSTANT or just search for * m4_dnl the mentioned macros in this file to see what is happening there in wild. + * + * + * A note about soundness of the type system: + * + * A type system is sound when types of expressions are consistent with + * types of values resulting from evaluation of such expressions. Untyped + * expressions are ok, but badly typed expressions are not sound. So is + * the type system of BIRD filtering code sound? There are some points: + * + * All cases of (one) m4_dnl RESULT() macro are obviously ok, as the macro + * both declares a type and returns a value. One have to check instructions + * that use m4_dnl RESULT_TYPE() macro. There are two issues: + * + * FI_AND, FI_OR - second argument is statically checked to be T_BOOL and + * passed as result without dynamic typecheck, declared to be T_BOOL. If + * an untyped non-bool expression is used as a second argument, then + * the mismatched type is returned. + * + * FI_VAR_GET - soundness depends on consistency of declared symbol types + * and stored values. This is maintained when values are stored by + * FI_VAR_SET, but when they are stored by FI_CALL, only static checking is + * used, so when an untyped expression returning mismatched value is used + * as a function argument, then inconsistent value is stored and subsequent + * FI_VAR_GET would be unsound. + * + * Both of these issues are inconsequential, as mismatched values from + * unsound expressions will be caught by dynamic typechecks like mismatched + * values from untyped expressions. + * + * Also note that FI_CALL is the only expression without properly declared + * result type. */ /* Binary operators */ @@ -240,13 +294,23 @@ if (v2.val.i == 0) runtime( "Mother told me not to divide by 0" ); RESULT(T_INT, i, v1.val.i / v2.val.i); } + INST(FI_BITOR, 2, 1) { + ARG(1,T_INT); + ARG(2,T_INT); + RESULT(T_INT, i, v1.val.i | v2.val.i); + } + INST(FI_BITAND, 2, 1) { + ARG(1,T_INT); + ARG(2,T_INT); + RESULT(T_INT, i, v1.val.i & v2.val.i); + } INST(FI_AND, 1, 1) { ARG(1,T_BOOL); ARG_TYPE_STATIC(2,T_BOOL); RESULT_TYPE(T_BOOL); if (v1.val.i) - LINE(2,0); + LINE(2,1); else RESULT_VAL(v1); } @@ -256,7 +320,7 @@ RESULT_TYPE(T_BOOL); if (!v1.val.i) - LINE(2,0); + LINE(2,1); else RESULT_VAL(v1); } @@ -349,7 +413,7 @@ break; case T_SET: - if (vv(i).val.t->from.type != T_INT) + if (!path_set_type(vv(i).val.t)) runtime("Only integer sets allowed in path mask"); pm->item[i] = (struct f_path_mask_item) { @@ -371,12 +435,14 @@ INST(FI_NEQ, 2, 1) { ARG_ANY(1); ARG_ANY(2); + ARG_PREFER_SAME_TYPE(1, 2); RESULT(T_BOOL, i, !val_same(&v1, &v2)); } INST(FI_EQ, 2, 1) { ARG_ANY(1); ARG_ANY(2); + ARG_PREFER_SAME_TYPE(1, 2); RESULT(T_BOOL, i, val_same(&v1, &v2)); } @@ -447,6 +513,18 @@ RESULT(T_BOOL, i, ipa_is_ip4(v1.val.ip)); } + INST(FI_VAR_INIT, 1, 0) { + NEVER_CONSTANT; + ARG_ANY(1); + SYMBOL; + ARG_TYPE(1, sym->class & 0xff); + + /* New variable is always the last on stack */ + uint pos = curline.vbase + sym->offset; + fstk->vstk[pos] = v1; + fstk->vcnt = pos + 1; + } + /* Set to indirect value prepared in v1 */ INST(FI_VAR_SET, 1, 0) { NEVER_CONSTANT; @@ -477,12 +555,100 @@ RESULT_VAL(val); } + INST(FI_FOR_INIT, 1, 0) { + NEVER_CONSTANT; + ARG_ANY(1); + SYMBOL; + + FID_NEW_BODY() + ASSERT((sym->class & ~0xff) == SYM_VARIABLE); + + /* Static type check */ + if (f1->type) + { + enum btype t_var = (sym->class & 0xff); + enum btype t_arg = f_type_element_type(f1->type); + if (!t_arg) + cf_error("Value of expression in FOR must be iterable, got %s", + f_type_name(f1->type)); + if (t_var != t_arg) + cf_error("Loop variable '%s' in FOR must be %s, is %s", + sym->name, f_type_name(t_arg), f_type_name(t_var)); + } + + FID_INTERPRET_BODY() + + /* Dynamic type check */ + if ((sym->class & 0xff) != f_type_element_type(v1.type)) + runtime("Mismatched argument and variable type"); + + /* Setup the index */ + v2 = (struct f_val) { .type = T_INT, .val.i = 0 }; + + /* Keep v1 and v2 on the stack */ + fstk->vcnt += 2; + } + + INST(FI_FOR_NEXT, 2, 0) { + NEVER_CONSTANT; + SYMBOL; + + /* Type checks are done in FI_FOR_INIT */ + + /* Loop variable */ + struct f_val *var = &fstk->vstk[curline.vbase + sym->offset]; + int step = 0; + + switch(v1.type) + { + case T_PATH: + var->type = T_INT; + step = as_path_walk(v1.val.ad, &v2.val.i, &var->val.i); + break; + + case T_CLIST: + var->type = T_PAIR; + step = int_set_walk(v1.val.ad, &v2.val.i, &var->val.i); + break; + + case T_ECLIST: + var->type = T_EC; + step = ec_set_walk(v1.val.ad, &v2.val.i, &var->val.ec); + break; + + case T_LCLIST: + var->type = T_LC; + step = lc_set_walk(v1.val.ad, &v2.val.i, &var->val.lc); + break; + + default: + runtime( "Clist or lclist expected" ); + } + + if (step) + { + /* Keep v1 and v2 on the stack */ + fstk->vcnt += 2; + + /* Repeat this instruction */ + curline.pos--; + + /* Execute the loop body */ + LINE(1, 0); + + /* Space for loop variable, may be unused */ + fstk->vcnt += 1; + } + else + var->type = T_VOID; + } + INST(FI_CONDITION, 1, 0) { ARG(1, T_BOOL); if (v1.val.i) LINE(2,0); else - LINE(3,1); + LINE(3,0); } INST(FI_PRINT, 0, 0) { @@ -519,25 +685,43 @@ { STATIC_ATTR; ACCESS_RTE; - struct rta *rta = fs->rte->attrs; switch (sa.sa_code) { - case SA_FROM: RESULT(sa.f_type, ip, rta->from); break; - case SA_GW: RESULT(sa.f_type, ip, rta->nh.gw); break; - case SA_NET: RESULT(sa.f_type, net, fs->rte->net); break; - case SA_PROTO: RESULT(sa.f_type, s, fs->rte->src->owner->name); break; - case SA_SOURCE: RESULT(sa.f_type, i, rta->source); break; - case SA_SCOPE: RESULT(sa.f_type, i, rta->scope); break; - case SA_DEST: RESULT(sa.f_type, i, rta->dest); break; - case SA_IFNAME: RESULT(sa.f_type, s, rta->nh.iface ? rta->nh.iface->name : ""); break; - case SA_IFINDEX: RESULT(sa.f_type, i, rta->nh.iface ? rta->nh.iface->index : 0); break; - case SA_WEIGHT: RESULT(sa.f_type, i, rta->nh.weight + 1); break; - case SA_PREF: RESULT(sa.f_type, i, rta->pref); break; - case SA_GW_MPLS: RESULT(sa.f_type, i, rta->nh.labels ? rta->nh.label[0] : MPLS_NULL); break; - + case SA_NET: RESULT(sa.type, net, fs->rte->net); break; + case SA_PROTO: RESULT(sa.type, s, fs->rte->src->owner->name); break; default: - bug("Invalid static attribute access (%u/%u)", sa.f_type, sa.sa_code); + { + struct eattr *nhea = ea_find(fs->rte->attrs, &ea_gen_nexthop); + struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + struct nexthop *nh = nhad ? &nhad->nh : NULL; + + switch (sa.sa_code) + { + case SA_DEST: + RESULT(sa.type, i, nhad ? + (NEXTHOP_IS_REACHABLE(nhad) ? RTD_UNICAST : nhad->dest) + : RTD_NONE); + break; + case SA_GW: + RESULT(sa.type, ip, nh ? nh->gw : IPA_NONE); + break; + case SA_IFNAME: + RESULT(sa.type, s, (nh && nh->iface) ? nh->iface->name : ""); + break; + case SA_IFINDEX: + RESULT(sa.type, i, (nh && nh->iface) ? nh->iface->index : 0); + break; + case SA_WEIGHT: + RESULT(sa.type, i, (nh ? nh->weight : 0) + 1); + break; + case SA_GW_MPLS: + RESULT(sa.type, i, (nh && nh->labels) ? nh->label[0] : MPLS_NULL); + break; + default: + bug("Invalid static attribute access (%u/%u)", sa.type, sa.sa_code); + } + } } } } @@ -546,52 +730,52 @@ ACCESS_RTE; ARG_ANY(1); STATIC_ATTR; - ARG_TYPE(1, sa.f_type); - - f_rta_cow(fs); + ARG_TYPE(1, sa.type); { - struct rta *rta = fs->rte->attrs; + union { + struct nexthop_adata nha; + struct { + struct adata ad; + struct nexthop nh; + u32 label; + }; + } nha; + + nha.ad = (struct adata) { + .length = sizeof (struct nexthop_adata) - sizeof (struct adata), + }; + + eattr *a = NULL; switch (sa.sa_code) { - case SA_FROM: - rta->from = v1.val.ip; - break; + case SA_DEST: + { + int i = v1.val.i; + if ((i != RTD_BLACKHOLE) && (i != RTD_UNREACHABLE) && (i != RTD_PROHIBIT)) + runtime( "Destination can be changed only to blackhole, unreachable or prohibit" ); + nha.nha.dest = i; + nha.ad.length = NEXTHOP_DEST_SIZE; + break; + } case SA_GW: { + struct eattr *nh_ea = ea_find(fs->rte->attrs, &ea_gen_nexthop); + ip_addr ip = v1.val.ip; - struct iface *ifa = ipa_is_link_local(ip) ? rta->nh.iface : NULL; + struct iface *ifa = (ipa_is_link_local(ip) && nh_ea) ? + ((struct nexthop_adata *) nh_ea->u.ptr)->nh.iface : NULL; + /* XXX this code supposes that every owner is a protocol XXX */ neighbor *n = neigh_find(SKIP_BACK(struct proto, sources, fs->rte->src->owner), ip, ifa, 0); if (!n || (n->scope == SCOPE_HOST)) runtime( "Invalid gw address" ); - rta->dest = RTD_UNICAST; - rta->nh.gw = ip; - rta->nh.iface = n->iface; - rta->nh.next = NULL; - rta->hostentry = NULL; - rta->nh.labels = 0; - } - break; - - case SA_SCOPE: - rta->scope = v1.val.i; - break; - - case SA_DEST: - { - int i = v1.val.i; - if ((i != RTD_BLACKHOLE) && (i != RTD_UNREACHABLE) && (i != RTD_PROHIBIT)) - runtime( "Destination can be changed only to blackhole, unreachable or prohibit" ); - - rta->dest = i; - rta->nh.gw = IPA_NONE; - rta->nh.iface = NULL; - rta->nh.next = NULL; - rta->hostentry = NULL; - rta->nh.labels = 0; + nha.nh = (struct nexthop) { + .gw = ip, + .iface = n->iface, + }; } break; @@ -601,12 +785,9 @@ if (!ifa) runtime( "Invalid iface name" ); - rta->dest = RTD_UNICAST; - rta->nh.gw = IPA_NONE; - rta->nh.iface = ifa; - rta->nh.next = NULL; - rta->hostentry = NULL; - rta->nh.labels = 0; + nha.nh = (struct nexthop) { + .iface = ifa, + }; } break; @@ -615,13 +796,20 @@ if (v1.val.i >= 0x100000) runtime( "Invalid MPLS label" ); + struct eattr *nh_ea = ea_find(fs->rte->attrs, &ea_gen_nexthop); + if (!nh_ea) + runtime( "No nexthop to add a MPLS label to" ); + + nha.nh = ((struct nexthop_adata *) nh_ea->u.ptr)->nh; + if (v1.val.i != MPLS_NULL) { - rta->nh.label[0] = v1.val.i; - rta->nh.labels = 1; + nha.nh.label[0] = v1.val.i; + nha.nh.labels = 1; + nha.ad.length = sizeof nha - sizeof (struct adata); } else - rta->nh.labels = 0; + nha.nh.labels = 0; } break; @@ -630,184 +818,120 @@ int i = v1.val.i; if (i < 1 || i > 256) runtime( "Setting weight value out of bounds" ); - if (rta->dest != RTD_UNICAST) + + struct eattr *nh_ea = ea_find(fs->rte->attrs, &ea_gen_nexthop); + if (!nh_ea) + runtime( "No nexthop to set weight on" ); + + struct nexthop_adata *nhad = (struct nexthop_adata *) nh_ea->u.ptr; + if (!NEXTHOP_IS_REACHABLE(nhad)) runtime( "Setting weight needs regular nexthop " ); + struct nexthop_adata *nhax = (struct nexthop_adata *) tmp_copy_adata(&nhad->ad); + /* Set weight on all next hops */ - for (struct nexthop *nh = &rta->nh; nh; nh = nh->next) + NEXTHOP_WALK(nh, nhax) nh->weight = i - 1; - } - break; - case SA_PREF: - rta->pref = v1.val.i; + a = ea_set_attr(&fs->rte->attrs, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhax->ad)); + } break; default: - bug("Invalid static attribute access (%u/%u)", sa.f_type, sa.sa_code); + bug("Invalid static attribute access (%u/%u)", sa.type, sa.sa_code); } + + if (!a) + a = ea_set_attr(&fs->rte->attrs, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, tmp_copy_adata(&nha.ad))); + + a->originated = 1; + a->fresh = 1; } } INST(FI_EA_GET, 0, 1) { /* Access to extended attributes */ DYNAMIC_ATTR; ACCESS_RTE; - ACCESS_EATTRS; - RESULT_TYPE(da.f_type); + RESULT_TYPE(da->type); { - eattr *e = ea_find(*fs->eattrs, da.ea_code); - - if (!e) { - /* A special case: undefined as_path looks like empty as_path */ - if (da.type == EAF_TYPE_AS_PATH) { - RESULT_(T_PATH, ad, &null_adata); - break; - } + const struct f_val *empty; + const eattr *e = ea_find(fs->rte->attrs, da->id); - /* The same special case for int_set */ - if (da.type == EAF_TYPE_INT_SET) { - RESULT_(T_CLIST, ad, &null_adata); - break; - } - - /* The same special case for ec_set */ - if (da.type == EAF_TYPE_EC_SET) { - RESULT_(T_ECLIST, ad, &null_adata); - break; - } + if (e) + { + ASSERT_DIE(e->type == da->type); - /* The same special case for lc_set */ - if (da.type == EAF_TYPE_LC_SET) { - RESULT_(T_LCLIST, ad, &null_adata); - break; + switch (e->type) { + case T_IP: + RESULT_(T_IP, ip, *((const ip_addr *) e->u.ptr->data)); + break; + default: + RESULT_VAL([[(struct f_val) { + .type = e->type, + .val.bval = e->u, + }]]); } - - /* Undefined value */ - RESULT_VOID; - break; } - - switch (e->type & EAF_TYPE_MASK) { - case EAF_TYPE_INT: - RESULT_(da.f_type, i, e->u.data); - break; - case EAF_TYPE_ROUTER_ID: - RESULT_(T_QUAD, i, e->u.data); - break; - case EAF_TYPE_OPAQUE: - RESULT_(T_ENUM_EMPTY, i, 0); - break; - case EAF_TYPE_IP_ADDRESS: - RESULT_(T_IP, ip, *((ip_addr *) e->u.ptr->data)); - break; - case EAF_TYPE_AS_PATH: - RESULT_(T_PATH, ad, e->u.ptr); - break; - case EAF_TYPE_BITFIELD: - RESULT_(T_BOOL, i, !!(e->u.data & (1u << da.bit))); - break; - case EAF_TYPE_INT_SET: - RESULT_(T_CLIST, ad, e->u.ptr); - break; - case EAF_TYPE_EC_SET: - RESULT_(T_ECLIST, ad, e->u.ptr); - break; - case EAF_TYPE_LC_SET: - RESULT_(T_LCLIST, ad, e->u.ptr); - break; - case EAF_TYPE_UNDEF: + else if (empty = f_get_empty(da->type)) + RESULT_VAL(*empty); + else RESULT_VOID; - break; - default: - bug("Unknown dynamic attribute type"); - } } } INST(FI_EA_SET, 1, 0) { ACCESS_RTE; - ACCESS_EATTRS; ARG_ANY(1); DYNAMIC_ATTR; - ARG_TYPE(1, da.f_type); + ARG_TYPE(1, da->type); { - struct ea_list *l = lp_alloc(fs->pool, sizeof(struct ea_list) + sizeof(eattr)); - - l->next = NULL; - l->flags = EALF_SORTED; - l->count = 1; - l->attrs[0].id = da.ea_code; - l->attrs[0].flags = 0; - l->attrs[0].type = da.type | EAF_ORIGINATED | EAF_FRESH; - - switch (da.type) { - case EAF_TYPE_INT: - case EAF_TYPE_ROUTER_ID: - l->attrs[0].u.data = v1.val.i; - break; - - case EAF_TYPE_OPAQUE: - runtime( "Setting opaque attribute is not allowed" ); - break; + struct eattr *a; - case EAF_TYPE_IP_ADDRESS:; - int len = sizeof(ip_addr); - struct adata *ad = lp_alloc(fs->pool, sizeof(struct adata) + len); - ad->length = len; - (* (ip_addr *) ad->data) = v1.val.ip; - l->attrs[0].u.ptr = ad; - break; + if (da->type >= EAF_TYPE__MAX) + bug("Unsupported attribute type"); - case EAF_TYPE_AS_PATH: - case EAF_TYPE_INT_SET: - case EAF_TYPE_EC_SET: - case EAF_TYPE_LC_SET: - l->attrs[0].u.ptr = v1.val.ad; + switch (da->type) { + case T_OPAQUE: + case T_IFACE: + runtime( "Setting opaque attribute is not allowed" ); break; - case EAF_TYPE_BITFIELD: - { - /* First, we have to find the old value */ - eattr *e = ea_find(*fs->eattrs, da.ea_code); - u32 data = e ? e->u.data : 0; - - if (v1.val.i) - l->attrs[0].u.data = data | (1u << da.bit); - else - l->attrs[0].u.data = data & ~(1u << da.bit); - } + case T_IP: + a = ea_set_attr(&fs->rte->attrs, + EA_LITERAL_STORE_ADATA(da, 0, &v1.val.ip, sizeof(ip_addr))); break; default: - bug("Unknown dynamic attribute type"); + a = ea_set_attr(&fs->rte->attrs, + EA_LITERAL_GENERIC(da->id, da->type, 0, .u = v1.val.bval)); + break; } - f_rta_cow(fs); - l->next = *fs->eattrs; - *fs->eattrs = l; + a->originated = 1; + a->fresh = 1; } } INST(FI_EA_UNSET, 0, 0) { DYNAMIC_ATTR; ACCESS_RTE; - ACCESS_EATTRS; - { - struct ea_list *l = lp_alloc(fs->pool, sizeof(struct ea_list) + sizeof(eattr)); - - l->next = NULL; - l->flags = EALF_SORTED; - l->count = 1; - l->attrs[0].id = da.ea_code; - l->attrs[0].flags = 0; - l->attrs[0].type = EAF_TYPE_UNDEF | EAF_ORIGINATED | EAF_FRESH; - l->attrs[0].u.data = 0; - - f_rta_cow(fs); - l->next = *fs->eattrs; - *fs->eattrs = l; - } + ea_unset_attr(&fs->rte->attrs, 1, da); + } + + INST(FI_DEFAULT, 2, 1) { + ARG_ANY(1); + ARG_ANY(2); + RESULT_TYPE(f_type_element_type(v2.type)); + + log(L_INFO "Type of arg 1 is: %d", v1.type); + + if (v1.type == T_VOID) + RESULT_VAL(v2); + else + RESULT_VAL(v1); } INST(FI_LENGTH, 1, 1) { /* Get length of */ @@ -975,7 +1099,7 @@ RESULT(T_INT, i, v1.val.lc.ldp2); } - INST(FI_MIN, 1, 1) { /* Get minimum element from set */ + INST(FI_MIN, 1, 1) { /* Get minimum element from list */ ARG_ANY(1); RESULT_TYPE(f_type_element_type(v1.type)); switch(v1.type) @@ -1009,7 +1133,7 @@ } } - INST(FI_MAX, 1, 1) { /* Get maximum element from set */ + INST(FI_MAX, 1, 1) { /* Get maximum element from list */ ARG_ANY(1); RESULT_TYPE(f_type_element_type(v1.type)); switch(v1.type) @@ -1043,7 +1167,7 @@ } } - INST(FI_RETURN, 1, 1) { + INST(FI_RETURN, 1, 0) { NEVER_CONSTANT; /* Acquire the return value */ ARG_ANY(1); @@ -1071,28 +1195,59 @@ INST(FI_CALL, 0, 1) { NEVER_CONSTANT; + VARARG; SYMBOL; + /* Fake result type declaration */ + RESULT_TYPE(T_VOID); + + FID_NEW_BODY() + ASSERT(sym->class == SYM_FUNCTION); + + if (whati->varcount != sym->function->args) + cf_error("Function '%s' expects %u arguments, got %u arguments", + sym->name, sym->function->args, whati->varcount); + + /* Typecheck individual arguments */ + struct f_inst *a = fvar; + struct f_arg *b = sym->function->arg_list; + for (uint i = 1; a && b; a = a->next, b = b->next, i++) + { + enum btype b_type = b->arg->class & 0xff; + + if (a->type && (a->type != b_type) && !f_const_promotion(a, b_type)) + cf_error("Argument %u of '%s' must be %s, got %s", + i, sym->name, f_type_name(b_type), f_type_name(a->type)); + } + ASSERT(!a && !b); + + /* Add implicit void slot for the return value */ + struct f_inst *tmp = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_VOID }); + tmp->next = whati->fvar; + whati->fvar = tmp; + what->size += tmp->size; + + /* Mark recursive calls, they have dummy f_line */ + if (!sym->function->len) + what->flags |= FIF_RECURSIVE; + FID_SAME_BODY() - if (!(f1->sym->flags & SYM_FLAG_SAME)) - return 0; + if (!(f1->sym->flags & SYM_FLAG_SAME) && !(f1_->flags & FIF_RECURSIVE)) + return 0; FID_ITERATE_BODY() + if (!(what->flags & FIF_RECURSIVE)) BUFFER_PUSH(fit->lines) = whati->sym->function; FID_INTERPRET_BODY() /* Push the body on stack */ LINEX(sym->function); + curline.vbase = curline.ventry; curline.emask |= FE_RETURN; - /* Before this instruction was called, there was the T_VOID - * automatic return value pushed on value stack and also - * sym->function->args function arguments. Setting the - * vbase to point to first argument. */ - ASSERT(curline.ventry >= sym->function->args); - curline.ventry -= sym->function->args; - curline.vbase = curline.ventry; + /* Arguments on stack */ + fstk->vcnt += sym->function->args; /* Storage for local variables */ f_vcnt_check_overflow(sym->function->vars); @@ -1110,14 +1265,33 @@ FID_MEMBER(struct f_tree *, tree, [[!same_tree(f1->tree, f2->tree)]], "tree %p", item->tree); + FID_LINEARIZE_BODY() + /* Linearize all branches in switch */ + struct f_inst *last_inst = NULL; + struct f_line *last_line = NULL; + for (struct f_tree *t = whati->tree; t; t = t->left) + { + if (t->data != last_inst) + { + last_inst = t->data; + last_line = f_linearize(t->data, 0); + } + + t->data = last_line; + } + + /* Balance the tree */ + item->tree = build_tree(whati->tree); + FID_ITERATE_BODY() - tree_walk(whati->tree, f_add_tree_lines, fit); + tree_walk(whati->tree, f_add_tree_lines, fit); FID_INTERPRET_BODY() - const struct f_tree *t = find_tree(tree, &v1); + /* In parse-time use find_tree_linear(), in runtime use find_tree() */ + const struct f_tree *t = FID_HIC(,find_tree,find_tree_linear)(tree, &v1); if (!t) { v1.type = T_VOID; - t = find_tree(tree, &v1); + t = FID_HIC(,find_tree,find_tree_linear)(tree, &v1); if (!t) { debug( "No else statement?\n"); FID_HIC(,break,return NULL); @@ -1206,17 +1380,10 @@ if (v1.type == T_PATH) { - const struct f_tree *set = NULL; - u32 key = 0; - - if (v2.type == T_INT) - key = v2.val.i; - else if ((v2.type == T_SET) && (v2.val.t->from.type == T_INT)) - set = v2.val.t; + if ((v2.type == T_SET) && path_set_type(v2.val.t) || (v2.type == T_INT)) + RESULT_(T_PATH, ad, [[ as_path_filter(fpool, v1.val.ad, &v2, 0) ]]); else runtime("Can't delete non-integer (set)"); - - RESULT_(T_PATH, ad, [[ as_path_filter(fpool, v1.val.ad, set, key, 0) ]]); } else if (v1.type == T_CLIST) @@ -1268,10 +1435,8 @@ if (v1.type == T_PATH) { - u32 key = 0; - - if ((v2.type == T_SET) && (v2.val.t->from.type == T_INT)) - RESULT_(T_PATH, ad, [[ as_path_filter(fpool, v1.val.ad, v2.val.t, key, 1) ]]); + if ((v2.type == T_SET) && path_set_type(v2.val.t)) + RESULT_(T_PATH, ad, [[ as_path_filter(fpool, v1.val.ad, &v2, 1) ]]); else runtime("Can't filter integer"); } @@ -1309,37 +1474,7 @@ runtime("Can't filter non-[e|l]clist"); } - INST(FI_ROA_CHECK_IMPLICIT, 0, 1) { /* ROA Check */ - NEVER_CONSTANT; - RTC(1); - rtable *table = rtc->table; - ACCESS_RTE; - ACCESS_EATTRS; - const net_addr *net = fs->rte->net; - - /* We ignore temporary attributes, probably not a problem here */ - /* 0x02 is a value of BA_AS_PATH, we don't want to include BGP headers */ - eattr *e = ea_find(*fs->eattrs, EA_CODE(PROTOCOL_BGP, 0x02)); - - if (!e || ((e->type & EAF_TYPE_MASK) != EAF_TYPE_AS_PATH)) - runtime("Missing AS_PATH attribute"); - - u32 as = 0; - as_path_get_last(e->u.ptr, &as); - - if (!table) - runtime("Missing ROA table"); - - if (table->addr_type != NET_ROA4 && table->addr_type != NET_ROA6) - runtime("Table type must be either ROA4 or ROA6"); - - if (table->addr_type != (net->type == NET_IP4 ? NET_ROA4 : NET_ROA6)) - RESULT(T_ENUM_ROA, i, ROA_UNKNOWN); /* Prefix and table type mismatch */ - else - RESULT(T_ENUM_ROA, i, [[ net_roa_check(table, net, as) ]]); - } - - INST(FI_ROA_CHECK_EXPLICIT, 2, 1) { /* ROA Check */ + INST(FI_ROA_CHECK, 2, 1) { /* ROA Check */ NEVER_CONSTANT; ARG(1, T_NET); ARG(2, T_INT); @@ -1361,7 +1496,7 @@ } - INST(FI_FORMAT, 1, 0) { /* Format */ + INST(FI_FORMAT, 1, 1) { /* Format */ ARG_ANY(1); RESULT(T_STRING, s, val_format_str(fpool, &v1)); } diff --git a/filter/f-inst.h b/filter/f-inst.h index df45f88e..fbc59de7 100644 --- a/filter/f-inst.h +++ b/filter/f-inst.h @@ -22,7 +22,7 @@ /* Flags for instructions */ enum f_instruction_flags { - FIF_PRINTED = 1, /* FI_PRINT_AND_DIE: message put in buffer */ + FIF_RECURSIVE = 1, /* FI_CALL: function is directly recursive */ } PACKED; /* Include generated filter instruction declarations */ @@ -35,19 +35,26 @@ const char *f_instruction_name_(enum f_instruction_code fi); static inline const char *f_instruction_name(enum f_instruction_code fi) { return f_instruction_name_(fi) + 3; } +struct f_arg { + struct symbol *arg; + struct f_arg *next; +}; + /* Filter structures for execution */ /* Line of instructions to be unconditionally executed one after another */ struct f_line { uint len; /* Line length */ u8 args; /* Function: Args required */ u8 vars; + u8 results; /* Results left on stack: cmd -> 0, term -> 1 */ + struct f_arg *arg_list; struct f_line_item items[0]; /* The items themselves */ }; /* Convert the f_inst infix tree to the f_line structures */ -struct f_line *f_linearize_concat(const struct f_inst * const inst[], uint count); -static inline struct f_line *f_linearize(const struct f_inst *root) -{ return f_linearize_concat(&root, 1); } +struct f_line *f_linearize_concat(const struct f_inst * const inst[], uint count, uint results); +static inline struct f_line *f_linearize(const struct f_inst *root, uint results) +{ return f_linearize_concat(&root, 1, results); } void f_dump_line(const struct f_line *, uint indent); @@ -87,15 +94,17 @@ void f_add_lines(const struct f_line_item *what, struct filter_iterator *fit); struct filter *f_new_where(struct f_inst *); -static inline struct f_dynamic_attr f_new_dynamic_attr(u8 type, enum f_type f_type, uint code) /* Type as core knows it, type as filters know it, and code of dynamic attribute */ -{ return (struct f_dynamic_attr) { .type = type, .f_type = f_type, .ea_code = code }; } /* f_type currently unused; will be handy for static type checking */ -static inline struct f_dynamic_attr f_new_dynamic_attr_bit(u8 bit, enum f_type f_type, uint code) /* Type as core knows it, type as filters know it, and code of dynamic attribute */ -{ return (struct f_dynamic_attr) { .type = EAF_TYPE_BITFIELD, .bit = bit, .f_type = f_type, .ea_code = code }; } /* f_type currently unused; will be handy for static type checking */ -static inline struct f_static_attr f_new_static_attr(int f_type, int code, int readonly) -{ return (struct f_static_attr) { .f_type = f_type, .sa_code = code, .readonly = readonly }; } -struct f_inst *f_generate_complex(enum f_instruction_code fi_code, struct f_dynamic_attr da, struct f_inst *argument); +static inline struct f_static_attr f_new_static_attr(btype type, int code, int readonly) +{ return (struct f_static_attr) { .type = type, .sa_code = code, .readonly = readonly }; } struct f_inst *f_generate_roa_check(struct rtable_config *table, struct f_inst *prefix, struct f_inst *asn); +struct f_attr_bit { + const struct ea_class *class; + uint bit; +}; + +#define f_new_dynamic_attr_bit(_bit, _name) ((struct f_attr_bit) { .bit = _bit, .class = ea_class_find(_name) }) + /* Hook for call bt_assert() function in configuration */ extern void (*bt_assert_hook)(int result, const struct f_line_item *assert); diff --git a/filter/f-util.c b/filter/f-util.c index 410999a6..82a06bdd 100644 --- a/filter/f-util.c +++ b/filter/f-util.c @@ -2,7 +2,7 @@ * Filters: utility functions * * Copyright 1998 Pavel Machek <pavel@ucw.cz> - * 2017 Jan Maria Matejka <mq@ucw.cz> + * 2017 Maria Matejka <mq@ucw.cz> * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -13,7 +13,7 @@ #include "filter/f-inst.h" #include "lib/idm.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #define P(a,b) ((a<<8) | b) @@ -37,147 +37,6 @@ struct filter *f_new_where(struct f_inst *where) f_new_inst(FI_DIE, F_REJECT)); struct filter *f = cfg_allocz(sizeof(struct filter)); - f->root = f_linearize(cond); + f->root = f_linearize(cond, 0); return f; } - -#define CA_KEY(n) n->name, n->fda.type -#define CA_NEXT(n) n->next -#define CA_EQ(na,ta,nb,tb) (!strcmp(na,nb) && (ta == tb)) -#define CA_FN(n,t) (mem_hash(n, strlen(n)) ^ (t*0xaae99453U)) -#define CA_ORDER 8 /* Fixed */ - -struct ca_storage { - struct ca_storage *next; - struct f_dynamic_attr fda; - u32 uc; - char name[0]; -}; - -HASH(struct ca_storage) ca_hash; - -static struct idm ca_idm; -static struct ca_storage **ca_storage; -static uint ca_storage_max; - -static void -ca_free(resource *r) -{ - struct custom_attribute *ca = (void *) r; - struct ca_storage *cas = HASH_FIND(ca_hash, CA, ca->name, ca->fda->type); - ASSERT(cas); - - ca->name = NULL; - ca->fda = NULL; - if (!--cas->uc) { - uint id = EA_CUSTOM_ID(cas->fda.ea_code); - idm_free(&ca_idm, id); - HASH_REMOVE(ca_hash, CA, cas); - ca_storage[id] = NULL; - mb_free(cas); - } -} - -static void -ca_dump(resource *r) -{ - struct custom_attribute *ca = (void *) r; - debug("name \"%s\" id 0x%04x ea_type 0x%02x f_type 0x%02x\n", - ca->name, ca->fda->ea_code, ca->fda->type, ca->fda->f_type); -} - -static struct resclass ca_class = { - .name = "Custom attribute", - .size = sizeof(struct custom_attribute), - .free = ca_free, - .dump = ca_dump, - .lookup = NULL, - .memsize = NULL, -}; - -struct custom_attribute * -ca_lookup(pool *p, const char *name, int f_type) -{ - int ea_type; - - switch (f_type) { - case T_INT: - ea_type = EAF_TYPE_INT; - break; - case T_IP: - ea_type = EAF_TYPE_IP_ADDRESS; - break; - case T_QUAD: - ea_type = EAF_TYPE_ROUTER_ID; - break; - case T_PATH: - ea_type = EAF_TYPE_AS_PATH; - break; - case T_CLIST: - ea_type = EAF_TYPE_INT_SET; - break; - case T_ECLIST: - ea_type = EAF_TYPE_EC_SET; - break; - case T_LCLIST: - ea_type = EAF_TYPE_LC_SET; - break; - default: - cf_error("Custom route attribute of unsupported type"); - } - - static int inited = 0; - if (!inited) { - idm_init(&ca_idm, &root_pool, 8); - HASH_INIT(ca_hash, &root_pool, CA_ORDER); - - ca_storage_max = 256; - ca_storage = mb_allocz(&root_pool, sizeof(struct ca_storage *) * ca_storage_max); - - inited++; - } - - struct ca_storage *cas = HASH_FIND(ca_hash, CA, name, ea_type); - if (cas) { - cas->uc++; - } else { - - uint id = idm_alloc(&ca_idm); - - if (id >= EA_CUSTOM_BIT) - cf_error("Too many custom attributes."); - - if (id >= ca_storage_max) { - ca_storage_max *= 2; - ca_storage = mb_realloc(ca_storage, sizeof(struct ca_storage *) * ca_storage_max * 2); - } - - cas = mb_allocz(&root_pool, sizeof(struct ca_storage) + strlen(name) + 1); - cas->fda = f_new_dynamic_attr(ea_type, f_type, EA_CUSTOM(id)); - cas->uc = 1; - - strcpy(cas->name, name); - ca_storage[id] = cas; - - HASH_INSERT(ca_hash, CA, cas); - } - - struct custom_attribute *ca = ralloc(p, &ca_class); - ca->fda = &(cas->fda); - ca->name = cas->name; - return ca; -} - -const char * -ea_custom_name(uint ea) -{ - uint id = EA_CUSTOM_ID(ea); - if (id >= ca_storage_max) - return NULL; - - if (!ca_storage[id]) - return NULL; - - return ca_storage[id]->name; -} - diff --git a/filter/filter.c b/filter/filter.c index 625b3ade..0aff4d30 100644 --- a/filter/filter.c +++ b/filter/filter.c @@ -35,10 +35,10 @@ #include "lib/ip.h" #include "lib/net.h" #include "lib/flowspec.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/attrs.h" +#include "lib/attrs.h" #include "conf/conf.h" #include "filter/filter.h" #include "filter/f-inst.h" @@ -76,12 +76,6 @@ struct filter_state { /* The route we are processing. This may be NULL to indicate no route available. */ struct rte *rte; - /* Cached pointer to ea_list */ - struct ea_list **eattrs; - - /* Linpool for adata allocation */ - struct linpool *pool; - /* Buffer for log output */ struct buffer buf; @@ -97,32 +91,6 @@ void (*bt_assert_hook)(int result, const struct f_line_item *assert); #define f_stack_init(fs) ( _f_stack_init(fs, v, 128), _f_stack_init(fs, e, 128) ) -static inline void f_cache_eattrs(struct filter_state *fs) -{ - fs->eattrs = &(fs->rte->attrs->eattrs); -} - -/* - * rta_cow - prepare rta for modification by filter - */ -static void -f_rta_cow(struct filter_state *fs) -{ - if (!rta_is_cached(fs->rte->attrs)) - return; - - /* - * Get shallow copy of rta. Fields eattrs and nexthops of rta are shared - * with fs->old_rta (they will be copied when the cached rta will be obtained - * at the end of f_run()), also the lock of hostentry is inherited (we - * suppose hostentry is not changed by filters). - */ - fs->rte->attrs = rta_do_cow(fs->rte->attrs, fs->pool); - - /* Re-cache the ea_list */ - f_cache_eattrs(fs); -} - static struct tbf rl_runtime_err = TBF_DEFAULT_LOG_LIMITS; /** @@ -185,10 +153,8 @@ interpret(struct filter_state *fs, const struct f_line *line, struct f_val *val) return F_ERROR; \ } while(0) -#define falloc(size) lp_alloc(fs->pool, size) -#define fpool fs->pool - -#define ACCESS_EATTRS do { if (!fs->eattrs) f_cache_eattrs(fs); } while (0) +#define falloc(size) tmp_alloc(size) +#define fpool tmp_linpool #include "filter/inst-interpret.c" #undef res @@ -198,13 +164,11 @@ interpret(struct filter_state *fs, const struct f_line *line, struct f_val *val) #undef runtime #undef falloc #undef fpool -#undef ACCESS_EATTRS } } /* End of current line. Drop local variables before exiting. */ - fstk->vcnt -= curline.line->vars; - fstk->vcnt -= curline.line->args; + fstk->vcnt = curline.ventry + curline.line->results; fstk->ecnt--; } @@ -237,7 +201,7 @@ interpret(struct filter_state *fs, const struct f_line *line, struct f_val *val) * tmp_pool, otherwise the filters may modify it. */ enum filter_return -f_run(const struct filter *filter, struct rte *rte, struct linpool *tmp_pool, int flags) +f_run(const struct filter *filter, struct rte *rte, int flags) { if (filter == FILTER_ACCEPT) return F_ACCEPT; @@ -250,7 +214,6 @@ f_run(const struct filter *filter, struct rte *rte, struct linpool *tmp_pool, in /* Initialize the filter state */ filter_state = (struct filter_state) { .rte = rte, - .pool = tmp_pool, .flags = flags, }; @@ -285,11 +248,10 @@ f_run(const struct filter *filter, struct rte *rte, struct linpool *tmp_pool, in */ enum filter_return -f_eval_rte(const struct f_line *expr, struct rte *rte, struct linpool *tmp_pool) +f_eval_rte(const struct f_line *expr, struct rte *rte) { filter_state = (struct filter_state) { .rte = rte, - .pool = tmp_pool, }; f_stack_init(filter_state); @@ -308,11 +270,9 @@ f_eval_rte(const struct f_line *expr, struct rte *rte, struct linpool *tmp_pool) * @pres: here the output will be stored */ enum filter_return -f_eval(const struct f_line *expr, struct linpool *tmp_pool, struct f_val *pres) +f_eval(const struct f_line *expr, struct f_val *pres) { - filter_state = (struct filter_state) { - .pool = tmp_pool, - }; + filter_state = (struct filter_state) {}; f_stack_init(filter_state); @@ -331,9 +291,7 @@ uint f_eval_int(const struct f_line *expr) { /* Called independently in parse-time to eval expressions */ - filter_state = (struct filter_state) { - .pool = cfg_mem, - }; + filter_state = (struct filter_state) {}; f_stack_init(filter_state); @@ -354,10 +312,10 @@ f_eval_int(const struct f_line *expr) * f_eval_buf - get a value of a term and print it to the supplied buffer */ enum filter_return -f_eval_buf(const struct f_line *expr, struct linpool *tmp_pool, buffer *buf) +f_eval_buf(const struct f_line *expr, buffer *buf) { struct f_val val; - enum filter_return fret = f_eval(expr, tmp_pool, &val); + enum filter_return fret = f_eval(expr, &val); if (fret <= F_RETURN) val_format(&val, buf); return fret; diff --git a/filter/filter.h b/filter/filter.h index 9964831c..3f2e62eb 100644 --- a/filter/filter.h +++ b/filter/filter.h @@ -13,8 +13,8 @@ #include "lib/resource.h" #include "lib/ip.h" #include "lib/macro.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" /* Possible return values of filter execution */ enum filter_return { @@ -51,10 +51,10 @@ struct filter { struct rte; -enum filter_return f_run(const struct filter *filter, struct rte *rte, struct linpool *tmp_pool, int flags); -enum filter_return f_eval_rte(const struct f_line *expr, struct rte *rte, struct linpool *tmp_pool); +enum filter_return f_run(const struct filter *filter, struct rte *rte, int flags); +enum filter_return f_eval_rte(const struct f_line *expr, struct rte *rte); uint f_eval_int(const struct f_line *expr); -enum filter_return f_eval_buf(const struct f_line *expr, struct linpool *tmp_pool, buffer *buf); +enum filter_return f_eval_buf(const struct f_line *expr, buffer *buf); const char *filter_name(const struct filter *filter); int filter_same(const struct filter *new, const struct filter *old); @@ -70,13 +70,4 @@ void filters_dump_all(void); #define FF_SILENT 2 /* Silent filter execution */ -/* Custom route attributes */ -struct custom_attribute { - resource r; - struct f_dynamic_attr *fda; - const char *name; -}; - -struct custom_attribute *ca_lookup(pool *p, const char *name, int ea_type); - #endif diff --git a/filter/filter_test.c b/filter/filter_test.c index 2a0b5431..aa325aef 100644 --- a/filter/filter_test.c +++ b/filter/filter_test.c @@ -46,9 +46,7 @@ run_function(const void *arg) if (t->cmp) return t->result == f_same(t->fn, t->cmp); - linpool *tmp = lp_new_default(&root_pool); - enum filter_return fret = f_eval(t->fn, tmp, NULL); - rfree(tmp); + enum filter_return fret = f_eval(t->fn, NULL); return (fret < F_REJECT); } @@ -81,12 +79,11 @@ main(int argc, char *argv[]) if (!bt_config_file_parse(BT_CONFIG_FILE)) abort(); - bt_test_suite(t_reconfig, "Testing reconfiguration"); + bt_test_suite_extra(t_reconfig, 0, BT_TIMEOUT, "Testing reconfiguration"); struct f_bt_test_suite *t; WALK_LIST(t, config->tests) - bt_test_suite_base(run_function, t->fn_name, t, BT_FORKING, BT_TIMEOUT, "%s", t->dsc); + bt_test_suite_base(run_function, t->fn_name, t, 0, BT_TIMEOUT, "%s", t->dsc); - bt_bird_cleanup(); return bt_exit_value(); } diff --git a/filter/test.conf b/filter/test.conf index 5d766a8e..17aaf8cf 100644 --- a/filter/test.conf +++ b/filter/test.conf @@ -9,7 +9,109 @@ router id 62.168.0.1; /* We have to setup any protocol */ protocol device { } - +/* Setting some custom attributes, enough to force BIRD to reallocate the attribute idmap */ +attribute int test_ca_int1; +attribute int test_ca_int2; +attribute int test_ca_int3; +attribute int test_ca_int4; +attribute int test_ca_int5; +attribute int test_ca_int6; +attribute int test_ca_int7; +attribute int test_ca_int8; +attribute int test_ca_int9; +attribute int test_ca_int10; + +attribute ip test_ca_ip1; +attribute ip test_ca_ip2; +attribute ip test_ca_ip3; +attribute ip test_ca_ip4; +attribute ip test_ca_ip5; +attribute ip test_ca_ip6; +attribute ip test_ca_ip7; +attribute ip test_ca_ip8; +attribute ip test_ca_ip9; +attribute ip test_ca_ip10; + +attribute quad test_ca_quad1; +attribute quad test_ca_quad2; +attribute quad test_ca_quad3; +attribute quad test_ca_quad4; +attribute quad test_ca_quad5; +attribute quad test_ca_quad6; +attribute quad test_ca_quad7; +attribute quad test_ca_quad8; +attribute quad test_ca_quad9; +attribute quad test_ca_quad10; + +attribute bgppath test_ca_bgppath1; +attribute bgppath test_ca_bgppath2; +attribute bgppath test_ca_bgppath3; +attribute bgppath test_ca_bgppath4; +attribute bgppath test_ca_bgppath5; +attribute bgppath test_ca_bgppath6; +attribute bgppath test_ca_bgppath7; +attribute bgppath test_ca_bgppath8; +attribute bgppath test_ca_bgppath9; +attribute bgppath test_ca_bgppath10; + +attribute clist test_ca_clist1; +attribute clist test_ca_clist2; +attribute clist test_ca_clist3; +attribute clist test_ca_clist4; +attribute clist test_ca_clist5; +attribute clist test_ca_clist6; +attribute clist test_ca_clist7; +attribute clist test_ca_clist8; +attribute clist test_ca_clist9; +attribute clist test_ca_clist10; + +attribute eclist test_ca_eclist1; +attribute eclist test_ca_eclist2; +attribute eclist test_ca_eclist3; +attribute eclist test_ca_eclist4; +attribute eclist test_ca_eclist5; +attribute eclist test_ca_eclist6; +attribute eclist test_ca_eclist7; +attribute eclist test_ca_eclist8; +attribute eclist test_ca_eclist9; +attribute eclist test_ca_eclist10; + +attribute lclist test_ca_lclist1; +attribute lclist test_ca_lclist2; +attribute lclist test_ca_lclist3; +attribute lclist test_ca_lclist4; +attribute lclist test_ca_lclist5; +attribute lclist test_ca_lclist6; +attribute lclist test_ca_lclist7; +attribute lclist test_ca_lclist8; +attribute lclist test_ca_lclist9; +attribute lclist test_ca_lclist10; + +attribute lclist test_ca_lclist_max1; +attribute lclist test_ca_lclist_max2; +attribute lclist test_ca_lclist_max3; +attribute lclist test_ca_lclist_max4; +attribute lclist test_ca_lclist_max5; +attribute lclist test_ca_lclist_max6; +attribute lclist test_ca_lclist_max7; +attribute lclist test_ca_lclist_max8; +attribute lclist test_ca_lclist_max9; +attribute lclist test_ca_lclist_max10; +attribute lclist test_ca_lclist_max11; +attribute lclist test_ca_lclist_max12; +attribute lclist test_ca_lclist_max13; +attribute lclist test_ca_lclist_max14; +attribute lclist test_ca_lclist_max15; +attribute lclist test_ca_lclist_max16; +attribute lclist test_ca_lclist_max17; +attribute lclist test_ca_lclist_max18; +attribute lclist test_ca_lclist_max19; +attribute lclist test_ca_lclist_max20; +attribute lclist test_ca_lclist_max21; + + +/* Uncomment this to get an error */ +#attribute int bgp_path; /* * Common definitions and functions @@ -44,9 +146,8 @@ bt_test_same(onef, twof, 0); */ function t_bool() -bool b; { - b = true; + bool b = true; bt_assert(b); bt_assert(!!b); @@ -82,12 +183,11 @@ define xyzzy = (120+10); define '1a-a1' = (xyzzy-100); function t_int() -int i; { bt_assert(xyzzy = 130); bt_assert('1a-a1' = 30); - i = four; + int i = four; i = 12*100 + 60/2 + i; i = (i + 0); bt_assert(i = 1234); @@ -111,6 +211,26 @@ int i; bt_assert(!(i = 4)); bt_assert(1 <= 1); bt_assert(!(1234 < 1234)); + + bt_assert(10 - 5 = 5); + bt_assert(4294967295 + 1 = 0); + bt_assert(6*9=54); + bt_assert(984/41 = 24); + bt_assert(123/45 = 2); + bt_assert(0xfee1a | 0xbeef = 0xffeff); + bt_assert(0xfee1a & 0xbeef = 0xae0a); + + case i { + 4200000000: bt_assert(true); + else: bt_assert(false); + } + + case four { + 4: bt_assert(true); + else: bt_assert(false); + } + + } bt_test_suite(t_int, "Testing integers"); @@ -128,14 +248,19 @@ define is2 = [(17+2), 17, 15, 11, 8, 5, 3, 2]; define is3 = [5, 17, 2, 11, 8, 15, 3, 19]; function t_int_set() -int set is; { + int set is = []; + bt_assert(is = []); + bt_assert(0 !~ is); + bt_assert(1 ~ [1,2,3]); bt_assert(5 ~ [1..20]); bt_assert(2 ~ [ 1, 2, 3 ]); bt_assert(5 ~ [ 4 .. 7 ]); bt_assert(1 !~ [ 2, 3, 4 ]); bt_assert(999 !~ [ 666, 333 ]); + bt_assert(1 !~ []); + bt_assert(1 !~ is); is = [ 2, 3, 4, 7..11 ]; bt_assert(10 ~ is); @@ -170,6 +295,7 @@ int set is; bt_assert([1,4..10,20] = [1,4..10,20]); bt_assert(format([ 1, 2, 1, 1, 1, 3, 4, 1, 1, 1, 5 ]) = "[1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 5]"); + bt_assert(format([]) = "[]"); } bt_test_suite(t_int_set, "Testing sets of integers"); @@ -183,9 +309,8 @@ bt_test_suite(t_int_set, "Testing sets of integers"); */ function t_string() -string st; { - st = "Hello"; + string st = "Hello"; bt_assert(format(st) = "Hello"); bt_assert(st ~ "Hell*"); bt_assert(st ~ "?ello"); @@ -210,9 +335,8 @@ function 'mkpair-a'(int a) } function t_pair() -pair pp; { - pp = (1, 2); + pair pp = (1, 2); bt_assert(format(pp) = "(1,2)"); bt_assert((1,2) = pp); bt_assert((1,1+1) = pp); @@ -233,10 +357,11 @@ bt_test_suite(t_pair, "Testing pairs"); */ function t_pair_set() -pair pp; -pair set ps; { - pp = (1, 2); + pair pp = (1, 2); + pair set ps = []; + bt_assert(pp !~ ps); + ps = [(1,(one+one)), (3,4)..(4,8), (5,*), (6,3..6)]; bt_assert(format(ps) = "[(1,2), (3,4)..(4,8), (5,0)..(5,65535), (6,3)..(6,6)]"); bt_assert(pp ~ ps); @@ -253,6 +378,7 @@ pair set ps; bt_assert((6,6+one) !~ ps); bt_assert(((one+6),2) !~ ps); bt_assert((1,1) !~ ps); + bt_assert(pp !~ []); ps = [(20..150, 200..300), (50100..50200, 1000..50000), (*, 5+5)]; bt_assert((100,200) ~ ps); @@ -304,6 +430,7 @@ quad qq; qq = 1.2.3.4; bt_assert(qq ~ [1.2.3.4, 5.6.7.8]); bt_assert(qq !~ [1.2.1.1, 1.2.3.5]); + bt_assert(qq !~ []); } bt_test_suite(t_quad_set, "Testing sets of quads"); @@ -384,6 +511,7 @@ ip set ips; bt_assert(1.2.3.4 !~ [ 1.2.3.3, 1.2.3.5 ]); bt_assert(1.2.3.4 ~ [ 1.2.3.3..1.2.3.5 ]); + bt_assert(1.2.3.4 !~ []); } bt_test_suite(t_ip_set, "Testing sets of ip address"); @@ -398,9 +526,9 @@ bt_test_suite(t_ip_set, "Testing sets of ip address"); function t_enum() { - bt_assert(format(RTS_STATIC) = "(enum 30)1"); - bt_assert(format(NET_IP4) = "(enum 36)1"); - bt_assert(format(NET_VPN6) = "(enum 36)4"); + bt_assert(format(RTS_STATIC) = "(enum 31)1"); + bt_assert(format(NET_IP4) = "(enum 3b)1"); + bt_assert(format(NET_VPN6) = "(enum 3b)4"); bt_assert(RTS_STATIC ~ [RTS_STATIC, RTS_DEVICE]); bt_assert(RTS_BGP !~ [RTS_STATIC, RTS_DEVICE]); @@ -472,13 +600,34 @@ function test_pxset(prefix set pxs) bt_assert(1.0.0.0/8 ~ [ 1.0.0.0/8+ ]); bt_assert(1.0.0.0/9 !~ [ 1.0.0.0/8- ]); bt_assert(1.2.0.0/17 !~ [ 1.0.0.0/8{ 15 , 16 } ]); + bt_assert(net10 !~ []); bt_assert([ 10.0.0.0/8{ 15 , 17 } ] = [ 10.0.0.0/8{ 15 , 17 } ]); } +function test_empty_pxset(prefix set pxs) +int set s0; +prefix set s1; +{ + s0 = []; + s1 = []; + bt_assert(pxs != s0); + bt_assert(pxs = s1); + bt_assert(pxs = []); +} + function t_prefix_set() prefix set pxs; { + pxs = []; + bt_assert(format(pxs) = "[]"); + bt_assert(pxs = []); + bt_assert(1.2.0.0/16 !~ []); + bt_assert(1.2.0.0/16 !~ pxs); + + test_empty_pxset([]); + test_empty_pxset(pxs); + pxs = [ 1.2.0.0/16, 1.4.0.0/16+, 44.66.88.64/30{24,28}, 12.34.56.0/24{8,16} ]; bt_assert(format(pxs) = "[1.2.0.0/16{0.1.0.0}, 1.4.0.0/16{0.1.255.255}, 12.34.0.0/16{1.255.0.0}, 44.66.88.64/28{0.0.1.240}]"); @@ -498,6 +647,33 @@ prefix set pxs; bt_assert(1.2.0.0/16 ~ [ 1.0.0.0/8{ 15 , 17 } ]); bt_assert([ 10.0.0.0/8{ 15 , 17 } ] != [ 11.0.0.0/8{ 15 , 17 } ]); + + /* Formatting of prefix sets, some cases are a bit strange */ + bt_assert(format([ 0.0.0.0/0 ]) = "[0.0.0.0/0]"); + bt_assert(format([ 10.10.0.0/32 ]) = "[10.10.0.0/32{0.0.0.1}]"); + bt_assert(format([ 10.10.0.0/17 ]) = "[10.10.0.0/17{0.0.128.0}]"); + bt_assert(format([ 10.10.0.0/17{17,19} ]) = "[10.10.0.0/17{0.0.224.0}]"); # 224 = 128+64+32 + bt_assert(format([ 10.10.128.0/17{18,19} ]) = "[10.10.128.0/18{0.0.96.0}, 10.10.192.0/18{0.0.96.0}]"); # 96 = 64+32 + bt_assert(format([ 10.10.64.0/18- ]) = "[0.0.0.0/0, 0.0.0.0/1{128.0.0.0}, 0.0.0.0/2{64.0.0.0}, 0.0.0.0/3{32.0.0.0}, 10.10.0.0/16{255.255.0.0}, 10.10.0.0/17{0.0.128.0}, 10.10.64.0/18{0.0.64.0}]"); + bt_assert(format([ 10.10.64.0/18+ ]) = "[10.10.64.0/18{0.0.96.0}, 10.10.64.0/20{0.0.31.255}, 10.10.80.0/20{0.0.31.255}, 10.10.96.0/20{0.0.31.255}, 10.10.112.0/20{0.0.31.255}]"); + + bt_assert(format([ 10.10.160.0/19 ]) = "[10.10.160.0/19{0.0.32.0}]"); + bt_assert(format([ 10.10.160.0/19{19,22} ]) = "[10.10.160.0/19{0.0.32.0}, 10.10.160.0/20{0.0.28.0}, 10.10.176.0/20{0.0.28.0}]"); # 28 = 16+8+4 + bt_assert(format([ 10.10.160.0/19+ ]) = "[10.10.160.0/19{0.0.32.0}, 10.10.160.0/20{0.0.31.255}, 10.10.176.0/20{0.0.31.255}]"); + + bt_assert(format([ ::/0 ]) = "[::/0]"); + bt_assert(format([ 11:22:33:44:55:66:77:88/128 ]) = "[11:22:33:44:55:66:77:88/128{::1}]"); + bt_assert(format([ 11:22:33:44::/64 ]) = "[11:22:33:44::/64{0:0:0:1::}]"); + bt_assert(format([ 11:22:33:44::/64+ ]) = "[11:22:33:44::/64{::1:ffff:ffff:ffff:ffff}]"); + + bt_assert(format([ 11:22:33:44::/65 ]) = "[11:22:33:44::/65{::8000:0:0:0}]"); + bt_assert(format([ 11:22:33:44::/65{65,67} ]) = "[11:22:33:44::/65{::e000:0:0:0}]"); # e = 8+4+2 + bt_assert(format([ 11:22:33:44:8000::/65{66,67} ]) = "[11:22:33:44:8000::/66{::6000:0:0:0}, 11:22:33:44:c000::/66{::6000:0:0:0}]"); # 6 = 4+2 + bt_assert(format([ 11:22:33:44:4000::/66- ]) = "[::/0, ::/1{8000::}, ::/2{4000::}, ::/3{2000::}, 11:22:33:44::/64{ffff:ffff:ffff:ffff::}, 11:22:33:44::/65{::8000:0:0:0}, 11:22:33:44:4000::/66{::4000:0:0:0}]"); + bt_assert(format([ 11:22:33:44:4000::/66+ ]) = "[11:22:33:44:4000::/66{::6000:0:0:0}, 11:22:33:44:4000::/68{::1fff:ffff:ffff:ffff}, 11:22:33:44:5000::/68{::1fff:ffff:ffff:ffff}, 11:22:33:44:6000::/68{::1fff:ffff:ffff:ffff}, 11:22:33:44:7000::/68{::1fff:ffff:ffff:ffff}]"); + bt_assert(format([ 11:22:33:44:c000::/67 ]) = "[11:22:33:44:c000::/67{::2000:0:0:0}]"); + bt_assert(format([ 11:22:33:44:c000::/67{67,71} ]) = "[11:22:33:44:c000::/67{::2000:0:0:0}, 11:22:33:44:c000::/68{::1e00:0:0:0}, 11:22:33:44:d000::/68{::1e00:0:0:0}]"); + bt_assert(format([ 11:22:33:44:c000::/67+ ]) = "[11:22:33:44:c000::/67{::2000:0:0:0}, 11:22:33:44:c000::/68{::1fff:ffff:ffff:ffff}, 11:22:33:44:d000::/68{::1fff:ffff:ffff:ffff}]"); } bt_test_suite(t_prefix_set, "Testing prefix sets"); @@ -536,6 +712,12 @@ bt_test_suite(t_prefix6, "Testing prefix IPv6"); function t_prefix6_set() prefix set pxs; { + pxs = []; + bt_assert(format(pxs) = "[]"); + bt_assert(pxs = []); + bt_assert(12::34/128 !~ []); + bt_assert(12::34/128 !~ pxs); + bt_assert(1180::/16 ~ [ 1100::/8{15, 17} ]); bt_assert(12::34 = 12::34); bt_assert(12::34 ~ [ 12::33..12::35 ]); @@ -577,6 +759,12 @@ prefix set pxs; bt_assert(2000::/29 !~ pxs); bt_assert(1100::/10 !~ pxs); bt_assert(2010::/26 !~ pxs); + + pxs = [ 52E0::/13{13,128} ]; + bt_assert(52E7:BE81:379B:E6FD:541F:B0D0::/93 ~ pxs); + + pxs = [ 41D8:8718::/30{0,30}, 413A:99A8:6C00::/38{38,128} ]; + bt_assert(4180::/9 ~ pxs); } bt_test_suite(t_prefix6_set, "Testing prefix IPv6 sets"); @@ -647,6 +835,7 @@ int set set12; bt_assert(3 ~ p2); bt_assert(p2 ~ [2, 10..20]); bt_assert(p2 ~ [4, 10..20]); + bt_assert(p2 !~ []); p2 = prepend(p2, 5); bt_assert(p2 !~ pm1); @@ -657,6 +846,8 @@ int set set12; bt_assert(p2 ~ [= 5 [2, 4, 6] 3 [1..2] 1 =]); bt_assert(p2 ~ [= 5 set35 3 set12 set12 =]); bt_assert(p2 ~ mkpath(5, 4)); + bt_assert(p2 ~ [= * [3] * =]); + bt_assert(p2 !~ [= * [] * =]); bt_assert(p2.len = 5); bt_assert(p2.first = 5); @@ -665,6 +856,10 @@ int set set12; bt_assert(p2.len = 5); bt_assert(delete(p2, 3) = prepend(prepend(prepend(prepend(+empty+, 1), 2), 4), 5)); bt_assert(filter(p2, [1..3]) = prepend(prepend(prepend(+empty+, 1), 2), 3)); + bt_assert(delete(p2, []) = p2); + bt_assert(filter(p2, []) = +empty+); + bt_assert(delete(prepend(prepend(+empty+, 0), 1), []) = prepend(prepend(+empty+, 0), 1)); + bt_assert(filter(prepend(prepend(+empty+, 0), 1), []) = +empty+); p2 = prepend( + empty +, 5 ); p2 = prepend( p2, 4 ); @@ -684,6 +879,15 @@ int set set12; bt_assert(delete(p2, [4..5]) = prepend(prepend(prepend(prepend(+empty+, 3), 3), 2), 1)); bt_assert(format([= 1 2+ 3 =]) = "[= 1 2 + 3 =]"); + + # iteration over path + int x = 0; + int y = 0; + for int i in p2 do { + x = x + i; + y = y + x; + } + bt_assert(x = 18 && y = 50); } bt_test_suite(t_path, "Testing paths"); @@ -725,6 +929,7 @@ clist r; bt_assert(l ~ [(2,2..3)]); bt_assert(l ~ [(1,1..2)]); bt_assert(l ~ [(1,1)..(1,2)]); + bt_assert(l !~ []); l = add(l, (2,5)); l = add(l, (5,one)); @@ -762,6 +967,9 @@ clist r; bt_assert(l !~ [(*,(one+6))]); bt_assert(l !~ [(*, (one+one+one))]); + bt_assert(delete(l, []) = l); + bt_assert(filter(l, []) = -empty-); + l = delete(l, [(*,(one+onef(3)))]); l = delete(l, [(*,(4+one))]); bt_assert(l = add(-empty-, (3,1))); @@ -806,6 +1014,12 @@ clist r; bt_assert(format(r) = "(clist (2,1) (1,3) (2,2) (3,1) (2,3))"); bt_assert(r.min = (1,3)); bt_assert(r.max = (3,1)); + + # iteration over clist + int x = 0; + for pair c in r do + x = x + c.asn * c.asn * c.data; + bt_assert(x = 36); } bt_test_suite(t_clist, "Testing lists of communities"); @@ -879,11 +1093,15 @@ eclist r; bt_assert((ro, 10.20.30.40, 100) !~ el); bt_assert(el !~ [(rt, 10, 35..40)]); bt_assert(el !~ [(ro, 10, *)]); + bt_assert(el !~ []); el = add(el, (rt, 10, 40)); el2 = filter(el, [(rt, 10, 20..40)] ); el2 = add(el2, (rt, 10, 50)); + bt_assert(delete(el, []) = el); + bt_assert(filter(el, []) = --empty--); + # eclist A (1,30,40) bt_assert(el = add(add(add(--empty--, (rt, 10, 1)), (rt, 10, 30)), (rt, 10, 40))); bt_assert(format(el) = "(eclist (rt, 10, 1) (rt, 10, 30) (rt, 10, 40))"); @@ -917,6 +1135,13 @@ eclist r; bt_assert(format(r) = "(eclist (rt, 2, 1) (rt, 1, 3) (rt, 2, 2) (rt, 3, 1) (rt, 2, 3))"); bt_assert(r.min = (rt, 1, 3)); bt_assert(r.max = (rt, 3, 1)); + + # iteration over eclist + int x = 0; + for ec c in r do + if c > (rt, 2, 0) && c < (rt, 3, 0) then + x = x + 1; + bt_assert(x = 3); } bt_test_suite(t_eclist, "Testing lists of extended communities"); @@ -1001,6 +1226,9 @@ lclist r; ll2 = add(ll2, (30, 30, 30)); ll2 = add(ll2, (40, 40, 40)); + bt_assert(delete(ll, []) = ll); + bt_assert(filter(ll, []) = ---empty---); + # lclist A (10, 20, 30) bt_assert(format(ll) = "(lclist (10, 10, 10) (20, 20, 20) (30, 30, 30))"); @@ -1032,6 +1260,19 @@ lclist r; bt_assert(format(r) = "(lclist (2, 3, 3) (1, 2, 3) (2, 3, 1) (3, 1, 2) (2, 1, 3))"); bt_assert(r.min = (1, 2, 3)); bt_assert(r.max = (3, 1, 2)); + + # iteration over lclist + int x = 0; + int y = 0; + lc mx = (0, 0, 0); + for lc c in r do { + int asn2 = c.asn * c.asn; + x = x + asn2 * c.data1; + y = y + asn2 * c.data2; + if c > mx then mx = c; + } + bt_assert(x = 39 && y = 49); + bt_assert(mx = r.max); } bt_test_suite(t_lclist, "Testing lists of large communities"); @@ -1060,6 +1301,7 @@ lc set lls; bt_assert(ll !~ [(5,10,15), (10,21,30)]); bt_assert(ll !~ [(10,21..25,*)]); bt_assert(ll !~ [(11, *, *)]); + bt_assert(ll !~ []); lls = [(10, 10, 10), (20, 20, 15..25), (30, 30, *), (40, 35..45, *), (50, *, *), (55..65, *, *)]; bt_assert(format(lls) = "[(10, 10, 10), (20, 20, 15)..(20, 20, 25), (30, 30, 0)..(30, 30, 4294967295), (40, 35, 0)..(40, 45, 4294967295), (50, 0, 0)..(50, 4294967295, 4294967295), (55, 0, 0)..(65, 4294967295, 4294967295)]"); @@ -1116,6 +1358,10 @@ bt_test_suite(t_rd, "Testing route distinguishers"); function t_rd_set() rd set rds; { + rds = []; + bt_assert(rds = []); + bt_assert(10:20 !~ rds); + rds = [10:20, 100000:100..100000:200]; bt_assert(format(rds) = "[10:20, 100000:100..100000:200]"); @@ -1126,6 +1372,7 @@ rd set rds; bt_assert(100000:128 ~ rds); bt_assert(100000:200 ~ rds); bt_assert(100010:150 !~ rds); + bt_assert(100010:150 !~ []); } bt_test_suite(t_rd_set, "Testing sets of route distinguishers"); @@ -1192,7 +1439,85 @@ function fifteen() return 15; } +function local_vars(int j) +{ + int k = 10; + bt_assert(j = 5 && k = 10); + { + int j = 15; + k = 20; + bt_assert(j = 15 && k = 20); + } + bt_assert(j = 5 && k = 20); + + if j < 10 then + { + int j = 25; + string k = "hello"; + bt_assert(j = 25 && k = "hello"); + } + bt_assert(j = 5 && k = 20); + + int m = 100; + { + j = 35; + int k = 40; + bt_assert(j = 35 && k = 40 && m = 100); + } + bt_assert(j = 35 && k = 20 && m = 100); +} + +function factorial(int x) +{ + if x = 0 then return 0; + if x = 1 then return 1; + else return x * factorial(x - 1); +} + +function fibonacci(int x) +{ + if x = 0 then return 0; + if x = 1 then return 1; + else return fibonacci(x - 1) + fibonacci(x - 2); +} + +function hanoi_init(int a; int b) +{ + if b = 0 + then return +empty+; + else return prepend(hanoi_init(a + 1, b - 1), a); +} + +function hanoi_solve(int n; bgppath h_src; bgppath h_dst; bgppath h_aux; bool x; bool y) +{ + # x -> return src or dst + # y -> print state + + if n = 0 then { if x then return h_src; else return h_dst; } + + bgppath tmp1 = hanoi_solve(n - 1, h_src, h_aux, h_dst, true, y); + bgppath tmp2 = hanoi_solve(n - 1, h_src, h_aux, h_dst, false, false); + h_src = tmp1; + h_aux = tmp2; + + int v = h_src.first; + # bt_assert(h_dst = +empty+ || v < h_dst.first); + h_src = delete(h_src, v); + h_dst = prepend(h_dst, v); + + if y then + print "move: ", v, " src: ", h_src, " dst:", h_dst, " aux:", h_aux; + + tmp1 = hanoi_solve(n - 1, h_aux, h_dst, h_src, true, y); + tmp2 = hanoi_solve(n - 1, h_aux, h_dst, h_src, false, false); + h_aux = tmp1; + h_dst = tmp2; + + if x then return h_src; else return h_dst; +} + function t_call_function() +bgppath h_src; { bt_assert(fifteen() = 15); @@ -1204,6 +1529,17 @@ function t_call_function() bt_assert(callme(4, 4) = 16); bt_assert(callme(7, 2) = 14); bt_assert(callmeagain(1, 2, 3) = 6); + local_vars(5); + + bt_assert(factorial(5) = 120); + bt_assert(factorial(10) = 3628800); + + bt_assert(fibonacci(10) = 55); + bt_assert(fibonacci(20) = 6765); + + h_src = hanoi_init(1, 6); + bt_assert(format(h_src) = "(path 1 2 3 4 5 6)"); + bt_assert(hanoi_solve(6, h_src, +empty+, +empty+, false, false) = h_src); } bt_test_suite(t_call_function, "Testing calling functions"); @@ -1290,6 +1626,7 @@ function __test2() filter testf int j; +bool t; { print "Heya, filtering route to ", net.ip, " prefixlen ", net.len, " source ", source; print "This route was from ", from; @@ -1301,6 +1638,52 @@ int j; rip_metric = 14; unset(rip_metric); + preference = 1234; + + test_ca_int1 = 42; + test_ca_ip2 = 1.3.5.7; + test_ca_quad3 = 2.4.6.8; + test_ca_bgppath4 = +empty+; + test_ca_clist5 = -empty-; + test_ca_eclist6 = --empty--; + test_ca_lclist7 = ---empty---; + + igp_metric = 53; + babel_metric = 64; + t = defined(babel_router_id); + + bgp_origin = ORIGIN_IGP; + bgp_path = +empty+; + bgp_next_hop = 3456:789a:bcde:f012::3456:789a; + bgp_med = 71; + bgp_local_pref = 942; + t = defined(bgp_atomic_aggr); + t = defined(bgp_aggregator); + bgp_community = -empty-; + bgp_originator_id = 9.7.5.3; + bgp_cluster_list = -empty-; + bgp_ext_community = --empty--; + t = defined(bgp_aigp); + bgp_large_community = ---empty---; + t = defined(bgp_mpls_label_stack); + + ospf_metric1 = 64; + ospf_metric2 = 111; + ospf_tag = 654432; + + radv_preference = RA_PREF_LOW; + radv_lifetime = 28; + + rip_metric = 2; + rip_tag = 4; + + krt_source = 17; + krt_metric = 19; + +# krt_lock_mtu = false; +# krt_lock_window = true; +# krt_lock_rtt = krt_lock_rttvar && krt_lock_sstresh || krt_lock_cwnd; + accept "ok I take that"; } @@ -1400,13 +1783,16 @@ filter vpn_filter bt_assert(net.type != NET_IP6); bt_assert(net.rd = 0:1:2); + bool b = false; case (net.type) { NET_IP4: print "IPV4"; NET_IP6: print "IPV6"; + else: b = true; } + bt_assert(b); bt_check_assign(from, 10.20.30.40); - bt_check_assign(gw, 55.55.55.44); + # bt_check_assign(gw, 55.55.55.44); bgp_community.add((3,5)); bgp_ext_community.add((ro, 135, 999)); diff --git a/filter/test.conf2 b/filter/test.conf2 index e95f9563..9fc8330f 100644 --- a/filter/test.conf2 +++ b/filter/test.conf2 @@ -38,12 +38,6 @@ protocol static { print from; from = 1.2.3.4; print from; - print scope; - scope = SCOPE_HOST; - print scope; - if !(scope ~ [ SCOPE_HOST, SCOPE_SITE ]) then { - print "Failed in test"; - } preference = 15; print preference; diff --git a/filter/tree.c b/filter/tree.c index 5da86b9d..25cf93e4 100644 --- a/filter/tree.c +++ b/filter/tree.c @@ -38,6 +38,26 @@ find_tree(const struct f_tree *t, const struct f_val *val) return find_tree(t->left, val); } +/** + * find_tree_linear + * @t: tree to search in + * @val: value to find + * + * Search for given value in the degenerated linear tree, which is generated by + * parser before build_tree() is applied. The tree is not sorted and all nodes + * are linked by left ptr. + */ +const struct f_tree * +find_tree_linear(const struct f_tree *t, const struct f_val *val) +{ + for (; t; t = t->left) + if ((val_compare(&(t->from), val) != 1) && + (val_compare(&(t->to), val) != -1)) + return t; + + return NULL; +} + static struct f_tree * build_tree_rec(struct f_tree **buf, int l, int h) { @@ -134,6 +154,14 @@ same_tree(const struct f_tree *t1, const struct f_tree *t2) return 1; } +int +tree_node_count(const struct f_tree *t) +{ + if (t == NULL) + return 0; + + return 1 + tree_node_count(t->left) + tree_node_count(t->right); +} static void tree_node_format(const struct f_tree *t, buffer *buf) diff --git a/filter/tree_test.c b/filter/tree_test.c index 883d13bf..655205c2 100644 --- a/filter/tree_test.c +++ b/filter/tree_test.c @@ -15,16 +15,6 @@ #define MAX_TREE_HEIGHT 13 -static void -start_conf_env(void) -{ - bt_bird_init(); - - pool *p = rp_new(&root_pool, &main_birdloop, "helper_pool"); - linpool *l = lp_new_default(p); - cfg_mem = l; -} - static struct f_tree * new_tree(uint id) { @@ -156,8 +146,6 @@ get_balanced_tree_with_ranged_values(uint nodes_count) static int t_balancing(void) { - start_conf_env(); - uint height; for (height = 1; height < MAX_TREE_HEIGHT; height++) { @@ -173,6 +161,8 @@ t_balancing(void) show_tree(balanced_tree_from_simple); bt_assert(same_tree(balanced_tree_from_simple, expected_balanced_tree)); + + tmp_flush(); } return 1; @@ -182,8 +172,6 @@ t_balancing(void) static int t_balancing_random(void) { - start_conf_env(); - uint height; for (height = 1; height < MAX_TREE_HEIGHT; height++) { @@ -194,6 +182,9 @@ t_balancing_random(void) uint i; for(i = 0; i < 10; i++) { + struct lp_state lps; + lp_save(tmp_linpool, &lps); + struct f_tree *random_degenerated_tree = get_random_degenerated_left_tree(nodes_count); show_tree(random_degenerated_tree); @@ -203,7 +194,11 @@ t_balancing_random(void) show_tree(balanced_tree_from_random); bt_assert(same_tree(balanced_tree_from_random, expected_balanced_tree)); + + lp_restore(tmp_linpool, &lps); } + + tmp_flush(); } return 1; @@ -212,8 +207,6 @@ t_balancing_random(void) static int t_find(void) { - start_conf_env(); - uint height; for (height = 1; height < MAX_TREE_HEIGHT; height++) { @@ -230,6 +223,8 @@ t_find(void) const struct f_tree *found_tree = find_tree(tree, &looking_up_value); bt_assert((val_compare(&looking_up_value, &(found_tree->from)) == 0) && (val_compare(&looking_up_value, &(found_tree->to)) == 0)); } + + tmp_flush(); } return 1; @@ -258,8 +253,6 @@ get_max_value_in_unbalanced_tree(struct f_tree *node, uint max) static int t_find_ranges(void) { - start_conf_env(); - uint height; for (height = 1; height < MAX_TREE_HEIGHT; height++) { @@ -286,6 +279,8 @@ t_find_ranges(void) ((val_compare(&needle, &(found_tree->from)) == 1) && (val_compare(&needle, &(found_tree->to)) == -1)) ); } + + tmp_flush(); } return 1; @@ -295,6 +290,8 @@ int main(int argc, char *argv[]) { bt_init(argc, argv); + bt_bird_init(); + cfg_mem = tmp_linpool; bt_test_suite(t_balancing, "Balancing strong unbalanced trees"); bt_test_suite(t_balancing_random, "Balancing random unbalanced trees"); diff --git a/filter/trie.c b/filter/trie.c index 1a4e1ac3..12ba0b82 100644 --- a/filter/trie.c +++ b/filter/trie.c @@ -1,7 +1,8 @@ /* * Filters: Trie for prefix sets * - * Copyright 2009 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2009--2021 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2009--2021 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -9,53 +10,68 @@ /** * DOC: Trie for prefix sets * - * We use a (compressed) trie to represent prefix sets. Every node - * in the trie represents one prefix (&addr/&plen) and &plen also - * indicates the index of the bit in the address that is used to - * branch at the node. If we need to represent just a set of - * prefixes, it would be simple, but we have to represent a - * set of prefix patterns. Each prefix pattern consists of - * &ppaddr/&pplen and two integers: &low and &high, and a prefix - * &paddr/&plen matches that pattern if the first MIN(&plen, &pplen) - * bits of &paddr and &ppaddr are the same and &low <= &plen <= &high. - * - * We use a bitmask (&accept) to represent accepted prefix lengths - * at a node. As there are 33 prefix lengths (0..32 for IPv4), but - * there is just one prefix of zero length in the whole trie so we - * have &zero flag in &f_trie (indicating whether the trie accepts - * prefix 0.0.0.0/0) as a special case, and &accept bitmask + * We use a (compressed) trie to represent prefix sets. Every node in the trie + * represents one prefix (&addr/&plen) and &plen also indicates the index of + * bits in the address that are used to branch at the node. Note that such + * prefix is not necessary a member of the prefix set, it is just a canonical + * prefix associated with a node. Prefix lengths of nodes are aligned to + * multiples of &TRIE_STEP (4) and there is 16-way branching in each + * node. Therefore, we say that a node is associated with a range of prefix + * lengths (&plen .. &plen + TRIE_STEP - 1). + * + * The prefix set is not just a set of prefixes, it is defined by a set of + * prefix patterns. Each prefix pattern consists of &ppaddr/&pplen and two + * integers: &low and &high. The tested prefix &paddr/&plen matches that pattern + * if the first MIN(&plen, &pplen) bits of &paddr and &ppaddr are the same and + * &low <= &plen <= &high. + * + * There are two ways to represent accepted prefixes for a node. First, there is + * a bitmask &local, which represents independently all 15 prefixes that extend + * the canonical prefix of the node and are within a range of prefix lengths + * associated with the node. E.g., for node 10.0.0.0/8 they are 10.0.0.0/8, + * 10.0.0.0/9, 10.128.0.0/9, .. 10.224.0.0/11. This order (first by length, then + * lexicographically) is used for indexing the bitmask &local, starting at + * position 1. I.e., index is 2^(plen - base) + offset within the same length, + * see function trie_local_mask6() for details. + * + * Second, we use a bitmask &accept to represent accepted prefix lengths at a + * node. The bit is set means that all prefixes of given length that are either + * subprefixes or superprefixes of the canonical prefix are accepted. As there + * are 33 prefix lengths (0..32 for IPv4), but there is just one prefix of zero + * length in the whole trie so we have &zero flag in &f_trie (indicating whether + * the trie accepts prefix 0.0.0.0/0) as a special case, and &accept bitmask * represents accepted prefix lengths from 1 to 32. * - * There are two cases in prefix matching - a match when the length - * of the prefix is smaller that the length of the prefix pattern, - * (&plen < &pplen) and otherwise. The second case is simple - we - * just walk through the trie and look at every visited node - * whether that prefix accepts our prefix length (&plen). The - * first case is tricky - we don't want to examine every descendant - * of a final node, so (when we create the trie) we have to propagate - * that information from nodes to their ascendants. - * - * Suppose that we have two masks (M1 and M2) for a node. Mask M1 - * represents accepted prefix lengths by just the node and mask M2 - * represents accepted prefix lengths by the node or any of its - * descendants. Therefore M2 is a bitwise or of M1 and children's - * M2 and this is a maintained invariant during trie building. - * Basically, when we want to match a prefix, we walk through the trie, - * check mask M1 for our prefix length and when we came to - * final node, we check mask M2. - * - * There are two differences in the real implementation. First, - * we use a compressed trie so there is a case that we skip our - * final node (if it is not in the trie) and we came to node that - * is either extension of our prefix, or completely out of path - * In the first case, we also have to check M2. - * - * Second, we really need not to maintain two separate bitmasks. - * Checks for mask M1 are always larger than &applen and we need - * just the first &pplen bits of mask M2 (if trie compression - * hadn't been used it would suffice to know just $applen-th bit), - * so we have to store them together in &accept mask - the first - * &pplen bits of mask M2 and then mask M1. + * One complication is handling of prefix patterns with unaligned prefix length. + * When such pattern is to be added, we add a primary node above (with rounded + * down prefix length &nlen) and a set of secondary nodes below (with rounded up + * prefix lengths &slen). Accepted prefix lengths of the original prefix pattern + * are then represented in different places based on their lengths. For prefixes + * shorter than &nlen, it is &accept bitmask of the primary node, for prefixes + * between &nlen and &slen - 1 it is &local bitmask of the primary node, and for + * prefixes longer of equal &slen it is &accept bitmasks of secondary nodes. + * + * There are two cases in prefix matching - a match when the length of the + * prefix is smaller that the length of the prefix pattern, (&plen < &pplen) and + * otherwise. The second case is simple - we just walk through the trie and look + * at every visited node whether that prefix accepts our prefix length (&plen). + * The first case is tricky - we do not want to examine every descendant of a + * final node, so (when we create the trie) we have to propagate that + * information from nodes to their ascendants. + * + * There are two kinds of propagations - propagation from child's &accept + * bitmask to parent's &accept bitmask, and propagation from child's &accept + * bitmask to parent's &local bitmask. The first kind is simple - as all + * superprefixes of a parent are also all superprefixes of appropriate length of + * a child, then we can just add (by bitwise or) a child &accept mask masked by + * parent prefix length mask to the parent &accept mask. This handles prefixes + * shorter than node &plen. + * + * The second kind of propagation is necessary to handle superprefixes of a + * child that are represented by parent &local mask - that are in the range of + * prefix lengths associated with the parent. For each accepted (by child + * &accept mask) prefix length from that range, we need to set appropriate bit + * in &local mask. See function trie_amask_to_local() for details. * * There are four cases when we walk through a trie: * @@ -65,8 +81,32 @@ * - we are beyond the end of path (node length > &plen) * - we are still on path and keep walking (node length < &plen) * - * The walking code in trie_match_prefix() is structured according to - * these cases. + * The walking code in trie_match_net() is structured according to these cases. + * + * Iteration over prefixes in a trie can be done using TRIE_WALK() macro, or + * directly using trie_walk_init() and trie_walk_next() functions. The second + * approach allows suspending the iteration and continuing in it later. + * Prefixes are enumerated in the usual lexicographic order and may be + * restricted to a subset of the trie (all subnets of a specified prefix). + * + * Note that the trie walk does not reliably enumerate `implicit' prefixes + * defined by &low and &high fields in prefix patterns, it is supposed to be + * used on tries constructed from `explicit' prefixes (&low == &plen == &high + * in call to trie_add_prefix()). + * + * The trie walk has three basic state variables stored in the struct + * &f_trie_walk_state -- the current node in &stack[stack_pos], &accept_length + * for iteration over inter-node prefixes (non-branching prefixes on compressed + * path between the current node and its parent node, stored in the bitmap + * &accept of the current node) and &local_pos for iteration over intra-node + * prefixes (stored in the bitmap &local). + * + * The trie also supports longest-prefix-match query by trie_match_longest_ip4() + * and it can be extended to iteration over all covering prefixes for a given + * prefix (from longest to shortest) using TRIE_WALK_TO_ROOT_IP4() macro. There + * are also IPv6 versions (for practical reasons, these functions and macros are + * separate for IPv4 and IPv6). There is the same limitation to enumeration of + * `implicit' prefixes like with the previous TRIE_WALK() macro. */ #include "nest/bird.h" @@ -86,7 +126,10 @@ #define ipa_mkmask(x) ip6_mkmask(x) #define ipa_masklen(x) ip6_masklen(&x) #define ipa_pxlen(x,y) ip6_pxlen(x,y) -#define ipa_getbit(x,n) ip6_getbit(x,n) +#define ipa_getbit(a,p) ip6_getbit(a,p) +#define ipa_getbits(a,p,n) ip6_getbits(a,p,n) +#define ipa_setbits(a,p,n) ip6_setbits(a,p,n) +#define trie_local_mask(a,b,c) trie_local_mask6(a,b,c) #define ipt_from_ip4(x) _MI6(_I(x), 0, 0, 0) #define ipt_to_ip4(x) _MI4(_I0(x)) @@ -109,10 +152,11 @@ f_new_trie(linpool *lp, uint data_size) } static inline struct f_trie_node4 * -new_node4(struct f_trie *t, int plen, ip4_addr paddr, ip4_addr pmask, ip4_addr amask) +new_node4(struct f_trie *t, uint plen, uint local, ip4_addr paddr, ip4_addr pmask, ip4_addr amask) { struct f_trie_node4 *n = lp_allocz(t->lp, sizeof(struct f_trie_node4) + t->data_size); n->plen = plen; + n->local = local; n->addr = paddr; n->mask = pmask; n->accept = amask; @@ -120,10 +164,11 @@ new_node4(struct f_trie *t, int plen, ip4_addr paddr, ip4_addr pmask, ip4_addr a } static inline struct f_trie_node6 * -new_node6(struct f_trie *t, int plen, ip6_addr paddr, ip6_addr pmask, ip6_addr amask) +new_node6(struct f_trie *t, uint plen, uint local, ip6_addr paddr, ip6_addr pmask, ip6_addr amask) { struct f_trie_node6 *n = lp_allocz(t->lp, sizeof(struct f_trie_node6) + t->data_size); n->plen = plen; + n->local = local; n->addr = paddr; n->mask = pmask; n->accept = amask; @@ -131,24 +176,24 @@ new_node6(struct f_trie *t, int plen, ip6_addr paddr, ip6_addr pmask, ip6_addr a } static inline struct f_trie_node * -new_node(struct f_trie *t, int plen, ip_addr paddr, ip_addr pmask, ip_addr amask) +new_node(struct f_trie *t, uint plen, uint local, ip_addr paddr, ip_addr pmask, ip_addr amask) { if (t->ipv4) - return (struct f_trie_node *) new_node4(t, plen, ipt_to_ip4(paddr), ipt_to_ip4(pmask), ipt_to_ip4(amask)); + return (struct f_trie_node *) new_node4(t, plen, local, ipt_to_ip4(paddr), ipt_to_ip4(pmask), ipt_to_ip4(amask)); else - return (struct f_trie_node *) new_node6(t, plen, ipa_to_ip6(paddr), ipa_to_ip6(pmask), ipa_to_ip6(amask)); + return (struct f_trie_node *) new_node6(t, plen, local, ipa_to_ip6(paddr), ipa_to_ip6(pmask), ipa_to_ip6(amask)); } static inline void attach_node4(struct f_trie_node4 *parent, struct f_trie_node4 *child) { - parent->c[ip4_getbit(child->addr, parent->plen) ? 1 : 0] = child; + parent->c[ip4_getbits(child->addr, parent->plen, TRIE_STEP)] = child; } static inline void attach_node6(struct f_trie_node6 *parent, struct f_trie_node6 *child) { - parent->c[ip6_getbit(child->addr, parent->plen) ? 1 : 0] = child; + parent->c[ip6_getbits(child->addr, parent->plen, TRIE_STEP)] = child; } static inline void @@ -160,63 +205,96 @@ attach_node(struct f_trie_node *parent, struct f_trie_node *child, int v4) attach_node6(&parent->v6, &child->v6); } -#define GET_ADDR(N,F,X) ((X) ? ipt_from_ip4((N)->v4.F) : ipa_from_ip6((N)->v6.F)) -#define SET_ADDR(N,F,X,V) ({ if (X) (N)->v4.F =ipt_to_ip4(V); else (N)->v6.F =ipa_to_ip6(V); }) -#define GET_CHILD(N,F,X,I) ((X) ? (struct f_trie_node *) (N)->v4.c[I] : (struct f_trie_node *) (N)->v6.c[I]) -/** - * trie_add_prefix - * @t: trie to add to - * @net: IP network prefix - * @l: prefix lower bound - * @h: prefix upper bound +/* + * Internal prefixes of a node a represented by the local bitmask, each bit for + * one prefix. Bit 0 is unused, Bit 1 is for the main prefix of the node, + * remaining bits correspond to subprefixes by this pattern: * - * Adds prefix (prefix pattern) @n to trie @t. @l and @h are lower - * and upper bounds on accepted prefix lengths, both inclusive. - * 0 <= l, h <= 32 (128 for IPv6). + * 1 + * 2 3 + * 4 5 6 7 + * 8 9 A B C D E F * - * Returns a pointer to the allocated node. The function can return a pointer to - * an existing node if @px and @plen are the same. If px/plen == 0/0 (or ::/0), - * a pointer to the root node is returned. Returns NULL when called with - * mismatched IPv4/IPv6 net type. + * E.g. for 10.0.0.0/8 node, the 10.64.0.0/10 would be position 5. */ -void * -trie_add_prefix(struct f_trie *t, const net_addr *net, uint l, uint h) +/* + * Compute appropriate mask representing prefix px/plen in local bitmask of node + * with prefix length nlen. Assuming that nlen <= plen < (nlen + TRIE_STEP). + */ +static inline uint +trie_local_mask4(ip4_addr px, uint plen, uint nlen) { - uint plen = net_pxlen(net); - ip_addr px; - int v4; + uint step = plen - nlen; + uint pos = (1u << step) + ip4_getbits(px, nlen, step); + return 1u << pos; +} - switch (net->type) - { - case NET_IP4: px = ipt_from_ip4(net4_prefix(net)); v4 = 1; break; - case NET_IP6: px = ipa_from_ip6(net6_prefix(net)); v4 = 0; break; - default: bug("invalid type"); - } +static inline uint +trie_local_mask6(ip6_addr px, uint plen, uint nlen) +{ + uint step = plen - nlen; + uint pos = (1u << step) + ip6_getbits(px, nlen, step); + return 1u << pos; +} - if (t->ipv4 != v4) - { - if (t->ipv4 < 0) - t->ipv4 = v4; - else - return NULL; - } +/* + * Compute an appropriate local mask (for a node with prefix length nlen) + * representing prefixes of px that are accepted by amask and fall within the + * range associated with that node. Used for propagation of child accept mask + * to parent local mask. + */ +static inline uint +trie_amask_to_local(ip_addr px, ip_addr amask, uint nlen) +{ + uint local = 0; - if (l == 0) - t->zero = 1; - else - l--; + for (uint plen = MAX(nlen, 1); plen < (nlen + TRIE_STEP); plen++) + if (ipa_getbit(amask, plen - 1)) + local |= trie_local_mask(px, plen, nlen); - if (h < plen) - plen = h; + return local; +} + +/* + * Compute a bitmask representing a level of subprefixes (of the same length), + * using specified position as a root. E.g., level 2 from root position 3 would + * be bit positions C-F, returned as bitmask 0xf000. + */ +static inline uint +trie_level_mask(uint pos, uint level) +{ + return ((1u << (1u << level)) - 1) << (pos << level); +} - ip_addr amask = ipa_xor(ipa_mkmask(l), ipa_mkmask(h)); + +#define GET_ADDR(N,F,X) ((X) ? ipt_from_ip4((N)->v4.F) : ipa_from_ip6((N)->v6.F)) +#define SET_ADDR(N,F,X,V) ({ if (X) (N)->v4.F =ipt_to_ip4(V); else (N)->v6.F =ipa_to_ip6(V); }) + +#define GET_LOCAL(N,X) ((X) ? (N)->v4.local : (N)->v6.local) +#define ADD_LOCAL(N,X,V) ({ uint v_ = (V); if (X) (N)->v4.local |= v_; else (N)->v6.local |= v_; }) + +#define GET_CHILD(N,X,I) ((X) ? (struct f_trie_node *) (N)->v4.c[I] : (struct f_trie_node *) (N)->v6.c[I]) + + +static void * +trie_add_node(struct f_trie *t, uint plen, ip_addr px, uint local, uint l, uint h) +{ + uint l_ = l ? (l - 1) : 0; + ip_addr amask = (l_ < h) ? ipa_xor(ipa_mkmask(l_), ipa_mkmask(h)) : IPA_NONE; ip_addr pmask = ipa_mkmask(plen); ip_addr paddr = ipa_and(px, pmask); struct f_trie_node *o = NULL; struct f_trie_node *n = &t->root; + int v4 = t->ipv4; + /* Add all bits for each active level (0x0002 0x000c 0x00f0 0xff00) */ + for (uint i = 0; i < TRIE_STEP; i++) + if ((l <= (plen + i)) && ((plen + i) <= h)) + local |= trie_level_mask(1, i); + + DBG("Insert node %I/%u (%I %x)\n", paddr, plen, amask, local); while (n) { ip_addr naddr = GET_ADDR(n, addr, v4); @@ -225,23 +303,31 @@ trie_add_prefix(struct f_trie *t, const net_addr *net, uint l, uint h) ip_addr cmask = ipa_and(nmask, pmask); uint nlen = v4 ? n->v4.plen : n->v6.plen; + DBG("Found node %I/%u (%I %x)\n", + naddr, nlen, accept, v4 ? n->v4.local : n->v6.local); + if (ipa_compare(ipa_and(paddr, cmask), ipa_and(naddr, cmask))) { /* We are out of path - we have to add branching node 'b' between node 'o' and node 'n', and attach new node 'a' as the other child of 'b'. */ - int blen = ipa_pxlen(paddr, naddr); + int blen = ROUND_DOWN_POW2(ipa_pxlen(paddr, naddr), TRIE_STEP); ip_addr bmask = ipa_mkmask(blen); ip_addr baddr = ipa_and(px, bmask); /* Merge accept masks from children to get accept mask for node 'b' */ ip_addr baccm = ipa_and(ipa_or(amask, accept), bmask); + uint bloc = trie_amask_to_local(naddr, accept, blen) | + trie_amask_to_local(paddr, amask, blen); - struct f_trie_node *a = new_node(t, plen, paddr, pmask, amask); - struct f_trie_node *b = new_node(t, blen, baddr, bmask, baccm); + struct f_trie_node *a = new_node(t, plen, local, paddr, pmask, amask); + struct f_trie_node *b = new_node(t, blen, bloc, baddr, bmask, baccm); attach_node(o, b, v4); attach_node(b, n, v4); attach_node(b, a, v4); + t->prefix_count++; + + DBG("Case 1\n"); return a; } @@ -249,66 +335,195 @@ trie_add_prefix(struct f_trie *t, const net_addr *net, uint l, uint h) { /* We add new node 'a' between node 'o' and node 'n' */ amask = ipa_or(amask, ipa_and(accept, pmask)); - struct f_trie_node *a = new_node(t, plen, paddr, pmask, amask); + local |= trie_amask_to_local(naddr, accept, plen); + struct f_trie_node *a = new_node(t, plen, local, paddr, pmask, amask); attach_node(o, a, v4); attach_node(a, n, v4); + t->prefix_count++; + + DBG("Case 2\n"); return a; } if (plen == nlen) { - /* We already found added node in trie. Just update accept mask */ + /* We already found added node in trie. Just update accept and local mask */ accept = ipa_or(accept, amask); SET_ADDR(n, accept, v4, accept); + + if ((GET_LOCAL(n, v4) & local) != local) + t->prefix_count++; + + ADD_LOCAL(n, v4, local); + + DBG("Case 3\n"); return n; } /* Update accept mask part M2 and go deeper */ accept = ipa_or(accept, ipa_and(amask, nmask)); SET_ADDR(n, accept, v4, accept); + ADD_LOCAL(n, v4, trie_amask_to_local(paddr, amask, nlen)); + + DBG("Step %u\n", ipa_getbits(paddr, nlen)); /* n->plen < plen and plen <= 32 (128) */ o = n; - n = GET_CHILD(n, c, v4, ipa_getbit(paddr, nlen) ? 1 : 0); + n = GET_CHILD(n, v4, ipa_getbits(paddr, nlen, TRIE_STEP)); } /* We add new tail node 'a' after node 'o' */ - struct f_trie_node *a = new_node(t, plen, paddr, pmask, amask); + struct f_trie_node *a = new_node(t, plen, local, paddr, pmask, amask); attach_node(o, a, v4); + t->prefix_count++; + DBG("Case 4\n"); return a; } +/** + * trie_add_prefix + * @t: trie to add to + * @net: IP network prefix + * @l: prefix lower bound + * @h: prefix upper bound + * + * Adds prefix (prefix pattern) @n to trie @t. @l and @h are lower + * and upper bounds on accepted prefix lengths, both inclusive. + * 0 <= l, h <= 32 (128 for IPv6). + * + * Returns a pointer to the allocated node. The function can return a pointer to + * an existing node if @px and @plen are the same. If px/plen == 0/0 (or ::/0), + * a pointer to the root node is returned. Returns NULL when called with + * mismatched IPv4/IPv6 net type. + */ +void * +trie_add_prefix(struct f_trie *t, const net_addr *net, uint l, uint h) +{ + uint plen = net_pxlen(net); + ip_addr px; + int v4; + + switch (net->type) + { + case NET_IP4: + case NET_VPN4: + case NET_ROA4: + px = ipt_from_ip4(net4_prefix(net)); + v4 = 1; + break; + + case NET_IP6: + case NET_VPN6: + case NET_ROA6: + case NET_IP6_SADR: + px = ipa_from_ip6(net6_prefix(net)); + v4 = 0; + break; + + default: + bug("invalid type"); + } + + if (t->ipv4 != v4) + { + if (t->ipv4 < 0) + t->ipv4 = v4; + else + return NULL; + } + + DBG("\nInsert net %N (%u-%u)\n", net, l, h); + + if (l == 0) + t->zero = 1; + + if (h < plen) + plen = h; + + /* Primary node length, plen rounded down */ + uint nlen = ROUND_DOWN_POW2(plen, TRIE_STEP); + + if (plen == nlen) + return trie_add_node(t, nlen, px, 0, l, h); + + /* Secondary node length, plen rouned up */ + uint slen = nlen + TRIE_STEP; + void *node = NULL; + + /* + * For unaligned prefix lengths it is more complicated. We need to encode + * matching prefixes of lengths from l to h. There are three cases of lengths: + * + * 1) 0..nlen are encoded by the accept mask of the primary node + * 2) nlen..(slen-1) are encoded by the local mask of the primary node + * 3) slen..max are encoded in secondary nodes + */ + + if (l < slen) + { + uint local = 0; + + /* Compute local bits for accepted nlen..(slen-1) prefixes */ + for (uint i = 0; i < TRIE_STEP; i++) + if ((l <= (nlen + i)) && ((nlen + i) <= h)) + { + uint pos = (1u << i) + ipa_getbits(px, nlen, i); + uint len = ((nlen + i) <= plen) ? 1 : (1u << (nlen + i - plen)); + + /* We need to fill 'len' bits starting at 'pos' position */ + local |= ((1u << len) - 1) << pos; + } + + /* Add the primary node */ + node = trie_add_node(t, nlen, px, local, l, nlen); + } + + if (slen <= h) + { + uint l2 = MAX(l, slen); + uint max = (1u << (slen - plen)); + + /* Add secondary nodes */ + for (uint i = 0; i < max; i++) + node = trie_add_node(t, slen, ipa_setbits(px, slen - 1, i), 0, l2, h); + } + + return node; +} + + static int trie_match_net4(const struct f_trie *t, ip4_addr px, uint plen) { - ip4_addr pmask = ip4_mkmask(plen); - ip4_addr paddr = ip4_and(px, pmask); - if (plen == 0) return t->zero; int plentest = plen - 1; + uint nlen = ROUND_DOWN_POW2(plen, TRIE_STEP); + uint local = trie_local_mask4(px, plen, nlen); const struct f_trie_node4 *n = &t->root.v4; while (n) { - ip4_addr cmask = ip4_and(n->mask, pmask); - /* We are out of path */ - if (ip4_compare(ip4_and(paddr, cmask), ip4_and(n->addr, cmask))) + if (!ip4_prefix_equal(px, n->addr, MIN(plen, n->plen))) return 0; + /* Check local mask */ + if ((n->plen == nlen) && (n->local & local)) + return 1; + /* Check accept mask */ if (ip4_getbit(n->accept, plentest)) return 1; /* We finished trie walk and still no match */ - if (plen <= n->plen) + if (nlen <= n->plen) return 0; /* Choose children */ - n = n->c[(ip4_getbit(paddr, n->plen)) ? 1 : 0]; + n = n->c[ip4_getbits(px, n->plen, TRIE_STEP)]; } return 0; @@ -317,33 +532,34 @@ trie_match_net4(const struct f_trie *t, ip4_addr px, uint plen) static int trie_match_net6(const struct f_trie *t, ip6_addr px, uint plen) { - ip6_addr pmask = ip6_mkmask(plen); - ip6_addr paddr = ip6_and(px, pmask); - if (plen == 0) return t->zero; int plentest = plen - 1; + uint nlen = ROUND_DOWN_POW2(plen, TRIE_STEP); + uint local = trie_local_mask6(px, plen, nlen); const struct f_trie_node6 *n = &t->root.v6; while (n) { - ip6_addr cmask = ip6_and(n->mask, pmask); - /* We are out of path */ - if (ip6_compare(ip6_and(paddr, cmask), ip6_and(n->addr, cmask))) + if (!ip6_prefix_equal(px, n->addr, MIN(plen, n->plen))) return 0; + /* Check local mask */ + if ((n->plen == nlen) && (n->local & local)) + return 1; + /* Check accept mask */ if (ip6_getbit(n->accept, plentest)) return 1; /* We finished trie walk and still no match */ - if (plen <= n->plen) + if (nlen <= n->plen) return 0; /* Choose children */ - n = n->c[(ip6_getbit(paddr, n->plen)) ? 1 : 0]; + n = n->c[ip6_getbits(px, n->plen, TRIE_STEP)]; } return 0; @@ -378,6 +594,412 @@ trie_match_net(const struct f_trie *t, const net_addr *n) } } + +/** + * trie_match_longest_ip4 + * @t: trie + * @net: net address + * @dst: return value + * @found0: optional returned bitmask of found nodes + * + * Perform longest prefix match for the address @net and return the resulting + * prefix in the buffer @dst. The bitmask @found0 is used to report lengths of + * prefixes on the path from the root to the resulting prefix. E.g., if there is + * also a /20 shorter matching prefix, then 20-th bit is set in @found0. This + * can be used to enumerate all matching prefixes for the network @net using + * function trie_match_next_longest_ip4() or macro TRIE_WALK_TO_ROOT_IP4(). + * + * This function assumes IPv4 trie, there is also an IPv6 variant. The @net + * argument is typed as net_addr_ip4, but would accept any IPv4-based net_addr, + * like net4_prefix(). Anyway, returned @dst is always net_addr_ip4. + * + * Result: 1 if a matching prefix was found, 0 if not. + */ +int +trie_match_longest_ip4(const struct f_trie *t, const net_addr_ip4 *net, net_addr_ip4 *dst, ip4_addr *found0) +{ + ASSERT(t->ipv4); + + const ip4_addr prefix = net->prefix; + const int pxlen = net->pxlen; + + const struct f_trie_node4 *n = &t->root.v4; + int len = 0; + + ip4_addr found = IP4_NONE; + int last = -1; + + while (n) + { + /* We are out of path */ + if (!ip4_prefix_equal(prefix, n->addr, MIN(pxlen, n->plen))) + goto done; + + /* Check accept mask */ + for (; len < n->plen; len++) + { + if (len > pxlen) + goto done; + + if (ip4_getbit(n->accept, len - 1)) + { + /* len is always < 32 due to len < n->plen */ + ip4_setbit(&found, len); + last = len; + } + } + + /* Special case for max length, there is only one valid local position */ + if (len == IP4_MAX_PREFIX_LENGTH) + { + if (n->local & (1u << 1)) + last = len; + + goto done; + } + + /* Check local mask */ + for (int pos = 1; pos < (1 << TRIE_STEP); pos = 2 * pos + ip4_getbit(prefix, len), len++) + { + if (len > pxlen) + goto done; + + if (n->local & (1u << pos)) + { + /* len is always < 32 due to special case above */ + ip4_setbit(&found, len); + last = len; + } + } + + /* Choose child */ + n = n->c[ip4_getbits(prefix, n->plen, TRIE_STEP)]; + } + +done: + if (last < 0) + return 0; + + *dst = NET_ADDR_IP4(ip4_and(prefix, ip4_mkmask(last)), last); + + if (found0) + *found0 = found; + + return 1; +} + + +/** + * trie_match_longest_ip6 + * @t: trie + * @net: net address + * @dst: return value + * @found0: optional returned bitmask of found nodes + * + * Perform longest prefix match for the address @net and return the resulting + * prefix in the buffer @dst. The bitmask @found0 is used to report lengths of + * prefixes on the path from the root to the resulting prefix. E.g., if there is + * also a /20 shorter matching prefix, then 20-th bit is set in @found0. This + * can be used to enumerate all matching prefixes for the network @net using + * function trie_match_next_longest_ip6() or macro TRIE_WALK_TO_ROOT_IP6(). + * + * This function assumes IPv6 trie, there is also an IPv4 variant. The @net + * argument is typed as net_addr_ip6, but would accept any IPv6-based net_addr, + * like net6_prefix(). Anyway, returned @dst is always net_addr_ip6. + * + * Result: 1 if a matching prefix was found, 0 if not. + */ +int +trie_match_longest_ip6(const struct f_trie *t, const net_addr_ip6 *net, net_addr_ip6 *dst, ip6_addr *found0) +{ + ASSERT(!t->ipv4); + + const ip6_addr prefix = net->prefix; + const int pxlen = net->pxlen; + + const struct f_trie_node6 *n = &t->root.v6; + int len = 0; + + ip6_addr found = IP6_NONE; + int last = -1; + + while (n) + { + /* We are out of path */ + if (!ip6_prefix_equal(prefix, n->addr, MIN(pxlen, n->plen))) + goto done; + + /* Check accept mask */ + for (; len < n->plen; len++) + { + if (len > pxlen) + goto done; + + if (ip6_getbit(n->accept, len - 1)) + { + /* len is always < 128 due to len < n->plen */ + ip6_setbit(&found, len); + last = len; + } + } + + /* Special case for max length, there is only one valid local position */ + if (len == IP6_MAX_PREFIX_LENGTH) + { + if (n->local & (1u << 1)) + last = len; + + goto done; + } + + /* Check local mask */ + for (int pos = 1; pos < (1 << TRIE_STEP); pos = 2 * pos + ip6_getbit(prefix, len), len++) + { + if (len > pxlen) + goto done; + + if (n->local & (1u << pos)) + { + /* len is always < 128 due to special case above */ + ip6_setbit(&found, len); + last = len; + } + } + + /* Choose child */ + n = n->c[ip6_getbits(prefix, n->plen, TRIE_STEP)]; + } + +done: + if (last < 0) + return 0; + + *dst = NET_ADDR_IP6(ip6_and(prefix, ip6_mkmask(last)), last); + + if (found0) + *found0 = found; + + return 1; +} + +#define SAME_PREFIX(A,B,X,L) ((X) ? ip4_prefix_equal((A)->v4.addr, net4_prefix(B), (L)) : ip6_prefix_equal((A)->v6.addr, net6_prefix(B), (L))) +#define GET_NET_BITS(N,X,A,B) ((X) ? ip4_getbits(net4_prefix(N), (A), (B)) : ip6_getbits(net6_prefix(N), (A), (B))) + +/** + * trie_walk_init + * @s: walk state + * @t: trie + * @net: optional subnet for walk + * + * Initialize walk state for subsequent walk through nodes of the trie @t by + * trie_walk_next(). The argument @net allows to restrict walk to given subnet, + * otherwise full walk over all nodes is used. This is done by finding node at + * or below @net and starting position in it. + */ +void +trie_walk_init(struct f_trie_walk_state *s, const struct f_trie *t, const net_addr *net) +{ + *s = (struct f_trie_walk_state) { + .ipv4 = t->ipv4, + .accept_length = 0, + .start_pos = 1, + .local_pos = 1, + .stack_pos = 0, + .stack[0] = &t->root + }; + + if (!net) + return; + + /* We want to find node of level at least plen */ + int plen = ROUND_DOWN_POW2(net->pxlen, TRIE_STEP); + const struct f_trie_node *n = &t->root; + const int v4 = t->ipv4; + + while (n) + { + int nlen = v4 ? n->v4.plen : n->v6.plen; + + /* We are out of path */ + if (!SAME_PREFIX(n, net, v4, MIN(net->pxlen, nlen))) + break; + + /* We found final node */ + if (nlen >= plen) + { + if (nlen == plen) + { + /* Find proper local_pos, while accept_length is not used */ + int step = net->pxlen - plen; + s->start_pos = s->local_pos = (1u << step) + GET_NET_BITS(net, v4, plen, step); + s->accept_length = plen; + } + else + { + /* Start from pos 1 in local node, but first try accept mask */ + s->accept_length = net->pxlen; + } + + s->stack[0] = n; + return; + } + + /* Choose child */ + n = GET_CHILD(n, v4, GET_NET_BITS(net, v4, nlen, TRIE_STEP)); + } + + s->stack[0] = NULL; + return; +} + +#define GET_ACCEPT_BIT(N,X,B) ((X) ? ip4_getbit((N)->v4.accept, (B)) : ip6_getbit((N)->v6.accept, (B))) +#define GET_LOCAL_BIT(N,X,B) (((X) ? (N)->v4.local : (N)->v6.local) & (1u << (B))) + +/** + * trie_walk_next + * @s: walk state + * @net: return value + * + * Find the next prefix in the trie walk and return it in the buffer @net. + * Prefixes are walked in the usual lexicographic order and may be restricted + * to a subset of the trie during walk setup by trie_walk_init(). Note that the + * trie walk does not iterate reliably over 'implicit' prefixes defined by &low + * and &high fields in prefix patterns, it is supposed to be used on tries + * constructed from 'explicit' prefixes (&low == &plen == &high in call to + * trie_add_prefix()). + * + * Result: 1 if the next prefix was found, 0 for the end of walk. + */ +int +trie_walk_next(struct f_trie_walk_state *s, net_addr *net) +{ + const struct f_trie_node *n = s->stack[s->stack_pos]; + int len = s->accept_length; + int pos = s->local_pos; + int v4 = s->ipv4; + + /* + * The walk has three basic state variables -- n, len and pos. In each node n, + * we first walk superprefixes (by len in &accept bitmask), and then we walk + * internal positions (by pos in &local bitmask). These positions are: + * + * 1 + * 2 3 + * 4 5 6 7 + * 8 9 A B C D E F + * + * We walk them depth-first, including virtual positions 10-1F that are + * equivalent of position 1 in child nodes 0-F. + */ + + if (!n) + { + memset(net, 0, v4 ? sizeof(net_addr_ip4) : sizeof(net_addr_ip6)); + return 0; + } + +next_node:; + /* Current node prefix length */ + int nlen = v4 ? n->v4.plen : n->v6.plen; + + /* First, check for accept prefix */ + for (; len < nlen; len++) + if (GET_ACCEPT_BIT(n, v4, len - 1)) + { + if (v4) + net_fill_ip4(net, ip4_and(n->v4.addr, ip4_mkmask(len)), len); + else + net_fill_ip6(net, ip6_and(n->v6.addr, ip6_mkmask(len)), len); + + s->local_pos = pos; + s->accept_length = len + 1; + return 1; + } + +next_pos: + /* Bottom of this node */ + if (pos >= (1 << TRIE_STEP)) + { + const struct f_trie_node *child = GET_CHILD(n, v4, pos - (1 << TRIE_STEP)); + int dir = 0; + + /* No child node */ + if (!child) + { + /* Step up until return from left child (pos is even) */ + do + { + /* Step up from start node */ + if ((s->stack_pos == 0) && (pos == s->start_pos)) + { + s->stack[0] = NULL; + memset(net, 0, v4 ? sizeof(net_addr_ip4) : sizeof(net_addr_ip6)); + return 0; + } + + /* Top of this node */ + if (pos == 1) + { + ASSERT(s->stack_pos); + const struct f_trie_node *old = n; + + /* Move to parent node */ + s->stack_pos--; + n = s->stack[s->stack_pos]; + nlen = v4 ? n->v4.plen : n->v6.plen; + + pos = v4 ? + ip4_getbits(old->v4.addr, nlen, TRIE_STEP) : + ip6_getbits(old->v6.addr, nlen, TRIE_STEP); + pos += (1 << TRIE_STEP); + len = nlen; + + ASSERT(GET_CHILD(n, v4, pos - (1 << TRIE_STEP)) == old); + } + + /* Step up */ + dir = pos % 2; + pos = pos / 2; + } + while (dir); + + /* Continue with step down to the right child */ + pos = 2 * pos + 1; + goto next_pos; + } + + /* Move to child node */ + pos = 1; + len = nlen + TRIE_STEP; + + s->stack_pos++; + n = s->stack[s->stack_pos] = child; + goto next_node; + } + + /* Check for local prefix */ + if (GET_LOCAL_BIT(n, v4, pos)) + { + /* Convert pos to address of local network */ + int x = (pos >= 2) + (pos >= 4) + (pos >= 8); + int y = pos & ((1u << x) - 1); + + if (v4) + net_fill_ip4(net, !x ? n->v4.addr : ip4_setbits(n->v4.addr, nlen + x - 1, y), nlen + x); + else + net_fill_ip6(net, !x ? n->v6.addr : ip6_setbits(n->v6.addr, nlen + x - 1, y), nlen + x); + + s->local_pos = 2 * pos; + s->accept_length = len; + return 1; + } + + /* Step down */ + pos = 2 * pos; + goto next_pos; +} + + static int trie_node_same4(const struct f_trie_node4 *t1, const struct f_trie_node4 *t2) { @@ -392,7 +1014,11 @@ trie_node_same4(const struct f_trie_node4 *t1, const struct f_trie_node4 *t2) (! ip4_equal(t1->accept, t2->accept))) return 0; - return trie_node_same4(t1->c[0], t2->c[0]) && trie_node_same4(t1->c[1], t2->c[1]); + for (uint i = 0; i < (1 << TRIE_STEP); i++) + if (! trie_node_same4(t1->c[i], t2->c[i])) + return 0; + + return 1; } static int @@ -409,7 +1035,11 @@ trie_node_same6(const struct f_trie_node6 *t1, const struct f_trie_node6 *t2) (! ip6_equal(t1->accept, t2->accept))) return 0; - return trie_node_same6(t1->c[0], t2->c[0]) && trie_node_same6(t1->c[1], t2->c[1]); + for (uint i = 0; i < (1 << TRIE_STEP); i++) + if (! trie_node_same6(t1->c[i], t2->c[i])) + return 0; + + return 1; } /** @@ -431,30 +1061,70 @@ trie_same(const struct f_trie *t1, const struct f_trie *t2) return trie_node_same6(&t1->root.v6, &t2->root.v6); } + +static const u8 log2[16] = {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3}; + static void -trie_node_format4(const struct f_trie_node4 *t, buffer *buf) +trie_node_format(const struct f_trie_node *n, buffer *buf, int v4) { - if (t == NULL) + if (n == NULL) return; - if (ip4_nonzero(t->accept)) - buffer_print(buf, "%I4/%d{%I4}, ", t->addr, t->plen, t->accept); + if (v4) + { + if (ip4_nonzero(n->v4.accept)) + buffer_print(buf, "%I4/%d{%I4}, ", n->v4.addr, n->v4.plen, n->v4.accept); + } + else + { + if (ip6_nonzero(n->v6.accept)) + buffer_print(buf, "%I6/%d{%I6}, ", n->v6.addr, n->v6.plen, n->v6.accept); + } - trie_node_format4(t->c[0], buf); - trie_node_format4(t->c[1], buf); -} + int nlen = v4 ? n->v4.plen : n->v6.plen; + uint local = v4 ? n->v4.local : n->v6.local; -static void -trie_node_format6(const struct f_trie_node6 *t, buffer *buf) -{ - if (t == NULL) - return; + for (int i = (nlen ? 0 : 1); i < TRIE_STEP; i++) + if (GET_ACCEPT_BIT(n, v4, nlen + i - 1)) + local &= ~trie_level_mask(1, i); - if (ip6_nonzero(t->accept)) - buffer_print(buf, "%I6/%d{%I6}, ", t->addr, t->plen, t->accept); + for (int pos = 2; local && (pos < (1 << TRIE_STEP)); pos++) + if (local & (1u << pos)) + { + int lvl = log2[pos]; + int plen = nlen + lvl; + + int i; + for (i = 0; lvl + i < TRIE_STEP; i++) + { + uint lmask = trie_level_mask(pos, i); + + if ((local & lmask) != lmask) + break; + + local &= ~lmask; + } + + uint addr_bits = pos & ((1u << lvl) - 1); + uint accept_bits = (1u << i) - 1; + int h = plen + i - 1; + + if (v4) + { + ip4_addr addr = ip4_setbits(n->v4.addr, plen - 1, addr_bits); + ip4_addr mask = ip4_setbits(IP4_NONE, h - 1, accept_bits); + buffer_print(buf, "%I4/%d{%I4}, ", addr, plen, mask); + } + else + { + ip6_addr addr = ip6_setbits(n->v6.addr, plen - 1, addr_bits); + ip6_addr mask = ip6_setbits(IP6_NONE, h - 1, accept_bits); + buffer_print(buf, "%I6/%d{%I6}, ", addr, plen, mask); + } + } - trie_node_format6(t->c[0], buf); - trie_node_format6(t->c[1], buf); + for (int i = 0; i < (1 << TRIE_STEP); i++) + trie_node_format(GET_CHILD(n, v4, i), buf, v4); } /** @@ -472,10 +1142,7 @@ trie_format(const struct f_trie *t, buffer *buf) if (t->zero) buffer_print(buf, "%I/%d, ", t->ipv4 ? IPA_NONE4 : IPA_NONE6, 0); - if (t->ipv4) - trie_node_format4(&t->root.v4, buf); - else - trie_node_format6(&t->root.v6, buf); + trie_node_format(&t->root, buf, t->ipv4); if (buf->pos == buf->end) return; diff --git a/filter/trie_test.c b/filter/trie_test.c index b2b36716..81f38db9 100644 --- a/filter/trie_test.c +++ b/filter/trie_test.c @@ -14,9 +14,12 @@ #include "conf/conf.h" #define TESTS_NUM 10 -#define PREFIXES_NUM 10 +#define PREFIXES_NUM 32 #define PREFIX_TESTS_NUM 10000 +#define PREFIX_BENCH_NUM 100000000 +#define TRIE_BUFFER_SIZE 1024 +#define TEST_BUFFER_SIZE (1024*1024) #define BIG_BUFFER_SIZE 10000 /* Wrapping structure for storing f_prefixes structures in list */ @@ -31,144 +34,813 @@ xrandom(u32 max) return (bt_random() % max); } +static inline uint +get_exp_random(void) +{ + uint r, n = 0; + + for (r = bt_random(); r & 1; r = r >> 1) + n++; + + return n; +} + static int -is_prefix_included(list *prefixes, struct f_prefix *needle) +compare_prefixes(const void *a, const void *b) { - struct f_prefix_node *n; - WALK_LIST(n, *prefixes) - { - ip6_addr cmask = ip6_mkmask(MIN(n->prefix.net.pxlen, needle->net.pxlen)); + return net_compare(&((const struct f_prefix *) a)->net, + &((const struct f_prefix *) b)->net); +} - ip6_addr ip = net6_prefix(&n->prefix.net); - ip6_addr needle_ip = net6_prefix(&needle->net); +static inline int +matching_ip4_nets(const net_addr_ip4 *a, const net_addr_ip4 *b) +{ + ip4_addr cmask = ip4_mkmask(MIN(a->pxlen, b->pxlen)); + return ip4_compare(ip4_and(a->prefix, cmask), ip4_and(b->prefix, cmask)) == 0; +} + +static inline int +matching_ip6_nets(const net_addr_ip6 *a, const net_addr_ip6 *b) +{ + ip6_addr cmask = ip6_mkmask(MIN(a->pxlen, b->pxlen)); + return ip6_compare(ip6_and(a->prefix, cmask), ip6_and(b->prefix, cmask)) == 0; +} + +static inline int +matching_nets(const net_addr *a, const net_addr *b) +{ + if (a->type != b->type) + return 0; + + return (a->type == NET_IP4) ? + matching_ip4_nets((const net_addr_ip4 *) a, (const net_addr_ip4 *) b) : + matching_ip6_nets((const net_addr_ip6 *) a, (const net_addr_ip6 *) b); +} - if ((ipa_compare(ipa_and(ip, cmask), ipa_and(needle_ip, cmask)) == 0) && - (n->prefix.lo <= needle->net.pxlen) && (needle->net.pxlen <= n->prefix.hi)) +static int +is_prefix_included(list *prefixes, const net_addr *needle) +{ + struct f_prefix_node *n; + WALK_LIST(n, *prefixes) + if (matching_nets(&n->prefix.net, needle) && + (n->prefix.lo <= needle->pxlen) && (needle->pxlen <= n->prefix.hi)) { - bt_debug("FOUND\t" PRIip6 "/%d %d-%d\n", ARGip6(net6_prefix(&n->prefix.net)), n->prefix.net.pxlen, n->prefix.lo, n->prefix.hi); + char buf[64]; + bt_format_net(buf, 64, &n->prefix.net); + bt_debug("FOUND %s %d-%d\n", buf, n->prefix.lo, n->prefix.hi); + return 1; /* OK */ } - } + return 0; /* FAIL */ } -static struct f_prefix -get_random_ip6_prefix(void) +static void +get_random_net(net_addr *net, int v6) { - struct f_prefix p; - u8 pxlen = xrandom(120)+8; - ip6_addr ip6 = ip6_build(bt_random(),bt_random(),bt_random(),bt_random()); - net_addr_ip6 net6 = NET_ADDR_IP6(ip6, pxlen); + if (!v6) + { + uint pxlen = xrandom(24)+8; + ip4_addr ip4 = ip4_from_u32((u32) bt_random()); + net_fill_ip4(net, ip4_and(ip4, ip4_mkmask(pxlen)), pxlen); + } + else + { + uint pxlen = xrandom(120)+8; + ip6_addr ip6 = ip6_build(bt_random(), bt_random(), bt_random(), bt_random()); + net_fill_ip6(net, ip6_and(ip6, ip6_mkmask(pxlen)), pxlen); + } +} - p.net = *((net_addr*) &net6); +static void +get_random_prefix(struct f_prefix *px, int v6, int tight) +{ + get_random_net(&px->net, v6); + + if (tight) + { + px->lo = px->hi = px->net.pxlen; + } + else if (bt_random() % 2) + { + px->lo = 0; + px->hi = px->net.pxlen; + } + else + { + px->lo = px->net.pxlen; + px->hi = net_max_prefix_length[px->net.type]; + } +} + +static void +get_random_ip4_subnet(net_addr_ip4 *net, const net_addr_ip4 *src, int pxlen) +{ + *net = NET_ADDR_IP4(ip4_and(src->prefix, ip4_mkmask(pxlen)), pxlen); + + if (pxlen > src->pxlen) + { + ip4_addr rnd = ip4_from_u32((u32) bt_random()); + ip4_addr mask = ip4_xor(ip4_mkmask(src->pxlen), ip4_mkmask(pxlen)); + net->prefix = ip4_or(net->prefix, ip4_and(rnd, mask)); + } +} + +static void +get_random_ip6_subnet(net_addr_ip6 *net, const net_addr_ip6 *src, int pxlen) +{ + *net = NET_ADDR_IP6(ip6_and(src->prefix, ip6_mkmask(pxlen)), pxlen); + + if (pxlen > src->pxlen) + { + ip6_addr rnd = ip6_build(bt_random(), bt_random(), bt_random(), bt_random()); + ip6_addr mask = ip6_xor(ip6_mkmask(src->pxlen), ip6_mkmask(pxlen)); + net->prefix = ip6_or(net->prefix, ip6_and(rnd, mask)); + } +} + +static void +get_random_subnet(net_addr *net, const net_addr *src, int pxlen) +{ + if (src->type == NET_IP4) + get_random_ip4_subnet((net_addr_ip4 *) net, (const net_addr_ip4 *) src, pxlen); + else + get_random_ip6_subnet((net_addr_ip6 *) net, (const net_addr_ip6 *) src, pxlen); +} + +static void +get_inner_net(net_addr *net, const struct f_prefix *src) +{ + int pxlen, step; if (bt_random() % 2) { - p.lo = 0; - p.hi = p.net.pxlen; + step = get_exp_random(); + step = MIN(step, src->hi - src->lo); + pxlen = (bt_random() % 2) ? (src->lo + step) : (src->hi - step); } else + pxlen = src->lo + bt_random() % (src->hi - src->lo + 1); + + get_random_subnet(net, &src->net, pxlen); +} + +static void +swap_random_bits_ip4(net_addr_ip4 *net, int num) +{ + for (int i = 0; i < num; i++) { - p.lo = p.net.pxlen; - p.hi = net_max_prefix_length[p.net.type]; + ip4_addr swap = IP4_NONE; + ip4_setbit(&swap, bt_random() % net->pxlen); + net->prefix = ip4_xor(net->prefix, swap); } +} - return p; +static void +swap_random_bits_ip6(net_addr_ip6 *net, int num) +{ + for (int i = 0; i < num; i++) + { + ip6_addr swap = IP6_NONE; + ip6_setbit(&swap, bt_random() % net->pxlen); + net->prefix = ip6_xor(net->prefix, swap); + } +} + +static void +swap_random_bits(net_addr *net, int num) +{ + if (net->type == NET_IP4) + swap_random_bits_ip4((net_addr_ip4 *) net, num); + else + swap_random_bits_ip6((net_addr_ip6 *) net, num); } static void -generate_random_ipv6_prefixes(list *prefixes) +get_outer_net(net_addr *net, const struct f_prefix *src) { - int i; - for (i = 0; i < PREFIXES_NUM; i++) + int pxlen, step; + int inside = 0; + int max = net_max_prefix_length[src->net.type]; + + if ((src->lo > 0) && (bt_random() % 3)) + { + step = 1 + get_exp_random(); + step = MIN(step, src->lo); + pxlen = src->lo - step; + } + else if ((src->hi < max) && (bt_random() % 2)) { - struct f_prefix f = get_random_ip6_prefix(); + step = 1 + get_exp_random(); + step = MIN(step, max - src->hi); + pxlen = src->hi + step; + } + else + { + pxlen = src->lo + bt_random() % (src->hi - src->lo + 1); + inside = 1; + } - struct f_prefix_node *px = calloc(1, sizeof(struct f_prefix_node)); - px->prefix = f; + get_random_subnet(net, &src->net, pxlen); + + /* Perhaps swap some bits in prefix */ + if ((net->pxlen > 0) && (inside || (bt_random() % 4))) + swap_random_bits(net, 1 + get_exp_random()); +} - bt_debug("ADD\t" PRIip6 "/%d %d-%d\n", ARGip6(net6_prefix(&px->prefix.net)), px->prefix.net.pxlen, px->prefix.lo, px->prefix.hi); +static list * +make_random_prefix_list(int num, int v6, int tight) +{ + list *prefixes = tmp_allocz(sizeof(struct f_prefix_node)); + init_list(prefixes); + + for (int i = 0; i < num; i++) + { + struct f_prefix_node *px = tmp_allocz(sizeof(struct f_prefix_node)); + get_random_prefix(&px->prefix, v6, tight); add_tail(prefixes, &px->n); + + char buf[64]; + bt_format_net(buf, 64, &px->prefix.net); + bt_debug("ADD %s{%d,%d}\n", buf, px->prefix.lo, px->prefix.hi); + } + + return prefixes; +} + +static struct f_trie * +make_trie_from_prefix_list(list *prefixes) +{ + struct f_trie *trie = f_new_trie(tmp_linpool, 0); + + struct f_prefix_node *n; + WALK_LIST(n, *prefixes) + trie_add_prefix(trie, &n->prefix.net, n->prefix.lo, n->prefix.hi); + + return trie; +} + +/* + * Read sequence of prefixes from file handle and return prefix list. + * Each prefix is on one line, sequence terminated by empty line or eof. + * Arg @plus means prefix should include all longer ones. + */ +static list * +read_prefix_list(FILE *f, int v6, int plus) +{ + ASSERT(!v6); + + uint a0, a1, a2, a3, pl; + char s[32]; + int n; + + list *pxlist = tmp_allocz(sizeof(struct f_prefix_node)); + init_list(pxlist); + + errno = 0; + while (fgets(s, 32, f)) + { + if (s[0] == '\n') + return pxlist; + + n = sscanf(s, "%u.%u.%u.%u/%u", &a0, &a1, &a2, &a3, &pl); + + if (n != 5) + bt_abort_msg("Invalid content of trie_data"); + + struct f_prefix_node *px = tmp_allocz(sizeof(struct f_prefix_node)); + net_fill_ip4(&px->prefix.net, ip4_build(a0, a1, a2, a3), pl); + px->prefix.lo = pl; + px->prefix.hi = plus ? IP4_MAX_PREFIX_LENGTH : pl; + add_tail(pxlist, &px->n); + + char buf[64]; + bt_format_net(buf, 64, &px->prefix.net); + bt_debug("ADD %s{%d,%d}\n", buf, px->prefix.lo, px->prefix.hi); } + + bt_syscall(errno, "fgets()"); + return EMPTY_LIST(*pxlist) ? NULL : pxlist; } +/* + * Open file, read multiple sequences of prefixes from it. Fill @data with + * prefix lists and @trie with generated tries. Return number of sequences / + * tries. Use separate linpool @lp0 for prefix lists and @lp1 for tries. + * Arg @plus means prefix should include all longer ones. + */ static int -t_match_net(void) +read_prefix_file(const char *filename, int plus, + list *data[], struct f_trie *trie[]) { - bt_bird_init(); - bt_config_parse(BT_CONFIG_SIMPLE); + FILE *f = fopen(filename, "r"); + bt_syscall(!f, "fopen(%s)", filename); - uint round; - for (round = 0; round < TESTS_NUM; round++) + int n = 0; + list *pxlist; + while (pxlist = read_prefix_list(f, 0, plus)) { - list prefixes; /* of structs f_extended_prefix */ - init_list(&prefixes); - struct f_trie *trie = f_new_trie(config->mem, 0); + data[n] = pxlist; + trie[n] = make_trie_from_prefix_list(pxlist); + bt_debug("NEXT\n"); + n++; + } - generate_random_ipv6_prefixes(&prefixes); - struct f_prefix_node *n; - WALK_LIST(n, prefixes) + fclose(f); + bt_debug("DONE reading %d tries\n", n); + + return n; +} + +/* + * Select random subset of @dn prefixes from prefix list @src of length @sn, + * and store them to buffer @dst (of size @dn). Prefixes may be chosen multiple + * times. Randomize order of prefixes in @dst buffer. + */ +static void +select_random_prefix_subset(list *src[], net_addr dst[], int sn, int dn) +{ + int pn = 0; + + if (!dn) + return; + + /* Compute total prefix number */ + for (int i = 0; i < sn; i++) + pn += list_length(src[i]); + + /* Change of selecting a prefix */ + int rnd = (pn / dn) + 10; + int n = 0; + + /* Iterate indefinitely over src array */ + for (int i = 0; 1; i++, i = (i < sn) ? i : 0) + { + struct f_prefix_node *px; + WALK_LIST(px, *src[i]) { - trie_add_prefix(trie, &n->prefix.net, n->prefix.lo, n->prefix.hi); + if (xrandom(rnd) != 0) + continue; + + net_copy(&dst[n], &px->prefix.net); + n++; + + /* We have enough */ + if (n == dn) + goto done; } + } - int i; - for (i = 0; i < PREFIX_TESTS_NUM; i++) +done: + /* Shuffle networks */ + for (int i = 0; i < dn; i++) + { + int j = xrandom(dn); + + if (i == j) + continue; + + net_addr tmp; + net_copy(&tmp, &dst[i]); + net_copy(&dst[i], &dst[j]); + net_copy(&dst[j], &tmp); + } +} + +/* Fill @dst buffer with @dn randomly generated /32 prefixes */ +static void +make_random_addresses(net_addr dst[], int dn) +{ + for (int i = 0; i < dn; i++) + net_fill_ip4(&dst[i], ip4_from_u32((u32) bt_random()), IP4_MAX_PREFIX_LENGTH); +} + +static void +test_match_net(list *prefixes, struct f_trie *trie, const net_addr *net) +{ + char buf[64]; + bt_format_net(buf, 64, net); + bt_debug("TEST %s\n", buf); + + int should_be = is_prefix_included(prefixes, net); + int is_there = trie_match_net(trie, net); + + bt_assert_msg(should_be == is_there, "Prefix %s %s match", buf, + (should_be ? "should" : "should not")); +} + +static int +t_match_random_net(void) +{ + int v6 = 0; + for (int round = 0; round < TESTS_NUM; round++) + { + list *prefixes = make_random_prefix_list(PREFIXES_NUM, v6, 0); + struct f_trie *trie = make_trie_from_prefix_list(prefixes); + + for (int i = 0; i < PREFIX_TESTS_NUM; i++) + { + net_addr net; + get_random_net(&net, v6); + test_match_net(prefixes, trie, &net); + } + + v6 = !v6; + tmp_flush(); + } + + return 1; +} + +static int +t_match_inner_net(void) +{ + int v6 = 0; + for (int round = 0; round < TESTS_NUM; round++) + { + list *prefixes = make_random_prefix_list(PREFIXES_NUM, v6, 0); + struct f_trie *trie = make_trie_from_prefix_list(prefixes); + + struct f_prefix_node *n = HEAD(*prefixes); + for (int i = 0; i < PREFIX_TESTS_NUM; i++) { - struct f_prefix f = get_random_ip6_prefix(); - bt_debug("TEST\t" PRIip6 "/%d\n", ARGip6(net6_prefix(&f.net)), f.net.pxlen); + net_addr net; + get_inner_net(&net, &n->prefix); + test_match_net(prefixes, trie, &net); - int should_be = is_prefix_included(&prefixes, &f); - int is_there = trie_match_net(trie, &f.net); - bt_assert_msg(should_be == is_there, "Prefix " PRIip6 "/%d %s", ARGip6(net6_prefix(&f.net)), f.net.pxlen, (should_be ? "should be found in trie" : "should not be found in trie")); + n = NODE_VALID(NODE_NEXT(n)) ? NODE_NEXT(n) : HEAD(*prefixes); } - struct f_prefix_node *nxt; - WALK_LIST_DELSAFE(n, nxt, prefixes) + v6 = !v6; + tmp_flush(); + } + + return 1; +} + +static int +t_match_outer_net(void) +{ + int v6 = 0; + for (int round = 0; round < TESTS_NUM; round++) + { + list *prefixes = make_random_prefix_list(PREFIXES_NUM, v6, 0); + struct f_trie *trie = make_trie_from_prefix_list(prefixes); + + struct f_prefix_node *n = HEAD(*prefixes); + for (int i = 0; i < PREFIX_TESTS_NUM; i++) { - free(n); + net_addr net; + get_outer_net(&net, &n->prefix); + test_match_net(prefixes, trie, &net); + + n = NODE_VALID(NODE_NEXT(n)) ? NODE_NEXT(n) : HEAD(*prefixes); } + + v6 = !v6; + tmp_flush(); } - bt_bird_cleanup(); + v6 = !v6; return 1; } +/* + * Read prefixes from @filename, build set of tries, prepare test data and do + * PREFIX_BENCH_NUM trie lookups. With @plus = 0, use random subset of known + * prefixes as test data, with @plus = 1, use randomly generated /32 prefixes + * as test data. + */ +static int +benchmark_trie_dataset(const char *filename, int plus) +{ + int n = 0; + list *data[TRIE_BUFFER_SIZE]; + struct f_trie *trie[TRIE_BUFFER_SIZE]; + net_addr *nets; + + bt_reset_suite_case_timer(); + bt_log_suite_case_result(1, "Reading %s", filename, n); + n = read_prefix_file(filename, plus, data, trie); + bt_log_suite_case_result(1, "Read prefix data, %d lists, ", n); + + size_t trie_size = rmemsize(tmp_linpool).effective * 1000 / (1024*1024); + bt_log_suite_case_result(1, "Trie size %u.%03u MB", + (uint) (trie_size / 1000), (uint) (trie_size % 1000)); + + int t = PREFIX_BENCH_NUM / n; + int tb = MIN(t, TEST_BUFFER_SIZE); + nets = tmp_alloc(tb * sizeof(net_addr)); + + if (!plus) + select_random_prefix_subset(data, nets, n, tb); + else + make_random_addresses(nets, tb); + + bt_log_suite_case_result(1, "Make test data, %d (%d) tests", t, tb); + bt_reset_suite_case_timer(); + + /* + int match = 0; + for (int i = 0; i < t; i++) + for (int j = 0; j < n; j++) + test_match_net(data[j], trie[j], &nets[i]); + */ + + int match = 0; + for (int i = 0; i < t; i++) + for (int j = 0; j < n; j++) + if (trie_match_net(trie[j], &nets[i % TEST_BUFFER_SIZE])) + match++; + + bt_log_suite_case_result(1, "Matching done, %d / %d matches", match, t * n); + + tmp_flush(); + return 1; +} + +static int UNUSED +t_bench_trie_datasets_subset(void) +{ + /* Specific datasets, not included */ + benchmark_trie_dataset("trie-data-bgp-1", 0); + benchmark_trie_dataset("trie-data-bgp-10", 0); + benchmark_trie_dataset("trie-data-bgp-100", 0); + benchmark_trie_dataset("trie-data-bgp-1000", 0); + + return 1; +} + +static int UNUSED +t_bench_trie_datasets_random(void) +{ + /* Specific datasets, not included */ + benchmark_trie_dataset("trie-data-bgp-1", 1); + benchmark_trie_dataset("trie-data-bgp-10", 1); + benchmark_trie_dataset("trie-data-bgp-100", 1); + benchmark_trie_dataset("trie-data-bgp-1000", 1); + + return 1; +} + + static int t_trie_same(void) { - bt_bird_init(); - bt_config_parse(BT_CONFIG_SIMPLE); + int v6 = 0; + for (int round = 0; round < TESTS_NUM*4; round++) + { + list *prefixes = make_random_prefix_list(100 * PREFIXES_NUM, v6, 0); + struct f_trie *trie1 = f_new_trie(tmp_linpool, 0); + struct f_trie *trie2 = f_new_trie(tmp_linpool, 0); - int round; - for (round = 0; round < TESTS_NUM*4; round++) + struct f_prefix_node *n; + WALK_LIST(n, *prefixes) + trie_add_prefix(trie1, &n->prefix.net, n->prefix.lo, n->prefix.hi); + + WALK_LIST_BACKWARDS(n, *prefixes) + trie_add_prefix(trie2, &n->prefix.net, n->prefix.lo, n->prefix.hi); + + bt_assert(trie_same(trie1, trie2)); + + v6 = !v6; + tmp_flush(); + } + + return 1; +} + +static inline void +log_networks(const net_addr *a, const net_addr *b) +{ + if (bt_verbose >= BT_VERBOSE_ABSOLUTELY_ALL) { - struct f_trie * trie1 = f_new_trie(config->mem, 0); - struct f_trie * trie2 = f_new_trie(config->mem, 0); + char buf0[64]; + char buf1[64]; + bt_format_net(buf0, 64, a); + bt_format_net(buf1, 64, b); + bt_debug("Found %s expected %s\n", buf0, buf1); + } +} - list prefixes; /* a list of f_extended_prefix structures */ - init_list(&prefixes); - int i; - for (i = 0; i < 100; i++) - generate_random_ipv6_prefixes(&prefixes); +static int +t_trie_walk(void) +{ + for (int round = 0; round < TESTS_NUM*8; round++) + { + int level = round / TESTS_NUM; + int v6 = level % 2; + int num = PREFIXES_NUM * (int[]){1, 10, 100, 1000}[level / 2]; + int pos = 0, end = 0; + list *prefixes = make_random_prefix_list(num, v6, 1); + struct f_trie *trie = make_trie_from_prefix_list(prefixes); + struct f_prefix *pxset = malloc((num + 1) * sizeof(struct f_prefix)); struct f_prefix_node *n; - WALK_LIST(n, prefixes) + WALK_LIST(n, *prefixes) + pxset[pos++] = n->prefix; + memset(&pxset[pos], 0, sizeof (struct f_prefix)); + + qsort(pxset, num, sizeof(struct f_prefix), compare_prefixes); + + + /* Full walk */ + bt_debug("Full walk (round %d, %d nets)\n", round, num); + + pos = 0; + uint pxc = 0; + TRIE_WALK(trie, net, NULL) { - trie_add_prefix(trie1, &n->prefix.net, n->prefix.lo, n->prefix.hi); + log_networks(&net, &pxset[pos].net); + bt_assert(net_equal(&net, &pxset[pos].net)); + + /* Skip possible duplicates */ + while (net_equal(&pxset[pos].net, &pxset[pos + 1].net)) + pos++; + + pos++; + pxc++; } - WALK_LIST_BACKWARDS(n, prefixes) + TRIE_WALK_END; + + bt_assert(pos == num); + bt_assert(pxc == trie->prefix_count); + bt_debug("Full walk done\n"); + + + /* Prepare net for subnet walk - start with random prefix */ + pos = bt_random() % num; + end = pos + (int[]){2, 2, 3, 4}[level / 2]; + end = MIN(end, num); + + struct f_prefix from = pxset[pos]; + + /* Find a common superprefix to several subsequent prefixes */ + for (; pos < end; pos++) { - trie_add_prefix(trie2, &n->prefix.net, n->prefix.lo, n->prefix.hi); + if (net_equal(&from.net, &pxset[pos].net)) + continue; + + int common = !v6 ? + ip4_pxlen(net4_prefix(&from.net), net4_prefix(&pxset[pos].net)) : + ip6_pxlen(net6_prefix(&from.net), net6_prefix(&pxset[pos].net)); + from.net.pxlen = MIN(from.net.pxlen, common); + + if (!v6) + ((net_addr_ip4 *) &from.net)->prefix = + ip4_and(net4_prefix(&from.net), net4_prefix(&pxset[pos].net)); + else + ((net_addr_ip6 *) &from.net)->prefix = + ip6_and(net6_prefix(&from.net), net6_prefix(&pxset[pos].net)); } - bt_assert(trie_same(trie1, trie2)); + /* Fix irrelevant bits */ + if (!v6) + ((net_addr_ip4 *) &from.net)->prefix = + ip4_and(net4_prefix(&from.net), ip4_mkmask(net4_pxlen(&from.net))); + else + ((net_addr_ip6 *) &from.net)->prefix = + ip6_and(net6_prefix(&from.net), ip6_mkmask(net6_pxlen(&from.net))); + + + /* Find initial position for final prefix */ + for (pos = 0; pos < num; pos++) + if (compare_prefixes(&pxset[pos], &from) >= 0) + break; + + int p0 = pos; + char buf0[64]; + bt_format_net(buf0, 64, &from.net); + bt_debug("Subnet walk for %s (round %d, %d nets)\n", buf0, round, num); + + /* Subnet walk */ + TRIE_WALK(trie, net, &from.net) + { + log_networks(&net, &pxset[pos].net); + bt_assert(net_equal(&net, &pxset[pos].net)); + bt_assert(net_in_netX(&net, &from.net)); + + /* Skip possible duplicates */ + while (net_equal(&pxset[pos].net, &pxset[pos + 1].net)) + pos++; - struct f_prefix_node *nxt; - WALK_LIST_DELSAFE(n, nxt, prefixes) + pos++; + } + TRIE_WALK_END; + + bt_assert((pos == num) || !net_in_netX(&pxset[pos].net, &from.net)); + bt_debug("Subnet walk done for %s (found %d nets)\n", buf0, pos - p0); + + tmp_flush(); + } + + return 1; +} + +static int +find_covering_nets(struct f_prefix *prefixes, int num, const net_addr *net, net_addr *found) +{ + struct f_prefix key; + net_addr *n = &key.net; + int found_num = 0; + + net_copy(n, net); + + while (1) + { + struct f_prefix *px = + bsearch(&key, prefixes, num, sizeof(struct f_prefix), compare_prefixes); + + if (px) { - free(n); + net_copy(&found[found_num], n); + found_num++; } + + if (n->pxlen == 0) + return found_num; + + n->pxlen--; + + if (n->type == NET_IP4) + ip4_clrbit(&((net_addr_ip4 *) n)->prefix, n->pxlen); + else + ip6_clrbit(&((net_addr_ip6 *) n)->prefix, n->pxlen); + } +} + +static int +t_trie_walk_to_root(void) +{ + for (int round = 0; round < TESTS_NUM * 4; round++) + { + int level = round / TESTS_NUM; + int v6 = level % 2; + int num = PREFIXES_NUM * (int[]){32, 512}[level / 2]; + int pos = 0; + int st = 0, sn = 0, sm = 0; + + list *prefixes = make_random_prefix_list(num, v6, 1); + struct f_trie *trie = make_trie_from_prefix_list(prefixes); + struct f_prefix *pxset = malloc((num + 1) * sizeof(struct f_prefix)); + + struct f_prefix_node *pxn; + WALK_LIST(pxn, *prefixes) + pxset[pos++] = pxn->prefix; + memset(&pxset[pos], 0, sizeof (struct f_prefix)); + + qsort(pxset, num, sizeof(struct f_prefix), compare_prefixes); + + int i; + for (i = 0; i < (PREFIX_TESTS_NUM / 10); i++) + { + net_addr from; + get_random_net(&from, v6); + + net_addr found[129]; + int found_num = find_covering_nets(pxset, num, &from, found); + int n = 0; + + if (bt_verbose >= BT_VERBOSE_ABSOLUTELY_ALL) + { + char buf[64]; + bt_format_net(buf, 64, &from); + bt_debug("Lookup for %s (expect %d)\n", buf, found_num); + } + + /* Walk to root, separate for IPv4 and IPv6 */ + if (!v6) + { + TRIE_WALK_TO_ROOT_IP4(trie, (net_addr_ip4 *) &from, net) + { + log_networks((net_addr *) &net, &found[n]); + bt_assert((n < found_num) && net_equal((net_addr *) &net, &found[n])); + n++; + } + TRIE_WALK_TO_ROOT_END; + } + else + { + TRIE_WALK_TO_ROOT_IP6(trie, (net_addr_ip6 *) &from, net) + { + log_networks((net_addr *) &net, &found[n]); + bt_assert((n < found_num) && net_equal((net_addr *) &net, &found[n])); + n++; + } + TRIE_WALK_TO_ROOT_END; + } + + bt_assert(n == found_num); + + /* Stats */ + st += n; + sn += !!n; + sm = MAX(sm, n); + } + + bt_debug("Success in %d / %d, sum %d, max %d\n", sn, i, st, sm); + + tmp_flush(); } return 1; @@ -178,9 +850,18 @@ int main(int argc, char *argv[]) { bt_init(argc, argv); + bt_bird_init(); + bt_config_parse(BT_CONFIG_SIMPLE); - bt_test_suite(t_match_net, "Testing random prefix matching"); + bt_test_suite(t_match_random_net, "Testing random prefix matching"); + bt_test_suite(t_match_inner_net, "Testing random inner prefix matching"); + bt_test_suite(t_match_outer_net, "Testing random outer prefix matching"); bt_test_suite(t_trie_same, "A trie filled forward should be same with a trie filled backward."); + bt_test_suite(t_trie_walk, "Testing TRIE_WALK() on random tries"); + bt_test_suite(t_trie_walk_to_root, "Testing TRIE_WALK_TO_ROOT() on random tries"); + + // bt_test_suite(t_bench_trie_datasets_subset, "Benchmark tries from datasets by random subset of nets"); + // bt_test_suite(t_bench_trie_datasets_random, "Benchmark tries from datasets by generated addresses"); return bt_exit_value(); } diff --git a/lib/Makefile b/lib/Makefile index 98c5db3c..f4ade9a6 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -1,7 +1,7 @@ -src := bitmap.c bitops.c blake2s.c blake2b.c checksum.c event.c flowspec.c idm.c ip.c lists.c mac.c md5.c mempool.c net.c patmatch.c printf.c resource.c rcu.c sha1.c sha256.c sha512.c slab.c slists.c strtoul.c tbf.c timer.c xmalloc.c +src := a-path.c a-set.c bitmap.c bitops.c blake2s.c blake2b.c checksum.c event.c flowspec.c idm.c ip.c lists.c mac.c md5.c mempool.c net.c patmatch.c printf.c rcu.c resource.c sha1.c sha256.c sha512.c slab.c slists.c strtoul.c tbf.c timer.c xmalloc.c obj := $(src-o-files) $(all-daemon) -tests_src := bitmap_test.c heap_test.c buffer_test.c event_test.c flowspec_test.c bitops_test.c patmatch_test.c fletcher16_test.c slist_test.c checksum_test.c lists_test.c mac_test.c ip_test.c hash_test.c printf_test.c +tests_src := a-set_test.c a-path_test.c bitmap_test.c heap_test.c buffer_test.c event_test.c flowspec_test.c bitops_test.c patmatch_test.c fletcher16_test.c slist_test.c checksum_test.c lists_test.c mac_test.c ip_test.c hash_test.c printf_test.c slab_test.c type_test.c tests_targets := $(tests_targets) $(tests-target-files) tests_objs := $(tests_objs) $(src-o-files) diff --git a/nest/a-path.c b/lib/a-path.c index 2e34a3d1..a7a22e40 100644 --- a/nest/a-path.c +++ b/lib/a-path.c @@ -8,8 +8,8 @@ */ #include "nest/bird.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "lib/resource.h" #include "lib/unaligned.h" #include "lib/string.h" @@ -591,7 +591,7 @@ as_path_match_set(const struct adata *path, const struct f_tree *set) p += 2; for (i=0; i<n; i++) { - struct f_val v = {T_INT, .val.i = get_as(p)}; + struct f_val v = { .type = T_INT, .val.i = get_as(p)}; if (find_tree(set, &v)) return 1; p += BS; @@ -602,8 +602,10 @@ as_path_match_set(const struct adata *path, const struct f_tree *set) } const struct adata * -as_path_filter(struct linpool *pool, const struct adata *path, const struct f_tree *set, u32 key, int pos) +as_path_filter(struct linpool *pool, const struct adata *path, const struct f_val *set, int pos) { + ASSERT((set->type == T_SET) || (set->type == T_INT)); + if (!path) return NULL; @@ -629,13 +631,13 @@ as_path_filter(struct linpool *pool, const struct adata *path, const struct f_tr u32 as = get_as(p); int match; - if (set) + if (set->type == T_SET) { - struct f_val v = {T_INT, .val.i = as}; - match = !!find_tree(set, &v); + struct f_val v = { .type = T_INT, .val.i = as}; + match = !!find_tree(set->val.t, &v); } - else - match = (as == key); + else /* T_INT */ + match = (as == set->val.i); if (match == pos) { @@ -667,6 +669,35 @@ as_path_filter(struct linpool *pool, const struct adata *path, const struct f_tr return res; } +int +as_path_walk(const struct adata *path, uint *pos, uint *val) +{ + if (!path) + return 0; + + const u8 *p = path->data; + const u8 *q = p + path->length; + uint n, x = *pos; + + while (p < q) + { + n = p[1]; + p += 2; + + if (x < n) + { + *val = get_as(p + x * BS); + *pos += 1; + return 1; + } + + p += n * BS; + x -= n; + } + + return 0; +} + struct pm_pos { diff --git a/nest/a-path_test.c b/lib/a-path_test.c index 2533dbae..08c6c96c 100644 --- a/nest/a-path_test.c +++ b/lib/a-path_test.c @@ -9,10 +9,10 @@ #include "test/birdtest.h" #include "test/bt-utils.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "lib/resource.h" -#include "lib/io-loop.h" +#include "filter/data.h" #define TESTS_NUM 30 #define AS_PATH_LENGTH 1000 @@ -31,14 +31,13 @@ t_as_path_match(void) struct adata *as_path = &empty_as_path; u32 first_prepended, last_prepended; first_prepended = last_prepended = 0; - struct linpool *lp = lp_new_default(&root_pool); struct f_path_mask *mask = alloca(sizeof(struct f_path_mask) + AS_PATH_LENGTH * sizeof(struct f_path_mask_item)); mask->len = AS_PATH_LENGTH; for (int i = AS_PATH_LENGTH - 1; i >= 0; i--) { u32 val = bt_random(); - as_path = as_path_prepend(lp, as_path, val); + as_path = as_path_prepend(tmp_linpool, as_path, val); bt_debug("Prepending ASN: %10u \n", val); if (i == 0) @@ -60,7 +59,7 @@ t_as_path_match(void) bt_assert(as_path_get_last(as_path, &asn)); bt_assert_msg(asn == first_prepended, "as_path_get_last() should return the first prepended ASN"); - rfree(lp); + tmp_flush(); } return 1; @@ -71,22 +70,21 @@ t_path_format(void) { struct adata empty_as_path = {}; struct adata *as_path = &empty_as_path; - struct linpool *lp = lp_new_default(&root_pool); uint i; for (i = 4294967285; i <= 4294967294; i++) { - as_path = as_path_prepend(lp, as_path, i); + as_path = as_path_prepend(tmp_linpool, as_path, i); bt_debug("Prepending ASN: %10u \n", i); } -#define BUFFER_SIZE 120 - byte buf[BUFFER_SIZE] = {}; +#define T_BUFFER_SIZE 120 + byte buf[T_BUFFER_SIZE] = {}; - as_path_format(&empty_as_path, buf, BUFFER_SIZE); + as_path_format(&empty_as_path, buf, T_BUFFER_SIZE); bt_assert_msg(strcmp(buf, "") == 0, "Buffer(%zu): '%s'", strlen(buf), buf); - as_path_format(as_path, buf, BUFFER_SIZE); + as_path_format(as_path, buf, T_BUFFER_SIZE); bt_assert_msg(strcmp(buf, "4294967294 4294967293 4294967292 4294967291 4294967290 4294967289 4294967288 4294967287 4294967286 4294967285") == 0, "Buffer(%zu): '%s'", strlen(buf), buf); #define SMALL_BUFFER_SIZE 25 @@ -94,7 +92,7 @@ t_path_format(void) as_path_format(as_path, buf2, SMALL_BUFFER_SIZE); bt_assert_msg(strcmp(buf2, "4294967294 42...") == 0, "Small Buffer(%zu): '%s'", strlen(buf2), buf2); - rfree(lp); + tmp_flush(); return 1; } @@ -115,7 +113,6 @@ t_path_include(void) { struct adata empty_as_path = {}; struct adata *as_path = &empty_as_path; - struct linpool *lp = lp_new_default(&root_pool); u32 as_nums[AS_PATH_LENGTH] = {}; int i; @@ -123,7 +120,7 @@ t_path_include(void) { u32 val = bt_random(); as_nums[i] = val; - as_path = as_path_prepend(lp, as_path, val); + as_path = as_path_prepend(tmp_linpool, as_path, val); } for (i = 0; i < AS_PATH_LENGTH; i++) @@ -131,8 +128,9 @@ t_path_include(void) int counts_of_contains = count_asn_in_array(as_nums, as_nums[i]); bt_assert_msg(as_path_contains(as_path, as_nums[i], counts_of_contains), "AS Path should contains %d-times number %d", counts_of_contains, as_nums[i]); - bt_assert(as_path_filter(lp, as_path, NULL, as_nums[i], 0) != NULL); - bt_assert(as_path_filter(lp, as_path, NULL, as_nums[i], 1) != NULL); + struct f_val v = { .type = T_INT, .val.i = as_nums[i] }; + bt_assert(as_path_filter(tmp_linpool, as_path, &v, 0) != NULL); + bt_assert(as_path_filter(tmp_linpool, as_path, &v, 1) != NULL); } for (i = 0; i < 10000; i++) @@ -147,7 +145,7 @@ t_path_include(void) bt_assert_msg(result == 0, "As path should not contain the number %u", test_val); } - rfree(lp); + tmp_flush(); return 1; } @@ -158,12 +156,11 @@ t_as_path_converting(void) { struct adata empty_as_path = {}; struct adata *as_path = &empty_as_path; - struct linpool *lp = lp_new_default(&root_pool); #define AS_PATH_LENGTH_FOR_CONVERTING_TEST 10 int i; for (i = 0; i < AS_PATH_LENGTH_FOR_CONVERTING_TEST; i++) - as_path = as_path_prepend(lp, as_path, i); + as_path = as_path_prepend(tmp_linpool, as_path, i); bt_debug("data length: %u \n", as_path->length); @@ -197,18 +194,10 @@ t_as_path_converting(void) } #endif -void resource_sys_init(void); -void io_init(void); - int main(int argc, char *argv[]) { bt_init(argc, argv); - resource_sys_init(); - resource_init(); - the_bird_lock(); - birdloop_init(); - io_init(); bt_test_suite(t_as_path_match, "Testing AS path matching and some a-path utilities."); bt_test_suite(t_path_format, "Testing formating as path into byte buffer"); diff --git a/nest/a-set.c b/lib/a-set.c index 71fbac94..dcb86058 100644 --- a/nest/a-set.c +++ b/lib/a-set.c @@ -10,8 +10,8 @@ #include <stdlib.h> #include "nest/bird.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "lib/resource.h" #include "lib/string.h" @@ -693,3 +693,51 @@ lc_set_max(const struct adata *list, lcomm *val) *val = (lcomm) { res[0], res[1], res[2] }; return 1; } + +int +int_set_walk(const struct adata *list, uint *pos, uint *val) +{ + if (!list) + return 0; + + if (*pos >= (uint) int_set_get_size(list)) + return 0; + + u32 *res = int_set_get_data(list) + *pos; + *val = *res; + *pos += 1; + + return 1; +} + +int +ec_set_walk(const struct adata *list, uint *pos, u64 *val) +{ + if (!list) + return 0; + + if (*pos >= (uint) int_set_get_size(list)) + return 0; + + u32 *res = int_set_get_data(list) + *pos; + *val = ec_generic(res[0], res[1]); + *pos += 2; + + return 1; +} + +int +lc_set_walk(const struct adata *list, uint *pos, lcomm *val) +{ + if (!list) + return 0; + + if (*pos >= (uint) int_set_get_size(list)) + return 0; + + u32 *res = int_set_get_data(list) + *pos; + *val = (lcomm) { res[0], res[1], res[2] }; + *pos += 3; + + return 1; +} diff --git a/nest/a-set_test.c b/lib/a-set_test.c index f8f6e781..79b5cf9a 100644 --- a/nest/a-set_test.c +++ b/lib/a-set_test.c @@ -10,10 +10,9 @@ #include "test/bt-utils.h" #include "lib/net.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "lib/resource.h" -#include "lib/io-loop.h" #define SET_SIZE 10 static const struct adata *set_sequence; /* <0; SET_SIZE) */ @@ -21,13 +20,11 @@ static const struct adata *set_sequence_same; /* <0; SET_SIZE) */ static const struct adata *set_sequence_higher; /* <SET_SIZE; 2*SET_SIZE) */ static const struct adata *set_random; -#define BUFFER_SIZE 1000 -static byte buf[BUFFER_SIZE] = {}; +#define T_BUFFER_SIZE 1000 +static byte buf[T_BUFFER_SIZE] = {}; #define SET_SIZE_FOR_FORMAT_OUTPUT 10 -struct linpool *lp; - enum set_type { SET_TYPE_INT, @@ -39,24 +36,23 @@ generate_set_sequence(enum set_type type, int len) { struct adata empty_as_path = {}; set_sequence = set_sequence_same = set_sequence_higher = set_random = &empty_as_path; - lp = lp_new_default(&root_pool); int i; for (i = 0; i < len; i++) { if (type == SET_TYPE_INT) { - set_sequence = int_set_add(lp, set_sequence, i); - set_sequence_same = int_set_add(lp, set_sequence_same, i); - set_sequence_higher = int_set_add(lp, set_sequence_higher, i + SET_SIZE); - set_random = int_set_add(lp, set_random, bt_random()); + set_sequence = int_set_add(tmp_linpool, set_sequence, i); + set_sequence_same = int_set_add(tmp_linpool, set_sequence_same, i); + set_sequence_higher = int_set_add(tmp_linpool, set_sequence_higher, i + SET_SIZE); + set_random = int_set_add(tmp_linpool, set_random, bt_random()); } else if (type == SET_TYPE_EC) { - set_sequence = ec_set_add(lp, set_sequence, i); - set_sequence_same = ec_set_add(lp, set_sequence_same, i); - set_sequence_higher = ec_set_add(lp, set_sequence_higher, i + SET_SIZE); - set_random = ec_set_add(lp, set_random, (bt_random() << 32 | bt_random())); + set_sequence = ec_set_add(tmp_linpool, set_sequence, i); + set_sequence_same = ec_set_add(tmp_linpool, set_sequence_same, i); + set_sequence_higher = ec_set_add(tmp_linpool, set_sequence_higher, i + SET_SIZE); + set_random = ec_set_add(tmp_linpool, set_random, (bt_random() << 32 | bt_random())); } else bt_abort_msg("This should be unreachable"); @@ -85,7 +81,6 @@ t_set_int_contains(void) for (i = 0; i < SET_SIZE; i++) bt_assert_msg(data[i] == i, "(data[i] = %d) == i = %d)", data[i], i); - rfree(lp); return 1; } @@ -95,15 +90,14 @@ t_set_int_union(void) generate_set_sequence(SET_TYPE_INT, SET_SIZE); const struct adata *set_union; - set_union = int_set_union(lp, set_sequence, set_sequence_same); + set_union = int_set_union(tmp_linpool, set_sequence, set_sequence_same); bt_assert(int_set_get_size(set_union) == SET_SIZE); - bt_assert(int_set_format(set_union, 0, 2, buf, BUFFER_SIZE) == 0); + bt_assert(int_set_format(set_union, 0, 2, buf, T_BUFFER_SIZE) == 0); - set_union = int_set_union(lp, set_sequence, set_sequence_higher); + set_union = int_set_union(tmp_linpool, set_sequence, set_sequence_higher); bt_assert_msg(int_set_get_size(set_union) == SET_SIZE*2, "int_set_get_size(set_union) %d, SET_SIZE*2 %d", int_set_get_size(set_union), SET_SIZE*2); - bt_assert(int_set_format(set_union, 0, 2, buf, BUFFER_SIZE) == 0); + bt_assert(int_set_format(set_union, 0, 2, buf, T_BUFFER_SIZE) == 0); - rfree(lp); return 1; } @@ -112,18 +106,17 @@ t_set_int_format(void) { generate_set_sequence(SET_TYPE_INT, SET_SIZE_FOR_FORMAT_OUTPUT); - bt_assert(int_set_format(set_sequence, 0, 0, buf, BUFFER_SIZE) == 0); + bt_assert(int_set_format(set_sequence, 0, 0, buf, T_BUFFER_SIZE) == 0); bt_assert(strcmp(buf, "0.0.0.0 0.0.0.1 0.0.0.2 0.0.0.3 0.0.0.4 0.0.0.5 0.0.0.6 0.0.0.7 0.0.0.8 0.0.0.9") == 0); - bzero(buf, BUFFER_SIZE); - bt_assert(int_set_format(set_sequence, 0, 2, buf, BUFFER_SIZE) == 0); + bzero(buf, T_BUFFER_SIZE); + bt_assert(int_set_format(set_sequence, 0, 2, buf, T_BUFFER_SIZE) == 0); bt_assert(strcmp(buf, "0.0.0.2 0.0.0.3 0.0.0.4 0.0.0.5 0.0.0.6 0.0.0.7 0.0.0.8 0.0.0.9") == 0); - bzero(buf, BUFFER_SIZE); - bt_assert(int_set_format(set_sequence, 1, 0, buf, BUFFER_SIZE) == 0); + bzero(buf, T_BUFFER_SIZE); + bt_assert(int_set_format(set_sequence, 1, 0, buf, T_BUFFER_SIZE) == 0); bt_assert(strcmp(buf, "(0,0) (0,1) (0,2) (0,3) (0,4) (0,5) (0,6) (0,7) (0,8) (0,9)") == 0); - rfree(lp); return 1; } @@ -136,7 +129,7 @@ t_set_int_delete(void) u32 i; for (i = 0; i < SET_SIZE; i++) { - deleting_sequence = int_set_del(lp, deleting_sequence, i); + deleting_sequence = int_set_del(tmp_linpool, deleting_sequence, i); bt_assert_msg(int_set_get_size(deleting_sequence) == (int) (SET_SIZE-1-i), "int_set_get_size(deleting_sequence) %d == SET_SIZE-1-i %d", int_set_get_size(deleting_sequence), @@ -170,7 +163,6 @@ t_set_ec_contains(void) // for (i = 0; i < SET_SIZE; i++) // bt_assert_msg(data[i] == (SET_SIZE-1-i), "(data[i] = %d) == ((SET_SIZE-1-i) = %d)", data[i], SET_SIZE-1-i); - rfree(lp); return 1; } @@ -180,15 +172,14 @@ t_set_ec_union(void) generate_set_sequence(SET_TYPE_EC, SET_SIZE); const struct adata *set_union; - set_union = ec_set_union(lp, set_sequence, set_sequence_same); + set_union = ec_set_union(tmp_linpool, set_sequence, set_sequence_same); bt_assert(ec_set_get_size(set_union) == SET_SIZE); - bt_assert(ec_set_format(set_union, 0, buf, BUFFER_SIZE) == 0); + bt_assert(ec_set_format(set_union, 0, buf, T_BUFFER_SIZE) == 0); - set_union = ec_set_union(lp, set_sequence, set_sequence_higher); + set_union = ec_set_union(tmp_linpool, set_sequence, set_sequence_higher); bt_assert_msg(ec_set_get_size(set_union) == SET_SIZE*2, "ec_set_get_size(set_union) %d, SET_SIZE*2 %d", ec_set_get_size(set_union), SET_SIZE*2); - bt_assert(ec_set_format(set_union, 0, buf, BUFFER_SIZE) == 0); + bt_assert(ec_set_format(set_union, 0, buf, T_BUFFER_SIZE) == 0); - rfree(lp); return 1; } @@ -197,18 +188,16 @@ t_set_ec_format(void) { const struct adata empty_as_path = {}; set_sequence = set_sequence_same = set_sequence_higher = set_random = &empty_as_path; - lp = lp_new_default(&root_pool); u64 i = 0; - set_sequence = ec_set_add(lp, set_sequence, i); + set_sequence = ec_set_add(tmp_linpool, set_sequence, i); for (i = 1; i < SET_SIZE_FOR_FORMAT_OUTPUT; i++) - set_sequence = ec_set_add(lp, set_sequence, i + ((i%2) ? ((u64)EC_RO << 48) : ((u64)EC_RT << 48))); + set_sequence = ec_set_add(tmp_linpool, set_sequence, i + ((i%2) ? ((u64)EC_RO << 48) : ((u64)EC_RT << 48))); - bt_assert(ec_set_format(set_sequence, 0, buf, BUFFER_SIZE) == 0); + bt_assert(ec_set_format(set_sequence, 0, buf, T_BUFFER_SIZE) == 0); bt_assert_msg(strcmp(buf, "(unknown 0x0, 0, 0) (ro, 0, 1) (rt, 0, 2) (ro, 0, 3) (rt, 0, 4) (ro, 0, 5) (rt, 0, 6) (ro, 0, 7) (rt, 0, 8) (ro, 0, 9)") == 0, "ec_set_format() returns '%s'", buf); - rfree(lp); return 1; } @@ -221,7 +210,7 @@ t_set_ec_delete(void) u32 i; for (i = 0; i < SET_SIZE; i++) { - deleting_sequence = ec_set_del(lp, deleting_sequence, i); + deleting_sequence = ec_set_del(tmp_linpool, deleting_sequence, i); bt_assert_msg(ec_set_get_size(deleting_sequence) == (int) (SET_SIZE-1-i), "ec_set_get_size(deleting_sequence) %d == SET_SIZE-1-i %d", ec_set_get_size(deleting_sequence), SET_SIZE-1-i); @@ -233,16 +222,10 @@ t_set_ec_delete(void) } -void resource_sys_init(void); - int main(int argc, char *argv[]) { bt_init(argc, argv); - resource_sys_init(); - resource_init(); - the_bird_lock(); - birdloop_init(); bt_test_suite(t_set_int_contains, "Testing sets of integers: contains, get_data"); bt_test_suite(t_set_int_format, "Testing sets of integers: format"); diff --git a/nest/attrs.h b/lib/attrs.h index ef2b95e6..b4dd2e83 100644 --- a/nest/attrs.h +++ b/lib/attrs.h @@ -11,7 +11,39 @@ #include <stdint.h> #include "lib/unaligned.h" -#include "nest/route.h" + +typedef struct adata { + uint length; /* Length of data */ + byte data[0]; +} adata; + +#define ADATA_SIZE(s) BIRD_CPU_ALIGN(sizeof(struct adata) + s) + +extern const adata null_adata; /* adata of length 0 */ + +static inline struct adata * +lp_alloc_adata(struct linpool *pool, uint len) +{ + struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len); + ad->length = len; + return ad; +} + +static inline struct adata * +lp_store_adata(struct linpool *pool, const void *buf, uint len) +{ + struct adata *ad = lp_alloc_adata(pool, len); + memcpy(ad->data, buf, len); + return ad; +} + +#define tmp_alloc_adata(len) lp_alloc_adata(tmp_linpool, len) +#define tmp_store_adata(buf, len) lp_store_adata(tmp_linpool, buf, len) +#define tmp_copy_adata(ad) tmp_store_adata((ad)->data, (ad)->length) + +static inline int adata_same(const struct adata *a, const struct adata *b) +{ return (!a && !b) || (a->length == b->length && !memcmp(a->data, b->data, a->length)); } + /* a-path.c */ @@ -28,6 +60,7 @@ * to 16bit slot (like in 16bit AS_PATH). See RFC 4893 for details */ +struct f_val; struct f_tree; int as_path_valid(byte *data, uint len, int bs, int sets, int confed, char *err, uint elen); @@ -49,7 +82,8 @@ int as_path_get_last(const struct adata *path, u32 *last_as); u32 as_path_get_last_nonaggregated(const struct adata *path); int as_path_contains(const struct adata *path, u32 as, int min); int as_path_match_set(const struct adata *path, const struct f_tree *set); -const struct adata *as_path_filter(struct linpool *pool, const struct adata *path, const struct f_tree *set, u32 key, int pos); +const struct adata *as_path_filter(struct linpool *pool, const struct adata *path, const struct f_val *set, int pos); +int as_path_walk(const struct adata *path, uint *pos, uint *val); static inline struct adata *as_path_prepend(struct linpool *pool, const struct adata *path, u32 as) { return as_path_prepend2(pool, path, AS_PATH_SEQUENCE, as); } @@ -136,6 +170,11 @@ static inline const char *ec_subtype_str(const enum ec_subtype ecs) { } } +/* Check for EC_RT subtype within different types (0-2) */ +static inline int ec_type_is_rt(uint type) +{ return (type == EC_RT) || (type == (0x0100 | EC_RT)) || (type == (0x0200 | EC_RT)); } + + /* Transitive bit (for first u32 half of EC) */ #define EC_TBIT 0x40000000 @@ -155,9 +194,13 @@ static inline u32 *int_set_get_data(const struct adata *list) static inline u32 ec_hi(u64 ec) { return ec >> 32; } static inline u32 ec_lo(u64 ec) { return ec; } + static inline u64 ec_get(const u32 *l, int i) { return (((u64) l[i]) << 32) | l[i+1]; } +static inline void ec_put(u32 *l, int i, u64 val) +{ l[i] = ec_hi(val); l[i+1] = ec_lo(val); } + /* RFC 4360 3.1. Two-Octet AS Specific Extended Community */ static inline u64 ec_as2(enum ec_subtype kind, u64 key, u64 val) { return (((u64) kind | 0x0000) << 48) | (key << 32) | val; } @@ -224,6 +267,9 @@ int lc_set_min(const struct adata *list, lcomm *val); int int_set_max(const struct adata *list, u32 *val); int ec_set_max(const struct adata *list, u64 *val); int lc_set_max(const struct adata *list, lcomm *val); +int int_set_walk(const struct adata *list, uint *pos, u32 *val); +int ec_set_walk(const struct adata *list, uint *pos, u64 *val); +int lc_set_walk(const struct adata *list, uint *pos, lcomm *val); void ec_set_sort_x(struct adata *set); /* Sort in place */ diff --git a/lib/birdlib.h b/lib/birdlib.h index 385bf75c..3f65b8c2 100644 --- a/lib/birdlib.h +++ b/lib/birdlib.h @@ -9,17 +9,39 @@ #ifndef _BIRD_BIRDLIB_H_ #define _BIRD_BIRDLIB_H_ +#include <stddef.h> + #include "sysdep/config.h" #include "lib/alloca.h" /* Ugly structure offset handling macros */ -struct align_probe { char x; long int y; }; +#define SAME_TYPE(a, b) ({ int _ = ((a) != (b)); !_; }) +#define TYPE_CAST(from, to, what) ( SAME_TYPE(((from) NULL), (what)), ((to) (what))) +#ifdef offsetof +#define OFFSETOF offsetof +#else #define OFFSETOF(s, i) ((size_t) &((s *)0)->i) -#define SKIP_BACK(s, i, p) ({ s *_ptr = ((s *)((char *)p - OFFSETOF(s, i))); ASSERT_DIE(&_ptr->i == p); _ptr; }) +#endif + +#define SKIP_BACK(s, i, p) ({ s *_ptr = ((s *)((char *)p - OFFSETOF(s, i))); SAME_TYPE(&_ptr->i, p); _ptr; }) #define BIRD_ALIGN(s, a) (((s)+a-1)&~(a-1)) -#define CPU_STRUCT_ALIGN (sizeof(struct align_probe)) +#define CPU_STRUCT_ALIGN (MAX_(_Alignof(void*), _Alignof(u64))) +#define BIRD_CPU_ALIGN(s) BIRD_ALIGN((s), CPU_STRUCT_ALIGN) + +/* Structure item alignment macros */ + +#define PADDING_NAME(id) _padding_##id +#define PADDING_(id, sz) u8 PADDING_NAME(id)[sz] + +#if CPU_POINTER_ALIGNMENT == 4 +#define PADDING(id, n32, n64) PADDING_(id, n32) +#elif CPU_POINTER_ALIGNMENT == 8 +#define PADDING(id, n32, n64) PADDING_(id, n64) +#else +#error "Strange CPU pointer alignment: " CPU_POINTER_ALIGNMENT +#endif /* Utility macros */ @@ -33,6 +55,9 @@ struct align_probe { char x; long int y; }; #define MAX(a,b) MAX_(a,b) #endif +#define ROUND_DOWN_POW2(a,b) ((a) & ~((b)-1)) +#define ROUND_UP_POW2(a,b) (((a)+((b)-1)) & ~((b)-1)) + #define U64(c) UINT64_C(c) #define ABS(a) ((a)>=0 ? (a) : -(a)) #define DELTA(a,b) (((a)>=(b))?(a)-(b):(b)-(a)) @@ -90,6 +115,7 @@ typedef s64 btime; #define TO_S /1000000 #define TO_MS /1000 #define TO_US /1 +#define TO_NS * (btime) 1000 #ifndef PARSER #define S S_ @@ -155,13 +181,19 @@ void bug(const char *msg, ...) NORET; #define L_BUG "\011" /* BIRD bugs */ void debug(const char *msg, ...); /* Printf to debug output */ +void debug_safe(const char *msg); /* Printf to debug output, async-safe */ /* Debugging */ #if defined(LOCAL_DEBUG) || defined(GLOBAL_DEBUG) #define DBG(x, y...) debug(x, ##y) +#define DBGL(x, y...) debug(x "\n", ##y) +#elif defined(DEBUG_TO_LOG) +#define DBG(...) do { } while (0) +#define DBGL(...) log(L_DEBUG __VA_ARGS__) #else -#define DBG(x, y...) do { } while(0) +#define DBG(...) do { } while(0) +#define DBGL(...) do { } while (0) #endif #define ASSERT_DIE(x) do { if (!(x)) bug("Assertion '%s' failed at %s:%d", #x, __FILE__, __LINE__); } while(0) diff --git a/lib/bitmap_test.c b/lib/bitmap_test.c index 0595a4d0..07860c94 100644 --- a/lib/bitmap_test.c +++ b/lib/bitmap_test.c @@ -24,7 +24,6 @@ t_bmap_set_clear_random(void) { struct bmap b; - resource_init(); bmap_init(&b, &root_pool, 1024); char expected[MAX_NUM] = {}; @@ -60,7 +59,6 @@ t_hmap_set_clear_random(void) { struct hmap b; - resource_init(); hmap_init(&b, &root_pool, 1024); char expected[MAX_NUM] = {}; @@ -119,7 +117,6 @@ t_hmap_set_clear_fill(void) { struct hmap b; - resource_init(); hmap_init(&b, &root_pool, 1024); char expected[MAX_NUM] = {}; diff --git a/lib/buffer_test.c b/lib/buffer_test.c index 5b7de330..0629e901 100644 --- a/lib/buffer_test.c +++ b/lib/buffer_test.c @@ -41,7 +41,6 @@ fill_expected_array(void) static void init_buffer(void) { - resource_init(); buffer_pool = &root_pool; BUFFER_INIT(buf, buffer_pool, MAX_NUM); } diff --git a/lib/coro.h b/lib/coro.h deleted file mode 100644 index b36f1d2c..00000000 --- a/lib/coro.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * BIRD Coroutines - * - * (c) 2017 Martin Mares <mj@ucw.cz> - * (c) 2020-2021 Maria Matejka <mq@jmq.cz> - * - * Can be freely distributed and used under the terms of the GNU GPL. - */ - -#ifndef _BIRD_CORO_H_ -#define _BIRD_CORO_H_ - -#include "lib/resource.h" - -/* A completely opaque coroutine handle. */ -struct coroutine; - -/* Coroutines are independent threads bound to pools. - * You request a coroutine by calling coro_run(). - * It is forbidden to free a running coroutine from outside. - * The running coroutine must free itself by rfree() before returning. - */ -struct coroutine *coro_run(pool *, void (*entry)(void *), void *data); - -/* Get self. */ -extern _Thread_local struct coroutine *this_coro; - -/* Just wait for a little while. Not intended for general use; use events if possible. */ -void coro_yield(void); - -#endif diff --git a/lib/event.c b/lib/event.c index 10f83c28..68888a4c 100644 --- a/lib/event.c +++ b/lib/event.c @@ -23,38 +23,159 @@ #include "nest/bird.h" #include "lib/event.h" -#include "lib/locking.h" #include "lib/io-loop.h" -extern _Thread_local struct coroutine *this_coro; - event_list global_event_list; event_list global_work_list; +//#ifdef DEBUGGING +#if 0 +#define EDL_MAX 16384 +enum edl_caller { + EDL_REMOVE_FROM = 1, + EDL_POSTPONE = 2, + EDL_RUN = 3, + EDL_SEND = 4, + EDL_RUN_LIST = 5, +} caller; +static struct event_debug_log { + event_list *target_list; + event *event; + event *receiver; + uint pos; + uint prev_edl_pos; + uint thread; + enum edl_caller caller; +} edl[EDL_MAX]; +static _Atomic uint edl_cnt; +_Thread_local static uint edl_thread; +_Thread_local static uint prev_edl_pos = ~0; +static inline void edlog(event_list *list, event *e, event *receiver, uint pos, enum edl_caller caller) +{ + uint edl_pos = atomic_fetch_add_explicit(&edl_cnt, 1, memory_order_acq_rel); + if (!edl_thread) + edl_thread = edl_pos; + + edl[edl_pos % EDL_MAX] = (struct event_debug_log) { + .target_list = list, + .event = e, + .receiver = receiver, + .pos = pos, + .prev_edl_pos = prev_edl_pos, + .thread = edl_thread, + .caller = caller, + }; + + prev_edl_pos = edl_pos; +} +#else +#define edlog(...) +#endif + + +void +ev_init_list(event_list *el, struct birdloop *loop, const char *name) +{ + el->name = name; + el->loop = loop; + + atomic_store_explicit(&el->receiver, NULL, memory_order_release); + atomic_store_explicit(&el->_executor, NULL, memory_order_release); +} + +/* + * The event list should work as a message passing point. Sending a message + * must be a fairly fast process with no locks and low waiting times. OTOH, + * processing messages always involves running the assigned code and the + * receiver is always a single one thread with no concurrency at all. There is + * also a postponing requirement to synchronously remove an event from a queue, + * yet we allow this only when the caller has its receiver event loop locked. + * It still means that the event may get postponed from other event in the same + * list, therefore we have to be careful. + */ + +static inline int +ev_remove_from(event *e, event * _Atomic * head) +{ + /* The head pointer stores where cur is pointed to from */ + event * _Atomic *prev = head; + + /* The current event in queue to check */ + event *cur = atomic_load_explicit(prev, memory_order_acquire); + + /* This part of queue is empty! */ + if (!cur) + return 0; + + edlog(NULL, e, cur, 1, EDL_REMOVE_FROM); + while (cur) + { + /* Pre-loaded next pointer */ + event *next = atomic_load_explicit(&cur->next, memory_order_acquire); + + if (e == cur) + { + edlog(NULL, e, next, 3, EDL_REMOVE_FROM); + + /* Check whether we have collided with somebody else + * adding an item to the queue. */ + if (!atomic_compare_exchange_strong_explicit( + prev, &cur, next, + memory_order_acq_rel, memory_order_acquire)) + { + /* This may happen only on list head */ + ASSERT_DIE(prev == head); + + /* Restart. The collision should never happen again. */ + return ev_remove_from(e, head); + } + + /* Successfully removed from the list; inactivate this event. */ + atomic_store_explicit(&cur->next, NULL, memory_order_release); + return 1; + } + + edlog(NULL, e, next, 2, EDL_REMOVE_FROM); + + /* Go to the next event. */ + prev = &cur->next; + cur = next; + } + + edlog(NULL, e, cur, 4, EDL_REMOVE_FROM); + + return 0; +} + inline void ev_postpone(event *e) { - event_list *el = e->list; - if (!el) + /* Find the list to remove the event from */ + event_list *sl = ev_get_list(e); + edlog(sl, e, NULL, 1, EDL_POSTPONE); + if (!sl) return; - ASSERT_DIE(birdloop_inside(el->loop)); + /* Postponing allowed only from the target loop */ + ASSERT_DIE(birdloop_inside(sl->loop)); + + /* Remove from one of these lists. */ + ASSERT(ev_remove_from(e, &sl->_executor) || ev_remove_from(e, &sl->receiver)); - LOCK_DOMAIN(event, el->lock); - if (ev_active(e)) - rem_node(&e->n); - UNLOCK_DOMAIN(event, el->lock); + /* Mark as inactive */ + ASSERT_DIE(sl == atomic_exchange_explicit(&e->list, NULL, memory_order_acq_rel)); + edlog(sl, e, NULL, 2, EDL_POSTPONE); } static void -ev_dump(resource *r) +ev_dump(resource *r, unsigned indent UNUSED) { event *e = (event *) r; debug("(code %p, data %p, %s)\n", e->hook, e->data, - e->n.next ? "scheduled" : "inactive"); + atomic_load_explicit(&e->next, memory_order_relaxed) ? "scheduled" : "inactive"); } static struct resclass ev_class = { @@ -93,8 +214,10 @@ ev_new(pool *p) inline void ev_run(event *e) { + edlog(NULL, e, NULL, 1, EDL_RUN); ev_postpone(e); e->hook(e->data); + edlog(NULL, e, NULL, 2, EDL_RUN); } /** @@ -108,48 +231,37 @@ ev_run(event *e) inline void ev_send(event_list *l, event *e) { - DBG("ev_send(%p, %p)\n", l, e); - ASSERT_DIE(e->hook); - ASSERT_DIE(!e->list || (e->list == l) || (e->list->loop == l->loop)); - - e->list = l; - - struct event_cork *ec = e->cork; - - uint ping = 0; - - if (ec) - { - LOCK_DOMAIN(cork, ec->lock); - LOCK_DOMAIN(event, l->lock); - - if (!enlisted(&e->n)) - if (ec->count) - add_tail(&ec->events, &e->n); - else - { - add_tail(&l->events, &e->n); - ping = 1; - } - - UNLOCK_DOMAIN(event, l->lock); - UNLOCK_DOMAIN(cork, ec->lock); - } - else - { - LOCK_DOMAIN(event, l->lock); - - if (!enlisted(&e->n)) + edlog(l, e, NULL, 1, EDL_SEND); + /* Set the target list */ + event_list *ol = NULL; + if (!atomic_compare_exchange_strong_explicit( + &e->list, &ol, l, + memory_order_acq_rel, memory_order_acquire)) + if (ol == l) + return; + else + bug("Queuing an already queued event to another queue is not supported."); + + /* Here should be no concurrent senders */ + event *next = atomic_load_explicit(&l->receiver, memory_order_acquire); + edlog(l, e, next, 2, EDL_SEND); + event *old_next = NULL; + do + if (!atomic_compare_exchange_strong_explicit( + &e->next, &old_next, next, + memory_order_acq_rel, memory_order_acquire)) + bug("Event %p in inconsistent state"); + else { - add_tail(&l->events, &e->n); - ping = 1; + old_next = next; + edlog(l, old_next, next, 3, EDL_SEND); } + while (!atomic_compare_exchange_strong_explicit( + &l->receiver, &next, e, + memory_order_acq_rel, memory_order_acquire)); - UNLOCK_DOMAIN(event, l->lock); - } - - if (ping) - birdloop_ping(l->loop); + edlog(l, e, next, 4, EDL_SEND); + if (l->loop) birdloop_ping(l->loop); } void io_log_event(void *hook, void *data); @@ -161,135 +273,66 @@ void io_log_event(void *hook, void *data); * This function calls ev_run() for all events enqueued in the list @l. */ int -ev_run_list(event_list *l) -{ - const _Bool legacy = LEGACY_EVENT_LIST(l); - - if (legacy) - ASSERT_THE_BIRD_LOCKED; - - node *n; - - list tmp_list; - init_list(&tmp_list); - - /* Move the event list contents to a local list to avoid executing repeatedly added events */ - LOCK_DOMAIN(event, l->lock); - add_tail_list(&tmp_list, &l->events); - init_list(&l->events); - UNLOCK_DOMAIN(event, l->lock); - - WALK_LIST_FIRST(n, tmp_list) - { - event *e = SKIP_BACK(event, n, n); - ASSERT_DIE(n->next->prev == n); - - if (legacy) - { - /* The legacy way of event execution */ - io_log_event(e->hook, e->data); - ev_postpone(e); - e->hook(e->data); - } - else - { - // io_log_event(e->hook, e->data); /* TODO: add support for event logging in other io loops */ - ASSERT_DIE(e->list == l); - LOCK_DOMAIN(event, l->lock); - rem_node(&e->n); - UNLOCK_DOMAIN(event, l->lock); - e->hook(e->data); - } - } - - LOCK_DOMAIN(event, l->lock); - int repeat = ! EMPTY_LIST(l->events); - UNLOCK_DOMAIN(event, l->lock); - return repeat; -} - -int ev_run_list_limited(event_list *l, uint limit) { - ASSERT_DIE(LEGACY_EVENT_LIST(l)); - ASSERT_THE_BIRD_LOCKED; + event * _Atomic *ep = &l->_executor; + edlog(l, NULL, NULL, 1, EDL_RUN_LIST); - node *n; - list tmp_list; - - LOCK_DOMAIN(event, l->lock); - init_list(&tmp_list); - add_tail_list(&tmp_list, &l->events); - init_list(&l->events); - UNLOCK_DOMAIN(event, l->lock); - - WALK_LIST_FIRST(n, tmp_list) - { - event *e = SKIP_BACK(event, n, n); - - if (!limit) - break; - - io_log_event(e->hook, e->data); - - ev_run(e); - limit--; - } - - LOCK_DOMAIN(event, l->lock); - if (!EMPTY_LIST(tmp_list)) + /* No pending events, refill the queue. */ + if (!atomic_load_explicit(ep, memory_order_acquire)) { - /* Attach new items after the unprocessed old items */ - add_tail_list(&tmp_list, &l->events); - init_list(&l->events); - add_tail_list(&l->events, &tmp_list); - } + /* Move the current event list aside and create a new one. */ + event *received = atomic_exchange_explicit(&l->receiver, NULL, memory_order_acq_rel); + edlog(l, NULL, received, 2, EDL_RUN_LIST); - int repeat = ! EMPTY_LIST(l->events); - UNLOCK_DOMAIN(event, l->lock); + /* No event to run. */ + if (!received) + return 0; - return repeat; -} + /* Setup the executor queue */ + event *head = NULL; -void ev_cork(struct event_cork *ec) -{ - LOCK_DOMAIN(cork, ec->lock); - ec->count++; - UNLOCK_DOMAIN(cork, ec->lock); -} - -void ev_uncork(struct event_cork *ec) -{ - LOCK_DOMAIN(cork, ec->lock); + /* Flip the order of the events by relinking them one by one (push-pop) */ + while (received) + { + event *cur = received; + received = atomic_exchange_explicit(&cur->next, head, memory_order_acq_rel); + edlog(l, head, received, 3, EDL_RUN_LIST); + head = cur; + } - if (--ec->count) - { - UNLOCK_DOMAIN(cork, ec->lock); - return; + /* Store the executor queue to its designated place */ + ASSERT_DIE(atomic_exchange_explicit(ep, head, memory_order_acq_rel) == NULL); + edlog(l, NULL, head, 4, EDL_RUN_LIST); } - node *n; - WALK_LIST_FIRST(n, ec->events) + /* Run the events in order. */ + event *e; + while (e = atomic_load_explicit(ep, memory_order_acquire)) { - event *e = SKIP_BACK(event, n, n); - event_list *el = e->list; + edlog(l, e, NULL, 5, EDL_RUN_LIST); + /* Check limit */ + if (!--limit) + return 1; - rem_node(&e->n); + /* This is ugly hack, we want to log just events executed from the main I/O loop */ + if ((l == &global_event_list) || (l == &global_work_list)) + io_log_event(e->hook, e->data); - LOCK_DOMAIN(event, el->lock); - add_tail(&el->events, &e->n); - UNLOCK_DOMAIN(event, el->lock); + edlog(l, e, NULL, 6, EDL_RUN_LIST); + /* Inactivate the event */ + event *next = atomic_load_explicit(&e->next, memory_order_relaxed); + ASSERT_DIE(e == atomic_exchange_explicit(ep, next, memory_order_acq_rel)); + ASSERT_DIE(next == atomic_exchange_explicit(&e->next, NULL, memory_order_acq_rel)); + ASSERT_DIE(l == atomic_exchange_explicit(&e->list, NULL, memory_order_acq_rel)); + edlog(l, e, next, 7, EDL_RUN_LIST); - birdloop_ping(el->loop); - } + /* Run the event */ + e->hook(e->data); + tmp_flush(); - struct birdsock *sk; - WALK_LIST_FIRST2(sk, cork_node, ec->sockets) - { -// log(L_TRACE "Socket %p uncorked", sk); - rem_node(&sk->cork_node); - sk_ping(sk); + edlog(l, e, next, 8, EDL_RUN_LIST); } - UNLOCK_DOMAIN(cork, ec->lock); + return !!atomic_load_explicit(&l->receiver, memory_order_acquire); } diff --git a/lib/event.h b/lib/event.h index cbabfc66..6fd9f31c 100644 --- a/lib/event.h +++ b/lib/event.h @@ -14,91 +14,52 @@ #include <stdatomic.h> -DEFINE_DOMAIN(event); -DEFINE_DOMAIN(cork); +struct birdloop; typedef struct event { resource r; void (*hook)(void *); void *data; - node n; /* Internal link */ - struct event_list *list; /* List where this event is put in */ - struct event_cork *cork; /* Event execution limiter */ - node cork_node; + struct event * _Atomic next; + struct event_list * _Atomic list; } event; typedef struct event_list { - list events; - pool *pool; - struct birdloop *loop; - DOMAIN(event) lock; + event * _Atomic receiver; /* Event receive list */ + event * _Atomic _executor; /* Event execute list */ + const char *name; + struct birdloop *loop; /* The executor loop */ } event_list; -struct event_cork { - DOMAIN(cork) lock; - u32 count; - list events; - list sockets; -}; - extern event_list global_event_list; extern event_list global_work_list; event *ev_new(pool *); void ev_run(event *); - -static inline void ev_init_list(event_list *el, struct birdloop *loop, const char *name) -{ - init_list(&el->events); - el->loop = loop; - el->lock = DOMAIN_NEW(event, name); -} - -static inline void ev_init_cork(struct event_cork *ec, const char *name) -{ - init_list(&ec->events); - init_list(&ec->sockets); - ec->lock = DOMAIN_NEW(cork, name); - ec->count = 0; -}; - -void ev_send(event_list *, event *); -#define ev_send_self(e) ({ ASSERT_DIE((e)->list); ev_send((e)->list, (e)); }) +void ev_init_list(event_list *, struct birdloop *loop, const char *name); +void ev_enqueue(event_list *, event *); +#define ev_send ev_enqueue #define ev_send_loop(l, e) ev_send(birdloop_event_list((l)), (e)) #define ev_schedule(e) ({ ASSERT_THE_BIRD_LOCKED; if (!ev_active((e))) ev_send(&global_event_list, (e)); }) #define ev_schedule_work(e) ({ ASSERT_THE_BIRD_LOCKED; if (!ev_active((e))) ev_send(&global_work_list, (e)); }) void ev_postpone(event *); -int ev_run_list(event_list *); int ev_run_list_limited(event_list *, uint); +#define ev_run_list(l) ev_run_list_limited((l), ~0) #define LEGACY_EVENT_LIST(l) (((l) == &global_event_list) || ((l) == &global_work_list)) -void ev_cork(struct event_cork *); -void ev_uncork(struct event_cork *); - -static inline u32 ev_corked(struct event_cork *ec) -{ - if (!ec) - return 0; - - LOCK_DOMAIN(cork, ec->lock); - u32 out = ec->count; - UNLOCK_DOMAIN(cork, ec->lock); - return out; -} - -_Bool birdloop_inside(struct birdloop *loop); - static inline int ev_active(event *e) { - if (e->list == NULL) - return 0; + return atomic_load_explicit(&e->list, memory_order_acquire) != NULL; +} - ASSERT_DIE(birdloop_inside(e->list->loop)); - return enlisted(&e->n); +static inline event_list * +ev_get_list(event *e) +{ + return atomic_load_explicit(&e->list, memory_order_acquire); } static inline event* diff --git a/lib/event_test.c b/lib/event_test.c index f8bb303b..612deb25 100644 --- a/lib/event_test.c +++ b/lib/event_test.c @@ -15,7 +15,7 @@ #include "nest/locks.h" #include "sysdep/unix/unix.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #define MAX_NUM 4 @@ -48,19 +48,14 @@ init_event_check_points(void) event_check_points[i] = 0; } -void resource_sys_init(void); - static int t_ev_run_list(void) { int i; - resource_sys_init(); - resource_init(); - birdloop_init(); olock_init(); - io_init(); rt_init(); + io_init(); if_init(); // roa_init(); config_init(); @@ -85,9 +80,7 @@ main(int argc, char *argv[]) { bt_init(argc, argv); - the_bird_lock(); bt_test_suite(t_ev_run_list, "Schedule and run 3 events in right order."); - the_bird_unlock(); return bt_exit_value(); } diff --git a/lib/fib.h b/lib/fib.h new file mode 100644 index 00000000..bec2a8d4 --- /dev/null +++ b/lib/fib.h @@ -0,0 +1,119 @@ +/* + * BIRD Internet Routing Daemon -- Network prefix storage + * + * (c) 1998--2000 Martin Mares <mj@ucw.cz> + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LIB_FIB_H_ +#define _BIRD_LIB_FIB_H_ + +/* + * BIRD FIBs are generic data structure for storing network prefixes. + * Also used for the master routing table. Currently implemented as + * a hash table. + * + * Available operations: + * - insertion of new entry + * - deletion of entry + * - searching for entry by network prefix + * - asynchronous retrieval of fib contents + */ + +struct fib; + +struct fib_node { + struct fib_node *next; /* Next in hash chain */ + struct fib_iterator *readers; /* List of readers of this node */ + net_addr addr[0]; +}; + +struct fib_iterator { /* See lib/slists.h for an explanation */ + struct fib_iterator *prev, *next; /* Must be synced with struct fib_node! */ + byte efef; /* 0xff to distinguish between iterator and node */ + byte pad[3]; + struct fib_node *node; /* Or NULL if freshly merged */ + uint hash; +}; + +typedef void (*fib_init_fn)(struct fib *, void *); + +struct fib { + pool *fib_pool; /* Pool holding all our data */ + slab *fib_slab; /* Slab holding all fib nodes */ + struct fib_node **hash_table; /* Node hash table */ + uint hash_size; /* Number of hash table entries (a power of two) */ + uint hash_order; /* Binary logarithm of hash_size */ + uint hash_shift; /* 32 - hash_order */ + uint addr_type; /* Type of address data stored in fib (NET_*) */ + uint node_size; /* FIB node size, 0 for nonuniform */ + uint node_offset; /* Offset of fib_node struct inside of user data */ + uint entries; /* Number of entries */ + uint entries_min, entries_max; /* Entry count limits (else start rehashing) */ + fib_init_fn init; /* Constructor */ +}; + +static inline void * fib_node_to_user(struct fib *f, struct fib_node *e) +{ return e ? (void *) ((char *) e - f->node_offset) : NULL; } + +static inline struct fib_node * fib_user_to_node(struct fib *f, void *e) +{ return e ? (void *) ((char *) e + f->node_offset) : NULL; } + +void fib_init(struct fib *f, pool *p, uint addr_type, uint node_size, uint node_offset, uint hash_order, fib_init_fn init); +void *fib_find(struct fib *, const net_addr *); /* Find or return NULL if doesn't exist */ +void *fib_get_chain(struct fib *f, const net_addr *a); /* Find first node in linked list from hash table */ +void *fib_get(struct fib *, const net_addr *); /* Find or create new if nonexistent */ +void *fib_route(struct fib *, const net_addr *); /* Longest-match routing lookup */ +void fib_delete(struct fib *, void *); /* Remove fib entry */ +void fib_free(struct fib *); /* Destroy the fib */ +void fib_check(struct fib *); /* Consistency check for debugging */ + +void fit_init(struct fib_iterator *, struct fib *); /* Internal functions, don't call */ +struct fib_node *fit_get(struct fib *, struct fib_iterator *); +void fit_put(struct fib_iterator *, struct fib_node *); +void fit_put_next(struct fib *f, struct fib_iterator *i, struct fib_node *n, uint hpos); +void fit_put_end(struct fib_iterator *i); +void fit_copy(struct fib *f, struct fib_iterator *dst, struct fib_iterator *src); + + +#define FIB_WALK(fib, type, z) do { \ + struct fib_node *fn_, **ff_ = (fib)->hash_table; \ + uint count_ = (fib)->hash_size; \ + type *z; \ + while (count_--) \ + for (fn_ = *ff_++; z = fib_node_to_user(fib, fn_); fn_=fn_->next) + +#define FIB_WALK_END } while (0) + +#define FIB_ITERATE_INIT(it, fib) fit_init(it, fib) + +#define FIB_ITERATE_START(fib, it, type, z) do { \ + struct fib_node *fn_ = fit_get(fib, it); \ + uint count_ = (fib)->hash_size; \ + uint hpos_ = (it)->hash; \ + type *z; \ + for(;;) { \ + if (!fn_) \ + { \ + if (++hpos_ >= count_) \ + break; \ + fn_ = (fib)->hash_table[hpos_]; \ + continue; \ + } \ + z = fib_node_to_user(fib, fn_); + +#define FIB_ITERATE_END fn_ = fn_->next; } } while(0) + +#define FIB_ITERATE_PUT(it) fit_put(it, fn_) + +#define FIB_ITERATE_PUT_NEXT(it, fib) fit_put_next(fib, it, fn_, hpos_) + +#define FIB_ITERATE_PUT_END(it) fit_put_end(it) + +#define FIB_ITERATE_UNLINK(it, fib) fit_get(fib, it) + +#define FIB_ITERATE_COPY(dst, src, fib) fit_copy(fib, dst, src) + +#endif diff --git a/lib/flowspec_test.c b/lib/flowspec_test.c index 518b8fc7..03649b99 100644 --- a/lib/flowspec_test.c +++ b/lib/flowspec_test.c @@ -8,7 +8,6 @@ #include "test/birdtest.h" #include "lib/flowspec.h" -#include "lib/io-loop.h" #define NET_ADDR_FLOW4_(what,prefix,pxlen,data_) \ do \ @@ -448,7 +447,6 @@ static int t_builder4(void) { struct flow_builder *fb = flow_builder_init(&root_pool); - linpool *lp = lp_new_default(&root_pool); /* Expectation */ @@ -491,7 +489,7 @@ t_builder4(void) flow_builder_set_type(fb, FLOW_TYPE_TCP_FLAGS); flow_builder_add_op_val(fb, 0, 0x55); - net_addr_flow4 *res = flow_builder4_finalize(fb, lp); + net_addr_flow4 *res = flow_builder4_finalize(fb, tmp_linpool); bt_assert(memcmp(res, expect, expect->length) == 0); @@ -528,7 +526,6 @@ t_builder6(void) { net_addr_ip6 ip; - linpool *lp = lp_new_default(&root_pool); struct flow_builder *fb = flow_builder_init(&root_pool); fb->ipv6 = 1; @@ -572,7 +569,7 @@ t_builder6(void) flow_builder_set_type(fb, FLOW_TYPE_LABEL); flow_builder_add_op_val(fb, 0, 0x55); - net_addr_flow6 *res = flow_builder6_finalize(fb, lp); + net_addr_flow6 *res = flow_builder6_finalize(fb, tmp_linpool); bt_assert(memcmp(res, expect, expect->length) == 0); /* Reverse order */ @@ -599,7 +596,7 @@ t_builder6(void) flow_builder_set_type(fb, FLOW_TYPE_DST_PREFIX); flow_builder6_add_pfx(fb, &ip, 61); - res = flow_builder6_finalize(fb, lp); + res = flow_builder6_finalize(fb, tmp_linpool); bt_assert(memcmp(res, expect, expect->length) == 0); return 1; @@ -664,16 +661,10 @@ t_formatting6(void) return 1; } -void resource_sys_init(void); - int main(int argc, char *argv[]) { bt_init(argc, argv); - resource_sys_init(); - resource_init(); - the_bird_lock(); - birdloop_init(); bt_test_suite(t_read_length, "Testing get NLRI length"); bt_test_suite(t_write_length, "Testing set NLRI length"); @@ -689,6 +680,5 @@ main(int argc, char *argv[]) bt_test_suite(t_formatting4, "Formatting Flow Specification (IPv4) into text representation"); bt_test_suite(t_formatting6, "Formatting Flow Specification (IPv6) into text representation"); - the_bird_unlock(); return bt_exit_value(); } @@ -10,7 +10,7 @@ #ifndef _BIRD_HASH_H_ #define _BIRD_HASH_H_ -#define HASH(type) struct { type **data; uint count, order; } +#define HASH(type) struct { type **data; uint count; u16 iterators; u8 order; u8 down_requested:1; } #define HASH_TYPE(v) typeof(** (v).data) #define HASH_SIZE(v) (1U << (v).order) @@ -125,20 +125,26 @@ #define HASH_MAY_STEP_DOWN_(v,pool,rehash_fn,args) \ ({ \ - if (((v).count < (HASH_SIZE(v) REHASH_LO_MARK(args))) && \ - ((v).order > (REHASH_LO_BOUND(args)))) \ + if ((v).iterators) \ + (v).down_requested = 1; \ + else if (((v).count < (HASH_SIZE(v) REHASH_LO_MARK(args))) && \ + ((v).order > (REHASH_LO_BOUND(args)))) \ rehash_fn(&(v), pool, -(REHASH_LO_STEP(args))); \ }) #define HASH_MAY_RESIZE_DOWN_(v,pool,rehash_fn,args) \ ({ \ - uint _o = (v).order; \ - while (((v).count < ((1U << _o) REHASH_LO_MARK(args))) && \ - (_o > (REHASH_LO_BOUND(args)))) \ - _o -= (REHASH_LO_STEP(args)); \ - if (_o < (v).order) \ - rehash_fn(&(v), pool, _o - (v).order); \ - }) + if ((v).iterators) \ + (v).down_requested = 1; \ + else { \ + uint _o = (v).order; \ + while (((v).count < ((1U << _o) REHASH_LO_MARK(args))) && \ + (_o > (REHASH_LO_BOUND(args)))) \ + _o -= (REHASH_LO_STEP(args)); \ + if (_o < (v).order) \ + rehash_fn(&(v), pool, _o - (v).order); \ + } \ + }) #define HASH_INSERT2(v,id,pool,node) \ @@ -195,6 +201,20 @@ #define HASH_WALK_FILTER_END } while (0) +#define HASH_WALK_ITER(v, id, n, iter) \ + do { \ + uint _hash_walk_iter_put = 0; \ + uint _shift = 32 - (v).order; \ + for ( ; !_hash_walk_iter_put; (iter) += (1U << _shift)) { \ + _hash_walk_iter_put = ((iter) + (1U << _shift) == 0); \ + for (HASH_TYPE(v) *n = (v).data[(iter) >> _shift]; n; n = id##_NEXT((n)))\ + if (HASH_FN(v, id, id##_KEY(n)) >= ((iter) >> _shift)) \ + +#define HASH_WALK_ITER_PUT (_hash_walk_iter_put = 1) + +#define HASH_WALK_ITER_END } } while (0) + + static inline void mem_hash_init(u64 *h) { diff --git a/lib/hash_test.c b/lib/hash_test.c index 7ef54662..ecfcdd66 100644 --- a/lib/hash_test.c +++ b/lib/hash_test.c @@ -9,9 +9,7 @@ #undef LOCAL_DEBUG #include "test/birdtest.h" -#include "test/bt-utils.h" -#include "lib/io-loop.h" #include "lib/hash.h" struct test_node { @@ -63,7 +61,7 @@ dump_nodes(void) static void init_hash_(uint order) { - my_pool = rp_new(&root_pool, &main_birdloop, "Test pool"); + my_pool = rp_new(&root_pool, "Test pool"); HASH_INIT(hash, my_pool, order); @@ -287,11 +285,50 @@ t_walk_filter(void) return 1; } +static int +t_walk_iter(void) +{ + init_hash(); + fill_hash(); + + u32 hit = 0; + + u32 prev_hash = ~0; + for (uint cnt = 0; cnt < MAX_NUM; ) + { + u32 last_hash = ~0; +// printf("PUT!\n"); + HASH_WALK_ITER(hash, TEST, n, hit) + { + cnt++; + u32 cur_hash = HASH_FN(hash, TEST, n->key); + /* + printf("C%08x L%08x P%08x K%08x H%08x N%p S%d I%ld\n", + cur_hash, last_hash, prev_hash, n->key, hit, n, _shift, n - &nodes[0]); + */ + + if (last_hash == ~0U) + { + if (prev_hash != ~0U) + bt_assert(prev_hash < cur_hash); + last_hash = prev_hash = cur_hash; + } + else + bt_assert(last_hash == cur_hash); + + if (cnt < MAX_NUM) + HASH_WALK_ITER_PUT; + } + HASH_WALK_ITER_END; + } + + return 1; +} + int main(int argc, char *argv[]) { bt_init(argc, argv); - bt_bird_init(); bt_test_suite(t_insert_find, "HASH_INSERT and HASH_FIND"); bt_test_suite(t_insert_find_random, "HASH_INSERT pseudo-random keys and HASH_FIND"); @@ -302,6 +339,7 @@ main(int argc, char *argv[]) bt_test_suite(t_walk_delsafe_remove, "HASH_WALK_DELSAFE and HASH_REMOVE"); bt_test_suite(t_walk_delsafe_remove2, "HASH_WALK_DELSAFE and HASH_REMOVE2. HASH_REMOVE2 is HASH_REMOVE and smart auto-resize function"); bt_test_suite(t_walk_filter, "HASH_WALK_FILTER"); + bt_test_suite(t_walk_iter, "HASH_WALK_ITER"); return bt_exit_value(); } diff --git a/lib/io-loop.h b/lib/io-loop.h index d60fb1ae..877cd5ce 100644 --- a/lib/io-loop.h +++ b/lib/io-loop.h @@ -14,22 +14,17 @@ #include "lib/event.h" #include "lib/socket.h" -void sk_start(sock *s); -void sk_stop(sock *s); -void sk_reloop(sock *s, struct birdloop *loop); -void sk_ping(sock *s); - extern struct birdloop main_birdloop; -/* Start a new birdloop owned by given pool and domain. - * The loop allocates its internal pool for local allocations - * which is freed when the loop itself is stopped. */ -struct birdloop *birdloop_new(pool *p, uint order, const char *name); +/* Start a new birdloop owned by given pool and domain */ +struct birdloop *birdloop_new(pool *p, uint order, const char *name, btime max_latency); -/* Stop the loop. At the end, the @stopped callback is called with locked - * parent to finish cleanup. The loop then frees itself together with its pool. */ +/* Stop the loop. At the end, the @stopped callback is called unlocked in tail + * position to finish cleanup. Run birdloop_free() from that callback to free + * the loop itself. */ void birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data); void birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data); +void birdloop_free(struct birdloop *loop); /* Get birdloop's event list */ event_list *birdloop_event_list(struct birdloop *loop); @@ -37,9 +32,6 @@ event_list *birdloop_event_list(struct birdloop *loop); /* Get birdloop's time heap */ struct timeloop *birdloop_time_loop(struct birdloop *loop); -/* Get birdloop's resource pool */ -pool *birdloop_pool(struct birdloop *loop); - /* Enter and exit the birdloop */ void birdloop_enter(struct birdloop *loop); void birdloop_leave(struct birdloop *loop); @@ -54,5 +46,21 @@ void birdloop_unlink(struct birdloop *loop); void birdloop_ping(struct birdloop *loop); +struct birdloop_flag_handler { + void (*hook)(struct birdloop_flag_handler *, u32 flags); + void *data; +}; + +void birdloop_flag(struct birdloop *loop, u32 flag); +void birdloop_flag_set_handler(struct birdloop *, struct birdloop_flag_handler *); + +/* Setup sockets */ +void birdloop_add_socket(struct birdloop *, struct birdsock *); +void birdloop_remove_socket(struct birdloop *, struct birdsock *); + void birdloop_init(void); + +/* Yield for a little while. Use only in special cases. */ +void birdloop_yield(void); + #endif /* _BIRD_IO_LOOP_H_ */ @@ -85,25 +85,29 @@ ip4_classify(ip4_addr ad) u32 a = _I(ad); u32 b = a >> 24U; - if (b && b <= 0xdf) + if (b < 0xe0) { - if (b == 0x7f) + if (b == 0x00) /* 0.0.0.0/8 This network */ + return IADDR_INVALID; + + if (b == 0x7f) /* 127.0.0.0/8 Loopback address */ return IADDR_HOST | SCOPE_HOST; - else if ((b == 0x0a) || - ((a & 0xffff0000) == 0xc0a80000) || - ((a & 0xfff00000) == 0xac100000)) + + if ((b == 0x0a) || /* 10.0.0.0/8 Private range */ + ((a & 0xffff0000) == 0xc0a80000) || /* 192.168.0.0/16 Private range */ + ((a & 0xfff00000) == 0xac100000)) /* 172.16.0.0/12 Private range */ return IADDR_HOST | SCOPE_SITE; - else - return IADDR_HOST | SCOPE_UNIVERSE; + + return IADDR_HOST | SCOPE_UNIVERSE; } - if (b >= 0xe0 && b <= 0xef) + if (b < 0xf0) /* 224.0.0.0/4 Multicast address */ return IADDR_MULTICAST | SCOPE_UNIVERSE; - if (a == 0xffffffff) + if (a == 0xffffffff) /* 255.255.255.255 Broadcast address */ return IADDR_BROADCAST | SCOPE_LINK; - return IADDR_INVALID; + return IADDR_HOST | SCOPE_SITE; /* 240.0.0.0/4 Reserved / private */ } int @@ -279,11 +279,35 @@ static inline uint ip6_pxlen(ip6_addr a, ip6_addr b) return 32 * i + 31 - u32_log2(a.addr[i] ^ b.addr[i]); } +static inline int ip4_prefix_equal(ip4_addr a, ip4_addr b, uint n) +{ + return (_I(a) ^ _I(b)) < ((u64) 1 << (32 - n)); +} + +static inline int ip6_prefix_equal(ip6_addr a, ip6_addr b, uint n) +{ + uint n0 = n / 32; + uint n1 = n % 32; + + return + ((n0 <= 0) || (_I0(a) == _I0(b))) && + ((n0 <= 1) || (_I1(a) == _I1(b))) && + ((n0 <= 2) || (_I2(a) == _I2(b))) && + ((n0 <= 3) || (_I3(a) == _I3(b))) && + (!n1 || ((a.addr[n0] ^ b.addr[n0]) < (1u << (32 - n1)))); +} + static inline u32 ip4_getbit(ip4_addr a, uint pos) -{ return _I(a) & (0x80000000 >> pos); } +{ return (_I(a) >> (31 - pos)) & 1; } + +static inline u32 ip4_getbits(ip4_addr a, uint pos, uint n) +{ return (_I(a) >> ((32 - n) - pos)) & ((1u << n) - 1); } static inline u32 ip6_getbit(ip6_addr a, uint pos) -{ return a.addr[pos / 32] & (0x80000000 >> (pos % 32)); } +{ return (a.addr[pos / 32] >> (31 - (pos % 32))) & 0x1; } + +static inline u32 ip6_getbits(ip6_addr a, uint pos, uint n) +{ return (a.addr[pos / 32] >> ((32 - n) - (pos % 32))) & ((1u << n) - 1); } static inline u32 ip4_setbit(ip4_addr *a, uint pos) { return _I(*a) |= (0x80000000 >> pos); } @@ -297,6 +321,13 @@ static inline u32 ip4_clrbit(ip4_addr *a, uint pos) static inline u32 ip6_clrbit(ip6_addr *a, uint pos) { return a->addr[pos / 32] &= ~(0x80000000 >> (pos % 32)); } +static inline ip4_addr ip4_setbits(ip4_addr a, uint pos, uint val) +{ _I(a) |= val << (31 - pos); return a; } + +static inline ip6_addr ip6_setbits(ip6_addr a, uint pos, uint val) +{ a.addr[pos / 32] |= val << (31 - pos % 32); return a; } + + static inline ip4_addr ip4_opposite_m1(ip4_addr a) { return _MI4(_I(a) ^ 1); } @@ -331,11 +362,7 @@ static inline ip6_addr ip6_hton(ip6_addr a) static inline ip6_addr ip6_ntoh(ip6_addr a) { return _MI6(ntohl(_I0(a)), ntohl(_I1(a)), ntohl(_I2(a)), ntohl(_I3(a))); } -#define MPLS_MAX_LABEL_STACK 8 -typedef struct mpls_label_stack { - uint len; - u32 stack[MPLS_MAX_LABEL_STACK]; -} mpls_label_stack; +#define MPLS_MAX_LABEL_STACK 16 static inline int mpls_get(const char *buf, int buflen, u32 *stack) diff --git a/lib/ip_test.c b/lib/ip_test.c index 36d10d68..eee0a427 100644 --- a/lib/ip_test.c +++ b/lib/ip_test.c @@ -167,6 +167,70 @@ t_ip6_ntop(void) return bt_assert_batch(test_vectors, test_ipa_ntop, bt_fmt_ipa, bt_fmt_str); } +static int +t_ip4_prefix_equal(void) +{ + bt_assert( ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x1234ffff), 16)); + bt_assert(!ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x1234ffff), 17)); + bt_assert( ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345000), 21)); + bt_assert(!ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345000), 22)); + + bt_assert( ip4_prefix_equal(ip4_from_u32(0x00000000), ip4_from_u32(0xffffffff), 0)); + bt_assert( ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345678), 0)); + + bt_assert( ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345678), 32)); + bt_assert(!ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x12345679), 32)); + bt_assert(!ip4_prefix_equal(ip4_from_u32(0x12345678), ip4_from_u32(0x92345678), 32)); + + return 1; +} + +static int +t_ip6_prefix_equal(void) +{ + bt_assert( ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x1234ffff, 0xfefefefe, 0xdcdcdcdc), + 48)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x1234ffff, 0xfefefefe, 0xdcdcdcdc), + 49)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20020db8, 0x12345678, 0xfefefefe, 0xdcdcdcdc), + 48)); + + bt_assert( ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x12345678, 0xfefefefe, 0xdcdcdcdc), + 64)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x1234567e, 0xfefefefe, 0xdcdcdcdc), + 64)); + + bt_assert( ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20002020), + ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + 106)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20002020), + ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + 107)); + + bt_assert( ip6_prefix_equal(ip6_build(0xfeef0db8, 0x87654321, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x12345678, 0xfefefefe, 0xdcdcdcdc), + 0)); + + bt_assert( ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + 128)); + + bt_assert(!ip6_prefix_equal(ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202020), + ip6_build(0x20010db8, 0x12345678, 0x10101010, 0x20202021), + 128)); + + return 1; +} + int main(int argc, char *argv[]) { @@ -176,6 +240,8 @@ main(int argc, char *argv[]) bt_test_suite(t_ip6_pton, "Converting IPv6 string to ip6_addr struct"); bt_test_suite(t_ip4_ntop, "Converting ip4_addr struct to IPv4 string"); bt_test_suite(t_ip6_ntop, "Converting ip6_addr struct to IPv6 string"); + bt_test_suite(t_ip4_prefix_equal, "Testing ip4_prefix_equal()"); + bt_test_suite(t_ip6_prefix_equal, "Testing ip6_prefix_equal()"); return bt_exit_value(); } diff --git a/lib/lists.c b/lib/lists.c index 2f92defe..cabfddba 100644 --- a/lib/lists.c +++ b/lib/lists.c @@ -35,10 +35,10 @@ check_list(list *l, node *n) if (!l) { ASSERT_DIE(n); + ASSERT_DIE(n->prev); node *nn = n; - while (nn->prev) - nn = nn->prev; + do { nn = nn->prev; } while (nn->prev); l = SKIP_BACK(list, head_node, nn); } @@ -61,7 +61,7 @@ check_list(list *l, node *n) } ASSERT_DIE(cur == &(l->tail_node)); - ASSERT_DIE(!n || (seen == 1) || (n == &l->head_node) || (n == &l->tail_node)); + ASSERT_DIE(!n || (seen == 1)); return 1; } @@ -110,15 +110,6 @@ add_head(list *l, node *n) l->head = n; } -LIST_INLINE void -self_link(node *n) -{ - ASSUME(n->prev == NULL); - ASSUME(n->next == NULL); - - n->prev = n->next = n; -} - /** * insert_node - insert a node to a list * @n: a new list node @@ -130,7 +121,7 @@ self_link(node *n) LIST_INLINE void insert_node(node *n, node *after) { - EXPENSIVE_CHECK(check_list(NULL, after)); + EXPENSIVE_CHECK((after->prev == NULL) || check_list(NULL, after)); ASSUME(n->prev == NULL); ASSUME(n->next == NULL); @@ -151,7 +142,7 @@ insert_node(node *n, node *after) LIST_INLINE void rem_node(node *n) { - EXPENSIVE_CHECK((n == n->prev) && (n == n->next) || check_list(NULL, n)); + EXPENSIVE_CHECK((n->prev == n) && (n->next == n) || check_list(NULL, n)); node *z = n->prev; node *x = n->next; @@ -208,9 +199,6 @@ add_tail_list(list *to, list *l) EXPENSIVE_CHECK(check_list(to, NULL)); EXPENSIVE_CHECK(check_list(l, NULL)); - ASSERT_DIE(l->head->prev == &l->head_node); - ASSERT_DIE(l->tail->next == &l->tail_node); - node *p = to->tail; node *q = l->head; diff --git a/lib/lists.h b/lib/lists.h index f36c051c..86ff59c9 100644 --- a/lib/lists.h +++ b/lib/lists.h @@ -91,7 +91,6 @@ enlisted(node *n) #define LIST_INLINE void add_tail(list *, node *); void add_head(list *, node *); -void self_link(node *); void rem_node(node *); void add_tail_list(list *, list *); void init_list(list *); diff --git a/lib/locking.h b/lib/locking.h index 1a8bdcd4..6bed14bf 100644 --- a/lib/locking.h +++ b/lib/locking.h @@ -13,12 +13,13 @@ struct domain_generic; /* Here define the global lock order; first to last. */ struct lock_order { + struct domain_generic *meta; struct domain_generic *the_bird; + struct domain_generic *control; struct domain_generic *proto; + struct domain_generic *service; struct domain_generic *rtable; struct domain_generic *attrs; - struct domain_generic *cork; - struct domain_generic *event; struct domain_generic *resource; }; @@ -35,6 +36,9 @@ struct domain_generic *domain_new(const char *name, uint order); #define DOMAIN_FREE(type, d) domain_free((d).type) void domain_free(struct domain_generic *); +#define DOMAIN_NAME(type, d) domain_name((d).type) +const char *domain_name(struct domain_generic *); + #define DOMAIN_NULL(type) (DOMAIN(type)) {} #define LOCK_DOMAIN(type, d) do_lock(((d).type), &(locking_stack.type)) diff --git a/lib/macro.h b/lib/macro.h index 24fc3393..8f5d4b0e 100644 --- a/lib/macro.h +++ b/lib/macro.h @@ -26,6 +26,8 @@ #define MACRO_DROP(...) #define MACRO_UNPAREN(...) __VA_ARGS__ #define MACRO_SEP(a, b, sep) a sep b +#define MACRO_STR(a) #a +#define MACRO_STR_AFTER(a) MACRO_STR(a) /* Aliases for some special chars */ #define MACRO_COMMA , diff --git a/lib/mempool.c b/lib/mempool.c index 6d157e5c..be98de51 100644 --- a/lib/mempool.c +++ b/lib/mempool.c @@ -27,24 +27,22 @@ struct lp_chunk { struct lp_chunk *next; - uint size; uintptr_t data_align[0]; byte data[0]; }; -const int lp_chunk_size = sizeof(struct lp_chunk); +#define LP_DATA_SIZE (page_size - OFFSETOF(struct lp_chunk, data)) struct linpool { resource r; byte *ptr, *end; - pool *p; struct lp_chunk *first, *current; /* Normal (reusable) chunks */ struct lp_chunk *first_large; /* Large chunks */ - uint chunk_size, threshold, total:31, use_pages:1, total_large; + uint total, total_large; }; static void lp_free(resource *); -static void lp_dump(resource *); +static void lp_dump(resource *, unsigned); static resource *lp_lookup(resource *, unsigned long); static struct resmem lp_memsize(resource *r); @@ -60,26 +58,14 @@ static struct resclass lp_class = { /** * lp_new - create a new linear memory pool * @p: pool - * @blk: block size * * lp_new() creates a new linear memory pool resource inside the pool @p. - * The linear pool consists of a list of memory chunks of size at least - * @blk. + * The linear pool consists of a list of memory chunks of page size. */ linpool -*lp_new(pool *p, uint blk) +*lp_new(pool *p) { - linpool *m = ralloc(p, &lp_class); - m->p = p; - if (!blk) - { - m->use_pages = 1; - blk = page_size - lp_chunk_size; - } - - m->chunk_size = blk; - m->threshold = 3*blk/4; - return m; + return ralloc(p, &lp_class); } /** @@ -110,14 +96,13 @@ lp_alloc(linpool *m, uint size) else { struct lp_chunk *c; - if (size >= m->threshold) + if (size > LP_DATA_SIZE) { /* Too large => allocate large chunk */ c = xmalloc(sizeof(struct lp_chunk) + size); m->total_large += size; c->next = m->first_large; m->first_large = c; - c->size = size; } else { @@ -129,14 +114,10 @@ lp_alloc(linpool *m, uint size) else { /* Need to allocate a new chunk */ - if (m->use_pages) - c = alloc_page(); - else - c = xmalloc(sizeof(struct lp_chunk) + m->chunk_size); + c = alloc_page(); - m->total += m->chunk_size; + m->total += LP_DATA_SIZE; c->next = NULL; - c->size = m->chunk_size; if (m->current) m->current->next = c; @@ -145,7 +126,7 @@ lp_alloc(linpool *m, uint size) } m->current = c; m->ptr = c->data + size; - m->end = c->data + m->chunk_size; + m->end = c->data + LP_DATA_SIZE; } return c->data; } @@ -204,10 +185,17 @@ lp_flush(linpool *m) { struct lp_chunk *c; - /* Move ptr to the first chunk and free all large chunks */ + /* Move ptr to the first chunk and free all other chunks */ m->current = c = m->first; m->ptr = c ? c->data : NULL; - m->end = c ? c->data + m->chunk_size : NULL; + m->end = c ? c->data + LP_DATA_SIZE : NULL; + + while (c && c->next) + { + struct lp_chunk *d = c->next; + c->next = d->next; + free_page(d); + } while (c = m->first_large) { @@ -230,6 +218,7 @@ lp_save(linpool *m, lp_state *p) { p->current = m->current; p->large = m->first_large; + p->total_large = m->total_large; p->ptr = m->ptr; } @@ -249,14 +238,14 @@ lp_restore(linpool *m, lp_state *p) struct lp_chunk *c; /* Move ptr to the saved pos and free all newer large chunks */ - m->current = c = p->current; - m->ptr = p->ptr; - m->end = c ? c->data + m->chunk_size : NULL; + m->current = c = p->current ?: m->first; + m->ptr = p->ptr ?: (c ? c->data : NULL); + m->end = c ? (c->data + LP_DATA_SIZE) : NULL; + m->total_large = p->total_large; while ((c = m->first_large) && (c != p->large)) { m->first_large = c->next; - m->total_large -= c->size; xfree(c); } } @@ -270,10 +259,7 @@ lp_free(resource *r) for(d=m->first; d; d = c) { c = d->next; - if (m->use_pages) - free_page(d); - else - xfree(d); + free_page(d); } for(d=m->first_large; d; d = c) { @@ -283,42 +269,52 @@ lp_free(resource *r) } static void -lp_dump(resource *r) +lp_dump(resource *r, unsigned indent) { linpool *m = (linpool *) r; struct lp_chunk *c; int cnt, cntl; + char x[32]; for(cnt=0, c=m->first; c; c=c->next, cnt++) ; for(cntl=0, c=m->first_large; c; c=c->next, cntl++) ; - debug("(chunk=%d threshold=%d count=%d+%d total=%d+%d)\n", - m->chunk_size, - m->threshold, + debug("(count=%d+%d total=%d+%d)\n", cnt, cntl, m->total, m->total_large); + + bsprintf(x, "%%%dschunk %%p\n", indent + 2); + for (c=m->first; c; c=c->next) + debug(x, "", c); + + bsprintf(x, "%%%dslarge %%p\n", indent + 2); + for (c=m->first_large; c; c=c->next) + debug(x, "", c); } static struct resmem lp_memsize(resource *r) { linpool *m = (linpool *) r; - struct lp_chunk *c; - int cnt = 0; - - for(c=m->first; c; c=c->next) - cnt++; - for(c=m->first_large; c; c=c->next) - cnt++; - - return (struct resmem) { - .effective = m->total + m->total_large, - .overhead = ALLOC_OVERHEAD + sizeof(struct linpool) + - cnt * (ALLOC_OVERHEAD + sizeof(struct lp_chunk)), + struct resmem sz = { + .overhead = sizeof(struct linpool) + ALLOC_OVERHEAD, + .effective = m->total_large, }; + + for (struct lp_chunk *c = m->first_large; c; c = c->next) + sz.overhead += sizeof(struct lp_chunk) + ALLOC_OVERHEAD; + + uint regular = 0; + for (struct lp_chunk *c = m->first; c; c = c->next) + regular++; + + sz.effective += LP_DATA_SIZE * regular; + sz.overhead += (sizeof(struct lp_chunk) + ALLOC_OVERHEAD) * regular; + + return sz; } @@ -329,10 +325,7 @@ lp_lookup(resource *r, unsigned long a) struct lp_chunk *c; for(c=m->first; c; c=c->next) - if ((unsigned long) c->data <= a && (unsigned long) c->data + c->size > a) - return r; - for(c=m->first_large; c; c=c->next) - if ((unsigned long) c->data <= a && (unsigned long) c->data + c->size > a) + if ((unsigned long) c->data <= a && (unsigned long) c->data + LP_DATA_SIZE > a) return r; return NULL; } @@ -57,6 +57,19 @@ const u16 net_max_text_length[] = { [NET_MPLS] = 7, /* "1048575" */ }; +/* There should be no implicit padding in net_addr structures */ +STATIC_ASSERT(sizeof(net_addr) == 24); +STATIC_ASSERT(sizeof(net_addr_ip4) == 8); +STATIC_ASSERT(sizeof(net_addr_ip6) == 20); +STATIC_ASSERT(sizeof(net_addr_vpn4) == 16); +STATIC_ASSERT(sizeof(net_addr_vpn6) == 32); +STATIC_ASSERT(sizeof(net_addr_roa4) == 16); +STATIC_ASSERT(sizeof(net_addr_roa6) == 28); +STATIC_ASSERT(sizeof(net_addr_flow4) == 8); +STATIC_ASSERT(sizeof(net_addr_flow6) == 20); +STATIC_ASSERT(sizeof(net_addr_ip6_sadr) == 40); +STATIC_ASSERT(sizeof(net_addr_mpls) == 8); + int rd_format(const u64 rd, char *buf, int buflen) @@ -310,22 +323,3 @@ net_in_netX(const net_addr *a, const net_addr *n) return (net_pxlen(n) <= net_pxlen(a)) && ipa_in_netX(net_prefix(a), n); } - -#define CHECK_NET(T,S) \ - ({ if (sizeof(T) != S) die("sizeof %s is %d/%d", #T, (int) sizeof(T), S); }) - -void -net_init(void) -{ - CHECK_NET(net_addr, 24); - CHECK_NET(net_addr_ip4, 8); - CHECK_NET(net_addr_ip6, 20); - CHECK_NET(net_addr_vpn4, 16); - CHECK_NET(net_addr_vpn6, 32); - CHECK_NET(net_addr_roa4, 16); - CHECK_NET(net_addr_roa6, 28); - CHECK_NET(net_addr_flow4, 8); - CHECK_NET(net_addr_flow6, 20); - CHECK_NET(net_addr_ip6_sadr, 40); - CHECK_NET(net_addr_mpls, 8); -} @@ -38,6 +38,7 @@ #define NB_IP (NB_IP4 | NB_IP6) #define NB_VPN (NB_VPN4 | NB_VPN6) +#define NB_ROA (NB_ROA4 | NB_ROA6) #define NB_FLOW (NB_FLOW4 | NB_FLOW6) #define NB_DEST (NB_IP | NB_IP6_SADR | NB_VPN | NB_MPLS) #define NB_ANY 0xffffffff @@ -619,6 +620,4 @@ static inline int net_in_net_src_ip6_sadr(const net_addr_ip6_sadr *a, const net_ int ipa_in_netX(const ip_addr A, const net_addr *N); int net_in_netX(const net_addr *A, const net_addr *N); -void net_init(void); - #endif diff --git a/lib/printf.c b/lib/printf.c index 236df427..424d545f 100644 --- a/lib/printf.c +++ b/lib/printf.c @@ -568,3 +568,51 @@ buffer_puts(buffer *buf, const char *str) buf->pos = (bp < be) ? bp : buf->end; } + +#define POOL_PRINTF_MAXBUF 1024 + +char *mb_vsprintf(pool *p, const char *fmt, va_list args) +{ + char buf[POOL_PRINTF_MAXBUF]; + int count = bvsnprintf(buf, POOL_PRINTF_MAXBUF, fmt, args); + + if (count < 0) + bug("Attempted to mb_vsprintf() a too long string"); + + char *out = mb_alloc(p, count + 1); + memcpy(out, buf, count + 1); + return out; +} + +char *mb_sprintf(pool *p, const char *fmt, ...) +{ + va_list args; + char *out; + va_start(args, fmt); + out = mb_vsprintf(p, fmt, args); + va_end(args); + return out; +} + +char *lp_vsprintf(linpool *p, const char *fmt, va_list args) +{ + char buf[POOL_PRINTF_MAXBUF]; + int count = bvsnprintf(buf, POOL_PRINTF_MAXBUF, fmt, args); + + if (count < 0) + bug("Attempted to mb_vsprintf() a too long string"); + + char *out = lp_alloc(p, count + 1); + memcpy(out, buf, count + 1); + return out; +} + +char *lp_sprintf(linpool *p, const char *fmt, ...) +{ + va_list args; + char *out; + va_start(args, fmt); + out = lp_vsprintf(p, fmt, args); + va_end(args); + return out; +} diff --git a/lib/printf_test.c b/lib/printf_test.c index 47ea905d..88ecf05e 100644 --- a/lib/printf_test.c +++ b/lib/printf_test.c @@ -32,11 +32,14 @@ t_simple(void) BSPRINTF(1, "@", buf, "@", 64); BSPRINTF(1, "\xff", buf, "%c", 0xff); - errno = 5; - BSPRINTF(18, "Input/output error", buf, "%m"); + const char *io_error_str = lp_strdup(tmp_linpool, strerror(EIO)); + const int io_error_len = strlen(io_error_str); + + errno = EIO; + BSPRINTF(io_error_len, io_error_str, buf, "%m"); errno = 0; - BSPRINTF(18, "Input/output error", buf, "%M", 5); + BSPRINTF(io_error_len, io_error_str, buf, "%M", EIO); BSPRINTF(11, "TeSt%StRiNg", buf, "%s", "TeSt%StRiNg"); @@ -13,15 +13,15 @@ */ #include "lib/rcu.h" -#include "lib/coro.h" +#include "lib/io-loop.h" #include "lib/locking.h" _Atomic uint rcu_gp_ctl = RCU_NEST_CNT; -_Thread_local struct rcu_coro *this_rcu_coro = NULL; +_Thread_local struct rcu_thread *this_rcu_thread = NULL; -static list rcu_coro_list; +static list rcu_thread_list; -static struct rcu_coro main_rcu_coro; +static struct rcu_thread main_rcu_thread; DEFINE_DOMAIN(resource); static DOMAIN(resource) rcu_domain; @@ -37,10 +37,10 @@ static void update_counter_and_wait(void) { atomic_fetch_xor(&rcu_gp_ctl, RCU_GP_PHASE); - struct rcu_coro *rc; - WALK_LIST(rc, rcu_coro_list) + struct rcu_thread *rc; + WALK_LIST(rc, rcu_thread_list) while (rcu_gp_ongoing(&rc->ctl)) - coro_yield(); + birdloop_yield(); } void @@ -53,19 +53,19 @@ synchronize_rcu(void) } void -rcu_coro_start(struct rcu_coro *rc) +rcu_thread_start(struct rcu_thread *rc) { LOCK_DOMAIN(resource, rcu_domain); - add_tail(&rcu_coro_list, &rc->n); - this_rcu_coro = rc; + add_tail(&rcu_thread_list, &rc->n); + this_rcu_thread = rc; UNLOCK_DOMAIN(resource, rcu_domain); } void -rcu_coro_stop(struct rcu_coro *rc) +rcu_thread_stop(struct rcu_thread *rc) { LOCK_DOMAIN(resource, rcu_domain); - this_rcu_coro = NULL; + this_rcu_thread = NULL; rem_node(&rc->n); UNLOCK_DOMAIN(resource, rcu_domain); } @@ -74,6 +74,6 @@ void rcu_init(void) { rcu_domain = DOMAIN_NEW(resource, "Read-Copy-Update"); - init_list(&rcu_coro_list); - rcu_coro_start(&main_rcu_coro); + init_list(&rcu_thread_list); + rcu_thread_start(&main_rcu_thread); } @@ -21,33 +21,33 @@ extern _Atomic uint rcu_gp_ctl; -struct rcu_coro { +struct rcu_thread { node n; _Atomic uint ctl; }; -extern _Thread_local struct rcu_coro *this_rcu_coro; +extern _Thread_local struct rcu_thread *this_rcu_thread; static inline void rcu_read_lock(void) { - uint cmp = atomic_load_explicit(&this_rcu_coro->ctl, memory_order_acquire); + uint cmp = atomic_load_explicit(&this_rcu_thread->ctl, memory_order_acquire); if (cmp & RCU_NEST_MASK) - atomic_store_explicit(&this_rcu_coro->ctl, cmp + RCU_NEST_CNT, memory_order_relaxed); + atomic_store_explicit(&this_rcu_thread->ctl, cmp + RCU_NEST_CNT, memory_order_relaxed); else - atomic_store(&this_rcu_coro->ctl, atomic_load_explicit(&rcu_gp_ctl, memory_order_acquire)); + atomic_store(&this_rcu_thread->ctl, atomic_load_explicit(&rcu_gp_ctl, memory_order_acquire)); } static inline void rcu_read_unlock(void) { - atomic_fetch_sub(&this_rcu_coro->ctl, RCU_NEST_CNT); + atomic_fetch_sub(&this_rcu_thread->ctl, RCU_NEST_CNT); } void synchronize_rcu(void); -/* Registering and unregistering a coroutine. To be called from coroutine implementation */ -void rcu_coro_start(struct rcu_coro *); -void rcu_coro_stop(struct rcu_coro *); +/* Registering and unregistering a birdloop. To be called from birdloop implementation */ +void rcu_thread_start(struct rcu_thread *); +void rcu_thread_stop(struct rcu_thread *); /* Run this from resource init */ void rcu_init(void); diff --git a/lib/resource.c b/lib/resource.c index 9a4c7717..94b8d019 100644 --- a/lib/resource.c +++ b/lib/resource.c @@ -15,7 +15,6 @@ #include "lib/resource.h" #include "lib/string.h" #include "lib/rcu.h" -#include "lib/io-loop.h" /** * DOC: Resource pools @@ -31,7 +30,7 @@ * is freed upon shutdown of the module. */ -static void pool_dump(resource *); +static void pool_dump(resource *, unsigned); static void pool_free(resource *); static resource *pool_lookup(resource *, unsigned long); static struct resmem pool_memsize(resource *P); @@ -47,44 +46,43 @@ static struct resclass pool_class = { pool root_pool; -static int indent; - /** * rp_new - create a resource pool * @p: parent pool - * @l: loop to assign * @name: pool name (to be included in debugging dumps) * * rp_new() creates a new resource pool inside the specified * parent pool. */ pool * -rp_new(pool *p, struct birdloop *loop, const char *name) +rp_new(pool *p, const char *name) { - ASSERT_DIE(birdloop_inside(p->loop)); - ASSERT_DIE(birdloop_inside(loop)); - pool *z = ralloc(p, &pool_class); - z->loop = loop; z->name = name; init_list(&z->inside); return z; } -_Thread_local static pool *pool_parent = NULL; +pool * +rp_newf(pool *p, const char *fmt, ...) +{ + pool *z = rp_new(p, NULL); + + va_list args; + va_start(args, fmt); + z->name = mb_vsprintf(p, fmt, args); + va_end(args); + + return z; +} + static void pool_free(resource *P) { - ASSERT_DIE(pool_parent); - pool *p = (pool *) P; - ASSERT_DIE(birdloop_inside(p->loop)); - - pool *parent = pool_parent; - pool_parent = p; - resource *r, *rr; + r = HEAD(p->inside); while (rr = (resource *) r->n.next) { @@ -92,73 +90,23 @@ pool_free(resource *P) xfree(r); r = rr; } - - pool_parent = parent; -} - -void -rp_free(pool *p, pool *parent) -{ - ASSERT_DIE(pool_parent == NULL); - pool_parent = parent; - rfree(p); - ASSERT_DIE(pool_parent == parent); - pool_parent = NULL; } static void -pool_dump_locked(pool *p) +pool_dump(resource *P, unsigned indent) { + pool *p = (pool *) P; resource *r; - + debug("%s\n", p->name); - indent += 3; WALK_LIST(r, p->inside) - rdump(r); - indent -= 3; -} - -static void -pool_dump(resource *P) -{ - pool *p = (pool *) P; - - if (p->loop != pool_parent->loop) - birdloop_enter(p->loop); - - pool *parent = pool_parent; - pool_parent = p; - - pool_dump_locked(p); - - pool_parent = parent; - - if (p->loop != pool_parent->loop) - birdloop_leave(p->loop); -} - -void -rp_dump(pool *p) -{ - int inside = birdloop_inside(p->loop); - if (!inside) - birdloop_enter(p->loop); - - ASSERT_DIE(pool_parent == NULL); - pool_parent = p; - - pool_dump_locked(p); - - ASSERT_DIE(pool_parent == p); - pool_parent = NULL; - - if (!inside) - birdloop_leave(p->loop); + rdump(r, indent + 3); } static struct resmem -pool_memsize_locked(pool *p) +pool_memsize(resource *P) { + pool *p = (pool *) P; resource *r; struct resmem sum = { .effective = 0, @@ -175,46 +123,6 @@ pool_memsize_locked(pool *p) return sum; } -static struct resmem -pool_memsize(resource *P) -{ - pool *p = (pool *) P; - - pool *parent = pool_parent; - pool_parent = p; - - if (p->loop != parent->loop) - birdloop_enter(p->loop); - - struct resmem sum = pool_memsize_locked(p); - - if (p->loop != parent->loop) - birdloop_leave(p->loop); - - pool_parent = parent; - - return sum; -} - -struct resmem -rp_memsize(pool *p) -{ - int inside = birdloop_inside(p->loop); - if (!inside) - birdloop_enter(p->loop); - - ASSERT_DIE(pool_parent == NULL); - pool_parent = p; - struct resmem sum = pool_memsize_locked(p); - ASSERT_DIE(pool_parent == p); - pool_parent = NULL; - - if (!inside) - birdloop_leave(p->loop); - - return sum; -} - static resource * pool_lookup(resource *P, unsigned long a) { @@ -282,7 +190,7 @@ rfree(void *res) * It works by calling a class-specific dump function. */ void -rdump(void *res) +rdump(void *res, unsigned indent) { char x[16]; resource *r = res; @@ -292,7 +200,7 @@ rdump(void *res) if (r) { debug("%s ", r->class->name); - r->class->dump(r); + r->class->dump(r, indent); } else debug("NULL\n"); @@ -325,15 +233,12 @@ rmemsize(void *res) void * ralloc(pool *p, struct resclass *c) { - ASSERT_DIE(p); - ASSERT_DIE(birdloop_inside(p->loop)); - resource *r = xmalloc(c->size); bzero(r, c->size); r->class = c; - add_tail(&p->inside, &r->n); - + if (p) + add_tail(&p->inside, &r->n); return r; } @@ -355,7 +260,7 @@ rlookup(unsigned long a) debug("Looking up %08lx\n", a); if (r = pool_lookup(&root_pool.r, a)) - rdump(r); + rdump(r, 3); else debug("Not found.\n"); } @@ -371,12 +276,33 @@ void resource_init(void) { rcu_init(); + resource_sys_init(); root_pool.r.class = &pool_class; root_pool.name = "Root"; init_list(&root_pool.inside); + tmp_init(&root_pool); } +_Thread_local struct tmp_resources tmp_res; + +void +tmp_init(pool *p) +{ + tmp_res.lp = lp_new_default(p); + tmp_res.parent = p; + tmp_res.pool = rp_new(p, "TMP"); +} + +void +tmp_flush(void) +{ + lp_flush(tmp_linpool); + rfree(tmp_res.pool); + tmp_res.pool = rp_new(tmp_res.parent, "TMP"); +} + + /** * DOC: Memory blocks * @@ -401,7 +327,7 @@ static void mbl_free(resource *r UNUSED) { } -static void mbl_debug(resource *r) +static void mbl_debug(resource *r, unsigned indent UNUSED) { struct mblock *m = (struct mblock *) r; @@ -509,21 +435,6 @@ mb_realloc(void *m, unsigned size) return b->data; } -/** - * mb_move - move a memory block - * @m: memory block - * @p: target pool - * - * mb_move() moves the given memory block to another pool in the same way - * as rmove() moves a plain resource. - */ -void -mb_move(void *m, pool *p) -{ - struct mblock *b = SKIP_BACK(struct mblock, data, m); - rmove(b, p); -} - /** * mb_free - free a memory block @@ -542,6 +453,7 @@ mb_free(void *m) } + #define STEP_UP(x) ((x) + (x)/2 + 4) void diff --git a/lib/resource.h b/lib/resource.h index 3e589e32..911b990d 100644 --- a/lib/resource.h +++ b/lib/resource.h @@ -30,7 +30,7 @@ struct resclass { char *name; /* Resource class name */ unsigned size; /* Standard size of single resource */ void (*free)(resource *); /* Freeing function */ - void (*dump)(resource *); /* Dump to debug output */ + void (*dump)(resource *, unsigned indent); /* Dump to debug output */ resource *(*lookup)(resource *, unsigned long); /* Look up address (only for debugging) */ struct resmem (*memsize)(resource *); /* Return size of memory used by the resource, may be NULL */ }; @@ -43,25 +43,21 @@ struct resclass { typedef struct pool { resource r; list inside; - struct birdloop *loop; const char *name; } pool; -void resource_init(void); +void resource_init(void); +pool *rp_new(pool *, const char *); /* Create new pool */ +pool *rp_newf(pool *, const char *, ...); /* Create a new pool with a formatted string as its name */ void rfree(void *); /* Free single resource */ -void rdump(void *); /* Dump to debug output */ +void rdump(void *, unsigned indent); /* Dump to debug output */ struct resmem rmemsize(void *res); /* Return size of memory used by the resource */ void rlookup(unsigned long); /* Look up address (only for debugging) */ void rmove(void *, pool *); /* Move to a different pool */ void *ralloc(pool *, struct resclass *); -pool *rp_new(pool *, struct birdloop *loop, const char *); /* Create new pool */ -void rp_free(pool *p, pool *parent); /* Free parent pool */ -struct resmem rp_memsize(pool *p); /* Return size of memory used by the pool */ -void rp_dump(pool *p); /* Dump pool to debug output */ - extern pool root_pool; /* Normal memory blocks */ @@ -69,7 +65,6 @@ extern pool root_pool; void *mb_alloc(pool *, unsigned size); void *mb_allocz(pool *, unsigned size); void *mb_realloc(void *m, unsigned size); -void mb_move(void *, pool *); void mb_free(void *); /* Memory pools with linear allocation */ @@ -79,9 +74,10 @@ typedef struct linpool linpool; typedef struct lp_state { void *current, *large; byte *ptr; + uint total_large; } lp_state; -linpool *lp_new(pool *, unsigned blk); +linpool *lp_new(pool *); void *lp_alloc(linpool *, unsigned size); /* Aligned */ void *lp_allocu(linpool *, unsigned size); /* Unaligned */ void *lp_allocz(linpool *, unsigned size); /* With clear */ @@ -89,10 +85,23 @@ void lp_flush(linpool *); /* Free everything, but leave linpool */ void lp_save(linpool *m, lp_state *p); /* Save state */ void lp_restore(linpool *m, lp_state *p); /* Restore state */ -extern const int lp_chunk_size; -#define LP_GAS 1024 -#define LP_GOOD_SIZE(x) (((x + LP_GAS - 1) & (~(LP_GAS - 1))) - lp_chunk_size) -#define lp_new_default(p) lp_new(p, 0) +struct tmp_resources { + pool *pool, *parent; + linpool *lp; +}; + +extern _Thread_local struct tmp_resources tmp_res; + +#define tmp_linpool tmp_res.lp +#define tmp_alloc(sz) lp_alloc(tmp_linpool, sz) +#define tmp_allocu(sz) lp_allocu(tmp_linpool, sz) +#define tmp_allocz(sz) lp_allocz(tmp_linpool, sz) + +void tmp_init(pool *p); +void tmp_flush(void); + + +#define lp_new_default lp_new /* Slabs */ @@ -101,7 +110,7 @@ typedef struct slab slab; slab *sl_new(pool *, unsigned size); void *sl_alloc(slab *); void *sl_allocz(slab *); -void sl_free(slab *, void *); +void sl_free(void *); /* * Low-level memory allocation functions, please don't use @@ -110,12 +119,16 @@ void sl_free(slab *, void *); void buffer_realloc(void **buf, unsigned *size, unsigned need, unsigned item_size); -extern long page_size; - /* Allocator of whole pages; for use in slabs and other high-level allocators. */ +#define PAGE_HEAD(x) ((void *) (((uintptr_t) (x)) & ~(page_size-1))) +extern long page_size; +extern _Atomic int pages_kept; +extern _Atomic int pages_kept_locally; void *alloc_page(void); void free_page(void *); -#define PAGE_HEAD(x) ((void *) (((intptr_t) (x)) & ~(page_size-1))) +void flush_local_pages(void); + +void resource_sys_init(void); #ifdef HAVE_LIBDMALLOC /* diff --git a/lib/route.h b/lib/route.h new file mode 100644 index 00000000..6189895a --- /dev/null +++ b/lib/route.h @@ -0,0 +1,560 @@ +/* + * BIRD Internet Routing Daemon -- Routing data structures + * + * (c) 1998--2000 Martin Mares <mj@ucw.cz> + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LIB_ROUTE_H_ +#define _BIRD_LIB_ROUTE_H_ + +#undef RT_SOURCE_DEBUG + +#include "lib/type.h" +#include "lib/rcu.h" +#include "lib/hash.h" +#include "lib/event.h" + +struct network; +struct proto; +struct cli; +struct rtable_private; + +typedef struct rte { + struct ea_list *attrs; /* Attributes of this route */ + const net_addr *net; /* Network this RTE belongs to */ + struct rte_src *src; /* Route source that created the route */ + struct rt_import_hook *sender; /* Import hook used to send the route to the routing table */ + btime lastmod; /* Last modified (set by table) */ + u32 id; /* Table specific route id */ + byte flags; /* Table-specific flags */ + byte pflags; /* Protocol-specific flags */ + u8 generation; /* If this route import is based on other previously exported route, + this value should be 1 + MAX(generation of the parent routes). + Otherwise the route is independent and this value is zero. */ + u8 stale_cycle; /* Auxiliary value for route refresh */ +} rte; + +#define REF_FILTERED 2 /* Route is rejected by import filter */ +#define REF_PENDING 32 /* Route has not propagated completely yet */ + +/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ +static inline int rte_is_valid(const rte *r) { return r && !(r->flags & REF_FILTERED); } + +/* Route just has REF_FILTERED flag */ +static inline int rte_is_filtered(const rte *r) { return !!(r->flags & REF_FILTERED); } + +/* Strip the route of the table-specific values */ +static inline rte rte_init_from(const rte *r) +{ + return (rte) { + .attrs = r->attrs, + .net = r->net, + .src = r->src, + }; +} + +struct rte_src { + struct rte_src *next; /* Hash chain */ + struct rte_owner *owner; /* Route source owner */ + u32 private_id; /* Private ID, assigned by the protocol */ + u32 global_id; /* Globally unique ID of the source */ + _Atomic u64 uc; /* Use count */ +}; + +struct rte_owner_class { + void (*get_route_info)(const rte *, byte *buf); /* Get route information (for `show route' command) */ + int (*rte_better)(const rte *, const rte *); + int (*rte_mergable)(const rte *, const rte *); + u32 (*rte_igp_metric)(const rte *); +}; + +struct rte_owner { + struct rte_owner_class *class; + int (*rte_recalculate)(struct rtable_private *, struct network *, struct rte *, struct rte *, struct rte *); + HASH(struct rte_src) hash; + const char *name; + u32 hash_key; + u32 uc; + event_list *list; + event *prune; + event *stop; +}; + +DEFINE_DOMAIN(attrs); +extern DOMAIN(attrs) attrs_domain; + +#define RTA_LOCK LOCK_DOMAIN(attrs, attrs_domain) +#define RTA_UNLOCK UNLOCK_DOMAIN(attrs, attrs_domain) + +#define RTE_SRC_PU_SHIFT 44 +#define RTE_SRC_IN_PROGRESS (1ULL << RTE_SRC_PU_SHIFT) + +/* Get a route source. This also locks the source, therefore the caller has to + * unlock the source after the route has been propagated. */ +struct rte_src *rt_get_source_o(struct rte_owner *o, u32 id); +#define rt_get_source(p, id) rt_get_source_o(&(p)->sources, (id)) + +struct rte_src *rt_find_source_global(u32 id); + +#ifdef RT_SOURCE_DEBUG +#define rt_lock_source _rt_lock_source_internal +#define rt_unlock_source _rt_unlock_source_internal +#endif + +static inline void rt_lock_source(struct rte_src *src) +{ + /* Locking a source is trivial; somebody already holds it so we just increase + * the use count. Nothing can be freed underneath our hands. */ + u64 uc = atomic_fetch_add_explicit(&src->uc, 1, memory_order_acq_rel); + ASSERT_DIE(uc > 0); +} + +static inline void rt_unlock_source(struct rte_src *src) +{ + /* Unlocking is tricky. We do it lockless so at the same time, the prune + * event may be running, therefore if the unlock gets us to zero, it must be + * the last thing in this routine, otherwise the prune routine may find the + * source's usecount zeroed, freeing it prematurely. + * + * The usecount is split into two parts: + * the top 20 bits are an in-progress indicator + * the bottom 44 bits keep the actual usecount. + * + * Therefore at most 1 million of writers can simultaneously unlock the same + * source, while at most ~17T different routes can reference it. Both limits + * are insanely high from the 2022 point of view. Let's suppose that when 17T + * routes or 1M writers get real, we get also 128bit atomic variables in the + * C norm. */ + + /* First, we push the in-progress indicator */ + u64 uc = atomic_fetch_add_explicit(&src->uc, RTE_SRC_IN_PROGRESS, memory_order_acq_rel); + + /* Then we split the indicator to its parts. Remember, we got the value before the operation happened. */ + u64 pending = (uc >> RTE_SRC_PU_SHIFT) + 1; + uc &= RTE_SRC_IN_PROGRESS - 1; + + /* We per-use the RCU critical section indicator to make the prune event wait + * until we finish here in the rare case we get preempted. */ + rcu_read_lock(); + + /* Obviously, there can't be more pending unlocks than the usecount itself */ + if (uc == pending) + /* If we're the last unlocker, schedule the owner's prune event */ + ev_send(src->owner->list, src->owner->prune); + else + ASSERT_DIE(uc > pending); + + /* And now, finally, simultaneously pop the in-progress indicator and the + * usecount, possibly allowing the source pruning routine to free this structure */ + atomic_fetch_sub_explicit(&src->uc, RTE_SRC_IN_PROGRESS + 1, memory_order_acq_rel); + + /* ... and to reduce the load a bit, the source pruning routine will better wait for + * RCU synchronization instead of a busy loop. */ + rcu_read_unlock(); +} + +#ifdef RT_SOURCE_DEBUG +#undef rt_lock_source +#undef rt_unlock_source + +#define rt_lock_source(x) ( log(L_INFO "Lock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_lock_source_internal(x) ) +#define rt_unlock_source(x) ( log(L_INFO "Unlock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_unlock_source_internal(x) ) +#endif + +void rt_init_sources(struct rte_owner *, const char *name, event_list *list); +void rt_destroy_sources(struct rte_owner *, event *); + +/* + * Route Attributes + * + * Beware: All standard BGP attributes must be represented here instead + * of making them local to the route. This is needed to ensure proper + * construction of BGP route attribute lists. + */ + +/* Nexthop structure */ +struct nexthop { + ip_addr gw; /* Next hop */ + struct iface *iface; /* Outgoing interface */ + byte flags; + byte weight; + byte labels; /* Number of all labels */ + u32 label[0]; +}; + +/* For packing one into eattrs */ +struct nexthop_adata { + struct adata ad; + /* There is either a set of nexthops or a special destination (RTD_*) */ + union { + struct nexthop nh; + uint dest; + }; +}; + +#define NEXTHOP_DEST_SIZE (OFFSETOF(struct nexthop_adata, dest) + sizeof(uint) - OFFSETOF(struct adata, data)) +#define NEXTHOP_DEST_LITERAL(x) ((struct nexthop_adata) { \ + .ad.length = NEXTHOP_DEST_SIZE, .dest = (x), }) + +#define RNF_ONLINK 0x1 /* Gateway is onlink regardless of IP ranges */ + + +#define RTS_STATIC 1 /* Normal static route */ +#define RTS_INHERIT 2 /* Route inherited from kernel */ +#define RTS_DEVICE 3 /* Device route */ +#define RTS_STATIC_DEVICE 4 /* Static device route */ +#define RTS_REDIRECT 5 /* Learned via redirect */ +#define RTS_RIP 6 /* RIP route */ +#define RTS_OSPF 7 /* OSPF route */ +#define RTS_OSPF_IA 8 /* OSPF inter-area route */ +#define RTS_OSPF_EXT1 9 /* OSPF external route type 1 */ +#define RTS_OSPF_EXT2 10 /* OSPF external route type 2 */ +#define RTS_BGP 11 /* BGP route */ +#define RTS_PIPE 12 /* Inter-table wormhole */ +#define RTS_BABEL 13 /* Babel route */ +#define RTS_RPKI 14 /* Route Origin Authorization */ +#define RTS_PERF 15 /* Perf checker */ +#define RTS_MAX 16 + +#define RTD_NONE 0 /* Undefined next hop */ +#define RTD_UNICAST 1 /* A standard next hop */ +#define RTD_BLACKHOLE 2 /* Silently drop packets */ +#define RTD_UNREACHABLE 3 /* Reject as unreachable */ +#define RTD_PROHIBIT 4 /* Administratively prohibited */ +#define RTD_MAX 5 + +extern const char * rta_dest_names[RTD_MAX]; + +static inline const char *rta_dest_name(uint n) +{ return (n < RTD_MAX) ? rta_dest_names[n] : "???"; } + + +/* + * Extended Route Attributes + */ + +typedef struct eattr { + word id; /* EA_CODE(PROTOCOL_..., protocol-dependent ID) */ + byte flags; /* Protocol-dependent flags */ + byte type; /* Attribute type */ + byte rfu:5; + byte originated:1; /* The attribute has originated locally */ + byte fresh:1; /* An uncached attribute (e.g. modified in export filter) */ + byte undef:1; /* Explicitly undefined */ + + PADDING(unused, 3, 3); + + union bval u; +} eattr; + + +#define EA_CODE_MASK 0xffff +#define EA_ALLOW_UNDEF 0x10000 /* ea_find: allow EAF_TYPE_UNDEF */ +#define EA_BIT(n) ((n) << 24) /* Used in bitfield accessors */ +#define EA_BIT_GET(ea) ((ea) >> 24) + +typedef struct ea_list { + struct ea_list *next; /* In case we have an override list */ + byte flags; /* Flags: EALF_... */ + byte rfu; + word count; /* Number of attributes */ + eattr attrs[0]; /* Attribute definitions themselves */ +} ea_list; + +struct ea_storage { + struct ea_storage *next_hash; /* Next in hash chain */ + struct ea_storage **pprev_hash; /* Previous in hash chain */ + _Atomic u32 uc; /* Use count */ + u32 hash_key; /* List hash */ + ea_list l[0]; /* The list itself */ +}; + +#define EALF_SORTED 1 /* Attributes are sorted by code */ +#define EALF_BISECT 2 /* Use interval bisection for searching */ +#define EALF_CACHED 4 /* List is cached */ +#define EALF_HUGE 8 /* List is too big to fit into slab */ + +struct ea_class { +#define EA_CLASS_INSIDE \ + const char *name; /* Name (both print and filter) */ \ + struct symbol *sym; /* Symbol to export to configs */ \ + uint id; /* Autoassigned attribute ID */ \ + uint uc; /* Reference count */ \ + btype type; /* Data type ID */ \ + uint readonly:1; /* This attribute can't be changed by filters */ \ + uint conf:1; /* Requested by config */ \ + uint hidden:1; /* Technical attribute, do not show, do not expose to filters */ \ + void (*format)(const eattr *ea, byte *buf, uint size); \ + void (*stored)(const eattr *ea); /* When stored into global hash */ \ + void (*freed)(const eattr *ea); /* When released from global hash */ \ + + EA_CLASS_INSIDE; +}; + +struct ea_class_ref { + resource r; + struct ea_class *class; +}; + +void ea_register_init(struct ea_class *); +struct ea_class_ref *ea_register_alloc(pool *, struct ea_class); + +#define EA_REGISTER_ALL_HELPER(x) ea_register_init(x); +#define EA_REGISTER_ALL(...) MACRO_FOREACH(EA_REGISTER_ALL_HELPER, __VA_ARGS__) + +struct ea_class *ea_class_find_by_id(uint id); +struct ea_class *ea_class_find_by_name(const char *name); +static inline struct ea_class *ea_class_self(struct ea_class *self) { return self; } +#define ea_class_find(_arg) _Generic((_arg), \ + uint: ea_class_find_by_id, \ + word: ea_class_find_by_id, \ + char *: ea_class_find_by_name, \ + const char *: ea_class_find_by_name, \ + struct ea_class *: ea_class_self)(_arg) + +struct ea_walk_state { + ea_list *eattrs; /* Ccurrent ea_list, initially set by caller */ + eattr *ea; /* Current eattr, initially NULL */ + u32 visited[4]; /* Bitfield, limiting max to 128 */ +}; + +#define ea_find(_l, _arg) _Generic((_arg), uint: ea_find_by_id, struct ea_class *: ea_find_by_class, char *: ea_find_by_name)(_l, _arg) +eattr *ea_find_by_id(ea_list *, unsigned ea); +static inline eattr *ea_find_by_class(ea_list *l, const struct ea_class *def) +{ return ea_find_by_id(l, def->id); } +static inline eattr *ea_find_by_name(ea_list *l, const char *name) +{ + const struct ea_class *def = ea_class_find_by_name(name); + return def ? ea_find_by_class(l, def) : NULL; +} + +#define ea_get_int(_l, _ident, _def) ({ \ + struct ea_class *cls = ea_class_find((_ident)); \ + ASSERT_DIE(cls->type & EAF_EMBEDDED); \ + const eattr *ea = ea_find((_l), cls->id); \ + (ea ? ea->u.data : (_def)); \ + }) + +#define ea_get_ip(_l, _ident, _def) ({ \ + struct ea_class *cls = ea_class_find((_ident)); \ + ASSERT_DIE(cls->type == T_IP); \ + const eattr *ea = ea_find((_l), cls->id); \ + (ea ? *((const ip_addr *) ea->u.ptr->data) : (_def)); \ + }) + +eattr *ea_walk(struct ea_walk_state *s, uint id, uint max); +void ea_dump(ea_list *); +int ea_same(ea_list *x, ea_list *y); /* Test whether two ea_lists are identical */ +uint ea_hash(ea_list *e); /* Calculate 16-bit hash value */ +ea_list *ea_append(ea_list *to, ea_list *what); +void ea_format_bitfield(const struct eattr *a, byte *buf, int bufsize, const char **names, int min, int max); + +/* Normalize ea_list; allocates the result from tmp_linpool */ +ea_list *ea_normalize(ea_list *e, int overlay); + +uint ea_list_size(ea_list *); +void ea_list_copy(ea_list *dest, ea_list *src, uint size); + +#define EA_LOCAL_LIST(N) struct { ea_list l; eattr a[N]; } + +#define EA_LITERAL_EMBEDDED(_class, _flags, _val) ({ \ + btype _type = (_class)->type; \ + ASSERT_DIE(_type & EAF_EMBEDDED); \ + EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.i = _val); \ + }) + +#define EA_LITERAL_STORE_ADATA(_class, _flags, _buf, _len) ({ \ + btype _type = (_class)->type; \ + ASSERT_DIE(!(_type & EAF_EMBEDDED)); \ + EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.ad = tmp_store_adata((_buf), (_len))); \ + }) + +#define EA_LITERAL_DIRECT_ADATA(_class, _flags, _adata) ({ \ + btype _type = (_class)->type; \ + ASSERT_DIE(!(_type & EAF_EMBEDDED)); \ + EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.ad = _adata); \ + }) + +#define EA_LITERAL_GENERIC(_id, _type, _flags, ...) \ + ((eattr) { .id = _id, .type = _type, .flags = _flags, __VA_ARGS__ }) + +static inline eattr * +ea_set_attr(ea_list **to, eattr a) +{ + EA_LOCAL_LIST(1) *ea = tmp_alloc(sizeof(*ea)); + *ea = (typeof(*ea)) { + .l.flags = EALF_SORTED, + .l.count = 1, + .l.next = *to, + .a[0] = a, + }; + + *to = &ea->l; + return &ea->a[0]; +} + +static inline void +ea_unset_attr(ea_list **to, _Bool local, const struct ea_class *def) +{ + ea_set_attr(to, EA_LITERAL_GENERIC(def->id, 0, 0, + .fresh = local, .originated = local, .undef = 1)); +} + +static inline void +ea_set_attr_u32(ea_list **to, const struct ea_class *def, uint flags, u64 data) +{ ea_set_attr(to, EA_LITERAL_EMBEDDED(def, flags, data)); } + +static inline void +ea_set_attr_data(ea_list **to, const struct ea_class *def, uint flags, const void *data, uint len) +{ ea_set_attr(to, EA_LITERAL_STORE_ADATA(def, flags, data, len)); } + +static inline void +ea_copy_attr(ea_list **to, ea_list *from, const struct ea_class *def) +{ + eattr *e = ea_find_by_class(from, def); + if (e) + if (e->type & EAF_EMBEDDED) + ea_set_attr_u32(to, def, e->flags, e->u.data); + else + ea_set_attr_data(to, def, e->flags, e->u.ptr->data, e->u.ptr->length); + else + ea_unset_attr(to, 0, def); +} + +/* + * Common route attributes + */ + +/* Preference: first-order comparison */ +extern struct ea_class ea_gen_preference; +static inline u32 rt_get_preference(const rte *rt) +{ return ea_get_int(rt->attrs, &ea_gen_preference, 0); } + +/* IGP metric: second-order comparison */ +extern struct ea_class ea_gen_igp_metric; +u32 rt_get_igp_metric(const rte *rt); +#define IGP_METRIC_UNKNOWN 0x80000000 /* Default igp_metric used when no other + protocol-specific metric is availabe */ + +/* From: Advertising router */ +extern struct ea_class ea_gen_from; + +/* Source: An old method to devise the route source protocol and kind. + * To be superseded in a near future by something more informative. */ +extern struct ea_class ea_gen_source; +static inline u32 rt_get_source_attr(const rte *rt) +{ return ea_get_int(rt->attrs, &ea_gen_source, 0); } + +/* Flowspec validation result */ +enum flowspec_valid { + FLOWSPEC_UNKNOWN = 0, + FLOWSPEC_VALID = 1, + FLOWSPEC_INVALID = 2, + FLOWSPEC__MAX, +}; + +extern const char * flowspec_valid_names[FLOWSPEC__MAX]; +static inline const char *flowspec_valid_name(enum flowspec_valid v) +{ return (v < FLOWSPEC__MAX) ? flowspec_valid_names[v] : "???"; } + +extern struct ea_class ea_gen_flowspec_valid; +static inline enum flowspec_valid rt_get_flowspec_valid(rte *rt) +{ return ea_get_int(rt->attrs, &ea_gen_flowspec_valid, FLOWSPEC_UNKNOWN); } + +/* Next hop: For now, stored as adata */ +extern struct ea_class ea_gen_nexthop; + +static inline void ea_set_dest(struct ea_list **to, uint flags, uint dest) +{ + struct nexthop_adata nhad = NEXTHOP_DEST_LITERAL(dest); + ea_set_attr_data(to, &ea_gen_nexthop, flags, &nhad.ad.data, nhad.ad.length); +} + +/* Next hop structures */ + +#define NEXTHOP_ALIGNMENT (_Alignof(struct nexthop)) +#define NEXTHOP_MAX_SIZE (sizeof(struct nexthop) + sizeof(u32)*MPLS_MAX_LABEL_STACK) +#define NEXTHOP_SIZE(_nh) NEXTHOP_SIZE_CNT(((_nh)->labels)) +#define NEXTHOP_SIZE_CNT(cnt) BIRD_ALIGN((sizeof(struct nexthop) + sizeof(u32) * (cnt)), NEXTHOP_ALIGNMENT) +#define nexthop_size(nh) NEXTHOP_SIZE((nh)) + +#define NEXTHOP_NEXT(_nh) ((void *) (_nh) + NEXTHOP_SIZE(_nh)) +#define NEXTHOP_END(_nhad) ((_nhad)->ad.data + (_nhad)->ad.length) +#define NEXTHOP_VALID(_nh, _nhad) ((void *) (_nh) < (void *) NEXTHOP_END(_nhad)) +#define NEXTHOP_ONE(_nhad) (NEXTHOP_NEXT(&(_nhad)->nh) == NEXTHOP_END(_nhad)) + +#define NEXTHOP_WALK(_iter, _nhad) for ( \ + struct nexthop *_iter = &(_nhad)->nh; \ + (void *) _iter < (void *) NEXTHOP_END(_nhad); \ + _iter = NEXTHOP_NEXT(_iter)) + + +static inline int nexthop_same(struct nexthop_adata *x, struct nexthop_adata *y) +{ return adata_same(&x->ad, &y->ad); } +struct nexthop_adata *nexthop_merge(struct nexthop_adata *x, struct nexthop_adata *y, int max, linpool *lp); +struct nexthop_adata *nexthop_sort(struct nexthop_adata *x, linpool *lp); +int nexthop_is_sorted(struct nexthop_adata *x); + +#define NEXTHOP_IS_REACHABLE(nhad) ((nhad)->ad.length > NEXTHOP_DEST_SIZE) + +/* Route has regular, reachable nexthop (i.e. not RTD_UNREACHABLE and like) */ +static inline int rte_is_reachable(rte *r) +{ + eattr *nhea = ea_find(r->attrs, &ea_gen_nexthop); + if (!nhea) + return 0; + + struct nexthop_adata *nhad = (void *) nhea->u.ptr; + return NEXTHOP_IS_REACHABLE(nhad); +} + +static inline int nhea_dest(eattr *nhea) +{ + if (!nhea) + return RTD_NONE; + + struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + if (NEXTHOP_IS_REACHABLE(nhad)) + return RTD_UNICAST; + else + return nhad->dest; +} + +static inline int rte_dest(const rte *r) +{ + return nhea_dest(ea_find(r->attrs, &ea_gen_nexthop)); +} + +void rta_init(void); +ea_list *ea_lookup(ea_list *, int overlay); /* Get a cached (and normalized) variant of this attribute list */ +static inline int ea_is_cached(const ea_list *r) { return r->flags & EALF_CACHED; } +static inline struct ea_storage *ea_get_storage(ea_list *r) +{ + ASSERT_DIE(ea_is_cached(r)); + return SKIP_BACK(struct ea_storage, l[0], r); +} + +static inline ea_list *ea_clone(ea_list *r) { + ASSERT_DIE(0 < atomic_fetch_add_explicit(&ea_get_storage(r)->uc, 1, memory_order_acq_rel)); + return r; +} +void ea__free(struct ea_storage *r); +static inline void ea_free(ea_list *l) { + if (!l) return; + struct ea_storage *r = ea_get_storage(l); + if (1 == atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel)) ea__free(r); +} + +void ea_dump(ea_list *); +void ea_dump_all(void); +void ea_show_list(struct cli *, ea_list *); + +#define rta_lookup ea_lookup +#define rta_is_cached ea_is_cached +#define rta_clone ea_clone +#define rta_free ea_free + +#endif diff --git a/lib/settle.h b/lib/settle.h new file mode 100644 index 00000000..e721f21f --- /dev/null +++ b/lib/settle.h @@ -0,0 +1,64 @@ +/* + * BIRD -- Settle timer + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * (c) 2022 CZ.NIC z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_SETTLE_H_ +#define _BIRD_SETTLE_H_ + +#include "lib/birdlib.h" +#include "lib/timer.h" + +struct settle_config { + btime min, max; +}; + +struct settle { + union { + /* Timer hook polymorphism. */ + struct { + resource _r; + void (*hook)(struct settle *); + }; + timer tm; + }; + struct settle_config cf; + btime started; +}; + +STATIC_ASSERT(OFFSETOF(struct settle, hook) == OFFSETOF(struct settle, tm) + OFFSETOF(timer, hook)); + +#define SETTLE_INIT(_cfp, _hook, _data) (struct settle) { .tm = { .data = (_data), .hook = TYPE_CAST(void (*)(struct settle *), void (*)(struct timer *), (_hook)), }, .cf = ({ASSERT_DIE((_cfp)->min <= (_cfp)->max); *(_cfp); }), } + + +static inline void settle_init(struct settle *s, struct settle_config *cf, void (*hook)(struct settle *), void *data) +{ + *s = SETTLE_INIT(cf, hook, data); +} + +#define settle_active(s) tm_active(&(s)->tm) + +static inline void settle_kick(struct settle *s, struct birdloop *loop) +{ + if (!tm_active(&s->tm)) + { + s->started = current_time(); + tm_set_in(&s->tm, s->started + s->cf.min, loop); + } + else + { + btime now = current_time(); + tm_set_in(&s->tm, MIN_(now + s->cf.min, s->started + s->cf.max), loop); + } +} + +static inline void settle_cancel(struct settle *s) +{ + tm_stop(&s->tm); +} + +#endif @@ -32,6 +32,7 @@ #include "nest/bird.h" #include "lib/resource.h" #include "lib/string.h" +#include "lib/tlists.h" #undef FAKE_SLAB /* Turn on if you want to debug memory allocations */ @@ -40,7 +41,7 @@ #endif static void slab_free(resource *r); -static void slab_dump(resource *r); +static void slab_dump(resource *r, unsigned indent); static resource *slab_lookup(resource *r, unsigned long addr); static struct resmem slab_memsize(resource *r); @@ -98,7 +99,7 @@ sl_allocz(slab *s) } void -sl_free(slab *s, void *oo) +sl_free(void *oo) { struct sl_obj *o = SKIP_BACK(struct sl_obj, data, oo); @@ -117,7 +118,7 @@ slab_free(resource *r) } static void -slab_dump(resource *r) +slab_dump(resource *r, unsigned indent UNUSED) { slab *s = (slab *) r; int cnt = 0; @@ -153,12 +154,38 @@ slab_memsize(resource *r) #define MAX_EMPTY_HEADS 1 +enum sl_head_state { + slh_empty = 2, + slh_partial = 0, + slh_full = 1, +} PACKED; + +struct sl_head { + struct slab *slab; + TLIST_NODE(sl_head, struct sl_head) n; + u16 num_full; + enum sl_head_state state; + u32 used_bits[0]; +}; + +struct sl_alignment { /* Magic structure for testing of alignment */ + byte data; + int x[0]; +}; + +#define TLIST_PREFIX sl_head +#define TLIST_TYPE struct sl_head +#define TLIST_ITEM n +#define TLIST_WANT_WALK +#define TLIST_WANT_ADD_HEAD + +#include "lib/tlists.h" + struct slab { resource r; - pool *p; uint obj_size, head_size, head_bitfield_len; uint objs_per_slab, num_empty_heads, data_size; - list empty_heads, partial_heads, full_heads; + struct sl_head_list empty_heads, partial_heads, full_heads; }; static struct resclass sl_class = { @@ -170,18 +197,15 @@ static struct resclass sl_class = { slab_memsize }; -struct sl_head { - node n; - u32 num_full; - u32 used_bits[0]; -}; +#define SL_GET_HEAD(x) PAGE_HEAD(x) -struct sl_alignment { /* Magic structure for testing of alignment */ - byte data; - int x[0]; -}; +#define SL_HEAD_CHANGE_STATE(_s, _h, _from, _to) ({ \ + ASSERT_DIE(_h->state == slh_##_from); \ + sl_head_rem_node(&_s->_from##_heads, _h); \ + sl_head_add_head(&_s->_to##_heads, _h); \ + _h->state = slh_##_to; \ + }) -#define SL_GET_HEAD(x) ((struct sl_head *) PAGE_HEAD(x)) /** * sl_new - create a new Slab @@ -195,10 +219,9 @@ slab * sl_new(pool *p, uint size) { slab *s = ralloc(p, &sl_class); - s->p = p; uint align = sizeof(struct sl_alignment); - if (align < sizeof(int)) - align = sizeof(int); + if (align < sizeof(void *)) + align = sizeof(void *); s->data_size = size; size = (size + align - 1) / align * align; s->obj_size = size; @@ -213,15 +236,12 @@ sl_new(pool *p, uint size) + sizeof(u32) * s->head_bitfield_len + align - 1) / align * align; - } while (s->objs_per_slab * size + s->head_size > (uint) page_size); + } while (s->objs_per_slab * size + s->head_size > (size_t) page_size); if (!s->objs_per_slab) bug("Slab: object too large"); s->num_empty_heads = 0; - init_list(&s->empty_heads); - init_list(&s->partial_heads); - init_list(&s->full_heads); return s; } @@ -238,8 +258,7 @@ sl_alloc(slab *s) struct sl_head *h; redo: - h = HEAD(s->partial_heads); - if (!h->n.next) + if (!(h = s->partial_heads.first)) goto no_partial; okay: for (uint i=0; i<s->head_bitfield_len; i++) @@ -259,26 +278,27 @@ okay: return out; } - rem_node(&h->n); - add_tail(&s->full_heads, &h->n); + SL_HEAD_CHANGE_STATE(s, h, partial, full); goto redo; no_partial: - h = HEAD(s->empty_heads); - if (h->n.next) + if (h = s->empty_heads.first) { - rem_node(&h->n); - add_head(&s->partial_heads, &h->n); + SL_HEAD_CHANGE_STATE(s, h, empty, partial); s->num_empty_heads--; goto okay; } + h = alloc_page(); + ASSERT_DIE(SL_GET_HEAD(h) == h); + #ifdef POISON memset(h, 0xba, page_size); #endif - ASSERT_DIE(SL_GET_HEAD(h) == h); + memset(h, 0, s->head_size); - add_head(&s->partial_heads, &h->n); + h->slab = s; + sl_head_add_head(&s->partial_heads, h); goto okay; } @@ -307,9 +327,10 @@ sl_allocz(slab *s) * and returns it back to the Slab @s. */ void -sl_free(slab *s, void *oo) +sl_free(void *oo) { struct sl_head *h = SL_GET_HEAD(oo); + struct slab *s = h->slab; #ifdef POISON memset(oo, 0xdb, s->data_size); @@ -322,14 +343,11 @@ sl_free(slab *s, void *oo) h->used_bits[pos / 32] &= ~(1 << (pos % 32)); - if (h->num_full-- == s->objs_per_slab) - { - rem_node(&h->n); - add_head(&s->partial_heads, &h->n); - } + if ((h->num_full-- == s->objs_per_slab) && (h->state == slh_full)) + SL_HEAD_CHANGE_STATE(s, h, full, partial); else if (!h->num_full) { - rem_node(&h->n); + sl_head_rem_node(&s->partial_heads, h); if (s->num_empty_heads >= MAX_EMPTY_HEADS) { #ifdef POISON @@ -339,7 +357,8 @@ sl_free(slab *s, void *oo) } else { - add_head(&s->empty_heads, &h->n); + sl_head_add_head(&s->empty_heads, h); + h->state = slh_empty; s->num_empty_heads++; } } @@ -349,30 +368,37 @@ static void slab_free(resource *r) { slab *s = (slab *) r; - struct sl_head *h, *g; - WALK_LIST_DELSAFE(h, g, s->empty_heads) + WALK_TLIST_DELSAFE(sl_head, h, &s->empty_heads) free_page(h); - WALK_LIST_DELSAFE(h, g, s->partial_heads) + WALK_TLIST_DELSAFE(sl_head, h, &s->partial_heads) free_page(h); - WALK_LIST_DELSAFE(h, g, s->full_heads) + WALK_TLIST_DELSAFE(sl_head, h, &s->full_heads) free_page(h); } static void -slab_dump(resource *r) +slab_dump(resource *r, unsigned indent UNUSED) { slab *s = (slab *) r; int ec=0, pc=0, fc=0; - struct sl_head *h; - WALK_LIST(h, s->empty_heads) + WALK_TLIST(sl_head, h, &s->empty_heads) ec++; - WALK_LIST(h, s->partial_heads) + WALK_TLIST(sl_head, h, &s->partial_heads) pc++; - WALK_LIST(h, s->full_heads) + WALK_TLIST(sl_head, h, &s->full_heads) fc++; debug("(%de+%dp+%df blocks per %d objs per %d bytes)\n", ec, pc, fc, s->objs_per_slab, s->obj_size); + + char x[16]; + bsprintf(x, "%%%ds%%s %%p\n", indent + 2); + WALK_TLIST(sl_head, h, &s->full_heads) + debug(x, "", "full", h); + WALK_TLIST(sl_head, h, &s->partial_heads) + debug(x, "", "partial", h); + WALK_TLIST(sl_head, h, &s->empty_heads) + debug(x, "", "empty", h); } static struct resmem @@ -380,23 +406,22 @@ slab_memsize(resource *r) { slab *s = (slab *) r; size_t heads = 0; - struct sl_head *h; - WALK_LIST(h, s->full_heads) + WALK_TLIST(sl_head, h, &s->full_heads) heads++; size_t items = heads * s->objs_per_slab; - WALK_LIST(h, s->partial_heads) + WALK_TLIST(sl_head, h, &s->partial_heads) { heads++; items += h->num_full; } - WALK_LIST(h, s->empty_heads) + WALK_TLIST(sl_head, h, &s->empty_heads) heads++; - size_t eff = items * s->obj_size; + size_t eff = items * s->data_size; return (struct resmem) { .effective = eff, @@ -408,12 +433,11 @@ static resource * slab_lookup(resource *r, unsigned long a) { slab *s = (slab *) r; - struct sl_head *h; - WALK_LIST(h, s->partial_heads) + WALK_TLIST(sl_head, h, &s->partial_heads) if ((unsigned long) h < a && (unsigned long) h + page_size < a) return r; - WALK_LIST(h, s->full_heads) + WALK_TLIST(sl_head, h, &s->full_heads) if ((unsigned long) h < a && (unsigned long) h + page_size < a) return r; return NULL; diff --git a/lib/slab_test.c b/lib/slab_test.c new file mode 100644 index 00000000..803d0215 --- /dev/null +++ b/lib/slab_test.c @@ -0,0 +1,171 @@ +/* + * BIRD Library -- Slab Alloc / Dealloc Tests + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include "test/birdtest.h" +#include "lib/resource.h" +#include "lib/bitops.h" + +static const int sizes[] = { + 8, 12, 18, 27, 41, 75, 131, 269, +}; + +#define TEST_SIZE 1024 * 128 +#define ITEMS(sz) TEST_SIZE / ( (sz) >> u32_log2((sz))/2 ) + +struct test_request { + int size; + enum strategy { + TEST_NONE, + TEST_FORWARDS, + TEST_BACKWARDS, + TEST_RANDOM, + TEST_MIXED, + TEST__MAX, + } strategy; +}; + +const char * const strategy_name[TEST__MAX] = { + [TEST_FORWARDS] = "forwards", + [TEST_BACKWARDS] = "backwards", + [TEST_RANDOM] = "random", + [TEST_MIXED] = "mixed", +}; + +static inline byte *test_alloc(slab *s, int sz, struct resmem *sliz) +{ + byte *out = sl_alloc(s); + + for (int p=0; p < sz; p++) + out[p] = p & 0xff; + + struct resmem ns = rmemsize((resource *) s); + + bt_assert(sliz->effective + sz == ns.effective); + bt_assert((sliz->overhead - sz - ns.overhead) % page_size == 0); + + *sliz = ns; + + return out; +} + +static inline void test_free(slab *s, byte *block, int sz, struct resmem *sliz) +{ + for (int p=0; p < sz; p++) + { + bt_assert(block[p] == (p & 0xff)); + block[p]++; + } + + sl_free(block); + + struct resmem ns = rmemsize((resource *) s); + + bt_assert(sliz->effective - sz == ns.effective); + bt_assert((sliz->overhead + sz - ns.overhead) % page_size == 0); + + *sliz = ns; +} + +static inline struct resmem get_memsize(slab *s) +{ + struct resmem sz = rmemsize((resource *) s); + bt_assert(sz.effective == 0); + return sz; +} + +static int +t_slab(const void *data) +{ + const struct test_request *tr = data; + int sz = tr->size; + + slab *s = sl_new(&root_pool, sz); + struct resmem sliz = get_memsize(s); + + int n = ITEMS(sz); + byte **block = mb_alloc(&root_pool, n * sizeof(*block)); + + switch (tr->strategy) { + case TEST_FORWARDS: + for (int i = 0; i < n; i++) + block[i] = test_alloc(s, sz, &sliz); + + for (int i = 0; i < n; i++) + test_free(s, block[i], sz, &sliz); + + break; + + case TEST_BACKWARDS: + for (int i = 0; i < n; i++) + block[i] = test_alloc(s, sz, &sliz); + + for (int i = n - 1; i >= 0; i--) + test_free(s, block[i], sz, &sliz); + + break; + + case TEST_RANDOM: + for (int i = 0; i < n; i++) + block[i] = test_alloc(s, sz, &sliz); + + for (int i = 0; i < n; i++) + { + int pos = bt_random() % (n - i); + test_free(s, block[pos], sz, &sliz); + if (pos != n - i - 1) + block[pos] = block[n - i - 1]; + } + + break; + + case TEST_MIXED: + { + int cur = 0; + int pending = n; + + while (cur + pending > 0) { + int action = bt_random() % (cur + pending); + + if (action < cur) { + test_free(s, block[action], sz, &sliz); + if (action != --cur) + block[action] = block[cur]; + } else { + block[cur++] = test_alloc(s, sz, &sliz); + pending--; + } + } + + break; + } + + default: bug("This shouldn't happen"); + } + + mb_free(block); + return 1; +} +int main(int argc, char *argv[]) +{ + bt_init(argc, argv); + + struct test_request tr; + + for (uint i = 0; i < sizeof(sizes) / sizeof(*sizes); i++) + for (uint strategy = TEST_FORWARDS; strategy < TEST__MAX; strategy++) + { + tr = (struct test_request) { + .size = sizes[i], + .strategy = strategy, + }; + bt_test_suite_arg(t_slab, &tr, "Slab allocator test, size=%d, strategy=%s", + tr.size, strategy_name[strategy]); + } + + return bt_exit_value(); +} diff --git a/lib/socket.h b/lib/socket.h index ce4382e9..4b169581 100644 --- a/lib/socket.h +++ b/lib/socket.h @@ -40,7 +40,6 @@ struct ssh_sock { typedef struct birdsock { resource r; pool *pool; /* Pool where incoming connections should be allocated (for SK_xxx_PASSIVE) */ - struct birdloop *loop; /* The birdloop where this socket belongs to */ int type; /* Socket type */ int subtype; /* Socket subtype */ void *data; /* User data */ @@ -58,8 +57,6 @@ typedef struct birdsock { uint fast_rx; /* RX has higher priority in event loop */ uint rbsize; int (*rx_hook)(struct birdsock *, uint size); /* NULL=receiving turned off, returns 1 to clear rx buffer */ - struct event_cork *cork; /* Cork to temporarily stop receiving data */ - node cork_node; /* Node in cork list */ byte *tbuf, *tpos; /* NULL=allocate automatically */ byte *ttx; /* Internal */ @@ -83,17 +80,22 @@ typedef struct birdsock { const char *password; /* Password for MD5 authentication */ const char *err; /* Error message */ struct ssh_sock *ssh; /* Used in SK_SSH */ - struct event reloop; /* Reloop event */ + struct birdloop *loop; /* BIRDLoop owning this socket */ } sock; sock *sock_new(pool *); /* Allocate new socket */ #define sk_new(X) sock_new(X) /* Wrapper to avoid name collision with OpenSSL */ -int sk_open(sock *); /* Open socket */ +int sk_open(sock *, struct birdloop *); /* Open socket */ +void sk_reloop(sock *, struct birdloop *); /* Move socket to another loop. Both loops must be locked. */ + int sk_rx_ready(sock *s); +_Bool sk_tx_pending(sock *s); int sk_send(sock *, uint len); /* Send data, <0=err, >0=ok, 0=sleep */ int sk_send_to(sock *, uint len, ip_addr to, uint port); /* sk_send to given destination */ void sk_reallocate(sock *); /* Free and allocate tbuf & rbuf */ +void sk_pause_rx(struct birdloop *loop, sock *s); +void sk_resume_rx(struct birdloop *loop, sock *s, int (*hook)(sock *, uint)); void sk_set_rbsize(sock *s, uint val); /* Resize RX buffer */ void sk_set_tbsize(sock *s, uint val); /* Resize TX buffer, keeping content */ void sk_set_tbuf(sock *s, void *tbuf); /* Switch TX buffer, NULL-> return to internal */ @@ -117,6 +119,7 @@ int sk_set_icmp6_filter(sock *s, int p1, int p2); void sk_log_error(sock *s, const char *p); byte * sk_rx_buffer(sock *s, int *len); /* Temporary */ +sock *sk_next(sock *s); extern int sk_priority_control; /* Suggested priority for control traffic, should be sysdep define */ @@ -130,11 +133,9 @@ extern int sk_priority_control; /* Suggested priority for control traffic, shou #define SKF_HIGH_PORT 0x20 /* Choose port from high range if possible */ #define SKF_FREEBIND 0x40 /* Allow socket to bind to a nonlocal address */ -#define SKF_THREAD 0x100 /* Socked used in thread, Do not add to main loop */ #define SKF_TRUNCATED 0x200 /* Received packet was truncated, set by IO layer */ #define SKF_HDRINCL 0x400 /* Used internally */ #define SKF_PKTINFO 0x800 /* Used internally */ -#define SKF_PASSIVE_THREAD 0x1000 /* Child sockets used in thread, do not add to main loop */ /* * Socket types SA SP DA DP IF TTL SendTo (?=may, -=must not, *=must) diff --git a/lib/string.h b/lib/string.h index 976b1c24..2829943d 100644 --- a/lib/string.h +++ b/lib/string.h @@ -20,6 +20,11 @@ int bvsprintf(char *str, const char *fmt, va_list args); int bsnprintf(char *str, int size, const char *fmt, ...); int bvsnprintf(char *str, int size, const char *fmt, va_list args); +char *mb_sprintf(pool *p, const char *fmt, ...); +char *mb_vsprintf(pool *p, const char *fmt, va_list args); +char *lp_sprintf(linpool *p, const char *fmt, ...); +char *lp_vsprintf(linpool *p, const char *fmt, va_list args); + int buffer_vprint(buffer *buf, const char *fmt, va_list args); int buffer_print(buffer *buf, const char *fmt, ...); void buffer_puts(buffer *buf, const char *str); diff --git a/lib/timer.c b/lib/timer.c index eb7ea690..4e4aa55e 100644 --- a/lib/timer.c +++ b/lib/timer.c @@ -32,7 +32,6 @@ #include "nest/bird.h" -#include "lib/coro.h" #include "lib/heap.h" #include "lib/resource.h" #include "lib/timer.h" @@ -59,7 +58,7 @@ tm_free(resource *r) } static void -tm_dump(resource *r) +tm_dump(resource *r, unsigned indent UNUSED) { timer *t = (void *) r; @@ -117,7 +116,7 @@ tm_set_in_tl(timer *t, btime when, struct timeloop *local_timeloop) t->loop = local_timeloop; - if ((t->index == 1) && (local_timeloop->coro != this_coro)) + if (t->index == 1) birdloop_ping(local_timeloop->loop); } @@ -193,6 +192,7 @@ timers_fire(struct timeloop *loop, int io_log) io_log_event(t->hook, t->data); t->hook(t); + tmp_flush(); } } diff --git a/lib/timer.h b/lib/timer.h index 161e39d3..4a3a2108 100644 --- a/lib/timer.h +++ b/lib/timer.h @@ -41,7 +41,6 @@ struct timeloop BUFFER_(timer *) timers; struct domain_generic *domain; struct birdloop *loop; - struct coroutine *coro; }; #define TLOCK_TIMER_ASSERT(loop) ASSERT_DIE((loop)->domain && DG_IS_LOCKED((loop)->domain)) @@ -55,7 +54,6 @@ static inline timer *timers_first(struct timeloop *loop) #define current_time() atomic_load_explicit(&last_time, memory_order_acquire) #define current_real_time() atomic_load_explicit(&real_time, memory_order_acquire) -#define current_time_update() ({ times_update(); current_time(); }) //#define now (current_time() TO_S) //#define now_real (current_real_time() TO_S) @@ -101,12 +99,14 @@ tm_set_max(timer *t, btime when) } static inline void -tm_start_max(timer *t, btime after) +tm_start_max_in(timer *t, btime after, struct birdloop *loop) { btime rem = tm_remains(t); - tm_start(t, MAX_(rem, after)); + tm_start_in(t, MAX_(rem, after), loop); } +#define tm_start_max(t, after) tm_start_max_in(t, after, &main_birdloop) + /* In sysdep code */ void times_update(void); diff --git a/lib/tlists.h b/lib/tlists.h new file mode 100644 index 00000000..1437e17e --- /dev/null +++ b/lib/tlists.h @@ -0,0 +1,177 @@ +/* + * BIRD Library -- Typed Linked Lists + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + * + * + * This implementation of linked lists forces its members to be + * typed. On the other hand, it needs to be implemented as ugly macros to + * keep the needed genericity. + * + * Usage: + * 1. Include this file + * 2. Define the node structure + * 3. For every list type you need to define: + * A. #define TLIST_PREFIX and other macros + * B. Include this file once again + * + * Macros to define: + * TLIST_PREFIX: prefix to prepend to everything generated + * TLIST_TYPE: the actual node type + * TLIST_ITEM: where the tlist structure is + * TLIST_WANT_WALK: if defined, generates a helper functions for list walking macros + * TLIST_WANT_ADD_HEAD: if defined, TLIST_PREFIX_add_head() is generated to + * add an item to the beginning of the list + * TLIST_WANT_ADD_TAIL: if defined, TLIST_PREFIX_add_tail() is generated to + * add an item to the end of the list + * + * TLIST_PREFIX_rem_node() is generated always. + * + * All these macros are #undef-ed by including this file. + * + * Example: + * + * #include "lib/tlists.h" + * + * struct foo { + * ... + * TLIST_NODE(bar, struct foo) baz; + * ... + * }; + * + * #define TLIST_PREFIX bar + * #define TLIST_TYPE struct foo + * #define TLIST_ITEM baz + * + * #define TLIST_WANT_WALK + * #define TLIST_WANT_ADD_HEAD + * + * #include "lib/tlists.h" + * + * ... + * (end of example) + * + */ + +#ifdef _BIRD_LIB_TLISTS_H_ +# ifdef TLIST_PREFIX + +/* Check for mandatory arguments */ +#ifndef TLIST_TYPE +#error "TLIST_TYPE must be defined" +#endif +#ifndef TLIST_ITEM +#error "TLIST_ITEM must be defined" +#endif +#ifndef TLIST_PREFIX +#error "TLIST_PREFIX must be defined" +#endif + +#define TLIST_NAME(x) MACRO_CONCAT_AFTER(TLIST_PREFIX,_##x) +#ifndef TLIST_LIST_STRUCT +#define TLIST_LIST_STRUCT TLIST_NAME(list) +#endif + +typedef struct TLIST_LIST_STRUCT { + TLIST_TYPE *first; + TLIST_TYPE *last; +} TLIST_LIST_STRUCT; + +#ifdef TLIST_WANT_WALK +static inline struct TLIST_NAME(node) * TLIST_NAME(node_get)(TLIST_TYPE *node) +{ return &(node->TLIST_ITEM); } +#endif + +#ifdef TLIST_WANT_ADD_HEAD +static inline void TLIST_NAME(add_head)(TLIST_LIST_STRUCT *list, TLIST_TYPE *node) +{ + ASSERT_DIE(!node->TLIST_ITEM.prev && !node->TLIST_ITEM.next); + if (node->TLIST_ITEM.next = list->first) + list->first->TLIST_ITEM.prev = node; + else + list->last = node; + list->first = node; +} +#endif + +#ifdef TLIST_WANT_ADD_TAIL +static inline void TLIST_NAME(add_tail)(TLIST_LIST_STRUCT *list, TLIST_TYPE *node) +{ + ASSERT_DIE(!node->TLIST_ITEM.prev && !node->TLIST_ITEM.next); + if (node->TLIST_ITEM.prev = list->last) + list->last->TLIST_ITEM.next = node; + else + list->first = node; + list->last = node; +} +#endif + +static inline void TLIST_NAME(rem_node)(TLIST_LIST_STRUCT *list, TLIST_TYPE *node) +{ + if (node->TLIST_ITEM.prev) + node->TLIST_ITEM.prev->TLIST_ITEM.next = node->TLIST_ITEM.next; + else + { + ASSERT_DIE(list->first == node); + list->first = node->TLIST_ITEM.next; + } + + if (node->TLIST_ITEM.next) + node->TLIST_ITEM.next->TLIST_ITEM.prev = node->TLIST_ITEM.prev; + else + { + ASSERT_DIE(list->last == node); + list->last = node->TLIST_ITEM.prev; + } + + node->TLIST_ITEM.next = node->TLIST_ITEM.prev = NULL; +} + +#undef TLIST_PREFIX +#undef TLIST_NAME +#undef TLIST_LIST_STRUCT +#undef TLIST_TYPE +#undef TLIST_ITEM +#undef TLIST_WANT_ADD_HEAD +#undef TLIST_WANT_ADD_TAIL + +# endif +#else +#define _BIRD_LIB_TLISTS_H_ + +#include "lib/macro.h" + +#if defined(TLIST_NAME) || defined(TLIST_PREFIX) +#error "You should first include lib/tlists.h without requesting a TLIST" +#endif + +#define TLIST_NODE_CONTENTS(_type) { _type *next; _type *prev; } +#define TLIST_NODE(_name, _type) struct _name##_node TLIST_NODE_CONTENTS(_type) +#define TLIST_DEFAULT_NODE struct MACRO_CONCAT_AFTER(TLIST_PREFIX,_node) \ + TLIST_NODE_CONTENTS(TLIST_TYPE) TLIST_ITEM + +#define TLIST_LIST(_name) struct _name##_list + + +/* Use ->first and ->last to access HEAD and TAIL */ +#define THEAD(_name, _list) (_list)->first +#define TTAIL(_name, _list) (_list)->last + +/* Walkaround macros: simple and resilient to node removal */ +#define WALK_TLIST(_name, _node, _list) \ + for (typeof((_list)->first) _node = (_list)->first; \ + _node; _node = _name##_node_get((_node))->next) + +#define WALK_TLIST_DELSAFE(_name, _node, _list) \ + for (typeof((_list)->first) _node = (_list)->first, \ + _helper = _node ? _name##_node_get((_list)->first)->next : NULL; \ + _node; \ + (_node = _helper) ? (_helper = _name##_node_get(_helper)->next) : 0) + +/* Empty check */ +#define EMPTY_TLIST(_name, _list) (!(_list)->first) + +#endif + diff --git a/lib/type.h b/lib/type.h new file mode 100644 index 00000000..b54744c1 --- /dev/null +++ b/lib/type.h @@ -0,0 +1,112 @@ +/* + * BIRD Internet Routing Daemon -- Internal Data Types + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_TYPE_H_ +#define _BIRD_TYPE_H_ + +#include "lib/birdlib.h" +#include "lib/attrs.h" + +union bval { +#define BVAL_ITEMS \ + struct { \ + u32 data; /* Integer type inherited from eattrs */ \ + PADDING(data, 0, 4); /* Must be padded on 64-bits */ \ + }; \ + struct { \ + u32 i; /* Integer type inherited from filters */ \ + PADDING(i, 0, 4); /* Must be padded on 64-bits */ \ + }; \ + const struct adata *ptr; /* Generic attribute data inherited from eattrs */ \ + const struct adata *ad; /* Generic attribute data inherited from filters */ \ + + BVAL_ITEMS; +}; + +union bval_long { + union bval bval; /* For direct assignments */ + BVAL_ITEMS; /* For item-wise access */ + + u64 ec; + lcomm lc; + ip_addr ip; + const net_addr *net; + const char *s; + const struct f_tree *t; + const struct f_trie *ti; + const struct f_path_mask *path_mask; + struct f_path_mask_item pmi; +}; + + +/* Internal types */ +enum btype { +/* Nothing. Simply nothing. */ + T_VOID = 0, + +/* Something but inaccessible. */ + T_OPAQUE = 0x02, /* Opaque byte string (not filterable) */ + T_IFACE = 0x0c, /* Pointer to an interface (inside adata) */ + T_NEXTHOP_LIST = 0x2c, /* The whole nexthop block */ + T_HOSTENTRY = 0x2e, /* Hostentry with possible MPLS labels */ + +/* Types shared with eattrs */ + T_INT = 0x01, /* 32-bit unsigned integer number */ + T_IP = 0x04, /* IP address */ + T_QUAD = 0x05, /* Router ID (IPv4 address) */ + T_PATH = 0x06, /* BGP AS path (encoding per RFC 1771:4.3) */ + T_CLIST = 0x0a, /* Set of u32's (e.g., a community list) */ + T_ECLIST = 0x0e, /* Set of pairs of u32's - ext. community list */ + T_LCLIST = 0x08, /* Set of triplets of u32's - large community list */ + + T_ENUM_BGP_ORIGIN = 0x11, /* BGP Origin enum */ + T_ENUM_RA_PREFERENCE = 0x13, /* RA Preference enum */ + T_ENUM_FLOWSPEC_VALID = 0x15, /* Flowspec validation result */ + +#define EAF_TYPE__MAX 0x1f +#define EAF_EMBEDDED 0x01 /* Data stored in eattr.u.data (part of type spec) */ + /* Otherwise, attribute data is adata */ + +/* Other user visible types which fit in int */ + T_BOOL = 0xa0, + T_PAIR = 0xa4, /* Notice that pair is stored as integer: first << 16 | second */ + +/* Put enumerational types in 0x20..0x3f range */ + T_ENUM_LO = 0x10, + T_ENUM_HI = 0x3f, + + T_ENUM_RTS = 0x31, + T_ENUM_SCOPE = 0x33, + T_ENUM_RTD = 0x37, + T_ENUM_ROA = 0x39, + T_ENUM_NETTYPE = 0x3b, + T_ENUM_AF = 0x3d, + +/* new enums go here */ + +#define T_ENUM T_ENUM_LO ... T_ENUM_HI + +/* Bigger ones */ + T_NET = 0xb0, + T_STRING = 0xb4, + T_PATH_MASK = 0xb8, /* mask for BGP path */ + T_EC = 0xbc, /* Extended community value, u64 */ + T_LC = 0xc0, /* Large community value, lcomm */ + T_RD = 0xc4, /* Route distinguisher for VPN addresses */ + T_PATH_MASK_ITEM = 0xc8, /* Path mask item for path mask constructors */ + + T_SET = 0x80, + T_PREFIX_SET = 0x84, +} PACKED; + +typedef enum btype btype; + +STATIC_ASSERT(sizeof(btype) == sizeof(byte)); + + +#endif diff --git a/lib/type_test.c b/lib/type_test.c new file mode 100644 index 00000000..b526db69 --- /dev/null +++ b/lib/type_test.c @@ -0,0 +1,79 @@ +/* + * BIRD Library -- Data Type Alignment Tests + * + * (c) 2022 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include "test/birdtest.h" +#include "lib/type.h" +#include "lib/route.h" + +#define CHECK_ONE(val) \ + for (uint i=0; i<sizeof(val); i++) \ + bt_assert(((const u8 *) &val)[i] == (u8) ~0); + +#define SET_PADDING(val, name) \ + for (uint i=0; i<sizeof(val.PADDING_NAME(name)); i++) \ + val.PADDING_NAME(name)[i] = ~0; + + +static int +t_bval(void) +{ + union bval v; + + memset(&v, 0, sizeof(v)); + v.data = ~0; + SET_PADDING(v, data); + CHECK_ONE(v); + + memset(&v, 0, sizeof(v)); + v.i = ~0; + SET_PADDING(v, i); + CHECK_ONE(v); + + memset(&v, 0, sizeof(v)); + v.ptr = (void *) ~0; + CHECK_ONE(v); + + memset(&v, 0, sizeof(v)); + v.ad = (void *) ~0; + CHECK_ONE(v); + + return 1; +} + +static int +t_eattr(void) +{ + struct eattr e; + memset(&e, 0, sizeof(e)); + + e.id = ~0; + e.flags = ~0; + e.type = ~0; + e.rfu = ~0; + e.originated = ~0; + e.fresh = ~0; + e.undef = ~0; + memset(&e.u, ~0, sizeof(e.u)); /* Assumes t_bval passed */ + + SET_PADDING(e, unused); + + CHECK_ONE(e); + + return 1; +} + + +int main(int argc, char *argv[]) +{ + bt_init(argc, argv); + + bt_test_suite(t_bval, "Structure alignment test: bval"); + bt_test_suite(t_eattr, "Structure alignment test: eattr"); + + return bt_exit_value(); +} @@ -60,10 +60,12 @@ main(int argc, char **argv) { uint i, e; if (scanf("%x/%d", &i, &e) != 2) + { if (feof(stdin)) break; - else - fprintf(stderr, "BUGGG\n"); + else + fprintf(stderr, "BUGGG\n"); + } // i >>= (32-e); // i |= (i >> e); cnt++; diff --git a/nest/Makefile b/nest/Makefile index 884d3950..1a71c2fb 100644 --- a/nest/Makefile +++ b/nest/Makefile @@ -1,8 +1,18 @@ -src := a-path.c a-set.c cli.c cmds.c iface.c locks.c neighbor.c password.c proto.c rt-attr.c rt-dev.c rt-fib.c rt-show.c rt-table.c +src := cli.c cmds.c iface.c locks.c neighbor.c password.c proto.c proto-build.c rt-attr.c rt-dev.c rt-fib.c rt-show.c rt-table.c obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_src := a-set_test.c a-path_test.c +$(o)proto-build.c: Makefile $(lastword $(MAKEFILE_LIST)) $(objdir)/.dir-stamp + $(E)echo GEN $@ + $(Q)echo "#include \"lib/birdlib.h\"" > $@ + $(Q)$(patsubst %,echo 'void %_build(void);' >> $@;,$(PROTO_BUILD)) + $(Q)echo "void protos_build_gen(void) {" >> $@ + $(Q)$(patsubst %,echo ' %_build();'>>$@;,$(PROTO_BUILD)) + $(Q)echo "}" >> $@ + +prepare: $(o)proto-build.c + +tests_src := tests_targets := $(tests_targets) $(tests-target-files) tests_objs := $(tests_objs) $(src-o-files) @@ -23,11 +23,6 @@ struct bfd_options { u8 mode; }; -struct bfd_session_state { - u8 state; - u8 diag; -}; - struct bfd_request { resource r; node n; @@ -40,12 +35,13 @@ struct bfd_request { void (*hook)(struct bfd_request *); void *data; - event event; + struct birdloop *target; struct bfd_session *session; - struct bfd_session_state old_state; + u8 state; u8 diag; + u8 old_state; u8 down; }; @@ -56,19 +52,20 @@ struct bfd_request { #define BFD_STATE_INIT 2 #define BFD_STATE_UP 3 + static inline struct bfd_options * bfd_new_options(void) { return cfg_allocz(sizeof(struct bfd_options)); } #ifdef CONFIG_BFD -struct bfd_request * bfd_request_session(pool *p, ip_addr addr, ip_addr local, struct iface *iface, struct iface *vrf, void (*hook)(struct bfd_request *), void *data, struct event_list *list, const struct bfd_options *opts); +struct bfd_request * bfd_request_session(pool *p, ip_addr addr, ip_addr local, struct iface *iface, struct iface *vrf, void (*hook)(struct bfd_request *), void *data, struct birdloop *target, const struct bfd_options *opts); void bfd_update_request(struct bfd_request *req, const struct bfd_options *opts); static inline void cf_check_bfd(int use UNUSED) { } #else -static inline struct bfd_request * bfd_request_session(pool *p UNUSED, ip_addr addr UNUSED, ip_addr local UNUSED, struct iface *iface UNUSED, struct iface *vrf UNUSED, void (*hook)(struct bfd_request *) UNUSED, void *data UNUSED, const struct bfd_options *opts UNUSED) { return NULL; } +static inline struct bfd_request * bfd_request_session(pool *p UNUSED, ip_addr addr UNUSED, ip_addr local UNUSED, struct iface *iface UNUSED, struct iface *vrf UNUSED, void (*hook)(struct bfd_request *) UNUSED, void *data UNUSED, struct birdloop *target UNUSED, const struct bfd_options *opts UNUSED) { return NULL; } static inline void bfd_update_request(struct bfd_request *req UNUSED, const struct bfd_options *opts UNUSED) { }; static inline void cf_check_bfd(int use) { if (use) cf_error("BFD not available"); } diff --git a/nest/bird.h b/nest/bird.h index 55712abe..931974a0 100644 --- a/nest/bird.h +++ b/nest/bird.h @@ -9,7 +9,6 @@ #ifndef _BIRD_BIRD_H_ #define _BIRD_BIRD_H_ -#include "sysdep/config.h" #include "lib/birdlib.h" #include "lib/ip.h" #include "lib/net.h" @@ -262,7 +262,7 @@ cli_command(struct cli *c) log(L_TRACE "CLI: %s", c->rx_buf); bzero(&f, sizeof(f)); f.mem = c->parser_pool; - f.pool = rp_new(c->pool, &main_birdloop, "Config"); + f.pool = rp_new(c->pool, "Config"); init_list(&f.symbols); cf_read_hook = cli_cmd_read_hook; cli_rh_pos = c->rx_buf; @@ -302,24 +302,24 @@ cli_event(void *data) cli_command(c); } - cli_write_trigger(c); + if (c->tx_pos) + cli_write_trigger(c); } cli * -cli_new(void *priv) +cli_new(struct birdsock *sock) { - pool *p = rp_new(cli_pool, &main_birdloop, "CLI"); + pool *p = rp_new(cli_pool, "CLI"); cli *c = mb_alloc(p, sizeof(cli)); bzero(c, sizeof(cli)); c->pool = p; - c->priv = priv; + c->sock = sock; c->event = ev_new(p); c->event->hook = cli_event; c->event->data = c; c->cont = cli_hello; c->parser_pool = lp_new_default(c->pool); - c->show_pool = lp_new_default(c->pool); c->rx_buf = mb_alloc(c->pool, CLI_RX_BUF_SIZE); ev_schedule(c->event); return c; @@ -409,11 +409,19 @@ void cli_free(cli *c) { cli_set_log_echo(c, 0, 0); + int defer = 0; if (c->cleanup) - c->cleanup(c); + defer = c->cleanup(c); if (c == cmd_reconfig_stored_cli) cmd_reconfig_stored_cli = NULL; - rp_free(c->pool, &root_pool); + + if (defer) + { + rfree(c->sock); + c->sock = NULL; + } + else + rfree(c->pool); } /** @@ -425,7 +433,7 @@ cli_free(cli *c) void cli_init(void) { - cli_pool = rp_new(&root_pool, &main_birdloop, "CLI"); + cli_pool = rp_new(&root_pool, "CLI"); init_list(&cli_log_hooks); cli_log_inited = 1; } @@ -28,17 +28,17 @@ struct cli_out { typedef struct cli { node n; /* Node in list of all log hooks */ pool *pool; - void *priv; /* Private to sysdep layer */ - byte *rx_buf, *rx_pos, *rx_aux; /* sysdep */ + struct birdsock *sock; /* Underlying socket */ + byte *rx_buf, *rx_pos; /* sysdep */ struct cli_out *tx_buf, *tx_pos, *tx_write; event *event; void (*cont)(struct cli *c); - void (*cleanup)(struct cli *c); + int (*cleanup)(struct cli *c); /* Return 0 if finished and cli may be freed immediately. + Otherwise return 1 and call rfree(c->pool) when appropriate. */ void *rover; /* Private to continuation routine */ int last_reply; int restricted; /* CLI is restricted to read-only commands */ struct linpool *parser_pool; /* Pool used during parsing */ - struct linpool *show_pool; /* Pool used during route show */ byte *ring_buf; /* Ring buffer for asynchronous messages */ byte *ring_end, *ring_read, *ring_write; /* Pointers to the ring buffer */ uint ring_overflow; /* Counter of ring overflows */ @@ -63,7 +63,7 @@ static inline void cli_separator(cli *c) /* Functions provided to sysdep layer */ -cli *cli_new(void *); +cli *cli_new(struct birdsock *); void cli_init(void); void cli_free(cli *); void cli_kick(cli *); diff --git a/nest/cmds.c b/nest/cmds.c index 77c92077..6717be0c 100644 --- a/nest/cmds.c +++ b/nest/cmds.c @@ -8,7 +8,7 @@ #include "nest/bird.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "conf/conf.h" #include "nest/cmds.h" @@ -51,17 +51,18 @@ cmd_show_symbols(struct sym_show_data *sd) cli_msg(1010, "%-8s\t%s", sd->sym->name, cf_symbol_class_name(sd->sym)); else { - HASH_WALK(config->sym_hash, next, sym) - { - if (!sym->scope->active) - continue; + for (const struct sym_scope *scope = config->root_scope; scope; scope = scope->next) + HASH_WALK(scope->hash, next, sym) + { + if (!sym->scope->active) + continue; - if (sd->type && (sym->class != sd->type)) - continue; + if (sd->type && (sym->class != sd->type)) + continue; - cli_msg(-1010, "%-8s\t%s", sym->name, cf_symbol_class_name(sym)); - } - HASH_WALK_END; + cli_msg(-1010, "%-8s\t%s", sym->name, cf_symbol_class_name(sym)); + } + HASH_WALK_END; cli_msg(0, ""); } @@ -114,10 +115,18 @@ cmd_show_memory(void) { cli_msg(-1018, "BIRD memory usage"); cli_msg(-1018, "%-17s Effective Overhead", ""); - print_size("Routing tables:", rp_memsize(rt_table_pool)); - print_size("Route attributes:", rp_memsize(rta_pool)); - print_size("Protocols:", rp_memsize(proto_pool)); - print_size("Total:", rp_memsize(&root_pool)); + print_size("Routing tables:", rmemsize(rt_table_pool)); + print_size("Route attributes:", rmemsize(rta_pool)); + print_size("Protocols:", rmemsize(proto_pool)); + print_size("Current config:", rmemsize(config_pool)); + struct resmem total = rmemsize(&root_pool); +#ifdef HAVE_MMAP + int pk = atomic_load_explicit(&pages_kept, memory_order_relaxed) + + atomic_load_explicit(&pages_kept_locally, memory_order_relaxed); + print_size("Standby memory:", (struct resmem) { .overhead = page_size * pk }); + total.overhead += page_size * pk; +#endif + print_size("Total:", total); cli_msg(0, ""); } @@ -127,7 +136,7 @@ cmd_eval(const struct f_line *expr) buffer buf; LOG_BUFFER_INIT(buf); - if (f_eval_buf(expr, this_cli->parser_pool, &buf) > F_RETURN) + if (f_eval_buf(expr, &buf) > F_RETURN) { cli_msg(8008, "runtime error"); return; diff --git a/nest/config.Y b/nest/config.Y index f9ed0e69..bce8b228 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -17,6 +17,7 @@ CF_HDR CF_DEFINES +static struct rtable_config *this_table; static struct proto_config *this_proto; static struct channel_config *this_channel; static struct iface_patt *this_ipatt; @@ -117,13 +118,14 @@ CF_KEYWORDS(IPV4, IPV6, VPN4, VPN6, ROA4, ROA6, FLOW4, FLOW6, SADR, MPLS) CF_KEYWORDS(RECEIVE, LIMIT, ACTION, WARN, BLOCK, RESTART, DISABLE, KEEP, FILTERED, RPKI) CF_KEYWORDS(PASSWORD, KEY, FROM, PASSIVE, TO, ID, EVENTS, PACKETS, PROTOCOLS, CHANNELS, INTERFACES) CF_KEYWORDS(ALGORITHM, KEYED, HMAC, MD5, SHA1, SHA256, SHA384, SHA512, BLAKE2S128, BLAKE2S256, BLAKE2B256, BLAKE2B512) -CF_KEYWORDS(PRIMARY, STATS, COUNT, BY, FOR, COMMANDS, PREEXPORT, NOEXPORT, EXPORTED, GENERATE) -CF_KEYWORDS(BGP, PASSWORDS, DESCRIPTION, SORTED) -CF_KEYWORDS(RELOAD, IN, OUT, MRTDUMP, MESSAGES, RESTRICT, MEMORY, IGP_METRIC, CLASS, DSCP) +CF_KEYWORDS(PRIMARY, STATS, COUNT, FOR, IN, COMMANDS, PREEXPORT, NOEXPORT, EXPORTED, GENERATE) +CF_KEYWORDS(BGP, PASSWORDS, DESCRIPTION) +CF_KEYWORDS(RELOAD, IN, OUT, MRTDUMP, MESSAGES, RESTRICT, MEMORY, CLASS, DSCP) CF_KEYWORDS(TIMEFORMAT, ISO, SHORT, LONG, ROUTE, PROTOCOL, BASE, LOG, S, MS, US) -CF_KEYWORDS(GRACEFUL, RESTART, WAIT, MAX, FLUSH, AS) +CF_KEYWORDS(GRACEFUL, RESTART, WAIT, MAX, AS) CF_KEYWORDS(MIN, IDLE, RX, TX, INTERVAL, MULTIPLIER, PASSIVE) CF_KEYWORDS(CHECK, LINK) +CF_KEYWORDS(CORK, SORTED, TRIE, MIN, MAX, ROA, ROUTE, REFRESH, SETTLE, TIME, GC, THRESHOLD, PERIOD) /* For r_args_channel */ CF_KEYWORDS(IPV4, IPV4_MC, IPV4_MPLS, IPV6, IPV6_MC, IPV6_MPLS, IPV6_SADR, VPN4, VPN4_MC, VPN4_MPLS, VPN6, VPN6_MC, VPN6_MPLS, ROA4, ROA6, FLOW4, FLOW6, MPLS, PRI, SEC) @@ -131,7 +133,7 @@ CF_KEYWORDS(IPV4, IPV4_MC, IPV4_MPLS, IPV6, IPV6_MC, IPV6_MPLS, IPV6_SADR, VPN4, CF_ENUM(T_ENUM_RTS, RTS_, STATIC, INHERIT, DEVICE, STATIC_DEVICE, REDIRECT, RIP, OSPF, OSPF_IA, OSPF_EXT1, OSPF_EXT2, BGP, PIPE, BABEL) CF_ENUM(T_ENUM_SCOPE, SCOPE_, HOST, LINK, SITE, ORGANIZATION, UNIVERSE, UNDEFINED) -CF_ENUM(T_ENUM_RTD, RTD_, UNICAST, BLACKHOLE, UNREACHABLE, PROHIBIT) +CF_ENUM(T_ENUM_RTD, RTD_, BLACKHOLE, UNREACHABLE, PROHIBIT) CF_ENUM(T_ENUM_ROA, ROA_, UNKNOWN, VALID, INVALID) CF_ENUM_PX(T_ENUM_AF, AF_, AFI_, IPV4, IPV6) @@ -141,7 +143,7 @@ CF_ENUM_PX(T_ENUM_AF, AF_, AFI_, IPV4, IPV6) %type <s> optproto %type <ra> r_args %type <sd> sym_args -%type <i> proto_start echo_mask echo_size debug_mask debug_list debug_flag mrtdump_mask mrtdump_list mrtdump_flag export_mode limit_action net_type table_sorted tos password_algorithm +%type <i> proto_start echo_mask echo_size debug_mask debug_list debug_flag mrtdump_mask mrtdump_list mrtdump_flag export_mode limit_action net_type tos password_algorithm %type <ps> proto_patt proto_patt2 %type <cc> channel_start proto_channel %type <cl> limit_spec @@ -163,7 +165,7 @@ rtrid: idval: NUM { $$ = $1; } - | '(' term ')' { $$ = f_eval_int(f_linearize($2)); } + | '(' term ')' { $$ = f_eval_int(f_linearize($2, 1)); } | IP4 { $$ = ip4_to_u32($1); } | CF_SYM_KNOWN { if ($1->class == (SYM_CONSTANT | T_INT) || $1->class == (SYM_CONSTANT | T_QUAD)) @@ -206,16 +208,44 @@ CF_ENUM(T_ENUM_NETTYPE, NET_, IP4, IP6, VPN4, VPN6, ROA4, ROA6, FLOW4, FLOW6, IP conf: table ; +table: table_start table_sorted table_opt_list ; + +table_start: net_type TABLE symbol { + this_table = rt_new_table($3, $1); + } + ; + table_sorted: - { $$ = 0; } - | SORTED { $$ = 1; } + /* empty */ + | SORTED { this_table->sorted = 1; } ; -table: net_type TABLE symbol table_sorted { - struct rtable_config *cf; - cf = rt_new_table($3, $1); - cf->sorted = $4; +table_opt: + SORTED bool { this_table->sorted = $2; } + | TRIE bool { + if (!net_val_match(this_table->addr_type, NB_IP | NB_VPN | NB_ROA | NB_IP6_SADR)) + cf_error("Trie option not supported for %s table", net_label[this_table->addr_type]); + this_table->trie_used = $2; } + | GC THRESHOLD expr { this_table->gc_threshold = $3; } + | GC PERIOD expr_us { this_table->gc_period = (uint) $3; if ($3 > 3600 S_) cf_error("GC period must be at most 3600 s"); } + | CORK THRESHOLD expr expr { + if ($3 > $4) cf_error("Cork low threshold must be lower than the high threshold."); + this_table->cork_threshold.low = $3; + this_table->cork_threshold.high = $4; } + | EXPORT SETTLE TIME settle { this_table->export_settle = $4; } + | ROUTE REFRESH EXPORT SETTLE TIME settle { this_table->export_rr_settle = $6; } + | DEBUG bool { this_table->debug = $2; } + ; + +table_opts: + /* empty */ + | table_opts table_opt ';' + ; + +table_opt_list: + /* empty */ + | '{' table_opts '}' ; @@ -283,12 +313,26 @@ channel_item_: this_channel->table = $2; } | IMPORT imexport { this_channel->in_filter = $2; } + | EXPORT IN net_any imexport { + if (this_channel->net_type && ($3->type != this_channel->net_type)) + cf_error("Incompatible export prefilter type"); + this_channel->out_subprefix = $3; + this_channel->out_filter = $4; + } | EXPORT imexport { this_channel->out_filter = $2; } | RECEIVE LIMIT limit_spec { this_channel->rx_limit = $3; } | IMPORT LIMIT limit_spec { this_channel->in_limit = $3; } | EXPORT LIMIT limit_spec { this_channel->out_limit = $3; } + | ROA SETTLE TIME settle { this_channel->roa_settle = $4; } | PREFERENCE expr { this_channel->preference = $2; check_u16($2); } - | IMPORT KEEP FILTERED bool { this_channel->in_keep_filtered = $4; } + | IMPORT KEEP FILTERED bool { + if ($4) + this_channel->in_keep |= RIK_REJECTED; + else if ((this_channel->in_keep & RIK_PREFILTER) == RIK_PREFILTER) + cf_error("Import keep filtered is implied by the import table."); + else + this_channel->in_keep &= ~RIK_REJECTED; + } | RPKI RELOAD bool { this_channel->rpki_reload = $3; } ; @@ -319,7 +363,11 @@ channel_end: proto_channel: channel_start channel_opt_list channel_end; -rtable: CF_SYM_KNOWN { cf_assert_symbol($1, SYM_TABLE); $$ = $1->table; } ; +rtable: CF_SYM_KNOWN { + cf_assert_symbol($1, SYM_TABLE); + if (!$1->table) rt_new_default_table($1); + $$ = $1->table; +} ; imexport: FILTER filter { $$ = $2; } @@ -348,7 +396,7 @@ debug_default: DEBUG PROTOCOLS debug_mask { new_config->proto_default_debug = $3; } | DEBUG CHANNELS debug_mask { new_config->channel_default_debug = $3; } | DEBUG COMMANDS expr { new_config->cli_debug = $3; } - | DEBUG TABLES bool { new_config->table_debug = $3; } + | DEBUG TABLES debug_mask { new_config->table_debug = $3; } ; /* MRTDUMP PROTOCOLS is in systep/unix/config.Y */ @@ -377,7 +425,6 @@ timeformat_base: TIMEFORMAT timeformat_spec ';' ; - /* Interface patterns */ iface_patt_node_init: @@ -609,7 +656,7 @@ CF_CLI(SHOW INTERFACES SUMMARY,,, [[Show summary of network interfaces]]) { if_show_summary(); } ; CF_CLI_HELP(SHOW ROUTE, ..., [[Show routing table]]) -CF_CLI(SHOW ROUTE, r_args, [[[<prefix>|for <prefix>|for <ip>] [table <t>] [filter <f>|where <cond>] [all] [primary] [filtered] [(export|preexport|noexport) <p>] [protocol <p>] [stats|count]]], [[Show routing table]]) +CF_CLI(SHOW ROUTE, r_args, [[[<prefix>|for <prefix>|for <ip>|in <prefix>] [table <t>] [(import|export) table <p>.<c>] [filter <f>|where <cond>] [all] [primary] [filtered] [(export|preexport|noexport) <p>] [protocol <p>] [stats|count]]], [[Show routing table]]) { rt_show($3); } ; r_args: @@ -617,21 +664,31 @@ r_args: $$ = cfg_allocz(sizeof(struct rt_show_data)); init_list(&($$->tables)); $$->filter = FILTER_ACCEPT; - $$->running_on_config = new_config->fallback; + $$->running_on_config = config; + $$->cli = this_cli; } | r_args net_any { $$ = $1; if ($$->addr) cf_error("Only one prefix expected"); $$->addr = $2; + $$->addr_mode = TE_ADDR_EQUAL; } | r_args FOR r_args_for { $$ = $1; if ($$->addr) cf_error("Only one prefix expected"); - $$->show_for = 1; $$->addr = $3; + $$->addr_mode = TE_ADDR_FOR; + } + | r_args IN net_any { + $$ = $1; + if ($$->addr) cf_error("Only one prefix expected"); + if (!net_type_match($3, NB_IP)) cf_error("Only IP networks accepted for 'in' argument"); + $$->addr = $3; + $$->addr_mode = TE_ADDR_IN; } - | r_args TABLE CF_SYM_KNOWN { +| r_args TABLE symbol_known { cf_assert_symbol($3, SYM_TABLE); + if (!$3->table) cf_error("Table %s not configured", $3->name); $$ = $1; rt_show_add_table($$, $3->table->table); $$->tables_defined_by = RSD_TDB_DIRECT; @@ -644,13 +701,14 @@ r_args: $$->tables_defined_by = RSD_TDB_ALL; } | r_args IMPORT TABLE channel_arg { - if (!$4->in_table) cf_error("No import table in channel %s.%s", $4->proto->name, $4->name); - rt_show_add_table($$, $4->in_table->tab); + if (!($4->in_keep & RIK_PREFILTER)) cf_error("No import table in channel %s.%s", $4->proto->name, $4->name); + RT_LOCKED($4->table, tab) + rt_show_add_exporter($$, &tab->exporter.e, "import")->prefilter = $4; $$->tables_defined_by = RSD_TDB_DIRECT; } | r_args EXPORT TABLE channel_arg { if (!$4->out_table) cf_error("No export table in channel %s.%s", $4->proto->name, $4->name); - rt_show_add_table($$, $4->out_table->tab); + rt_show_add_exporter($$, $4->out_table, "export"); $$->tables_defined_by = RSD_TDB_DIRECT; } | r_args FILTER filter { @@ -675,7 +733,7 @@ r_args: $$ = $1; $$->filtered = 1; } - | r_args export_mode CF_SYM_KNOWN { + | r_args export_mode symbol_known { cf_assert_symbol($3, SYM_PROTO); struct proto_config *c = (struct proto_config *) $3->proto; $$ = $1; @@ -692,7 +750,7 @@ r_args: $$->export_channel = $3; $$->tables_defined_by = RSD_TDB_INDIRECT; } - | r_args PROTOCOL CF_SYM_KNOWN { + | r_args PROTOCOL symbol_known { cf_assert_symbol($3, SYM_PROTO); struct proto_config *c = (struct proto_config *) $3->proto; $$ = $1; @@ -810,7 +868,7 @@ sym_args: CF_CLI_HELP(DUMP, ..., [[Dump debugging information]]) CF_CLI(DUMP RESOURCES,,, [[Dump all allocated resource]]) -{ rp_dump(&root_pool); cli_msg(0, ""); } ; +{ rdump(&root_pool, 0); cli_msg(0, ""); } ; CF_CLI(DUMP SOCKETS,,, [[Dump open sockets]]) { sk_dump_all(); cli_msg(0, ""); } ; CF_CLI(DUMP EVENTS,,, [[Dump event log]]) @@ -820,7 +878,7 @@ CF_CLI(DUMP INTERFACES,,, [[Dump interface information]]) CF_CLI(DUMP NEIGHBORS,,, [[Dump neighbor cache]]) { neigh_dump_all(); cli_msg(0, ""); } ; CF_CLI(DUMP ATTRIBUTES,,, [[Dump attribute cache]]) -{ rta_dump_all(); cli_msg(0, ""); } ; +{ ea_dump_all(); cli_msg(0, ""); } ; CF_CLI(DUMP ROUTES,,, [[Dump routes]]) { rt_dump_all(); cli_msg(0, ""); } ; CF_CLI(DUMP TABLES,,, [[Dump table connections]]) @@ -831,7 +889,7 @@ CF_CLI(DUMP FILTER ALL,,, [[Dump all filters in linearized form]]) { filters_dump_all(); cli_msg(0, ""); } ; CF_CLI(EVAL, term, <expr>, [[Evaluate an expression]]) -{ cmd_eval(f_linearize($2)); } ; +{ cmd_eval(f_linearize($2, 1)); } ; CF_CLI_HELP(ECHO, ..., [[Control echoing of log messages]]) CF_CLI(ECHO, echo_mask echo_size, (all | off | { debug|trace|info|remote|warning|error|auth [, ...] }) [<buffer-size>], [[Control echoing of log messages]]) { @@ -894,9 +952,6 @@ proto_patt2: | TEXT { $$.ptr = $1; $$.patt = 1; } ; -dynamic_attr: IGP_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_GEN_IGP_METRIC); } ; - - CF_CODE CF_END diff --git a/nest/iface.c b/nest/iface.c index 78f4eb0f..f1938664 100644 --- a/nest/iface.c +++ b/nest/iface.c @@ -31,17 +31,50 @@ #include "nest/cli.h" #include "lib/resource.h" #include "lib/string.h" +#include "lib/locking.h" #include "conf/conf.h" #include "sysdep/unix/krt.h" +DOMAIN(attrs) iface_domain; + +#define IFACE_LOCK LOCK_DOMAIN(attrs, iface_domain) +#define IFACE_UNLOCK UNLOCK_DOMAIN(attrs, iface_domain) +#define IFACE_ASSERT_LOCKED ASSERT_DIE(DOMAIN_IS_LOCKED(attrs, iface_domain)) + +static TLIST_LIST(ifsub) iface_sub_list; +static slab *iface_sub_slab; static pool *if_pool; -DOMAIN(attrs) iface_domain; list global_iface_list; struct iface default_vrf; static void if_recalc_preferred(struct iface *i); -static void ifa_delete_locked(struct ifa *a); + +static void ifa_dump_locked(struct ifa *); +static void if_dump_locked(struct iface *); + +struct iface * +if_walk_first(void) +{ + IFACE_LOCK; + struct iface *i = HEAD(global_iface_list); + return NODE_VALID(i) ? i : NULL; +} + +struct iface * +if_walk_next(struct iface *i) +{ + IFACE_ASSERT_LOCKED; + i = NODE_NEXT(i); + return NODE_VALID(i) ? i : NULL; +} + +void +if_walk_done(void) +{ + IFACE_ASSERT_LOCKED; + IFACE_UNLOCK; +} /** * ifa_dump - dump interface address @@ -52,7 +85,14 @@ static void ifa_delete_locked(struct ifa *a); void ifa_dump(struct ifa *a) { - IFACE_LEGACY_ACCESS; + IFACE_LOCK; + ifa_dump_locked(a); + IFACE_UNLOCK; +} + +static void +ifa_dump_locked(struct ifa *a) +{ debug("\t%I, net %N bc %I -> %I%s%s%s%s\n", a->ip, &a->prefix, a->brd, a->opposite, (a->flags & IA_PRIMARY) ? " PRIMARY" : "", (a->flags & IA_SECONDARY) ? " SEC" : "", @@ -70,9 +110,16 @@ ifa_dump(struct ifa *a) void if_dump(struct iface *i) { + IFACE_LOCK; + if_dump_locked(i); + IFACE_UNLOCK; +} + +static void +if_dump_locked(struct iface *i) +{ struct ifa *a; - IFACE_LEGACY_ACCESS; debug("IF%d: %s", i->index, i->name); if (i->flags & IF_SHUTDOWN) debug(" SHUTDOWN"); @@ -97,7 +144,7 @@ if_dump(struct iface *i) debug(" MTU=%d\n", i->mtu); WALK_LIST(a, i->addrs) { - ifa_dump(a); + ifa_dump_locked(a); ASSERT(!!(a->flags & IA_PRIMARY) == ((a == i->addr4) || (a == i->addr6) || (a == i->llv6))); } @@ -112,15 +159,60 @@ if_dump(struct iface *i) void if_dump_all(void) { - struct iface *i; - - IFACE_LEGACY_ACCESS; debug("Known network interfaces:\n"); - WALK_LIST(i, global_iface_list) + IFACE_WALK(i) if_dump(i); debug("Router ID: %08x\n", config->router_id); } +void +if_link(struct iface *i) +{ + IFACE_ASSERT_LOCKED; + + if (i) + i->uc++; +} + +void +if_unlink(struct iface *i) +{ + IFACE_ASSERT_LOCKED; + + if (i) + i->uc--; + /* TODO: Do some interface object cleanup */ +} + +void ifa_link(struct ifa *a) +{ + IFACE_ASSERT_LOCKED; + + if (a) + { + debug("ifa_link: %p %d\n", a, a->uc); + a->uc++; + } +} + +void ifa_unlink(struct ifa *a) +{ + IFACE_ASSERT_LOCKED; + + if (!a) + return; + + debug("ifa_unlink: %p %d\n", a, a->uc); + if (--a->uc) + return; + + if_unlink(a->iface); +#if DEBUGGING + memset(a, 0x5b, sizeof(struct ifa)); +#endif + mb_free(a); +} + static inline unsigned if_what_changed(struct iface *i, struct iface *j) { @@ -145,13 +237,57 @@ if_copy(struct iface *to, struct iface *from) to->flags = from->flags | (to->flags & IF_TMP_DOWN); to->mtu = from->mtu; to->master_index = from->master_index; - to->master = from->master; + + if_unlink(to->master); + if_link(to->master = from->master); +} + +void +if_enqueue_notify_to(struct iface_notification x, struct iface_subscription *s) +{ + IFACE_ASSERT_LOCKED; + + switch (x.type) { + case IFNOT_ADDRESS: + if (!s->ifa_notify) return; + ifa_link(x.a); + break; + case IFNOT_INTERFACE: + if (!s->if_notify) return; + if_link(x.i); + break; + case IFNOT_NEIGHBOR: + if (!s->neigh_notify) return; + neigh_link(x.n); + break; + default: + bug("Unknown interface notification type: %d", x.type); + } + + struct iface_notification *in = sl_alloc(iface_sub_slab); + *in = x; + + debug("Enqueue notify %d/%p (%p) to %p\n", x.type, x.a, in, s); + + ifnot_add_tail(&s->queue, in); + ev_send(s->target, &s->event); +} + +void +if_enqueue_notify(struct iface_notification x) +{ + IFACE_ASSERT_LOCKED; + + WALK_TLIST(ifsub, s, &iface_sub_list) + if_enqueue_notify_to(x, s); } static inline void -ifa_send_notify(struct proto *p, unsigned c, struct ifa *a) +ifa_send_notify(struct iface_subscription *s, unsigned c, struct ifa *a) { - if (p->ifa_notify && + struct proto *p = SKIP_BACK(struct proto, iface_sub, s); + + if (s->ifa_notify && (p->proto_state != PS_DOWN) && (!p->vrf || p->vrf == a->iface->master)) { @@ -159,20 +295,21 @@ ifa_send_notify(struct proto *p, unsigned c, struct ifa *a) log(L_TRACE "%s < address %N on interface %s %s", p->name, &a->prefix, a->iface->name, (c & IF_CHANGE_UP) ? "added" : "removed"); - p->ifa_notify(p, c, a); + s->ifa_notify(p, c, a); } } static void ifa_notify_change_(unsigned c, struct ifa *a) { - struct proto *p; - DBG("IFA change notification (%x) for %s:%I\n", c, a->iface->name, a->ip); - WALK_LIST(p, proto_list) - PROTO_LOCKED_FROM_MAIN(p) - ifa_send_notify(p, c, a); + if_enqueue_notify((struct iface_notification) { + .type = IFNOT_ADDRESS, + .a = a, + .flags = c, + }); + } static inline void @@ -188,9 +325,11 @@ ifa_notify_change(unsigned c, struct ifa *a) } static inline void -if_send_notify(struct proto *p, unsigned c, struct iface *i) +if_send_notify(struct iface_subscription *s, unsigned c, struct iface *i) { - if (p->if_notify && + struct proto *p = SKIP_BACK(struct proto, iface_sub, s); + + if (s->if_notify && (p->proto_state != PS_DOWN) && (!p->vrf || p->vrf == i->master)) { @@ -203,14 +342,13 @@ if_send_notify(struct proto *p, unsigned c, struct iface *i) (c & IF_CHANGE_PREFERRED) ? "changes preferred address" : (c & IF_CHANGE_CREATE) ? "created" : "sends unknown event"); - p->if_notify(p, c, i); + s->if_notify(p, c, i); } } static void if_notify_change(unsigned c, struct iface *i) { - struct proto *p; struct ifa *a; if (i->flags & IF_JUST_CREATED) @@ -221,7 +359,7 @@ if_notify_change(unsigned c, struct iface *i) DBG("Interface change notification (%x) for %s\n", c, i->name); #ifdef LOCAL_DEBUG - if_dump(i); + if_dump_locked(i); #endif if (c & IF_CHANGE_DOWN) @@ -231,9 +369,11 @@ if_notify_change(unsigned c, struct iface *i) WALK_LIST(a, i->addrs) ifa_notify_change_(IF_CHANGE_DOWN, a); - WALK_LIST(p, proto_list) - PROTO_LOCKED_FROM_MAIN(p) - if_send_notify(p, c, i); + if_enqueue_notify((struct iface_notification) { + .type = IFNOT_INTERFACE, + .i = i, + .flags = c, + }); if (c & IF_CHANGE_UP) WALK_LIST(a, i->addrs) @@ -281,10 +421,12 @@ if_change_flags(struct iface *i, uint flags) void if_delete(struct iface *old) { + IFACE_LOCK; struct iface f = {}; strncpy(f.name, old->name, sizeof(f.name)-1); f.flags = IF_SHUTDOWN; - if_update(&f); + if_update_locked(&f); + IFACE_UNLOCK; } /** @@ -306,15 +448,21 @@ if_delete(struct iface *old) struct iface * if_update(struct iface *new) { + IFACE_LOCK; + struct iface *i = if_update_locked(new); + IFACE_UNLOCK; + return i; +} + +struct iface * +if_update_locked(struct iface *new) +{ struct iface *i; unsigned c; if (!new->master) new->master = &default_vrf; - IFACE_LEGACY_ACCESS; - IFACE_LOCK; - WALK_LIST(i, global_iface_list) if (!strcmp(new->name, i->name)) { @@ -330,14 +478,13 @@ if_update(struct iface *new) new->llv6 = i->llv6; new->sysdep = i->sysdep; memcpy(&new->addrs, &i->addrs, sizeof(i->addrs)); + memcpy(&new->neighbors, &i->neighbors, sizeof(i->neighbors)); memcpy(i, new, sizeof(*i)); i->flags &= ~IF_UP; /* IF_TMP_DOWN will be added later */ goto newif; } if_copy(i, new); - IFACE_UNLOCK; - if (c) if_notify_change(c, i); @@ -346,24 +493,21 @@ if_update(struct iface *new) } i = mb_alloc(if_pool, sizeof(struct iface)); memcpy(i, new, sizeof(*i)); + if_link(i->master); init_list(&i->addrs); -newif: init_list(&i->neighbors); +newif: i->flags |= IF_UPDATED | IF_TMP_DOWN; /* Tmp down as we don't have addresses yet */ add_tail(&global_iface_list, &i->n); - IFACE_UNLOCK; - return i; } void if_start_update(void) { - struct iface *i; struct ifa *a; - IFACE_LEGACY_ACCESS; - WALK_LIST(i, global_iface_list) + IFACE_WALK(i) { i->flags &= ~IF_UPDATED; WALK_LIST(a, i->addrs) @@ -371,11 +515,9 @@ if_start_update(void) } } -void -if_end_partial_update(struct iface *i) +static void +if_end_partial_update_locked(struct iface *i) { - IFACE_LEGACY_ACCESS; - if (i->flags & IF_NEEDS_RECALC) if_recalc_preferred(i); @@ -384,13 +526,19 @@ if_end_partial_update(struct iface *i) } void +if_end_partial_update(struct iface *i) +{ + IFACE_LOCK; + if_end_partial_update_locked(i); + IFACE_UNLOCK; +} + +void if_end_update(void) { - struct iface *i; struct ifa *a, *b; - IFACE_LEGACY_ACCESS; - WALK_LIST(i, global_iface_list) + IFACE_WALK(i) { if (!(i->flags & IF_UPDATED)) if_change_flags(i, (i->flags & ~IF_ADMIN_UP) | IF_SHUTDOWN); @@ -398,51 +546,146 @@ if_end_update(void) { WALK_LIST_DELSAFE(a, b, i->addrs) if (!(a->flags & IA_UPDATED)) - { - IFACE_LOCK; - ifa_delete_locked(a); - IFACE_UNLOCK; - } - if_end_partial_update(i); + ifa_delete(a); + if_end_partial_update_locked(i); } } } -void -if_flush_ifaces(struct proto *p) +static void +iface_notify_hook(void *_s) { - IFACE_LEGACY_ACCESS; - if (p->debug & D_EVENTS) - log(L_TRACE "%s: Flushing interfaces", p->name); - if_start_update(); - if_end_update(); + struct iface_subscription *s = _s; + + IFACE_LOCK; + + while (!EMPTY_TLIST(ifnot, &s->queue)) + { + struct iface_notification *n = THEAD(ifnot, &s->queue); + debug("Process notify %d/%p (%p) to %p\n", n->type, n->a, n, s); + IFACE_UNLOCK; + + switch (n->type) { + case IFNOT_ADDRESS: + ifa_send_notify(s, n->flags, n->a); + IFACE_LOCK; + ifa_unlink(n->a); + IFACE_UNLOCK; + break; + case IFNOT_INTERFACE: + if_send_notify(s, n->flags, n->i); + IFACE_LOCK; + if_unlink(n->i); + IFACE_UNLOCK; + break; + case IFNOT_NEIGHBOR: + s->neigh_notify(n->n); + IFACE_LOCK; + neigh_unlink(n->n); + IFACE_UNLOCK; + break; + default: + bug("Bad interface notification type: %d", n->type); + } + + IFACE_LOCK; + ifnot_rem_node(&s->queue, n); + sl_free(n); + } + + IFACE_UNLOCK; } + /** - * if_feed_baby - advertise interfaces to a new protocol - * @p: protocol to feed + * iface_subscribe - request interface updates + * @s: subscription structure * * When a new protocol starts, this function sends it a series * of notifications about all existing interfaces. */ void -if_feed_baby(struct proto *p) +iface_subscribe(struct iface_subscription *s) { - struct iface *i; - struct ifa *a; - - IFACE_LEGACY_ACCESS; + IFACE_LOCK; + ifsub_add_tail(&iface_sub_list, s); + s->event = (event) { + .hook = iface_notify_hook, + .data = s, + }; - if (!p->if_notify && !p->ifa_notify) /* shortcut */ + if (!s->if_notify && !s->ifa_notify) /* shortcut */ + { + IFACE_UNLOCK; return; + } + + struct iface *i; DBG("Announcing interfaces to new protocol %s\n", p->name); WALK_LIST(i, global_iface_list) { - if_send_notify(p, IF_CHANGE_CREATE | ((i->flags & IF_UP) ? IF_CHANGE_UP : 0), i); + if_enqueue_notify_to( + (struct iface_notification) { + .type = IFNOT_INTERFACE, + .i = i, + .flags = IF_CHANGE_CREATE | ((i->flags & IF_UP) ? IF_CHANGE_UP : 0), + }, s); + + struct ifa *a; if (i->flags & IF_UP) WALK_LIST(a, i->addrs) - ifa_send_notify(p, IF_CHANGE_CREATE | IF_CHANGE_UP, a); + if_enqueue_notify_to( + (struct iface_notification) { + .type = IFNOT_ADDRESS, + .a = a, + .flags = IF_CHANGE_CREATE | IF_CHANGE_UP, + }, s); } + + IFACE_UNLOCK; +} + +/** + * iface_unsubscribe - unsubscribe from interface updates + * @s: subscription structure + */ +void +iface_unsubscribe(struct iface_subscription *s) +{ + IFACE_LOCK; + + struct proto *p = SKIP_BACK(struct proto, iface_sub, s); + WALK_TLIST_DELSAFE(proto_neigh, n, &p->neighbors) + neigh_unlink(n); + + ifsub_rem_node(&iface_sub_list, s); + ev_postpone(&s->event); + + WALK_TLIST_DELSAFE(ifnot, n, &s->queue) + { + debug("Drop notify %d/%p (%p) to %p\n", n->type, n->a, n, s); + switch (n->type) + { + case IFNOT_ADDRESS: + ifa_unlink(n->a); + break; + case IFNOT_INTERFACE: + if_unlink(n->i); + break; + case IFNOT_NEIGHBOR: + neigh_unlink(n->n); + break; + default: + bug("Bad interface notification type: %d", n->type); + } + + ifnot_rem_node(&s->queue, n); + sl_free(n); + } + + ASSERT_DIE(EMPTY_TLIST(proto_neigh, &p->neighbors)); + + IFACE_UNLOCK; } /** @@ -454,22 +697,26 @@ if_feed_baby(struct proto *p) * if no such structure exists. */ struct iface * -if_find_by_index(unsigned idx) +if_find_by_index_locked(unsigned idx) { struct iface *i; - IFACE_LOCK; WALK_LIST(i, global_iface_list) if (i->index == idx && !(i->flags & IF_SHUTDOWN)) - { - IFACE_UNLOCK; return i; - } - IFACE_UNLOCK; return NULL; } +struct iface * +if_find_by_index(unsigned idx) +{ + IFACE_LOCK; + struct iface *i = if_find_by_index_locked(idx); + IFACE_UNLOCK; + return i; +} + /** * if_find_by_name - find interface by name * @name: interface name @@ -500,20 +747,22 @@ if_get_by_name(const char *name) { struct iface *i; - IFACE_LEGACY_ACCESS; - + IFACE_LOCK; WALK_LIST(i, global_iface_list) if (!strcmp(i->name, name)) + { + IFACE_UNLOCK; return i; + } /* No active iface, create a dummy */ - IFACE_LOCK; i = mb_allocz(if_pool, sizeof(struct iface)); strncpy(i->name, name, sizeof(i->name)-1); i->flags = IF_SHUTDOWN; init_list(&i->addrs); init_list(&i->neighbors); add_tail(&global_iface_list, &i->n); + IFACE_UNLOCK; return i; } @@ -598,11 +847,7 @@ if_recalc_preferred(struct iface *i) void if_recalc_all_preferred_addresses(void) { - struct iface *i; - - IFACE_LEGACY_ACCESS; - - WALK_LIST(i, global_iface_list) + IFACE_WALK(i) { if_recalc_preferred(i); @@ -629,11 +874,11 @@ ifa_same(struct ifa *a, struct ifa *b) struct ifa * ifa_update(struct ifa *a) { + IFACE_LOCK; + struct iface *i = a->iface; struct ifa *b; - IFACE_LEGACY_ACCESS; - WALK_LIST(b, i->addrs) if (ifa_same(b, a)) { @@ -643,6 +888,8 @@ ifa_update(struct ifa *a) !((b->flags ^ a->flags) & (IA_SECONDARY | IA_PEER | IA_HOST))) { b->flags |= IA_UPDATED; + + IFACE_UNLOCK; return b; } ifa_delete(b); @@ -652,16 +899,18 @@ ifa_update(struct ifa *a) if ((a->prefix.type == NET_IP4) && (i->flags & IF_BROADCAST) && ipa_zero(a->brd)) log(L_WARN "Missing broadcast address for interface %s", i->name); - IFACE_LOCK; b = mb_alloc(if_pool, sizeof(struct ifa)); memcpy(b, a, sizeof(struct ifa)); + ifa_link(b); + if_link(i); add_tail(&i->addrs, &b->n); b->flags |= IA_UPDATED; - IFACE_UNLOCK; i->flags |= IF_NEEDS_RECALC; if (i->flags & IF_UP) ifa_notify_change(IF_CHANGE_CREATE | IF_CHANGE_UP, b); + + IFACE_UNLOCK; return b; } @@ -679,20 +928,11 @@ ifa_delete(struct ifa *a) struct iface *i = a->iface; struct ifa *b; + IFACE_LOCK; + WALK_LIST(b, i->addrs) if (ifa_same(b, a)) { - IFACE_LOCK; - ifa_delete_locked(b); - IFACE_UNLOCK; - return; - } -} - -static void -ifa_delete_locked(struct ifa *b) -{ - struct iface *i = b->iface; rem_node(&b->n); if (b->flags & IA_PRIMARY) @@ -713,18 +953,22 @@ ifa_delete_locked(struct ifa *b) if (i->flags & IF_UP) ifa_notify_change(IF_CHANGE_DOWN, b); - mb_free(b); + ifa_unlink(b); + IFACE_UNLOCK; return; + } + + IFACE_UNLOCK; } u32 if_choose_router_id(struct iface_patt *mask, u32 old_id) { + IFACE_LOCK; + struct iface *i; struct ifa *a, *b; - IFACE_LEGACY_ACCESS; - b = NULL; WALK_LIST(i, global_iface_list) { @@ -753,6 +997,8 @@ if_choose_router_id(struct iface_patt *mask, u32 old_id) } } + IFACE_UNLOCK; + if (!b) return 0; @@ -772,11 +1018,12 @@ if_choose_router_id(struct iface_patt *mask, u32 old_id) void if_init(void) { - iface_domain = DOMAIN_NEW(attrs, "Interfaces"); - if_pool = rp_new(&root_pool, &main_birdloop, "Interfaces"); + if_pool = rp_new(&root_pool, "Interfaces"); init_list(&global_iface_list); + iface_sub_slab = sl_new(if_pool, sizeof(struct iface_notification)); strcpy(default_vrf.name, "default"); neigh_init(if_pool); + iface_domain = DOMAIN_NEW(attrs, "Interfaces"); } /* @@ -898,13 +1145,10 @@ if_show_addr(struct ifa *a) void if_show(void) { - struct iface *i; struct ifa *a; char *type; - IFACE_LEGACY_ACCESS; - - WALK_LIST(i, global_iface_list) + IFACE_WALK(i) { if (i->flags & IF_SHUTDOWN) continue; @@ -944,12 +1188,8 @@ if_show(void) void if_show_summary(void) { - struct iface *i; - - IFACE_LEGACY_ACCESS; - cli_msg(-2005, "%-10s %-6s %-18s %s", "Interface", "State", "IPv4 address", "IPv6 address"); - WALK_LIST(i, global_iface_list) + IFACE_WALK(i) { byte a4[IPA_MAX_TEXT_LENGTH + 17]; byte a6[IPA_MAX_TEXT_LENGTH + 17]; diff --git a/nest/iface.h b/nest/iface.h index 87ad86cf..05898a55 100644 --- a/nest/iface.h +++ b/nest/iface.h @@ -9,20 +9,11 @@ #ifndef _BIRD_IFACE_H_ #define _BIRD_IFACE_H_ +#include "lib/locking.h" #include "lib/event.h" #include "lib/lists.h" +#include "lib/tlists.h" #include "lib/ip.h" -#include "lib/locking.h" - -DEFINE_DOMAIN(attrs); -extern list global_iface_list; -extern DOMAIN(attrs) iface_domain; - -#define IFACE_LEGACY_ACCESS ASSERT_DIE(birdloop_inside(&main_birdloop)) - -#define IFACE_LOCK LOCK_DOMAIN(attrs, iface_domain) -#define IFACE_UNLOCK UNLOCK_DOMAIN(attrs, iface_domain) -#define ASSERT_IFACE_LOCKED ASSERT_DIE(DOMAIN_IS_LOCKED(attrs, iface_domain)) struct proto; struct pool; @@ -36,6 +27,7 @@ struct ifa { /* Interface address */ ip_addr opposite; /* Opposite end of a point-to-point link */ unsigned scope; /* Interface address scope */ unsigned flags; /* Analogous to iface->flags */ + unsigned uc; /* Use (link) count */ }; extern struct iface default_vrf; @@ -54,6 +46,7 @@ struct iface { struct ifa *llv6; /* Primary link-local address for IPv6 */ ip4_addr sysdep; /* Arbitrary IPv4 address for internal sysdep use */ list neighbors; /* All neighbors on this interface */ + unsigned uc; /* Use (link) count */ }; #define IF_UP 1 /* Currently just IF_ADMIN_UP */ @@ -118,49 +111,59 @@ void ifa_dump(struct ifa *); void if_show(void); void if_show_summary(void); struct iface *if_update(struct iface *); +struct iface *if_update_locked(struct iface *); void if_delete(struct iface *old); struct ifa *ifa_update(struct ifa *); void ifa_delete(struct ifa *); void if_start_update(void); void if_end_partial_update(struct iface *); void if_end_update(void); -void if_flush_ifaces(struct proto *p); -void if_feed_baby(struct proto *); struct iface *if_find_by_index(unsigned); +struct iface *if_find_by_index_locked(unsigned); struct iface *if_find_by_name(const char *); struct iface *if_get_by_name(const char *); void if_recalc_all_preferred_addresses(void); +struct iface *if_walk_first(void); +struct iface *if_walk_next(struct iface *); +void if_walk_done(void); + +#define IFACE_WALK(_i) for (struct iface *_i = if_walk_first(); _i || (if_walk_done(), 0); _i = if_walk_next(_i)) /* The Neighbor Cache */ typedef struct neighbor { node n; /* Node in neighbor hash table chain */ node if_n; /* Node in per-interface neighbor list */ + TLIST_NODE(proto_neigh, struct neighbor) proto_n; ip_addr addr; /* Address of the neighbor */ struct ifa *ifa; /* Ifa on related iface */ struct iface *iface; /* Interface it's connected to */ struct iface *ifreq; /* Requested iface, NULL for any */ - struct event event; /* Notification event */ struct proto *proto; /* Protocol this belongs to */ void *data; /* Protocol-specific data */ uint aux; /* Protocol-specific data */ u16 flags; /* NEF_* flags */ s16 scope; /* Address scope, -1 for unreachable neighbors, SCOPE_HOST when it's our own address */ + uint uc; /* Use (link) count */ } neighbor; +#define TLIST_PREFIX proto_neigh +#define TLIST_TYPE struct neighbor +#define TLIST_ITEM proto_n +#define TLIST_WANT_WALK +#define TLIST_WANT_ADD_TAIL +#include "lib/tlists.h" + #define NEF_STICKY 1 #define NEF_ONLINK 2 #define NEF_IFACE 4 /* Entry for whole iface */ -#define NEF_NOTIFY_MAIN 0x100 /* Notify from main_birdloop context */ neighbor *neigh_find(struct proto *p, ip_addr a, struct iface *ifa, uint flags); -void neigh_dump(neighbor *); void neigh_dump_all(void); -void neigh_prune(struct proto *p); void neigh_if_up(struct iface *); void neigh_if_down(struct iface *); void neigh_if_link(struct iface *); @@ -168,6 +171,64 @@ void neigh_ifa_up(struct ifa *a); void neigh_ifa_down(struct ifa *a); void neigh_init(struct pool *); +void neigh_link(neighbor *); +void neigh_unlink(neighbor *); + +/* + * Notification mechanism + */ + +#define TLIST_PREFIX ifnot +#define TLIST_TYPE struct iface_notification +#define TLIST_ITEM nn +#define TLIST_WANT_WALK +#define TLIST_WANT_ADD_TAIL + +struct iface_notification { + TLIST_DEFAULT_NODE; + enum { + IFNOT_INVALID, + IFNOT_ADDRESS, + IFNOT_INTERFACE, + IFNOT_NEIGHBOR, + } type; + unsigned flags; + union { + struct ifa *a; + struct iface *i; + neighbor *n; + }; +}; + +#include "lib/tlists.h" + +#define TLIST_PREFIX ifsub +#define TLIST_TYPE struct iface_subscription +#define TLIST_ITEM n +#define TLIST_WANT_WALK +#define TLIST_WANT_ADD_TAIL + +struct iface_subscription { + TLIST_DEFAULT_NODE; + + event event; + event_list *target; + TLIST_LIST(ifnot) queue; + + void (*if_notify)(struct proto *, unsigned flags, struct iface *i); + void (*ifa_notify)(struct proto *, unsigned flags, struct ifa *a); + void (*neigh_notify)(struct neighbor *neigh); +}; + +#include "lib/tlists.h" + +void if_enqueue_notify(struct iface_notification); +void if_enqueue_notify_to(struct iface_notification x, struct iface_subscription *s); + +void iface_flush_notifications(struct iface_subscription *); +void iface_subscribe(struct iface_subscription *); +void iface_unsubscribe(struct iface_subscription *); + /* * Interface Pattern Lists */ diff --git a/nest/limit.h b/nest/limit.h index 5838ad3b..f8d4b212 100644 --- a/nest/limit.h +++ b/nest/limit.h @@ -32,6 +32,7 @@ static inline int limit_push(struct limit *l, void *data) static inline void limit_pop(struct limit *l) { + ASSERT_DIE(l->count > 0); --l->count; } diff --git a/nest/locks.c b/nest/locks.c index 812a6534..247d0c56 100644 --- a/nest/locks.c +++ b/nest/locks.c @@ -37,7 +37,11 @@ #include "nest/iface.h" static list olock_list; -static event *olock_event; + +DEFINE_DOMAIN(attrs); +static DOMAIN(attrs) olock_domain; +#define OBJ_LOCK LOCK_DOMAIN(attrs, olock_domain) +#define OBJ_UNLOCK UNLOCK_DOMAIN(attrs, olock_domain) static inline int olock_same(struct object_lock *x, struct object_lock *y) @@ -54,39 +58,56 @@ olock_same(struct object_lock *x, struct object_lock *y) static void olock_free(resource *r) { - struct object_lock *q, *l = (struct object_lock *) r; + /* Called externally from rfree() */ + struct object_lock *l = SKIP_BACK(struct object_lock, r, r); node *n; + OBJ_LOCK; DBG("olock: Freeing %p\n", l); switch (l->state) { case OLOCK_STATE_FREE: break; case OLOCK_STATE_LOCKED: - case OLOCK_STATE_EVENT: + /* Remove myself from the olock_list */ rem_node(&l->n); + + /* Maybe the notification is still pending. */ + ev_postpone(&l->event); + + /* Get new lock candidate */ n = HEAD(l->waiters); - if (n->next) + if (NODE_VALID(n)) { - DBG("olock: -> %p becomes locked\n", n); - q = SKIP_BACK(struct object_lock, n, n); + struct object_lock *q = SKIP_BACK(struct object_lock, n, n); + + /* Remove this candidate from waiters list */ rem_node(n); + + /* Move waiter lists */ + DBG("olock: -> %p becomes locked\n", n); add_tail_list(&q->waiters, &l->waiters); - q->state = OLOCK_STATE_EVENT; + + /* Add the new olock to olock_list */ add_head(&olock_list, n); - ev_schedule(olock_event); + + /* Inform */ + q->state = OLOCK_STATE_LOCKED; + ev_send(q->target, &q->event); } break; case OLOCK_STATE_WAITING: + /* Remove from the waiters list */ rem_node(&l->n); break; default: ASSERT(0); } + OBJ_UNLOCK; } static void -olock_dump(resource *r) +olock_dump(resource *r, unsigned indent UNUSED) { struct object_lock *l = (struct object_lock *) r; static char *olock_states[] = { "free", "locked", "waiting", "event" }; @@ -140,6 +161,8 @@ olock_acquire(struct object_lock *l) node *n; struct object_lock *q; + OBJ_LOCK; + WALK_LIST(n, olock_list) { q = SKIP_BACK(struct object_lock, n, n); @@ -148,36 +171,19 @@ olock_acquire(struct object_lock *l) l->state = OLOCK_STATE_WAITING; add_tail(&q->waiters, &l->n); DBG("olock: %p waits\n", l); + + OBJ_UNLOCK; return; } } + DBG("olock: %p acquired immediately\n", l); - l->state = OLOCK_STATE_EVENT; add_head(&olock_list, &l->n); - ev_schedule(olock_event); -} -static void -olock_run_event(void *unused UNUSED) -{ - node *n; - struct object_lock *q; + l->state = OLOCK_STATE_LOCKED; + ev_send(l->target, &l->event); - DBG("olock: Processing events\n"); - for(;;) - { - n = HEAD(olock_list); - if (!n->next) - break; - q = SKIP_BACK(struct object_lock, n, n); - if (q->state != OLOCK_STATE_EVENT) - break; - DBG("olock: %p locked\n", q); - q->state = OLOCK_STATE_LOCKED; - rem_node(&q->n); - add_tail(&olock_list, &q->n); - q->hook(q); - } + OBJ_UNLOCK; } /** @@ -191,5 +197,5 @@ olock_init(void) { DBG("olock: init\n"); init_list(&olock_list); - olock_event = ev_new_init(&root_pool, olock_run_event, NULL); + olock_domain = DOMAIN_NEW(attrs, "Object lock"); } diff --git a/nest/locks.h b/nest/locks.h index 37026c68..04571e69 100644 --- a/nest/locks.h +++ b/nest/locks.h @@ -31,8 +31,8 @@ struct object_lock { uint inst; /* ... instance ID */ struct iface *iface; /* ... interface */ struct iface *vrf; /* ... or VRF (if iface is unknown) */ - void (*hook)(struct object_lock *); /* Called when the lock succeeds */ - void *data; /* User data */ + event event; /* Enqueued when the lock succeeds */ + event_list *target; /* Where to put the event */ /* ... internal to lock manager, don't touch ... */ node n; /* Node in list of olocks */ int state; /* OLOCK_STATE_xxx */ @@ -50,6 +50,5 @@ void olock_init(void); #define OLOCK_STATE_FREE 0 #define OLOCK_STATE_LOCKED 1 #define OLOCK_STATE_WAITING 2 -#define OLOCK_STATE_EVENT 3 /* waiting for unlock processing */ #endif diff --git a/nest/neighbor.c b/nest/neighbor.c index 7b951366..934cd35c 100644 --- a/nest/neighbor.c +++ b/nest/neighbor.c @@ -59,9 +59,19 @@ static slab *neigh_slab; static list neigh_hash_table[NEIGH_HASH_SIZE], sticky_neigh_list; -static void neigh_do_notify(void *); -static void neigh_do_notify_main(void *); -static void neigh_free(neighbor *n); + +void if_link(struct iface *); +void if_unlink(struct iface *); +void ifa_link(struct ifa *); +void ifa_unlink(struct ifa *); + +extern list global_iface_list; + +extern DOMAIN(attrs) iface_domain; + +#define IFACE_LOCK LOCK_DOMAIN(attrs, iface_domain) +#define IFACE_UNLOCK UNLOCK_DOMAIN(attrs, iface_domain) +#define IFACE_ASSERT_LOCKED ASSERT_DIE(DOMAIN_IS_LOCKED(attrs, iface_domain)) static inline uint neigh_hash(struct proto *p, ip_addr a, struct iface *i) @@ -213,13 +223,14 @@ if_intersect(struct iface *ia, struct iface *ib) neighbor * neigh_find(struct proto *p, ip_addr a, struct iface *iface, uint flags) { + IFACE_LOCK; + neighbor *n; int class, scope = -1; uint h = neigh_hash(p, a, iface); struct iface *ifreq = iface; struct ifa *addr = NULL; - IFACE_LOCK; WALK_LIST(n, neigh_hash_table[h]) /* Search the cache */ if ((n->proto == p) && ipa_equal(n->addr, a) && (n->ifreq == iface)) { @@ -227,26 +238,24 @@ neigh_find(struct proto *p, ip_addr a, struct iface *iface, uint flags) return n; } -#define NOT_FOUND goto not_found - if (flags & NEF_IFACE) { if (ipa_nonzero(a) || !iface) - NOT_FOUND; + goto bad; } else { class = ipa_classify(a); if (class < 0) /* Invalid address */ - NOT_FOUND; + goto bad; if (((class & IADDR_SCOPE_MASK) == SCOPE_HOST) || (((class & IADDR_SCOPE_MASK) == SCOPE_LINK) && !iface) || !(class & IADDR_HOST)) - NOT_FOUND; /* Bad scope or a somecast */ + goto bad; /* Bad scope or a somecast */ } if ((flags & NEF_ONLINK) && !iface) - NOT_FOUND; + goto bad; if (iface) { @@ -260,47 +269,26 @@ neigh_find(struct proto *p, ip_addr a, struct iface *iface, uint flags) /* scope >= 0 <=> iface != NULL */ if ((scope < 0) && !(flags & NEF_STICKY)) - NOT_FOUND; + goto bad; n = sl_allocz(neigh_slab); add_tail(&neigh_hash_table[h], &n->n); add_tail((scope >= 0) ? &iface->neighbors : &sticky_neigh_list, &n->if_n); + proto_neigh_add_tail(&p->neighbors, n); n->addr = a; - n->ifa = addr; - n->iface = iface; - n->ifreq = ifreq; + ifa_link(n->ifa = addr); + if_link(n->iface = iface); + if_link(n->ifreq = ifreq); n->proto = p; n->flags = flags; n->scope = scope; - ASSERT_DIE(birdloop_inside(p->loop)); - - if (flags & NEF_NOTIFY_MAIN) - n->event = (event) { - .hook = neigh_do_notify_main, - .data = n, - .list = &global_event_list, - }; - else if (p->loop == &main_birdloop) - n->event = (event) { - .hook = neigh_do_notify, - .data = n, - .list = &global_event_list, - }; - else - { - birdloop_link(p->loop); - n->event = (event) { - .hook = neigh_do_notify, - .data = n, - .list = birdloop_event_list(p->loop), - }; - } + neigh_link(n); IFACE_UNLOCK; return n; -not_found: +bad: IFACE_UNLOCK; return NULL; } @@ -311,7 +299,7 @@ not_found: * * This functions dumps the contents of a given neighbor entry to debug output. */ -void +static void neigh_dump(neighbor *n) { debug("%p %I %s %s ", n, n->addr, @@ -333,11 +321,11 @@ neigh_dump(neighbor *n) void neigh_dump_all(void) { + IFACE_LOCK; + neighbor *n; int i; - IFACE_LOCK; - debug("Known neighbors:\n"); for(i=0; i<NEIGH_HASH_SIZE; i++) WALK_LIST(n, neigh_hash_table[i]) @@ -350,43 +338,21 @@ neigh_dump_all(void) static inline void neigh_notify(neighbor *n) { - if (!n->proto->neigh_notify) - return; - - ev_send(n->event.list, &n->event); -} - -static void -neigh_do_notify_main(void *data) -{ - neighbor *n = data; - PROTO_LOCKED_FROM_MAIN(n->proto) - neigh_do_notify(data); -} - -static void -neigh_do_notify(void *data) -{ - neighbor *n = data; - - ASSERT_DIE(birdloop_inside(n->proto->loop)); - - if (n->proto->proto_state != PS_STOP) - n->proto->neigh_notify(n); - - if ((n->scope < 0) && !(n->flags & NEF_STICKY)) - neigh_free(n); + IFACE_ASSERT_LOCKED; + if_enqueue_notify_to((struct iface_notification) { .type = IFNOT_NEIGHBOR, .n = n, }, &n->proto->iface_sub); } static void neigh_up(neighbor *n, struct iface *i, struct ifa *a, int scope) { DBG("Waking up sticky neighbor %I\n", n->addr); - n->iface = i; - n->ifa = a; + if_link(n->iface = i); + ifa_link(n->ifa = a); + n->scope = scope; - rem_node(&n->if_n); + rem_node(&n->if_n); /* HACK: Here the neighbor is always in the sticky list, + regardless whether it is sticky or not */ add_tail(&i->neighbors, &n->if_n); neigh_notify(n); @@ -396,32 +362,51 @@ static void neigh_down(neighbor *n) { DBG("Flushing neighbor %I on %s\n", n->addr, n->iface->name); - n->iface = NULL; - n->ifa = NULL; + n->scope = -1; rem_node(&n->if_n); add_tail(&sticky_neigh_list, &n->if_n); + ifa_unlink(n->ifa); + n->ifa = NULL; + + if_unlink(n->iface); + n->iface = NULL; + neigh_notify(n); } -static void -neigh_free(neighbor *n) +void +neigh_link(neighbor *n) { - ASSERT_DIE(birdloop_inside(n->proto->loop)); + IFACE_ASSERT_LOCKED; + n->uc++; +} + +void +neigh_unlink(neighbor *n) +{ + IFACE_ASSERT_LOCKED; + if (--n->uc) + return; + + struct proto *p = n->proto; + proto_neigh_rem_node(&p->neighbors, n); + + if ((p->proto_state == PS_DOWN) && EMPTY_TLIST(proto_neigh, &p->neighbors)) + proto_send_event(p, p->event); - if (n->flags & NEF_NOTIFY_MAIN) - ASSERT_DIE(birdloop_inside(&main_birdloop)); + n->proto = NULL; rem_node(&n->n); rem_node(&n->if_n); - if (n->event.list != &global_event_list) - birdloop_unlink(n->proto->loop); + ifa_unlink(n->ifa); + if_unlink(n->iface); + if_unlink(n->ifreq); - ev_postpone(&n->event); - sl_free(neigh_slab, n); + sl_free(n); } /** @@ -436,7 +421,7 @@ neigh_free(neighbor *n) void neigh_update(neighbor *n, struct iface *iface) { - ASSERT_IFACE_LOCKED; + IFACE_ASSERT_LOCKED; struct proto *p = n->proto; struct ifa *ifa = NULL; @@ -472,7 +457,8 @@ neigh_update(neighbor *n, struct iface *iface) { if (ifa != n->ifa) { - n->ifa = ifa; + ifa_unlink(n->ifa); + ifa_link(n->ifa = ifa); neigh_notify(n); } @@ -484,6 +470,12 @@ neigh_update(neighbor *n, struct iface *iface) if (n->scope >= 0) neigh_down(n); + if ((n->scope < 0) && !(n->flags & NEF_STICKY)) + { + neigh_unlink(n); + return; + } + if (scope >= 0) neigh_up(n, iface, ifa, scope); } @@ -501,12 +493,11 @@ neigh_update(neighbor *n, struct iface *iface) void neigh_if_up(struct iface *i) { + IFACE_ASSERT_LOCKED; struct iface *ii; neighbor *n; node *x, *y; - IFACE_LOCK; - /* Update neighbors that might be better off with the new iface */ WALK_LIST(ii, global_iface_list) if (!EMPTY_LIST(ii->neighbors) && (ii != i) && if_intersect(i, ii)) @@ -515,8 +506,6 @@ neigh_if_up(struct iface *i) WALK_LIST2_DELSAFE(n, x, y, sticky_neigh_list, if_n) neigh_update(n, i); - - IFACE_UNLOCK; } /** @@ -530,15 +519,12 @@ neigh_if_up(struct iface *i) void neigh_if_down(struct iface *i) { + IFACE_ASSERT_LOCKED; neighbor *n; node *x, *y; - IFACE_LOCK; - WALK_LIST2_DELSAFE(n, x, y, i->neighbors, if_n) neigh_update(n, i); - - IFACE_UNLOCK; } /** @@ -551,15 +537,12 @@ neigh_if_down(struct iface *i) void neigh_if_link(struct iface *i) { + IFACE_ASSERT_LOCKED; neighbor *n; node *x, *y; - IFACE_LOCK; - WALK_LIST2_DELSAFE(n, x, y, i->neighbors, if_n) neigh_notify(n); - - IFACE_UNLOCK; } /** @@ -575,12 +558,11 @@ neigh_if_link(struct iface *i) void neigh_ifa_up(struct ifa *a) { + IFACE_ASSERT_LOCKED; struct iface *i = a->iface, *ii; neighbor *n; node *x, *y; - IFACE_LOCK; - /* Update neighbors that might be better off with the new ifa */ WALK_LIST(ii, global_iface_list) if (!EMPTY_LIST(ii->neighbors) && ifa_intersect(a, ii)) @@ -590,59 +572,20 @@ neigh_ifa_up(struct ifa *a) /* Wake up all sticky neighbors that are reachable now */ WALK_LIST2_DELSAFE(n, x, y, sticky_neigh_list, if_n) neigh_update(n, i); - - IFACE_UNLOCK; } void neigh_ifa_down(struct ifa *a) { + IFACE_ASSERT_LOCKED; struct iface *i = a->iface; neighbor *n; node *x, *y; - IFACE_LOCK; - /* Update all neighbors whose scope has changed */ WALK_LIST2_DELSAFE(n, x, y, i->neighbors, if_n) if (n->ifa == a) neigh_update(n, i); - - IFACE_UNLOCK; -} - -static inline void -neigh_prune_one(neighbor *n) -{ - if (n->proto->proto_state != PS_DOWN) - return; - - neigh_free(n); -} - -/** - * neigh_prune - prune neighbor cache - * - * neigh_prune() examines all neighbor entries cached and removes those - * corresponding to inactive protocols. It's called whenever a protocol - * is shut down to get rid of all its heritage. - */ -void -neigh_prune(struct proto *p) -{ - neighbor *n; - node *m; - int i; - - IFACE_LOCK; - - DBG("Pruning neighbors\n"); - for(i=0; i<NEIGH_HASH_SIZE; i++) - WALK_LIST_DELSAFE(n, m, neigh_hash_table[i]) - if (n->proto == p) - neigh_prune_one(n); - - IFACE_UNLOCK; } /** diff --git a/nest/proto.c b/nest/proto.c index 2546e812..cd6a3faa 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -15,19 +15,17 @@ #include "lib/event.h" #include "lib/timer.h" #include "lib/string.h" -#include "lib/coro.h" #include "conf/conf.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/iface.h" #include "nest/cli.h" #include "filter/filter.h" #include "filter/f-inst.h" pool *proto_pool; -list proto_list; +list STATIC_LIST_INIT(proto_list); -static list protocol_list; -struct protocol *class_to_protocol[PROTOCOL__MAX]; +static list STATIC_LIST_INIT(protocol_list); #define CD(c, msg, args...) ({ if (c->debug & D_STATES) log(L_TRACE "%s.%s: " msg, c->proto->name, c->name ?: "?", ## args); }) #define PD(p, msg, args...) ({ if (p->debug & D_STATES) log(L_TRACE "%s: " msg, p->name, ## args); }) @@ -47,28 +45,20 @@ static char *c_states[] = { "DOWN", "START", "UP", "STOP", "RESTART" }; extern struct protocol proto_unix_iface; -static void channel_aux_request_refeed(struct channel_aux_table *cat); +static void channel_request_reload(struct channel *c); static void proto_rethink_goal(struct proto *p); static char *proto_state_name(struct proto *p); static void channel_init_limit(struct channel *c, struct limit *l, int dir, struct channel_limit *cf); static void channel_update_limit(struct channel *c, struct limit *l, int dir, struct channel_limit *cf); static void channel_reset_limit(struct channel *c, struct limit *l, int dir); static void channel_feed_end(struct channel *c); +static void channel_stop_export(struct channel *c); static void channel_export_stopped(struct rt_export_request *req); +static void channel_check_stopped(struct channel *c); static inline int proto_is_done(struct proto *p) { return (p->proto_state == PS_DOWN) && proto_is_inactive(p); } -static inline event_list *proto_event_list(struct proto *p) -{ return p->loop == &main_birdloop ? &global_event_list : birdloop_event_list(p->loop); } - -static inline event_list *proto_work_list(struct proto *p) -{ return p->loop == &main_birdloop ? &global_work_list : birdloop_event_list(p->loop); } - -static inline void proto_send_event(struct proto *p) -{ ev_send(proto_event_list(p), p->event); } - - static inline int channel_is_active(struct channel *c) { return (c->channel_state != CS_DOWN); } @@ -97,9 +87,7 @@ channel_export_log_state_change(struct rt_export_request *req, u8 state) switch (state) { case TES_FEEDING: - if (c->out_table) - rt_refresh_begin(&c->out_table->push); - else if (c->proto->feed_begin) + if (c->proto->feed_begin) c->proto->feed_begin(c, !c->refeeding); break; case TES_READY: @@ -189,16 +177,6 @@ proto_find_channel_by_name(struct proto *p, const char *n) return NULL; } -rte * channel_preimport(struct rt_import_request *req, rte *new, rte *old); -rte * channel_in_preimport(struct rt_import_request *req, rte *new, rte *old); - -void rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); -void rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); -void rt_feed_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe, rte **feed, uint count); -void rt_notify_accepted(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe, rte **feed, uint count); -void rt_notify_merged(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe, rte **feed, uint count); - - /** * proto_add_channel - connect protocol to a routing table * @p: protocol instance @@ -223,25 +201,22 @@ proto_add_channel(struct proto *p, struct channel_config *cf) c->channel = cf->channel; c->proto = p; c->table = cf->table->table; - - RT_LOCKED(c->table, t) - rt_lock_table(t); + rt_lock_table(c->table); c->in_filter = cf->in_filter; c->out_filter = cf->out_filter; + c->out_subprefix = cf->out_subprefix; channel_init_limit(c, &c->rx_limit, PLD_RX, &cf->rx_limit); channel_init_limit(c, &c->in_limit, PLD_IN, &cf->in_limit); channel_init_limit(c, &c->out_limit, PLD_OUT, &cf->out_limit); - c->rte_update_pool = lp_new_default(proto_pool); - c->net_type = cf->net_type; c->ra_mode = cf->ra_mode; c->preference = cf->preference; c->debug = cf->debug; c->merge_limit = cf->merge_limit; - c->in_keep_filtered = cf->in_keep_filtered; + c->in_keep = cf->in_keep; c->rpki_reload = cf->rpki_reload; c->channel_state = CS_DOWN; @@ -266,9 +241,7 @@ proto_remove_channel(struct proto *p UNUSED, struct channel *c) CD(c, "Removed", c->name); - RT_LOCKED(c->table, t) - rt_unlock_table(t); - + rt_unlock_table(c->table); rem_node(&c->n); mb_free(c); } @@ -309,51 +282,76 @@ proto_remove_channels(struct proto *p) proto_remove_channel(p, c); } +struct roa_subscription { + node roa_node; + struct settle settle; + struct channel *c; + struct rt_export_request req; +}; + static void -channel_roa_in_changed(void *_data) +channel_roa_in_changed(struct settle *se) { - struct channel *c = _data; + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, settle, se); + struct channel *c = s->c; CD(c, "Reload triggered by RPKI change"); - channel_request_reload(c); } static void -channel_roa_out_changed(void *_data) +channel_roa_out_changed(struct settle *se) { - struct channel *c = _data; + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, settle, se); + struct channel *c = s->c; + CD(c, "Feeding triggered by RPKI change"); c->refeed_pending = 1; + channel_stop_export(c); +} - if (c->out_req.hook) - rt_stop_export(&c->out_req, channel_export_stopped); +static void +channel_export_one_roa(struct rt_export_request *req, const net_addr *net UNUSED, struct rt_pending_export *first) +{ + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, req, req); + + /* TODO: use the information about what roa has changed */ + settle_kick(&s->settle, s->c->proto->loop); + + rpe_mark_seen_all(req->hook, first, NULL, NULL); } -/* Temporary code, subscriptions should be changed to resources */ -struct roa_subscription { - struct rt_subscription s; - node roa_node; -}; +static void +channel_dump_roa_req(struct rt_export_request *req) +{ + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, req, req); + struct channel *c = s->c; + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter.e, req->hook->table); + + debug(" Channel %s.%s ROA %s change notifier from table %s request %p\n", + c->proto->name, c->name, + (s->settle.hook == channel_roa_in_changed) ? "import" : "export", + tab->name, req); +} static int channel_roa_is_subscribed(struct channel *c, rtable *tab, int dir) { - void (*hook)(void *) = + void (*hook)(struct settle *) = dir ? channel_roa_in_changed : channel_roa_out_changed; struct roa_subscription *s; node *n; WALK_LIST2(s, n, c->roa_subscriptions, roa_node) - if ((s->s.tab == tab) && (s->s.event->hook == hook)) + if ((tab == SKIP_BACK(rtable, priv.exporter.e, s->req.hook->table)) + && (s->settle.hook == hook)) return 1; return 0; } - static void channel_roa_subscribe(struct channel *c, rtable *tab, int dir) { @@ -361,21 +359,41 @@ channel_roa_subscribe(struct channel *c, rtable *tab, int dir) return; struct roa_subscription *s = mb_allocz(c->proto->pool, sizeof(struct roa_subscription)); - s->s.event = ev_new_init(c->proto->pool, dir ? channel_roa_in_changed : channel_roa_out_changed, c); - s->s.event->list = proto_work_list(c->proto); - rt_subscribe(tab, &s->s); + *s = (struct roa_subscription) { + .settle = SETTLE_INIT(&c->roa_settle, dir ? channel_roa_in_changed : channel_roa_out_changed, NULL), + .c = c, + .req = { + .name = mb_sprintf(c->proto->pool, "%s.%s.roa-%s.%s", + c->proto->name, c->name, dir ? "in" : "out", tab->name), + .list = proto_work_list(c->proto), + .trace_routes = c->debug | c->proto->debug, + .dump_req = channel_dump_roa_req, + .export_one = channel_export_one_roa, + }, + }; add_tail(&c->roa_subscriptions, &s->roa_node); + rt_request_export(tab, &s->req); } static void -channel_roa_unsubscribe(struct roa_subscription *s) +channel_roa_unsubscribed(struct rt_export_request *req) { - rt_unsubscribe(&s->s); + struct roa_subscription *s = SKIP_BACK(struct roa_subscription, req, req); + struct channel *c = s->c; + rem_node(&s->roa_node); - rfree(s->s.event); mb_free(s); + + channel_check_stopped(c); +} + +static void +channel_roa_unsubscribe(struct roa_subscription *s) +{ + rt_stop_export(&s->req, channel_roa_unsubscribed); + settle_cancel(&s->settle); } static void @@ -395,7 +413,7 @@ channel_roa_subscribe_filter(struct channel *c, int dir) #ifdef CONFIG_BGP /* No automatic reload for BGP channels without in_table / out_table */ if (c->channel == &channel_bgp) - valid = dir ? !!c->in_table : !!c->out_table; + valid = dir ? ((c->in_keep & RIK_PREFILTER) == RIK_PREFILTER) : !!c->out_table; #endif struct filter_iterator fit; @@ -405,14 +423,8 @@ channel_roa_subscribe_filter(struct channel *c, int dir) { switch (fi->fi_code) { - case FI_ROA_CHECK_IMPLICIT: - tab = fi->i_FI_ROA_CHECK_IMPLICIT.rtc->table; - if (valid) channel_roa_subscribe(c, tab, dir); - found = 1; - break; - - case FI_ROA_CHECK_EXPLICIT: - tab = fi->i_FI_ROA_CHECK_EXPLICIT.rtc->table; + case FI_ROA_CHECK: + tab = fi->i_FI_ROA_CHECK.rtc->table; if (valid) channel_roa_subscribe(c, tab, dir); found = 1; break; @@ -449,14 +461,10 @@ channel_start_import(struct channel *c) return; } - int nlen = strlen(c->name) + strlen(c->proto->name) + 2; - char *rn = mb_allocz(c->proto->pool, nlen); - bsprintf(rn, "%s.%s", c->proto->name, c->name); - c->in_req = (struct rt_import_request) { - .name = rn, - .list = proto_work_list(c->proto), + .name = mb_sprintf(c->proto->pool, "%s.%s", c->proto->name, c->name), .trace_routes = c->debug | c->proto->debug, + .list = proto_work_list(c->proto), .dump_req = channel_dump_import_req, .log_state_change = channel_import_log_state_change, .preimport = channel_preimport, @@ -478,26 +486,24 @@ channel_start_export(struct channel *c) { if (c->out_req.hook) { - c->restart_export = 1; - log(L_WARN "%s.%s: Fast channel export restart", c->proto->name, c->name); + log(L_WARN "%s.%s: Attempted to start channel's already started export", c->proto->name, c->name); return; } ASSERT(c->channel_state == CS_UP); - int nlen = strlen(c->name) + strlen(c->proto->name) + 2; - char *rn = mb_allocz(c->proto->pool, nlen); - bsprintf(rn, "%s.%s", c->proto->name, c->name); c->out_req = (struct rt_export_request) { - .name = rn, + .name = mb_sprintf(c->proto->pool, "%s.%s", c->proto->name, c->name), .list = proto_work_list(c->proto), + .addr = c->out_subprefix, + .addr_mode = c->out_subprefix ? TE_ADDR_IN : TE_ADDR_NONE, .trace_routes = c->debug | c->proto->debug, .dump_req = channel_dump_export_req, .log_state_change = channel_export_log_state_change, }; - bmap_init(&c->export_map, c->proto->pool, 1024); - bmap_init(&c->export_reject_map, c->proto->pool, 1024); + bmap_init(&c->export_map, c->proto->pool, 16); + bmap_init(&c->export_reject_map, c->proto->pool, 16); channel_reset_limit(c, &c->out_limit, PLD_OUT); @@ -531,32 +537,28 @@ channel_check_stopped(struct channel *c) switch (c->channel_state) { case CS_STOP: - if (c->out_req.hook || c->in_req.hook || c->out_table || c->in_table) + if (!EMPTY_LIST(c->roa_subscriptions) || c->out_req.hook || c->in_req.hook || c->reload_req.hook) return; channel_set_state(c, CS_DOWN); - proto_send_event(c->proto); + proto_send_event(c->proto, c->proto->event); break; case CS_PAUSE: - if (c->out_req.hook) + if (!EMPTY_LIST(c->roa_subscriptions) || c->out_req.hook || c->reload_req.hook) return; channel_set_state(c, CS_START); break; - default: - bug("Stopped channel in a bad state: %d", c->channel_state); } DBG("%s.%s: Channel requests/hooks stopped (in state %s)\n", c->proto->name, c->name, c_states[c->channel_state]); } void -channel_import_stopped(void *_c) +channel_import_stopped(struct rt_import_request *req) { - struct channel *c = _c; - - c->in_req.hook = NULL; + struct channel *c = SKIP_BACK(struct channel, in_req, req); mb_free(c->in_req.name); c->in_req.name = NULL; @@ -577,8 +579,10 @@ channel_export_stopped(struct rt_export_request *req) c->refeeding = 1; c->refeed_pending = 0; - bmap_reset(&c->export_map, 1024); - bmap_reset(&c->export_reject_map, 1024); + channel_reset_limit(c, &c->out_limit, PLD_OUT); + + bmap_reset(&c->export_map, 16); + bmap_reset(&c->export_reject_map, 16); rt_request_export(c->table, req); return; @@ -590,20 +594,12 @@ channel_export_stopped(struct rt_export_request *req) bmap_free(&c->export_map); bmap_free(&c->export_reject_map); - if (c->restart_export) - { - c->restart_export = 0; - channel_start_export(c); - } - else - channel_check_stopped(c); + channel_check_stopped(c); } static void channel_feed_end(struct channel *c) { - struct rt_export_request *req = &c->out_req; - /* Reset export limit if the feed ended with acceptable number of exported routes */ struct limit *l = &c->out_limit; if (c->refeeding && @@ -615,377 +611,96 @@ channel_feed_end(struct channel *c) channel_reset_limit(c, &c->out_limit, PLD_OUT); c->refeed_pending = 1; - rt_stop_export(req, channel_export_stopped); + channel_stop_export(c); return; } - if (c->out_table) - rt_refresh_end(&c->out_table->push); - else if (c->proto->feed_end) + if (c->proto->feed_end) c->proto->feed_end(c); if (c->refeed_pending) - rt_stop_export(req, channel_export_stopped); -} - -#define CHANNEL_AUX_TABLE_DUMP_REQ(inout, imex, pgimex, pushget) static void \ - channel_##inout##_##pushget##_dump_req(struct rt_##pgimex##_request *req) { \ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, pushget, req); \ - debug(" Channel %s.%s " #imex " table " #pushget " request %p\n", cat->c->proto->name, cat->c->name, req); } - -CHANNEL_AUX_TABLE_DUMP_REQ(in, import, import, push) -CHANNEL_AUX_TABLE_DUMP_REQ(in, import, export, get) -CHANNEL_AUX_TABLE_DUMP_REQ(out, export, import, push) -CHANNEL_AUX_TABLE_DUMP_REQ(out, export, export, get) - -#undef CHANNEL_AUX_TABLE_DUMP_REQ - -static uint channel_aux_imex(struct channel_aux_table *cat) -{ - if (cat->c->in_table == cat) - return 0; - else if (cat->c->out_table == cat) - return 1; + channel_stop_export(c); else - bug("Channel aux table must be in_table or out_table"); -} - -static void -channel_aux_stopped(void *data) -{ - struct channel_aux_table *cat; - - RT_LOCKED((rtable *) data, t) - cat = t->config->owner; - - ASSERT_DIE(cat->push.hook == NULL); - ASSERT_DIE(cat->get.hook == NULL); - ASSERT_DIE(cat->stop_pending); - - struct channel *c = cat->c; - - if (channel_aux_imex(cat)) - c->out_table = NULL; - else - c->in_table = NULL; - - mb_free(cat); - channel_check_stopped(c); -} - -static void -channel_aux_import_stopped(void *_cat) -{ - struct channel_aux_table *cat = _cat; - - cat->push.hook = NULL; - - if (!cat->get.hook) - RT_LOCKED(cat->tab, t) - { - t->delete = channel_aux_stopped; - rt_unlock_table(t); - } + c->refeeding = 0; } -static void -channel_aux_export_stopped(struct rt_export_request *req) +/* Called by protocol for reload from in_table */ +void +channel_schedule_reload(struct channel *c) { - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - req->hook = NULL; + ASSERT(c->in_req.hook); - if (cat->refeed_pending && !cat->stop_pending) + if (c->reload_req.hook) { - cat->refeed_pending = 0; - rt_request_export(cat->tab, req); - + CD(c, "Reload triggered before the previous one has finished"); + c->reload_pending = 1; return; } - if (!cat->push.hook) - RT_LOCKED(cat->tab, t) - { - t->delete = channel_aux_stopped; - rt_unlock_table(t); - } + rt_refresh_begin(&c->in_req); + rt_request_export(c->table, &c->reload_req); } static void -channel_aux_stop(struct channel_aux_table *cat) +channel_reload_stopped(struct rt_export_request *req) { - ASSERT_DIE(!cat->stop_pending); - - cat->stop_pending = 1; - - RT_LOCKED(cat->tab, t) - rt_lock_table(t); - - cat->push_stopped = (event) { - .hook = channel_aux_import_stopped, - .data = cat, - .list = proto_event_list(cat->c->proto), - }; - - rt_stop_import(&cat->push, &cat->push_stopped); - rt_stop_export(&cat->get, channel_aux_export_stopped); -} + struct channel *c = SKIP_BACK(struct channel, reload_req, req); -static void -channel_push_log_state_change(struct rt_import_request *req, u8 state) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, push, req); - const char *imex = channel_aux_imex(cat) ? "export" : "import"; - CD(cat->c, "Channel %s table import state changed to %s", imex, rt_import_state_name(state)); -} - -static void -channel_get_log_state_change(struct rt_export_request *req, u8 state) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - const char *imex = channel_aux_imex(cat) ? "export" : "import"; - CD(cat->c, "Channel %s table export state changed to %s", imex, rt_export_state_name(state)); - - switch (state) + /* Restart reload */ + if (c->reload_pending) { - case TES_FEEDING: - if (imex && cat->c->proto->feed_begin) - cat->c->proto->feed_begin(cat->c, !cat->c->refeeding); - else if (!imex) - rt_refresh_begin(&cat->c->in_req); - break; - - case TES_READY: - if (imex && cat->c->proto->feed_end) - cat->c->proto->feed_end(cat->c); - else if (!imex) - rt_refresh_end(&cat->c->in_req); - - if (cat->refeed_pending) - rt_stop_export(&cat->get, channel_aux_export_stopped); - - break; - } -} - -void rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); - -static int -channel_aux_export_one_any(struct rt_export_request *req, struct rt_pending_export *rpe, rte **new, rte **old) -{ - struct rte_src *src = rpe->new ? rpe->new->rte.src : rpe->old->rte.src; - *old = RTES_OR_NULL(rpe->old); - struct rte_storage *new_stored; - - while (rpe) - { - new_stored = rpe->new; - rpe_mark_seen(req->hook, rpe); - rpe = rpe_next(rpe, src); - } - - *new = RTES_CLONE(new_stored, *new); - - return (*new || *old) && (&new_stored->rte != *old); -} - -static int -channel_aux_export_one_best(struct rt_export_request *req, struct rt_pending_export *rpe, rte **new, rte **old) -{ - *old = RTES_OR_NULL(rpe->old_best); - struct rte_storage *new_stored; - - while (rpe) - { - new_stored = rpe->new_best; - rpe_mark_seen(req->hook, rpe); - rpe = rpe_next(rpe, NULL); + c->reload_pending = 0; + channel_request_reload(c); } - *new = RTES_CLONE(new_stored, *new); - - return (*new || *old) && (&new_stored->rte != *old); -} - -static void -channel_in_export_one_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - - rte n0, *new = &n0, *old; - if (channel_aux_export_one_any(req, rpe, &new, &old)) - rte_update_direct(cat->c, net, new, old ? old->src : new->src); + if (c->channel_state != CS_UP) + channel_check_stopped(c); } static void -channel_in_export_one_best(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +channel_reload_log_state_change(struct rt_export_request *req, u8 state) { - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - - rte n0, *new = &n0, *old; - if (channel_aux_export_one_best(req, rpe, &new, &old)) - rte_update_direct(cat->c, net, new, old ? old->src : new->src); -} + struct channel *c = SKIP_BACK(struct channel, reload_req, req); -static void -channel_in_export_bulk_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - for (uint i=0; i<count; i++) + if (state == TES_READY) { - rte n0 = *feed[i]; - rte_update_direct(cat->c, net, &n0, n0.src); - } -} + if (c->channel_state == CS_UP) + rt_refresh_end(&c->in_req); -static void -channel_in_export_bulk_best(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - if (!count) - return; - - rte n0 = *feed[0]; - rte_update_direct(cat->c, net, &n0, n0.src); -} - -void do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte *old); - -static void -channel_out_export_one_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - rte n0, *new = &n0, *old; - if (channel_aux_export_one_any(req, rpe, &new, &old)) - do_rt_notify_direct(cat->c, net, new, old); -} - -static void -channel_out_export_one_best(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - rte n0, *new = &n0, *old; - if (channel_aux_export_one_best(req, rpe, &new, &old)) - do_rt_notify_direct(cat->c, net, new, old); + rt_stop_export(req, channel_reload_stopped); + } } static void -channel_out_export_bulk(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) +channel_reload_dump_req(struct rt_export_request *req) { - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, get, req); - if (cat->c->ra_mode != RA_ANY) - ASSERT_DIE(count <= 1); - - for (uint i=0; i<count; i++) - { - rte n0 = *feed[i]; - do_rt_notify_direct(cat->c, net, &n0, NULL); - } + struct channel *c = SKIP_BACK(struct channel, reload_req, req); + debug(" Channel %s.%s import reload request %p\n", c->proto->name, c->name, req); } /* Called by protocol to activate in_table */ -void -channel_setup_in_table(struct channel *c, int best) -{ - int nlen = sizeof("import") + strlen(c->name) + strlen(c->proto->name) + 3; - - struct { - struct channel_aux_table cat; - struct rtable_config tab_cf; - char name[0]; - } *cat = mb_allocz(c->proto->pool, sizeof(*cat) + nlen); - - bsprintf(cat->name, "%s.%s.import", c->proto->name, c->name); - - cat->tab_cf.owner = cat; - cat->tab_cf.name = cat->name; - cat->tab_cf.addr_type = c->net_type; - cat->tab_cf.cork_limit = 4 * page_size / sizeof(struct rt_pending_export); - - c->in_table = &cat->cat; - c->in_table->push = (struct rt_import_request) { - .name = cat->name, - .list = proto_work_list(c->proto), - .trace_routes = c->debug | c->proto->debug, - .dump_req = channel_in_push_dump_req, - .log_state_change = channel_push_log_state_change, - .preimport = channel_in_preimport, - }; - c->in_table->get = (struct rt_export_request) { - .name = cat->name, - .list = proto_work_list(c->proto), - .trace_routes = c->debug | c->proto->debug, - .dump_req = channel_in_get_dump_req, - .log_state_change = channel_get_log_state_change, - .export_one = best ? channel_in_export_one_best : channel_in_export_one_any, - .export_bulk = best ? channel_in_export_bulk_best : channel_in_export_bulk_any, - }; - - c->in_table->c = c; - c->in_table->tab = rt_setup(c->proto->pool, &cat->tab_cf); - - rt_request_import(c->in_table->tab, &c->in_table->push); - rt_request_export(c->in_table->tab, &c->in_table->get); -} - -/* Called by protocol to activate out_table */ -void -channel_setup_out_table(struct channel *c) +static void +channel_setup_in_table(struct channel *c) { - int nlen = sizeof("export") + strlen(c->name) + strlen(c->proto->name) + 3; - - struct { - struct channel_aux_table cat; - struct rtable_config tab_cf; - char name[0]; - } *cat = mb_allocz(c->proto->pool, sizeof(*cat) + nlen); - - bsprintf(cat->name, "%s.%s.export", c->proto->name, c->name); - - cat->tab_cf.owner = cat; - cat->tab_cf.name = cat->name; - cat->tab_cf.addr_type = c->net_type; - cat->tab_cf.cork_limit = 4 * page_size / sizeof(struct rt_pending_export); - - c->out_table = &cat->cat; - c->out_table->push = (struct rt_import_request) { - .name = cat->name, + c->reload_req = (struct rt_export_request) { + .name = mb_sprintf(c->proto->pool, "%s.%s.import", c->proto->name, c->name), .list = proto_work_list(c->proto), .trace_routes = c->debug | c->proto->debug, - .dump_req = channel_out_push_dump_req, - .log_state_change = channel_push_log_state_change, + .export_bulk = channel_reload_export_bulk, + .dump_req = channel_reload_dump_req, + .log_state_change = channel_reload_log_state_change, }; - c->out_table->get = (struct rt_export_request) { - .name = cat->name, - .list = proto_work_list(c->proto), - .trace_routes = c->debug | c->proto->debug, - .dump_req = channel_out_get_dump_req, - .log_state_change = channel_get_log_state_change, - .export_one = (c->ra_mode == RA_ANY) ? channel_out_export_one_any : channel_out_export_one_best, - .export_bulk = channel_out_export_bulk, - }; - - c->out_table->c = c; - c->out_table->tab = rt_setup(c->proto->pool, &cat->tab_cf); - - rt_request_import(c->out_table->tab, &c->out_table->push); - rt_request_export(c->out_table->tab, &c->out_table->get); } -static void -channel_aux_request_refeed(struct channel_aux_table *cat) -{ - if (cat->stop_pending) - return; - - cat->refeed_pending = 1; - rt_stop_export(&cat->get, channel_aux_export_stopped); -} static void channel_do_start(struct channel *c) { c->proto->active_channels++; + if ((c->in_keep & RIK_PREFILTER) == RIK_PREFILTER) + channel_setup_in_table(c); + CALL(c->channel->start, c); channel_start_import(c); @@ -1005,36 +720,26 @@ channel_do_up(struct channel *c) static void channel_do_pause(struct channel *c) { - /* Stop export */ - if (c->out_req.hook) - { - rt_stop_export(&c->out_req, channel_export_stopped); - c->refeeding = 0; - } - + /* Drop ROA subscriptions */ channel_roa_unsubscribe_all(c); + + /* Need to abort feeding */ + c->reload_pending = 0; + + if (c->reload_req.hook && c->reload_req.hook->export_state != TES_STOP) + rt_stop_export(&c->reload_req, channel_reload_stopped); + + /* Stop export */ + c->refeed_pending = 0; + channel_stop_export(c); } static void channel_do_stop(struct channel *c) { - /* Drop auxiliary tables */ - if (c->in_table) - channel_aux_stop(c->in_table); - - if (c->out_table) - channel_aux_stop(c->out_table); - /* Stop import */ if (c->in_req.hook) - { - c->in_stopped = (event) { - .hook = channel_import_stopped, - .data = c, - .list = proto_event_list(c->proto), - }; - rt_stop_import(&c->in_req, &c->in_stopped); - } + rt_stop_import(&c->in_req, channel_import_stopped); c->gr_wait = 0; if (c->gr_lock) @@ -1042,24 +747,27 @@ channel_do_stop(struct channel *c) CALL(c->channel->shutdown, c); - channel_roa_unsubscribe_all(c); } static void channel_do_down(struct channel *c) { - ASSERT(!c->out_req.hook && !c->in_req.hook && !c->out_table && !c->in_table); + ASSERT(!c->reload_req.hook); c->proto->active_channels--; memset(&c->import_stats, 0, sizeof(struct channel_import_stats)); memset(&c->export_stats, 0, sizeof(struct channel_export_stats)); + c->out_table = NULL; + + /* The in_table and out_table are going to be freed by freeing their resource pools. */ + CALL(c->channel->cleanup, c); /* Schedule protocol shutddown */ if (proto_is_done(c->proto)) - proto_send_event(c->proto); + proto_send_event(c->proto, c->proto->event); } void @@ -1085,7 +793,7 @@ channel_set_state(struct channel *c, uint state) break; case CS_UP: - ASSERT(cs == CS_DOWN || cs == CS_START || cs == CS_PAUSE); + ASSERT(cs == CS_DOWN || cs == CS_START); if (cs == CS_DOWN) channel_do_start(c); @@ -1135,32 +843,28 @@ channel_set_state(struct channel *c, uint state) * completed, it will switch back to ES_READY. This function can be called * even when feeding is already running, in that case it is restarted. */ -static void -channel_request_table_feeding(struct channel *c) +void +channel_request_feeding(struct channel *c) { ASSERT(c->out_req.hook); + if (c->refeed_pending) + return; + c->refeed_pending = 1; - rt_stop_export(&c->out_req, channel_export_stopped); + channel_stop_export(c); } -void -channel_request_feeding(struct channel *c) +static void +channel_stop_export(struct channel *c) { - if (c->gr_wait || !c->proto->rt_notify) + if (!c->out_req.hook || (c->out_req.hook->export_state == TES_STOP)) return; - CD(c, "Refeed requested"); - - ASSERT_DIE(c->out_req.hook); - - if (c->out_table) - channel_aux_request_refeed(c->out_table); - else - channel_request_table_feeding(c); + rt_stop_export(&c->out_req, channel_export_stopped); } -void +static void channel_request_reload(struct channel *c) { ASSERT(c->in_req.hook); @@ -1168,31 +872,12 @@ channel_request_reload(struct channel *c) CD(c, "Reload requested"); - if (c->in_table) - channel_aux_request_refeed(c->in_table); + if ((c->in_keep & RIK_PREFILTER) == RIK_PREFILTER) + channel_schedule_reload(c); else c->proto->reload_routes(c); } -void -channel_refresh_begin(struct channel *c) -{ - CD(c, "Channel route refresh begin"); - if (c->in_table) - rt_refresh_begin(&c->in_table->push); - else - rt_refresh_begin(&c->in_req); -} - -void -channel_refresh_end(struct channel *c) -{ - if (c->in_table) - rt_refresh_end(&c->in_table->push); - else - rt_refresh_end(&c->in_req); -} - const struct channel_class channel_basic = { .channel_size = sizeof(struct channel), .config_size = sizeof(struct channel_config) @@ -1212,7 +897,7 @@ channel_config_new(const struct channel_class *cc, const char *name, uint net_ty if (proto->net_type && (net_type != proto->net_type)) cf_error("Different channel type"); - tab = new_config->def_tables[net_type]; + tab = rt_get_default_table(new_config, net_type); } if (!cc) @@ -1231,6 +916,11 @@ channel_config_new(const struct channel_class *cc, const char *name, uint net_ty cf->debug = new_config->channel_default_debug; cf->rpki_reload = 1; + cf->roa_settle = (struct settle_config) { + .min = 1 S, + .max = 20 S, + }; + add_tail(&proto->channels, &cf->n); return cf; @@ -1250,6 +940,7 @@ channel_config_get(const struct channel_class *cc, const char *name, uint net_ty cf_error("Multiple %s channels", name); cf->parent = proto; + cf->copy = 1; return cf; } @@ -1275,8 +966,16 @@ static int reconfigure_type; /* Hack to propagate type info to channel_reconfig int channel_reconfigure(struct channel *c, struct channel_config *cf) { + /* Touched by reconfiguration */ + c->stale = 0; + /* FIXME: better handle these changes, also handle in_keep_filtered */ - if ((c->table != cf->table->table) || (cf->ra_mode && (c->ra_mode != cf->ra_mode))) + if ((c->table != cf->table->table) || + (cf->ra_mode && (c->ra_mode != cf->ra_mode)) || + (cf->in_keep != c->in_keep) || + cf->out_subprefix && c->out_subprefix && + !net_equal(cf->out_subprefix, c->out_subprefix) || + (!cf->out_subprefix != !c->out_subprefix)) return 0; /* Note that filter_same() requires arguments in (new, old) order */ @@ -1301,11 +1000,27 @@ channel_reconfigure(struct channel *c, struct channel_config *cf) // c->ra_mode = cf->ra_mode; c->merge_limit = cf->merge_limit; c->preference = cf->preference; + c->out_req.addr = c->out_subprefix = cf->out_subprefix; c->debug = cf->debug; c->in_req.trace_routes = c->out_req.trace_routes = c->debug | c->proto->debug; - c->in_keep_filtered = cf->in_keep_filtered; c->rpki_reload = cf->rpki_reload; + if ( (c->roa_settle.min != cf->roa_settle.min) + || (c->roa_settle.max != cf->roa_settle.max)) + { + c->roa_settle = cf->roa_settle; + + struct roa_subscription *s; + node *n; + + WALK_LIST2(s, n, c->roa_subscriptions, roa_node) + { + s->settle.cf = cf->roa_settle; + if (settle_active(&s->settle)) + settle_kick(&s->settle, &main_birdloop); + } + } + /* Execute channel-specific reconfigure hook */ if (c->channel->reconfigure && !c->channel->reconfigure(c, cf, &import_changed, &export_changed)) return 0; @@ -1348,7 +1063,7 @@ channel_reconfigure(struct channel *c, struct channel_config *cf) channel_request_reload(c); if (export_changed) - channel_request_table_feeding(c); + channel_request_feeding(c); done: CD(c, "Reconfigured"); @@ -1399,6 +1114,11 @@ proto_configure_channel(struct proto *p, struct channel **pc, struct channel_con static void proto_cleanup(struct proto *p) { + CALL(p->proto->cleanup, p); + + rfree(p->pool); + p->pool = NULL; + p->active = 0; proto_log_state_change(p); proto_rethink_goal(p); @@ -1409,13 +1129,13 @@ proto_loop_stopped(void *ptr) { struct proto *p = ptr; - ASSERT_DIE(birdloop_inside(&main_birdloop)); + birdloop_enter(&main_birdloop); + birdloop_free(p->loop); p->loop = &main_birdloop; - p->pool = NULL; - p->event->list = NULL; - proto_cleanup(p); + + birdloop_leave(&main_birdloop); } static void @@ -1425,8 +1145,7 @@ proto_event(void *ptr) if (p->do_stop) { - if (p->proto == &proto_unix_iface) - if_flush_ifaces(p); + iface_unsubscribe(&p->iface_sub); p->do_stop = 0; } @@ -1435,11 +1154,7 @@ proto_event(void *ptr) if (p->loop != &main_birdloop) birdloop_stop_self(p->loop, proto_loop_stopped, p); else - { - rp_free(p->pool, proto_pool); - p->pool = NULL; proto_cleanup(p); - } } @@ -1499,24 +1214,15 @@ proto_start(struct proto *p) DBG("Kicking %s up\n", p->name); PD(p, "Starting"); - int ns = strlen("Protocol ") + strlen(p->cf->name) + 1; - void *nb = mb_alloc(proto_pool, ns); - ASSERT_DIE(ns - 1 == bsnprintf(nb, ns, "Protocol %s", p->cf->name)); + p->pool = rp_newf(proto_pool, "Protocol %s", p->cf->name); if (graceful_restart_state == GRS_INIT) p->gr_recovery = 1; - if (p->cf->loop_order == DOMAIN_ORDER(the_bird)) - p->pool = rp_new(proto_pool, &main_birdloop, nb); - else - { - p->loop = birdloop_new(proto_pool, p->cf->loop_order, nb); - p->pool = birdloop_pool(p->loop); - } - - p->event->list = proto_event_list(p); + if (p->cf->loop_order != DOMAIN_ORDER(the_bird)) + p->loop = birdloop_new(p->pool, p->cf->loop_order, p->pool->name, p->cf->loop_max_latency); - mb_move(nb, p->pool); + p->iface_sub.target = proto_event_list(p); PROTO_LOCKED_FROM_MAIN(p) proto_notify_state(p, (p->proto->start ? p->proto->start(p) : PS_UP)); @@ -1674,7 +1380,7 @@ proto_reconfigure(struct proto *p, struct proto_config *oc, struct proto_config (nc->vrf != oc->vrf)) return 0; - p->name = nc->name; + p->sources.name = p->name = nc->name; p->debug = nc->debug; p->mrtdump = nc->mrtdump; reconfigure_type = type; @@ -1735,6 +1441,8 @@ protos_commit(struct config *new, struct config *old, int force_reconfig, int ty p = oc->proto; sym = cf_find_symbol(new, oc->name); + struct birdloop *proto_loop = PROTO_ENTER_FROM_MAIN(p); + /* Handle dynamic protocols */ if (!sym && oc->parent && !new->shutdown) { @@ -1760,13 +1468,10 @@ protos_commit(struct config *new, struct config *old, int force_reconfig, int ty nc->proto = p; /* We will try to reconfigure protocol p */ - if (!force_reconfig) + if (!force_reconfig && proto_reconfigure(p, oc, nc, type)) { - int ok; - PROTO_LOCKED_FROM_MAIN(p) - ok = proto_reconfigure(p, oc, nc, type); - if (ok) - continue; + PROTO_LEAVE_FROM_MAIN(proto_loop); + continue; } if (nc->parent) @@ -1805,6 +1510,8 @@ protos_commit(struct config *new, struct config *old, int force_reconfig, int ty } p->reconfiguring = 1; + PROTO_LEAVE_FROM_MAIN(proto_loop); + config_add_obstacle(old); proto_rethink_goal(p); } @@ -2088,7 +1795,7 @@ protos_dump_all(void) debug("Protocols:\n"); struct proto *p; - WALK_LIST(p, proto_list) + WALK_LIST(p, proto_list) PROTO_LOCKED_FROM_MAIN(p) { #define DPF(x) (p->x ? " " #x : "") debug(" protocol %s (%p) state %s with %d active channels flags: %s%s%s%s\n", @@ -2107,20 +1814,6 @@ protos_dump_all(void) debug("\tChannel state: %s/%s/%s\n", c_states[c->channel_state], c->in_req.hook ? rt_import_state_name(rt_import_get_state(c->in_req.hook)) : "-", c->out_req.hook ? rt_export_state_name(rt_export_get_state(c->out_req.hook)) : "-"); - if (c->in_table) - { - debug("\tInput aux table:\n"); - rt_dump_hooks(c->in_table->tab); - rt_dump(c->in_table->tab); - debug("\tEnd of input aux table.\n"); - } - if (c->out_table) - { - debug("\tOutput aux table:\n"); - rt_dump_hooks(c->in_table->tab); - rt_dump(c->in_table->tab); - debug("\tEnd of output aux table.\n"); - } } if (p->proto->dump && (p->proto_state != PS_DOWN)) @@ -2140,14 +1833,13 @@ void proto_build(struct protocol *p) { add_tail(&protocol_list, &p->n); - ASSERT(p->class); - ASSERT(!class_to_protocol[p->class]); - class_to_protocol[p->class] = p; } /* FIXME: convert this call to some protocol hook */ extern void bfd_init_all(void); +void protos_build_gen(void); + /** * protos_build - build a protocol list * @@ -2160,46 +1852,9 @@ extern void bfd_init_all(void); void protos_build(void) { - init_list(&proto_list); - init_list(&protocol_list); + protos_build_gen(); - proto_build(&proto_device); -#ifdef CONFIG_RADV - proto_build(&proto_radv); -#endif -#ifdef CONFIG_RIP - proto_build(&proto_rip); -#endif -#ifdef CONFIG_STATIC - proto_build(&proto_static); -#endif -#ifdef CONFIG_MRT - proto_build(&proto_mrt); -#endif -#ifdef CONFIG_OSPF - proto_build(&proto_ospf); -#endif -#ifdef CONFIG_PIPE - proto_build(&proto_pipe); -#endif -#ifdef CONFIG_BGP - proto_build(&proto_bgp); -#endif -#ifdef CONFIG_BFD - proto_build(&proto_bfd); - bfd_init_all(); -#endif -#ifdef CONFIG_BABEL - proto_build(&proto_babel); -#endif -#ifdef CONFIG_RPKI - proto_build(&proto_rpki); -#endif -#ifdef CONFIG_PERF - proto_build(&proto_perf); -#endif - - proto_pool = rp_new(&root_pool, &main_birdloop, "Protocols"); + proto_pool = rp_new(&root_pool, "Protocols"); } @@ -2207,29 +1862,40 @@ protos_build(void) int proto_restart; static void -proto_shutdown_loop(void *data UNUSED) +proto_restart_event_hook(void *_p) { - struct proto *p, *p_next; + struct proto *p = _p; + if (!p->down_sched) + return; - WALK_LIST_DELSAFE(p, p_next, proto_list) - if (p->down_sched) - { - proto_restart = (p->down_sched == PDS_RESTART); + proto_restart = (p->down_sched == PDS_RESTART); + p->disabled = 1; + proto_rethink_goal(p); - p->disabled = 1; - proto_rethink_goal(p); - if (proto_restart) - { - p->disabled = 0; - proto_rethink_goal(p); - } - } + p->restart_event = NULL; + p->restart_timer = NULL; + + if (proto_restart) + /* No need to call proto_rethink_goal() here again as the proto_cleanup() routine will + * call it after the protocol stops ... and both these routines are fixed to main_birdloop. + */ + p->disabled = 0; } -static event proto_schedule_down_event = { - .hook = proto_shutdown_loop, - .list = &global_event_list, -}; +static void +proto_send_restart_event(struct proto *p) +{ + if (!p->restart_event) + p->restart_event = ev_new_init(p->pool, proto_restart_event_hook, p); + + ev_send(&global_event_list, p->restart_event); +} + +static void +proto_send_restart_event_from_timer(struct timer *t) +{ + proto_send_restart_event((struct proto *) t->data); +} static inline void proto_schedule_down(struct proto *p, byte restart, byte code) @@ -2244,7 +1910,20 @@ proto_schedule_down(struct proto *p, byte restart, byte code) p->down_sched = restart ? PDS_RESTART : PDS_DISABLE; p->down_code = code; - ev_send_self(&proto_schedule_down_event); + if (!restart) + { + if (p->restart_timer && tm_active(p->restart_timer)) + tm_stop(p->restart_timer); + + proto_send_restart_event(p); + } + else + { + if (!p->restart_timer) + p->restart_timer = tm_new_init(p->pool, proto_send_restart_event_from_timer, p, 0, 0); + + tm_start_max_in(p->restart_timer, 250 MS, p->loop); + } } /** @@ -2386,23 +2065,19 @@ static struct rte_owner_class default_rte_owner_class; static inline void proto_do_start(struct proto *p) { - ASSERT_DIE(birdloop_inside(p->loop)); - p->active = 1; - rt_init_sources(&p->sources, p->name, proto_work_list(p)); + rt_init_sources(&p->sources, p->name, proto_event_list(p)); if (!p->sources.class) p->sources.class = &default_rte_owner_class; if (!p->cf->late_if_feed) - if_feed_baby(p); + iface_subscribe(&p->iface_sub); } static void proto_do_up(struct proto *p) { - ASSERT_DIE(birdloop_inside(p->loop)); - if (!p->main_source) p->main_source = rt_get_source(p, 0); // Locked automaticaly @@ -2410,7 +2085,7 @@ proto_do_up(struct proto *p) proto_start_channels(p); if (p->cf->late_if_feed) - if_feed_baby(p); + iface_subscribe(&p->iface_sub); } static inline void @@ -2435,18 +2110,17 @@ proto_do_stop(struct proto *p) rt_destroy_sources(&p->sources, p->event); p->do_stop = 1; - proto_send_event(p); + proto_send_event(p, p->event); } static void proto_do_down(struct proto *p) { p->down_code = 0; - neigh_prune(p); /* Shutdown is finished in the protocol event */ if (proto_is_done(p)) - proto_send_event(p); + proto_send_event(p, p->event); } @@ -2552,18 +2226,18 @@ channel_show_stats(struct channel *c) u32 in_routes = c->in_limit.count; u32 out_routes = c->out_limit.count; - if (c->in_keep_filtered) + if (c->in_keep) cli_msg(-1006, " Routes: %u imported, %u filtered, %u exported, %u preferred", in_routes, (rx_routes - in_routes), out_routes, SRI(pref)); else cli_msg(-1006, " Routes: %u imported, %u exported, %u preferred", in_routes, out_routes, SRI(pref)); - cli_msg(-1006, " Route change stats: received rejected filtered ignored limited accepted"); - cli_msg(-1006, " Import updates: %10u %10u %10u %10u %10u %10u", + cli_msg(-1006, " Route change stats: received rejected filtered ignored RX limit IN limit accepted"); + cli_msg(-1006, " Import updates: %10u %10u %10u %10u %10u %10u %10u", SCI(updates_received), SCI(updates_invalid), SCI(updates_filtered), SRI(updates_ignored), - SCI(updates_limited_rx) + SCI(updates_limited_in), + SCI(updates_limited_rx), SCI(updates_limited_in), SRI(updates_accepted)); cli_msg(-1006, " Import withdraws: %10u %10u --- %10u --- %10u", SCI(withdraws_received), SCI(withdraws_invalid), diff --git a/nest/protocol.h b/nest/protocol.h index a4b6152b..01153162 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -9,10 +9,12 @@ #ifndef _BIRD_PROTOCOL_H_ #define _BIRD_PROTOCOL_H_ -#include "lib/lists.h" +#include "lib/tlists.h" #include "lib/resource.h" #include "lib/event.h" -#include "nest/route.h" +#include "nest/iface.h" +#include "lib/settle.h" +#include "nest/rt.h" #include "nest/limit.h" #include "conf/conf.h" @@ -37,38 +39,20 @@ struct symbol; * Routing Protocol */ -enum protocol_class { - PROTOCOL_NONE, - PROTOCOL_BABEL, - PROTOCOL_BFD, - PROTOCOL_BGP, - PROTOCOL_DEVICE, - PROTOCOL_DIRECT, - PROTOCOL_KERNEL, - PROTOCOL_OSPF, - PROTOCOL_MRT, - PROTOCOL_PERF, - PROTOCOL_PIPE, - PROTOCOL_RADV, - PROTOCOL_RIP, - PROTOCOL_RPKI, - PROTOCOL_STATIC, - PROTOCOL__MAX -}; - -extern struct protocol *class_to_protocol[PROTOCOL__MAX]; struct protocol { node n; char *name; char *template; /* Template for automatic generation of names */ int name_counter; /* Counter for automatic name generation */ - enum protocol_class class; /* Machine readable protocol class */ uint preference; /* Default protocol preference */ uint channel_mask; /* Mask of accepted channel types (NB_*) */ uint proto_size; /* Size of protocol data structure */ uint config_size; /* Size of protocol config data structure */ + uint eattr_begin; /* First ID of registered eattrs */ + uint eattr_end; /* End of eattr id zone */ + void (*preconfig)(struct protocol *, struct config *); /* Just before configuring */ void (*postconfig)(struct proto_config *); /* After configuring each instance */ struct proto * (*init)(struct proto_config *); /* Create new instance */ @@ -76,14 +60,15 @@ struct protocol { void (*dump)(struct proto *); /* Debugging dump */ int (*start)(struct proto *); /* Start the instance */ int (*shutdown)(struct proto *); /* Stop the instance */ + void (*cleanup)(struct proto *); /* Cleanup the instance right before tearing it all down */ void (*get_status)(struct proto *, byte *buf); /* Get instance status (for `show protocols' command) */ - int (*get_attr)(const struct eattr *, byte *buf, int buflen); /* ASCIIfy dynamic attribute (returns GA_*) */ +// int (*get_attr)(const struct eattr *, byte *buf, int buflen); /* ASCIIfy dynamic attribute (returns GA_*) */ void (*show_proto_info)(struct proto *); /* Show protocol info (for `show protocols all' command) */ void (*copy_config)(struct proto_config *, struct proto_config *); /* Copy config from given protocol instance */ }; -void protos_build(void); -void proto_build(struct protocol *); +void protos_build(void); /* Called from sysdep to initialize protocols */ +void proto_build(struct protocol *); /* Called from protocol to register itself */ void protos_preconfig(struct config *); void protos_commit(struct config *new, struct config *old, int force_restart, int type); struct proto * proto_spawn(struct proto_config *cf, uint disabled); @@ -121,6 +106,7 @@ struct proto_config { u32 debug, mrtdump; /* Debugging bitfields, both use D_* constants */ u32 router_id; /* Protocol specific router ID */ uint loop_order; /* Launch a birdloop on this locking level; use DOMAIN_ORDER(the_bird) for mainloop */ + btime loop_max_latency; /* Request this specific maximum latency of loop; zero to default */ list channels; /* List of channel configs (struct channel_config) */ struct iface *vrf; /* Related VRF instance, NULL if global */ @@ -138,6 +124,8 @@ struct proto { struct proto_config *cf_new; /* Configuration we want to switch to after shutdown (NULL=delete) */ pool *pool; /* Pool containing local objects */ event *event; /* Protocol event */ + timer *restart_timer; /* Timer to restart the protocol from limits */ + event *restart_event; /* Event to restart/shutdown the protocol from limits */ struct birdloop *loop; /* BIRDloop running this protocol */ list channels; /* List of channels to rtables (struct channel) */ @@ -145,12 +133,14 @@ struct proto { struct rte_src *main_source; /* Primary route source */ struct rte_owner sources; /* Route source owner structure */ struct iface *vrf; /* Related VRF instance, NULL if global */ + TLIST_LIST(proto_neigh) neighbors; /* List of neighbor structures */ + struct iface_subscription iface_sub; /* Interface notification subscription */ - const char *name; /* Name of this instance (== cf->name) */ + const char *name; /* Name of this instance (== cf->name) */ u32 debug; /* Debugging flags */ u32 mrtdump; /* MRTDump flags */ uint active_channels; /* Number of active channels */ - uint active_coroutines; /* Number of active coroutines */ + uint active_loops; /* Number of active IO loops */ byte net_type; /* Protocol network type (NET_*), 0 for undefined */ byte disabled; /* Manually disabled */ byte proto_state; /* Protocol state machine (PS_*, see below) */ @@ -184,10 +174,7 @@ struct proto { * feed_end Notify channel about finish of route feeding. */ - void (*if_notify)(struct proto *, unsigned flags, struct iface *i); - void (*ifa_notify)(struct proto *, unsigned flags, struct ifa *a); void (*rt_notify)(struct proto *, struct channel *, const net_addr *net, struct rte *new, const struct rte *old); - void (*neigh_notify)(struct neighbor *neigh); int (*preexport)(struct channel *, struct rte *rt); void (*reload_routes)(struct channel *); void (*feed_begin)(struct channel *, int initial); @@ -197,19 +184,16 @@ struct proto { * Routing entry hooks (called only for routes belonging to this protocol): * * rte_recalculate Called at the beginning of the best route selection - * rte_better Compare two rte's and decide which one is better (1=first, 0=second). - * rte_same Compare two rte's and decide whether they are identical (1=yes, 0=no). * rte_mergable Compare two rte's and decide whether they could be merged (1=yes, 0=no). * rte_insert Called whenever a rte is inserted to a routing table. * rte_remove Called whenever a rte is removed from the routing table. */ - int (*rte_recalculate)(rtable *, struct network *, struct rte *, struct rte *, struct rte *); - int (*rte_better)(struct rte *, struct rte *); + int (*rte_recalculate)(struct rtable_private *, struct network *, struct rte *, struct rte *, struct rte *); int (*rte_mergable)(struct rte *, struct rte *); void (*rte_insert)(struct network *, struct rte *); void (*rte_remove)(struct network *, struct rte *); - u32 (*rte_igp_metric)(struct rte *); + u32 (*rte_igp_metric)(const struct rte *); /* Hic sunt protocol-specific data */ }; @@ -249,6 +233,15 @@ void channel_graceful_restart_unlock(struct channel *c); #define DEFAULT_GR_WAIT 240 +static inline event_list *proto_event_list(struct proto *p) +{ return p->loop == &main_birdloop ? &global_event_list : birdloop_event_list(p->loop); } + +static inline event_list *proto_work_list(struct proto *p) +{ return p->loop == &main_birdloop ? &global_work_list : birdloop_event_list(p->loop); } + +static inline void proto_send_event(struct proto *p, event *e) +{ ev_send(proto_event_list(p), e); } + void channel_show_limit(struct limit *l, const char *dsc, int active, int action); void channel_show_info(struct channel *c); void channel_cmd_debug(struct channel *c, uint mask); @@ -267,7 +260,6 @@ struct proto *proto_iterate_named(struct symbol *sym, struct protocol *proto, st #define PROTO_WALK_CMD(sym,pr,p) for(struct proto *p = NULL; p = proto_iterate_named(sym, pr, p); ) - #define PROTO_ENTER_FROM_MAIN(p) ({ \ ASSERT_DIE(birdloop_inside(&main_birdloop)); \ struct birdloop *_loop = (p)->loop; \ @@ -369,7 +361,13 @@ void proto_notify_state(struct proto *p, unsigned state); */ static inline int proto_is_inactive(struct proto *p) -{ return (p->active_channels == 0) && (p->active_coroutines == 0) && (p->sources.uc == 0); } +{ + return (p->active_channels == 0) + && (p->active_loops == 0) + && (p->sources.uc == 0) + && EMPTY_TLIST(proto_neigh, &p->neighbors) + ; +} /* @@ -483,18 +481,22 @@ struct channel_config { struct proto_config *parent; /* Where channel is defined (proto or template) */ struct rtable_config *table; /* Table we're attached to */ const struct filter *in_filter, *out_filter; /* Attached filters */ + const net_addr *out_subprefix; /* Export only subprefixes of this net */ struct channel_limit rx_limit; /* Limit for receiving routes from protocol - (relevant when in_keep_filtered is active) */ + (relevant when in_keep & RIK_REJECTED) */ struct channel_limit in_limit; /* Limit for importing routes from protocol */ struct channel_limit out_limit; /* Limit for exporting routes to protocol */ + struct settle_config roa_settle; /* Settle times for ROA-induced reload */ + u8 net_type; /* Routing table network type (NET_*), 0 for undefined */ u8 ra_mode; /* Mode of received route advertisements (RA_*) */ u16 preference; /* Default route preference */ u32 debug; /* Debugging flags (D_*) */ + u8 copy; /* Value from channel_config_get() is new (0) or from template (1) */ u8 merge_limit; /* Maximal number of nexthops for RA_MERGED */ - u8 in_keep_filtered; /* Routes rejected in import filter are kept */ + u8 in_keep; /* Which states of routes to keep (RIK_*) */ u8 rpki_reload; /* RPKI changes trigger channel reload */ }; @@ -508,19 +510,19 @@ struct channel { rtable *table; const struct filter *in_filter; /* Input filter */ const struct filter *out_filter; /* Output filter */ + const net_addr *out_subprefix; /* Export only subprefixes of this net */ struct bmap export_map; /* Keeps track which routes were really exported */ struct bmap export_reject_map; /* Keeps track which routes were rejected by export filter */ - struct limit rx_limit; /* Receive limit (for in_keep_filtered) */ + struct limit rx_limit; /* Receive limit (for in_keep & RIK_REJECTED) */ struct limit in_limit; /* Input limit */ struct limit out_limit; /* Output limit */ + struct settle_config roa_settle; /* Settle times for ROA-induced reload */ + u8 limit_actions[PLD_MAX]; /* Limit actions enum */ u8 limit_active; /* Flags for active limits */ - linpool *rte_update_pool; - uint rte_update_nest_cnt; - struct channel_import_stats { /* Import - from protocol to core */ u32 updates_received; /* Number of route updates received */ @@ -551,7 +553,7 @@ struct channel { u16 preference; /* Default route preference */ u32 debug; /* Debugging flags (D_*) */ u8 merge_limit; /* Maximal number of nexthops for RA_MERGED */ - u8 in_keep_filtered; /* Routes rejected in import filter are kept */ + u8 in_keep; /* Which states of routes to keep (RIK_*) */ u8 disabled; u8 stale; /* Used in reconfiguration */ @@ -560,32 +562,22 @@ struct channel { u8 reloadable; /* Hook reload_routes() is allowed on the channel */ u8 gr_lock; /* Graceful restart mechanism should wait for this channel */ u8 gr_wait; /* Route export to channel is postponed until graceful restart */ - u8 restart_export; /* Route export should restart as soon as it stops */ btime last_state_change; /* Time of last state transition */ - struct channel_aux_table *in_table; /* Internal table for received routes */ - struct event in_stopped; /* Import stop callback */ + struct rt_export_request reload_req; /* Feeder for import reload */ u8 reload_pending; /* Reloading and another reload is scheduled */ u8 refeed_pending; /* Refeeding and another refeed is scheduled */ u8 rpki_reload; /* RPKI changes trigger channel reload */ - struct channel_aux_table *out_table; /* Internal table for exported routes */ + struct rt_exporter *out_table; /* Internal table for exported routes */ - list roa_subscriptions; /* List of active ROA table subscriptions based on filters roa_check() */ + list roa_subscriptions; /* List of active ROA table subscriptions based on filters' roa_check() calls */ }; -struct channel_aux_table { - struct channel *c; - struct rt_import_request push; - struct rt_export_request get; - event push_stopped; - rtable *tab; - event *stop; - u8 refeed_pending; - u8 stop_pending; -}; +#define RIK_REJECTED 1 /* Routes rejected in import filter are kept */ +#define RIK_PREFILTER (2 | RIK_REJECTED) /* All routes' attribute state before import filter is kept */ /* * Channel states @@ -651,8 +643,6 @@ struct channel *proto_add_channel(struct proto *p, struct channel_config *cf); int proto_configure_channel(struct proto *p, struct channel **c, struct channel_config *cf); void channel_set_state(struct channel *c, uint state); -void channel_setup_in_table(struct channel *c, int best); -void channel_setup_out_table(struct channel *c); void channel_schedule_reload(struct channel *c); static inline void channel_init(struct channel *c) { channel_set_state(c, CS_START); } @@ -660,9 +650,6 @@ static inline void channel_open(struct channel *c) { channel_set_state(c, CS_UP) static inline void channel_close(struct channel *c) { channel_set_state(c, CS_STOP); } void channel_request_feeding(struct channel *c); -void channel_request_reload(struct channel *c); -void channel_refresh_begin(struct channel *c); -void channel_refresh_end(struct channel *c); void *channel_config_new(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto); void *channel_config_get(const struct channel_class *cc, const char *name, uint net_type, struct proto_config *proto); int channel_reconfigure(struct channel *c, struct channel_config *cf); diff --git a/nest/route.h b/nest/route.h deleted file mode 100644 index 9093108b..00000000 --- a/nest/route.h +++ /dev/null @@ -1,967 +0,0 @@ -/* - * BIRD Internet Routing Daemon -- Routing Table - * - * (c) 1998--2000 Martin Mares <mj@ucw.cz> - * (c) 2019--2021 Maria Matejka <mq@jmq.cz> - * - * Can be freely distributed and used under the terms of the GNU GPL. - */ - -#ifndef _BIRD_ROUTE_H_ -#define _BIRD_ROUTE_H_ - -#include "lib/lists.h" -#include "lib/event.h" -#include "lib/bitmap.h" -#include "lib/resource.h" -#include "lib/net.h" -#include "lib/hash.h" -#include "lib/event.h" - -#include <stdatomic.h> - -struct ea_list; -struct protocol; -struct proto; -struct channel; -struct rte_src; -struct symbol; -struct timer; -struct filter; -struct cli; - -/* - * Generic data structure for storing network prefixes. Also used - * for the master routing table. Currently implemented as a hash - * table. - * - * Available operations: - * - insertion of new entry - * - deletion of entry - * - searching for entry by network prefix - * - asynchronous retrieval of fib contents - */ - -struct fib_node { - struct fib_node *next; /* Next in hash chain */ - struct fib_iterator *readers; /* List of readers of this node */ - net_addr addr[0]; -}; - -struct fib_iterator { /* See lib/slists.h for an explanation */ - struct fib_iterator *prev, *next; /* Must be synced with struct fib_node! */ - byte efef; /* 0xff to distinguish between iterator and node */ - byte pad[3]; - struct fib_node *node; /* Or NULL if freshly merged */ - uint hash; -}; - -typedef void (*fib_init_fn)(void *); - -struct fib { - pool *fib_pool; /* Pool holding all our data */ - slab *fib_slab; /* Slab holding all fib nodes */ - struct fib_node **hash_table; /* Node hash table */ - uint hash_size; /* Number of hash table entries (a power of two) */ - uint hash_order; /* Binary logarithm of hash_size */ - uint hash_shift; /* 32 - hash_order */ - uint addr_type; /* Type of address data stored in fib (NET_*) */ - uint node_size; /* FIB node size, 0 for nonuniform */ - uint node_offset; /* Offset of fib_node struct inside of user data */ - uint entries; /* Number of entries */ - uint entries_min, entries_max; /* Entry count limits (else start rehashing) */ - fib_init_fn init; /* Constructor */ -}; - -static inline void * fib_node_to_user(struct fib *f, struct fib_node *e) -{ return e ? (void *) ((char *) e - f->node_offset) : NULL; } - -static inline struct fib_node * fib_user_to_node(struct fib *f, void *e) -{ return e ? (void *) ((char *) e + f->node_offset) : NULL; } - -void fib_init(struct fib *f, pool *p, uint addr_type, uint node_size, uint node_offset, uint hash_order, fib_init_fn init); -void *fib_find(struct fib *, const net_addr *); /* Find or return NULL if doesn't exist */ -void *fib_get_chain(struct fib *f, const net_addr *a); /* Find first node in linked list from hash table */ -void *fib_get(struct fib *, const net_addr *); /* Find or create new if nonexistent */ -void *fib_route(struct fib *, const net_addr *); /* Longest-match routing lookup */ -void fib_delete(struct fib *, void *); /* Remove fib entry */ -void fib_free(struct fib *); /* Destroy the fib */ -void fib_check(struct fib *); /* Consistency check for debugging */ - -void fit_init(struct fib_iterator *, struct fib *); /* Internal functions, don't call */ -struct fib_node *fit_get(struct fib *, struct fib_iterator *); -void fit_put(struct fib_iterator *, struct fib_node *); -void fit_put_next(struct fib *f, struct fib_iterator *i, struct fib_node *n, uint hpos); -void fit_put_end(struct fib_iterator *i); -void fit_copy(struct fib *f, struct fib_iterator *dst, struct fib_iterator *src); - - -#define FIB_WALK(fib, type, z) do { \ - struct fib_node *fn_, **ff_ = (fib)->hash_table; \ - uint count_ = (fib)->hash_size; \ - type *z; \ - while (count_--) \ - for (fn_ = *ff_++; z = fib_node_to_user(fib, fn_); fn_=fn_->next) - -#define FIB_WALK_END } while (0) - -#define FIB_ITERATE_INIT(it, fib) fit_init(it, fib) - -#define FIB_ITERATE_START(fib, it, type, z) do { \ - struct fib_node *fn_ = fit_get(fib, it); \ - uint count_ = (fib)->hash_size; \ - uint hpos_ = (it)->hash; \ - type *z; \ - for(;;) { \ - if (!fn_) \ - { \ - if (++hpos_ >= count_) \ - break; \ - fn_ = (fib)->hash_table[hpos_]; \ - continue; \ - } \ - z = fib_node_to_user(fib, fn_); - -#define FIB_ITERATE_END fn_ = fn_->next; } } while(0) - -#define FIB_ITERATE_PUT(it) fit_put(it, fn_) - -#define FIB_ITERATE_PUT_NEXT(it, fib) fit_put_next(fib, it, fn_, hpos_) - -#define FIB_ITERATE_PUT_END(it) fit_put_end(it) - -#define FIB_ITERATE_UNLINK(it, fib) fit_get(fib, it) - -#define FIB_ITERATE_COPY(dst, src, fib) fit_copy(fib, dst, src) - - -/* - * Master Routing Tables. Generally speaking, each of them contains a FIB - * with each entry pointing to a list of route entries representing routes - * to given network (with the selected one at the head). - * - * Each of the RTE's contains variable data (the preference and protocol-dependent - * metrics) and a pointer to a route attribute block common for many routes). - * - * It's guaranteed that there is at most one RTE for every (prefix,proto) pair. - */ - -typedef struct rtable_private { -#define RTABLE_PUBLIC \ - resource r; \ - node n; /* Node in list of all tables */ \ - struct birdloop *loop; /* This loop runs the table */ \ - char *name; /* Name of this table */ \ - uint addr_type; /* Type of address data stored in table (NET_*) */ \ - struct rtable_config *config; /* Configuration of this table */ \ - struct event *nhu_event; /* Event to update next hops */ \ - _Atomic byte nhu_state; /* Next Hop Update state */ \ - - RTABLE_PUBLIC; - pool *rp; /* Resource pool to allocate everything from, including itself */ - struct slab *rte_slab; /* Slab to allocate route objects */ - struct fib fib; - int use_count; /* Number of protocols using this table */ - u32 rt_count; /* Number of routes in the table */ - u32 rr_count; /* Number of running route refresh requests */ - u32 imports_up; /* Number of imports in TIS_UP state */ - - list imports; /* Registered route importers */ - list exports; /* Registered route exporters */ - - struct hmap id_map; - struct hostcache *hostcache; - struct event *prune_event; /* Event to prune abandoned routes */ - struct event *announce_event; /* Event to announce pending exports */ - struct event *ec_event; /* Event to prune finished exports */ - struct event *hcu_event; /* Event to update host cache */ - void (*delete)(void *); /* Delete callback (in parent loop context) */ - btime last_rt_change; /* Last time when route changed */ - btime base_settle_time; /* Start time of rtable settling interval */ - btime gc_time; /* Time of last GC */ - int gc_counter; /* Number of operations since last GC */ - byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ - - byte cork_active; /* Congestion control activated */ - - struct fib_iterator prune_fit; /* Rtable prune FIB iterator */ - struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ - struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */ - - linpool *nhu_lp; /* Linpool used for NHU */ - - list subscribers; /* Subscribers for notifications */ - struct timer *settle_timer; /* Settle time for notifications */ - - list pending_exports; /* List of packed struct rt_pending_export */ - - struct rt_pending_export *first_export; /* First export to announce */ - u64 next_export_seq; /* The next export will have this ID */ -} rtable_private; - -typedef union { - struct { RTABLE_PUBLIC }; - rtable_private priv; -} rtable; - -#define RT_LOCK(tab) ({ birdloop_enter((tab)->loop); &(tab)->priv; }) -#define RT_UNLOCK(tab) birdloop_leave((tab)->loop) -#define RT_PRIV(tab) ({ ASSERT_DIE(birdloop_inside((tab)->loop)); &(tab)->priv; }) - -#define RT_LOCKED(tpub, tpriv) for (rtable_private *tpriv = RT_LOCK(tpub); tpriv; RT_UNLOCK(tpriv), (tpriv = NULL)) - -struct rtable_config { - node n; - char *name; - void *owner; /* Main config if global table, channel_aux_table if channel table */ - rtable *table; - struct proto_config *krt_attached; /* Kernel syncer attached to this table */ - uint addr_type; /* Type of address data stored in table (NET_*) */ - int gc_max_ops; /* Maximum number of operations before GC is run */ - int gc_min_time; /* Minimum time between two consecutive GC runs */ - byte sorted; /* Routes of network are sorted according to rte_better() */ - btime min_settle_time; /* Minimum settle time for notifications */ - btime max_settle_time; /* Maximum settle time for notifications */ - btime min_rr_settle_time; /* Minimum settle time for notifications when route refresh is running */ - btime max_rr_settle_time; /* Maximum settle time for notifications when route refresh is running */ - uint cork_limit; /* Amount of routes to be pending on export to cork imports */ -}; - -struct rt_subscription { - node n; - rtable *tab; - event *event; -}; - -#define NHU_CLEAN 0 -#define NHU_SCHEDULED 1 -#define NHU_RUNNING 2 -#define NHU_DIRTY 3 - -typedef struct network { - struct rte_storage *routes; /* Available routes for this network */ - struct rt_pending_export *last, *first; /* Routes with unfinished exports */ - struct fib_node n; /* FIB flags reserved for kernel syncer */ -} net; - -struct hostcache { - slab *slab; /* Slab holding all hostentries */ - struct hostentry **hash_table; /* Hash table for hostentries */ - unsigned hash_order, hash_shift; - unsigned hash_max, hash_min; - unsigned hash_items; - linpool *lp; /* Linpool for trie */ - struct f_trie *trie; /* Trie of prefixes that might affect hostentries */ - list hostentries; /* List of all hostentries */ - byte update_hostcache; -}; - -struct hostentry { - node ln; - ip_addr addr; /* IP address of host, part of key */ - ip_addr link; /* (link-local) IP address of host, used as gw - if host is directly attached */ - rtable *tab; /* Dependent table, part of key */ - struct hostentry *next; /* Next in hash chain */ - unsigned hash_key; /* Hash key */ - unsigned uc; /* Use count */ - struct rta *src; /* Source rta entry */ - byte dest; /* Chosen route destination type (RTD_...) */ - byte nexthop_linkable; /* Nexthop list is completely non-device */ - u32 igp_metric; /* Chosen route IGP metric */ -}; - -typedef struct rte { - struct rta *attrs; /* Attributes of this route */ - const net_addr *net; /* Network this RTE belongs to */ - struct rte_src *src; /* Route source that created the route */ - struct rt_import_hook *sender; /* Import hook used to send the route to the routing table */ - btime lastmod; /* Last modified (set by table) */ - u32 id; /* Table specific route id */ - byte flags; /* Table-specific flags */ - byte pflags; /* Protocol-specific flags */ - u8 generation; /* If this route import is based on other previously exported route, - this value should be 1 + MAX(generation of the parent routes). - Otherwise the route is independent and this value is zero. */ - u8 stale_cycle; /* Auxiliary value for route refresh */ -} rte; - -struct rte_storage { - struct rte_storage *next; /* Next in chain */ - struct rte rte; /* Route data */ -}; - -#define RTES_CLONE(r, l) ((r) ? (((*(l)) = (r)->rte), (l)) : NULL) -#define RTES_OR_NULL(r) ((r) ? &((r)->rte) : NULL) - -#define REF_FILTERED 2 /* Route is rejected by import filter */ -#define REF_USE_STALE 4 /* Do not reset route's stale_cycle to the actual value */ - -/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ -static inline int rte_is_valid(const rte *r) { return r && !(r->flags & REF_FILTERED); } - -/* Route just has REF_FILTERED flag */ -static inline int rte_is_filtered(const rte *r) { return !!(r->flags & REF_FILTERED); } - - -/* Table-channel connections */ - -struct rt_import_request { - struct rt_import_hook *hook; /* The table part of importer */ - char *name; - u8 trace_routes; - - event_list *list; /* Where to schedule import events */ - - void (*dump_req)(struct rt_import_request *req); - void (*log_state_change)(struct rt_import_request *req, u8 state); - /* Preimport is called when the @new route is just-to-be inserted, replacing @old. - * Return a route (may be different or modified in-place) to continue or NULL to withdraw. */ - struct rte *(*preimport)(struct rt_import_request *req, struct rte *new, struct rte *old); -}; - -struct rt_import_hook { - node n; - rtable *table; /* The connected table */ - struct rt_import_request *req; /* The requestor */ - - struct rt_import_stats { - /* Import - from protocol to core */ - u32 pref; /* Number of routes selected as best in the (adjacent) routing table */ - u32 updates_ignored; /* Number of route updates rejected as already in route table */ - u32 updates_accepted; /* Number of route updates accepted and imported */ - u32 withdraws_ignored; /* Number of route withdraws rejected as already not in route table */ - u32 withdraws_accepted; /* Number of route withdraws accepted and processed */ - } stats; - - u64 flush_seq; /* Table export seq when the channel announced flushing */ - btime last_state_change; /* Time of last state transition */ - - u8 import_state; /* IS_* */ - u8 stale_set; /* Set this stale_cycle to imported routes */ - u8 stale_valid; /* Routes with this stale_cycle and bigger are considered valid */ - u8 stale_pruned; /* Last prune finished when this value was set at stale_valid */ - u8 stale_pruning; /* Last prune started when this value was set at stale_valid */ - - struct event *export_announce_event; /* Event to run to announce new exports */ - struct event *stopped; /* Event to run when import is stopped */ -}; - -struct rt_pending_export { - struct rt_pending_export * _Atomic next; /* Next export for the same destination */ - struct rte_storage *new, *new_best, *old, *old_best; - u64 seq; /* Sequential ID (table-local) of the pending export */ -}; - -struct rt_export_request { - struct rt_export_hook *hook; /* Table part of the export */ - char *name; - u8 trace_routes; - - event_list *list; /* Where to schedule export events */ - - /* There are two methods of export. You can either request feeding every single change - * or feeding the whole route feed. In case of regular export, &export_one is preferred. - * Anyway, when feeding, &export_bulk is preferred, falling back to &export_one. - * Thus, for RA_OPTIMAL, &export_one is only set, - * for RA_MERGED and RA_ACCEPTED, &export_bulk is only set - * and for RA_ANY, both are set to accomodate for feeding all routes but receiving single changes - */ - void (*export_one)(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); - void (*export_bulk)(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe, rte **feed, uint count); - - void (*dump_req)(struct rt_export_request *req); - void (*log_state_change)(struct rt_export_request *req, u8); -}; - -struct rt_export_hook { - node n; - rtable *table; /* The connected table */ - - pool *pool; - - struct rt_export_request *req; /* The requestor */ - - struct rt_export_stats { - /* Export - from core to protocol */ - u32 updates_received; /* Number of route updates received */ - u32 withdraws_received; /* Number of route withdraws received */ - } stats; - - struct fib_iterator feed_fit; /* Routing table iterator used during feeding */ - - struct bmap seq_map; /* Keep track which exports were already procesed */ - - struct rt_pending_export * _Atomic last_export;/* Last export processed */ - struct rt_pending_export *rpe_next; /* Next pending export to process */ - - btime last_state_change; /* Time of last state transition */ - - u8 refeed_pending; /* Refeeding and another refeed is scheduled */ - _Atomic u8 export_state; /* Route export state (TES_*, see below) */ - - struct event *event; /* Event running all the export operations */ - - void (*stopped)(struct rt_export_request *); /* Stored callback when export is stopped */ -}; - -extern struct event_cork rt_cork; - -#define TIS_DOWN 0 -#define TIS_UP 1 -#define TIS_STOP 2 -#define TIS_FLUSHING 3 -#define TIS_WAITING 4 -#define TIS_CLEARED 5 -#define TIS_MAX 6 - -#define TES_DOWN 0 -#define TES_HUNGRY 1 -#define TES_FEEDING 2 -#define TES_READY 3 -#define TES_STOP 4 -#define TES_MAX 5 - -void rt_request_import(rtable *tab, struct rt_import_request *req); -void rt_request_export(rtable *tab, struct rt_export_request *req); - -void rt_stop_import(struct rt_import_request *, struct event *stopped); -void rt_stop_export(struct rt_export_request *, void (*stopped)(struct rt_export_request *)); - -const char *rt_import_state_name(u8 state); -const char *rt_export_state_name(u8 state); - -static inline u8 rt_import_get_state(struct rt_import_hook *ih) { return ih ? ih->import_state : TIS_DOWN; } -static inline u8 rt_export_get_state(struct rt_export_hook *eh) { return eh ? eh->export_state : TES_DOWN; } - -void rte_import(struct rt_import_request *req, const net_addr *net, rte *new, struct rte_src *src); - -/* Get next rpe. If src is given, it must match. */ -struct rt_pending_export *rpe_next(struct rt_pending_export *rpe, struct rte_src *src); - -/* Mark the pending export processed */ -void rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); - -/* Get pending export seen status */ -int rpe_get_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); - - -/* Types of route announcement, also used as flags */ -#define RA_UNDEF 0 /* Undefined RA type */ -#define RA_OPTIMAL 1 /* Announcement of optimal route change */ -#define RA_ACCEPTED 2 /* Announcement of first accepted route */ -#define RA_ANY 3 /* Announcement of any route change */ -#define RA_MERGED 4 /* Announcement of optimal route merged with next ones */ - -/* Return value of preexport() callback */ -#define RIC_ACCEPT 1 /* Accepted by protocol */ -#define RIC_PROCESS 0 /* Process it through import filter */ -#define RIC_REJECT -1 /* Rejected by protocol */ -#define RIC_DROP -2 /* Silently dropped by protocol */ - -#define rte_update channel_rte_import -/** - * rte_update - enter a new update to a routing table - * @c: channel doing the update - * @net: network address - * @rte: a &rte representing the new route - * @src: old route source identifier - * - * This function imports a new route to the appropriate table (via the channel). - * Table keys are @net (obligatory) and @rte->attrs->src. - * Both the @net and @rte pointers can be local. - * - * The route attributes (@rte->attrs) are obligatory. They can be also allocated - * locally. Anyway, if you use an already-cached attribute object, you shall - * call rta_clone() on that object yourself. (This semantics may change in future.) - * - * If the route attributes are local, you may set @rte->attrs->src to NULL, then - * the protocol's default route source will be supplied. - * - * When rte_update() gets a route, it automatically validates it. This includes - * checking for validity of the given network and next hop addresses and also - * checking for host-scope or link-scope routes. Then the import filters are - * processed and if accepted, the route is passed to route table recalculation. - * - * The accepted routes are then inserted into the table, replacing the old route - * for the same @net identified by @src. Then the route is announced - * to all the channels connected to the table using the standard export mechanism. - * Setting @rte to NULL makes this a withdraw, otherwise @rte->src must be the same - * as @src. - * - * All memory used for temporary allocations is taken from a special linpool - * @rte_update_pool and freed when rte_update() finishes. - */ -void rte_update(struct channel *c, const net_addr *net, struct rte *rte, struct rte_src *src); - -extern list routing_tables; -struct config; - -void rt_init(void); -void rt_preconfig(struct config *); -void rt_commit(struct config *new, struct config *old); -void rt_lock_table(rtable_private *); -void rt_unlock_table(rtable_private *); -void rt_subscribe(rtable *tab, struct rt_subscription *s); -void rt_unsubscribe(struct rt_subscription *s); -rtable *rt_setup(pool *, struct rtable_config *); - -static inline net *net_find(rtable_private *tab, const net_addr *addr) { return (net *) fib_find(&tab->fib, addr); } -static inline net *net_find_valid(rtable_private *tab, const net_addr *addr) -{ net *n = net_find(tab, addr); return (n && n->routes && rte_is_valid(&n->routes->rte)) ? n : NULL; } -static inline net *net_get(rtable_private *tab, const net_addr *addr) { return (net *) fib_get(&tab->fib, addr); } -void *net_route(rtable_private *tab, const net_addr *n); -int net_roa_check(rtable *tab, const net_addr *n, u32 asn); -int rt_examine(rtable_private *t, net_addr *a, struct channel *c, const struct filter *filter); -rte *rt_export_merged(struct channel *c, rte ** feed, uint count, linpool *pool, int silent); - -void rt_refresh_begin(struct rt_import_request *); -void rt_refresh_end(struct rt_import_request *); -void rt_schedule_prune(rtable_private *t); -void rte_dump(struct rte_storage *); -void rte_free(struct rte_storage *, rtable_private *); -void rt_dump(rtable *); -void rt_dump_all(void); -void rt_dump_hooks(rtable *); -void rt_dump_hooks_all(void); -void rt_prune_sync(rtable *t, int all); -struct rtable_config *rt_new_table(struct symbol *s, uint addr_type); - -/* Default limit for ECMP next hops, defined in sysdep code */ -extern const int rt_default_ecmp; - -struct rt_show_data_rtable { - node n; - rtable *table; - struct channel *export_channel; -}; - -struct rt_show_data { - net_addr *addr; - list tables; - struct rt_show_data_rtable *tab; /* Iterator over table list */ - struct rt_show_data_rtable *last_table; /* Last table in output */ - struct fib_iterator fit; /* Iterator over networks in table */ - int verbose, tables_defined_by; - const struct filter *filter; - struct proto *show_protocol; - struct proto *export_protocol; - struct channel *export_channel; - struct config *running_on_config; - struct krt_proto *kernel; - struct rt_export_hook *kernel_export_hook; - int export_mode, primary_only, filtered, stats, show_for; - - int table_open; /* Iteration (fit) is open */ - int net_counter, rt_counter, show_counter, table_counter; - int net_counter_last, rt_counter_last, show_counter_last; -}; - -void rt_show(struct rt_show_data *); -struct rt_show_data_rtable * rt_show_add_table(struct rt_show_data *d, rtable *t); - -/* Value of table definition mode in struct rt_show_data */ -#define RSD_TDB_DEFAULT 0 /* no table specified */ -#define RSD_TDB_INDIRECT 0 /* show route ... protocol P ... */ -#define RSD_TDB_ALL RSD_TDB_SET /* show route ... table all ... */ -#define RSD_TDB_DIRECT RSD_TDB_SET | RSD_TDB_NMN /* show route ... table X table Y ... */ - -#define RSD_TDB_SET 0x1 /* internal: show empty tables */ -#define RSD_TDB_NMN 0x2 /* internal: need matching net */ - -/* Value of export_mode in struct rt_show_data */ -#define RSEM_NONE 0 /* Export mode not used */ -#define RSEM_PREEXPORT 1 /* Routes ready for export, before filtering */ -#define RSEM_EXPORT 2 /* Routes accepted by export filter */ -#define RSEM_NOEXPORT 3 /* Routes rejected by export filter */ -#define RSEM_EXPORTED 4 /* Routes marked in export map */ - -/* - * Route Attributes - * - * Beware: All standard BGP attributes must be represented here instead - * of making them local to the route. This is needed to ensure proper - * construction of BGP route attribute lists. - */ - -/* Nexthop structure */ -struct nexthop { - ip_addr gw; /* Next hop */ - struct iface *iface; /* Outgoing interface */ - struct nexthop *next; - byte flags; - byte weight; - byte labels_orig; /* Number of labels before hostentry was applied */ - byte labels; /* Number of all labels */ - u32 label[0]; -}; - -#define RNF_ONLINK 0x1 /* Gateway is onlink regardless of IP ranges */ - - -struct rte_src { - struct rte_src *next; /* Hash chain */ - struct rte_owner *owner; /* Route source owner */ - u32 private_id; /* Private ID, assigned by the protocol */ - u32 global_id; /* Globally unique ID of the source */ - _Atomic u64 uc; /* Use count */ -}; - - -typedef struct rta { - struct rta * _Atomic next, * _Atomic *pprev; /* Hash chain */ - _Atomic u32 uc; /* Use count */ - u32 hash_key; /* Hash over important fields */ - struct ea_list *eattrs; /* Extended Attribute chain */ - struct hostentry *hostentry; /* Hostentry for recursive next-hops */ - ip_addr from; /* Advertising router */ - u32 igp_metric; /* IGP metric to next hop (for iBGP routes) */ - u16 cached:1; /* Are attributes cached? */ - u16 source:7; /* Route source (RTS_...) */ - u16 scope:4; /* Route scope (SCOPE_... -- see ip.h) */ - u16 dest:4; /* Route destination type (RTD_...) */ - word pref; - struct nexthop nh; /* Next hop */ -} rta; - -#define RTS_STATIC 1 /* Normal static route */ -#define RTS_INHERIT 2 /* Route inherited from kernel */ -#define RTS_DEVICE 3 /* Device route */ -#define RTS_STATIC_DEVICE 4 /* Static device route */ -#define RTS_REDIRECT 5 /* Learned via redirect */ -#define RTS_RIP 6 /* RIP route */ -#define RTS_OSPF 7 /* OSPF route */ -#define RTS_OSPF_IA 8 /* OSPF inter-area route */ -#define RTS_OSPF_EXT1 9 /* OSPF external route type 1 */ -#define RTS_OSPF_EXT2 10 /* OSPF external route type 2 */ -#define RTS_BGP 11 /* BGP route */ -#define RTS_PIPE 12 /* Inter-table wormhole */ -#define RTS_BABEL 13 /* Babel route */ -#define RTS_RPKI 14 /* Route Origin Authorization */ -#define RTS_PERF 15 /* Perf checker */ -#define RTS_MAX 16 - -#define RTD_NONE 0 /* Undefined next hop */ -#define RTD_UNICAST 1 /* Next hop is neighbor router */ -#define RTD_BLACKHOLE 2 /* Silently drop packets */ -#define RTD_UNREACHABLE 3 /* Reject as unreachable */ -#define RTD_PROHIBIT 4 /* Administratively prohibited */ -#define RTD_MAX 5 - -#define IGP_METRIC_UNKNOWN 0x80000000 /* Default igp_metric used when no other - protocol-specific metric is availabe */ - - -extern const char * rta_dest_names[RTD_MAX]; - -static inline const char *rta_dest_name(uint n) -{ return (n < RTD_MAX) ? rta_dest_names[n] : "???"; } - -/* Route has regular, reachable nexthop (i.e. not RTD_UNREACHABLE and like) */ -static inline int rte_is_reachable(rte *r) -{ return r->attrs->dest == RTD_UNICAST; } - - -/* - * Extended Route Attributes - */ - -typedef struct eattr { - word id; /* EA_CODE(PROTOCOL_..., protocol-dependent ID) */ - byte flags; /* Protocol-dependent flags */ - byte type; /* Attribute type and several flags (EAF_...) */ - union { - uintptr_t data; - const struct adata *ptr; /* Attribute data elsewhere */ - } u; -} eattr; - - -#define EA_CODE(proto,id) (((proto) << 8) | (id)) -#define EA_ID(ea) ((ea) & 0xff) -#define EA_PROTO(ea) ((ea) >> 8) -#define EA_CUSTOM(id) ((id) | EA_CUSTOM_BIT) -#define EA_IS_CUSTOM(ea) ((ea) & EA_CUSTOM_BIT) -#define EA_CUSTOM_ID(ea) ((ea) & ~EA_CUSTOM_BIT) - -const char *ea_custom_name(uint ea); - -#define EA_GEN_IGP_METRIC EA_CODE(PROTOCOL_NONE, 0) - -#define EA_CODE_MASK 0xffff -#define EA_CUSTOM_BIT 0x8000 -#define EA_ALLOW_UNDEF 0x10000 /* ea_find: allow EAF_TYPE_UNDEF */ -#define EA_BIT(n) ((n) << 24) /* Used in bitfield accessors */ -#define EA_BIT_GET(ea) ((ea) >> 24) - -#define EAF_TYPE_MASK 0x1f /* Mask with this to get type */ -#define EAF_TYPE_INT 0x01 /* 32-bit unsigned integer number */ -#define EAF_TYPE_OPAQUE 0x02 /* Opaque byte string (not filterable) */ -#define EAF_TYPE_IP_ADDRESS 0x04 /* IP address */ -#define EAF_TYPE_ROUTER_ID 0x05 /* Router ID (IPv4 address) */ -#define EAF_TYPE_AS_PATH 0x06 /* BGP AS path (encoding per RFC 1771:4.3) */ -#define EAF_TYPE_BITFIELD 0x09 /* 32-bit embedded bitfield */ -#define EAF_TYPE_INT_SET 0x0a /* Set of u32's (e.g., a community list) */ -#define EAF_TYPE_PTR 0x0d /* Pointer to an object */ -#define EAF_TYPE_EC_SET 0x0e /* Set of pairs of u32's - ext. community list */ -#define EAF_TYPE_LC_SET 0x12 /* Set of triplets of u32's - large community list */ -#define EAF_TYPE_UNDEF 0x1f /* `force undefined' entry */ -#define EAF_EMBEDDED 0x01 /* Data stored in eattr.u.data (part of type spec) */ -#define EAF_VAR_LENGTH 0x02 /* Attribute length is variable (part of type spec) */ -#define EAF_ORIGINATED 0x20 /* The attribute has originated locally */ -#define EAF_FRESH 0x40 /* An uncached attribute (e.g. modified in export filter) */ - -typedef struct adata { - uint length; /* Length of data */ - byte data[0]; -} adata; - -extern const adata null_adata; /* adata of length 0 */ - -static inline struct adata * -lp_alloc_adata(struct linpool *pool, uint len) -{ - struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len); - ad->length = len; - return ad; -} - -static inline int adata_same(const struct adata *a, const struct adata *b) -{ return (a->length == b->length && !memcmp(a->data, b->data, a->length)); } - - -typedef struct ea_list { - struct ea_list *next; /* In case we have an override list */ - byte flags; /* Flags: EALF_... */ - byte rfu; - word count; /* Number of attributes */ - eattr attrs[0]; /* Attribute definitions themselves */ -} ea_list; - -#define EALF_SORTED 1 /* Attributes are sorted by code */ -#define EALF_BISECT 2 /* Use interval bisection for searching */ -#define EALF_CACHED 4 /* Attributes belonging to cached rta */ - -struct rte_owner_class { - void (*get_route_info)(struct rte *, byte *buf); /* Get route information (for `show route' command) */ - int (*rte_better)(struct rte *, struct rte *); - int (*rte_mergable)(struct rte *, struct rte *); - u32 (*rte_igp_metric)(struct rte *); -}; - -struct rte_owner { - struct rte_owner_class *class; - int (*rte_recalculate)(rtable_private *, struct network *, struct rte *, struct rte *, struct rte *); - HASH(struct rte_src) hash; - const char *name; - u32 hash_key; - u32 uc; - event_list *list; - event *prune; - event *stop; -}; - -#define RTE_SRC_PU_SHIFT 44 -#define RTE_SRC_IN_PROGRESS (1ULL << RTE_SRC_PU_SHIFT) - -struct rte_src *rt_get_source_o(struct rte_owner *o, u32 id); -#define rt_get_source(p, id) rt_get_source_o(&(p)->sources, (id)) -static inline void rt_lock_source(struct rte_src *src) -{ - u64 uc = atomic_fetch_add_explicit(&src->uc, 1, memory_order_acq_rel); - ASSERT_DIE(uc > 0); -} - -static inline void rt_unlock_source(struct rte_src *src) -{ - u64 uc = atomic_fetch_add_explicit(&src->uc, RTE_SRC_IN_PROGRESS, memory_order_acq_rel); - u64 pending = uc >> RTE_SRC_PU_SHIFT; - uc &= RTE_SRC_IN_PROGRESS - 1; - - ASSERT_DIE(uc > pending); - if (uc == pending + 1) - ev_send(src->owner->list, src->owner->prune); - - atomic_fetch_sub_explicit(&src->uc, RTE_SRC_IN_PROGRESS + 1, memory_order_acq_rel); -} - -void rt_init_sources(struct rte_owner *, const char *name, event_list *list); -void rt_destroy_sources(struct rte_owner *, event *); - -struct ea_walk_state { - ea_list *eattrs; /* Ccurrent ea_list, initially set by caller */ - eattr *ea; /* Current eattr, initially NULL */ - u32 visited[4]; /* Bitfield, limiting max to 128 */ -}; - -eattr *ea_find(ea_list *, unsigned ea); -eattr *ea_walk(struct ea_walk_state *s, uint id, uint max); -uintptr_t ea_get_int(ea_list *, unsigned ea, uintptr_t def); -void ea_dump(ea_list *); -void ea_sort(ea_list *); /* Sort entries in all sub-lists */ -unsigned ea_scan(ea_list *); /* How many bytes do we need for merged ea_list */ -void ea_merge(ea_list *from, ea_list *to); /* Merge sub-lists to allocated buffer */ -int ea_same(ea_list *x, ea_list *y); /* Test whether two ea_lists are identical */ -uint ea_hash(ea_list *e); /* Calculate 16-bit hash value */ -ea_list *ea_append(ea_list *to, ea_list *what); -void ea_format_bitfield(const struct eattr *a, byte *buf, int bufsize, const char **names, int min, int max); - -#define ea_normalize(ea) do { \ - if (ea->next) { \ - ea_list *t = alloca(ea_scan(ea)); \ - ea_merge(ea, t); \ - ea = t; \ - } \ - ea_sort(ea); \ - if (ea->count == 0) \ - ea = NULL; \ -} while(0) \ - -static inline eattr * -ea_set_attr(ea_list **to, struct linpool *pool, uint id, uint flags, uint type, uintptr_t val) -{ - ea_list *a = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr)); - eattr *e = &a->attrs[0]; - - a->flags = EALF_SORTED; - a->count = 1; - a->next = *to; - *to = a; - - e->id = id; - e->type = type; - e->flags = flags; - - if (type & EAF_EMBEDDED) - e->u.data = (u32) val; - else - e->u.ptr = (struct adata *) val; - - return e; -} - -static inline void -ea_set_attr_u32(ea_list **to, struct linpool *pool, uint id, uint flags, uint type, u32 val) -{ ea_set_attr(to, pool, id, flags, type, (uintptr_t) val); } - -static inline void -ea_set_attr_ptr(ea_list **to, struct linpool *pool, uint id, uint flags, uint type, struct adata *val) -{ ea_set_attr(to, pool, id, flags, type, (uintptr_t) val); } - -static inline void -ea_set_attr_data(ea_list **to, struct linpool *pool, uint id, uint flags, uint type, void *data, uint len) -{ - struct adata *a = lp_alloc_adata(pool, len); - memcpy(a->data, data, len); - ea_set_attr(to, pool, id, flags, type, (uintptr_t) a); -} - - -#define NEXTHOP_MAX_SIZE (sizeof(struct nexthop) + sizeof(u32)*MPLS_MAX_LABEL_STACK) - -static inline size_t nexthop_size(const struct nexthop *nh) -{ return sizeof(struct nexthop) + sizeof(u32)*nh->labels; } -int nexthop__same(struct nexthop *x, struct nexthop *y); /* Compare multipath nexthops */ -static inline int nexthop_same(struct nexthop *x, struct nexthop *y) -{ return (x == y) || nexthop__same(x, y); } -struct nexthop *nexthop_merge(struct nexthop *x, struct nexthop *y, int rx, int ry, int max, linpool *lp); -struct nexthop *nexthop_sort(struct nexthop *x); -static inline void nexthop_link(struct rta *a, struct nexthop *from) -{ memcpy(&a->nh, from, nexthop_size(from)); } -void nexthop_insert(struct nexthop **n, struct nexthop *y); -int nexthop_is_sorted(struct nexthop *x); - -void rta_init(void); -static inline size_t rta_size(const rta *a) { return sizeof(rta) + sizeof(u32)*a->nh.labels; } -#define RTA_MAX_SIZE (sizeof(rta) + sizeof(u32)*MPLS_MAX_LABEL_STACK) -rta *rta_lookup(rta *); /* Get rta equivalent to this one, uc++ */ -static inline int rta_is_cached(rta *r) { return r->cached; } - -static inline rta *rta_clone(rta *r) { - u32 uc = atomic_fetch_add_explicit(&r->uc, 1, memory_order_acq_rel); - ASSERT_DIE(uc > 0); - return r; -} - -#define RTA_OBSOLETE_LIMIT 512 - -extern _Atomic u32 rta_obsolete_count; -extern event rta_cleanup_event; - -static inline void rta_free(rta *r) { - if (!r) - return; - - u32 uc = atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel); - if (uc > 1) - return; - - u32 obs = atomic_fetch_add_explicit(&rta_obsolete_count, 1, memory_order_acq_rel); - if (obs == RTA_OBSOLETE_LIMIT) - ev_send(&global_work_list, &rta_cleanup_event); -} - -rta *rta_do_cow(rta *o, linpool *lp); -static inline rta * rta_cow(rta *r, linpool *lp) { return rta_is_cached(r) ? rta_do_cow(r, lp) : r; } -static inline void rta_uncache(rta *r) { r->cached = 0; r->uc = 0; } -void rta_dump(const rta *); -void rta_dump_all(void); -void rta_show(struct cli *, const rta *); - -u32 rt_get_igp_metric(rte *); -struct hostentry * rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep); -void rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls, linpool *lp); - -static inline void -rta_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab, ip_addr gw, ip_addr ll, mpls_label_stack *mls, linpool *lp) -{ - rta_apply_hostentry(a, rt_get_hostentry(tab, gw, ll, dep), mls, lp); -} - -/* - * rta_set_recursive_next_hop() acquires hostentry from hostcache and fills - * rta->hostentry field. New hostentry has zero use count. Cached rta locks its - * hostentry (increases its use count), uncached rta does not lock it. Hostentry - * with zero use count is removed asynchronously during host cache update, - * therefore it is safe to hold such hostentry temorarily. Hostentry holds a - * lock for a 'source' rta, mainly to share multipath nexthops. - * - * There is no need to hold a lock for hostentry->dep table, because that table - * contains routes responsible for that hostentry, and therefore is non-empty if - * given hostentry has non-zero use count. If the hostentry has zero use count, - * the entry is removed before dep is referenced. - * - * The protocol responsible for routes with recursive next hops should hold a - * lock for a 'source' table governing that routes (argument tab to - * rta_set_recursive_next_hop()), because its routes reference hostentries - * (through rta) related to the governing table. When all such routes are - * removed, rtas are immediately removed achieving zero uc. Then the 'source' - * table lock could be immediately released, although hostentries may still - * exist - they will be freed together with the 'source' table. - */ - -static inline void rt_lock_hostentry(struct hostentry *he) { if (he) he->uc++; } -static inline void rt_unlock_hostentry(struct hostentry *he) { if (he) he->uc--; } - -/* - * Default protocol preferences - */ - -#define DEF_PREF_DIRECT 240 /* Directly connected */ -#define DEF_PREF_STATIC 200 /* Static route */ -#define DEF_PREF_OSPF 150 /* OSPF intra-area, inter-area and type 1 external routes */ -#define DEF_PREF_BABEL 130 /* Babel */ -#define DEF_PREF_RIP 120 /* RIP */ -#define DEF_PREF_BGP 100 /* BGP */ -#define DEF_PREF_RPKI 100 /* RPKI */ -#define DEF_PREF_INHERITED 10 /* Routes inherited from other routing daemons */ - -/* - * Route Origin Authorization - */ - -#define ROA_UNKNOWN 0 -#define ROA_VALID 1 -#define ROA_INVALID 2 - -#endif diff --git a/nest/rt-attr.c b/nest/rt-attr.c index c3da4782..903926f6 100644 --- a/nest/rt-attr.c +++ b/nest/rt-attr.c @@ -45,22 +45,37 @@ */ #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "nest/cli.h" -#include "nest/attrs.h" +#include "lib/attrs.h" #include "lib/alloca.h" #include "lib/hash.h" #include "lib/idm.h" #include "lib/resource.h" -#include "lib/rcu.h" #include "lib/string.h" #include <stddef.h> +#include <stdlib.h> const adata null_adata; /* adata of length 0 */ +struct ea_class ea_gen_igp_metric = { + .name = "igp_metric", + .type = T_INT, +}; + +struct ea_class ea_gen_preference = { + .name = "preference", + .type = T_INT, +}; + +struct ea_class ea_gen_from = { + .name = "from", + .type = T_IP, +}; + const char * const rta_src_names[RTS_MAX] = { [RTS_STATIC] = "static", [RTS_INHERIT] = "inherit", @@ -78,6 +93,71 @@ const char * const rta_src_names[RTS_MAX] = { [RTS_RPKI] = "RPKI", }; +static void +ea_gen_source_format(const eattr *a, byte *buf, uint size) +{ + if ((a->u.data >= RTS_MAX) || !rta_src_names[a->u.data]) + bsnprintf(buf, size, "unknown"); + else + bsnprintf(buf, size, "%s", rta_src_names[a->u.data]); +} + +struct ea_class ea_gen_source = { + .name = "source", + .type = T_ENUM_RTS, + .readonly = 1, + .format = ea_gen_source_format, +}; + +struct ea_class ea_gen_nexthop = { + .name = "nexthop", + .type = T_NEXTHOP_LIST, +}; + +/* + * ea_set_hostentry() acquires hostentry from hostcache. + * New hostentry has zero use count. Cached rta locks its + * hostentry (increases its use count), uncached rta does not lock it. + * Hostentry with zero use count is removed asynchronously + * during host cache update, therefore it is safe to hold + * such hostentry temporarily as long as you hold the table lock. + * + * There is no need to hold a lock for hostentry->dep table, because that table + * contains routes responsible for that hostentry, and therefore is non-empty if + * given hostentry has non-zero use count. If the hostentry has zero use count, + * the entry is removed before dep is referenced. + * + * The protocol responsible for routes with recursive next hops should hold a + * lock for a 'source' table governing that routes (argument tab), + * because its routes reference hostentries related to the governing table. + * When all such routes are + * removed, rtas are immediately removed achieving zero uc. Then the 'source' + * table lock could be immediately released, although hostentries may still + * exist - they will be freed together with the 'source' table. + */ + + static void +ea_gen_hostentry_stored(const eattr *ea) +{ + struct hostentry_adata *had = (struct hostentry_adata *) ea->u.ptr; + had->he->uc++; +} + +static void +ea_gen_hostentry_freed(const eattr *ea) +{ + struct hostentry_adata *had = (struct hostentry_adata *) ea->u.ptr; + had->he->uc--; +} + +struct ea_class ea_gen_hostentry = { + .name = "hostentry", + .type = T_HOSTENTRY, + .readonly = 1, + .stored = ea_gen_hostentry_stored, + .freed = ea_gen_hostentry_freed, +}; + const char * rta_dest_names[RTD_MAX] = { [RTD_NONE] = "", [RTD_UNICAST] = "unicast", @@ -86,16 +166,26 @@ const char * rta_dest_names[RTD_MAX] = { [RTD_PROHIBIT] = "prohibited", }; -static DOMAIN(attrs) src_domain; +struct ea_class ea_gen_flowspec_valid = { + .name = "flowspec_valid", + .type = T_ENUM_FLOWSPEC_VALID, + .readonly = 1, +}; + +const char * flowspec_valid_names[FLOWSPEC__MAX] = { + [FLOWSPEC_UNKNOWN] = "unknown", + [FLOWSPEC_VALID] = "", + [FLOWSPEC_INVALID] = "invalid", +}; -#define SRC_LOCK LOCK_DOMAIN(attrs, src_domain) -#define SRC_UNLOCK UNLOCK_DOMAIN(attrs, src_domain) +DOMAIN(attrs) attrs_domain; pool *rta_pool; -pool *src_pool; -static slab *rta_slab_[4]; -static slab *nexthop_slab_[4]; +/* Assuming page size of 4096, these are magic values for slab allocation */ +static const uint ea_slab_sizes[] = { 56, 112, 168, 288, 448, 800, 1344 }; +static slab *ea_slab[ARRAY_SIZE(ea_slab_sizes)]; + static slab *rte_src_slab; static struct idm src_ids; @@ -111,15 +201,16 @@ static struct idm src_ids; #define RSH_REHASH rte_src_rehash #define RSH_PARAMS /2, *2, 1, 1, 8, 20 #define RSH_INIT_ORDER 2 +static struct rte_src **rte_src_global; +static uint rte_src_global_max = SRC_ID_INIT_SIZE; static void rte_src_init(void) { - src_domain = DOMAIN_NEW(attrs, "Route sources"); - src_pool = rp_new(&root_pool, &main_birdloop, "Route sources"); - rte_src_slab = sl_new(src_pool, sizeof(struct rte_src)); + rte_src_slab = sl_new(rta_pool, sizeof(struct rte_src)); + rte_src_global = mb_allocz(rta_pool, sizeof(struct rte_src *) * rte_src_global_max); - idm_init(&src_ids, src_pool, SRC_ID_INIT_SIZE); + idm_init(&src_ids, rta_pool, SRC_ID_INIT_SIZE); } HASH_DEFINE_REHASH_FN(RSH, struct rte_src) @@ -144,7 +235,7 @@ rt_get_source_o(struct rte_owner *p, u32 id) return src; } - SRC_LOCK; + RTA_LOCK; src = sl_allocz(rte_src_slab); src->owner = p; src->private_id = id; @@ -153,23 +244,37 @@ rt_get_source_o(struct rte_owner *p, u32 id) atomic_store_explicit(&src->uc, 1, memory_order_release); p->uc++; - HASH_INSERT2(p->hash, RSH, src_pool, src); + HASH_INSERT2(p->hash, RSH, rta_pool, src); if (config->table_debug) log(L_TRACE "Allocated new rte_src for %s, ID %uL %uG, have %u sources now", p->name, src->private_id, src->global_id, p->uc); - SRC_UNLOCK; + if (src->global_id >= rte_src_global_max) + { + rte_src_global = mb_realloc(rte_src_global, sizeof(struct rte_src *) * (rte_src_global_max *= 2)); + memset(&rte_src_global[rte_src_global_max / 2], 0, + sizeof(struct rte_src *) * (rte_src_global_max / 2)); + } + + rte_src_global[src->global_id] = src; + RTA_UNLOCK; return src; } +struct rte_src * +rt_find_source_global(u32 id) +{ + if (id >= rte_src_global_max) + return NULL; + else + return rte_src_global[id]; +} + static inline void rt_done_sources(struct rte_owner *o) { - if (o->stop->list) - ev_send(o->stop->list, o->stop); - else - ev_send(o->list, o->stop); + ev_send(o->list, o->stop); } void @@ -181,7 +286,7 @@ rt_prune_sources(void *data) { u64 uc; while ((uc = atomic_load_explicit(&src->uc, memory_order_acquire)) >> RTE_SRC_PU_SHIFT) - ; + synchronize_rcu(); if (uc == 0) { @@ -189,21 +294,22 @@ rt_prune_sources(void *data) HASH_DO_REMOVE(o->hash, RSH, sp); - SRC_LOCK; + RTA_LOCK; + rte_src_global[src->global_id] = NULL; idm_free(&src_ids, src->global_id); - sl_free(rte_src_slab, src); - SRC_UNLOCK; + sl_free(src); + RTA_UNLOCK; } } HASH_WALK_FILTER_END; - SRC_LOCK; - HASH_MAY_RESIZE_DOWN(o->hash, RSH, src_pool); + RTA_LOCK; + HASH_MAY_RESIZE_DOWN(o->hash, RSH, rta_pool); if (o->stop && !o->uc) { rfree(o->prune); - SRC_UNLOCK; + RTA_UNLOCK; if (config->table_debug) log(L_TRACE "All rte_src's for %s pruned, scheduling stop event", o->name); @@ -211,21 +317,21 @@ rt_prune_sources(void *data) rt_done_sources(o); } else - SRC_UNLOCK; + RTA_UNLOCK; } void rt_init_sources(struct rte_owner *o, const char *name, event_list *list) { - SRC_LOCK; - HASH_INIT(o->hash, src_pool, RSH_INIT_ORDER); + RTA_LOCK; + HASH_INIT(o->hash, rta_pool, RSH_INIT_ORDER); o->hash_key = random_u32(); o->uc = 0; o->name = name; - o->prune = ev_new_init(src_pool, rt_prune_sources, o); + o->prune = ev_new_init(rta_pool, rt_prune_sources, o); o->stop = NULL; o->list = list; - SRC_UNLOCK; + RTA_UNLOCK; } void @@ -238,9 +344,9 @@ rt_destroy_sources(struct rte_owner *o, event *done) if (config->table_debug) log(L_TRACE "Source owner %s destroy requested. All rte_src's already pruned, scheduling stop event", o->name); - SRC_LOCK; + RTA_LOCK; rfree(o->prune); - SRC_UNLOCK; + RTA_UNLOCK; rt_done_sources(o); } @@ -253,50 +359,10 @@ rt_destroy_sources(struct rte_owner *o, event *done) * Multipath Next Hop */ -static inline u32 -nexthop_hash(struct nexthop *x) -{ - u32 h = 0; - for (; x; x = x->next) - { - h ^= ipa_hash(x->gw) ^ (h << 5) ^ (h >> 9); - - for (int i = 0; i < x->labels; i++) - h ^= x->label[i] ^ (h << 6) ^ (h >> 7); - } - - return h; -} - -int -nexthop__same(struct nexthop *x, struct nexthop *y) -{ - for (; x && y; x = x->next, y = y->next) - { - if (!ipa_equal(x->gw, y->gw) || (x->iface != y->iface) || - (x->flags != y->flags) || (x->weight != y->weight) || - (x->labels_orig != y->labels_orig) || (x->labels != y->labels)) - return 0; - - for (int i = 0; i < x->labels; i++) - if (x->label[i] != y->label[i]) - return 0; - } - - return x == y; -} - static int nexthop_compare_node(const struct nexthop *x, const struct nexthop *y) { int r; - - if (!x) - return 1; - - if (!y) - return -1; - /* Should we also compare flags ? */ r = ((int) y->weight) - ((int) x->weight); @@ -321,23 +387,16 @@ nexthop_compare_node(const struct nexthop *x, const struct nexthop *y) return ((int) x->iface->index) - ((int) y->iface->index); } -static inline struct nexthop * -nexthop_copy_node(const struct nexthop *src, linpool *lp) +static int +nexthop_compare_qsort(const void *x, const void *y) { - struct nexthop *n = lp_alloc(lp, nexthop_size(src)); - - memcpy(n, src, nexthop_size(src)); - n->next = NULL; - - return n; + return nexthop_compare_node( *(const struct nexthop **) x, *(const struct nexthop **) y ); } /** * nexthop_merge - merge nexthop lists * @x: list 1 * @y: list 2 - * @rx: reusability of list @x - * @ry: reusability of list @y * @max: max number of nexthops * @lp: linpool for allocating nexthops * @@ -354,138 +413,229 @@ nexthop_copy_node(const struct nexthop *src, linpool *lp) * resulting list is no longer needed. When reusability is not set, the * corresponding lists are not modified nor linked from the resulting list. */ -struct nexthop * -nexthop_merge(struct nexthop *x, struct nexthop *y, int rx, int ry, int max, linpool *lp) +struct nexthop_adata * +nexthop_merge(struct nexthop_adata *xin, struct nexthop_adata *yin, int max, linpool *lp) { - struct nexthop *root = NULL; - struct nexthop **n = &root; + uint outlen = ADATA_SIZE(xin->ad.length) + ADATA_SIZE(yin->ad.length); + struct nexthop_adata *out = lp_alloc(lp, outlen); + out->ad.length = outlen - sizeof (struct adata); + + struct nexthop *x = &xin->nh, *y = &yin->nh, *cur = &out->nh; + int xvalid, yvalid; - while ((x || y) && max--) + while (max--) { - int cmp = nexthop_compare_node(x, y); + xvalid = NEXTHOP_VALID(x, xin); + yvalid = NEXTHOP_VALID(y, yin); + + if (!xvalid && !yvalid) + break; + + ASSUME(NEXTHOP_VALID(cur, out)); + + int cmp = !xvalid ? 1 : !yvalid ? -1 : nexthop_compare_node(x, y); if (cmp < 0) { - ASSUME(x); - *n = rx ? x : nexthop_copy_node(x, lp); - x = x->next; + ASSUME(NEXTHOP_VALID(x, xin)); + memcpy(cur, x, nexthop_size(x)); + x = NEXTHOP_NEXT(x); } else if (cmp > 0) { - ASSUME(y); - *n = ry ? y : nexthop_copy_node(y, lp); - y = y->next; + ASSUME(NEXTHOP_VALID(y, yin)); + memcpy(cur, y, nexthop_size(y)); + y = NEXTHOP_NEXT(y); } else { - ASSUME(x && y); - *n = rx ? x : (ry ? y : nexthop_copy_node(x, lp)); - x = x->next; - y = y->next; + ASSUME(NEXTHOP_VALID(x, xin)); + memcpy(cur, x, nexthop_size(x)); + x = NEXTHOP_NEXT(x); + + ASSUME(NEXTHOP_VALID(y, yin)); + y = NEXTHOP_NEXT(y); } - n = &((*n)->next); + cur = NEXTHOP_NEXT(cur); } - *n = NULL; - return root; + out->ad.length = (void *) cur - (void *) out->ad.data; + + return out; } -void -nexthop_insert(struct nexthop **n, struct nexthop *x) +struct nexthop_adata * +nexthop_sort(struct nexthop_adata *nhad, linpool *lp) { - for (; *n; n = &((*n)->next)) - { - int cmp = nexthop_compare_node(*n, x); + /* Count the nexthops */ + uint cnt = 0; + NEXTHOP_WALK(nh, nhad) + cnt++; - if (cmp < 0) - continue; - else if (cmp > 0) - break; - else - return; - } + if (cnt <= 1) + return nhad; - x->next = *n; - *n = x; -} + /* Get pointers to them */ + struct nexthop **sptr = tmp_alloc(cnt * sizeof(struct nexthop *)); -struct nexthop * -nexthop_sort(struct nexthop *x) -{ - struct nexthop *s = NULL; + uint i = 0; + NEXTHOP_WALK(nh, nhad) + sptr[i++] = nh; - /* Simple insert-sort */ - while (x) + /* Sort the pointers */ + qsort(sptr, cnt, sizeof(struct nexthop *), nexthop_compare_qsort); + + /* Allocate the output */ + struct nexthop_adata *out = (struct nexthop_adata *) lp_alloc_adata(lp, nhad->ad.length); + struct nexthop *dest = &out->nh; + + /* Deduplicate nexthops while storing them */ + for (uint i = 0; i < cnt; i++) { - struct nexthop *n = x; - x = n->next; - n->next = NULL; + if (i && !nexthop_compare_node(sptr[i], sptr[i-1])) + continue; - nexthop_insert(&s, n); + memcpy(dest, sptr[i], NEXTHOP_SIZE(sptr[i])); + dest = NEXTHOP_NEXT(dest); } - return s; + out->ad.length = (void *) dest - (void *) out->ad.data; + return out; } int -nexthop_is_sorted(struct nexthop *x) +nexthop_is_sorted(struct nexthop_adata *nhad) { - for (; x && x->next; x = x->next) - if (nexthop_compare_node(x, x->next) >= 0) + struct nexthop *prev = NULL; + NEXTHOP_WALK(nh, nhad) + { + if (prev && (nexthop_compare_node(prev, nh) >= 0)) return 0; + prev = nh; + } + return 1; } -static inline slab * -nexthop_slab(struct nexthop *nh) +/* + * Extended Attributes + */ + +#define EA_CLASS_INITIAL_MAX 128 +static struct ea_class **ea_class_global = NULL; +static uint ea_class_max; +static struct idm ea_class_idm; + +/* Config parser lex register function */ +void ea_lex_register(struct ea_class *def); +void ea_lex_unregister(struct ea_class *def); + +static void +ea_class_free(struct ea_class *cl) { - return nexthop_slab_[MIN(nh->labels, 3)]; + /* No more ea class references. Unregister the attribute. */ + idm_free(&ea_class_idm, cl->id); + ea_class_global[cl->id] = NULL; + if (!cl->hidden) + ea_lex_unregister(cl); } -static struct nexthop * -nexthop_copy(struct nexthop *o) +static void +ea_class_ref_free(resource *r) { - struct nexthop *first = NULL; - struct nexthop **last = &first; - - for (; o; o = o->next) - { - struct nexthop *n = sl_allocz(nexthop_slab(o)); - n->gw = o->gw; - n->iface = o->iface; - n->next = NULL; - n->flags = o->flags; - n->weight = o->weight; - n->labels_orig = o->labels_orig; - n->labels = o->labels; - for (int i=0; i<o->labels; i++) - n->label[i] = o->label[i]; - - *last = n; - last = &(n->next); - } + struct ea_class_ref *ref = SKIP_BACK(struct ea_class_ref, r, r); + if (!--ref->class->uc) + ea_class_free(ref->class); +} - return first; +static void +ea_class_ref_dump(resource *r, unsigned indent UNUSED) +{ + struct ea_class_ref *ref = SKIP_BACK(struct ea_class_ref, r, r); + debug("name \"%s\", type=%d\n", ref->class->name, ref->class->type); } +static struct resclass ea_class_ref_class = { + .name = "Attribute class reference", + .size = sizeof(struct ea_class_ref), + .free = ea_class_ref_free, + .dump = ea_class_ref_dump, + .lookup = NULL, + .memsize = NULL, +}; + static void -nexthop_free(struct nexthop *o) +ea_class_init(void) { - struct nexthop *n; + ASSERT_DIE(ea_class_global == NULL); - while (o) - { - n = o->next; - sl_free(nexthop_slab(o), o); - o = n; - } + idm_init(&ea_class_idm, rta_pool, EA_CLASS_INITIAL_MAX); + ea_class_global = mb_allocz(rta_pool, + sizeof(*ea_class_global) * (ea_class_max = EA_CLASS_INITIAL_MAX)); } +static struct ea_class_ref * +ea_ref_class(pool *p, struct ea_class *def) +{ + def->uc++; + struct ea_class_ref *ref = ralloc(p, &ea_class_ref_class); + ref->class = def; + return ref; +} -/* - * Extended Attributes - */ +static struct ea_class_ref * +ea_register(pool *p, struct ea_class *def) +{ + def->id = idm_alloc(&ea_class_idm); + + ASSERT_DIE(ea_class_global); + while (def->id >= ea_class_max) + ea_class_global = mb_realloc(ea_class_global, sizeof(*ea_class_global) * (ea_class_max *= 2)); + + ASSERT_DIE(def->id < ea_class_max); + ea_class_global[def->id] = def; + + if (!def->hidden) + ea_lex_register(def); + + return ea_ref_class(p, def); +} + +struct ea_class_ref * +ea_register_alloc(pool *p, struct ea_class cl) +{ + struct ea_class *clp = ea_class_find_by_name(cl.name); + if (clp && clp->type == cl.type) + return ea_ref_class(p, clp); + + uint namelen = strlen(cl.name) + 1; + + struct { + struct ea_class cl; + char name[0]; + } *cla = mb_alloc(rta_pool, sizeof(struct ea_class) + namelen); + cla->cl = cl; + memcpy(cla->name, cl.name, namelen); + cla->cl.name = cla->name; + + return ea_register(p, &cla->cl); +} + +void +ea_register_init(struct ea_class *clp) +{ + ASSERT_DIE(!ea_class_find_by_name(clp->name)); + ea_register(&root_pool, clp); +} + +struct ea_class * +ea_class_find_by_id(uint id) +{ + ASSERT_DIE(id < ea_class_max); + ASSERT_DIE(ea_class_global[id]); + return ea_class_global[id]; +} static inline eattr * ea__find(ea_list *e, unsigned id) @@ -530,12 +680,11 @@ ea__find(ea_list *e, unsigned id) * to its &eattr structure or %NULL if no such attribute exists. */ eattr * -ea_find(ea_list *e, unsigned id) +ea_find_by_id(ea_list *e, unsigned id) { eattr *a = ea__find(e, id & EA_CODE_MASK); - if (a && (a->type & EAF_TYPE_MASK) == EAF_TYPE_UNDEF && - !(id & EA_ALLOW_UNDEF)) + if (a && a->undef && !(id & EA_ALLOW_UNDEF)) return NULL; return a; } @@ -602,7 +751,7 @@ ea_walk(struct ea_walk_state *s, uint id, uint max) BIT32_SET(s->visited, n); - if ((a->type & EAF_TYPE_MASK) == EAF_TYPE_UNDEF) + if (a->undef) continue; s->eattrs = e; @@ -616,25 +765,6 @@ ea_walk(struct ea_walk_state *s, uint id, uint max) return NULL; } -/** - * ea_get_int - fetch an integer attribute - * @e: attribute list - * @id: attribute ID - * @def: default value - * - * This function is a shortcut for retrieving a value of an integer attribute - * by calling ea_find() to find the attribute, extracting its value or returning - * a provided default if no such attribute is present. - */ -uintptr_t -ea_get_int(ea_list *e, unsigned id, uintptr_t def) -{ - eattr *a = ea_find(e, id); - if (!a) - return def; - return a->u.data; -} - static inline void ea_do_sort(ea_list *e) { @@ -701,15 +831,18 @@ ea_do_prune(ea_list *e) s++; /* Now s0 is the most recent version, s[-1] the oldest one */ - /* Drop undefs */ - if ((s0->type & EAF_TYPE_MASK) == EAF_TYPE_UNDEF) + /* Drop undefs unless this is a true overlay */ + if (s0->undef && (s[-1].undef || !e->next)) continue; /* Copy the newest version to destination */ *d = *s0; /* Preserve info whether it originated locally */ - d->type = (d->type & ~(EAF_ORIGINATED|EAF_FRESH)) | (s[-1].type & EAF_ORIGINATED); + d->originated = s[-1].originated; + + /* Not fresh any more, we prefer surstroemming */ + d->fresh = 0; /* Next destination */ d++; @@ -729,21 +862,18 @@ ea_do_prune(ea_list *e) * If an attribute occurs multiple times in a single &ea_list, * ea_sort() leaves only the first (the only significant) occurrence. */ -void +static void ea_sort(ea_list *e) { - while (e) - { - if (!(e->flags & EALF_SORTED)) - { - ea_do_sort(e); - ea_do_prune(e); - e->flags |= EALF_SORTED; - } - if (e->count > 5) - e->flags |= EALF_BISECT; - e = e->next; - } + if (!(e->flags & EALF_SORTED)) + { + ea_do_sort(e); + ea_do_prune(e); + e->flags |= EALF_SORTED; + } + + if (e->count > 5) + e->flags |= EALF_BISECT; } /** @@ -753,8 +883,8 @@ ea_sort(ea_list *e) * This function calculates an upper bound of the size of * a given &ea_list after merging with ea_merge(). */ -unsigned -ea_scan(ea_list *e) +static unsigned +ea_scan(const ea_list *e, int overlay) { unsigned cnt = 0; @@ -762,6 +892,8 @@ ea_scan(ea_list *e) { cnt += e->count; e = e->next; + if (e && overlay && ea_is_cached(e)) + break; } return sizeof(ea_list) + sizeof(eattr)*cnt; } @@ -780,21 +912,36 @@ ea_scan(ea_list *e) * segments with ea_merge() and finally sort and prune the result * by calling ea_sort(). */ -void -ea_merge(ea_list *e, ea_list *t) +static void +ea_merge(ea_list *e, ea_list *t, int overlay) { eattr *d = t->attrs; t->flags = 0; t->count = 0; - t->next = NULL; + while (e) { memcpy(d, e->attrs, sizeof(eattr)*e->count); t->count += e->count; d += e->count; e = e->next; + + if (e && overlay && ea_is_cached(e)) + break; } + + t->next = e; +} + +ea_list * +ea_normalize(ea_list *e, int overlay) +{ + ea_list *t = tmp_alloc(ea_scan(e, overlay)); + ea_merge(e, t, overlay); + ea_sort(t); + + return t->count ? t : t->next; } /** @@ -812,7 +959,8 @@ ea_same(ea_list *x, ea_list *y) if (!x || !y) return x == y; - ASSERT(!x->next && !y->next); + if (x->next != y->next) + return 0; if (x->count != y->count) return 0; for(c=0; c<x->count; c++) @@ -823,39 +971,46 @@ ea_same(ea_list *x, ea_list *y) if (a->id != b->id || a->flags != b->flags || a->type != b->type || + a->originated != b->originated || + a->fresh != b->fresh || + a->undef != b->undef || ((a->type & EAF_EMBEDDED) ? a->u.data != b->u.data : !adata_same(a->u.ptr, b->u.ptr))) return 0; } return 1; } -static inline ea_list * -ea_list_copy(ea_list *o) +uint +ea_list_size(ea_list *o) { - ea_list *n; - unsigned i, adpos, elen; + unsigned i, elen; - if (!o) - return NULL; - ASSERT(!o->next); - elen = adpos = sizeof(ea_list) + sizeof(eattr) * o->count; + ASSERT_DIE(o); + elen = BIRD_CPU_ALIGN(sizeof(ea_list) + sizeof(eattr) * o->count); for(i=0; i<o->count; i++) { eattr *a = &o->attrs[i]; - if (!(a->type & EAF_EMBEDDED)) - elen += sizeof(struct adata) + a->u.ptr->length; + if (!a->undef && !(a->type & EAF_EMBEDDED)) + elen += ADATA_SIZE(a->u.ptr->length); } - n = mb_alloc(rta_pool, elen); + return elen; +} + +void +ea_list_copy(ea_list *n, ea_list *o, uint elen) +{ + uint adpos = sizeof(ea_list) + sizeof(eattr) * o->count; memcpy(n, o, adpos); - n->flags |= EALF_CACHED; - for(i=0; i<o->count; i++) + adpos = BIRD_CPU_ALIGN(adpos); + + for(uint i=0; i<o->count; i++) { eattr *a = &n->attrs[i]; - if (!(a->type & EAF_EMBEDDED)) + if (!a->undef && !(a->type & EAF_EMBEDDED)) { - unsigned size = sizeof(struct adata) + a->u.ptr->length; + unsigned size = ADATA_SIZE(a->u.ptr->length); ASSERT_DIE(adpos + size <= elen); struct adata *d = ((void *) n) + adpos; @@ -865,30 +1020,58 @@ ea_list_copy(ea_list *o) adpos += size; } } + ASSERT_DIE(adpos == elen); - return n; } -static inline void -ea_free(ea_list *o) +static void +ea_list_ref(ea_list *l) { - if (o) + for(uint i=0; i<l->count; i++) { - ASSERT(!o->next); - mb_free(o); + eattr *a = &l->attrs[i]; + ASSERT_DIE(a->id < ea_class_max); + + if (a->undef) + continue; + + struct ea_class *cl = ea_class_global[a->id]; + ASSERT_DIE(cl && cl->uc); + + CALL(cl->stored, a); + cl->uc++; } + + if (l->next) + { + ASSERT_DIE(ea_is_cached(l->next)); + ea_clone(l->next); + } } -static int -get_generic_attr(const eattr *a, byte **buf, int buflen UNUSED) +static void ea_free_nested(ea_list *l); + +static void +ea_list_unref(ea_list *l) { - if (a->id == EA_GEN_IGP_METRIC) + for(uint i=0; i<l->count; i++) { - *buf += bsprintf(*buf, "igp_metric"); - return GA_NAME; + eattr *a = &l->attrs[i]; + ASSERT_DIE(a->id < ea_class_max); + + if (a->undef) + continue; + + struct ea_class *cl = ea_class_global[a->id]; + ASSERT_DIE(cl && cl->uc); + + CALL(cl->freed, a); + if (!--cl->uc) + ea_class_free(cl); } - return GA_UNKNOWN; + if (l->next) + ea_free_nested(l->next); } void @@ -941,41 +1124,90 @@ opaque_format(const struct adata *ad, byte *buf, uint size) } static inline void -ea_show_int_set(struct cli *c, const struct adata *ad, int way, byte *pos, byte *buf, byte *end) +ea_show_int_set(struct cli *c, const char *name, const struct adata *ad, int way, byte *buf) { - int i = int_set_format(ad, way, 0, pos, end - pos); - cli_printf(c, -1012, "\t%s", buf); + int nlen = strlen(name); + int i = int_set_format(ad, way, 0, buf, CLI_MSG_SIZE - nlen - 3); + cli_printf(c, -1012, "\t%s: %s", name, buf); while (i) { - i = int_set_format(ad, way, i, buf, end - buf - 1); + i = int_set_format(ad, way, i, buf, CLI_MSG_SIZE - 1); cli_printf(c, -1012, "\t\t%s", buf); } } static inline void -ea_show_ec_set(struct cli *c, const struct adata *ad, byte *pos, byte *buf, byte *end) +ea_show_ec_set(struct cli *c, const char *name, const struct adata *ad, byte *buf) { - int i = ec_set_format(ad, 0, pos, end - pos); - cli_printf(c, -1012, "\t%s", buf); + int nlen = strlen(name); + int i = ec_set_format(ad, 0, buf, CLI_MSG_SIZE - nlen - 3); + cli_printf(c, -1012, "\t%s: %s", name, buf); while (i) { - i = ec_set_format(ad, i, buf, end - buf - 1); + i = ec_set_format(ad, i, buf, CLI_MSG_SIZE - 1); cli_printf(c, -1012, "\t\t%s", buf); } } static inline void -ea_show_lc_set(struct cli *c, const struct adata *ad, byte *pos, byte *buf, byte *end) +ea_show_lc_set(struct cli *c, const char *name, const struct adata *ad, byte *buf) { - int i = lc_set_format(ad, 0, pos, end - pos); - cli_printf(c, -1012, "\t%s", buf); + int nlen = strlen(name); + int i = lc_set_format(ad, 0, buf, CLI_MSG_SIZE - nlen - 3); + cli_printf(c, -1012, "\t%s: %s", name, buf); while (i) { - i = lc_set_format(ad, i, buf, end - buf - 1); + i = lc_set_format(ad, i, buf, CLI_MSG_SIZE - 1); cli_printf(c, -1012, "\t\t%s", buf); } } +void +ea_show_nexthop_list(struct cli *c, struct nexthop_adata *nhad) +{ + if (!NEXTHOP_IS_REACHABLE(nhad)) + return; + + NEXTHOP_WALK(nh, nhad) + { + char mpls[MPLS_MAX_LABEL_STACK*12 + 5], *lsp = mpls; + char *onlink = (nh->flags & RNF_ONLINK) ? " onlink" : ""; + char weight[16] = ""; + + if (nh->labels) + { + lsp += bsprintf(lsp, " mpls %d", nh->label[0]); + for (int i=1;i<nh->labels; i++) + lsp += bsprintf(lsp, "/%d", nh->label[i]); + } + *lsp = '\0'; + + if (!NEXTHOP_ONE(nhad)) + bsprintf(weight, " weight %d", nh->weight + 1); + + if (ipa_nonzero(nh->gw)) + if (nh->iface) + cli_printf(c, -1007, "\tvia %I on %s%s%s%s", + nh->gw, nh->iface->name, mpls, onlink, weight); + else + cli_printf(c, -1007, "\tvia %I", nh->gw); + else + cli_printf(c, -1007, "\tdev %s%s%s", + nh->iface->name, mpls, onlink, weight); + } +} + +void +ea_show_hostentry(const struct adata *ad, byte *buf, uint size) +{ + const struct hostentry_adata *had = (const struct hostentry_adata *) ad; + + if (ipa_nonzero(had->he->link) && !ipa_equal(had->he->link, had->he->addr)) + bsnprintf(buf, size, "via %I %I table %s", had->he->addr, had->he->link, had->he->tab->name); + else + bsnprintf(buf, size, "via %I table %s", had->he->addr, had->he->tab->name); +} + /** * ea_show - print an &eattr to CLI * @c: destination CLI @@ -987,79 +1219,80 @@ ea_show_lc_set(struct cli *c, const struct adata *ad, byte *pos, byte *buf, byte * If the protocol defining the attribute provides its own * get_attr() hook, it's consulted first. */ -void +static void ea_show(struct cli *c, const eattr *e) { - struct protocol *p; - int status = GA_UNKNOWN; const struct adata *ad = (e->type & EAF_EMBEDDED) ? NULL : e->u.ptr; byte buf[CLI_MSG_SIZE]; byte *pos = buf, *end = buf + sizeof(buf); - if (EA_IS_CUSTOM(e->id)) - { - const char *name = ea_custom_name(e->id); - if (name) - { - pos += bsprintf(pos, "%s", name); - status = GA_NAME; - } - else - pos += bsprintf(pos, "%02x.", EA_PROTO(e->id)); - } - else if (p = class_to_protocol[EA_PROTO(e->id)]) - { - pos += bsprintf(pos, "%s.", p->name); - if (p->get_attr) - status = p->get_attr(e, pos, end - pos); - pos += strlen(pos); - } - else if (EA_PROTO(e->id)) - pos += bsprintf(pos, "%02x.", EA_PROTO(e->id)); + ASSERT_DIE(e->id < ea_class_max); + + struct ea_class *cls = ea_class_global[e->id]; + ASSERT_DIE(cls); + + if (e->undef || cls->hidden) + return; + else if (cls->format) + cls->format(e, buf, end - buf); else - status = get_generic_attr(e, &pos, end - pos); + switch (e->type) + { + case T_INT: + if ((cls == &ea_gen_igp_metric) && e->u.data >= IGP_METRIC_UNKNOWN) + return; - if (status < GA_NAME) - pos += bsprintf(pos, "%02x", EA_ID(e->id)); - if (status < GA_FULL) - { - *pos++ = ':'; - *pos++ = ' '; - switch (e->type & EAF_TYPE_MASK) - { - case EAF_TYPE_INT: bsprintf(pos, "%u", e->u.data); break; - case EAF_TYPE_OPAQUE: + case T_OPAQUE: opaque_format(ad, pos, end - pos); break; - case EAF_TYPE_IP_ADDRESS: + case T_IP: bsprintf(pos, "%I", *(ip_addr *) ad->data); break; - case EAF_TYPE_ROUTER_ID: + case T_QUAD: bsprintf(pos, "%R", e->u.data); break; - case EAF_TYPE_AS_PATH: + case T_PATH: as_path_format(ad, pos, end - pos); break; - case EAF_TYPE_BITFIELD: - bsprintf(pos, "%08x", e->u.data); - break; - case EAF_TYPE_INT_SET: - ea_show_int_set(c, ad, 1, pos, buf, end); + case T_CLIST: + ea_show_int_set(c, cls->name, ad, 1, buf); return; - case EAF_TYPE_EC_SET: - ea_show_ec_set(c, ad, pos, buf, end); + case T_ECLIST: + ea_show_ec_set(c, cls->name, ad, buf); return; - case EAF_TYPE_LC_SET: - ea_show_lc_set(c, ad, pos, buf, end); + case T_LCLIST: + ea_show_lc_set(c, cls->name, ad, buf); return; - case EAF_TYPE_UNDEF: + case T_NEXTHOP_LIST: + ea_show_nexthop_list(c, (struct nexthop_adata *) e->u.ptr); + return; + case T_HOSTENTRY: + ea_show_hostentry(ad, pos, end - pos); + break; default: bsprintf(pos, "<type %02x>", e->type); - } + } + + cli_printf(c, -1012, "\t%s: %s", cls->name, buf); +} + +static void +nexthop_dump(const struct adata *ad) +{ + struct nexthop_adata *nhad = (struct nexthop_adata *) ad; + + debug(":"); + + NEXTHOP_WALK(nh, nhad) + { + if (ipa_nonzero(nh->gw)) debug(" ->%I", nh->gw); + if (nh->labels) debug(" L %d", nh->label[0]); + for (int i=1; i<nh->labels; i++) + debug("/%d", nh->label[i]); + debug(" [%s]", nh->iface ? nh->iface->name : "???"); } - cli_printf(c, -1012, "\t%s", buf); } /** @@ -1081,19 +1314,26 @@ ea_dump(ea_list *e) } while (e) { - debug("[%c%c%c]", + struct ea_storage *s = ea_is_cached(e) ? ea_get_storage(e) : NULL; + debug("[%c%c%c] uc=%d h=%08x", (e->flags & EALF_SORTED) ? 'S' : 's', (e->flags & EALF_BISECT) ? 'B' : 'b', - (e->flags & EALF_CACHED) ? 'C' : 'c'); + (e->flags & EALF_CACHED) ? 'C' : 'c', + s ? s->uc : 0, s ? s->hash_key : 0); for(i=0; i<e->count; i++) { eattr *a = &e->attrs[i]; - debug(" %02x:%02x.%02x", EA_PROTO(a->id), EA_ID(a->id), a->flags); - debug("=%c", "?iO?I?P???S?????" [a->type & EAF_TYPE_MASK]); - if (a->type & EAF_ORIGINATED) + debug(" %04x.%02x", a->id, a->flags); + debug("=%c", + "?iO?IRP???S??pE?" + "??L???N?????????" + "?o???r??????????" [a->type]); + if (a->originated) debug("o"); if (a->type & EAF_EMBEDDED) debug(":%08x", a->u.data); + else if (a->id == ea_gen_nexthop.id) + nexthop_dump(a->u.ptr); else { int j, len = a->u.ptr->length; @@ -1123,10 +1363,13 @@ ea_hash(ea_list *e) if (e) /* Assuming chain of length 1 */ { + h ^= mem_hash(&e->next, sizeof(e->next)); for(i=0; i<e->count; i++) { struct eattr *a = &e->attrs[i]; h ^= a->id; h *= mul; + if (a->undef) + continue; if (a->type & EAF_EMBEDDED) h ^= a->u.data; else @@ -1166,163 +1409,52 @@ ea_append(ea_list *to, ea_list *what) * rta's */ -static DOMAIN(attrs) attrs_domain; - -#define RTA_LOCK LOCK_DOMAIN(attrs, attrs_domain) -#define RTA_UNLOCK UNLOCK_DOMAIN(attrs, attrs_domain) +static uint rta_cache_count; +static uint rta_cache_size = 32; +static uint rta_cache_limit; +static uint rta_cache_mask; +static struct ea_storage **rta_hash_table; -struct rta_cache { - u32 count; - u32 size; - u32 limit; - u32 mask; - rta * _Atomic table[0]; -} * _Atomic rta_cache; -// rta_aux, rta_cache = { .size = ATOMIC_VAR_INIT(32), }; - -static struct rta_cache * -rta_alloc_hash(u32 size) -{ - struct rta_cache *c = mb_allocz(rta_pool, sizeof(struct rta_cache) + sizeof(rta * _Atomic) * size); - c->size = size; - c->limit = (size >> 20) ? (~0U) : (size * 2); - c->mask = size - 1; - return c; -} - -static inline uint -rta_hash(rta *a) -{ - u64 h; - mem_hash_init(&h); -#define MIX(f) mem_hash_mix(&h, &(a->f), sizeof(a->f)); -#define BMIX(f) mem_hash_mix_num(&h, a->f); - MIX(hostentry); - MIX(from); - MIX(igp_metric); - BMIX(source); - BMIX(scope); - BMIX(dest); - MIX(pref); -#undef MIX - - return mem_hash_value(&h) ^ nexthop_hash(&(a->nh)) ^ ea_hash(a->eattrs); -} - -static inline int -rta_same(rta *x, rta *y) -{ - return (x->source == y->source && - x->scope == y->scope && - x->dest == y->dest && - x->igp_metric == y->igp_metric && - ipa_equal(x->from, y->from) && - x->hostentry == y->hostentry && - nexthop_same(&(x->nh), &(y->nh)) && - ea_same(x->eattrs, y->eattrs)); -} - -static inline slab * -rta_slab(rta *a) -{ - return rta_slab_[a->nh.labels > 2 ? 3 : a->nh.labels]; -} - -static rta * -rta_copy(rta *o) +static void +rta_alloc_hash(void) { - rta *r = sl_alloc(rta_slab(o)); - - memcpy(r, o, rta_size(o)); - r->uc = 1; - r->nh.next = nexthop_copy(o->nh.next); - r->eattrs = ea_list_copy(o->eattrs); - return r; + rta_hash_table = mb_allocz(rta_pool, sizeof(struct ea_storage *) * rta_cache_size); + if (rta_cache_size < 32768) + rta_cache_limit = rta_cache_size * 2; + else + rta_cache_limit = ~0; + rta_cache_mask = rta_cache_size - 1; } static inline void -rta_insert(rta *r, struct rta_cache *c) +rta_insert(struct ea_storage *r) { - uint h = r->hash_key & c->mask; - rta *next = atomic_load_explicit(&c->table[h], memory_order_relaxed); - - atomic_store_explicit(&r->next, next, memory_order_relaxed); - r->pprev = &c->table[h]; - - if (next) - next->pprev = &r->next; - - /* This store MUST be the last and MUST have release order for thread-safety */ - atomic_store_explicit(&c->table[h], r, memory_order_release); + uint h = r->hash_key & rta_cache_mask; + r->next_hash = rta_hash_table[h]; + if (r->next_hash) + r->next_hash->pprev_hash = &r->next_hash; + r->pprev_hash = &rta_hash_table[h]; + rta_hash_table[h] = r; } static void -rta_rehash(struct rta_cache *c) +rta_rehash(void) { - u32 os = c->size; - - struct rta_cache *nc = rta_alloc_hash(os * 2); - nc->count = c->count; - - /* First we simply copy every chain to both new locations */ - for (u32 h = 0; h < os; h++) - { - rta *r = atomic_load_explicit(&c->table[h], memory_order_relaxed); - atomic_store_explicit(&nc->table[h], r, memory_order_relaxed); - atomic_store_explicit(&nc->table[h + os], r, memory_order_relaxed); - } - - /* Then we exchange the hashes; release semantics forces the previous code to be already done */ - atomic_store_explicit(&rta_cache, nc, memory_order_release); - - /* And now we pass through both chains and filter them */ - for (u32 h = 0; h < c->size; h++) - { - rta * _Atomic * ap = &nc->table[h]; - rta * _Atomic * bp = &nc->table[h + os]; - - rta *r = atomic_load_explicit(ap, memory_order_relaxed); - ASSERT_DIE(r == atomic_load_explicit(bp, memory_order_relaxed)); - - while (r) - { - if (r->hash_key & os) - { - r->pprev = bp; - atomic_store_explicit(bp, r, memory_order_release); - bp = &r->next; - } - else + uint ohs = rta_cache_size; + uint h; + struct ea_storage *r, *n; + struct ea_storage **oht = rta_hash_table; + + rta_cache_size = 2*rta_cache_size; + DBG("Rehashing rta cache from %d to %d entries.\n", ohs, rta_cache_size); + rta_alloc_hash(); + for(h=0; h<ohs; h++) + for(r=oht[h]; r; r=n) { - r->pprev = ap; - atomic_store_explicit(ap, r, memory_order_release); - ap = &r->next; + n = r->next_hash; + rta_insert(r); } - - r = atomic_load_explicit(&r->next, memory_order_acquire); - } - - atomic_store_explicit(ap, NULL, memory_order_release); - atomic_store_explicit(bp, NULL, memory_order_release); - } - - synchronize_rcu(); - mb_free(c); -} - -static rta * -rta_find(rta *o, u32 h, struct rta_cache *c) -{ - rta *r = NULL; - - for (r = atomic_load_explicit(&c->table[h & c->mask], memory_order_acquire); r; r = atomic_load_explicit(&r->next, memory_order_acquire)) - if (r->hash_key == h && rta_same(r, o)) - { - atomic_fetch_add_explicit(&r->uc, 1, memory_order_acq_rel); - return r; - } - - return NULL; + mb_free(oht); } /** @@ -1338,173 +1470,89 @@ rta_find(rta *o, u32 h, struct rta_cache *c) * The extended attribute lists attached to the &rta are automatically * converted to the normalized form. */ -rta * -rta_lookup(rta *o) +ea_list * +ea_lookup(ea_list *o, int overlay) { - rta *r; + struct ea_storage *r; uint h; - ASSERT(!o->cached); - if (o->eattrs) - ea_normalize(o->eattrs); - - h = rta_hash(o); - - /* Lockless lookup */ - rcu_read_lock(); - r = rta_find(o, h, atomic_load_explicit(&rta_cache, memory_order_acquire)); - rcu_read_unlock(); - - if (r) - return r; - - RTA_LOCK; - - /* Locked lookup to avoid duplicates if possible */ - struct rta_cache *c = atomic_load_explicit(&rta_cache, memory_order_acquire); - r = rta_find(o, h, c); - if (r) - { - RTA_UNLOCK; - return r; - } - - /* Store the rta */ - r = rta_copy(o); - r->hash_key = h; - r->cached = 1; - rt_lock_hostentry(r->hostentry); - rta_insert(r, c); - - if (++c->count > c->limit) - rta_rehash(c); - - RTA_UNLOCK; - return r; -} - -static void -rta_cleanup(void *data UNUSED) -{ - u32 count = 0; - rta *ax[RTA_OBSOLETE_LIMIT]; + ASSERT(!ea_is_cached(o)); + o = ea_normalize(o, overlay); + h = ea_hash(o); RTA_LOCK; - struct rta_cache *c = atomic_load_explicit(&rta_cache, memory_order_acquire); - for(u32 h=0; h<c->size; h++) - for(rta *a = atomic_load_explicit(&c->table[h], memory_order_acquire), *next; - a; - a = next) + for(r=rta_hash_table[h & rta_cache_mask]; r; r=r->next_hash) + if (r->hash_key == h && ea_same(r->l, o)) { - next = atomic_load_explicit(&a->next, memory_order_acquire); - if (atomic_load_explicit(&a->uc, memory_order_acquire) > 0) - continue; - - /* Check if the cleanup fits in the buffer */ - if (count == RTA_OBSOLETE_LIMIT) - { - ev_send(&global_work_list, &rta_cleanup_event); - goto wait; - } - - /* Relink the forward pointer */ - atomic_store_explicit(a->pprev, next, memory_order_release); - - /* Relink the backwards pointer */ - if (next) - next->pprev = a->pprev; + atomic_fetch_add_explicit(&r->uc, 1, memory_order_acq_rel); + RTA_UNLOCK; + return r->l; + } - /* Store for freeing and go to the next */ - ax[count++] = a; - a = next; + uint elen = ea_list_size(o); + uint sz = elen + sizeof(struct ea_storage); + for (uint i=0; i<ARRAY_SIZE(ea_slab_sizes); i++) + if (sz <= ea_slab_sizes[i]) + { + r = sl_alloc(ea_slab[i]); + break; } -wait: - /* Wait until nobody knows about us */ - synchronize_rcu(); + int huge = r ? 0 : EALF_HUGE;; + if (huge) + r = mb_alloc(rta_pool, sz); - u32 freed = 0; + ea_list_copy(r->l, o, elen); + ea_list_ref(r->l); - for (u32 i=0; i<count; i++) - { - rta *a = ax[i]; - /* Acquired inbetween, relink back */ - if (atomic_load_explicit(&a->uc, memory_order_acquire)) - { - rta_insert(a, c); - continue; - } + r->l->flags |= EALF_CACHED | huge; + r->hash_key = h; + r->uc = 1; - /* Cleared to free the memory */ - rt_unlock_hostentry(a->hostentry); - if (a->nh.next) - nexthop_free(a->nh.next); - ea_free(a->eattrs); - a->cached = 0; - c->count--; - sl_free(rta_slab(a), a); - freed++; - } + rta_insert(r); - atomic_fetch_sub_explicit(&rta_obsolete_count, freed, memory_order_release); + if (++rta_cache_count > rta_cache_limit) + rta_rehash(); RTA_UNLOCK; + return r->l; } -_Atomic u32 rta_obsolete_count; -event rta_cleanup_event = { .hook = rta_cleanup, .list = &global_work_list }; +static void +ea_free_locked(struct ea_storage *a) +{ + /* Somebody has cloned this rta inbetween. This sometimes happens. */ + if (atomic_load_explicit(&a->uc, memory_order_acquire)) + return; + + ASSERT(rta_cache_count); + rta_cache_count--; + *a->pprev_hash = a->next_hash; + if (a->next_hash) + a->next_hash->pprev_hash = a->pprev_hash; + + ea_list_unref(a->l); + if (a->l->flags & EALF_HUGE) + mb_free(a); + else + sl_free(a); +} -rta * -rta_do_cow(rta *o, linpool *lp) +static void +ea_free_nested(struct ea_list *l) { - rta *r = lp_alloc(lp, rta_size(o)); - memcpy(r, o, rta_size(o)); - for (struct nexthop **nhn = &(r->nh.next), *nho = o->nh.next; nho; nho = nho->next) - { - *nhn = lp_alloc(lp, nexthop_size(nho)); - memcpy(*nhn, nho, nexthop_size(nho)); - nhn = &((*nhn)->next); - } - rta_uncache(r); - return r; + struct ea_storage *r = ea_get_storage(l); + if (1 == atomic_fetch_sub_explicit(&r->uc, 1, memory_order_acq_rel)) + ea_free_locked(r); } -/** - * rta_dump - dump route attributes - * @a: attribute structure to dump - * - * This function takes a &rta and dumps its contents to the debug output. - */ void -rta_dump(const rta *a) +ea__free(struct ea_storage *a) { - static char *rts[] = { "", "RTS_STATIC", "RTS_INHERIT", "RTS_DEVICE", - "RTS_STAT_DEV", "RTS_REDIR", "RTS_RIP", - "RTS_OSPF", "RTS_OSPF_IA", "RTS_OSPF_EXT1", - "RTS_OSPF_EXT2", "RTS_BGP", "RTS_PIPE", "RTS_BABEL" }; - static char *rtd[] = { "", " DEV", " HOLE", " UNREACH", " PROHIBIT" }; - - debug("pref=%d uc=%d %s %s%s h=%04x", - a->pref, a->uc, rts[a->source], ip_scope_text(a->scope), - rtd[a->dest], a->hash_key); - if (!a->cached) - debug(" !CACHED"); - debug(" <-%I", a->from); - if (a->dest == RTD_UNICAST) - for (const struct nexthop *nh = &(a->nh); nh; nh = nh->next) - { - if (ipa_nonzero(nh->gw)) debug(" ->%I", nh->gw); - if (nh->labels) debug(" L %d", nh->label[0]); - for (int i=1; i<nh->labels; i++) - debug("/%d", nh->label[i]); - debug(" [%s]", nh->iface ? nh->iface->name : "???"); - } - if (a->eattrs) - { - debug(" EA: "); - ea_dump(a->eattrs); - } + RTA_LOCK; + ea_free_locked(a); + RTA_UNLOCK; } /** @@ -1514,23 +1562,16 @@ rta_dump(const rta *a) * to the debug output. */ void -rta_dump_all(void) +ea_dump_all(void) { - rta *a; - uint h; - RTA_LOCK; - struct rta_cache *c = atomic_load_explicit(&rta_cache, memory_order_acquire); - - debug("Route attribute cache (%d entries, rehash at %d):\n", c->count, c->limit); - for(h=0; h<c->size; h++) - for(a = atomic_load_explicit(&c->table[h], memory_order_acquire); - a; - a = atomic_load_explicit(&a->next, memory_order_acquire)) + debug("Route attribute cache (%d entries, rehash at %d):\n", rta_cache_count, rta_cache_limit); + for (uint h=0; h < rta_cache_size; h++) + for (struct ea_storage *a = rta_hash_table[h]; a; a = a->next_hash) { debug("%p ", a); - rta_dump(a); + ea_dump(a->l); debug("\n"); } debug("\n"); @@ -1539,13 +1580,11 @@ rta_dump_all(void) } void -rta_show(struct cli *c, const rta *a) +ea_show_list(struct cli *c, ea_list *eal) { - cli_printf(c, -1008, "\tType: %s %s", rta_src_names[a->source], ip_scope_text(a->scope)); - - for(ea_list *eal = a->eattrs; eal; eal=eal->next) - for(int i=0; i<eal->count; i++) - ea_show(c, &eal->attrs[i]); + ea_list *n = ea_normalize(eal, 0); + for (int i =0; i < n->count; i++) + ea_show(c, &n->attrs[i]); } /** @@ -1559,20 +1598,25 @@ rta_init(void) { attrs_domain = DOMAIN_NEW(attrs, "Attributes"); - rta_pool = rp_new(&root_pool, &main_birdloop, "Attributes"); - - rta_slab_[0] = sl_new(rta_pool, sizeof(rta)); - rta_slab_[1] = sl_new(rta_pool, sizeof(rta) + sizeof(u32)); - rta_slab_[2] = sl_new(rta_pool, sizeof(rta) + sizeof(u32)*2); - rta_slab_[3] = sl_new(rta_pool, sizeof(rta) + sizeof(u32)*MPLS_MAX_LABEL_STACK); + rta_pool = rp_new(&root_pool, "Attributes"); - nexthop_slab_[0] = sl_new(rta_pool, sizeof(struct nexthop)); - nexthop_slab_[1] = sl_new(rta_pool, sizeof(struct nexthop) + sizeof(u32)); - nexthop_slab_[2] = sl_new(rta_pool, sizeof(struct nexthop) + sizeof(u32)*2); - nexthop_slab_[3] = sl_new(rta_pool, sizeof(struct nexthop) + sizeof(u32)*MPLS_MAX_LABEL_STACK); + for (uint i=0; i<ARRAY_SIZE(ea_slab_sizes); i++) + ea_slab[i] = sl_new(rta_pool, ea_slab_sizes[i]); - atomic_store_explicit(&rta_cache, rta_alloc_hash(32), memory_order_relaxed); + rta_alloc_hash(); rte_src_init(); + ea_class_init(); + + /* These attributes are required to be first for nice "show route" output */ + ea_register_init(&ea_gen_nexthop); + ea_register_init(&ea_gen_hostentry); + + /* Other generic route attributes */ + ea_register_init(&ea_gen_preference); + ea_register_init(&ea_gen_igp_metric); + ea_register_init(&ea_gen_from); + ea_register_init(&ea_gen_source); + ea_register_init(&ea_gen_flowspec_valid); } /* diff --git a/nest/rt-dev.c b/nest/rt-dev.c index c1251675..8ae563b5 100644 --- a/nest/rt-dev.c +++ b/nest/rt-dev.c @@ -18,7 +18,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/rt-dev.h" #include "conf/conf.h" #include "lib/resource.h" @@ -80,16 +80,18 @@ dev_ifa_notify(struct proto *P, uint flags, struct ifa *ad) /* Use iface ID as local source ID */ struct rte_src *src = rt_get_source(P, ad->iface->index); - rta a0 = { - .pref = c->preference, - .source = RTS_DEVICE, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, - .nh.iface = ad->iface, + ea_list *ea = NULL; + struct nexthop_adata nhad = { + .nh = { .iface = ad->iface, }, + .ad = { .length = (void *) NEXTHOP_NEXT(&nhad.nh) - (void *) nhad.ad.data, }, }; + ea_set_attr_u32(&ea, &ea_gen_preference, 0, c->preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_DEVICE); + ea_set_attr_data(&ea, &ea_gen_nexthop, 0, nhad.ad.data, nhad.ad.length); + rte e0 = { - .attrs = rta_lookup(&a0), + .attrs = ea, .src = src, }; @@ -143,8 +145,8 @@ dev_init(struct proto_config *CF) proto_configure_channel(P, &p->ip4_channel, cf->ip4_channel); proto_configure_channel(P, &p->ip6_channel, cf->ip6_channel); - P->if_notify = dev_if_notify; - P->ifa_notify = dev_ifa_notify; + P->iface_sub.if_notify = dev_if_notify; + P->iface_sub.ifa_notify = dev_ifa_notify; return P; } @@ -186,7 +188,6 @@ dev_copy_config(struct proto_config *dest, struct proto_config *src) struct protocol proto_device = { .name = "Direct", .template = "direct%d", - .class = PROTOCOL_DIRECT, .preference = DEF_PREF_DIRECT, .channel_mask = NB_IP | NB_IP6_SADR, .proto_size = sizeof(struct rt_dev_proto), @@ -196,3 +197,9 @@ struct protocol proto_device = { .reconfigure = dev_reconfigure, .copy_config = dev_copy_config }; + +void +dev_build(void) +{ + proto_build(&proto_device); +} diff --git a/nest/rt-fib.c b/nest/rt-fib.c index a7f70371..801561da 100644 --- a/nest/rt-fib.c +++ b/nest/rt-fib.c @@ -55,7 +55,7 @@ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "lib/string.h" /* @@ -331,7 +331,7 @@ fib_get(struct fib *f, const net_addr *a) memset(b, 0, f->node_offset); if (f->init) - f->init(b); + f->init(f, b); if (f->entries++ > f->entries_max) fib_rehash(f, HASH_HI_STEP); @@ -475,7 +475,7 @@ fib_delete(struct fib *f, void *E) } if (f->fib_slab) - sl_free(f->fib_slab, E); + sl_free(E); else mb_free(E); diff --git a/nest/rt-show.c b/nest/rt-show.c index 65b59af4..a5c7dc8f 100644 --- a/nest/rt-show.c +++ b/nest/rt-show.c @@ -10,123 +10,92 @@ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/cli.h" #include "nest/iface.h" #include "filter/filter.h" +#include "filter/data.h" #include "sysdep/unix/krt.h" static void -rt_show_table(struct cli *c, struct rt_show_data *d) +rt_show_table(struct rt_show_data *d) { + struct cli *c = d->cli; + /* No table blocks in 'show route count' */ if (d->stats == 2) return; if (d->last_table) cli_printf(c, -1007, ""); - cli_printf(c, -1007, "Table %s:", d->tab->table->name); + cli_printf(c, -1007, "Table %s:", + d->tab->name); d->last_table = d->tab; } -static inline struct krt_proto * -rt_show_get_kernel(struct rt_show_data *d) -{ - struct proto_config *krt = d->tab->table->config->krt_attached; - return krt ? (struct krt_proto *) krt->proto : NULL; -} - static void rt_show_rte(struct cli *c, byte *ia, rte *e, struct rt_show_data *d, int primary) { byte from[IPA_MAX_TEXT_LENGTH+8]; byte tm[TM_DATETIME_BUFFER_SIZE], info[256]; - rta *a = e->attrs; - int sync_error = d->kernel ? krt_get_sync_error(d->kernel, e) : 0; - void (*get_route_info)(struct rte *, byte *buf); - struct nexthop *nh; + ea_list *a = e->attrs; + int sync_error = d->tab->kernel ? krt_get_sync_error(d->tab->kernel, e) : 0; + void (*get_route_info)(const rte *, byte *buf); + eattr *nhea = net_type_match(e->net, NB_DEST) ? + ea_find(a, &ea_gen_nexthop) : NULL; + struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhad ? (NEXTHOP_IS_REACHABLE(nhad) ? RTD_UNICAST : nhad->dest) : RTD_NONE; + int flowspec_valid = net_is_flow(e->net) ? rt_get_flowspec_valid(e) : FLOWSPEC_UNKNOWN; tm_format_time(tm, &config->tf_route, e->lastmod); - if (ipa_nonzero(a->from) && !ipa_equal(a->from, a->nh.gw)) - bsprintf(from, " from %I", a->from); + ip_addr a_from = ea_get_ip(a, &ea_gen_from, IPA_NONE); + if (ipa_nonzero(a_from) && (!nhad || !ipa_equal(a_from, nhad->nh.gw))) + bsprintf(from, " from %I", a_from); else from[0] = 0; /* Need to normalize the extended attributes */ - if (d->verbose && !rta_is_cached(a) && a->eattrs) - ea_normalize(a->eattrs); + if (d->verbose && !rta_is_cached(a) && a) + a = ea_normalize(a, 0); get_route_info = e->src->owner->class ? e->src->owner->class->get_route_info : NULL; if (get_route_info) get_route_info(e, info); else - bsprintf(info, " (%d)", a->pref); + bsprintf(info, " (%d)", rt_get_preference(e)); if (d->last_table != d->tab) - rt_show_table(c, d); - - cli_printf(c, -1007, "%-20s %s [%s %s%s]%s%s", ia, rta_dest_name(a->dest), - e->src->owner->name, tm, from, primary ? (sync_error ? " !" : " *") : "", info); - - if (a->dest == RTD_UNICAST) - for (nh = &(a->nh); nh; nh = nh->next) - { - char mpls[MPLS_MAX_LABEL_STACK*12 + 5], *lsp = mpls; - char *onlink = (nh->flags & RNF_ONLINK) ? " onlink" : ""; - char weight[16] = ""; - - if (nh->labels) - { - lsp += bsprintf(lsp, " mpls %d", nh->label[0]); - for (int i=1;i<nh->labels; i++) - lsp += bsprintf(lsp, "/%d", nh->label[i]); - } - *lsp = '\0'; + rt_show_table(d); - if (a->nh.next) - bsprintf(weight, " weight %d", nh->weight + 1); + eattr *heea; + struct hostentry_adata *had = NULL; + if (!net_is_flow(e->net) && (dest == RTD_NONE) && (heea = ea_find(a, &ea_gen_hostentry))) + had = (struct hostentry_adata *) heea->u.ptr; - if (ipa_nonzero(nh->gw)) - cli_printf(c, -1007, "\tvia %I on %s%s%s%s", - nh->gw, nh->iface->name, mpls, onlink, weight); - else - cli_printf(c, -1007, "\tdev %s%s%s", - nh->iface->name, mpls, onlink, weight); - } + cli_printf(c, -1007, "%-20s %s [%s %s%s]%s%s", ia, + net_is_flow(e->net) ? flowspec_valid_name(flowspec_valid) : had ? "recursive" : rta_dest_name(dest), + e->src->owner->name, tm, from, primary ? (sync_error ? " !" : " *") : "", info); if (d->verbose) { - cli_printf(c, -1008, "\tInternal route ID: %uL %uG %uS", e->src->private_id, e->src->global_id, e->stale_cycle); - rta_show(c, a); + ea_show_list(c, a); + cli_printf(c, -1008, "\tInternal route handling values: %uL %uG %uS id %u", + e->src->private_id, e->src->global_id, e->stale_cycle, e->id); + } + else if (dest == RTD_UNICAST) + ea_show_nexthop_list(c, nhad); + else if (had) + { + char hetext[256]; + ea_show_hostentry(&had->ad, hetext, sizeof hetext); + cli_printf(c, -1007, "\t%s", hetext); } -} - -static uint -rte_feed_count(net *n) -{ - uint count = 0; - for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) - count++; - return count; -} - -static void -rte_feed_obtain(net *n, rte **feed, uint count) -{ - uint i = 0; - for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) - { - ASSERT_DIE(i < count); - feed[i++] = &e->rte; - } - ASSERT_DIE(i == count); } static void -rt_show_net(struct cli *c, net *n, struct rt_show_data *d) +rt_show_net(struct rt_show_data *d, const net_addr *n, const rte **feed, uint count) { + struct cli *c = d->cli; byte ia[NET_MAX_TEXT_LENGTH+1]; struct channel *ec = d->tab->export_channel; @@ -135,13 +104,12 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) ASSUME(!d->export_mode || ec); int first = 1; + int first_show = 1; int pass = 0; - bsnprintf(ia, sizeof(ia), "%N", n->n.addr); - - for (struct rte_storage *er = n->routes; er; er = er->next) + for (uint i = 0; i < count; i++) { - if (rte_is_filtered(&er->rte) != d->filtered) + if (!d->tab->prefilter && (rte_is_filtered(feed[i]) != d->filtered)) continue; d->rt_counter++; @@ -151,7 +119,12 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) if (pass) continue; - struct rte e = er->rte; + struct rte e = *feed[i]; + if (d->tab->prefilter) + if (e.sender != d->tab->prefilter->in_req.hook) + continue; + else while (e.attrs->next) + e.attrs = e.attrs->next; /* Export channel is down, do not try to export routes to it */ if (ec && !ec->out_req.hook) @@ -169,13 +142,7 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) { /* Special case for merged export */ pass = 1; - uint count = rte_feed_count(n); - if (!count) - goto skip; - - rte **feed = alloca(count * sizeof(rte *)); - rte_feed_obtain(n, feed, count); - rte *em = rt_export_merged(ec, feed, count, c->show_pool, 1); + rte *em = rt_export_merged(ec, feed, count, tmp_linpool, 1); if (em) e = *em; @@ -201,7 +168,7 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) * command may change the export filter and do not update routes. */ int do_export = (ic > 0) || - (f_run(ec->out_filter, &e, c->show_pool, FF_SILENT) <= F_ACCEPT); + (f_run(ec->out_filter, &e, FF_SILENT) <= F_ACCEPT); if (do_export != (d->export_mode == RSEM_EXPORT)) goto skip; @@ -214,130 +181,194 @@ rt_show_net(struct cli *c, net *n, struct rt_show_data *d) if (d->show_protocol && (&d->show_protocol->sources != e.src->owner)) goto skip; - if (f_run(d->filter, &e, c->show_pool, 0) > F_ACCEPT) + if (f_run(d->filter, &e, 0) > F_ACCEPT) goto skip; if (d->stats < 2) - rt_show_rte(c, ia, &e, d, (n->routes == er)); + { + if (first_show) + net_format(n, ia, sizeof(ia)); + else + ia[0] = 0; + + rt_show_rte(c, ia, &e, d, !d->tab->prefilter && !i); + first_show = 0; + } d->show_counter++; - ia[0] = 0; skip: - lp_flush(c->show_pool); - if (d->primary_only) break; } + + if ((d->show_counter - d->show_counter_last_flush) > 64) + { + d->show_counter_last_flush = d->show_counter; + cli_write_trigger(d->cli); + } +} + +static void +rt_show_net_export_bulk(struct rt_export_request *req, const net_addr *n, + struct rt_pending_export *first UNUSED, struct rt_pending_export *last UNUSED, + const rte **feed, uint count) +{ + struct rt_show_data *d = SKIP_BACK(struct rt_show_data, req, req); + return rt_show_net(d, n, feed, count); } static void +rt_show_export_stopped_cleanup(struct rt_export_request *req) +{ + struct rt_show_data *d = SKIP_BACK(struct rt_show_data, req, req); + + /* The hook is now invalid */ + req->hook = NULL; + + /* And free the CLI (deferred) */ + rfree(d->cli->pool); +} + +static int rt_show_cleanup(struct cli *c) { struct rt_show_data *d = c->rover; - struct rt_show_data_rtable *tab; + c->cleanup = NULL; - /* Unlink the iterator */ - if (d->table_open) - RT_LOCKED(d->tab->table, t) - fit_get(&t->fib, &d->fit); + /* Cancel the feed */ + if (d->req.hook) + { + rt_stop_export(&d->req, rt_show_export_stopped_cleanup); + return 1; + } + else + return 0; +} + +static void rt_show_export_stopped(struct rt_export_request *req); - /* Unlock referenced tables */ - WALK_LIST(tab, d->tables) - RT_LOCKED(tab->table, t) - rt_unlock_table(t); +static void +rt_show_log_state_change(struct rt_export_request *req, u8 state) +{ + if (state == TES_READY) + rt_stop_export(req, rt_show_export_stopped); } static void -rt_show_cont(struct cli *c) +rt_show_dump_req(struct rt_export_request *req) { - struct rt_show_data *d = c->rover; -#ifdef DEBUGGING - unsigned max = 4; -#else - unsigned max = 64; -#endif + debug(" CLI Show Route Feed %p\n", req); +} + +static void +rt_show_done(struct rt_show_data *d) +{ + /* No more action */ + d->cli->cleanup = NULL; + d->cli->cont = NULL; + d->cli->rover = NULL; + + /* Write pending messages */ + cli_write_trigger(d->cli); +} + +static void +rt_show_cont(struct rt_show_data *d) +{ + struct cli *c = d->cli; if (d->running_on_config && (d->running_on_config != config)) { cli_printf(c, 8004, "Stopped due to reconfiguration"); - goto done; + return rt_show_done(d); } - rtable_private *t = RT_LOCK(d->tab->table); + d->req = (struct rt_export_request) { + .addr = d->addr, + .name = "CLI Show Route", + .list = &global_work_list, + .export_bulk = rt_show_net_export_bulk, + .dump_req = rt_show_dump_req, + .log_state_change = rt_show_log_state_change, + .addr_mode = d->addr_mode, + }; - struct fib *fib = &t->fib; - struct fib_iterator *it = &d->fit; + d->table_counter++; - if (!d->table_open) - { - FIB_ITERATE_INIT(&d->fit, fib); - d->table_open = 1; - d->table_counter++; - d->kernel = rt_show_get_kernel(d); + d->show_counter_last = d->show_counter; + d->rt_counter_last = d->rt_counter; + d->net_counter_last = d->net_counter; - d->show_counter_last = d->show_counter; - d->rt_counter_last = d->rt_counter; - d->net_counter_last = d->net_counter; + if (d->tables_defined_by & RSD_TDB_SET) + rt_show_table(d); - if (d->tables_defined_by & RSD_TDB_SET) - rt_show_table(c, d); - } + rt_request_export_other(d->tab->table, &d->req); +} - FIB_ITERATE_START(fib, it, net, n) - { - if (!max--) - { - FIB_ITERATE_PUT(it); - RT_UNLOCK(d->tab->table); - return; - } - rt_show_net(c, n, d); - } - FIB_ITERATE_END; +static void +rt_show_export_stopped(struct rt_export_request *req) +{ + struct rt_show_data *d = SKIP_BACK(struct rt_show_data, req, req); + + /* The hook is now invalid */ + req->hook = NULL; if (d->stats) { if (d->last_table != d->tab) - rt_show_table(c, d); + rt_show_table(d); - cli_printf(c, -1007, "%d of %d routes for %d networks in table %s", + cli_printf(d->cli, -1007, "%d of %d routes for %d networks in table %s", d->show_counter - d->show_counter_last, d->rt_counter - d->rt_counter_last, - d->net_counter - d->net_counter_last, d->tab->table->name); + d->net_counter - d->net_counter_last, d->tab->name); } - RT_UNLOCK(d->tab->table); - - d->kernel = NULL; - d->table_open = 0; d->tab = NODE_NEXT(d->tab); if (NODE_VALID(d->tab)) - return; + return rt_show_cont(d); + /* Printout total stats */ if (d->stats && (d->table_counter > 1)) { - if (d->last_table) cli_printf(c, -1007, ""); - cli_printf(c, 14, "Total: %d of %d routes for %d networks in %d tables", + if (d->last_table) cli_printf(d->cli, -1007, ""); + cli_printf(d->cli, 14, "Total: %d of %d routes for %d networks in %d tables", d->show_counter, d->rt_counter, d->net_counter, d->table_counter); } + else if (!d->rt_counter && ((d->addr_mode == TE_ADDR_EQUAL) || (d->addr_mode == TE_ADDR_FOR))) + cli_printf(d->cli, 8001, "Network not found"); else - cli_printf(c, 0, ""); + cli_printf(d->cli, 0, ""); -done: - rt_show_cleanup(c); - c->cont = c->cleanup = NULL; + /* No more route showing */ + rt_show_done(d); } struct rt_show_data_rtable * -rt_show_add_table(struct rt_show_data *d, rtable *t) +rt_show_add_exporter(struct rt_show_data *d, struct rt_exporter *t, const char *name) { struct rt_show_data_rtable *tab = cfg_allocz(sizeof(struct rt_show_data_rtable)); tab->table = t; + tab->name = name; add_tail(&(d->tables), &(tab->n)); return tab; } +struct rt_show_data_rtable * +rt_show_add_table(struct rt_show_data *d, rtable *t) +{ + struct rt_show_data_rtable *rsdr; + RT_LOCKED(t, tp) + rsdr = rt_show_add_exporter(d, &tp->exporter.e, t->name); + + struct proto_config *krt = t->config->krt_attached; + if (krt) + rsdr->kernel = (struct krt_proto *) krt->proto; + + return rsdr; +} + static inline void rt_show_get_default_tables(struct rt_show_data *d) { @@ -373,8 +404,8 @@ rt_show_get_default_tables(struct rt_show_data *d) } for (int i=1; i<NET_MAX; i++) - if (config->def_tables[i] && config->def_tables[i]->table) - rt_show_add_table(d, config->def_tables[i]->table); + if (config->def_tables[i] && config->def_tables[i]->table && config->def_tables[i]->table->table) + rt_show_add_table(d, config->def_tables[i]->table->table); } static inline void @@ -391,17 +422,18 @@ rt_show_prepare_tables(struct rt_show_data *d) /* Ensure there is defined export_channel for each table */ if (d->export_mode) { + rtable *rt = SKIP_BACK(rtable, priv.exporter.e, tab->table); if (!tab->export_channel && d->export_channel && - (tab->table == d->export_channel->table)) + (rt == d->export_channel->table)) tab->export_channel = d->export_channel; if (!tab->export_channel && d->export_protocol) - tab->export_channel = proto_find_channel_by_table(d->export_protocol, tab->table); + tab->export_channel = proto_find_channel_by_table(d->export_protocol, rt); if (!tab->export_channel) { if (d->tables_defined_by & RSD_TDB_NMN) - cf_error("No export channel for table %s", tab->table->name); + cf_error("No export channel for table %s", tab->name); rem_node(&(tab->n)); continue; @@ -412,7 +444,7 @@ rt_show_prepare_tables(struct rt_show_data *d) if (d->addr && (tab->table->addr_type != d->addr->type)) { if (d->tables_defined_by & RSD_TDB_NMN) - cf_error("Incompatible type of prefix/ip for table %s", tab->table->name); + cf_error("Incompatible type of prefix/ip for table %s", tab->name); rem_node(&(tab->n)); continue; @@ -424,53 +456,29 @@ rt_show_prepare_tables(struct rt_show_data *d) cf_error("No valid tables"); } +static void +rt_show_dummy_cont(struct cli *c UNUSED) +{ + /* Explicitly do nothing to prevent CLI from trying to parse another command. */ +} + void rt_show(struct rt_show_data *d) { - struct rt_show_data_rtable *tab; - net *n; - /* Filtered routes are neither exported nor have sensible ordering */ if (d->filtered && (d->export_mode || d->primary_only)) cf_error("Incompatible show route options"); rt_show_prepare_tables(d); - if (!d->addr) - { - WALK_LIST(tab, d->tables) - RT_LOCKED(tab->table, t) - rt_lock_table(t); - - /* There is at least one table */ - d->tab = HEAD(d->tables); - this_cli->cont = rt_show_cont; - this_cli->cleanup = rt_show_cleanup; - this_cli->rover = d; - } - else - { - WALK_LIST(tab, d->tables) - { - d->tab = tab; - d->kernel = rt_show_get_kernel(d); - - RT_LOCK(tab->table); + if (EMPTY_LIST(d->tables)) + cf_error("No suitable tables found"); - if (d->show_for) - n = net_route(RT_PRIV(tab->table), d->addr); - else - n = net_find(RT_PRIV(tab->table), d->addr); + d->tab = HEAD(d->tables); - if (n) - rt_show_net(this_cli, n, d); + this_cli->cleanup = rt_show_cleanup; + this_cli->rover = d; + this_cli->cont = rt_show_dummy_cont; - RT_UNLOCK(tab->table); - } - - if (d->rt_counter) - cli_msg(0, ""); - else - cli_msg(8001, "Network not found"); - } + rt_show_cont(d); } diff --git a/nest/rt-table.c b/nest/rt-table.c index 5f1e1679..b18727b1 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -21,17 +21,79 @@ * on the list being the best one (i.e., the one we currently use * for routing), the order of the other ones is undetermined. * - * The &rte contains information specific to the route (preference, protocol - * metrics, time of last modification etc.) and a pointer to a &rta structure - * (see the route attribute module for a precise explanation) holding the - * remaining route attributes which are expected to be shared by multiple - * routes in order to conserve memory. + * The &rte contains information about the route. There are net and src, which + * together forms a key identifying the route in a routing table. There is a + * pointer to a &rta structure (see the route attribute module for a precise + * explanation) holding the route attributes, which are primary data about the + * route. There are several technical fields used by routing table code (route + * id, REF_* flags), There is also the pflags field, holding protocol-specific + * flags. They are not used by routing table code, but by protocol-specific + * hooks. In contrast to route attributes, they are not primary data and their + * validity is also limited to the routing table. + * + * There are several mechanisms that allow automatic update of routes in one + * routing table (dst) as a result of changes in another routing table (src). + * They handle issues of recursive next hop resolving, flowspec validation and + * RPKI validation. + * + * The first such mechanism is handling of recursive next hops. A route in the + * dst table has an indirect next hop address, which is resolved through a route + * in the src table (which may also be the same table) to get an immediate next + * hop. This is implemented using structure &hostcache attached to the src + * table, which contains &hostentry structures for each tracked next hop + * address. These structures are linked from recursive routes in dst tables, + * possibly multiple routes sharing one hostentry (as many routes may have the + * same indirect next hop). There is also a trie in the hostcache, which matches + * all prefixes that may influence resolving of tracked next hops. + * + * When a best route changes in the src table, the hostcache is notified using + * an auxiliary export request, which checks using the trie whether the + * change is relevant and if it is, then it schedules asynchronous hostcache + * recomputation. The recomputation is done by rt_update_hostcache() (called + * as an event of src table), it walks through all hostentries and resolves + * them (by rt_update_hostentry()). It also updates the trie. If a change in + * hostentry resolution was found, then it schedules asynchronous nexthop + * recomputation of associated dst table. That is done by rt_next_hop_update() + * (called from rt_event() of dst table), it iterates over all routes in the dst + * table and re-examines their hostentries for changes. Note that in contrast to + * hostcache update, next hop update can be interrupted by main loop. These two + * full-table walks (over hostcache and dst table) are necessary due to absence + * of direct lookups (route -> affected nexthop, nexthop -> its route). + * + * The second mechanism is for flowspec validation, where validity of flowspec + * routes depends of resolving their network prefixes in IP routing tables. This + * is similar to the recursive next hop mechanism, but simpler as there are no + * intermediate hostcache and hostentries (because flows are less likely to + * share common net prefix than routes sharing a common next hop). Every dst + * table has its own export request in every src table. Each dst table has its + * own trie of prefixes that may influence validation of flowspec routes in it + * (flowspec_trie). + * + * When a best route changes in the src table, the notification mechanism is + * invoked by the export request which checks its dst table's trie to see + * whether the change is relevant, and if so, an asynchronous re-validation of + * flowspec routes in the dst table is scheduled. That is also done by function + * rt_next_hop_update(), like nexthop recomputation above. It iterates over all + * flowspec routes and re-validates them. It also recalculates the trie. + * + * Note that in contrast to the hostcache update, here the trie is recalculated + * during the rt_next_hop_update(), which may be interleaved with IP route + * updates. The trie is flushed at the beginning of recalculation, which means + * that such updates may use partial trie to see if they are relevant. But it + * works anyway! Either affected flowspec was already re-validated and added to + * the trie, then IP route change would match the trie and trigger a next round + * of re-validation, or it was not yet re-validated and added to the trie, but + * will be re-validated later in this round anyway. + * + * The third mechanism is used for RPKI re-validation of IP routes and it is the + * simplest. It is also an auxiliary export request belonging to the + * appropriate channel, triggering its reload/refeed timer after a settle time. */ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "lib/resource.h" @@ -44,12 +106,21 @@ #include "lib/hash.h" #include "lib/string.h" #include "lib/alloca.h" +#include "lib/flowspec.h" +#include "lib/idm.h" + +#ifdef CONFIG_BGP +#include "proto/bgp/bgp.h" +#endif #include <stdatomic.h> pool *rt_table_pool; list routing_tables; +list deleted_routing_tables; + +struct rt_cork rt_cork; /* Data structures for export journal */ #define RT_PENDING_EXPORT_ITEMS (page_size - sizeof(struct rt_export_block)) / sizeof(struct rt_pending_export) @@ -61,17 +132,28 @@ struct rt_export_block { struct rt_pending_export export[]; }; -static void rt_free_hostcache(rtable_private *tab); -static void rt_notify_hostcache(rtable_private *tab, net *net); +static void rt_free_hostcache(struct rtable_private *tab); static void rt_update_hostcache(void *tab); -static void rt_next_hop_update(void *tab); -static inline void rt_prune_table(void *tab); -static void rt_fast_prune_check(rtable_private *tab); -static inline void rt_schedule_notify(rtable_private *tab); -static void rt_feed_channel(void *); - -static inline void rt_export_used(rtable_private *tab); -static void rt_export_cleanup(void *tab); +static void rt_next_hop_update(struct rtable_private *tab); +static void rt_nhu_uncork(void *_tab); +static inline void rt_next_hop_resolve_rte(rte *r); +static inline void rt_flowspec_resolve_rte(rte *r, struct channel *c); +static inline void rt_prune_table(struct rtable_private *tab); +static void rt_kick_prune_timer(struct rtable_private *tab); +static void rt_feed_by_fib(void *); +static void rt_feed_by_trie(void *); +static void rt_feed_equal(void *); +static void rt_feed_for(void *); +static void rt_check_cork_low(struct rtable_private *tab); +static void rt_check_cork_high(struct rtable_private *tab); +static void rt_cork_release_hook(void *); +static void rt_shutdown(void *); +static void rt_delete(void *); + +static void rt_export_used(struct rt_table_exporter *, const char *, const char *); +static void rt_export_cleanup(struct rtable_private *tab); + +static int rte_same(rte *x, rte *y); const char *rt_import_state_name_array[TIS_MAX] = { [TIS_DOWN] = "DOWN", @@ -106,54 +188,193 @@ const char *rt_export_state_name(u8 state) return rt_export_state_name_array[state]; } -struct event_cork rt_cork; +static struct hostentry *rt_get_hostentry(struct rtable_private *tab, ip_addr a, ip_addr ll, rtable *dep); -static inline void -rte_update_lock(struct channel *c) +static inline rtable *rt_priv_to_pub(struct rtable_private *tab) { return RT_PUB(tab); } +static inline rtable *rt_pub_to_pub(rtable *tab) { return tab; } +#define RT_ANY_TO_PUB(tab) _Generic((tab),rtable*:rt_pub_to_pub,struct rtable_private*:rt_priv_to_pub)((tab)) + +#define rt_trace(tab, level, fmt, args...) do {\ + rtable *t = RT_ANY_TO_PUB((tab)); \ + if (t->config->debug & (level)) \ + log(L_TRACE "%s: " fmt, t->name, ##args); \ +} while (0) + +static void +net_init_with_trie(struct fib *f, void *N) { - c->rte_update_nest_cnt++; + struct rtable_private *tab = SKIP_BACK(struct rtable_private, fib, f); + net *n = N; + + if (tab->trie) + trie_add_prefix(tab->trie, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen); + + if (tab->trie_new) + trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen); } -static inline void -rte_update_unlock(struct channel *c) +static inline net * +net_route_ip4_trie(struct rtable_private *t, const net_addr_ip4 *n0) +{ + TRIE_WALK_TO_ROOT_IP4(t->trie, n0, n) + { + net *r; + if (r = net_find_valid(t, (net_addr *) &n)) + return r; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; +} + +static inline net * +net_route_vpn4_trie(struct rtable_private *t, const net_addr_vpn4 *n0) { - if (!--c->rte_update_nest_cnt) - lp_flush(c->rte_update_pool); + TRIE_WALK_TO_ROOT_IP4(t->trie, (const net_addr_ip4 *) n0, px) + { + net_addr_vpn4 n = NET_ADDR_VPN4(px.prefix, px.pxlen, n0->rd); + + net *r; + if (r = net_find_valid(t, (net_addr *) &n)) + return r; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; +} + +static inline net * +net_route_ip6_trie(struct rtable_private *t, const net_addr_ip6 *n0) +{ + TRIE_WALK_TO_ROOT_IP6(t->trie, n0, n) + { + net *r; + if (r = net_find_valid(t, (net_addr *) &n)) + return r; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; +} + +static inline net * +net_route_vpn6_trie(struct rtable_private *t, const net_addr_vpn6 *n0) +{ + TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px) + { + net_addr_vpn6 n = NET_ADDR_VPN6(px.prefix, px.pxlen, n0->rd); + + net *r; + if (r = net_find_valid(t, (net_addr *) &n)) + return r; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; } -/* Like fib_route(), but skips empty net entries */ static inline void * -net_route_ip4(rtable_private *t, net_addr_ip4 *n) +net_route_ip6_sadr_trie(struct rtable_private *t, const net_addr_ip6_sadr *n0) { + TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px) + { + net_addr_ip6_sadr n = NET_ADDR_IP6_SADR(px.prefix, px.pxlen, n0->src_prefix, n0->src_pxlen); + net *best = NULL; + int best_pxlen = 0; + + /* We need to do dst first matching. Since sadr addresses are hashed on dst + prefix only, find the hash table chain and go through it to find the + match with the longest matching src prefix. */ + for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next) + { + net_addr_ip6_sadr *a = (void *) fn->addr; + + if (net_equal_dst_ip6_sadr(&n, a) && + net_in_net_src_ip6_sadr(&n, a) && + (a->src_pxlen >= best_pxlen)) + { + best = fib_node_to_user(&t->fib, fn); + best_pxlen = a->src_pxlen; + } + } + + if (best) + return best; + } + TRIE_WALK_TO_ROOT_END; + + return NULL; +} + +static inline net * +net_route_ip4_fib(struct rtable_private *t, const net_addr_ip4 *n0) +{ + net_addr_ip4 n; + net_copy_ip4(&n, n0); + net *r; + while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0)) + { + n.pxlen--; + ip4_clrbit(&n.prefix, n.pxlen); + } - while (r = net_find_valid(t, (net_addr *) n), (!r) && (n->pxlen > 0)) + return r; +} + +static inline net * +net_route_vpn4_fib(struct rtable_private *t, const net_addr_vpn4 *n0) +{ + net_addr_vpn4 n; + net_copy_vpn4(&n, n0); + + net *r; + while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0)) { - n->pxlen--; - ip4_clrbit(&n->prefix, n->pxlen); + n.pxlen--; + ip4_clrbit(&n.prefix, n.pxlen); } return r; } -static inline void * -net_route_ip6(rtable_private *t, net_addr_ip6 *n) +static inline net * +net_route_ip6_fib(struct rtable_private *t, const net_addr_ip6 *n0) { + net_addr_ip6 n; + net_copy_ip6(&n, n0); + net *r; + while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0)) + { + n.pxlen--; + ip6_clrbit(&n.prefix, n.pxlen); + } - while (r = net_find_valid(t, (net_addr *) n), (!r) && (n->pxlen > 0)) + return r; +} + +static inline net * +net_route_vpn6_fib(struct rtable_private *t, const net_addr_vpn6 *n0) +{ + net_addr_vpn6 n; + net_copy_vpn6(&n, n0); + + net *r; + while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0)) { - n->pxlen--; - ip6_clrbit(&n->prefix, n->pxlen); + n.pxlen--; + ip6_clrbit(&n.prefix, n.pxlen); } return r; } static inline void * -net_route_ip6_sadr(rtable_private *t, net_addr_ip6_sadr *n) +net_route_ip6_sadr_fib(struct rtable_private *t, const net_addr_ip6_sadr *n0) { - struct fib_node *fn; + net_addr_ip6_sadr n; + net_copy_ip6_sadr(&n, n0); while (1) { @@ -162,13 +383,13 @@ net_route_ip6_sadr(rtable_private *t, net_addr_ip6_sadr *n) /* We need to do dst first matching. Since sadr addresses are hashed on dst prefix only, find the hash table chain and go through it to find the - match with the smallest matching src prefix. */ - for (fn = fib_get_chain(&t->fib, (net_addr *) n); fn; fn = fn->next) + match with the longest matching src prefix. */ + for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next) { net_addr_ip6_sadr *a = (void *) fn->addr; - if (net_equal_dst_ip6_sadr(n, a) && - net_in_net_src_ip6_sadr(n, a) && + if (net_equal_dst_ip6_sadr(&n, a) && + net_in_net_src_ip6_sadr(&n, a) && (a->src_pxlen >= best_pxlen)) { best = fib_node_to_user(&t->fib, fn); @@ -179,38 +400,52 @@ net_route_ip6_sadr(rtable_private *t, net_addr_ip6_sadr *n) if (best) return best; - if (!n->dst_pxlen) + if (!n.dst_pxlen) break; - n->dst_pxlen--; - ip6_clrbit(&n->dst_prefix, n->dst_pxlen); + n.dst_pxlen--; + ip6_clrbit(&n.dst_prefix, n.dst_pxlen); } return NULL; } -void * -net_route(rtable_private *tab, const net_addr *n) +net * +net_route(struct rtable_private *tab, const net_addr *n) { ASSERT(tab->addr_type == n->type); - net_addr *n0 = alloca(n->length); - net_copy(n0, n); - switch (n->type) { case NET_IP4: + if (tab->trie) + return net_route_ip4_trie(tab, (net_addr_ip4 *) n); + else + return net_route_ip4_fib (tab, (net_addr_ip4 *) n); + case NET_VPN4: - case NET_ROA4: - return net_route_ip4(tab, (net_addr_ip4 *) n0); + if (tab->trie) + return net_route_vpn4_trie(tab, (net_addr_vpn4 *) n); + else + return net_route_vpn4_fib (tab, (net_addr_vpn4 *) n); case NET_IP6: + if (tab->trie) + return net_route_ip6_trie(tab, (net_addr_ip6 *) n); + else + return net_route_ip6_fib (tab, (net_addr_ip6 *) n); + case NET_VPN6: - case NET_ROA6: - return net_route_ip6(tab, (net_addr_ip6 *) n0); + if (tab->trie) + return net_route_vpn6_trie(tab, (net_addr_vpn6 *) n); + else + return net_route_vpn6_fib (tab, (net_addr_vpn6 *) n); case NET_IP6_SADR: - return net_route_ip6_sadr(tab, (net_addr_ip6_sadr *) n0); + if (tab->trie) + return net_route_ip6_sadr_trie(tab, (net_addr_ip6_sadr *) n); + else + return net_route_ip6_sadr_fib (tab, (net_addr_ip6_sadr *) n); default: return NULL; @@ -219,15 +454,40 @@ net_route(rtable_private *tab, const net_addr *n) static int -net_roa_check_ip4(rtable *t, const net_addr_ip4 *px, u32 asn) +net_roa_check_ip4_trie(struct rtable_private *tab, const net_addr_ip4 *px, u32 asn) +{ + int anything = 0; + + TRIE_WALK_TO_ROOT_IP4(tab->trie, px, px0) + { + net_addr_roa4 roa0 = NET_ADDR_ROA4(px0.prefix, px0.pxlen, 0, 0); + + struct fib_node *fn; + for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next) + { + net_addr_roa4 *roa = (void *) fn->addr; + net *r = fib_node_to_user(&tab->fib, fn); + + if (net_equal_prefix_roa4(roa, &roa0) && r->routes && rte_is_valid(&r->routes->rte)) + { + anything = 1; + if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) + return ROA_VALID; + } + } + } + TRIE_WALK_TO_ROOT_END; + + return anything ? ROA_INVALID : ROA_UNKNOWN; +} + +static int +net_roa_check_ip4_fib(struct rtable_private *tab, const net_addr_ip4 *px, u32 asn) { struct net_addr_roa4 n = NET_ADDR_ROA4(px->prefix, px->pxlen, 0, 0); struct fib_node *fn; int anything = 0; - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); - while (1) { for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next) @@ -239,10 +499,7 @@ net_roa_check_ip4(rtable *t, const net_addr_ip4 *px, u32 asn) { anything = 1; if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) - { - RT_UNLOCK(tab); return ROA_VALID; - } } } @@ -253,20 +510,44 @@ net_roa_check_ip4(rtable *t, const net_addr_ip4 *px, u32 asn) ip4_clrbit(&n.prefix, n.pxlen); } - RT_UNLOCK(tab); return anything ? ROA_INVALID : ROA_UNKNOWN; } static int -net_roa_check_ip6(rtable *t, const net_addr_ip6 *px, u32 asn) +net_roa_check_ip6_trie(struct rtable_private *tab, const net_addr_ip6 *px, u32 asn) +{ + int anything = 0; + + TRIE_WALK_TO_ROOT_IP6(tab->trie, px, px0) + { + net_addr_roa6 roa0 = NET_ADDR_ROA6(px0.prefix, px0.pxlen, 0, 0); + + struct fib_node *fn; + for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next) + { + net_addr_roa6 *roa = (void *) fn->addr; + net *r = fib_node_to_user(&tab->fib, fn); + + if (net_equal_prefix_roa6(roa, &roa0) && r->routes && rte_is_valid(&r->routes->rte)) + { + anything = 1; + if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) + return ROA_VALID; + } + } + } + TRIE_WALK_TO_ROOT_END; + + return anything ? ROA_INVALID : ROA_UNKNOWN; +} + +static int +net_roa_check_ip6_fib(struct rtable_private *tab, const net_addr_ip6 *px, u32 asn) { struct net_addr_roa6 n = NET_ADDR_ROA6(px->prefix, px->pxlen, 0, 0); struct fib_node *fn; int anything = 0; - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); - while (1) { for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next) @@ -278,10 +559,7 @@ net_roa_check_ip6(rtable *t, const net_addr_ip6 *px, u32 asn) { anything = 1; if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen)) - { - RT_UNLOCK(tab); return ROA_VALID; - } } } @@ -292,7 +570,6 @@ net_roa_check_ip6(rtable *t, const net_addr_ip6 *px, u32 asn) ip6_clrbit(&n.prefix, n.pxlen); } - RT_UNLOCK(tab); return anything ? ROA_INVALID : ROA_UNKNOWN; } @@ -312,14 +589,30 @@ net_roa_check_ip6(rtable *t, const net_addr_ip6 *px, u32 asn) * must have type NET_IP4 or NET_IP6, respectively. */ int -net_roa_check(rtable *tab, const net_addr *n, u32 asn) +net_roa_check(rtable *tp, const net_addr *n, u32 asn) { - if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4)) - return net_roa_check_ip4(tab, (const net_addr_ip4 *) n, asn); - else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6)) - return net_roa_check_ip6(tab, (const net_addr_ip6 *) n, asn); - else - return ROA_UNKNOWN; /* Should not happen */ + int out = ROA_UNKNOWN; + + RT_LOCKED(tp, tab) + { + if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4)) + { + if (tab->trie) + out = net_roa_check_ip4_trie(tab, (const net_addr_ip4 *) n, asn); + else + out = net_roa_check_ip4_fib (tab, (const net_addr_ip4 *) n, asn); + } + else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6)) + { + if (tab->trie) + out = net_roa_check_ip6_trie(tab, (const net_addr_ip6 *) n, asn); + else + out = net_roa_check_ip6_fib (tab, (const net_addr_ip6 *) n, asn); + } + else + out = ROA_UNKNOWN; /* Should not happen */ + } + return out; } /** @@ -342,8 +635,8 @@ rte_find(net *net, struct rte_src *src) } -static struct rte_storage * -rte_store(const rte *r, net *net, rtable_private *tab) +struct rte_storage * +rte_store(const rte *r, net *net, struct rtable_private *tab) { struct rte_storage *e = sl_alloc(tab->rte_slab); @@ -352,6 +645,11 @@ rte_store(const rte *r, net *net, rtable_private *tab) rt_lock_source(e->rte.src); + if (ea_is_cached(e->rte.attrs)) + e->rte.attrs = rta_clone(e->rte.attrs); + else + e->rte.attrs = rta_lookup(e->rte.attrs, 1); + return e; } @@ -364,26 +662,29 @@ rte_store(const rte *r, net *net, rtable_private *tab) */ void -rte_free(struct rte_storage *e, rtable_private *tab) +rte_free(struct rte_storage *e) { rt_unlock_source(e->rte.src); rta_free(e->rte.attrs); - sl_free(tab->rte_slab, e); + sl_free(e); } static int /* Actually better or at least as good as */ -rte_better(rte *new, rte *old) +rte_better(const rte *new, const rte *old) { - int (*better)(rte *, rte *); + int (*better)(const rte *, const rte *); if (!rte_is_valid(old)) return 1; if (!rte_is_valid(new)) return 0; - if (new->attrs->pref > old->attrs->pref) + u32 np = rt_get_preference(new); + u32 op = rt_get_preference(old); + + if (np > op) return 1; - if (new->attrs->pref < old->attrs->pref) + if (np < op) return 0; if (new->src->owner->class != old->src->owner->class) { @@ -400,14 +701,14 @@ rte_better(rte *new, rte *old) } static int -rte_mergable(rte *pri, rte *sec) +rte_mergable(const rte *pri, const rte *sec) { - int (*mergable)(rte *, rte *); + int (*mergable)(const rte *, const rte *); if (!rte_is_valid(pri) || !rte_is_valid(sec)) return 0; - if (pri->attrs->pref != sec->attrs->pref) + if (rt_get_preference(pri) != rt_get_preference(sec)) return 0; if (pri->src->owner->class != sec->src->owner->class) @@ -422,11 +723,10 @@ rte_mergable(rte *pri, rte *sec) static void rte_trace(const char *name, const rte *e, int dir, const char *msg) { - log(L_TRACE "%s %c %s %N src %uL %uG %uS id %u %s%s", + log(L_TRACE "%s %c %s %N src %uL %uG %uS id %u %s", name, dir, msg, e->net, e->src->private_id, e->src->global_id, e->stale_cycle, e->id, - rta_dest_name(e->attrs->dest), - rte_is_filtered(e) ? " (filtered)" : ""); + rta_dest_name(rte_dest(e))); } static inline void @@ -465,26 +765,26 @@ rte_feed_count(net *n) { uint count = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) - count++; + count++; + return count; } static void -rte_feed_obtain(net *n, struct rte **feed, uint count) +rte_feed_obtain(net *n, const rte **feed, uint count) { uint i = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) { ASSERT_DIE(i < count); feed[i++] = &e->rte; } + ASSERT_DIE(i == count); } static rte * -export_filter_(struct channel *c, rte *rt, linpool *pool, int silent) +export_filter(struct channel *c, rte *rt, int silent) { struct proto *p = c->proto; const struct filter *filter = c->out_filter; @@ -514,7 +814,7 @@ export_filter_(struct channel *c, rte *rt, linpool *pool, int silent) } v = filter && ((filter == FILTER_REJECT) || - (f_run(filter, rt, pool, + (f_run(filter, rt, (silent ? FF_SILENT : 0)) > F_ACCEPT)); if (v) { @@ -540,17 +840,10 @@ reject_noset: return NULL; } -static inline rte * -export_filter(struct channel *c, rte *rt, int silent) -{ - return export_filter_(c, rt, c->rte_update_pool, silent); -} - -void do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte *old); - static void do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) { + struct proto *p = c->proto; struct channel_export_stats *stats = &c->export_stats; if (c->refeeding && new) @@ -567,30 +860,16 @@ do_rt_notify(struct channel *c, const net_addr *net, rte *new, const rte *old) if (!new && old) CHANNEL_LIMIT_POP(c, OUT); - /* Store route export state */ - if (old) - bmap_clear(&c->export_map, old->id); - if (new) - bmap_set(&c->export_map, new->id); - - /* Apply export table */ - if (c->out_table) - rte_import(&c->out_table->push, net, new, old ? old->src : new->src); + stats->updates_accepted++; else - do_rt_notify_direct(c, net, new, old); -} + stats->withdraws_accepted++; -void -do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte *old) -{ - struct proto *p = c->proto; - struct channel_export_stats *stats = &c->export_stats; + if (old) + bmap_clear(&c->export_map, old->id); if (new) - stats->updates_accepted++; - else - stats->withdraws_accepted++; + bmap_set(&c->export_map, new->id); if (p->debug & D_ROUTES) { @@ -608,6 +887,16 @@ do_rt_notify_direct(struct channel *c, const net_addr *net, rte *new, const rte static void rt_notify_basic(struct channel *c, const net_addr *net, rte *new, rte *old) { + if (new && old && rte_same(new, old)) + { + if ((new->id != old->id) && bmap_test(&c->export_map, old->id)) + { + bmap_set(&c->export_map, new->id); + bmap_clear(&c->export_map, old->id); + } + return; + } + if (new) new = export_filter(c, new, 0); @@ -631,13 +920,12 @@ channel_rpe_mark_seen(struct rt_export_request *req, struct rt_pending_export *r } void -rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe, - struct rte **feed, uint count) +rt_notify_accepted(struct rt_export_request *req, const net_addr *n, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); - rte nb0, *new_best = NULL; const rte *old_best = NULL; @@ -677,7 +965,7 @@ rt_notify_accepted(struct rt_export_request *req, const net_addr *n, struct rt_p done: /* Check obsolete routes for previously exported */ - while (rpe) + RPE_WALK(first, rpe, NULL) { channel_rpe_mark_seen(req, rpe); if (rpe->old) @@ -688,7 +976,8 @@ done: old_best = &rpe->old->rte; } } - rpe = rpe_next(rpe, NULL); + if (rpe == last) + break; } /* Nothing to export */ @@ -696,25 +985,16 @@ done: do_rt_notify(c, n, new_best, old_best); else DBG("rt_notify_accepted: nothing to export\n"); - - rte_update_unlock(c); -} - - -static struct nexthop * -nexthop_merge_rta(struct nexthop *nhs, rta *a, linpool *pool, int max) -{ - return nexthop_merge(nhs, &(a->nh), 1, 0, max, pool); } rte * -rt_export_merged(struct channel *c, struct rte **feed, uint count, linpool *pool, int silent) +rt_export_merged(struct channel *c, const rte **feed, uint count, linpool *pool, int silent) { _Thread_local static rte rloc; // struct proto *p = c->proto; - struct nexthop *nhs = NULL; - rte *best0 = feed[0]; + struct nexthop_adata *nhs = NULL; + const rte *best0 = feed[0]; rte *best = NULL; if (!rte_is_valid(best0)) @@ -725,7 +1005,7 @@ rt_export_merged(struct channel *c, struct rte **feed, uint count, linpool *pool return NULL; rloc = *best0; - best = export_filter_(c, &rloc, pool, silent); + best = export_filter(c, &rloc, silent); if (!best) /* Best route doesn't pass the filter */ @@ -741,35 +1021,41 @@ rt_export_merged(struct channel *c, struct rte **feed, uint count, linpool *pool continue; rte tmp0 = *feed[i]; - rte *tmp = export_filter_(c, &tmp0, pool, 1); + rte *tmp = export_filter(c, &tmp0, 1); if (!tmp || !rte_is_reachable(tmp)) continue; - nhs = nexthop_merge_rta(nhs, tmp->attrs, pool, c->merge_limit); + eattr *nhea = ea_find(tmp->attrs, &ea_gen_nexthop); + ASSERT_DIE(nhea); + + if (nhs) + nhs = nexthop_merge(nhs, (struct nexthop_adata *) nhea->u.ptr, c->merge_limit, pool); + else + nhs = (struct nexthop_adata *) nhea->u.ptr; } if (nhs) { - nhs = nexthop_merge_rta(nhs, best->attrs, pool, c->merge_limit); + eattr *nhea = ea_find(best->attrs, &ea_gen_nexthop); + ASSERT_DIE(nhea); - if (nhs->next) - { - best->attrs = rta_cow(best->attrs, pool); - nexthop_link(best->attrs, nhs); - } + nhs = nexthop_merge(nhs, (struct nexthop_adata *) nhea->u.ptr, c->merge_limit, pool); + + ea_set_attr(&best->attrs, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhs->ad)); } return best; } void -rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe, - struct rte **feed, uint count) +rt_notify_merged(struct rt_export_request *req, const net_addr *n, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); // struct proto *p = c->proto; #if 0 /* TODO: Find whether this check is possible when processing multiple changes at once. */ @@ -781,7 +1067,7 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen return; #endif - rte *old_best = NULL; + const rte *old_best = NULL; /* Find old best route */ for (uint i = 0; i < count; i++) if (bmap_test(&c->export_map, feed[i]->id)) @@ -791,7 +1077,7 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen } /* Check obsolete routes for previously exported */ - while (rpe) + RPE_WALK(first, rpe, NULL) { channel_rpe_mark_seen(req, rpe); if (rpe->old) @@ -802,80 +1088,83 @@ rt_notify_merged(struct rt_export_request *req, const net_addr *n, struct rt_pen old_best = &rpe->old->rte; } } - rpe = rpe_next(rpe, NULL); + if (rpe == last) + break; } /* Prepare new merged route */ - rte *new_merged = count ? rt_export_merged(c, feed, count, c->rte_update_pool, 0) : NULL; + rte *new_merged = count ? rt_export_merged(c, feed, count, tmp_linpool, 0) : NULL; if (new_merged || old_best) do_rt_notify(c, n, new_merged, old_best); - - rte_update_unlock(c); } void -rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); - rte *old = RTES_OR_NULL(rpe->old_best); - struct rte_storage *new_best = rpe->new_best; + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; - while (rpe) + RPE_WALK(first, rpe, NULL) { channel_rpe_mark_seen(req, rpe); new_best = rpe->new_best; - rpe = rpe_next(rpe, NULL); } - if (&new_best->rte != old) - { - rte n0, *new = RTES_CLONE(new_best, &n0); - rt_notify_basic(c, net, new, old); - } - - rte_update_unlock(c); + rte n0 = RTE_COPY_VALID(new_best); + if (n0.src || o) + rt_notify_basic(c, net, n0.src ? &n0 : NULL, o); } void -rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe) +rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); - struct rte_src *src = rpe->new ? rpe->new->rte.src : rpe->old->rte.src; - rte *old = RTES_OR_NULL(rpe->old); - struct rte_storage *new_any = rpe->new; - while (rpe) + rte *n = RTE_VALID_OR_NULL(first->new); + rte *o = RTE_VALID_OR_NULL(first->old); + + if (!n && !o) { - channel_rpe_mark_seen(req, rpe); - new_any = rpe->new; - rpe = rpe_next(rpe, src); + channel_rpe_mark_seen(req, first); + return; } - if (&new_any->rte != old) + struct rte_src *src = n ? n->src : o->src; + struct rte_storage *new_latest = first->new; + + RPE_WALK(first, rpe, src) { - rte n0, *new = RTES_CLONE(new_any, &n0); - rt_notify_basic(c, net, new, old); + channel_rpe_mark_seen(req, rpe); + new_latest = rpe->new; } - rte_update_unlock(c); + rte n0 = RTE_COPY_VALID(new_latest); + if (n0.src || o) + rt_notify_basic(c, net, n0.src ? &n0 : NULL, o); } void -rt_feed_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) +rt_feed_any(struct rt_export_request *req, const net_addr *net, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) { struct channel *c = SKIP_BACK(struct channel, out_req, req); - rte_update_lock(c); for (uint i=0; i<count; i++) + if (rte_is_valid(feed[i])) + { + rte n0 = *feed[i]; + rt_notify_basic(c, net, &n0, NULL); + } + + RPE_WALK(first, rpe, NULL) { - rte n0 = *feed[i]; - rt_notify_basic(c, net, &n0, NULL); + channel_rpe_mark_seen(req, rpe); + if (rpe == last) + break; } - - rte_update_unlock(c); } void @@ -905,14 +1194,38 @@ rpe_next(struct rt_pending_export *rpe, struct rte_src *src) } static struct rt_pending_export * rt_next_export_fast(struct rt_pending_export *last); -static void -rte_export(struct rt_export_hook *hook, struct rt_pending_export *rpe) +static int +rte_export(struct rt_table_export_hook *th, struct rt_pending_export *rpe) { + rtable *tab = RT_PUB(SKIP_BACK(struct rtable_private, exporter, th->table)); + struct rt_export_hook *hook = &th->h; if (bmap_test(&hook->seq_map, rpe->seq)) - goto seen; + goto ignore; /* Seen already */ const net_addr *n = rpe->new_best ? rpe->new_best->rte.net : rpe->old_best->rte.net; + switch (hook->req->addr_mode) + { + case TE_ADDR_NONE: + break; + + case TE_ADDR_IN: + if (!net_in_netX(n, hook->req->addr)) + goto ignore; + break; + + case TE_ADDR_EQUAL: + if (!net_equal(n, hook->req->addr)) + goto ignore; + break; + + case TE_ADDR_FOR: + bug("Continuos export of best prefix match not implemented yet."); + + default: + bug("Strange table export address mode: %d", hook->req->addr_mode); + } + if (rpe->new) hook->stats.updates_received++; else @@ -922,36 +1235,34 @@ rte_export(struct rt_export_hook *hook, struct rt_pending_export *rpe) hook->req->export_one(hook->req, n, rpe); else if (hook->req->export_bulk) { - RT_LOCK(hook->table); net *net = SKIP_BACK(struct network, n.addr, (net_addr (*)[0]) n); + RT_LOCK(tab); + struct rt_pending_export *last = net->last; uint count = rte_feed_count(net); - rte **feed = NULL; + const rte **feed = NULL; if (count) { feed = alloca(count * sizeof(rte *)); rte_feed_obtain(net, feed, count); } - RT_UNLOCK(hook->table); - hook->req->export_bulk(hook->req, n, rpe, feed, count); + RT_UNLOCK(tab); + hook->req->export_bulk(hook->req, n, rpe, last, feed, count); } else bug("Export request must always provide an export method"); -seen: +ignore: /* Get the next export if exists */ - hook->rpe_next = rt_next_export_fast(rpe); + th->rpe_next = rt_next_export_fast(rpe); /* The last block may be available to free */ - if (PAGE_HEAD(hook->rpe_next) != PAGE_HEAD(rpe)) - { - RT_LOCK(hook->table); - rt_export_used(RT_PRIV(hook->table)); - RT_UNLOCK(hook->table); - } + int used = (PAGE_HEAD(th->rpe_next) != PAGE_HEAD(rpe)); /* Releasing this export for cleanup routine */ DBG("store hook=%p last_export=%p seq=%lu\n", hook, rpe, rpe->seq); - atomic_store_explicit(&hook->last_export, rpe, memory_order_release); + atomic_store_explicit(&th->last_export, rpe, memory_order_release); + + return used; } /** @@ -986,47 +1297,28 @@ seen: * done outside of scope of rte_announce(). */ static void -rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_storage *old, +rte_announce(struct rtable_private *tab, net *net, struct rte_storage *new, struct rte_storage *old, struct rte_storage *new_best, struct rte_storage *old_best) { - if (!new_best || !rte_is_valid(&new_best->rte)) - new_best = NULL; - - if (!old_best || !rte_is_valid(&old_best->rte)) - old_best = NULL; - - if (!new || !rte_is_valid(&new->rte)) - new = NULL; - - if (old && !rte_is_valid(&old->rte)) - { - /* Filtered old route isn't announced, should be freed immediately. */ - rte_free(old, tab); - old = NULL; - } + int new_best_valid = rte_is_valid(RTE_OR_NULL(new_best)); + int old_best_valid = rte_is_valid(RTE_OR_NULL(old_best)); if ((new == old) && (new_best == old_best)) return; - if (new_best != old_best) - { - if (new_best) - new_best->rte.sender->stats.pref++; - if (old_best) - old_best->rte.sender->stats.pref--; + if (new_best_valid) + new_best->rte.sender->stats.pref++; + if (old_best_valid) + old_best->rte.sender->stats.pref--; - if (tab->hostcache) - rt_notify_hostcache(tab, net); - } - - if (EMPTY_LIST(tab->exports) && EMPTY_LIST(tab->pending_exports)) + if (EMPTY_LIST(tab->exporter.e.hooks) && EMPTY_LIST(tab->exporter.pending)) { /* No export hook and no pending exports to cleanup. We may free the route immediately. */ if (!old) return; hmap_clear(&tab->id_map, old->rte.id); - rte_free(old, tab); + rte_free(old); return; } @@ -1034,9 +1326,9 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ struct rt_export_block *rpeb = NULL, *rpebsnl = NULL; u32 end = 0; - if (!EMPTY_LIST(tab->pending_exports)) + if (!EMPTY_LIST(tab->exporter.pending)) { - rpeb = TAIL(tab->pending_exports); + rpeb = TAIL(tab->exporter.pending); end = atomic_load_explicit(&rpeb->end, memory_order_relaxed); if (end >= RT_PENDING_EXPORT_ITEMS) { @@ -1052,7 +1344,7 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ { rpeb = alloc_page(); *rpeb = (struct rt_export_block) {}; - add_tail(&tab->pending_exports, &rpeb->n); + add_tail(&tab->exporter.pending, &rpeb->n); } /* Fill the pending export */ @@ -1062,10 +1354,14 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ .new_best = new_best, .old = old, .old_best = old_best, - .seq = tab->next_export_seq++, + .seq = tab->exporter.next_seq++, }; - DBG("rte_announce: table=%s net=%N new=%p from %p old=%p from %p new_best=%p old_best=%p seq=%lu\n", tab->name, net->n.addr, new, new ? new->sender : NULL, old, old ? old->sender : NULL, new_best, old_best, rpe->seq); + DBGL("rte_announce: table=%s net=%N new=%p id %u from %s old=%p id %u from %s new_best=%p id %u old_best=%p id %u seq=%lu", + tab->name, net->n.addr, + new, new ? new->rte.id : 0, new ? new->rte.sender->req->name : NULL, + old, old ? old->rte.id : 0, old ? old->rte.sender->req->name : NULL, + new_best, old_best, rpe->seq); ASSERT_DIE(atomic_fetch_add_explicit(&rpeb->end, 1, memory_order_release) == end); @@ -1084,7 +1380,7 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ &net->last->next, &rpenull, rpe, memory_order_relaxed, memory_order_relaxed)); - + } net->last = rpe; @@ -1092,19 +1388,10 @@ rte_announce(rtable_private *tab, net *net, struct rte_storage *new, struct rte_ if (!net->first) net->first = rpe; - if (tab->first_export == NULL) - tab->first_export = rpe; + if (tab->exporter.first == NULL) + tab->exporter.first = rpe; - if (!EMPTY_LIST(tab->exports) && - (tab->first_export->seq + tab->config->cork_limit <= tab->next_export_seq) && - !tab->cork_active) - { - if (config->table_debug) - log(L_TRACE "%s: cork activated", tab->name); - - ev_cork(&rt_cork); - tab->cork_active = 1; - } + rt_check_cork_high(tab); } static struct rt_pending_export * @@ -1133,8 +1420,10 @@ rt_next_export_fast(struct rt_pending_export *last) } static struct rt_pending_export * -rt_next_export(struct rt_export_hook *hook, rtable_private *tab) +rt_next_export(struct rt_table_export_hook *hook, struct rt_table_exporter *tab) { + ASSERT_DIE(RT_IS_LOCKED(SKIP_BACK(struct rtable_private, exporter, tab))); + /* As the table is locked, it is safe to reload the last export pointer */ struct rt_pending_export *last = atomic_load_explicit(&hook->last_export, memory_order_acquire); @@ -1144,62 +1433,75 @@ rt_next_export(struct rt_export_hook *hook, rtable_private *tab) /* No, therefore we must process the table's first pending export */ else - return tab->first_export; + return tab->first; } static inline void rt_send_export_event(struct rt_export_hook *hook) { - ev_send(hook->req->list, hook->event); + ev_send(hook->req->list, &hook->event); } static void -rt_announce_exports(void *data) +rt_announce_exports(struct settle *s) { - rtable_private *tab = data; - ASSERT_DIE(birdloop_inside(tab->loop)); - - rt_schedule_notify(tab); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, export_settle, s)), tab) + if (!EMPTY_LIST(tab->exporter.pending)) + { + struct rt_export_hook *c; node *n; + WALK_LIST2(c, n, tab->exporter.e.hooks, n) + { + if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_READY) + continue; - struct rt_export_hook *c; node *n; - WALK_LIST2(c, n, tab->exports, n) - { - if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_READY) - continue; + rt_send_export_event(c); + } + } +} - rt_send_export_event(c); - } +static void +rt_kick_export_settle(struct rtable_private *tab) +{ + tab->export_settle.cf = tab->rr_counter ? tab->config->export_rr_settle : tab->config->export_settle; + settle_kick(&tab->export_settle, tab->loop); } static void -rt_import_announce_exports(void *data) +rt_import_announce_exports(void *_hook) { - struct rt_import_hook *hook = data; - RT_LOCKED(hook->table, tab) + struct rt_import_hook *hook = _hook; + if (hook->import_state == TIS_CLEARED) { - if (hook->import_state == TIS_CLEARED) + void (*stopped)(struct rt_import_request *) = hook->stopped; + struct rt_import_request *req = hook->req; + + RT_LOCKED(hook->table, tab) { - rfree(hook->export_announce_event); + req->hook = NULL; - ev_send(hook->stopped->list, hook->stopped); + rt_trace(tab, D_EVENTS, "Hook %s stopped", req->name); rem_node(&hook->n); mb_free(hook); rt_unlock_table(tab); } - else - ev_send_loop(tab->loop, tab->announce_event); + + stopped(req); + return; } + + rt_trace(hook->table, D_EVENTS, "Announcing exports after imports from %s", hook->req->name); + birdloop_flag(hook->table->loop, RTF_EXPORT); } static struct rt_pending_export * -rt_last_export(rtable_private *tab) +rt_last_export(struct rt_table_exporter *tab) { struct rt_pending_export *rpe = NULL; - if (!EMPTY_LIST(tab->pending_exports)) + if (!EMPTY_LIST(tab->pending)) { /* We'll continue processing exports from this export on */ - struct rt_export_block *reb = TAIL(tab->pending_exports); + struct rt_export_block *reb = TAIL(tab->pending); ASSERT_DIE(reb->end); rpe = &reb->export[reb->end - 1]; } @@ -1212,35 +1514,47 @@ rt_last_export(rtable_private *tab) static void rt_export_hook(void *_data) { - struct rt_export_hook *c = _data; + struct rt_table_export_hook *c = _data; + rtable *tab = SKIP_BACK(rtable, priv.exporter, c->table); - ASSERT_DIE(atomic_load_explicit(&c->export_state, memory_order_relaxed) == TES_READY); + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_READY); if (!c->rpe_next) { - RT_LOCK(c->table); - c->rpe_next = rt_next_export(c, RT_PRIV(c->table)); + RT_LOCK(tab); + c->rpe_next = rt_next_export(c, c->table); if (!c->rpe_next) { - rt_export_used(RT_PRIV(c->table)); - RT_UNLOCK(c->table); + rt_export_used(c->table, c->h.req->name, "done exporting"); + RT_UNLOCK(tab); return; } - RT_UNLOCK(c->table); + RT_UNLOCK(tab); } + int used = 0; + int no_next = 0; + /* Process the export */ for (uint i=0; i<RT_EXPORT_BULK; i++) { - rte_export(c, c->rpe_next); + used += rte_export(c, c->rpe_next); if (!c->rpe_next) + { + no_next = 1; break; + } } - rt_send_export_event(c); + if (used) + RT_LOCKED(tab, t) + if (no_next || t->cork_active) + rt_export_used(c->table, c->h.req->name, no_next ? "finished export bulk" : "cork active"); + + rt_send_export_event(&c->h); } @@ -1267,16 +1581,29 @@ rte_validate(struct channel *ch, rte *e) return 0; } - if (net_type_match(n, NB_DEST) == !e->attrs->dest) + if (net_type_match(n, NB_DEST)) { - log(L_WARN "Ignoring route %N with invalid dest %d received via %s", - n, e->attrs->dest, ch->proto->name); - return 0; - } + eattr *nhea = ea_find(e->attrs, &ea_gen_nexthop); + int dest = nhea_dest(nhea); - if ((e->attrs->dest == RTD_UNICAST) && !nexthop_is_sorted(&(e->attrs->nh))) + if (dest == RTD_NONE) + { + log(L_WARN "Ignoring route %N with no destination received via %s", + n, ch->proto->name); + return 0; + } + + if ((dest == RTD_UNICAST) && + !nexthop_is_sorted((struct nexthop_adata *) nhea->u.ptr)) + { + log(L_WARN "Ignoring unsorted multipath route %N received via %s", + n, ch->proto->name); + return 0; + } + } + else if (ea_find(e->attrs, &ea_gen_nexthop)) { - log(L_WARN "Ignoring unsorted multipath route %N received via %s", + log(L_WARN "Ignoring route %N having a nexthop attribute received via %s", n, ch->proto->name); return 0; } @@ -1287,9 +1614,7 @@ rte_validate(struct channel *ch, rte *e) static int rte_same(rte *x, rte *y) { - ASSERT_DIE(x->attrs->cached && y->attrs->cached); - - /* rte.flags are not checked, as they are mostly internal to rtable */ + /* rte.flags / rte.pflags are not checked, as they are internal to rtable */ return x->attrs == y->attrs && x->src == y->src && @@ -1298,8 +1623,8 @@ rte_same(rte *x, rte *y) static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); } -static void -rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte *new, struct rte_src *src) +static int +rte_recalculate(struct rtable_private *table, struct rt_import_hook *c, net *net, rte *new, struct rte_src *src) { struct rt_import_request *req = c->req; struct rt_import_stats *stats = &c->stats; @@ -1307,22 +1632,17 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * rte *old_best = old_best_stored ? &old_best_stored->rte : NULL; rte *old = NULL; - /* Set the stale cycle unless already set */ - if (new && !(new->flags & REF_USE_STALE)) - new->stale_cycle = c->stale_set; + /* If the new route is identical to the old one, we find the attributes in + * cache and clone these with no performance drop. OTOH, if we were to lookup + * the attributes, such a route definitely hasn't been anywhere yet, + * therefore it's definitely worth the time. */ + struct rte_storage *new_stored = NULL; + if (new) + new = &(new_stored = rte_store(new, net, table))->rte; /* Find and remove original route from the same protocol */ struct rte_storage **before_old = rte_find(net, src); - if (!*before_old && !new) - { - stats->withdraws_ignored++; - return; - } - - if (new) - new->attrs = rta_is_cached(new->attrs) ? rta_clone(new->attrs) : rta_lookup(new->attrs); - if (*before_old) { old = &(old_stored = (*before_old))->rte; @@ -1342,7 +1662,7 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * c->table->name, net->n.addr, old->src->owner->name, old->src->private_id, old->src->global_id); } - if (new && rte_same(old, new)) + if (new && rte_same(old, &new_stored->rte)) { /* No changes, ignore the new route and refresh the old one */ old->stale_cycle = new->stale_cycle; @@ -1353,16 +1673,28 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * rt_rte_trace_in(D_ROUTES, req, new, "ignored"); } - rta_free(new->attrs); - return; + /* We need to free the already stored route here before returning */ + rte_free(new_stored); + return 0; } *before_old = (*before_old)->next; table->rt_count--; } - if (req->preimport) - new = req->preimport(req, new, old); + if (!old && !new) + { + stats->withdraws_ignored++; + return 0; + } + + /* If rejected by import limit, we need to pretend there is no route */ + if (req->preimport && (req->preimport(req, new, old) == 0)) + { + rte_free(new_stored); + new_stored = NULL; + new = NULL; + } int new_ok = rte_is_ok(new); int old_ok = rte_is_ok(old); @@ -1377,8 +1709,6 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * if (old_ok || new_ok) table->last_rt_change = current_time(); - struct rte_storage *new_stored = new ? rte_store(new, net, table) : NULL; - if (table->config->sorted) { /* If routes are sorted, just insert new route to appropriate position */ @@ -1474,121 +1804,81 @@ rte_recalculate(rtable_private *table, struct rt_import_hook *c, net *net, rte * hmap_set(&table->id_map, new_stored->rte.id); } - _Bool nb = (new_stored == net->routes); - _Bool ob = (old_best == old); - /* Log the route change */ - if (new_ok && old_ok) + if (new_ok) + rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, new_stored == net->routes ? "added [best]" : "added"); + else if (old_ok) { - const char *best_indicator[2][2] = { { "updated", "updated [-best]" }, { "updated [+best]", "updated [best]" } }; - rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, best_indicator[nb][ob]); + if (old != old_best) + rt_rte_trace_in(D_ROUTES, req, old, "removed"); + else if (net->routes && rte_is_ok(&net->routes->rte)) + rt_rte_trace_in(D_ROUTES, req, old, "removed [replaced]"); + else + rt_rte_trace_in(D_ROUTES, req, old, "removed [sole]"); } - else if (new_ok) - rt_rte_trace_in(D_ROUTES, req, &new_stored->rte, - (!net->routes->next || !rte_is_ok(&net->routes->next->rte)) ? "added [sole]" : - nb ? "added [best]" : "added"); - else if (old_ok) - rt_rte_trace_in(D_ROUTES, req, old, - (!net->routes || !rte_is_ok(&net->routes->rte)) ? "removed [sole]" : - ob ? "removed [best]" : "removed"); + else + if (req->trace_routes & D_ROUTES) + log(L_TRACE "%s > ignored %N %s->%s", req->name, net->n.addr, old ? "filtered" : "none", new ? "filtered" : "none"); /* Propagate the route change */ rte_announce(table, net, new_stored, old_stored, net->routes, old_best_stored); - if (!net->routes && - (table->gc_counter++ >= table->config->gc_max_ops) && - (table->gc_time + table->config->gc_min_time <= current_time())) - rt_schedule_prune(table); - -#if 0 - /* Enable and reimplement these callbacks if anybody wants to use them */ - if (old_ok && p->rte_remove) - p->rte_remove(net, old); - if (new_ok && p->rte_insert) - p->rte_insert(net, &new_stored->rte); -#endif - + return 1; } -rte * +int channel_preimport(struct rt_import_request *req, rte *new, rte *old) { struct channel *c = SKIP_BACK(struct channel, in_req, req); - if (!c->in_table) - { - if (new && !old) - if (CHANNEL_LIMIT_PUSH(c, RX)) - { - rta_free(new->attrs); - return NULL; - } + if (new && !old) + if (CHANNEL_LIMIT_PUSH(c, RX)) + return 0; - if (!new && old) - CHANNEL_LIMIT_POP(c, RX); - } + if (!new && old) + CHANNEL_LIMIT_POP(c, RX); int new_in = new && !rte_is_filtered(new); int old_in = old && !rte_is_filtered(old); if (new_in && !old_in) if (CHANNEL_LIMIT_PUSH(c, IN)) - if (c->in_keep_filtered) + if (c->in_keep & RIK_REJECTED) { new->flags |= REF_FILTERED; - return new; + return 1; } else - { - rta_free(new->attrs); - return NULL; - } + return 0; if (!new_in && old_in) CHANNEL_LIMIT_POP(c, IN); - return new; -} - -rte * -channel_in_preimport(struct rt_import_request *req, rte *new, rte *old) -{ - struct channel_aux_table *cat = SKIP_BACK(struct channel_aux_table, push, req); - - if (new && !old) - if (CHANNEL_LIMIT_PUSH(cat->c, RX)) - return NULL; - - if (!new && old) - CHANNEL_LIMIT_POP(cat->c, RX); - - return new; + return 1; } -void rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src); - void rte_update(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) { if (!c->in_req.hook) + { + log(L_WARN "%s.%s: Called rte_update without import hook", c->proto->name, c->name); return; + } ASSERT(c->channel_state == CS_UP); - if (c->in_table) - rte_import(&c->in_table->push, n, new, src); - else - rte_update_direct(c, n, new, src); -} + /* The import reloader requires prefilter routes to be the first layer */ + if (new && (c->in_keep & RIK_PREFILTER)) + if (ea_is_cached(new->attrs) && !new->attrs->next) + new->attrs = ea_clone(new->attrs); + else + new->attrs = ea_lookup(new->attrs, 0); -void -rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src *src) -{ const struct filter *filter = c->in_filter; struct channel_import_stats *stats = &c->import_stats; - rte_update_lock(c); if (new) { new->net = n; @@ -1596,30 +1886,49 @@ rte_update_direct(struct channel *c, const net_addr *n, rte *new, struct rte_src int fr; stats->updates_received++; - if (!rte_validate(c, new)) - { - channel_rte_trace_in(D_FILTERS, c, new, "invalid"); - stats->updates_invalid++; - new = NULL; - } - else if ((filter == FILTER_REJECT) || - ((fr = f_run(filter, new, c->rte_update_pool, 0)) > F_ACCEPT)) + if ((filter == FILTER_REJECT) || + ((fr = f_run(filter, new, 0)) > F_ACCEPT)) { stats->updates_filtered++; channel_rte_trace_in(D_FILTERS, c, new, "filtered out"); - if (c->in_keep_filtered) + if (c->in_keep & RIK_REJECTED) new->flags |= REF_FILTERED; else new = NULL; } + + if (new) + if (net_is_flow(n)) + rt_flowspec_resolve_rte(new, c); + else + rt_next_hop_resolve_rte(new); + + if (new && !rte_validate(c, new)) + { + channel_rte_trace_in(D_FILTERS, c, new, "invalid"); + stats->updates_invalid++; + new = NULL; + } + } else stats->withdraws_received++; rte_import(&c->in_req, n, new, src); - rte_update_unlock(c); + /* Now the route attributes are kept by the in-table cached version + * and we may drop the local handle */ + if (new && (c->in_keep & RIK_PREFILTER)) + { + /* There may be some updates on top of the original attribute block */ + ea_list *a = new->attrs; + while (a->next) + a = a->next; + + ea_free(a); + } + } void @@ -1627,91 +1936,99 @@ rte_import(struct rt_import_request *req, const net_addr *n, rte *new, struct rt { struct rt_import_hook *hook = req->hook; if (!hook) + { + log(L_WARN "%s: Called rte_import without import hook", req->name); return; + } - RT_LOCK(hook->table); - rtable_private *tab = RT_PRIV(hook->table); - - net *nn; - if (new) + RT_LOCKED(hook->table, tab) + { + net *nn; + if (new) { /* Use the actual struct network, not the dummy one */ nn = net_get(tab, n); new->net = nn->n.addr; new->sender = hook; + + /* Set the stale cycle */ + new->stale_cycle = hook->stale_set; } - else if (!(nn = net_find(tab, n))) + else if (!(nn = net_find(tab, n))) { req->hook->stats.withdraws_ignored++; - RT_UNLOCK(tab); - return; + if (req->trace_routes & D_ROUTES) + log(L_TRACE "%s > ignored %N withdraw", req->name, n); + RT_RETURN(tab); } - /* And recalculate the best route */ - rte_recalculate(tab, hook, nn, new, src); - - /* Schedule export announcement */ - ev_send(req->list, hook->export_announce_event); - - /* Done! */ - RT_UNLOCK(tab); + /* Recalculate the best route */ + if (rte_recalculate(tab, hook, nn, new, src)) + ev_send(req->list, &hook->announce_event); + } } /* Check rtable for best route to given net whether it would be exported do p */ int -rt_examine(rtable_private *t, net_addr *a, struct channel *c, const struct filter *filter) +rt_examine(rtable *tp, net_addr *a, struct channel *c, const struct filter *filter) { - net *n = net_find(t, a); + rte rt = {}; - if (!n || !n->routes) - return 0; - - rte rt = n->routes->rte; + RT_LOCKED(tp, t) + { + net *n = net_find(t, a); + if (n) + rt = RTE_COPY_VALID(n->routes); + } - if (!rte_is_valid(&rt)) + if (!rt.src) return 0; - rte_update_lock(c); - - /* Rest is stripped down export_filter() */ int v = c->proto->preexport ? c->proto->preexport(c, &rt) : 0; if (v == RIC_PROCESS) - v = (f_run(filter, &rt, c->rte_update_pool, FF_SILENT) <= F_ACCEPT); - - rte_update_unlock(c); + v = (f_run(filter, &rt, FF_SILENT) <= F_ACCEPT); return v > 0; } static void -rt_export_stopped(void *data) +rt_table_export_done(void *hh) { - struct rt_export_hook *hook = data; + struct rt_table_export_hook *hook = hh; + struct rt_export_request *req = hook->h.req; + void (*stopped)(struct rt_export_request *) = hook->h.stopped; + rtable *t = SKIP_BACK(rtable, priv.exporter, hook->table); - RT_LOCKED(hook->table, tab) + RT_LOCKED(t, tab) { + DBG("Export hook %p in table %s finished uc=%u\n", hook, tab->name, tab->use_count); + /* Drop pending exports */ - rt_export_used(tab); + rt_export_used(&tab->exporter, hook->h.req->name, "stopped"); - /* Unlist */ - rem_node(&hook->n); + /* Do the common code; this frees the hook */ + rt_export_stopped(&hook->h); } /* Report the channel as stopped. */ - hook->stopped(hook->req); + CALL(stopped, req); - RT_LOCKED(hook->table, tab) - { - /* Free the hook together with its coroutine. */ - rp_free(hook->pool, tab->rp); - rt_unlock_table(tab); + /* Unlock the table; this may free it */ + rt_unlock_table(t); +} - rt_fast_prune_check(tab); +void +rt_export_stopped(struct rt_export_hook *hook) +{ + /* Unlink from the request */ + hook->req->hook = NULL; - DBG("Export hook %p in table %s finished uc=%u\n", hook, tab->name, tab->use_count); - } -} + /* Unlist */ + rem_node(&hook->n); + /* Free the hook itself together with its pool */ + rfree(hook->pool); +} static inline void rt_set_import_state(struct rt_import_hook *hook, u8 state) @@ -1719,128 +2036,292 @@ rt_set_import_state(struct rt_import_hook *hook, u8 state) hook->last_state_change = current_time(); hook->import_state = state; - if (hook->req->log_state_change) - hook->req->log_state_change(hook->req, state); + CALL(hook->req->log_state_change, hook->req, state); } -static inline void +void rt_set_export_state(struct rt_export_hook *hook, u8 state) { hook->last_state_change = current_time(); - atomic_store_explicit(&hook->export_state, state, memory_order_release); + u8 old = atomic_exchange_explicit(&hook->export_state, state, memory_order_release); - if (hook->req->log_state_change) - hook->req->log_state_change(hook->req, state); + if (old != state) + CALL(hook->req->log_state_change, hook->req, state); } void rt_request_import(rtable *t, struct rt_import_request *req) { - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); - rt_lock_table(tab); - - ASSERT_DIE(!tab->delete); - - struct rt_import_hook *hook = req->hook = mb_allocz(tab->rp, sizeof(struct rt_import_hook)); - - DBG("Lock table %s for import %p req=%p uc=%u\n", tab->name, hook, req, tab->use_count); - - hook->req = req; - hook->table = t; + RT_LOCKED(t, tab) + { + rt_lock_table(tab); - hook->export_announce_event = ev_new_init(tab->rp, rt_import_announce_exports, hook); + struct rt_import_hook *hook = req->hook = mb_allocz(tab->rp, sizeof(struct rt_import_hook)); - if (!hook->stale_set) - hook->stale_set = hook->stale_valid = hook->stale_pruning = hook->stale_pruned = 1; + hook->announce_event = (event) { .hook = rt_import_announce_exports, .data = hook }; - rt_set_import_state(hook, TIS_UP); + DBG("Lock table %s for import %p req=%p uc=%u\n", tab->name, hook, req, tab->use_count); - hook->n = (node) {}; - add_tail(&tab->imports, &hook->n); - tab->imports_up++; + hook->req = req; + hook->table = t; - RT_UNLOCK(t); + rt_set_import_state(hook, TIS_UP); + add_tail(&tab->imports, &hook->n); + } } void -rt_stop_import(struct rt_import_request *req, event *stopped) +rt_stop_import(struct rt_import_request *req, void (*stopped)(struct rt_import_request *)) { ASSERT_DIE(req->hook); struct rt_import_hook *hook = req->hook; - rtable_private *tab = RT_LOCK(hook->table); + RT_LOCKED(hook->table, tab) + { + rt_schedule_prune(tab); + rt_set_import_state(hook, TIS_STOP); + hook->stopped = stopped; - rt_schedule_prune(tab); + /* Cancel table rr_counter */ + if (hook->stale_set != hook->stale_pruned) + tab->rr_counter -= (hook->stale_set - hook->stale_pruned); - tab->imports_up--; - rt_fast_prune_check(tab); + tab->rr_counter++; - rt_set_import_state(hook, TIS_STOP); + hook->stale_set = hook->stale_pruned = hook->stale_pruning = hook->stale_valid = 0; + } +} - hook->stopped = stopped; +static void rt_table_export_start_feed(struct rtable_private *tab, struct rt_table_export_hook *hook); +static void +rt_table_export_uncork(void *_hook) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); - if (hook->stale_set < hook->stale_valid) - if (!--tab->rr_count) - rt_schedule_notify(tab); + struct rt_table_export_hook *hook = _hook; + struct birdloop *loop = hook->h.req->list->loop; - RT_UNLOCK(tab); + if (loop != &main_birdloop) + birdloop_enter(loop); + + u8 state; + switch (state = atomic_load_explicit(&hook->h.export_state, memory_order_relaxed)) + { + case TES_HUNGRY: + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, hook->table)), tab) + if ((state = atomic_load_explicit(&hook->h.export_state, memory_order_relaxed)) == TES_HUNGRY) + rt_table_export_start_feed(tab, hook); + if (state != TES_STOP) + break; + /* fall through */ + case TES_STOP: + rt_stop_export_common(&hook->h); + break; + default: + bug("Uncorking a table export in a strange state: %u", state); + } + + if (loop != &main_birdloop) + birdloop_leave(loop); } -void -rt_request_export(rtable *t, struct rt_export_request *req) +static void +rt_table_export_start_locked(struct rtable_private *tab, struct rt_export_request *req) { - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); + struct rt_exporter *re = &tab->exporter.e; rt_lock_table(tab); - pool *p = rp_new(tab->rp, tab->loop, "Export hook"); - struct rt_export_hook *hook = req->hook = mb_allocz(p, sizeof(struct rt_export_hook)); - hook->pool = p; - - hook->req = req; - hook->table = t; + req->hook = rt_alloc_export(re, sizeof(struct rt_table_export_hook)); + req->hook->req = req; + + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, req->hook); + hook->h.event = (event) { + .hook = rt_table_export_uncork, + .data = hook, + }; + + if (rt_cork_check(&hook->h.event)) + rt_set_export_state(&hook->h, TES_HUNGRY); + else + rt_table_export_start_feed(tab, hook); +} + +static void +rt_table_export_start_feed(struct rtable_private *tab, struct rt_table_export_hook *hook) +{ + struct rt_exporter *re = &tab->exporter.e; + struct rt_export_request *req = hook->h.req; /* stats zeroed by mb_allocz */ + switch (req->addr_mode) + { + case TE_ADDR_IN: + if (tab->trie && net_val_match(tab->addr_type, NB_IP)) + { + hook->walk_state = mb_allocz(hook->h.pool, sizeof (struct f_trie_walk_state)); + hook->walk_lock = rt_lock_trie(tab); + trie_walk_init(hook->walk_state, tab->trie, req->addr); + hook->h.event.hook = rt_feed_by_trie; + hook->walk_last.type = 0; + break; + } + /* fall through */ + case TE_ADDR_NONE: + FIB_ITERATE_INIT(&hook->feed_fit, &tab->fib); + hook->h.event.hook = rt_feed_by_fib; + break; - bmap_init(&hook->seq_map, p, 1024); + case TE_ADDR_EQUAL: + hook->h.event.hook = rt_feed_equal; + break; - rt_set_export_state(hook, TES_HUNGRY); + case TE_ADDR_FOR: + hook->h.event.hook = rt_feed_for; + break; - hook->n = (node) {}; - add_tail(&tab->exports, &hook->n); + default: + bug("Requested an unknown export address mode"); + } DBG("New export hook %p req %p in table %s uc=%u\n", hook, req, tab->name, tab->use_count); - hook->event = ev_new_init(p, rt_feed_channel, hook); - RT_UNLOCK(t); + struct rt_pending_export *rpe = rt_last_export(hook->table); + DBG("store hook=%p last_export=%p seq=%lu\n", hook, rpe, rpe ? rpe->seq : 0); + atomic_store_explicit(&hook->last_export, rpe, memory_order_relaxed); + rt_init_export(re, req->hook); +} + +static void +rt_table_export_start(struct rt_exporter *re, struct rt_export_request *req) +{ + RT_LOCKED(SKIP_BACK(rtable, priv.exporter.e, re), tab) + rt_table_export_start_locked(tab, req); +} + +void rt_request_export(rtable *t, struct rt_export_request *req) +{ + RT_LOCKED(t, tab) + rt_table_export_start_locked(tab, req); /* Is locked inside */ +} + +void +rt_request_export_other(struct rt_exporter *re, struct rt_export_request *req) +{ + return re->class->start(re, req); +} + +struct rt_export_hook * +rt_alloc_export(struct rt_exporter *re, uint size) +{ + pool *p = rp_new(re->rp, "Export hook"); + struct rt_export_hook *hook = mb_allocz(p, size); + + hook->pool = p; + hook->table = re; + + hook->n = (node) {}; + add_tail(&re->hooks, &hook->n); + + return hook; +} + +void +rt_init_export(struct rt_exporter *re UNUSED, struct rt_export_hook *hook) +{ + hook->event.data = hook; + + bmap_init(&hook->seq_map, hook->pool, 16); + + /* Regular export */ + rt_set_export_state(hook, TES_FEEDING); rt_send_export_event(hook); } +static int +rt_table_export_stop_locked(struct rt_export_hook *hh) +{ + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, hh); + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter, hook->table); + + switch (atomic_load_explicit(&hh->export_state, memory_order_relaxed)) + { + case TES_HUNGRY: + rt_trace(tab, D_EVENTS, "Stopping export hook %s must wait for uncorking", hook->h.req->name); + return 0; + case TES_FEEDING: + switch (hh->req->addr_mode) + { + case TE_ADDR_IN: + if (hook->walk_lock) + { + rt_unlock_trie(tab, hook->walk_lock); + hook->walk_lock = NULL; + mb_free(hook->walk_state); + hook->walk_state = NULL; + break; + } + /* fall through */ + case TE_ADDR_NONE: + fit_get(&tab->fib, &hook->feed_fit); + break; + } + break; + + case TES_STOP: + bug("Tried to repeatedly stop the same export hook %s", hook->h.req->name); + } + + rt_trace(tab, D_EVENTS, "Stopping export hook %s right now", hook->h.req->name); + return 1; +} + +static void +rt_table_export_stop(struct rt_export_hook *hh) +{ + struct rt_table_export_hook *hook = SKIP_BACK(struct rt_table_export_hook, h, hh); + int ok = 0; + rtable *t = SKIP_BACK(rtable, priv.exporter, hook->table); + if (RT_IS_LOCKED(t)) + ok = rt_table_export_stop_locked(hh); + else + RT_LOCKED(t, tab) + ok = rt_table_export_stop_locked(hh); + + if (ok) + rt_stop_export_common(hh); + else + rt_set_export_state(&hook->h, TES_STOP); +} + void rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_request *)) { + ASSERT_DIE(birdloop_inside(req->list->loop)); ASSERT_DIE(req->hook); struct rt_export_hook *hook = req->hook; - RT_LOCK(hook->table); - rtable_private *tab = RT_PRIV(hook->table); + /* Set the stopped callback */ + hook->stopped = stopped; - /* Stop feeding */ - ev_postpone(hook->event); + /* Run the stop code */ + if (hook->table->class->stop) + hook->table->class->stop(hook); + else + rt_stop_export_common(hook); +} - if (atomic_load_explicit(&hook->export_state, memory_order_relaxed) == TES_FEEDING) - fit_get(&tab->fib, &hook->feed_fit); +void +rt_stop_export_common(struct rt_export_hook *hook) +{ + /* Update export state */ + rt_set_export_state(hook, TES_STOP); - hook->event->hook = rt_export_stopped; - hook->stopped = stopped; + /* Reset the event as the stopped event */ + hook->event.hook = hook->table->class->done; + /* Run the stopped event */ rt_send_export_event(hook); - - RT_UNLOCK(hook->table); - - rt_set_export_state(hook, TES_STOP); } /** @@ -1853,48 +2334,48 @@ rt_stop_export(struct rt_export_request *req, void (*stopped)(struct rt_export_r * routes to the routing table (by rte_update()). After that, all protocol * routes (more precisely routes with @c as @sender) not sent during the * refresh cycle but still in the table from the past are pruned. This is - * implemented by setting rte->stale_cycle to req->stale_set in rte_update() - * and then dropping all routes with old stale_cycle values in table prune loop. */ + * implemented by marking all related routes as stale by REF_STALE flag in + * rt_refresh_begin(), then marking all related stale routes with REF_DISCARD + * flag in rt_refresh_end() and then removing such routes in the prune loop. + */ void rt_refresh_begin(struct rt_import_request *req) { struct rt_import_hook *hook = req->hook; ASSERT_DIE(hook); - RT_LOCK(hook->table); - rtable_private *tab = RT_PRIV(hook->table); - - ASSERT_DIE(hook->stale_set == hook->stale_valid); + RT_LOCKED(hook->table, tab) + { /* If the pruning routine is too slow */ - if ((hook->stale_pruned < hook->stale_valid) && (hook->stale_pruned + 128 < hook->stale_valid) - || (hook->stale_pruned > hook->stale_valid) && (hook->stale_pruned > hook->stale_valid + 128)) + if (((hook->stale_set - hook->stale_pruned) & 0xff) >= 240) { - log(L_WARN "Route refresh flood in table %s", tab->name); + log(L_WARN "Route refresh flood in table %s (stale_set=%u, stale_pruned=%u)", hook->table->name, hook->stale_set, hook->stale_pruned); + + /* Forcibly set all old routes' stale cycle to zero. */ FIB_WALK(&tab->fib, net, n) { - for (struct rte_storage *e = n->routes; e; e = e->next) - if (e->rte.sender == req->hook) - e->rte.stale_cycle = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + if (e->rte.sender == req->hook) + e->rte.stale_cycle = 0; } FIB_WALK_END; - hook->stale_set = 1; - hook->stale_valid = 0; - hook->stale_pruned = 0; - } - else if (!++hook->stale_set) - { - /* Let's reserve the stale_cycle zero value for always-invalid routes */ - hook->stale_set = 1; - hook->stale_valid = 0; + + /* Smash the route refresh counter and zero everything. */ + tab->rr_counter -= hook->stale_set - hook->stale_pruned; + hook->stale_set = hook->stale_valid = hook->stale_pruning = hook->stale_pruned = 0; } - tab->rr_count++; + /* Now we can safely increase the stale_set modifier */ + hook->stale_set++; + + /* The table must know that we're route-refreshing */ + tab->rr_counter++; if (req->trace_routes & D_STATES) log(L_TRACE "%s: route refresh begin [%u]", req->name, hook->stale_set); - RT_UNLOCK(tab); + } } /** @@ -1911,19 +2392,21 @@ rt_refresh_end(struct rt_import_request *req) struct rt_import_hook *hook = req->hook; ASSERT_DIE(hook); - rtable_private *tab = RT_LOCK(hook->table); - hook->stale_valid++; - ASSERT_DIE(hook->stale_set == hook->stale_valid); - - rt_schedule_prune(tab); - - if (req->trace_routes & D_STATES) - log(L_TRACE "%s: route refresh end [%u]", req->name, hook->stale_valid); + RT_LOCKED(hook->table, tab) + { + /* Now valid routes are only those one with the latest stale_set value */ + uint cnt = hook->stale_set - hook->stale_valid; + hook->stale_valid = hook->stale_set; - if (!--tab->rr_count) - rt_schedule_notify(tab); + /* Here we can't kick the timer as we aren't in the table service loop */ + rt_schedule_prune(tab); - RT_UNLOCK(tab); + if (req->trace_routes & D_STATES) + if (cnt > 1) + log(L_TRACE "%s: route refresh end (x%u) [%u]", req->name, cnt, hook->stale_valid); + else + log(L_TRACE "%s: route refresh end [%u]", req->name, hook->stale_valid); + } } /** @@ -1937,7 +2420,7 @@ rte_dump(struct rte_storage *e) { debug("%-1N ", e->rte.net); debug("PF=%02x ", e->rte.pflags); - rta_dump(e->rte.attrs); + ea_dump(e->rte.attrs); debug("\n"); } @@ -1948,11 +2431,12 @@ rte_dump(struct rte_storage *e) * This function dumps contents of a given routing table to debug output. */ void -rt_dump(rtable *tab) +rt_dump(rtable *tp) { - RT_LOCK(tab); - rtable_private *t = RT_PRIV(tab); - debug("Dump of routing table <%s>%s\n", t->name, t->delete ? " (deleted)" : ""); + RT_LOCKED(tp, t) + { + + debug("Dump of routing table <%s>%s\n", t->name, t->deleted ? " (deleted)" : ""); #ifdef DEBUGGING fib_check(&t->fib); #endif @@ -1963,7 +2447,8 @@ rt_dump(rtable *tab) } FIB_WALK_END; debug("\n"); - RT_UNLOCK(tab); + + } } /** @@ -1979,16 +2464,20 @@ rt_dump_all(void) WALK_LIST2(t, n, routing_tables, n) rt_dump(t); + + WALK_LIST2(t, n, deleted_routing_tables, n) + rt_dump(t); } void -rt_dump_hooks(rtable *t) +rt_dump_hooks(rtable *tp) { - RT_LOCK(t); - rtable_private *tab = RT_PRIV(t); - debug("Dump of hooks in routing table <%s>%s\n", tab->name, tab->delete ? " (deleted)" : ""); - debug(" nhu_state=%u hcu_scheduled=%u use_count=%d rt_count=%u\n", - atomic_load(&tab->nhu_state), ev_active(tab->hcu_event), tab->use_count, tab->rt_count); + RT_LOCKED(tp, tab) + { + + debug("Dump of hooks in routing table <%s>%s\n", tab->name, tab->deleted ? " (deleted)" : ""); + debug(" nhu_state=%u use_count=%d rt_count=%u\n", + tab->nhu_state, tab->use_count, tab->rt_count); debug(" last_rt_change=%t gc_time=%t gc_counter=%d prune_state=%u\n", tab->last_rt_change, tab->gc_time, tab->gc_counter, tab->prune_state); @@ -2002,16 +2491,18 @@ rt_dump_hooks(rtable *t) ih->last_state_change, ih->import_state, ih->stopped); } - struct rt_export_hook *eh; - WALK_LIST(eh, tab->exports) + struct rt_table_export_hook *eh; + WALK_LIST(eh, tab->exporter.e.hooks) { - eh->req->dump_req(eh->req); + eh->h.req->dump_req(eh->h.req); debug(" Export hook %p requested by %p:" " refeed_pending=%u last_state_change=%t export_state=%u\n", - eh, eh->req, eh->refeed_pending, eh->last_state_change, atomic_load_explicit(&eh->export_state, memory_order_relaxed)); + eh, eh->h.req, eh->refeed_pending, eh->h.last_state_change, + atomic_load_explicit(&eh->h.export_state, memory_order_relaxed)); } debug("\n"); - RT_UNLOCK(t); + + } } void @@ -2024,156 +2515,279 @@ rt_dump_hooks_all(void) WALK_LIST2(t, n, routing_tables, n) rt_dump_hooks(t); + + WALK_LIST2(t, n, deleted_routing_tables, n) + rt_dump_hooks(t); } static inline void -rt_schedule_nhu(rtable *tab) +rt_schedule_nhu(struct rtable_private *tab) { - atomic_fetch_or_explicit(&tab->nhu_state, NHU_SCHEDULED, memory_order_acq_rel); - ev_send_loop(tab->loop, tab->nhu_event); - - /* state change: - * NHU_CLEAN -> NHU_SCHEDULED - * NHU_RUNNING -> NHU_DIRTY - */ + if (tab->nhu_corked) + { + if (!(tab->nhu_corked & NHU_SCHEDULED)) + tab->nhu_corked |= NHU_SCHEDULED; + } + else if (!(tab->nhu_state & NHU_SCHEDULED)) + { + rt_trace(tab, D_EVENTS, "Scheduling NHU"); + + /* state change: + * NHU_CLEAN -> NHU_SCHEDULED + * NHU_RUNNING -> NHU_DIRTY + */ + if ((tab->nhu_state |= NHU_SCHEDULED) == NHU_SCHEDULED) + birdloop_flag(tab->loop, RTF_NHU); + } } void -rt_schedule_prune(rtable_private *tab) +rt_schedule_prune(struct rtable_private *tab) { if (tab->prune_state == 0) - ev_send_loop(tab->loop, tab->prune_event); + birdloop_flag(tab->loop, RTF_CLEANUP); /* state change 0->1, 2->3 */ tab->prune_state |= 1; } -static int -rt_fast_prune_ready(rtable_private *tab) +static void +rt_export_used(struct rt_table_exporter *e, const char *who, const char *why) { - return EMPTY_LIST(tab->pending_exports) && EMPTY_LIST(tab->exports) && !tab->imports_up; + struct rtable_private *tab = SKIP_BACK(struct rtable_private, exporter, e); + ASSERT_DIE(RT_IS_LOCKED(tab)); + + rt_trace(tab, D_EVENTS, "Export cleanup requested by %s %s", who, why); + + if (tab->export_used) + return; + + tab->export_used = 1; + birdloop_flag(tab->loop, RTF_CLEANUP); } static void -rt_fast_prune_check(rtable_private *tab) +rt_flag_handler(struct birdloop_flag_handler *fh, u32 flags) { - if (tab->delete && rt_fast_prune_ready(tab)) + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, fh, fh)), tab) { - tab->prune_state |= 1; - ev_send_loop(tab->loop, tab->prune_event); - } -} + ASSERT_DIE(birdloop_inside(tab->loop)); + rt_lock_table(tab); -void -rt_export_used(rtable_private *tab) -{ - if (config->table_debug) - log(L_TRACE "%s: Export cleanup requested", tab->name); + if (flags & RTF_NHU) + rt_next_hop_update(tab); - ev_send_loop(tab->loop, tab->ec_event); -} + if (flags & RTF_EXPORT) + rt_kick_export_settle(tab); -static inline btime -rt_settled_time(rtable_private *tab) -{ - ASSUME(tab->base_settle_time != 0); + if (flags & RTF_CLEANUP) + { + if (tab->export_used) + rt_export_cleanup(tab); + + if (tab->prune_state) + rt_prune_table(tab); + } - btime min_settle_time = tab->rr_count ? tab->config->min_rr_settle_time : tab->config->min_settle_time; - btime max_settle_time = tab->rr_count ? tab->config->max_rr_settle_time : tab->config->max_settle_time; + if (flags & RTF_DELETE) + { + if (tab->hostcache) + rt_stop_export(&tab->hostcache->req, NULL); - DBG("settled time computed from %t %t %t %t as %t / %t, now is %t\n", - tab->name, tab->last_rt_change, min_settle_time, - tab->base_settle_time, max_settle_time, - tab->last_rt_change + min_settle_time, - tab->base_settle_time + max_settle_time, current_time()); + rt_unlock_table(tab); + } - return MIN(tab->last_rt_change + min_settle_time, - tab->base_settle_time + max_settle_time); + rt_unlock_table(tab); + } } static void -rt_settle_timer(timer *t) +rt_prune_timer(timer *t) { - rtable_private *tab = t->data; - ASSERT_DIE(birdloop_inside(tab->loop)); + RT_LOCKED((rtable *) t->data, tab) + if (tab->gc_counter >= tab->config->gc_threshold) + rt_schedule_prune(tab); +} - if (!tab->base_settle_time) +static void +rt_kick_prune_timer(struct rtable_private *tab) +{ + /* Return if prune is already scheduled */ + if (tm_active(tab->prune_timer) || (tab->prune_state & 1)) return; - btime settled_time = rt_settled_time(tab); - if (current_time() < settled_time) + /* Randomize GC period to +/- 50% */ + btime gc_period = tab->config->gc_period; + gc_period = (gc_period / 2) + (random_u32() % (uint) gc_period); + tm_start_in(tab->prune_timer, gc_period, tab->loop); +} + + +static void +rt_flowspec_export_one(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) +{ + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rtable *dst_pub = ln->dst; + ASSUME(rt_is_flow(dst_pub)); + struct rtable_private *dst = RT_LOCK(dst_pub); + + /* No need to inspect it further if recalculation is already scheduled */ + if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY) + || !trie_match_net(dst->flowspec_trie, net)) { - tm_set_in(tab->settle_timer, settled_time, tab->loop); + RT_UNLOCK(dst_pub); + rpe_mark_seen_all(req->hook, first, NULL, NULL); return; } - /* Settled */ - tab->base_settle_time = 0; + /* This net may affect some flowspecs, check the actual change */ + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; + + RPE_WALK(first, rpe, NULL) + { + rpe_mark_seen(req->hook, rpe); + new_best = rpe->new_best; + } + + /* Yes, something has actually changed. Schedule the update. */ + if (o != RTE_VALID_OR_NULL(new_best)) + rt_schedule_nhu(dst); - struct rt_subscription *s; - WALK_LIST(s, tab->subscribers) - ev_send(s->event->list, s->event); + RT_UNLOCK(dst_pub); } static void -rt_kick_settle_timer(rtable_private *tab) +rt_flowspec_dump_req(struct rt_export_request *req) { - tab->base_settle_time = current_time(); - - if (!tab->settle_timer) - tab->settle_timer = tm_new_init(tab->rp, rt_settle_timer, tab, 0, 0); - - if (!tm_active(tab->settle_timer)) - tm_set_in(tab->settle_timer, rt_settled_time(tab), tab->loop); + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + debug(" Flowspec link for table %s (%p)\n", ln->dst->name, req); } -static inline void -rt_schedule_notify(rtable_private *tab) +static void +rt_flowspec_log_state_change(struct rt_export_request *req, u8 state) { - if (EMPTY_LIST(tab->subscribers)) - return; + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rt_trace(ln->dst, D_STATES, "Flowspec link from %s export state changed to %s", + ln->src->name, rt_export_state_name(state)); +} - if (tab->base_settle_time) - return; +static struct rt_flowspec_link * +rt_flowspec_find_link(struct rtable_private *src, rtable *dst) +{ + struct rt_table_export_hook *hook; node *n; + WALK_LIST2(hook, n, src->exporter.e.hooks, h.n) + switch (atomic_load_explicit(&hook->h.export_state, memory_order_acquire)) + { + case TES_HUNGRY: + case TES_FEEDING: + case TES_READY: + if (hook->h.req->export_one == rt_flowspec_export_one) + { + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, hook->h.req); + if (ln->dst == dst) + return ln; + } + } - rt_kick_settle_timer(tab); + return NULL; } void -rt_subscribe(rtable *t, struct rt_subscription *s) +rt_flowspec_link(rtable *src_pub, rtable *dst_pub) { - s->tab = t; - RT_LOCKED(t, tab) + ASSERT(rt_is_ip(src_pub)); + ASSERT(rt_is_flow(dst_pub)); + + int lock_dst = 0; + + birdloop_enter(dst_pub->loop); + + RT_LOCKED(src_pub, src) { - rt_lock_table(tab); - DBG("rt_subscribe(%s)\n", tab->name); - add_tail(&tab->subscribers, &s->n); + struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst_pub); + + if (!ln) + { + pool *p = src->rp; + ln = mb_allocz(p, sizeof(struct rt_flowspec_link)); + ln->src = src_pub; + ln->dst = dst_pub; + ln->req = (struct rt_export_request) { + .name = mb_sprintf(p, "%s.flowspec.notifier", dst_pub->name), + .list = birdloop_event_list(dst_pub->loop), + .trace_routes = src->config->debug, + .dump_req = rt_flowspec_dump_req, + .log_state_change = rt_flowspec_log_state_change, + .export_one = rt_flowspec_export_one, + }; + + rt_table_export_start_locked(src, &ln->req); + + lock_dst = 1; + } + + ln->uc++; } + + if (lock_dst) + rt_lock_table(dst_pub); + + birdloop_leave(dst_pub->loop); +} + +static void +rt_flowspec_link_stopped(struct rt_export_request *req) +{ + struct rt_flowspec_link *ln = SKIP_BACK(struct rt_flowspec_link, req, req); + rtable *dst = ln->dst; + + mb_free(ln); + rt_unlock_table(dst); } void -rt_unsubscribe(struct rt_subscription *s) +rt_flowspec_unlink(rtable *src, rtable *dst) { - RT_LOCKED(s->tab, tab) + birdloop_enter(dst->loop); + + struct rt_flowspec_link *ln; + RT_LOCKED(src, t) { - rem_node(&s->n); - if (EMPTY_LIST(tab->subscribers) && tm_active(tab->settle_timer)) - tm_stop(tab->settle_timer); - rt_unlock_table(tab); + ln = rt_flowspec_find_link(t, dst); + + ASSERT(ln && (ln->uc > 0)); + + if (!--ln->uc) + rt_stop_export(&ln->req, rt_flowspec_link_stopped); } + + birdloop_leave(dst->loop); +} + +static void +rt_flowspec_reset_trie(struct rtable_private *tab) +{ + linpool *lp = tab->flowspec_trie->lp; + int ipv4 = tab->flowspec_trie->ipv4; + + lp_flush(lp); + tab->flowspec_trie = f_new_trie(lp, 0); + tab->flowspec_trie->ipv4 = ipv4; } static void rt_free(resource *_r) { - rtable_private *r = (rtable_private *) _r; + struct rtable_private *r = SKIP_BACK(struct rtable_private, r, _r); + + DOMAIN_FREE(rtable, r->lock); DBG("Deleting routing table %s\n", r->name); ASSERT_DIE(r->use_count == 0); - ASSERT_DIE(r->rt_count == 0); - ASSERT_DIE(!r->cork_active); - ASSERT_DIE(EMPTY_LIST(r->imports)); - ASSERT_DIE(EMPTY_LIST(r->exports)); + + r->config->table = NULL; + rem_node(&r->n); if (r->hostcache) rt_free_hostcache(r); @@ -2182,81 +2796,123 @@ rt_free(resource *_r) fib_free(&r->fib); hmap_free(&r->id_map); rfree(r->rt_event); - rfree(r->settle_timer); mb_free(r); */ } static void -rt_res_dump(resource *_r) +rt_res_dump(resource *_r, unsigned indent) { - rtable_private *r = RT_PRIV((rtable *) _r); + struct rtable_private *r = SKIP_BACK(struct rtable_private, r, _r); + debug("name \"%s\", addr_type=%s, rt_count=%u, use_count=%d\n", r->name, net_label[r->addr_type], r->rt_count, r->use_count); + + char x[32]; + bsprintf(x, "%%%dspending export %%p\n", indent + 2); + + node *n; + WALK_LIST(n, r->exporter.pending) + debug(x, "", n); } static struct resclass rt_class = { .name = "Routing table", - .size = sizeof(rtable_private), + .size = sizeof(rtable), .free = rt_free, .dump = rt_res_dump, .lookup = NULL, .memsize = NULL, }; +static const struct rt_exporter_class rt_table_exporter_class = { + .start = rt_table_export_start, + .stop = rt_table_export_stop, + .done = rt_table_export_done, +}; + +void +rt_exporter_init(struct rt_exporter *e) +{ + init_list(&e->hooks); +} + +static struct idm rtable_idm; +uint rtable_max_id = 0; + rtable * rt_setup(pool *pp, struct rtable_config *cf) { - int ns = strlen("Routing table ") + strlen(cf->name) + 1; - void *nb = mb_alloc(pp, ns); - ASSERT_DIE(ns - 1 == bsnprintf(nb, ns, "Routing table %s", cf->name)); + ASSERT_DIE(birdloop_inside(&main_birdloop)); - struct birdloop *l = birdloop_new(pp, DOMAIN_ORDER(rtable), nb); - pool *p = birdloop_pool(l); + pool *p = rp_newf(pp, "Routing table %s", cf->name); - birdloop_enter(l); - - rtable_private *t = ralloc(p, &rt_class); + struct rtable_private *t = ralloc(p, &rt_class); t->rp = p; - t->loop = l; t->rte_slab = sl_new(p, sizeof(struct rte_storage)); t->name = cf->name; t->config = cf; t->addr_type = cf->addr_type; + t->id = idm_alloc(&rtable_idm); + if (t->id >= rtable_max_id) + rtable_max_id = t->id + 1; + + t->lock = DOMAIN_NEW(rtable, t->name); fib_init(&t->fib, p, t->addr_type, sizeof(net), OFFSETOF(net, n), 0, NULL); + if (cf->trie_used) + { + t->trie = f_new_trie(lp_new_default(p), 0); + t->trie->ipv4 = net_val_match(t->addr_type, NB_IP4 | NB_VPN4 | NB_ROA4); + + t->fib.init = net_init_with_trie; + } + init_list(&t->imports); - init_list(&t->exports); hmap_init(&t->id_map, p, 1024); hmap_set(&t->id_map, 0); - init_list(&t->pending_exports); - init_list(&t->subscribers); + t->fh = (struct birdloop_flag_handler) { .hook = rt_flag_handler, }; + t->nhu_uncork_event = ev_new_init(p, rt_nhu_uncork, t); + t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0); + t->last_rt_change = t->gc_time = current_time(); + + t->export_settle = SETTLE_INIT(&cf->export_settle, rt_announce_exports, NULL); - t->announce_event = ev_new_init(p, rt_announce_exports, t); - t->ec_event = ev_new_init(p, rt_export_cleanup, t); - t->prune_event = ev_new_init(p, rt_prune_table, t); - t->hcu_event = ev_new_init(p, rt_update_hostcache, t); - t->nhu_event = ev_new_init(p, rt_next_hop_update, t); + t->exporter = (struct rt_table_exporter) { + .e = { + .class = &rt_table_exporter_class, + .addr_type = t->addr_type, + .rp = t->rp, + }, + .next_seq = 1, + }; - t->nhu_event->cork = &rt_cork; - t->prune_event->cork = &rt_cork; + rt_exporter_init(&t->exporter.e); - t->last_rt_change = t->gc_time = current_time(); - t->next_export_seq = 1; + init_list(&t->exporter.pending); + + t->cork_threshold = cf->cork_threshold; t->rl_pipe = (struct tbf) TBF_DEFAULT_LOG_LIMITS; - t->nhu_lp = lp_new_default(p); + if (rt_is_flow(RT_PUB(t))) + { + t->flowspec_trie = f_new_trie(lp_new_default(p), 0); + t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4); + } - mb_move(nb, p); - birdloop_leave(l); + /* Start the service thread */ + t->loop = birdloop_new(p, DOMAIN_ORDER(service), mb_sprintf(p, "Routing table %s", t->name), 0); + birdloop_enter(t->loop); + birdloop_flag_set_handler(t->loop, &t->fh); + birdloop_leave(t->loop); - return (rtable *) t; + return RT_PUB(t); } /** @@ -2269,11 +2925,15 @@ void rt_init(void) { rta_init(); - rt_table_pool = rp_new(&root_pool, &main_birdloop, "Routing tables"); + rt_table_pool = rp_new(&root_pool, "Routing tables"); init_list(&routing_tables); - ev_init_cork(&rt_cork, "Route Table Cork"); + init_list(&deleted_routing_tables); + ev_init_list(&rt_cork.queue, &main_birdloop, "Route cork release"); + rt_cork.run = (event) { .hook = rt_cork_release_hook }; + idm_init(&rtable_idm, rt_table_pool, 256); } + /** * rt_prune_table - prune a routing table * @@ -2289,18 +2949,15 @@ rt_init(void) * iteration. */ static void -rt_prune_table(void *data) +rt_prune_table(struct rtable_private *tab) { - rtable_private *tab = data; - ASSERT_DIE(birdloop_inside(tab->loop)); - struct fib_iterator *fit = &tab->prune_fit; - int limit = tab->delete ? 16384 : 512; + int limit = 2000; struct rt_import_hook *ih; node *n, *x; - DBG("Pruning route table %s\n", tab->name); + rt_trace(tab, D_STATES, "Pruning"); #ifdef DEBUGGING fib_check(&tab->fib); #endif @@ -2308,11 +2965,6 @@ rt_prune_table(void *data) if (tab->prune_state == 0) return; - if (tab->delete && !rt_fast_prune_ready(tab)) - return; - - rt_lock_table(tab); - if (tab->prune_state == 1) { /* Mark channels to flush */ @@ -2329,50 +2981,36 @@ rt_prune_table(void *data) FIB_ITERATE_INIT(fit, &tab->fib); tab->prune_state = 2; + + tab->gc_counter = 0; + tab->gc_time = current_time(); + + if (tab->prune_trie) + { + /* Init prefix trie pruning */ + tab->trie_new = f_new_trie(lp_new_default(tab->rp), 0); + tab->trie_new->ipv4 = tab->trie->ipv4; + } } again: FIB_ITERATE_START(&tab->fib, fit, net, n) { - if (tab->delete) + rescan: + if (limit <= 0) { - ASSERT_DIE(!n->first); - - for (struct rte_storage *e = n->routes, *next; e; e = next) - { - next = e->next; - - struct rt_import_request *req = e->rte.sender->req; - if (req->preimport) - req->preimport(req, NULL, &e->rte); - - tab->rt_count--; - hmap_clear(&tab->id_map, e->rte.id); - rte_free(e, tab); - limit--; - } - - n->routes = NULL; + FIB_ITERATE_PUT(fit); + birdloop_flag(tab->loop, RTF_CLEANUP); + return; } - else - rescan: + for (struct rte_storage *e=n->routes; e; e=e->next) { struct rt_import_hook *s = e->rte.sender; - if ((s->import_state == TIS_FLUSHING) || (e->rte.stale_cycle < s->stale_valid) || (e->rte.stale_cycle > s->stale_set)) { - if (limit <= 0) - { - FIB_ITERATE_PUT(fit); - ev_send_loop(tab->loop, tab->prune_event); - ev_send_loop(tab->loop, tab->announce_event); - rt_unlock_table(tab); - return; - } - rte_recalculate(tab, e->rte.sender, n, NULL, e->rte.src); limit--; @@ -2386,66 +3024,110 @@ again: fib_delete(&tab->fib, n); goto again; } + + if (tab->trie_new) + { + trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen); + limit--; + } } FIB_ITERATE_END; + rt_trace(tab, D_EVENTS, "Prune done, scheduling export timer"); + rt_kick_export_settle(tab); + #ifdef DEBUGGING fib_check(&tab->fib); #endif - tab->gc_counter = 0; - tab->gc_time = current_time(); - /* state change 2->0, 3->1 */ if (tab->prune_state &= 1) - ev_send_loop(tab->loop, tab->prune_event); + birdloop_flag(tab->loop, RTF_CLEANUP); - uint flushed_channels = 0; + if (tab->trie_new) + { + /* Finish prefix trie pruning */ + + if (!tab->trie_lock_count) + { + rfree(tab->trie->lp); + } + else + { + ASSERT(!tab->trie_old); + tab->trie_old = tab->trie; + tab->trie_old_lock_count = tab->trie_lock_count; + tab->trie_lock_count = 0; + } + + tab->trie = tab->trie_new; + tab->trie_new = NULL; + tab->prune_trie = 0; + } + else + { + /* Schedule prefix trie pruning */ + if (tab->trie && !tab->trie_old && (tab->trie->prefix_count > (2 * tab->fib.entries))) + { + /* state change 0->1, 2->3 */ + tab->prune_state |= 1; + tab->prune_trie = 1; + } + } /* Close flushed channels */ WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) if (ih->import_state == TIS_FLUSHING) { - ih->flush_seq = tab->next_export_seq; + DBG("flushing %s %s rr %u", ih->req->name, tab->name, tab->rr_counter); + ih->flush_seq = tab->exporter.next_seq; rt_set_import_state(ih, TIS_WAITING); - flushed_channels++; + tab->rr_counter--; + tab->wait_counter++; } else if (ih->stale_pruning != ih->stale_pruned) { + DBG("pruning %s %s rr %u set %u valid %u pruning %u pruned %u", ih->req->name, tab->name, tab->rr_counter, ih->stale_set, ih->stale_valid, ih->stale_pruning, ih->stale_pruned); + tab->rr_counter -= (ih->stale_pruning - ih->stale_pruned); ih->stale_pruned = ih->stale_pruning; - if (ih->req->trace_routes & D_STATES) log(L_TRACE "%s: table prune after refresh end [%u]", ih->req->name, ih->stale_pruned); } /* In some cases, we may want to directly proceed to export cleanup */ - if (EMPTY_LIST(tab->exports) && flushed_channels) + if (tab->wait_counter && (EMPTY_LIST(tab->exporter.e.hooks) || !tab->exporter.first)) rt_export_cleanup(tab); - - ev_send_loop(tab->loop, tab->announce_event); - rt_unlock_table(tab); } static void -rt_export_cleanup(void *data) +rt_export_cleanup(struct rtable_private *tab) { - rtable_private *tab = data; - ASSERT_DIE(birdloop_inside(tab->loop)); + tab->export_used = 0; u64 min_seq = ~((u64) 0); struct rt_pending_export *last_export_to_free = NULL; - struct rt_pending_export *first_export = tab->first_export; + struct rt_pending_export *first = tab->exporter.first; + int want_prune = 0; - struct rt_export_hook *eh; + struct rt_table_export_hook *eh; node *n; - WALK_LIST2(eh, n, tab->exports, n) + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) { - switch (atomic_load_explicit(&eh->export_state, memory_order_acquire)) + switch (atomic_load_explicit(&eh->h.export_state, memory_order_acquire)) { - case TES_DOWN: - case TES_HUNGRY: + /* Export cleanup while feeding isn't implemented */ + case TES_FEEDING: + goto done; + + /* States not interfering with export cleanup */ + case TES_DOWN: /* This should not happen at all */ + log(L_WARN "%s: Export cleanup found hook %s in explicit state TES_DOWN", tab->name, eh->h.req->name); + /* fall through */ + case TES_HUNGRY: /* Feeding waiting for uncork */ + case TES_STOP: /* No more export will happen on this hook */ continue; + /* Regular export */ case TES_READY: { struct rt_pending_export *last = atomic_load_explicit(&eh->last_export, memory_order_acquire); @@ -2462,23 +3144,20 @@ rt_export_cleanup(void *data) } default: - /* It's only safe to cleanup when the export state is idle or regular. No feeding or stopping allowed. */ - goto done; + bug("%s: Strange export state of hook %s: %d", tab->name, eh->h.req->name, atomic_load_explicit(&eh->h.export_state, memory_order_relaxed)); } } - tab->first_export = last_export_to_free ? rt_next_export_fast(last_export_to_free) : NULL; + tab->exporter.first = last_export_to_free ? rt_next_export_fast(last_export_to_free) : NULL; - if (config->table_debug) - log(L_TRACE "%s: Export cleanup, old first_export seq %lu, new %lu, min_seq %ld", - tab->name, - first_export ? first_export->seq : 0, - tab->first_export ? tab->first_export->seq : 0, + rt_trace(tab, D_STATES, "Export cleanup, old exporter.first seq %lu, new %lu, min_seq %ld", + first ? first->seq : 0, + tab->exporter.first ? tab->exporter.first->seq : 0, min_seq); - WALK_LIST2(eh, n, tab->exports, n) + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) { - if (atomic_load_explicit(&eh->export_state, memory_order_acquire) != TES_READY) + if (atomic_load_explicit(&eh->h.export_state, memory_order_acquire) != TES_READY) continue; struct rt_pending_export *last = atomic_load_explicit(&eh->last_export, memory_order_acquire); @@ -2494,45 +3173,47 @@ rt_export_cleanup(void *data) } } - while (first_export && (first_export->seq <= min_seq)) + while (first && (first->seq <= min_seq)) { - ASSERT_DIE(first_export->new || first_export->old); + ASSERT_DIE(first->new || first->old); - const net_addr *n = first_export->new ? - first_export->new->rte.net : - first_export->old->rte.net; + const net_addr *n = first->new ? + first->new->rte.net : + first->old->rte.net; net *net = SKIP_BACK(struct network, n.addr, (net_addr (*)[0]) n); - ASSERT_DIE(net->first == first_export); - - if (first_export == net->last) + ASSERT_DIE(net->first == first); + + if (first == net->last) /* The only export here */ net->last = net->first = NULL; else /* First is now the next one */ - net->first = atomic_load_explicit(&first_export->next, memory_order_relaxed); + net->first = atomic_load_explicit(&first->next, memory_order_relaxed); + + want_prune += !net->routes && !net->first; /* For now, the old route may be finally freed */ - if (first_export->old) + if (first->old) { - rt_rte_trace_in(D_ROUTES, first_export->old->rte.sender->req, &first_export->old->rte, "freed"); - hmap_clear(&tab->id_map, first_export->old->rte.id); - rte_free(first_export->old, tab); + rt_rte_trace_in(D_ROUTES, first->old->rte.sender->req, &first->old->rte, "freed"); + hmap_clear(&tab->id_map, first->old->rte.id); + rte_free(first->old); } #ifdef LOCAL_DEBUG - memset(first_export, 0xbd, sizeof(struct rt_pending_export)); + memset(first, 0xbd, sizeof(struct rt_pending_export)); #endif - struct rt_export_block *reb = HEAD(tab->pending_exports); - ASSERT_DIE(reb == PAGE_HEAD(first_export)); + struct rt_export_block *reb = HEAD(tab->exporter.pending); + ASSERT_DIE(reb == PAGE_HEAD(first)); - u32 pos = (first_export - &reb->export[0]); + u32 pos = (first - &reb->export[0]); u32 end = atomic_load_explicit(&reb->end, memory_order_relaxed); ASSERT_DIE(pos < end); struct rt_pending_export *next = NULL; - + if (++pos < end) next = &reb->export[pos]; else @@ -2545,66 +3226,160 @@ rt_export_cleanup(void *data) free_page(reb); - if (EMPTY_LIST(tab->pending_exports)) + if (EMPTY_LIST(tab->exporter.pending)) { - if (config->table_debug) - log(L_TRACE "%s: Resetting export seq", tab->name); + rt_trace(tab, D_EVENTS, "Resetting export seq"); node *n; - WALK_LIST2(eh, n, tab->exports, n) + WALK_LIST2(eh, n, tab->exporter.e.hooks, h.n) { - if (atomic_load_explicit(&eh->export_state, memory_order_acquire) != TES_READY) + if (atomic_load_explicit(&eh->h.export_state, memory_order_acquire) != TES_READY) continue; ASSERT_DIE(atomic_load_explicit(&eh->last_export, memory_order_acquire) == NULL); - bmap_reset(&eh->seq_map, 1024); + bmap_reset(&eh->h.seq_map, 16); } - tab->next_export_seq = 1; + tab->exporter.next_seq = 1; } else { - reb = HEAD(tab->pending_exports); + reb = HEAD(tab->exporter.pending); next = &reb->export[0]; } } - first_export = next; + first = next; } + rt_check_cork_low(tab); + done:; struct rt_import_hook *ih; node *x; - WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) - if (ih->import_state == TIS_WAITING) - if (!first_export || (first_export->seq >= ih->flush_seq)) - { - ih->import_state = TIS_CLEARED; - ev_send(ih->req->list, ih->export_announce_event); - } + if (tab->wait_counter) + WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) + if (ih->import_state == TIS_WAITING) + if (!first || (first->seq >= ih->flush_seq)) + { + ih->import_state = TIS_CLEARED; + tab->wait_counter--; + ev_send(ih->req->list, &ih->announce_event); + } + + if ((tab->gc_counter += want_prune) >= tab->config->gc_threshold) + rt_kick_prune_timer(tab); + + if (tab->export_used) + birdloop_flag(tab->loop, RTF_CLEANUP); - if (EMPTY_LIST(tab->pending_exports) && ev_active(tab->announce_event)) - ev_postpone(tab->announce_event); + if (EMPTY_LIST(tab->exporter.pending)) + settle_cancel(&tab->export_settle); +} + +static void +rt_cork_release_hook(void *data UNUSED) +{ + do synchronize_rcu(); + while ( + !atomic_load_explicit(&rt_cork.active, memory_order_acquire) && + ev_run_list(&rt_cork.queue) + ); +} - /* If reduced to at most one export block pending */ - if (tab->cork_active && - ((!tab->first_export) || (tab->first_export->seq + 128 > tab->next_export_seq))) +/** + * rt_lock_trie - lock a prefix trie of a routing table + * @tab: routing table with prefix trie to be locked + * + * The prune loop may rebuild the prefix trie and invalidate f_trie_walk_state + * structures. Therefore, asynchronous walks should lock the prefix trie using + * this function. That allows the prune loop to rebuild the trie, but postpones + * its freeing until all walks are done (unlocked by rt_unlock_trie()). + * + * Return a current trie that will be locked, the value should be passed back to + * rt_unlock_trie() for unlocking. + * + */ +struct f_trie * +rt_lock_trie(struct rtable_private *tab) +{ + ASSERT(tab->trie); + + tab->trie_lock_count++; + return tab->trie; +} + +/** + * rt_unlock_trie - unlock a prefix trie of a routing table + * @tab: routing table with prefix trie to be locked + * @trie: value returned by matching rt_lock_trie() + * + * Done for trie locked by rt_lock_trie() after walk over the trie is done. + * It may free the trie and schedule next trie pruning. + */ +void +rt_unlock_trie(struct rtable_private *tab, struct f_trie *trie) +{ + ASSERT(trie); + + if (trie == tab->trie) { - tab->cork_active = 0; - ev_uncork(&rt_cork); - if (config->table_debug) - log(L_TRACE "%s: cork released", tab->name); + /* Unlock the current prefix trie */ + ASSERT(tab->trie_lock_count); + tab->trie_lock_count--; } + else if (trie == tab->trie_old) + { + /* Unlock the old prefix trie */ + ASSERT(tab->trie_old_lock_count); + tab->trie_old_lock_count--; + + /* Free old prefix trie that is no longer needed */ + if (!tab->trie_old_lock_count) + { + rfree(tab->trie_old->lp); + tab->trie_old = NULL; - rt_fast_prune_check(tab); + /* Kick prefix trie pruning that was postponed */ + if (tab->trie && (tab->trie->prefix_count > (2 * tab->fib.entries))) + { + tab->prune_trie = 1; + rt_kick_prune_timer(tab); + } + } + } + else + log(L_BUG "Invalid arg to rt_unlock_trie()"); } + void rt_preconfig(struct config *c) { init_list(&c->tables); - rt_new_table(cf_get_symbol("master4"), NET_IP4); - rt_new_table(cf_get_symbol("master6"), NET_IP6); + c->def_tables[NET_IP4] = cf_define_symbol(cf_get_symbol("master4"), SYM_TABLE, table, NULL); + c->def_tables[NET_IP6] = cf_define_symbol(cf_get_symbol("master6"), SYM_TABLE, table, NULL); +} + +void +rt_postconfig(struct config *c) +{ + uint num_tables = list_length(&c->tables); + btime def_gc_period = 400 MS * num_tables; + def_gc_period = MAX(def_gc_period, 10 S); + def_gc_period = MIN(def_gc_period, 600 S); + + struct rtable_config *rc; + WALK_LIST(rc, c->tables) + if (rc->gc_period == (uint) -1) + rc->gc_period = (uint) def_gc_period; + + for (uint net_type = 0; net_type < NET_MAX; net_type++) + if (c->def_tables[net_type] && !c->def_tables[net_type]->table) + { + c->def_tables[net_type]->class = SYM_VOID; + c->def_tables[net_type] = NULL; + } } @@ -2613,180 +3388,453 @@ rt_preconfig(struct config *c) * triggered by rt_schedule_nhu(). */ -static inline int -rta_next_hop_outdated(rta *a) +void +ea_set_hostentry(ea_list **to, rtable *dep, rtable *src, ip_addr gw, ip_addr ll, u32 lnum, u32 labels[lnum]) { - struct hostentry *he = a->hostentry; - - if (!he) - return 0; - - if (!he->src) - return a->dest != RTD_UNREACHABLE; - - return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) || - (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh)); + struct { + struct adata ad; + struct hostentry *he; + u32 labels[0]; + } *head = (void *) tmp_alloc_adata(sizeof *head + sizeof(u32) * lnum - sizeof(struct adata)); + + RT_LOCKED(src, tab) + head->he = rt_get_hostentry(tab, gw, ll, dep); + memcpy(head->labels, labels, lnum * sizeof(u32)); + + ea_set_attr(to, EA_LITERAL_DIRECT_ADATA( + &ea_gen_hostentry, 0, &head->ad)); } -void -rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls, linpool *lp) + +static void +rta_apply_hostentry(ea_list **to, struct hostentry_adata *head) { - a->hostentry = he; - a->dest = he->dest; - a->igp_metric = he->igp_metric; + struct hostentry *he = head->he; + u32 *labels = head->labels; + u32 lnum = (u32 *) (head->ad.data + head->ad.length) - labels; + + ea_set_attr_u32(to, &ea_gen_igp_metric, 0, he->igp_metric); - if (a->dest != RTD_UNICAST) + if (!he->src) { - /* No nexthop */ -no_nexthop: - a->nh = (struct nexthop) {}; - if (mls) - { /* Store the label stack for later changes */ - a->nh.labels_orig = a->nh.labels = mls->len; - memcpy(a->nh.label, mls->stack, mls->len * sizeof(u32)); - } + ea_set_dest(to, 0, RTD_UNREACHABLE); return; } - if (((!mls) || (!mls->len)) && he->nexthop_linkable) + eattr *he_nh_ea = ea_find(he->src, &ea_gen_nexthop); + ASSERT_DIE(he_nh_ea); + + struct nexthop_adata *nhad = (struct nexthop_adata *) he_nh_ea->u.ptr; + int idest = nhea_dest(he_nh_ea); + + if ((idest != RTD_UNICAST) || + !lnum && he->nexthop_linkable) { /* Just link the nexthop chain, no label append happens. */ - memcpy(&(a->nh), &(he->src->nh), nexthop_size(&(he->src->nh))); + ea_copy_attr(to, he->src, &ea_gen_nexthop); return; } - struct nexthop *nhp = NULL, *nhr = NULL; - int skip_nexthop = 0; + uint total_size = OFFSETOF(struct nexthop_adata, nh); - for (struct nexthop *nh = &(he->src->nh); nh; nh = nh->next) + NEXTHOP_WALK(nh, nhad) { - if (skip_nexthop) - skip_nexthop--; - else + if (nh->labels + lnum > MPLS_MAX_LABEL_STACK) { - nhr = nhp; - nhp = (nhp ? (nhp->next = lp_alloc(lp, NEXTHOP_MAX_SIZE)) : &(a->nh)); + log(L_WARN "Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)", + nh->labels, lnum, nh->labels + lnum, MPLS_MAX_LABEL_STACK); + continue; } - memset(nhp, 0, NEXTHOP_MAX_SIZE); - nhp->iface = nh->iface; - nhp->weight = nh->weight; + total_size += NEXTHOP_SIZE_CNT(nh->labels + lnum); + } - if (mls) - { - nhp->labels = nh->labels + mls->len; - nhp->labels_orig = mls->len; - if (nhp->labels <= MPLS_MAX_LABEL_STACK) - { - memcpy(nhp->label, nh->label, nh->labels * sizeof(u32)); /* First the hostentry labels */ - memcpy(&(nhp->label[nh->labels]), mls->stack, mls->len * sizeof(u32)); /* Then the bottom labels */ - } - else - { - log(L_WARN "Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)", - nh->labels, mls->len, nhp->labels, MPLS_MAX_LABEL_STACK); - skip_nexthop++; - continue; - } - } - else if (nh->labels) + if (total_size == OFFSETOF(struct nexthop_adata, nh)) + { + log(L_WARN "No valid nexthop remaining, setting route unreachable"); + + struct nexthop_adata nha = { + .ad.length = NEXTHOP_DEST_SIZE, + .dest = RTD_UNREACHABLE, + }; + + ea_set_attr_data(to, &ea_gen_nexthop, 0, &nha.ad.data, nha.ad.length); + return; + } + + struct nexthop_adata *new = (struct nexthop_adata *) tmp_alloc_adata(total_size); + struct nexthop *dest = &new->nh; + + NEXTHOP_WALK(nh, nhad) + { + if (nh->labels + lnum > MPLS_MAX_LABEL_STACK) + continue; + + memcpy(dest, nh, NEXTHOP_SIZE(nh)); + if (lnum) { - nhp->labels = nh->labels; - nhp->labels_orig = 0; - memcpy(nhp->label, nh->label, nh->labels * sizeof(u32)); + memcpy(&(dest->label[dest->labels]), labels, lnum * sizeof labels[0]); + dest->labels += lnum; } if (ipa_nonzero(nh->gw)) - { - nhp->gw = nh->gw; /* Router nexthop */ - nhp->flags |= (nh->flags & RNF_ONLINK); - } + /* Router nexthop */ + dest->flags = (dest->flags & RNF_ONLINK); else if (!(nh->iface->flags & IF_MULTIACCESS) || (nh->iface->flags & IF_LOOPBACK)) - nhp->gw = IPA_NONE; /* PtP link - no need for nexthop */ + dest->gw = IPA_NONE; /* PtP link - no need for nexthop */ else if (ipa_nonzero(he->link)) - nhp->gw = he->link; /* Device nexthop with link-local address known */ + dest->gw = he->link; /* Device nexthop with link-local address known */ else - nhp->gw = he->addr; /* Device nexthop with link-local address unknown */ + dest->gw = he->addr; /* Device nexthop with link-local address unknown */ + + dest = NEXTHOP_NEXT(dest); } - if (skip_nexthop) - if (nhr) - nhr->next = NULL; - else + /* Fix final length */ + new->ad.length = (void *) dest - (void *) new->ad.data; + ea_set_attr(to, EA_LITERAL_DIRECT_ADATA( + &ea_gen_nexthop, 0, &new->ad)); +} + +static inline struct hostentry_adata * +rta_next_hop_outdated(ea_list *a) +{ + /* First retrieve the hostentry */ + eattr *heea = ea_find(a, &ea_gen_hostentry); + if (!heea) + return NULL; + + struct hostentry_adata *head = (struct hostentry_adata *) heea->u.ptr; + + /* If no nexthop is present, we have to create one */ + eattr *a_nh_ea = ea_find(a, &ea_gen_nexthop); + if (!a_nh_ea) + return head; + + struct nexthop_adata *nhad = (struct nexthop_adata *) a_nh_ea->u.ptr; + + /* Shortcut for unresolvable hostentry */ + if (!head->he->src) + return NEXTHOP_IS_REACHABLE(nhad) ? head : NULL; + + /* Comparing our nexthop with the hostentry nexthop */ + eattr *he_nh_ea = ea_find(head->he->src, &ea_gen_nexthop); + + return ( + (ea_get_int(a, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN) != head->he->igp_metric) || + (!head->he->nexthop_linkable) || + (!he_nh_ea != !a_nh_ea) || + (he_nh_ea && a_nh_ea && !adata_same(he_nh_ea->u.ptr, a_nh_ea->u.ptr))) + ? head : NULL; +} + +static inline int +rt_next_hop_update_rte(rte *old, rte *new) +{ + struct hostentry_adata *head = rta_next_hop_outdated(old->attrs); + if (!head) + return 0; + + *new = *old; + rta_apply_hostentry(&new->attrs, head); + return 1; +} + +static inline void +rt_next_hop_resolve_rte(rte *r) +{ + eattr *heea = ea_find(r->attrs, &ea_gen_hostentry); + if (!heea) + return; + + struct hostentry_adata *head = (struct hostentry_adata *) heea->u.ptr; + + rta_apply_hostentry(&r->attrs, head); +} + +#ifdef CONFIG_BGP + +static inline int +net_flow_has_dst_prefix(const net_addr *n) +{ + ASSUME(net_is_flow(n)); + + if (n->pxlen) + return 1; + + if (n->type == NET_FLOW4) + { + const net_addr_flow4 *n4 = (void *) n; + return (n4->length > sizeof(net_addr_flow4)) && (n4->data[0] == FLOW_TYPE_DST_PREFIX); + } + else + { + const net_addr_flow6 *n6 = (void *) n; + return (n6->length > sizeof(net_addr_flow6)) && (n6->data[0] == FLOW_TYPE_DST_PREFIX); + } +} + +static inline int +rta_as_path_is_empty(ea_list *a) +{ + eattr *e = ea_find(a, "bgp_path"); + return !e || (as_path_getlen(e->u.ptr) == 0); +} + +static inline u32 +rta_get_first_asn(ea_list *a) +{ + eattr *e = ea_find(a, "bgp_path"); + u32 asn; + + return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0; +} + +static inline enum flowspec_valid +rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, ea_list *a, int interior) +{ + ASSERT(rt_is_ip(tab_ip)); + ASSERT(rt_is_flow(tab_flow)); + + /* RFC 8955 6. a) Flowspec has defined dst prefix */ + if (!net_flow_has_dst_prefix(n)) + return FLOWSPEC_INVALID; + + /* RFC 9117 4.1. Accept AS_PATH is empty (fr */ + if (interior && rta_as_path_is_empty(a)) + return FLOWSPEC_VALID; + + + /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */ + + /* Find flowspec dst prefix */ + net_addr dst; + if (n->type == NET_FLOW4) + net_fill_ip4(&dst, net4_prefix(n), net4_pxlen(n)); + else + net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n)); + + rte rb = {}; + net_addr_union nau; + RT_LOCKED(tab_ip, tip) + { + ASSERT(tip->trie); + /* Find best-match BGP unicast route for flowspec dst prefix */ + net *nb = net_route(tip, &dst); + if (nb) + { + rb = RTE_COPY_VALID(nb->routes); + rta_clone(rb.attrs); + net_copy(&nau.n, nb->n.addr); + rb.net = &nau.n; + } + } + + /* Register prefix to trie for tracking further changes */ + int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH; + RT_LOCKED(tab_flow, tfl) + trie_add_prefix(tfl->flowspec_trie, &dst, (rb.net ? rb.net->pxlen : 0), max_pxlen); + + /* No best-match BGP route -> no flowspec */ + if (!rb.attrs || (rt_get_source_attr(&rb) != RTS_BGP)) + return FLOWSPEC_INVALID; + + /* Find ORIGINATOR_ID values */ + u32 orig_a = ea_get_int(a, "bgp_originator_id", 0); + u32 orig_b = ea_get_int(rb.attrs, "bgp_originator_id", 0); + + /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */ + if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal( + ea_get_ip(a, &ea_gen_from, IPA_NONE), + ea_get_ip(rb.attrs, &ea_gen_from, IPA_NONE) + ))) + return FLOWSPEC_INVALID; + + + /* Find ASN of the best-match route, for use in next checks */ + u32 asn_b = rta_get_first_asn(rb.attrs); + if (!asn_b) + return FLOWSPEC_INVALID; + + /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */ + if (!interior && (rta_get_first_asn(a) != asn_b)) + return FLOWSPEC_INVALID; + + /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */ + RT_LOCKED(tab_ip, tip) + { + TRIE_WALK(tip->trie, subnet, &dst) { - a->dest = RTD_UNREACHABLE; - log(L_WARN "No valid nexthop remaining, setting route unreachable"); - goto no_nexthop; + net *nc = net_find_valid(tip, &subnet); + if (!nc) + continue; + + const rte *rc = &nc->routes->rte; + if (rt_get_source_attr(rc) != RTS_BGP) + RT_RETURN(tip, FLOWSPEC_INVALID); + + if (rta_get_first_asn(rc->attrs) != asn_b) + RT_RETURN(tip, FLOWSPEC_INVALID); } + TRIE_WALK_END; + } + + return FLOWSPEC_VALID; } -static inline struct rte_storage * -rt_next_hop_update_rte(rtable_private *tab, net *n, rte *old) +#endif /* CONFIG_BGP */ + +static int +rt_flowspec_update_rte(rtable *tab, rte *r, rte *new) { - rta *a = alloca(RTA_MAX_SIZE); - memcpy(a, old->attrs, rta_size(old->attrs)); +#ifdef CONFIG_BGP + if (r->generation || (rt_get_source_attr(r) != RTS_BGP)) + return 0; - mpls_label_stack mls = { .len = a->nh.labels_orig }; - memcpy(mls.stack, &a->nh.label[a->nh.labels - mls.len], mls.len * sizeof(u32)); + struct bgp_channel *bc = (struct bgp_channel *) SKIP_BACK(struct channel, in_req, r->sender->req); + if (!bc->base_table) + return 0; + + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, p, bc->c.proto); - rta_apply_hostentry(a, old->attrs->hostentry, &mls, tab->nhu_lp); - a->cached = 0; + enum flowspec_valid old = rt_get_flowspec_valid(r), + valid = rt_flowspec_check(bc->base_table, tab, r->net, r->attrs, p->is_interior); - rte e0 = *old; - e0.attrs = rta_lookup(a); + if (old == valid) + return 0; - return rte_store(&e0, n, tab); + *new = *r; + ea_set_attr_u32(&new->attrs, &ea_gen_flowspec_valid, 0, valid); + return 1; +#else + return 0; +#endif +} + +static inline void +rt_flowspec_resolve_rte(rte *r, struct channel *c) +{ +#ifdef CONFIG_BGP + enum flowspec_valid valid, old = rt_get_flowspec_valid(r); + struct bgp_channel *bc = (struct bgp_channel *) c; + + if ( (rt_get_source_attr(r) == RTS_BGP) + && (c->channel == &channel_bgp) + && (bc->base_table)) + { + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, p, bc->c.proto); + valid = rt_flowspec_check( + bc->base_table, + c->in_req.hook->table, + r->net, r->attrs, p->is_interior); + } + else + valid = FLOWSPEC_UNKNOWN; + + if (valid == old) + return; + + if (valid == FLOWSPEC_UNKNOWN) + ea_unset_attr(&r->attrs, 0, &ea_gen_flowspec_valid); + else + ea_set_attr_u32(&r->attrs, &ea_gen_flowspec_valid, 0, valid); +#endif } static inline int -rt_next_hop_update_net(rtable_private *tab, net *n) +rt_next_hop_update_net(struct rtable_private *tab, net *n) { - struct rte_storage *new; - int count = 0; + uint count = 0; + int is_flow = net_is_flow(n->n.addr); struct rte_storage *old_best = n->routes; if (!old_best) return 0; for (struct rte_storage *e, **k = &n->routes; e = *k; k = &e->next) - if (rta_next_hop_outdated(e->rte.attrs)) - count++; + count++; if (!count) return 0; struct rte_multiupdate { - struct rte_storage *old, *new; - } *updates = alloca(sizeof(struct rte_multiupdate) * count); + struct rte_storage *old, *new_stored; + rte new; + } *updates = tmp_allocz(sizeof(struct rte_multiupdate) * (count+1)); + + struct rt_pending_export *last_pending = n->last; - int pos = 0; + uint pos = 0; for (struct rte_storage *e, **k = &n->routes; e = *k; k = &e->next) - if (rta_next_hop_outdated(e->rte.attrs)) - { - struct rte_storage *new = rt_next_hop_update_rte(tab, n, &e->rte); + updates[pos++].old = e; + + /* This is an exceptional place where table can be unlocked while keeping its data: + * the reason why this is safe is that NHU must be always run from the same + * thread as cleanup routines, therefore the only real problem may arise when + * some importer does a change on this particular net (destination) while NHU + * is being computed. Statistically, this should almost never happen. In such + * case, we just drop all the computed changes and do it once again. + * */ + RT_UNLOCK(tab); - /* Call a pre-comparison hook */ - /* Not really an efficient way to compute this */ - if (e->rte.src->owner->rte_recalculate) - e->rte.src->owner->rte_recalculate(tab, n, &new->rte, &e->rte, &old_best->rte); + uint mod = 0; + if (is_flow) + for (uint i = 0; i < pos; i++) + mod += rt_flowspec_update_rte(RT_PUB(tab), &updates[i].old->rte, &updates[i].new); - updates[pos++] = (struct rte_multiupdate) { - .old = e, - .new = new, - }; + else + for (uint i = 0; i < pos; i++) + mod += rt_next_hop_update_rte(&updates[i].old->rte, &updates[i].new); - /* Replace the route in the list */ - new->next = e->next; - *k = e = new; + RT_LOCK(RT_PUB(tab)); + + if (!mod) + return 0; + /* Something has changed inbetween, retry NHU. */ + if (last_pending != n->last) + return rt_next_hop_update_net(tab, n); + + /* Now we reconstruct the original linked list */ + struct rte_storage **nptr = &n->routes; + for (uint i = 0; i < pos; i++) + { + updates[i].old->next = NULL; + + struct rte_storage *put; + if (updates[i].new.attrs) + put = updates[i].new_stored = rte_store(&updates[i].new, n, tab); + else + put = updates[i].old; + + *nptr = put; + nptr = &put->next; + } + *nptr = NULL; + + /* Call the pre-comparison hooks */ + for (uint i = 0; i < pos; i++) + if (updates[i].new_stored) + { /* Get a new ID for the route */ - new->rte.lastmod = current_time(); - new->rte.id = hmap_first_zero(&tab->id_map); - hmap_set(&tab->id_map, new->rte.id); + updates[i].new_stored->rte.lastmod = current_time(); + updates[i].new_stored->rte.id = hmap_first_zero(&tab->id_map); + hmap_set(&tab->id_map, updates[i].new_stored->rte.id); - lp_flush(tab->nhu_lp); + /* Call a pre-comparison hook */ + /* Not really an efficient way to compute this */ + if (updates[i].old->rte.src->owner->rte_recalculate) + updates[i].old->rte.src->owner->rte_recalculate(tab, n, &updates[i].new_stored->rte, &updates[i].old->rte, &old_best->rte); } - ASSERT_DIE(pos == count); +#if DEBUGGING + { + uint t = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + t++; + ASSERT_DIE(t == pos); + ASSERT_DIE(pos == count); + } +#endif /* Find the new best route */ struct rte_storage **new_best = NULL; @@ -2797,7 +3845,7 @@ rt_next_hop_update_net(rtable_private *tab, net *n) } /* Relink the new best route to the first position */ - new = *new_best; + struct rte_storage *new = *new_best; if (new != n->routes) { *new_best = new->next; @@ -2805,95 +3853,170 @@ rt_next_hop_update_net(rtable_private *tab, net *n) n->routes = new; } + uint total = 0; /* Announce the changes */ - for (int i=0; i<count; i++) + for (uint i=0; i<count; i++) { - _Bool nb = (new == updates[i].new), ob = (old_best == updates[i].old); + if (!updates[i].new_stored) + continue; + + _Bool nb = (new->rte.src == updates[i].new.src), ob = (i == 0); const char *best_indicator[2][2] = { { "autoupdated", "autoupdated [-best]" }, { "autoupdated [+best]", "autoupdated [best]" } }; - rt_rte_trace_in(D_ROUTES, updates[i].new->rte.sender->req, &updates[i].new->rte, best_indicator[nb][ob]); - rte_announce(tab, n, updates[i].new, updates[i].old, new, old_best); + rt_rte_trace_in(D_ROUTES, updates[i].new.sender->req, &updates[i].new, best_indicator[nb][ob]); + rte_announce(tab, n, updates[i].new_stored, updates[i].old, new, old_best); + + total++; } - return count; + return total; +} + +static void +rt_nhu_uncork(void *_tab) +{ + RT_LOCKED((rtable *) _tab, tab) + { + ASSERT_DIE(tab->nhu_corked); + ASSERT_DIE(tab->nhu_state == 0); + + /* Reset the state */ + tab->nhu_state = tab->nhu_corked; + tab->nhu_corked = 0; + rt_trace(tab, D_STATES, "Next hop updater uncorked"); + + birdloop_flag(tab->loop, RTF_NHU); + } } static void -rt_next_hop_update(void *data) +rt_next_hop_update(struct rtable_private *tab) { - rtable_private *tab = data; ASSERT_DIE(birdloop_inside(tab->loop)); - struct fib_iterator *fit = &tab->nhu_fit; - int max_feed = 32; + if (tab->nhu_corked) + return; - if (atomic_load_explicit(&tab->nhu_state, memory_order_acquire) == NHU_CLEAN) + if (!tab->nhu_state) return; - rt_lock_table(tab); + /* Check corkedness */ + if (rt_cork_check(tab->nhu_uncork_event)) + { + rt_trace(tab, D_STATES, "Next hop updater corked"); + if ((tab->nhu_state & NHU_RUNNING) + && !EMPTY_LIST(tab->exporter.pending)) + rt_kick_export_settle(tab); - if (atomic_load_explicit(&tab->nhu_state, memory_order_acquire) == NHU_SCHEDULED) - { - FIB_ITERATE_INIT(fit, &tab->fib); - ASSERT_DIE(atomic_exchange_explicit(&tab->nhu_state, NHU_RUNNING, memory_order_acq_rel) == NHU_SCHEDULED); - } + tab->nhu_corked = tab->nhu_state; + tab->nhu_state = 0; + return; + } + + struct fib_iterator *fit = &tab->nhu_fit; + int max_feed = 32; + /* Initialize a new run */ + if (tab->nhu_state == NHU_SCHEDULED) + { + FIB_ITERATE_INIT(fit, &tab->fib); + tab->nhu_state = NHU_RUNNING; + + if (tab->flowspec_trie) + rt_flowspec_reset_trie(tab); + } + + /* Walk the fib one net after another */ FIB_ITERATE_START(&tab->fib, fit, net, n) { if (max_feed <= 0) { FIB_ITERATE_PUT(fit); - ev_send_loop(tab->loop, tab->nhu_event); - rt_unlock_table(tab); + birdloop_flag(tab->loop, RTF_NHU); return; } + lp_state lps; + lp_save(tmp_linpool, &lps); max_feed -= rt_next_hop_update_net(tab, n); + lp_restore(tmp_linpool, &lps); } FIB_ITERATE_END; + /* Finished NHU, cleanup */ + rt_trace(tab, D_EVENTS, "NHU done, scheduling export timer"); + rt_kick_export_settle(tab); + /* State change: * NHU_DIRTY -> NHU_SCHEDULED * NHU_RUNNING -> NHU_CLEAN */ - if (atomic_fetch_and_explicit(&tab->nhu_state, NHU_SCHEDULED, memory_order_acq_rel) != NHU_RUNNING) - ev_send_loop(tab->loop, tab->nhu_event); + if ((tab->nhu_state &= NHU_SCHEDULED) == NHU_SCHEDULED) + birdloop_flag(tab->loop, RTF_NHU); +} - ev_send_loop(tab->loop, tab->announce_event); +void +rt_new_default_table(struct symbol *s) +{ + for (uint addr_type = 0; addr_type < NET_MAX; addr_type++) + if (s == new_config->def_tables[addr_type]) + { + ASSERT_DIE(!s->table); + s->table = rt_new_table(s, addr_type); + return; + } - rt_unlock_table(tab); + bug("Requested an unknown new default table: %s", s->name); } +struct rtable_config * +rt_get_default_table(struct config *cf, uint addr_type) +{ + struct symbol *ts = cf->def_tables[addr_type]; + if (!ts) + return NULL; + + if (!ts->table) + rt_new_default_table(ts); + + return ts->table; +} struct rtable_config * rt_new_table(struct symbol *s, uint addr_type) { - /* Hack that allows to 'redefine' the master table */ - if ((s->class == SYM_TABLE) && - (s->table == new_config->def_tables[addr_type]) && - ((addr_type == NET_IP4) || (addr_type == NET_IP6))) - return s->table; + if (s->table) + cf_error("Duplicate configuration of table %s", s->name); struct rtable_config *c = cfg_allocz(sizeof(struct rtable_config)); - cf_define_symbol(s, SYM_TABLE, table, c); + if (s == new_config->def_tables[addr_type]) + s->table = c; + else + cf_define_symbol(s, SYM_TABLE, table, c); + c->name = s->name; c->addr_type = addr_type; - c->gc_max_ops = 1000; - c->gc_min_time = 5; - c->min_settle_time = 1 S; - c->max_settle_time = 20 S; - c->min_rr_settle_time = 30 S; - c->max_rr_settle_time = 90 S; - c->cork_limit = 4 * page_size / sizeof(struct rt_pending_export); - c->owner = new_config; + c->gc_threshold = 1000; + c->gc_period = (uint) -1; /* set in rt_postconfig() */ + c->cork_threshold.low = 1024; + c->cork_threshold.high = 8192; + c->export_settle = (struct settle_config) { + .min = 1 MS, + .max = 100 MS, + }; + c->export_rr_settle = (struct settle_config) { + .min = 100 MS, + .max = 3 S, + }; + c->debug = new_config->table_debug; add_tail(&new_config->tables, &c->n); /* First table of each type is kept as default */ if (! new_config->def_tables[addr_type]) - new_config->def_tables[addr_type] = c; + new_config->def_tables[addr_type] = s; return c; } @@ -2907,8 +4030,9 @@ rt_new_table(struct symbol *s, uint addr_type) * configuration. */ void -rt_lock_table(rtable_private *r) +rt_lock_table_priv(struct rtable_private *r, const char *file, uint line) { + rt_trace(r, D_STATES, "Locked at %s:%d", file, line); r->use_count++; } @@ -2921,35 +4045,110 @@ rt_lock_table(rtable_private *r) * for deletion by configuration changes. */ void -rt_unlock_table(rtable_private *r) +rt_unlock_table_priv(struct rtable_private *r, const char *file, uint line) { - if (!--r->use_count && r->delete && - !r->prune_state && !atomic_load_explicit(&r->nhu_state, memory_order_acquire)) - birdloop_stop_self(r->loop, r->delete, r); + rt_trace(r, D_STATES, "Unlocked at %s:%d", file, line); + if (!--r->use_count && r->deleted) + /* Stop the service thread to finish this up */ + ev_send(&global_event_list, ev_new_init(r->rp, rt_shutdown, r)); } -static struct rtable_config * -rt_find_table_config(struct config *cf, char *name) +static void +rt_shutdown(void *tab_) { - struct symbol *sym = cf_find_symbol(cf, name); - return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL; + struct rtable_private *r = tab_; + birdloop_stop(r->loop, rt_delete, r); +} + +static void +rt_delete(void *tab_) +{ + birdloop_enter(&main_birdloop); + + /* We assume that nobody holds the table reference now as use_count is zero. + * Anyway the last holder may still hold the lock. Therefore we lock and + * unlock it the last time to be sure that nobody is there. */ + struct rtable_private *tab = RT_LOCK((rtable *) tab_); + struct config *conf = tab->deleted; + + RT_UNLOCK(RT_PUB(tab)); + + birdloop_free(tab->loop); + rfree(tab->rp); + config_del_obstacle(conf); + + birdloop_leave(&main_birdloop); } + static void -rt_done(void *data) +rt_check_cork_low(struct rtable_private *tab) { - RT_LOCKED((rtable *) data, t) + if (!tab->cork_active) + return; + + if (tab->deleted || !tab->exporter.first || (tab->exporter.first->seq + tab->cork_threshold.low > tab->exporter.next_seq)) { - struct rtable_config *tc = t->config; - struct config *c = tc->owner; + tab->cork_active = 0; + rt_cork_release(); - tc->table = NULL; - rem_node(&t->n); + rt_trace(tab, D_STATES, "Uncorked"); + } +} - config_del_obstacle(c); +static void +rt_check_cork_high(struct rtable_private *tab) +{ + if (!tab->deleted && !tab->cork_active && tab->exporter.first && (tab->exporter.first->seq + tab->cork_threshold.high <= tab->exporter.next_seq)) + { + tab->cork_active = 1; + rt_cork_acquire(); + rt_export_used(&tab->exporter, tab->name, "corked"); + + rt_trace(tab, D_STATES, "Corked"); } } + +static int +rt_reconfigure(struct rtable_private *tab, struct rtable_config *new, struct rtable_config *old) +{ + if ((new->addr_type != old->addr_type) || + (new->sorted != old->sorted) || + (new->trie_used != old->trie_used)) + return 0; + + DBG("\t%s: same\n", new->name); + new->table = RT_PUB(tab); + tab->name = new->name; + tab->config = new; + + if (tab->hostcache) + tab->hostcache->req.trace_routes = new->debug; + + struct rt_table_export_hook *hook; node *n; + WALK_LIST2(hook, n, tab->exporter.e.hooks, h.n) + if (hook->h.req->export_one == rt_flowspec_export_one) + hook->h.req->trace_routes = new->debug; + + tab->cork_threshold = new->cork_threshold; + + if (new->cork_threshold.high != old->cork_threshold.high) + rt_check_cork_high(tab); + + if (new->cork_threshold.low != old->cork_threshold.low) + rt_check_cork_low(tab); + + return 1; +} + +static struct rtable_config * +rt_find_table_config(struct config *cf, char *name) +{ + struct symbol *sym = cf_find_symbol(cf, name); + return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL; +} + /** * rt_commit - commit new routing table configuration * @new: new configuration @@ -2972,30 +4171,34 @@ rt_commit(struct config *new, struct config *old) { WALK_LIST(o, old->tables) { - RT_LOCK(o->table); - rtable_private *ot = RT_PRIV(o->table); - if (!ot->delete) - { - r = rt_find_table_config(new, o->name); - if (r && (r->addr_type == o->addr_type) && !new->shutdown) - { - DBG("\t%s: same\n", o->name); - r->table = (rtable *) ot; - ot->name = r->name; - ot->config = r; - if (o->sorted != r->sorted) - log(L_WARN "Reconfiguration of rtable sorted flag not implemented"); - } - else - { - DBG("\t%s: deleted\n", o->name); - rt_lock_table(ot); - ot->delete = rt_done; - config_add_obstacle(old); - rt_unlock_table(ot); - } - } - RT_UNLOCK(o->table); + struct rtable_private *tab = RT_LOCK(o->table); + + if (tab->deleted) + { + RT_UNLOCK(tab); + continue; + } + + r = rt_find_table_config(new, o->name); + if (r && !new->shutdown && rt_reconfigure(tab, r, o)) + { + RT_UNLOCK(tab); + continue; + } + + DBG("\t%s: deleted\n", o->name); + tab->deleted = old; + config_add_obstacle(old); + rt_lock_table(tab); + + rt_check_cork_low(tab); + + if (tab->hostcache && ev_get_list(&tab->hostcache->update) == &rt_cork.queue) + ev_postpone(&tab->hostcache->update); + + /* Force one more loop run */ + birdloop_flag(tab->loop, RTF_DELETE); + RT_UNLOCK(tab); } } @@ -3009,8 +4212,98 @@ rt_commit(struct config *new, struct config *old) DBG("\tdone\n"); } +static void +rt_feed_done(struct rt_export_hook *c) +{ + c->event.hook = rt_export_hook; + + rt_set_export_state(c, TES_READY); + + rt_send_export_event(c); +} + +#define MAX_FEED_BLOCK 1024 +typedef struct { + uint cnt, pos; + union { + struct rt_pending_export *rpe; + struct { + const rte **feed; + struct rt_feed_block_aux { + struct rt_pending_export *first, *last; + uint start; + } *aux; + }; + }; +} rt_feed_block; + +static int +rt_prepare_feed(struct rt_table_export_hook *c, net *n, rt_feed_block *b) +{ + if (n->routes) + { + if (c->h.req->export_bulk) + { + uint cnt = rte_feed_count(n); + if (b->cnt && (b->cnt + cnt > MAX_FEED_BLOCK)) + return 0; + + if (!b->cnt) + { + b->feed = tmp_alloc(sizeof(rte *) * MAX(MAX_FEED_BLOCK, cnt)); + + uint aux_block_size = (cnt >= MAX_FEED_BLOCK) ? 2 : (MAX_FEED_BLOCK + 2 - cnt); + b->aux = tmp_alloc(sizeof(struct rt_feed_block_aux) * aux_block_size); + } + + rte_feed_obtain(n, &b->feed[b->cnt], cnt); + + b->aux[b->pos++] = (struct rt_feed_block_aux) { + .start = b->cnt, + .first = n->first, + .last = n->last, + }; + + b->cnt += cnt; + } + else if (b->pos == MAX_FEED_BLOCK) + return 0; + else + { + if (!b->pos) + b->rpe = tmp_alloc(sizeof(struct rt_pending_export) * MAX_FEED_BLOCK); + + b->rpe[b->pos++] = (struct rt_pending_export) { .new = n->routes, .new_best = n->routes }; + } + } + + return 1; +} + +static void +rt_process_feed(struct rt_table_export_hook *c, rt_feed_block *b) +{ + if (!b->pos) + return; + + if (c->h.req->export_bulk) + { + b->aux[b->pos].start = b->cnt; + for (uint p = 0; p < b->pos; p++) + { + struct rt_feed_block_aux *aux = &b->aux[p]; + const rte **feed = &b->feed[aux->start]; + + c->h.req->export_bulk(c->h.req, feed[0]->net, aux->first, aux->last, feed, (aux+1)->start - aux->start); + } + } + else + for (uint p = 0; p < b->pos; p++) + c->h.req->export_one(c->h.req, b->rpe[p].new->rte.net, &b->rpe[p]); +} + /** - * rt_feed_channel - advertise all routes to a channel + * rt_feed_by_fib - advertise all routes to a channel by walking a fib * @c: channel to be fed * * This function performs one pass of advertisement of routes to a channel that @@ -3019,119 +4312,154 @@ rt_commit(struct config *new, struct config *old) * order not to monopolize CPU time.) */ static void -rt_feed_channel(void *data) +rt_feed_by_fib(void *data) { - struct rt_export_hook *c = data; - + struct rt_table_export_hook *c = data; struct fib_iterator *fit = &c->feed_fit; - int max_feed = 256; - - rtable_private *tab; - if (c->export_state == TES_HUNGRY) - { - rt_set_export_state(c, TES_FEEDING); - - tab = RT_LOCK(c->table); + rt_feed_block block = {}; - struct rt_pending_export *rpe = rt_last_export(tab); - DBG("store hook=%p last_export=%p seq=%lu\n", c, rpe, rpe ? rpe->seq : 0); - atomic_store_explicit(&c->last_export, rpe, memory_order_relaxed); + ASSERT(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); - FIB_ITERATE_INIT(&c->feed_fit, &tab->fib); - } - else - tab = RT_LOCK(c->table); - - ASSERT_DIE(c->export_state == TES_FEEDING); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { -redo: FIB_ITERATE_START(&tab->fib, fit, net, n) { - if (max_feed <= 0) + if ((c->h.req->addr_mode == TE_ADDR_NONE) || net_in_netX(n->n.addr, c->h.req->addr)) + { + if (!rt_prepare_feed(c, n, &block)) { FIB_ITERATE_PUT(fit); - rt_send_export_event(c); - - RT_UNLOCK(c->table); + RT_UNLOCK(tab); + rt_process_feed(c, &block); + rt_send_export_event(&c->h); return; } - - if (atomic_load_explicit(&c->export_state, memory_order_acquire) != TES_FEEDING) - { - RT_UNLOCK(c->table); - return; } + } + FIB_ITERATE_END; + } - if (!n->routes || !rte_is_valid(&n->routes->rte)) - ; /* if no route, do nothing */ - else if (c->req->export_bulk) - { - uint count = rte_feed_count(n); - if (count) - { - rte **feed = alloca(count * sizeof(rte *)); - rte_feed_obtain(n, feed, count); + rt_process_feed(c, &block); + rt_feed_done(&c->h); +} + +static void +rt_feed_by_trie(void *data) +{ + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; - struct rt_pending_export *rpe_last, *rpe_first = n->first; - for (struct rt_pending_export *rpe = rpe_first; rpe; rpe = rpe_next(rpe, NULL)) - rpe_last = rpe; + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { - FIB_ITERATE_PUT_NEXT(fit, &tab->fib); - RT_UNLOCK(c->table); + ASSERT_DIE(c->walk_state); + struct f_trie_walk_state *ws = c->walk_state; - c->req->export_bulk(c->req, n->n.addr, NULL, feed, count); + ASSERT(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); - RT_LOCK(c->table); + do { + if (!c->walk_last.type) + continue; - for (struct rt_pending_export *rpe = rpe_first; rpe; rpe = rpe_next(rpe, NULL)) - { - rpe_mark_seen(c, rpe); - if (rpe == rpe_last) - break; - ASSERT_DIE(rpe->seq < rpe_last->seq); - } + net *n = net_find(tab, &c->walk_last); + if (!n) + continue; - max_feed -= count; + if (!rt_prepare_feed(c, n, &block)) + { + RT_UNLOCK(tab); + rt_process_feed(c, &block); + rt_send_export_event(&c->h); + return; + } + } + while (trie_walk_next(ws, &c->walk_last)); - goto redo; - } - } - else if (c->req->export_one) - { - struct rt_pending_export rpe = { .new = n->routes, .new_best = n->routes }; + rt_unlock_trie(tab, c->walk_lock); + c->walk_lock = NULL; - struct rt_pending_export *rpe_last, *rpe_first = n->first; - for (struct rt_pending_export *rpe = rpe_first; rpe; rpe = rpe_next(rpe, NULL)) - rpe_last = rpe; + mb_free(c->walk_state); + c->walk_state = NULL; - FIB_ITERATE_PUT_NEXT(fit, &tab->fib); - RT_UNLOCK(c->table); + c->walk_last.type = 0; - c->req->export_one(c->req, n->n.addr, &rpe); + } - RT_LOCK(c->table); - for (struct rt_pending_export *rpe = rpe_first; rpe; rpe = rpe_next(rpe, NULL)) - { - rpe_mark_seen(c, rpe); - if (rpe == rpe_last) - break; - ASSERT_DIE(rpe->seq < rpe_last->seq); - } + rt_process_feed(c, &block); + rt_feed_done(&c->h); +} - max_feed--; - goto redo; - } - else - bug("Export request must always provide an export method"); - } - FIB_ITERATE_END; +static void +rt_feed_equal(void *data) +{ + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; + net *n; - c->event->hook = rt_export_hook; - rt_send_export_event(c); + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); + ASSERT_DIE(c->h.req->addr_mode == TE_ADDR_EQUAL); - RT_UNLOCK(c->table); + if (n = net_find(tab, c->h.req->addr)) + ASSERT_DIE(rt_prepare_feed(c, n, &block)); + } - rt_set_export_state(c, TES_READY); + if (n) + rt_process_feed(c, &block); + + rt_feed_done(&c->h); +} + +static void +rt_feed_for(void *data) +{ + struct rt_table_export_hook *c = data; + rt_feed_block block = {}; + net *n; + + RT_LOCKED(RT_PUB(SKIP_BACK(struct rtable_private, exporter, c->table)), tab) + { + ASSERT_DIE(atomic_load_explicit(&c->h.export_state, memory_order_relaxed) == TES_FEEDING); + ASSERT_DIE(c->h.req->addr_mode == TE_ADDR_FOR); + + if (n = net_route(tab, c->h.req->addr)) + ASSERT_DIE(rt_prepare_feed(c, n, &block)); + } + + if (n) + rt_process_feed(c, &block); + + rt_feed_done(&c->h); +} + + +/* + * Import table + */ + +void channel_reload_export_bulk(struct rt_export_request *req, const net_addr *net, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) +{ + struct channel *c = SKIP_BACK(struct channel, reload_req, req); + + for (uint i=0; i<count; i++) + if (feed[i]->sender == c->in_req.hook) + { + /* Strip the table-specific information */ + rte new = rte_init_from(feed[i]); + + /* Strip the later attribute layers */ + while (new.attrs->next) + new.attrs = new.attrs->next; + + /* And reload the route */ + rte_update(c, net, &new, new.src); + } + + rpe_mark_seen_all(req->hook, first, last, NULL); } @@ -3230,7 +4558,7 @@ hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he) rem_node(&he->ln); hc_remove(hc, he); - sl_free(hc->slab, he); + sl_free(he); hc->hash_items--; if (hc->hash_items < hc->hash_min) @@ -3238,7 +4566,56 @@ hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he) } static void -rt_init_hostcache(rtable_private *tab) +hc_notify_dump_req(struct rt_export_request *req) +{ + debug(" Table %s (%p)\n", req->name, req); +} + +static void +hc_notify_log_state_change(struct rt_export_request *req, u8 state) +{ + struct hostcache *hc = SKIP_BACK(struct hostcache, req, req); + rt_trace((rtable *) hc->update.data, D_STATES, "HCU Export state changed to %s", rt_export_state_name(state)); +} + +static void +hc_notify_export_one(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first) +{ + struct hostcache *hc = SKIP_BACK(struct hostcache, req, req); + + /* No interest in this update, mark seen only */ + int interested = 1; + RT_LOCKED((rtable *) hc->update.data, tab) + if (ev_active(&hc->update) || !trie_match_net(hc->trie, net)) + { + rpe_mark_seen_all(req->hook, first, NULL, NULL); + interested = 0; + } + + if (!interested) + return; + + /* This net may affect some hostentries, check the actual change */ + rte *o = RTE_VALID_OR_NULL(first->old_best); + struct rte_storage *new_best = first->new_best; + + RPE_WALK(first, rpe, NULL) + { + rpe_mark_seen(req->hook, rpe); + new_best = rpe->new_best; + } + + /* Yes, something has actually changed. Do the hostcache update. */ + if (o != RTE_VALID_OR_NULL(new_best)) + RT_LOCKED((rtable *) hc->update.data, tab) + if ((atomic_load_explicit(&req->hook->export_state, memory_order_acquire) == TES_READY) + && !ev_active(&hc->update)) + ev_send_loop(tab->loop, &hc->update); +} + + +static void +rt_init_hostcache(struct rtable_private *tab) { struct hostcache *hc = mb_allocz(tab->rp, sizeof(struct hostcache)); init_list(&hc->hostentries); @@ -3247,14 +4624,30 @@ rt_init_hostcache(rtable_private *tab) hc_alloc_table(hc, tab->rp, HC_DEF_ORDER); hc->slab = sl_new(tab->rp, sizeof(struct hostentry)); - hc->lp = lp_new(tab->rp, LP_GOOD_SIZE(1024)); + hc->lp = lp_new(tab->rp); hc->trie = f_new_trie(hc->lp, 0); + hc->update = (event) { + .hook = rt_update_hostcache, + .data = tab, + }; + + hc->req = (struct rt_export_request) { + .name = mb_sprintf(tab->rp, "%s.hcu.notifier", tab->name), + .list = birdloop_event_list(tab->loop), + .trace_routes = tab->config->debug, + .dump_req = hc_notify_dump_req, + .log_state_change = hc_notify_log_state_change, + .export_one = hc_notify_export_one, + }; + + rt_table_export_start_locked(tab, &hc->req); + tab->hostcache = hc; } static void -rt_free_hostcache(rtable_private *tab) +rt_free_hostcache(struct rtable_private *tab) { struct hostcache *hc = tab->hostcache; @@ -3276,16 +4669,6 @@ rt_free_hostcache(rtable_private *tab) */ } -static void -rt_notify_hostcache(rtable_private *tab, net *net) -{ - if (ev_active(tab->hcu_event)) - return; - - if (trie_match_net(tab->hostcache->trie, net->n.addr)) - ev_send_loop(tab->loop, tab->hcu_event); -} - static int if_local_addr(ip_addr a, struct iface *i) { @@ -3299,14 +4682,14 @@ if_local_addr(ip_addr a, struct iface *i) } u32 -rt_get_igp_metric(rte *rt) +rt_get_igp_metric(const rte *rt) { - eattr *ea = ea_find(rt->attrs->eattrs, EA_GEN_IGP_METRIC); + eattr *ea = ea_find(rt->attrs, "igp_metric"); if (ea) return ea->u.data; - if (rt->attrs->source == RTS_DEVICE) + if (rt_get_source_attr(rt) == RTS_DEVICE) return 0; if (rt->src->owner->class->rte_igp_metric) @@ -3316,15 +4699,14 @@ rt_get_igp_metric(rte *rt) } static int -rt_update_hostentry(rtable_private *tab, struct hostentry *he) +rt_update_hostentry(struct rtable_private *tab, struct hostentry *he) { - rta *old_src = he->src; + ea_list *old_src = he->src; int direct = 0; int pxlen = 0; /* Reset the hostentry */ he->src = NULL; - he->dest = RTD_UNREACHABLE; he->nexthop_linkable = 0; he->igp_metric = 0; @@ -3334,11 +4716,13 @@ rt_update_hostentry(rtable_private *tab, struct hostentry *he) if (n) { struct rte_storage *e = n->routes; - rta *a = e->rte.attrs; - word pref = a->pref; + ea_list *a = e->rte.attrs; + u32 pref = rt_get_preference(&e->rte); for (struct rte_storage *ee = n->routes; ee; ee = ee->next) - if ((ee->rte.attrs->pref >= pref) && ee->rte.attrs->hostentry) + if (rte_is_valid(&ee->rte) && + (rt_get_preference(&ee->rte) >= pref) && + ea_find(ee->rte.attrs, &ea_gen_hostentry)) { /* Recursive route should not depend on another recursive route */ log(L_WARN "Next hop address %I resolvable through recursive route for %N", @@ -3348,9 +4732,12 @@ rt_update_hostentry(rtable_private *tab, struct hostentry *he) pxlen = n->n.addr->pxlen; - if (a->dest == RTD_UNICAST) - { - for (struct nexthop *nh = &(a->nh); nh; nh = nh->next) + eattr *nhea = ea_find(a, &ea_gen_nexthop); + ASSERT_DIE(nhea); + struct nexthop_adata *nhad = (void *) nhea->u.ptr; + + if (NEXTHOP_IS_REACHABLE(nhad)) + NEXTHOP_WALK(nh, nhad) if (ipa_zero(nh->gw)) { if (if_local_addr(he->addr, nh->iface)) @@ -3363,10 +4750,8 @@ rt_update_hostentry(rtable_private *tab, struct hostentry *he) direct++; } - } he->src = rta_clone(a); - he->dest = a->dest; he->nexthop_linkable = !direct; he->igp_metric = rt_get_igp_metric(&e->rte); } @@ -3382,10 +4767,26 @@ done: static void rt_update_hostcache(void *data) { - rtable_private *tab = data; - ASSERT_DIE(birdloop_inside(tab->loop)); + rtable **nhu_pending; + + RT_LOCKED((rtable *) data, tab) + { struct hostcache *hc = tab->hostcache; + + /* Shutdown shortcut */ + if (!hc->req.hook) + RT_RETURN(tab); + + if (rt_cork_check(&hc->update)) + { + rt_trace(tab, D_STATES, "Hostcache update corked"); + RT_RETURN(tab); + } + + /* Destination schedule map */ + nhu_pending = tmp_allocz(sizeof(rtable *) * rtable_max_id); + struct hostentry *he; node *n, *x; @@ -3403,31 +4804,33 @@ rt_update_hostcache(void *data) } if (rt_update_hostentry(tab, he)) - rt_schedule_nhu(he->tab); + nhu_pending[he->tab->id] = he->tab; } + } + + for (uint i=0; i<rtable_max_id; i++) + if (nhu_pending[i]) + RT_LOCKED(nhu_pending[i], dst) + rt_schedule_nhu(dst); } -struct hostentry * -rt_get_hostentry(rtable *t, ip_addr a, ip_addr ll, rtable *dep) +static struct hostentry * +rt_get_hostentry(struct rtable_private *tab, ip_addr a, ip_addr ll, rtable *dep) { + ip_addr link = ipa_zero(ll) ? a : ll; struct hostentry *he; - rtable_private *tab = RT_LOCK(t); - if (!tab->hostcache) rt_init_hostcache(tab); u32 k = hc_hash(a, dep); struct hostcache *hc = tab->hostcache; for (he = hc->hash_table[k >> hc->hash_shift]; he != NULL; he = he->next) - if (ipa_equal(he->addr, a) && (he->tab == dep)) - goto done; + if (ipa_equal(he->addr, a) && ipa_equal(he->link, link) && (he->tab == dep)) + return he; - he = hc_new_hostentry(hc, tab->rp, a, ipa_zero(ll) ? a : ll, dep, k); + he = hc_new_hostentry(hc, tab->rp, a, link, dep, k); rt_update_hostentry(tab, he); - -done: - RT_UNLOCK(t); return he; } diff --git a/nest/rt.h b/nest/rt.h new file mode 100644 index 00000000..01372b8c --- /dev/null +++ b/nest/rt.h @@ -0,0 +1,706 @@ +/* + * BIRD Internet Routing Daemon -- Routing Table + * + * (c) 1998--2000 Martin Mares <mj@ucw.cz> + * (c) 2019--2021 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_NEST_RT_H_ +#define _BIRD_NEST_RT_H_ + +#include "lib/lists.h" +#include "lib/bitmap.h" +#include "lib/resource.h" +#include "lib/net.h" +#include "lib/type.h" +#include "lib/fib.h" +#include "lib/route.h" +#include "lib/event.h" +#include "lib/rcu.h" +#include "lib/io-loop.h" +#include "lib/settle.h" + +#include <stdatomic.h> + +struct ea_list; +struct protocol; +struct proto; +struct channel; +struct rte_src; +struct hostcache; +struct symbol; +struct timer; +struct filter; +struct f_trie; +struct f_trie_walk_state; +struct cli; + +struct rt_cork_threshold { + u64 low, high; +}; + +/* + * Master Routing Tables. Generally speaking, each of them contains a FIB + * with each entry pointing to a list of route entries representing routes + * to given network (with the selected one at the head). + * + * Each of the RTE's contains variable data (the preference and protocol-dependent + * metrics) and a pointer to a route attribute block common for many routes). + * + * It's guaranteed that there is at most one RTE for every (prefix,proto) pair. + */ + +struct rtable_config { + node n; + char *name; + union rtable *table; + struct proto_config *krt_attached; /* Kernel syncer attached to this table */ + uint addr_type; /* Type of address data stored in table (NET_*) */ + uint gc_threshold; /* Maximum number of operations before GC is run */ + uint gc_period; /* Approximate time between two consecutive GC runs */ + byte sorted; /* Routes of network are sorted according to rte_better() */ + byte trie_used; /* Rtable has attached trie */ + byte debug; /* Whether to log */ + struct rt_cork_threshold cork_threshold; /* Cork threshold values */ + struct settle_config export_settle; /* Export announcement settler */ + struct settle_config export_rr_settle;/* Export announcement settler config valid when any + route refresh is running */ +}; + +struct rt_export_hook; +struct rt_export_request; +struct rt_exporter; + +struct rt_exporter_class { + void (*start)(struct rt_exporter *, struct rt_export_request *); + void (*stop)(struct rt_export_hook *); + void (*done)(void *_rt_export_hook); +}; + +struct rt_exporter { + const struct rt_exporter_class *class; + pool *rp; + list hooks; /* Registered route export hooks */ + uint addr_type; /* Type of address data exported (NET_*) */ +}; + +struct rt_table_exporter { + struct rt_exporter e; + list pending; /* List of packed struct rt_pending_export */ + + struct rt_pending_export *first; /* First export to announce */ + u64 next_seq; /* The next export will have this ID */ +}; + +extern uint rtable_max_id; + +DEFINE_DOMAIN(rtable); + +/* The public part of rtable structure */ +#define RTABLE_PUBLIC \ + resource r; \ + node n; /* Node in list of all tables */ \ + char *name; /* Name of this table */ \ + uint addr_type; /* Type of address data stored in table (NET_*) */ \ + uint id; /* Integer table ID for fast lookup */ \ + DOMAIN(rtable) lock; /* Lock to take to access the private parts */ \ + struct rtable_config *config; /* Configuration of this table */ \ + struct birdloop *loop; /* Service thread */ \ + +/* The complete rtable structure */ +struct rtable_private { + /* Once more the public part */ + RTABLE_PUBLIC; + + /* Here the private items not to be accessed without locking */ + pool *rp; /* Resource pool to allocate everything from, including itself */ + struct slab *rte_slab; /* Slab to allocate route objects */ + struct fib fib; + struct f_trie *trie; /* Trie of prefixes defined in fib */ + int use_count; /* Number of protocols using this table */ + u32 rt_count; /* Number of routes in the table */ + + list imports; /* Registered route importers */ + struct rt_table_exporter exporter; /* Exporter API structure */ + + struct hmap id_map; + struct hostcache *hostcache; + struct config *deleted; /* Table doesn't exist in current configuration, + * delete as soon as use_count becomes 0 and remove + * obstacle from this routing table. + */ + struct event *nhu_uncork_event; /* Helper event to schedule NHU on uncork */ + struct settle export_settle; /* Export batching settle timer */ + struct timer *prune_timer; /* Timer for periodic pruning / GC */ + struct birdloop_flag_handler fh; /* Handler for simple events */ + btime last_rt_change; /* Last time when route changed */ + btime gc_time; /* Time of last GC */ + uint gc_counter; /* Number of operations since last GC */ + uint rr_counter; /* Number of currently running route refreshes, + in fact sum of (stale_set - stale_pruned) over all importers + + one for each TIS_FLUSHING importer */ + uint wait_counter; /* Number of imports in TIS_WAITING state */ + byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ + byte prune_trie; /* Prune prefix trie during next table prune */ + byte imports_flushing; /* Some imports are being flushed right now */ + byte nhu_state; /* Next Hop Update state */ + byte nhu_corked; /* Next Hop Update is corked with this state */ + byte export_used; /* Pending Export pruning is scheduled */ + byte cork_active; /* Cork has been activated */ + struct rt_cork_threshold cork_threshold; /* Threshold for table cork */ + struct fib_iterator prune_fit; /* Rtable prune FIB iterator */ + struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ + struct f_trie *trie_new; /* New prefix trie defined during pruning */ + struct f_trie *trie_old; /* Old prefix trie waiting to be freed */ + u32 trie_lock_count; /* Prefix trie locked by walks */ + u32 trie_old_lock_count; /* Old prefix trie locked by walks */ + struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */ + + struct f_trie *flowspec_trie; /* Trie for evaluation of flowspec notifications */ +}; + +/* The final union private-public rtable structure */ +typedef union rtable { + struct { + RTABLE_PUBLIC; + }; + struct rtable_private priv; +} rtable; + +#define RT_IS_LOCKED(tab) DOMAIN_IS_LOCKED(rtable, (tab)->lock) + +#define RT_LOCK(tab) ({ LOCK_DOMAIN(rtable, (tab)->lock); &(tab)->priv; }) +#define RT_UNLOCK(tab) UNLOCK_DOMAIN(rtable, (tab)->lock) +#define RT_PRIV(tab) ({ ASSERT_DIE(RT_IS_LOCKED((tab))); &(tab)->priv; }) +#define RT_PUB(tab) SKIP_BACK(rtable, priv, tab) + +#define RT_LOCKED(tpub, tpriv) for (struct rtable_private *tpriv = RT_LOCK(tpub); tpriv; RT_UNLOCK(tpriv), (tpriv = NULL)) +#define RT_RETURN(tpriv, ...) do { RT_UNLOCK(tpriv); return __VA_ARGS__; } while (0) + +#define RT_PRIV_SAME(tpriv, tpub) (&(tpub)->priv == (tpriv)) + +/* Flags for birdloop_flag() */ +#define RTF_CLEANUP 1 +#define RTF_NHU 2 +#define RTF_EXPORT 4 +#define RTF_DELETE 8 + +extern struct rt_cork { + _Atomic uint active; + event_list queue; + event run; +} rt_cork; + +static inline void rt_cork_acquire(void) +{ + atomic_fetch_add_explicit(&rt_cork.active, 1, memory_order_acq_rel); +} + +static inline void rt_cork_release(void) +{ + if (atomic_fetch_sub_explicit(&rt_cork.active, 1, memory_order_acq_rel) == 1) + { + synchronize_rcu(); + ev_send(&global_work_list, &rt_cork.run); + } +} + +static inline int rt_cork_check(event *e) +{ + rcu_read_lock(); + + int corked = (atomic_load_explicit(&rt_cork.active, memory_order_acquire) > 0); + if (corked) + ev_send(&rt_cork.queue, e); + + rcu_read_unlock(); + + return corked; +} + + +typedef struct network { + struct rte_storage *routes; /* Available routes for this network */ + struct rt_pending_export *first, *last; + struct fib_node n; /* FIB flags reserved for kernel syncer */ +} net; + +struct rte_storage { + struct rte_storage *next; /* Next in chain */ + struct rte rte; /* Route data */ +}; + +#define RTE_COPY(r) ((r) ? (r)->rte : (rte) {}) +#define RTE_COPY_VALID(r) (((r) && (rte_is_valid(&(r)->rte))) ? (r)->rte : (rte) {}) +#define RTE_OR_NULL(r) ((r) ? &((r)->rte) : NULL) +#define RTE_VALID_OR_NULL(r) (((r) && (rte_is_valid(&(r)->rte))) ? &((r)->rte) : NULL) + +/* Table-channel connections */ + +struct rt_import_request { + struct rt_import_hook *hook; /* The table part of importer */ + char *name; + u8 trace_routes; + + event_list *list; /* Where to schedule announce events */ + + void (*dump_req)(struct rt_import_request *req); + void (*log_state_change)(struct rt_import_request *req, u8 state); + /* Preimport is called when the @new route is just-to-be inserted, replacing @old. + * Return a route (may be different or modified in-place) to continue or NULL to withdraw. */ + int (*preimport)(struct rt_import_request *req, struct rte *new, struct rte *old); +}; + +struct rt_import_hook { + node n; + rtable *table; /* The connected table */ + struct rt_import_request *req; /* The requestor */ + + struct rt_import_stats { + /* Import - from protocol to core */ + u32 pref; /* Number of routes selected as best in the (adjacent) routing table */ + u32 updates_ignored; /* Number of route updates rejected as already in route table */ + u32 updates_accepted; /* Number of route updates accepted and imported */ + u32 withdraws_ignored; /* Number of route withdraws rejected as already not in route table */ + u32 withdraws_accepted; /* Number of route withdraws accepted and processed */ + } stats; + + u64 flush_seq; /* Table export seq when the channel announced flushing */ + btime last_state_change; /* Time of last state transition */ + + u8 import_state; /* IS_* */ + u8 stale_set; /* Set this stale_cycle to imported routes */ + u8 stale_valid; /* Routes with this stale_cycle and bigger are considered valid */ + u8 stale_pruned; /* Last prune finished when this value was set at stale_valid */ + u8 stale_pruning; /* Last prune started when this value was set at stale_valid */ + + void (*stopped)(struct rt_import_request *); /* Stored callback when import is stopped */ + event announce_event; /* This event announces table updates */ +}; + +struct rt_pending_export { + struct rt_pending_export * _Atomic next; /* Next export for the same destination */ + struct rte_storage *new, *new_best, *old, *old_best; + u64 seq; /* Sequential ID (table-local) of the pending export */ +}; + +struct rt_export_request { + struct rt_export_hook *hook; /* Table part of the export */ + char *name; + const net_addr *addr; /* Network prefilter address */ + u8 trace_routes; + u8 addr_mode; /* Network prefilter mode (TE_ADDR_*) */ + + event_list *list; /* Where to schedule export events */ + + /* There are two methods of export. You can either request feeding every single change + * or feeding the whole route feed. In case of regular export, &export_one is preferred. + * Anyway, when feeding, &export_bulk is preferred, falling back to &export_one. + * Thus, for RA_OPTIMAL, &export_one is only set, + * for RA_MERGED and RA_ACCEPTED, &export_bulk is only set + * and for RA_ANY, both are set to accomodate for feeding all routes but receiving single changes + */ + void (*export_one)(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); + void (*export_bulk)(struct rt_export_request *req, const net_addr *net, + struct rt_pending_export *rpe, struct rt_pending_export *last, + const rte **feed, uint count); + + void (*dump_req)(struct rt_export_request *req); + void (*log_state_change)(struct rt_export_request *req, u8); +}; + +struct rt_export_hook { + node n; + struct rt_exporter *table; /* The connected table */ + + pool *pool; + + struct rt_export_request *req; /* The requestor */ + + struct rt_export_stats { + /* Export - from core to protocol */ + u32 updates_received; /* Number of route updates received */ + u32 withdraws_received; /* Number of route withdraws received */ + } stats; + + btime last_state_change; /* Time of last state transition */ + + _Atomic u8 export_state; /* Route export state (TES_*, see below) */ + struct event event; /* Event running all the export operations */ + + struct bmap seq_map; /* Keep track which exports were already procesed */ + + void (*stopped)(struct rt_export_request *); /* Stored callback when export is stopped */ +}; + +struct rt_table_export_hook { + union { + struct rt_export_hook h; + struct { /* Overriding the parent structure beginning */ + node _n; + struct rt_table_exporter *table; + }; + }; + + union { + struct fib_iterator feed_fit; /* Routing table iterator used during feeding */ + struct { + struct f_trie_walk_state *walk_state; /* Iterator over networks in trie */ + struct f_trie *walk_lock; /* Locked trie for walking */ + union { /* Last net visited but not processed */ + net_addr walk_last; + net_addr_ip4 walk_last_ip4; + net_addr_ip6 walk_last_ip6; + }; + }; + }; + + struct rt_pending_export *_Atomic last_export;/* Last export processed */ + struct rt_pending_export *rpe_next; /* Next pending export to process */ + + u8 refeed_pending; /* Refeeding and another refeed is scheduled */ + u8 feed_type; /* Which feeding method is used (TFT_*, see below) */ + +}; + +#define TIS_DOWN 0 +#define TIS_UP 1 +#define TIS_STOP 2 +#define TIS_FLUSHING 3 +#define TIS_WAITING 4 +#define TIS_CLEARED 5 +#define TIS_MAX 6 + +#define TES_DOWN 0 +#define TES_HUNGRY 1 +#define TES_FEEDING 2 +#define TES_READY 3 +#define TES_STOP 4 +#define TES_MAX 5 + +/* Value of addr_mode */ +#define TE_ADDR_NONE 0 /* No address matching */ +#define TE_ADDR_EQUAL 1 /* Exact query - show route <addr> */ +#define TE_ADDR_FOR 2 /* Longest prefix match - show route for <addr> */ +#define TE_ADDR_IN 3 /* Interval query - show route in <addr> */ + + +#define TFT_FIB 1 +#define TFT_TRIE 2 +#define TFT_HASH 3 + +void rt_request_import(rtable *tab, struct rt_import_request *req); +void rt_request_export(rtable *tab, struct rt_export_request *req); +void rt_request_export_other(struct rt_exporter *tab, struct rt_export_request *req); + +void rt_export_once(struct rt_exporter *tab, struct rt_export_request *req); + +void rt_stop_import(struct rt_import_request *, void (*stopped)(struct rt_import_request *)); +void rt_stop_export(struct rt_export_request *, void (*stopped)(struct rt_export_request *)); + +const char *rt_import_state_name(u8 state); +const char *rt_export_state_name(u8 state); + +static inline u8 rt_import_get_state(struct rt_import_hook *ih) { return ih ? ih->import_state : TIS_DOWN; } +static inline u8 rt_export_get_state(struct rt_export_hook *eh) { return eh ? eh->export_state : TES_DOWN; } + +void rt_set_export_state(struct rt_export_hook *hook, u8 state); + +void rte_import(struct rt_import_request *req, const net_addr *net, rte *new, struct rte_src *src); + +/* + * For table export processing + */ + +/* Get next rpe. If src is given, it must match. */ +struct rt_pending_export *rpe_next(struct rt_pending_export *rpe, struct rte_src *src); + +/* Walk all rpe's */ +#define RPE_WALK(first, it, src) \ + for (struct rt_pending_export *it = (first); it; it = rpe_next(it, (src))) + +/* Mark the pending export processed */ +void rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); + +#define rpe_mark_seen_all(hook, first, last, src) do { \ + RPE_WALK((first), _rpe, (src)) { \ + rpe_mark_seen((hook), _rpe); \ + if (_rpe == last) break; \ + }} while (0) + +/* Get pending export seen status */ +int rpe_get_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); + +/* + * For rt_export_hook and rt_exporter inheritance + */ + +void rt_init_export(struct rt_exporter *re, struct rt_export_hook *hook); +struct rt_export_hook *rt_alloc_export(struct rt_exporter *re, uint size); +void rt_stop_export_common(struct rt_export_hook *hook); +void rt_export_stopped(struct rt_export_hook *hook); +void rt_exporter_init(struct rt_exporter *re); + +/* + * Channel export hooks. To be refactored out. + */ + +int channel_preimport(struct rt_import_request *req, rte *new, rte *old); + +void channel_reload_export_bulk(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first, struct rt_pending_export *last, const rte **feed, uint count); + +void rt_notify_optimal(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); +void rt_notify_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *rpe); +void rt_feed_any(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first, struct rt_pending_export *last, const rte **feed, uint count); +void rt_notify_accepted(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first, struct rt_pending_export *last, const rte **feed, uint count); +void rt_notify_merged(struct rt_export_request *req, const net_addr *net, struct rt_pending_export *first, struct rt_pending_export *last, const rte **feed, uint count); + + + +/* Types of route announcement, also used as flags */ +#define RA_UNDEF 0 /* Undefined RA type */ +#define RA_OPTIMAL 1 /* Announcement of optimal route change */ +#define RA_ACCEPTED 2 /* Announcement of first accepted route */ +#define RA_ANY 3 /* Announcement of any route change */ +#define RA_MERGED 4 /* Announcement of optimal route merged with next ones */ + +/* Return value of preexport() callback */ +#define RIC_ACCEPT 1 /* Accepted by protocol */ +#define RIC_PROCESS 0 /* Process it through import filter */ +#define RIC_REJECT -1 /* Rejected by protocol */ +#define RIC_DROP -2 /* Silently dropped by protocol */ + +/* + * Next hop update data structures + */ + +#define NHU_CLEAN 0 +#define NHU_SCHEDULED 1 +#define NHU_RUNNING 2 +#define NHU_DIRTY 3 + +struct hostentry { + node ln; + ip_addr addr; /* IP address of host, part of key */ + ip_addr link; /* (link-local) IP address of host, used as gw + if host is directly attached */ + rtable *tab; /* Dependent table, part of key */ + struct hostentry *next; /* Next in hash chain */ + unsigned hash_key; /* Hash key */ + unsigned uc; /* Use count */ + ea_list *src; /* Source attributes */ + byte nexthop_linkable; /* Nexthop list is completely non-device */ + u32 igp_metric; /* Chosen route IGP metric */ +}; + +struct hostcache { + slab *slab; /* Slab holding all hostentries */ + struct hostentry **hash_table; /* Hash table for hostentries */ + unsigned hash_order, hash_shift; + unsigned hash_max, hash_min; + unsigned hash_items; + linpool *lp; /* Linpool for trie */ + struct f_trie *trie; /* Trie of prefixes that might affect hostentries */ + list hostentries; /* List of all hostentries */ + event update; + struct rt_export_request req; /* Notifier */ +}; + +struct rt_flowspec_link { + rtable *src; + rtable *dst; + u32 uc; + struct rt_export_request req; +}; + +#define rte_update channel_rte_import +/** + * rte_update - enter a new update to a routing table + * @c: channel doing the update + * @net: network address + * @rte: a &rte representing the new route + * @src: old route source identifier + * + * This function imports a new route to the appropriate table (via the channel). + * Table keys are @net (obligatory) and @rte->attrs->src. + * Both the @net and @rte pointers can be local. + * + * The route attributes (@rte->attrs) are obligatory. They can be also allocated + * locally. Anyway, if you use an already-cached attribute object, you shall + * call rta_clone() on that object yourself. (This semantics may change in future.) + * + * If the route attributes are local, you may set @rte->attrs->src to NULL, then + * the protocol's default route source will be supplied. + * + * When rte_update() gets a route, it automatically validates it. This includes + * checking for validity of the given network and next hop addresses and also + * checking for host-scope or link-scope routes. Then the import filters are + * processed and if accepted, the route is passed to route table recalculation. + * + * The accepted routes are then inserted into the table, replacing the old route + * for the same @net identified by @src. Then the route is announced + * to all the channels connected to the table using the standard export mechanism. + * Setting @rte to NULL makes this a withdraw, otherwise @rte->src must be the same + * as @src. + * + * All memory used for temporary allocations is taken from a special linpool + * @rte_update_pool and freed when rte_update() finishes. + */ +void rte_update(struct channel *c, const net_addr *net, struct rte *rte, struct rte_src *src); + +extern list routing_tables; +struct config; + +void rt_init(void); +void rt_preconfig(struct config *); +void rt_postconfig(struct config *); +void rt_commit(struct config *new, struct config *old); +void rt_lock_table_priv(struct rtable_private *, const char *file, uint line); +void rt_unlock_table_priv(struct rtable_private *, const char *file, uint line); +static inline void rt_lock_table_pub(rtable *t, const char *file, uint line) +{ RT_LOCKED(t, tt) rt_lock_table_priv(tt, file, line); } +static inline void rt_unlock_table_pub(rtable *t, const char *file, uint line) +{ RT_LOCKED(t, tt) rt_unlock_table_priv(tt, file, line); } + +#define rt_lock_table(t) _Generic((t), rtable *: rt_lock_table_pub, \ + struct rtable_private *: rt_lock_table_priv)((t), __FILE__, __LINE__) +#define rt_unlock_table(t) _Generic((t), rtable *: rt_unlock_table_pub, \ + struct rtable_private *: rt_unlock_table_priv)((t), __FILE__, __LINE__) + +struct f_trie * rt_lock_trie(struct rtable_private *tab); +void rt_unlock_trie(struct rtable_private *tab, struct f_trie *trie); +void rt_flowspec_link(rtable *src, rtable *dst); +void rt_flowspec_unlink(rtable *src, rtable *dst); +rtable *rt_setup(pool *, struct rtable_config *); + +static inline net *net_find(struct rtable_private *tab, const net_addr *addr) { return (net *) fib_find(&tab->fib, addr); } +static inline net *net_find_valid(struct rtable_private *tab, const net_addr *addr) +{ net *n = net_find(tab, addr); return (n && n->routes && rte_is_valid(&n->routes->rte)) ? n : NULL; } +static inline net *net_get(struct rtable_private *tab, const net_addr *addr) { return (net *) fib_get(&tab->fib, addr); } +net *net_route(struct rtable_private *tab, const net_addr *n); +int rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter); +rte *rt_export_merged(struct channel *c, const rte ** feed, uint count, linpool *pool, int silent); +void rt_refresh_begin(struct rt_import_request *); +void rt_refresh_end(struct rt_import_request *); +void rt_modify_stale(rtable *t, struct rt_import_request *); +void rt_schedule_prune(struct rtable_private *t); +void rte_dump(struct rte_storage *); +void rt_dump(rtable *); +void rt_dump_all(void); +void rt_dump_hooks(rtable *); +void rt_dump_hooks_all(void); +int rt_reload_channel(struct channel *c); +void rt_reload_channel_abort(struct channel *c); +void rt_refeed_channel(struct channel *c); +void rt_prune_sync(rtable *t, int all); +struct rtable_config *rt_new_table(struct symbol *s, uint addr_type); +void rt_new_default_table(struct symbol *s); +struct rtable_config *rt_get_default_table(struct config *cf, uint addr_type); + +static inline int rt_is_ip(rtable *tab) +{ return (tab->addr_type == NET_IP4) || (tab->addr_type == NET_IP6); } + +static inline int rt_is_vpn(rtable *tab) +{ return (tab->addr_type == NET_VPN4) || (tab->addr_type == NET_VPN6); } + +static inline int rt_is_roa(rtable *tab) +{ return (tab->addr_type == NET_ROA4) || (tab->addr_type == NET_ROA6); } + +static inline int rt_is_flow(rtable *tab) +{ return (tab->addr_type == NET_FLOW4) || (tab->addr_type == NET_FLOW6); } + + +/* Default limit for ECMP next hops, defined in sysdep code */ +extern const int rt_default_ecmp; + +struct rt_show_data_rtable { + node n; + const char *name; + struct rt_exporter *table; + struct channel *export_channel; + struct channel *prefilter; + struct krt_proto *kernel; +}; + +struct rt_show_data { + struct cli *cli; /* Pointer back to the CLI */ + net_addr *addr; + list tables; + struct rt_show_data_rtable *tab; /* Iterator over table list */ + struct rt_show_data_rtable *last_table; /* Last table in output */ + struct rt_export_request req; /* Export request in use */ + int verbose, tables_defined_by; + const struct filter *filter; + struct proto *show_protocol; + struct proto *export_protocol; + struct channel *export_channel; + struct config *running_on_config; + struct rt_export_hook *kernel_export_hook; + int export_mode, addr_mode, primary_only, filtered, stats; + + int net_counter, rt_counter, show_counter, table_counter; + int net_counter_last, rt_counter_last, show_counter_last; + int show_counter_last_flush; +}; + +void rt_show(struct rt_show_data *); +struct rt_show_data_rtable * rt_show_add_exporter(struct rt_show_data *d, struct rt_exporter *t, const char *name); +struct rt_show_data_rtable * rt_show_add_table(struct rt_show_data *d, rtable *t); + +/* Value of table definition mode in struct rt_show_data */ +#define RSD_TDB_DEFAULT 0 /* no table specified */ +#define RSD_TDB_INDIRECT 0 /* show route ... protocol P ... */ +#define RSD_TDB_ALL RSD_TDB_SET /* show route ... table all ... */ +#define RSD_TDB_DIRECT RSD_TDB_SET | RSD_TDB_NMN /* show route ... table X table Y ... */ + +#define RSD_TDB_SET 0x1 /* internal: show empty tables */ +#define RSD_TDB_NMN 0x2 /* internal: need matching net */ + +/* Value of export_mode in struct rt_show_data */ +#define RSEM_NONE 0 /* Export mode not used */ +#define RSEM_PREEXPORT 1 /* Routes ready for export, before filtering */ +#define RSEM_EXPORT 2 /* Routes accepted by export filter */ +#define RSEM_NOEXPORT 3 /* Routes rejected by export filter */ +#define RSEM_EXPORTED 4 /* Routes marked in export map */ + +/* Host entry: Resolve hook for recursive nexthops */ +extern struct ea_class ea_gen_hostentry; +struct hostentry_adata { + adata ad; + struct hostentry *he; + u32 labels[0]; +}; + +void +ea_set_hostentry(ea_list **to, rtable *dep, rtable *tab, ip_addr gw, ip_addr ll, u32 lnum, u32 labels[lnum]); + +void ea_show_hostentry(const struct adata *ad, byte *buf, uint size); +void ea_show_nexthop_list(struct cli *c, struct nexthop_adata *nhad); + +/* + * Default protocol preferences + */ + +#define DEF_PREF_DIRECT 240 /* Directly connected */ +#define DEF_PREF_STATIC 200 /* Static route */ +#define DEF_PREF_OSPF 150 /* OSPF intra-area, inter-area and type 1 external routes */ +#define DEF_PREF_BABEL 130 /* Babel */ +#define DEF_PREF_RIP 120 /* RIP */ +#define DEF_PREF_BGP 100 /* BGP */ +#define DEF_PREF_RPKI 100 /* RPKI */ +#define DEF_PREF_INHERITED 10 /* Routes inherited from other routing daemons */ +#define DEF_PREF_UNKNOWN 0 /* Routes with no preference set */ + +/* + * Route Origin Authorization + */ + +#define ROA_UNKNOWN 0 +#define ROA_VALID 1 +#define ROA_INVALID 2 + +int net_roa_check(rtable *tab, const net_addr *n, u32 asn); + +#endif diff --git a/proto/babel/babel.c b/proto/babel/babel.c index 6d2a593e..d398da8e 100644 --- a/proto/babel/babel.c +++ b/proto/babel/babel.c @@ -14,9 +14,8 @@ /** * DOC: The Babel protocol * - * Babel (RFC6126) is a loop-avoiding distance-vector routing protocol that is - * robust and efficient both in ordinary wired networks and in wireless mesh - * networks. + * The Babel is a loop-avoiding distance-vector routing protocol that is robust + * and efficient both in ordinary wired networks and in wireless mesh networks. * * The Babel protocol keeps state for each neighbour in a &babel_neighbor * struct, tracking received Hello and I Heard You (IHU) messages. A @@ -33,10 +32,17 @@ * an entry is updated by receiving updates from the network or when modified by * internal timers. The function selects from feasible and reachable routes the * one with the lowest metric to be announced to the core. + * + * Supported standards: + * RFC 8966 - The Babel Routing Protocol + * RFC 8967 - MAC Authentication for Babel + * RFC 9079 - Source Specific Routing for Babel + * RFC 9229 - IPv4 Routes with IPv6 Next Hop for Babel */ #include <stdlib.h> #include "babel.h" +#include "lib/macro.h" #define LOG_PKT_AUTH(msg, args...) \ log_rl(&p->log_pkt_tbf, L_AUTH "%s: " msg, p->p.name, args) @@ -49,21 +55,27 @@ static inline int ge_mod64k(uint a, uint b) { return (u16)(a - b) < 0x8000; } +/* Strict inequality version of the above */ +static inline int gt_mod64k(uint a, uint b) +{ return ge_mod64k(a, b) && a != b; } + static void babel_expire_requests(struct babel_proto *p, struct babel_entry *e); static void babel_select_route(struct babel_proto *p, struct babel_entry *e, struct babel_route *mod); static inline void babel_announce_retraction(struct babel_proto *p, struct babel_entry *e); static void babel_send_route_request(struct babel_proto *p, struct babel_entry *e, struct babel_neighbor *n); -static void babel_send_seqno_request(struct babel_proto *p, struct babel_entry *e, struct babel_seqno_request *sr); +static void babel_send_seqno_request(struct babel_proto *p, struct babel_entry *e, struct babel_seqno_request *sr, struct babel_neighbor *n); static void babel_update_cost(struct babel_neighbor *n); static inline void babel_kick_timer(struct babel_proto *p); static inline void babel_iface_kick_timer(struct babel_iface *ifa); +static struct ea_class ea_babel_metric, ea_babel_router_id, ea_babel_seqno; + /* * Functions to maintain data structures */ static void -babel_init_entry(void *E) +babel_init_entry(struct fib *f UNUSED, void *E) { struct babel_entry *e = E; @@ -101,7 +113,8 @@ babel_find_source(struct babel_entry *e, u64 router_id) } static struct babel_source * -babel_get_source(struct babel_proto *p, struct babel_entry *e, u64 router_id) +babel_get_source(struct babel_proto *p, struct babel_entry *e, u64 router_id, + u16 initial_seqno) { struct babel_source *s = babel_find_source(e, router_id); @@ -111,7 +124,7 @@ babel_get_source(struct babel_proto *p, struct babel_entry *e, u64 router_id) s = sl_allocz(p->source_slab); s->router_id = router_id; s->expires = current_time() + BABEL_GARBAGE_INTERVAL; - s->seqno = 0; + s->seqno = initial_seqno; s->metric = BABEL_INFINITY; add_tail(&e->sources, NODE s); @@ -119,7 +132,7 @@ babel_get_source(struct babel_proto *p, struct babel_entry *e, u64 router_id) } static void -babel_expire_sources(struct babel_proto *p, struct babel_entry *e) +babel_expire_sources(struct babel_proto *p UNUSED, struct babel_entry *e) { struct babel_source *n, *nx; btime now_ = current_time(); @@ -129,7 +142,7 @@ babel_expire_sources(struct babel_proto *p, struct babel_entry *e) if (n->expires && n->expires <= now_) { rem_node(NODE n); - sl_free(p->source_slab, n); + sl_free(n); } } } @@ -174,7 +187,7 @@ babel_retract_route(struct babel_proto *p, struct babel_route *r) } static void -babel_flush_route(struct babel_proto *p, struct babel_route *r) +babel_flush_route(struct babel_proto *p UNUSED, struct babel_route *r) { DBG("Babel: Flush route %N router_id %lR neigh %I\n", r->e->n.addr, r->router_id, r->neigh->addr); @@ -185,7 +198,7 @@ babel_flush_route(struct babel_proto *p, struct babel_route *r) if (r->e->selected == r) r->e->selected = NULL; - sl_free(p->route_slab, r); + sl_free(r); } static void @@ -288,61 +301,92 @@ babel_expire_routes(struct babel_proto *p) babel_expire_routes_(p, &p->ip6_rtable); } -static inline int seqno_request_valid(struct babel_seqno_request *sr) -{ return !sr->nbr || sr->nbr->ifa; } - /* - * Add seqno request to the table of pending requests (RFC 6216 3.2.6) and send + * Add seqno request to the table of pending requests (RFC 8966 3.2.6) and send * it to network. Do nothing if it is already in the table. */ static void babel_add_seqno_request(struct babel_proto *p, struct babel_entry *e, u64 router_id, u16 seqno, u8 hop_count, - struct babel_neighbor *nbr) + struct babel_neighbor *target) { struct babel_seqno_request *sr; + btime now_ = current_time(); WALK_LIST(sr, e->requests) if (sr->router_id == router_id) { - /* Found matching or newer */ - if (ge_mod64k(sr->seqno, seqno) && seqno_request_valid(sr)) + /* + * To suppress duplicates, check if we already have a newer (higher seqno) + * outstanding request. If we do, suppress this request if the outstanding + * request is one we originated ourselves. If the outstanding request is + * forwarded, suppress only if this request is also one we're forwarding + * *and* we're within the duplicate suppression time of that request (see + * below). + */ + if (ge_mod64k(sr->seqno, seqno) && + (!sr->forwarded || (target && now_ < sr->dup_suppress_time))) return; - /* Found older */ rem_node(NODE sr); - rem_node(&sr->nbr_node); + + /* Allow upgrading from forwarded to non-forwarded */ + if (!target) + sr->forwarded = 0; goto found; } /* No entries found */ sr = sl_allocz(p->seqno_slab); + sr->forwarded = !!target; found: sr->router_id = router_id; sr->seqno = seqno; - sr->hop_count = hop_count; + sr->hop_count = hop_count ?: BABEL_INITIAL_HOP_COUNT; sr->count = 0; - sr->expires = current_time() + BABEL_SEQNO_REQUEST_EXPIRY; - if (sr->nbr = nbr) - add_tail(&nbr->requests, &sr->nbr_node); + if (sr->forwarded) + { + /* + * We want to keep the entry around for a reasonable period of time so it + * can be used to trigger an update (through babel_satisfy_seqno_request()). + * However, duplicate suppression should only trigger for a short period of + * time so it suppresses duplicates from multiple sources, but not + * retransmissions from the same source. Hence we keep two timers. + */ + sr->expires = now_ + BABEL_SEQNO_FORWARD_EXPIRY; + sr->dup_suppress_time = now_ + BABEL_SEQNO_DUP_SUPPRESS_TIME; + } + else + { + sr->expires = now_ + BABEL_SEQNO_REQUEST_EXPIRY; + } add_tail(&e->requests, NODE sr); - - babel_send_seqno_request(p, e, sr); + babel_send_seqno_request(p, e, sr, target); } static void -babel_remove_seqno_request(struct babel_proto *p, struct babel_seqno_request *sr) +babel_generate_seqno_request(struct babel_proto *p, struct babel_entry *e, + u64 router_id, u16 seqno, struct babel_neighbor *target) { - if (sr->nbr) - rem_node(&sr->nbr_node); + struct babel_seqno_request req = { + .router_id = router_id, + .seqno = seqno, + .hop_count = BABEL_INITIAL_HOP_COUNT, + }; + babel_send_seqno_request(p, e, &req, target); +} + +static void +babel_remove_seqno_request(struct babel_proto *p UNUSED, struct babel_seqno_request *sr) +{ rem_node(NODE sr); - sl_free(p->seqno_slab, sr); + sl_free(sr); } static int @@ -370,21 +414,14 @@ babel_expire_requests(struct babel_proto *p, struct babel_entry *e) WALK_LIST_DELSAFE(sr, srx, e->requests) { - /* Remove seqno requests sent to dead neighbors */ - if (!seqno_request_valid(sr)) - { - babel_remove_seqno_request(p, sr); - continue; - } - /* Handle expired requests - resend or remove */ if (sr->expires && sr->expires <= now_) { - if (sr->count < BABEL_SEQNO_REQUEST_RETRY) + if (!sr->forwarded && sr->count < BABEL_SEQNO_REQUEST_RETRY) { sr->count++; sr->expires += (BABEL_SEQNO_REQUEST_EXPIRY << sr->count); - babel_send_seqno_request(p, e, sr); + babel_send_seqno_request(p, e, sr, NULL); } else { @@ -429,7 +466,6 @@ babel_get_neighbor(struct babel_iface *ifa, ip_addr addr) nbr->cost = BABEL_INFINITY; nbr->init_expiry = current_time() + BABEL_INITIAL_NEIGHBOR_TIMEOUT; init_list(&nbr->routes); - init_list(&nbr->requests); add_tail(&ifa->neigh_list, NODE nbr); return nbr; @@ -450,13 +486,6 @@ babel_flush_neighbor(struct babel_proto *p, struct babel_neighbor *nbr) babel_flush_route(p, r); } - struct babel_seqno_request *sr; - WALK_LIST_FIRST2(sr, nbr_node, nbr->requests) - { - sr->nbr = NULL; - rem_node(&sr->nbr_node); - } - nbr->ifa = NULL; rem_node(NODE nbr); mb_free(nbr); @@ -543,7 +572,7 @@ babel_is_feasible(struct babel_source *s, u16 seqno, u16 metric) { return !s || (metric == BABEL_INFINITY) || - (seqno > s->seqno) || + gt_mod64k(seqno, s->seqno) || ((seqno == s->seqno) && (metric < s->metric)); } @@ -640,37 +669,14 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) if (r) { - rta a0 = { - .source = RTS_BABEL, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, - .pref = c->preference, - .from = r->neigh->addr, - .nh.gw = r->next_hop, - .nh.iface = r->neigh->ifa->iface, - .eattrs = alloca(sizeof(ea_list) + 3*sizeof(eattr)), - }; - - *a0.eattrs = (ea_list) { .count = 3 }; - a0.eattrs->attrs[0] = (eattr) { - .id = EA_BABEL_METRIC, - .type = EAF_TYPE_INT, - .u.data = r->metric, - }; - - struct adata *ad = alloca(sizeof(struct adata) + sizeof(u64)); - ad->length = sizeof(u64); - memcpy(ad->data, &(r->router_id), sizeof(u64)); - a0.eattrs->attrs[1] = (eattr) { - .id = EA_BABEL_ROUTER_ID, - .type = EAF_TYPE_OPAQUE, - .u.ptr = ad, - }; - - a0.eattrs->attrs[2] = (eattr) { - .id = EA_BABEL_SEQNO, - .type = EAF_TYPE_INT, - .u.data = r->seqno, + struct nexthop_adata nhad = { + .nh = { + .gw = r->next_hop, + .iface = r->neigh->ifa->iface, + }, + .ad = { + .length = sizeof nhad - sizeof nhad.ad, + }, }; /* @@ -679,10 +685,26 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) * have routing work. */ if (!neigh_find(&p->p, r->next_hop, r->neigh->ifa->iface, 0)) - a0.nh.flags = RNF_ONLINK; + nhad.nh.flags = RNF_ONLINK; + + struct { + ea_list l; + eattr a[7]; + } eattrs = { + .l.count = ARRAY_SIZE(eattrs.a), + .a = { + EA_LITERAL_EMBEDDED(&ea_gen_preference, 0, c->preference), + EA_LITERAL_STORE_ADATA(&ea_gen_from, 0, &r->neigh->addr, sizeof(r->neigh->addr)), + EA_LITERAL_EMBEDDED(&ea_gen_source, 0, RTS_BABEL), + EA_LITERAL_STORE_ADATA(&ea_gen_nexthop, 0, nhad.ad.data, nhad.ad.length), + EA_LITERAL_EMBEDDED(&ea_babel_metric, 0, r->metric), + EA_LITERAL_STORE_ADATA(&ea_babel_router_id, 0, &r->router_id, sizeof(r->router_id)), + EA_LITERAL_EMBEDDED(&ea_babel_seqno, 0, r->seqno), + } + }; rte e0 = { - .attrs = &a0, + .attrs = &eattrs.l, .src = p->p.main_source, }; @@ -692,15 +714,14 @@ babel_announce_rte(struct babel_proto *p, struct babel_entry *e) else if (e->valid && (e->router_id != p->router_id)) { /* Unreachable */ - rta a0 = { - .source = RTS_BABEL, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNREACHABLE, - .pref = 1, - }; + ea_list *ea = NULL; + + ea_set_attr_u32(&ea, &ea_gen_preference, 0, 1); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_BABEL); + ea_set_dest(&ea, 0, RTD_UNREACHABLE); rte e0 = { - .attrs = &a0, + .attrs = ea, .src = p->p.main_source, }; @@ -862,14 +883,14 @@ babel_send_ihus(struct babel_iface *ifa) } static void -babel_send_hello(struct babel_iface *ifa) +babel_send_hello(struct babel_iface *ifa, uint interval) { struct babel_proto *p = ifa->proto; union babel_msg msg = {}; msg.type = BABEL_TLV_HELLO; msg.hello.seqno = ifa->hello_seqno++; - msg.hello.interval = ifa->cf->hello_interval; + msg.hello.interval = interval ?: ifa->cf->hello_interval; TRACE(D_PACKETS, "Sending hello on %s with seqno %d interval %t", ifa->ifname, msg.hello.seqno, (btime) msg.hello.interval); @@ -907,22 +928,23 @@ babel_send_wildcard_request(struct babel_iface *ifa) } static void -babel_send_seqno_request(struct babel_proto *p, struct babel_entry *e, struct babel_seqno_request *sr) +babel_send_seqno_request(struct babel_proto *p, struct babel_entry *e, + struct babel_seqno_request *sr, struct babel_neighbor *n) { union babel_msg msg = {}; msg.type = BABEL_TLV_SEQNO_REQUEST; - msg.seqno_request.hop_count = sr->hop_count ?: BABEL_INITIAL_HOP_COUNT; + msg.seqno_request.hop_count = sr->hop_count; msg.seqno_request.seqno = sr->seqno; msg.seqno_request.router_id = sr->router_id; net_copy(&msg.seqno_request.net, e->n.addr); - if (sr->nbr) + if (n) { TRACE(D_PACKETS, "Sending seqno request for %N router-id %lR seqno %d to %I on %s", - e->n.addr, sr->router_id, sr->seqno, sr->nbr->addr, sr->nbr->ifa->ifname); + e->n.addr, sr->router_id, sr->seqno, n->addr, n->ifa->ifname); - babel_send_unicast(&msg, sr->nbr->ifa, sr->nbr->addr); + babel_send_unicast(&msg, n->ifa, n->addr); } else { @@ -985,8 +1007,18 @@ babel_send_update_(struct babel_iface *ifa, btime changed, struct fib *rtable) msg.update.router_id = e->router_id; net_copy(&msg.update.net, e->n.addr); - msg.update.next_hop = ((e->n.addr->type == NET_IP4) ? - ifa->next_hop_ip4 : ifa->next_hop_ip6); + if (e->n.addr->type == NET_IP4) + { + /* Always prefer IPv4 nexthop if set */ + if (ipa_nonzero(ifa->next_hop_ip4)) + msg.update.next_hop = ifa->next_hop_ip4; + + /* Only send IPv6 nexthop if enabled */ + else if (ifa->cf->ext_next_hop) + msg.update.next_hop = ifa->next_hop_ip6; + } + else + msg.update.next_hop = ifa->next_hop_ip6; /* Do not send route if next hop is unknown, e.g. no configured IPv4 address */ if (ipa_zero(msg.update.next_hop)) @@ -994,13 +1026,13 @@ babel_send_update_(struct babel_iface *ifa, btime changed, struct fib *rtable) babel_enqueue(&msg, ifa); - /* Update feasibility distance for redistributed routes */ + /* RFC 8966 3.7.3 - update feasibility distance for redistributed routes */ if (e->router_id != p->router_id) { - struct babel_source *s = babel_get_source(p, e, e->router_id); + struct babel_source *s = babel_get_source(p, e, e->router_id, msg.update.seqno); s->expires = current_time() + BABEL_GARBAGE_INTERVAL; - if ((msg.update.seqno > s->seqno) || + if (gt_mod64k(msg.update.seqno, s->seqno) || ((msg.update.seqno == s->seqno) && (msg.update.metric < s->metric))) { s->seqno = msg.update.seqno; @@ -1245,6 +1277,13 @@ babel_handle_update(union babel_msg *m, struct babel_iface *ifa) return; } + /* Reject IPv4 via IPv6 routes if disabled */ + if ((msg->net.type == NET_IP4) && ipa_is_ip6(msg->next_hop) && !ifa->cf->ext_next_hop) + { + DBG("Babel: Ignoring disabled IPv4 via IPv6 route.\n"); + return; + } + /* Retraction */ if (msg->metric == BABEL_INFINITY) { @@ -1289,10 +1328,14 @@ babel_handle_update(union babel_msg *m, struct babel_iface *ifa) metric = babel_compute_metric(nbr, msg->metric); best = e->selected; - /* RFC section 3.8.2.2 - Dealing with unfeasible updates */ - if (!feasible && (metric != BABEL_INFINITY) && + /* + * RFC 8966 3.8.2.2 - dealing with unfeasible updates. Generate a one-off + * (not retransmitted) unicast seqno request to the originator of this update. + * Note: !feasible -> s exists, check for 's' is just for clarity / safety. + */ + if (!feasible && s && (metric != BABEL_INFINITY) && (!best || (r == best) || (metric < best->metric))) - babel_add_seqno_request(p, e, s->router_id, s->seqno + 1, 0, nbr); + babel_generate_seqno_request(p, e, s->router_id, s->seqno + 1, nbr); /* Special case - ignore unfeasible update to best route */ if (r == best && !feasible && (msg->router_id == r->router_id)) @@ -1331,7 +1374,7 @@ babel_handle_route_request(union babel_msg *m, struct babel_iface *ifa) struct babel_proto *p = ifa->proto; struct babel_msg_route_request *msg = &m->route_request; - /* RFC 6126 3.8.1.1 */ + /* RFC 8966 3.8.1.1 */ /* Wildcard request - full update on the interface */ if (msg->full) @@ -1357,13 +1400,39 @@ babel_handle_route_request(union babel_msg *m, struct babel_iface *ifa) } } +static struct babel_neighbor * +babel_find_seqno_request_target(struct babel_entry *e, struct babel_neighbor *skip) +{ + struct babel_route *r, *best_feasible = NULL, *best_any = NULL; + + WALK_LIST(r, e->routes) + { + if (r->neigh == skip) + continue; + + if (r->feasible && (!best_feasible || r->metric < best_feasible->metric)) + best_feasible = r; + + if (!best_any || r->metric < best_any->metric) + best_any = r; + } + + if (best_feasible) + return best_feasible->neigh; + + if (best_any) + return best_any->neigh; + + return NULL; +} + void babel_handle_seqno_request(union babel_msg *m, struct babel_iface *ifa) { struct babel_proto *p = ifa->proto; struct babel_msg_seqno_request *msg = &m->seqno_request; - /* RFC 6126 3.8.1.2 */ + /* RFC 8966 3.8.1.2 */ TRACE(D_PACKETS, "Handling seqno request for %N router-id %lR seqno %d hop count %d", &msg->net, msg->router_id, msg->seqno, msg->hop_count); @@ -1393,26 +1462,21 @@ babel_handle_seqno_request(union babel_msg *m, struct babel_iface *ifa) { /* Not ours; forward if TTL allows it */ - /* Find best admissible route */ - struct babel_route *r, *best1 = NULL, *best2 = NULL; - WALK_LIST(r, e->routes) - if ((r->router_id == msg->router_id) && !ipa_equal(r->neigh->addr, msg->sender)) - { - /* Find best feasible route */ - if ((!best1 || r->metric < best1->metric) && r->feasible) - best1 = r; + struct babel_neighbor *nbr, *target; - /* Find best not necessary feasible route */ - if (!best2 || r->metric < best2->metric) - best2 = r; - } + nbr = babel_find_neighbor(ifa, msg->sender); + if (!nbr) + return; - /* If no route is found, do nothing */ - r = best1 ?: best2; - if (!r) + target = babel_find_seqno_request_target(e, nbr); + if (!target) + { + TRACE(D_PACKETS, "No neighbor to forward seqno request for %N router-id %lR seqno %d to", + e->n.addr, msg->router_id, msg->seqno); return; + } - babel_add_seqno_request(p, e, msg->router_id, msg->seqno, msg->hop_count-1, r->neigh); + babel_add_seqno_request(p, e, msg->router_id, msg->seqno, msg->hop_count-1, target); } } @@ -1506,7 +1570,8 @@ babel_auth_check_pc(struct babel_iface *ifa, struct babel_msg_auth *msg) n->auth_index_len = msg->index_len; memcpy(n->auth_index, msg->index, msg->index_len); - n->auth_pc = msg->pc; + n->auth_pc_unicast = msg->pc; + n->auth_pc_multicast = msg->pc; n->auth_passed = 1; return 1; @@ -1525,16 +1590,30 @@ babel_auth_check_pc(struct babel_iface *ifa, struct babel_msg_auth *msg) return 0; } - /* (6) Index matches; only accept if PC is greater than last */ - if (n->auth_pc >= msg->pc) + /* + * (6) Index matches; only accept if PC is greater than last. We keep separate + * counters for unicast and multicast because multicast packets can be delayed + * significantly on wireless networks (enough to be received out of order). + * Separate counters are safe because the packet destination address is part + * of the MAC pseudo-header (so unicast packets can't be replayed as multicast + * and vice versa). + */ + u32 auth_pc = msg->unicast ? n->auth_pc_unicast : n->auth_pc_multicast; + if (auth_pc >= msg->pc) { LOG_PKT_AUTH("Authentication failed for %I on %s - " - "lower packet counter (rcv %u, old %u)", - msg->sender, ifa->ifname, msg->pc, n->auth_pc); + "lower %s packet counter (rcv %u, old %u)", + msg->sender, ifa->ifname, + msg->unicast ? "unicast" : "multicast", + msg->pc, auth_pc); return 0; } - n->auth_pc = msg->pc; + if (msg->unicast) + n->auth_pc_unicast = msg->pc; + else + n->auth_pc_multicast = msg->pc; + n->auth_passed = 1; return 1; @@ -1577,7 +1656,7 @@ babel_iface_timer(timer *t) if (now_ >= ifa->next_hello) { - babel_send_hello(ifa); + babel_send_hello(ifa, 0); ifa->next_hello += hello_period * (1 + (now_ - ifa->next_hello) / hello_period); } @@ -1624,7 +1703,7 @@ babel_iface_start(struct babel_iface *ifa) tm_start(ifa->timer, 100 MS); ifa->up = 1; - babel_send_hello(ifa); + babel_send_hello(ifa, 0); babel_send_wildcard_retraction(ifa); babel_send_wildcard_request(ifa); babel_send_update(ifa, 0); /* Full update */ @@ -1686,7 +1765,7 @@ babel_iface_update_addr4(struct babel_iface *ifa) ip_addr addr4 = ifa->iface->addr4 ? ifa->iface->addr4->ip : IPA_NONE; ifa->next_hop_ip4 = ipa_nonzero(ifa->cf->next_hop_ip4) ? ifa->cf->next_hop_ip4 : addr4; - if (ipa_zero(ifa->next_hop_ip4) && p->ip4_channel) + if (ipa_zero(ifa->next_hop_ip4) && p->ip4_channel && !ifa->cf->ext_next_hop) log(L_WARN "%s: Missing IPv4 next hop address for %s", p->p.name, ifa->ifname); if (ifa->up) @@ -1725,9 +1804,9 @@ babel_find_iface(struct babel_proto *p, struct iface *what) } static void -babel_iface_locked(struct object_lock *lock) +babel_iface_locked(void *_ifa) { - struct babel_iface *ifa = lock->data; + struct babel_iface *ifa = _ifa; struct babel_proto *p = ifa->proto; if (!babel_open_socket(ifa)) @@ -1747,7 +1826,7 @@ babel_add_iface(struct babel_proto *p, struct iface *new, struct babel_iface_con TRACE(D_EVENTS, "Adding interface %s", new->name); - pool *pool = rp_new(p->p.pool, p->p.loop, new->name); + pool *pool = rp_new(p->p.pool, new->name); ifa = mb_allocz(pool, sizeof(struct babel_iface)); ifa->proto = p; @@ -1763,8 +1842,8 @@ babel_add_iface(struct babel_proto *p, struct iface *new, struct babel_iface_con ifa->next_hop_ip4 = ipa_nonzero(ic->next_hop_ip4) ? ic->next_hop_ip4 : addr4; ifa->next_hop_ip6 = ipa_nonzero(ic->next_hop_ip6) ? ic->next_hop_ip6 : ifa->addr; - if (ipa_zero(ifa->next_hop_ip4) && p->ip4_channel) - log(L_WARN "%s: Missing IPv4 next hop address for %s", p->p.name, new->name); + if (ipa_zero(ifa->next_hop_ip4) && p->ip4_channel && !ic->ext_next_hop) + log(L_WARN "%s: Missing IPv4 next hop address for %s", p->p.name, ifa->ifname); init_list(&ifa->neigh_list); ifa->hello_seqno = 1; @@ -1782,8 +1861,11 @@ babel_add_iface(struct babel_proto *p, struct iface *new, struct babel_iface_con lock->addr = IP6_BABEL_ROUTERS; lock->port = ifa->cf->port; lock->iface = ifa->iface; - lock->hook = babel_iface_locked; - lock->data = ifa; + lock->event = (event) { + .hook = babel_iface_locked, + .data = ifa, + }; + lock->target = &global_event_list; olock_acquire(lock); } @@ -1799,7 +1881,7 @@ babel_remove_iface(struct babel_proto *p, struct babel_iface *ifa) rem_node(NODE ifa); - rp_free(ifa->pool, p->p.pool); /* contains ifa itself, locks, socket, etc */ + rfree(ifa->pool); /* contains ifa itself, locks, socket, etc */ } static int @@ -1884,7 +1966,7 @@ babel_reconfigure_iface(struct babel_proto *p, struct babel_iface *ifa, struct b if ((new->auth_type != BABEL_AUTH_NONE) && (new->auth_type != old->auth_type)) babel_auth_reset_index(ifa); - if (ipa_zero(ifa->next_hop_ip4) && p->ip4_channel) + if (ipa_zero(ifa->next_hop_ip4) && p->ip4_channel && !new->ext_next_hop) log(L_WARN "%s: Missing IPv4 next hop address for %s", p->p.name, ifa->ifname); if (ifa->next_hello > (current_time() + new->hello_interval)) @@ -1905,11 +1987,11 @@ babel_reconfigure_iface(struct babel_proto *p, struct babel_iface *ifa, struct b static void babel_reconfigure_ifaces(struct babel_proto *p, struct babel_config *cf) { - struct iface *iface; - - IFACE_LEGACY_ACCESS; - WALK_LIST(iface, global_iface_list) + IFACE_WALK(iface) { + if (p->p.vrf && p->p.vrf != iface->master) + continue; + if (!(iface->flags & IF_UP)) continue; @@ -1920,7 +2002,7 @@ babel_reconfigure_ifaces(struct babel_proto *p, struct babel_config *cf) struct babel_iface *ifa = babel_find_iface(p, iface); struct babel_iface_config *ic = (void *) iface_patt_find(&cf->iface_list, iface, NULL); - if (ic && iface_is_valid(p, iface)) + if (ic && !iface_is_valid(p, iface)) ic = NULL; if (ifa && ic) @@ -2029,41 +2111,45 @@ babel_dump(struct proto *P) } static void -babel_get_route_info(rte *rte, byte *buf) +babel_get_route_info(const rte *rte, byte *buf) { u64 rid = 0; - eattr *e = ea_find(rte->attrs->eattrs, EA_BABEL_ROUTER_ID); + eattr *e = ea_find(rte->attrs, &ea_babel_router_id); if (e) memcpy(&rid, e->u.ptr->data, sizeof(u64)); - buf += bsprintf(buf, " (%d/%d) [%lR]", rte->attrs->pref, - ea_get_int(rte->attrs->eattrs, EA_BABEL_METRIC, BABEL_INFINITY), rid); + buf += bsprintf(buf, " (%d/%d) [%lR]", + rt_get_preference(rte), + ea_get_int(rte->attrs, &ea_babel_metric, BABEL_INFINITY), rid); } -static int -babel_get_attr(const eattr *a, byte *buf, int buflen UNUSED) +static void +babel_router_id_format(const eattr *a, byte *buf, uint len) { - switch (a->id) - { - case EA_BABEL_SEQNO: - return GA_FULL; + u64 rid = 0; + memcpy(&rid, a->u.ptr->data, sizeof(u64)); + bsnprintf(buf, len, "%lR", rid); +} - case EA_BABEL_METRIC: - bsprintf(buf, "metric: %d", a->u.data); - return GA_FULL; +static struct ea_class ea_babel_metric = { + .name = "babel_metric", + .type = T_INT, +}; - case EA_BABEL_ROUTER_ID: - { - u64 rid = 0; - memcpy(&rid, a->u.ptr->data, sizeof(u64)); - bsprintf(buf, "router_id: %lR", rid); - return GA_FULL; - } +static struct ea_class ea_babel_router_id = { + .name = "babel_router_id", + .type = T_OPAQUE, + .readonly = 1, + .format = babel_router_id_format, +}; + +static struct ea_class ea_babel_seqno = { + .name = "babel_seqno", + .type = T_INT, + .readonly = 1, + .hidden = 1, +}; - default: - return GA_UNKNOWN; - } -} void babel_show_interfaces(struct proto *P, const char *iff) @@ -2261,11 +2347,15 @@ babel_kick_timer(struct babel_proto *p) static int -babel_preexport(struct channel *c, struct rte *new) +babel_preexport(struct channel *C, struct rte *new) { - struct rta *a = new->attrs; + if (new->src->owner != &C->proto->sources) + return 0; + /* Reject our own unreachable routes */ - if ((a->dest == RTD_UNREACHABLE) && (new->src->owner == &c->proto->sources)) + eattr *ea = ea_find(new->attrs, &ea_gen_nexthop); + struct nexthop_adata *nhad = (void *) ea->u.ptr; + if (!NEXTHOP_IS_REACHABLE(nhad)) return -1; return 0; @@ -2286,13 +2376,13 @@ babel_rt_notify(struct proto *P, struct channel *c UNUSED, const net_addr *net, { /* Update */ uint rt_seqno; - uint rt_metric = ea_get_int(new->attrs->eattrs, EA_BABEL_METRIC, 0); + uint rt_metric = ea_get_int(new->attrs, &ea_babel_metric, 0); u64 rt_router_id = 0; if (new->src->owner == &P->sources) { - rt_seqno = ea_find(new->attrs->eattrs, EA_BABEL_SEQNO)->u.data; - eattr *e = ea_find(new->attrs->eattrs, EA_BABEL_ROUTER_ID); + rt_seqno = ea_get_int(new->attrs, &ea_babel_seqno, 0); + eattr *e = ea_find(new->attrs, &ea_babel_router_id); if (e) memcpy(&rt_router_id, e->u.ptr->data, sizeof(u64)); } @@ -2341,18 +2431,18 @@ babel_rt_notify(struct proto *P, struct channel *c UNUSED, const net_addr *net, } static int -babel_rte_better(struct rte *new, struct rte *old) +babel_rte_better(const rte *new, const rte *old) { - uint new_metric = ea_find(new->attrs->eattrs, EA_BABEL_SEQNO)->u.data; - uint old_metric = ea_find(old->attrs->eattrs, EA_BABEL_SEQNO)->u.data; + uint new_metric = ea_get_int(new->attrs, &ea_babel_metric, BABEL_INFINITY); + uint old_metric = ea_get_int(old->attrs, &ea_babel_metric, BABEL_INFINITY); return new_metric < old_metric; } static u32 -babel_rte_igp_metric(struct rte *rt) +babel_rte_igp_metric(const rte *rt) { - return ea_get_int(rt->attrs->eattrs, EA_BABEL_METRIC, BABEL_INFINITY); + return ea_get_int(rt->attrs, &ea_babel_metric, BABEL_INFINITY); } @@ -2389,7 +2479,7 @@ babel_init(struct proto_config *CF) proto_configure_channel(P, &p->ip4_channel, cf->ip4_channel); proto_configure_channel(P, &p->ip6_channel, cf->ip6_channel); - P->if_notify = babel_if_notify; + P->iface_sub.if_notify = babel_if_notify; P->rt_notify = babel_rt_notify; P->preexport = babel_preexport; @@ -2442,6 +2532,11 @@ babel_iface_shutdown(struct babel_iface *ifa) { if (ifa->sk) { + /* + * Retract all our routes and lower the hello interval so peers' neighbour + * state expires quickly + */ + babel_send_hello(ifa, BABEL_MIN_INTERVAL); babel_send_wildcard_retraction(ifa); babel_send_queue(ifa); } @@ -2489,7 +2584,6 @@ babel_reconfigure(struct proto *P, struct proto_config *CF) struct protocol proto_babel = { .name = "Babel", .template = "babel%d", - .class = PROTOCOL_BABEL, .preference = DEF_PREF_BABEL, .channel_mask = NB_IP | NB_IP6_SADR, .proto_size = sizeof(struct babel_proto), @@ -2500,5 +2594,16 @@ struct protocol proto_babel = { .start = babel_start, .shutdown = babel_shutdown, .reconfigure = babel_reconfigure, - .get_attr = babel_get_attr }; + +void +babel_build(void) +{ + proto_build(&proto_babel); + + EA_REGISTER_ALL( + &ea_babel_metric, + &ea_babel_router_id, + &ea_babel_seqno + ); +} diff --git a/proto/babel/babel.h b/proto/babel/babel.h index 8b6da3c8..562abac2 100644 --- a/proto/babel/babel.h +++ b/proto/babel/babel.h @@ -16,7 +16,7 @@ #include "nest/bird.h" #include "nest/cli.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/locks.h" #include "nest/password.h" @@ -26,10 +26,6 @@ #include "lib/string.h" #include "lib/timer.h" -#define EA_BABEL_METRIC EA_CODE(PROTOCOL_BABEL, 0) -#define EA_BABEL_ROUTER_ID EA_CODE(PROTOCOL_BABEL, 1) -#define EA_BABEL_SEQNO EA_CODE(PROTOCOL_BABEL, 2) - #define BABEL_MAGIC 42 #define BABEL_VERSION 2 #define BABEL_PORT 6696 @@ -48,6 +44,8 @@ #define BABEL_ROUTE_REFRESH_FACTOR(X) ((btime)(X)*5/2) /* 2.5 */ #define BABEL_SEQNO_REQUEST_RETRY 4 #define BABEL_SEQNO_REQUEST_EXPIRY (2 S_) +#define BABEL_SEQNO_FORWARD_EXPIRY (10 S_) +#define BABEL_SEQNO_DUP_SUPPRESS_TIME (1 S_) #define BABEL_GARBAGE_INTERVAL (300 S_) #define BABEL_RXCOST_WIRED 96 #define BABEL_RXCOST_WIRELESS 256 @@ -112,6 +110,7 @@ enum babel_ae_type { BABEL_AE_IP4 = 1, BABEL_AE_IP6 = 2, BABEL_AE_IP6_LL = 3, + BABEL_AE_IP4_VIA_IP6 = 4, BABEL_AE_MAX }; @@ -145,8 +144,9 @@ struct babel_iface_config { ip_addr next_hop_ip4; ip_addr next_hop_ip6; + u8 ext_next_hop; /* Enable IPv4 via IPv6 */ - u8 auth_type; /* Authentication type (BABEL_AUTH_*) */ + u8 auth_type; /* Authentication type (BABEL_AUTH_*) */ u8 auth_permissive; /* Don't drop packets failing auth check */ uint mac_num_keys; /* Number of configured HMAC keys */ uint mac_total_len; /* Total digest length for all configured keys */ @@ -225,7 +225,8 @@ struct babel_neighbor { u16 next_hello_seqno; uint last_hello_int; - u32 auth_pc; + u32 auth_pc_unicast; + u32 auth_pc_multicast; u8 auth_passed; u8 auth_index_len; u8 auth_index[BABEL_AUTH_INDEX_LEN]; @@ -240,7 +241,6 @@ struct babel_neighbor { btime init_expiry; list routes; /* Routes this neighbour has sent us (struct babel_route) */ - list requests; /* Seqno requests bound to this neighbor */ }; struct babel_source { @@ -270,13 +270,13 @@ struct babel_route { struct babel_seqno_request { node n; - node nbr_node; u64 router_id; u16 seqno; + u8 forwarded; u8 hop_count; u8 count; btime expires; - struct babel_neighbor *nbr; + btime dup_suppress_time; }; struct babel_entry { @@ -404,6 +404,7 @@ struct babel_msg_auth { u8 challenge_seen; u8 challenge_len; u8 challenge[BABEL_AUTH_MAX_NONCE_LEN]; + u8 unicast; }; static inline int babel_sadr_enabled(struct babel_proto *p) diff --git a/proto/babel/config.Y b/proto/babel/config.Y index 05210fa4..6d1ad7d0 100644 --- a/proto/babel/config.Y +++ b/proto/babel/config.Y @@ -24,8 +24,9 @@ CF_DECLS CF_KEYWORDS(BABEL, INTERFACE, METRIC, RXCOST, HELLO, UPDATE, INTERVAL, PORT, TYPE, WIRED, WIRELESS, RX, TX, BUFFER, PRIORITY, LENGTH, CHECK, LINK, - NEXT, HOP, IPV4, IPV6, BABEL_METRIC, SHOW, INTERFACES, NEIGHBORS, - ENTRIES, RANDOMIZE, ROUTER, ID, AUTHENTICATION, NONE, MAC, PERMISSIVE) + NEXT, HOP, IPV4, IPV6, SHOW, INTERFACES, NEIGHBORS, + ENTRIES, RANDOMIZE, ROUTER, ID, AUTHENTICATION, NONE, MAC, PERMISSIVE, + EXTENDED) CF_GRAMMAR @@ -67,6 +68,7 @@ babel_iface_start: BABEL_IFACE->tx_tos = IP_PREC_INTERNET_CONTROL; BABEL_IFACE->tx_priority = sk_priority_control; BABEL_IFACE->check_link = 1; + BABEL_IFACE->ext_next_hop = 1; }; @@ -143,6 +145,7 @@ babel_iface_item: | CHECK LINK bool { BABEL_IFACE->check_link = $3; } | NEXT HOP IPV4 ipa { BABEL_IFACE->next_hop_ip4 = $4; if (!ipa_is_ip4($4)) cf_error("Must be an IPv4 address"); } | NEXT HOP IPV6 ipa { BABEL_IFACE->next_hop_ip6 = $4; if (!ipa_is_ip6($4)) cf_error("Must be an IPv6 address"); } + | EXTENDED NEXT HOP bool { BABEL_IFACE->ext_next_hop = $4; } | AUTHENTICATION NONE { BABEL_IFACE->auth_type = BABEL_AUTH_NONE; } | AUTHENTICATION MAC { BABEL_IFACE->auth_type = BABEL_AUTH_MAC; BABEL_IFACE->auth_permissive = 0; } | AUTHENTICATION MAC PERMISSIVE { BABEL_IFACE->auth_type = BABEL_AUTH_MAC; BABEL_IFACE->auth_permissive = 1; } @@ -163,8 +166,6 @@ babel_iface_opt_list: babel_iface: babel_iface_start iface_patt_list_nopx babel_iface_opt_list babel_iface_finish; -dynamic_attr: BABEL_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_BABEL_METRIC); } ; - CF_CLI_HELP(SHOW BABEL, ..., [[Show information about Babel protocol]]); CF_CLI(SHOW BABEL INTERFACES, optproto opttext, [<name>] [\"<interface>\"], [[Show information about Babel interfaces]]) diff --git a/proto/babel/packets.c b/proto/babel/packets.c index f13410e2..d26ee5c6 100644 --- a/proto/babel/packets.c +++ b/proto/babel/packets.c @@ -166,10 +166,12 @@ struct babel_parse_state { ip_addr next_hop_ip6; u64 router_id; /* Router ID used in subsequent updates */ u8 def_ip6_prefix[16]; /* Implicit IPv6 prefix in network order */ - u8 def_ip4_prefix[4]; /* Implicit IPv4 prefix in network order */ + u8 def_ip4_prefix[4]; /* Implicit IPv4 prefix (AE 1) in network order */ + u8 def_ip4_via_ip6_prefix[4]; /* Implicit IPv4 prefix (AE 4) in network order */ u8 router_id_seen; /* router_id field is valid */ u8 def_ip6_prefix_seen; /* def_ip6_prefix is valid */ u8 def_ip4_prefix_seen; /* def_ip4_prefix is valid */ + u8 def_ip4_via_ip6_prefix_seen; /* def_ip4_via_ip6_prefix is valid */ u8 current_tlv_endpos; /* End of self-terminating TLVs (offset from start) */ u8 sadr_enabled; u8 is_unicast; @@ -515,9 +517,6 @@ babel_read_ihu(struct babel_tlv *hdr, union babel_msg *m, msg->addr = IPA_NONE; msg->sender = state->saddr; - if (msg->ae >= BABEL_AE_MAX) - return PARSE_IGNORE; - /* * We only actually read link-local IPs. In every other case, the addr field * will be 0 but validation will succeed. The handler takes care of these @@ -526,17 +525,20 @@ babel_read_ihu(struct babel_tlv *hdr, union babel_msg *m, */ switch (msg->ae) { + case BABEL_AE_WILDCARD: + return PARSE_SUCCESS; + case BABEL_AE_IP4: if (TLV_OPT_LENGTH(tlv) < 4) return PARSE_ERROR; state->current_tlv_endpos += 4; - break; + return PARSE_SUCCESS; case BABEL_AE_IP6: if (TLV_OPT_LENGTH(tlv) < 16) return PARSE_ERROR; state->current_tlv_endpos += 16; - break; + return PARSE_SUCCESS; case BABEL_AE_IP6_LL: if (TLV_OPT_LENGTH(tlv) < 8) @@ -544,10 +546,17 @@ babel_read_ihu(struct babel_tlv *hdr, union babel_msg *m, msg->addr = ipa_from_ip6(get_ip6_ll(&tlv->addr)); state->current_tlv_endpos += 8; - break; + return PARSE_SUCCESS; + + /* RFC 9229 2.4 - IHU TLV MUST NOT carry the AE 4 (IPv4-via-IPv6) */ + case BABEL_AE_IP4_VIA_IP6: + return PARSE_ERROR; + + default: + return PARSE_IGNORE; } - return PARSE_SUCCESS; + return PARSE_IGNORE; } static uint @@ -640,6 +649,10 @@ babel_read_next_hop(struct babel_tlv *hdr, union babel_msg *m UNUSED, state->current_tlv_endpos += 8; return PARSE_IGNORE; + /* RFC 9229 2.4 - Next Hop TLV MUST NOT carry the AE 4 (IPv4-via-IPv6) */ + case BABEL_AE_IP4_VIA_IP6: + return PARSE_ERROR; + default: return PARSE_IGNORE; } @@ -692,6 +705,42 @@ babel_write_next_hop(struct babel_tlv *hdr, ip_addr addr, return 0; } +/* This is called directly from babel_read_update() to handle + both BABEL_AE_IP4 and BABEL_AE_IP4_VIA_IP6 encodings */ +static int +babel_read_ip4_prefix(struct babel_tlv_update *tlv, struct babel_msg_update *msg, + u8 *def_prefix, u8 *def_prefix_seen, ip_addr next_hop, int len) +{ + if (tlv->plen > IP4_MAX_PREFIX_LENGTH) + return PARSE_ERROR; + + /* Cannot omit data if there is no saved prefix */ + if (tlv->omitted && !*def_prefix_seen) + return PARSE_ERROR; + + /* Update must have next hop, unless it is retraction */ + if (ipa_zero(next_hop) && msg->metric != BABEL_INFINITY) + return PARSE_ERROR; + + /* Merge saved prefix and received prefix parts */ + u8 buf[4] = {}; + memcpy(buf, def_prefix, tlv->omitted); + memcpy(buf + tlv->omitted, tlv->addr, len); + + ip4_addr prefix4 = get_ip4(buf); + net_fill_ip4(&msg->net, prefix4, tlv->plen); + + if (tlv->flags & BABEL_UF_DEF_PREFIX) + { + put_ip4(def_prefix, prefix4); + *def_prefix_seen = 1; + } + + msg->next_hop = next_hop; + + return PARSE_SUCCESS; +} + static int babel_read_update(struct babel_tlv *hdr, union babel_msg *m, struct babel_parse_state *state) @@ -706,11 +755,11 @@ babel_read_update(struct babel_tlv *hdr, union babel_msg *m, /* Length of received prefix data without omitted part */ int len = BYTES(tlv->plen) - (int) tlv->omitted; - u8 buf[16] = {}; if ((len < 0) || ((uint) len > TLV_OPT_LENGTH(tlv))) return PARSE_ERROR; + int rc; switch (tlv->ae) { case BABEL_AE_WILDCARD: @@ -724,31 +773,20 @@ babel_read_update(struct babel_tlv *hdr, union babel_msg *m, break; case BABEL_AE_IP4: - if (tlv->plen > IP4_MAX_PREFIX_LENGTH) - return PARSE_ERROR; - - /* Cannot omit data if there is no saved prefix */ - if (tlv->omitted && !state->def_ip4_prefix_seen) - return PARSE_ERROR; - - /* Update must have next hop, unless it is retraction */ - if (ipa_zero(state->next_hop_ip4) && (msg->metric != BABEL_INFINITY)) - return PARSE_IGNORE; + rc = babel_read_ip4_prefix(tlv, msg, state->def_ip4_prefix, + &state->def_ip4_prefix_seen, + state->next_hop_ip4, len); + if (rc != PARSE_SUCCESS) + return rc; - /* Merge saved prefix and received prefix parts */ - memcpy(buf, state->def_ip4_prefix, tlv->omitted); - memcpy(buf + tlv->omitted, tlv->addr, len); - - ip4_addr prefix4 = get_ip4(buf); - net_fill_ip4(&msg->net, prefix4, tlv->plen); - - if (tlv->flags & BABEL_UF_DEF_PREFIX) - { - put_ip4(state->def_ip4_prefix, prefix4); - state->def_ip4_prefix_seen = 1; - } + break; - msg->next_hop = state->next_hop_ip4; + case BABEL_AE_IP4_VIA_IP6: + rc = babel_read_ip4_prefix(tlv, msg, state->def_ip4_via_ip6_prefix, + &state->def_ip4_via_ip6_prefix_seen, + state->next_hop_ip6, len); + if (rc != PARSE_SUCCESS) + return rc; break; @@ -761,6 +799,7 @@ babel_read_update(struct babel_tlv *hdr, union babel_msg *m, return PARSE_ERROR; /* Merge saved prefix and received prefix parts */ + u8 buf[16] = {}; memcpy(buf, state->def_ip6_prefix, tlv->omitted); memcpy(buf + tlv->omitted, tlv->addr, len); @@ -863,7 +902,7 @@ babel_write_update(struct babel_tlv *hdr, union babel_msg *m, } else if (msg->net.type == NET_IP4) { - tlv->ae = BABEL_AE_IP4; + tlv->ae = ipa_is_ip4(msg->next_hop) ? BABEL_AE_IP4 : BABEL_AE_IP4_VIA_IP6; tlv->plen = net4_pxlen(&msg->net); put_ip4_px(tlv->addr, &msg->net); } @@ -931,7 +970,12 @@ babel_read_route_request(struct babel_tlv *hdr, union babel_msg *m, msg->full = 1; return PARSE_SUCCESS; + /* + * RFC 9229 2.3 - When receiving requests, AE 1 (IPv4) and AE 4 + * (IPv4-via-IPv6) MUST be treated in the same manner. + */ case BABEL_AE_IP4: + case BABEL_AE_IP4_VIA_IP6: if (tlv->plen > IP4_MAX_PREFIX_LENGTH) return PARSE_ERROR; @@ -1032,7 +1076,12 @@ babel_read_seqno_request(struct babel_tlv *hdr, union babel_msg *m, case BABEL_AE_WILDCARD: return PARSE_ERROR; + /* + * RFC 9229 2.3 - When receiving requests, AE 1 (IPv4) and AE 4 + * (IPv4-via-IPv6) MUST be treated in the same manner. + */ case BABEL_AE_IP4: + case BABEL_AE_IP4_VIA_IP6: if (tlv->plen > IP4_MAX_PREFIX_LENGTH) return PARSE_ERROR; @@ -1318,7 +1367,6 @@ babel_send_to(struct babel_iface *ifa, ip_addr dest) static uint babel_write_queue(struct babel_iface *ifa, list *queue) { - struct babel_proto *p = ifa->proto; struct babel_write_state state = { .next_hop_ip6 = ifa->addr }; if (EMPTY_LIST(*queue)) @@ -1346,7 +1394,7 @@ babel_write_queue(struct babel_iface *ifa, list *queue) pos += len; rem_node(NODE msg); - sl_free(p->msg_slab, msg); + sl_free(msg); } pos += babel_auth_add_tlvs(ifa, (struct babel_tlv *) pos, end - pos); @@ -1507,13 +1555,13 @@ babel_process_packet(struct babel_iface *ifa, else if (res == PARSE_IGNORE) { DBG("Babel: Ignoring TLV of type %d\n", tlv->type); - sl_free(p->msg_slab, msg); + sl_free(msg); } else /* PARSE_ERROR */ { LOG_PKT("Bad TLV from %I via %s type %d pos %d - parse error", saddr, ifa->iface->name, tlv->type, (int) ((byte *)tlv - (byte *)pkt)); - sl_free(p->msg_slab, msg); + sl_free(msg); break; } } @@ -1525,7 +1573,7 @@ babel_process_packet(struct babel_iface *ifa, if (tlv_data[msg->msg.type].handle_tlv) tlv_data[msg->msg.type].handle_tlv(&msg->msg, ifa); rem_node(NODE msg); - sl_free(p->msg_slab, msg); + sl_free(msg); } } @@ -1618,7 +1666,7 @@ babel_open_socket(struct babel_iface *ifa) sk->ttl = 1; sk->flags = SKF_LADDR_RX; - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; if (sk_setup_multicast(sk) < 0) @@ -1656,6 +1704,7 @@ babel_read_pc(struct babel_tlv *hdr, union babel_msg *m UNUSED, state->auth.pc_seen = 1; state->auth.index_len = index_len; state->auth.index = tlv->index; + state->auth.unicast = state->is_unicast; state->current_tlv_endpos += index_len; return PARSE_SUCCESS; @@ -2011,7 +2060,7 @@ babel_auth_sign(struct babel_iface *ifa, ip_addr dest) } DBG("Added MAC signatures (%d bytes) on ifa %s for dest %I\n", - tot_len, ifa->ifname, dest); + pos - (pkt + len), ifa->ifname, dest); return pos - (pkt + len); } diff --git a/proto/bfd/bfd.c b/proto/bfd/bfd.c index 479c1510..c88c1cb2 100644 --- a/proto/bfd/bfd.c +++ b/proto/bfd/bfd.c @@ -82,7 +82,7 @@ * BFD thread to the main thread. This is done in an asynchronous way, sesions * with pending notifications are linked (in the BFD thread) to @notify_list in * &bfd_proto, and then bfd_notify_hook() in the main thread is activated using - * bfd_notify_kick() and a pipe. The hook then processes scheduled sessions and + * a standard event sending code. The hook then processes scheduled sessions and * calls hooks from associated BFD requests. This @notify_list (and state fields * in structure &bfd_session) is protected by a spinlock in &bfd_proto and * functions bfd_lock_sessions() / bfd_unlock_sessions(). @@ -113,26 +113,22 @@ #define HASH_IP_EQ(a1,n1,a2,n2) ipa_equal(a1, a2) && n1 == n2 #define HASH_IP_FN(a,n) ipa_hash(a) ^ u32_hash(n) -DEFINE_DOMAIN(rtable); #define BFD_LOCK LOCK_DOMAIN(rtable, bfd_global.lock) #define BFD_UNLOCK UNLOCK_DOMAIN(rtable, bfd_global.lock) -#define BFD_ASSERT_LOCKED ASSERT_DIE(DOMAIN_IS_LOCKED(rtable, bfd_global.lock)) static struct { DOMAIN(rtable) lock; list wait_list; + list pickup_list; list proto_list; + uint pickup_reload; } bfd_global; -static struct bfd_session bfd_admin_down = { .loc = ATOMIC_VAR_INIT((struct bfd_session_state) { .state = BFD_STATE_ADMIN_DOWN }), }; - const char *bfd_state_names[] = { "AdminDown", "Down", "Init", "Up" }; static void bfd_session_set_min_tx(struct bfd_session *s, u32 val); static struct bfd_iface *bfd_get_iface(struct bfd_proto *p, ip_addr local, struct iface *iface); static void bfd_free_iface(struct bfd_iface *ifa); -static void bfd_remove_session(struct bfd_proto *p, struct bfd_session *s); -static void bfd_reconfigure_session_hook(void *vsession); /* @@ -151,57 +147,37 @@ bfd_merge_options(const struct bfd_iface_config *cf, const struct bfd_options *o }; } -static int +static void bfd_session_update_state(struct bfd_session *s, uint state, uint diag) { struct bfd_proto *p = s->ifa->bfd; - uint old_state = BFD_LOC_STATE(s).state; + uint old_state = s->loc_state; + int notify; if (state == old_state) - { - if (current_time() > s->last_reqlist_check + 5 S) - { - BFD_LOCK; - if (EMPTY_LIST(s->request_list)) - { - bfd_remove_session(p, s); - BFD_UNLOCK; - return 1; - } - - s->last_reqlist_check = current_time(); - BFD_UNLOCK; - } - return 0; - } + return; TRACE(D_EVENTS, "Session to %I changed state from %s to %s", s->addr, bfd_state_names[old_state], bfd_state_names[state]); - atomic_store_explicit(&s->loc, ((struct bfd_session_state) { .state = state, .diag = diag }), memory_order_release); + bfd_lock_sessions(p); + s->loc_state = state; + s->loc_diag = diag; s->last_state_change = current_time(); + notify = !NODE_VALID(&s->n); + if (notify) + add_tail(&p->notify_list, &s->n); + bfd_unlock_sessions(p); + if (state == BFD_STATE_UP) bfd_session_set_min_tx(s, s->cf.min_tx_int); if (old_state == BFD_STATE_UP) bfd_session_set_min_tx(s, s->cf.idle_tx_int); - BFD_LOCK; - if (EMPTY_LIST(s->request_list)) - { - bfd_remove_session(p, s); - BFD_UNLOCK; - return 1; - } - - struct bfd_request *req; - node *nn; - WALK_LIST2(req, nn, s->request_list, n) - ev_send_self(&req->event); - - BFD_UNLOCK; - return 0; + if (notify) + ev_send(&global_event_list, &p->notify_event); } static void @@ -246,8 +222,8 @@ bfd_session_control_tx_timer(struct bfd_session *s, int reset) if (s->rem_demand_mode && !s->poll_active && - (BFD_LOC_STATE(s).state == BFD_STATE_UP) && - (s->rem.state == BFD_STATE_UP)) + (s->loc_state == BFD_STATE_UP) && + (s->rem_state == BFD_STATE_UP)) goto stop; if (s->rem_min_rx_int == 0) @@ -317,29 +293,28 @@ bfd_session_process_ctl(struct bfd_session *s, u8 flags, u32 old_tx_int, u32 old int next_state = 0; int diag = BFD_DIAG_NOTHING; - switch (BFD_LOC_STATE(s).state) + switch (s->loc_state) { case BFD_STATE_ADMIN_DOWN: return; case BFD_STATE_DOWN: - if (s->rem.state == BFD_STATE_DOWN) next_state = BFD_STATE_INIT; - else if (s->rem.state == BFD_STATE_INIT) next_state = BFD_STATE_UP; + if (s->rem_state == BFD_STATE_DOWN) next_state = BFD_STATE_INIT; + else if (s->rem_state == BFD_STATE_INIT) next_state = BFD_STATE_UP; break; case BFD_STATE_INIT: - if (s->rem.state == BFD_STATE_ADMIN_DOWN) next_state = BFD_STATE_DOWN, diag = BFD_DIAG_NEIGHBOR_DOWN; - else if (s->rem.state >= BFD_STATE_INIT) next_state = BFD_STATE_UP; + if (s->rem_state == BFD_STATE_ADMIN_DOWN) next_state = BFD_STATE_DOWN, diag = BFD_DIAG_NEIGHBOR_DOWN; + else if (s->rem_state >= BFD_STATE_INIT) next_state = BFD_STATE_UP; break; case BFD_STATE_UP: - if (s->rem.state <= BFD_STATE_DOWN) next_state = BFD_STATE_DOWN, diag = BFD_DIAG_NEIGHBOR_DOWN; + if (s->rem_state <= BFD_STATE_DOWN) next_state = BFD_STATE_DOWN, diag = BFD_DIAG_NEIGHBOR_DOWN; break; } if (next_state) - if (bfd_session_update_state(s, next_state, diag)) - return; + bfd_session_update_state(s, next_state, diag); bfd_session_control_tx_timer(s, 0); @@ -354,7 +329,7 @@ bfd_session_timeout(struct bfd_session *s) TRACE(D_EVENTS, "Session to %I expired", s->addr); - s->rem.state = BFD_STATE_DOWN; + s->rem_state = BFD_STATE_DOWN; s->rem_id = 0; s->rem_min_tx_int = 0; s->rem_min_rx_int = 1; @@ -365,8 +340,7 @@ bfd_session_timeout(struct bfd_session *s) s->poll_active = 0; s->poll_scheduled = 0; - if (bfd_session_update_state(s, BFD_STATE_DOWN, BFD_DIAG_TIMEOUT)) - return; + bfd_session_update_state(s, BFD_STATE_DOWN, BFD_DIAG_TIMEOUT); bfd_session_control_tx_timer(s, 1); } @@ -382,7 +356,7 @@ bfd_session_set_min_tx(struct bfd_session *s, u32 val) s->des_min_tx_new = val; /* Postpone timer update if des_min_tx_int increases and the session is up */ - if ((BFD_LOC_STATE(s).state != BFD_STATE_UP) || (val < s->des_min_tx_int)) + if ((s->loc_state != BFD_STATE_UP) || (val < s->des_min_tx_int)) { s->des_min_tx_int = val; bfd_session_update_tx_interval(s); @@ -402,7 +376,7 @@ bfd_session_set_min_rx(struct bfd_session *s, u32 val) s->req_min_rx_new = val; /* Postpone timer update if req_min_rx_int decreases and the session is up */ - if ((BFD_LOC_STATE(s).state != BFD_STATE_UP) || (val > s->req_min_rx_int)) + if ((s->loc_state != BFD_STATE_UP) || (val > s->req_min_rx_int)) { s->req_min_rx_int = val; bfd_session_update_detection_time(s, 0); @@ -414,14 +388,12 @@ bfd_session_set_min_rx(struct bfd_session *s, u32 val) struct bfd_session * bfd_find_session_by_id(struct bfd_proto *p, u32 id) { - ASSERT_DIE(birdloop_inside(p->p.loop)); return HASH_FIND(p->session_hash_id, HASH_ID, id); } struct bfd_session * bfd_find_session_by_addr(struct bfd_proto *p, ip_addr addr, uint ifindex) { - ASSERT_DIE(birdloop_inside(p->p.loop)); return HASH_FIND(p->session_hash_ip, HASH_IP, addr, ifindex); } @@ -455,7 +427,6 @@ static struct bfd_session * bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface *iface, struct bfd_options *opts) { ASSERT_DIE(birdloop_inside(p->p.loop)); - BFD_ASSERT_LOCKED; struct bfd_iface *ifa = bfd_get_iface(p, local, iface); @@ -469,15 +440,10 @@ bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface * HASH_INSERT(p->session_hash_ip, HASH_IP, s); s->cf = bfd_merge_options(ifa->cf, opts); - s->update_event = (event) { - .hook = bfd_reconfigure_session_hook, - .data = s, - .list = birdloop_event_list(p->p.loop), - }; /* Initialization of state variables - see RFC 5880 6.8.1 */ - atomic_store_explicit(&s->loc, ((struct bfd_session_state) { .state = BFD_STATE_DOWN }), memory_order_relaxed); - s->rem.state = BFD_STATE_DOWN; + s->loc_state = BFD_STATE_DOWN; + s->rem_state = BFD_STATE_DOWN; s->des_min_tx_int = s->des_min_tx_new = s->cf.idle_tx_int; s->req_min_rx_int = s->req_min_rx_new = s->cf.min_rx_int; s->rem_min_rx_int = 1; @@ -485,8 +451,8 @@ bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface * s->passive = s->cf.passive; s->tx_csn = random_u32(); - s->tx_timer = tm_new_init(p->p.pool, bfd_tx_timer_hook, s, 0, 0); - s->hold_timer = tm_new_init(p->p.pool, bfd_hold_timer_hook, s, 0, 0); + s->tx_timer = tm_new_init(p->tpool, bfd_tx_timer_hook, s, 0, 0); + s->hold_timer = tm_new_init(p->tpool, bfd_hold_timer_hook, s, 0, 0); bfd_session_update_tx_interval(s); bfd_session_control_tx_timer(s, 1); @@ -498,12 +464,42 @@ bfd_add_session(struct bfd_proto *p, ip_addr addr, ip_addr local, struct iface * return s; } +/* static void -bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) +bfd_open_session(struct bfd_proto *p, struct bfd_session *s, ip_addr local, struct iface *ifa) { - ASSERT_DIE(birdloop_inside(p->p.loop)); - BFD_ASSERT_LOCKED; - ASSERT_DIE(EMPTY_LIST(s->request_list)); + birdloop_enter(p->p.loop); + + s->opened = 1; + + bfd_session_control_tx_timer(s); + + birdloop_leave(p->p.loop); +} + +static void +bfd_close_session(struct bfd_proto *p, struct bfd_session *s) +{ + birdloop_enter(p->p.loop); + + s->opened = 0; + + bfd_session_update_state(s, BFD_STATE_DOWN, BFD_DIAG_PATH_DOWN); + bfd_session_control_tx_timer(s); + + birdloop_leave(p->p.loop); +} +*/ + +static void +bfd_remove_session_locked(struct bfd_proto *p, struct bfd_session *s) +{ + /* Caller should ensure that request list is empty */ + + /* Remove session from notify list if scheduled for notification */ + /* No need for bfd_lock_sessions(), we are already protected by birdloop_enter() */ + if (NODE_VALID(&s->n)) + rem_node(&s->n); bfd_free_iface(s->ifa); @@ -515,25 +511,29 @@ bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) TRACE(D_EVENTS, "Session to %I removed", s->addr); - sl_free(p->session_slab, s); + sl_free(s); +} + +static void +bfd_remove_session(struct bfd_proto *p, struct bfd_session *s) +{ + birdloop_enter(p->p.loop); + bfd_remove_session_locked(p, s); + birdloop_leave(p->p.loop); } static void bfd_reconfigure_session(struct bfd_proto *p, struct bfd_session *s) { - ASSERT_DIE(birdloop_inside(p->p.loop)); - BFD_LOCK; if (EMPTY_LIST(s->request_list)) - { - bfd_remove_session(p, s); - BFD_UNLOCK; return; - } + + birdloop_enter(p->p.loop); struct bfd_request *req = SKIP_BACK(struct bfd_request, n, HEAD(s->request_list)); s->cf = bfd_merge_options(s->ifa->cf, &req->opts); - u32 tx = (BFD_LOC_STATE(s).state == BFD_STATE_UP) ? s->cf.min_tx_int : s->cf.idle_tx_int; + u32 tx = (s->loc_state == BFD_STATE_UP) ? s->cf.min_tx_int : s->cf.idle_tx_int; bfd_session_set_min_tx(s, tx); bfd_session_set_min_rx(s, s->cf.min_rx_int); s->detect_mult = s->cf.multiplier; @@ -541,15 +541,9 @@ bfd_reconfigure_session(struct bfd_proto *p, struct bfd_session *s) bfd_session_control_tx_timer(s, 0); - TRACE(D_EVENTS, "Session to %I reconfigured", s->addr); - BFD_UNLOCK; -} + birdloop_leave(p->p.loop); -static void -bfd_reconfigure_session_hook(void *data) -{ - struct bfd_session *s = data; - return bfd_reconfigure_session(s->ifa->bfd, s); + TRACE(D_EVENTS, "Session to %I reconfigured", s->addr); } @@ -586,7 +580,7 @@ bfd_get_iface(struct bfd_proto *p, ip_addr local, struct iface *iface) struct bfd_config *cf = (struct bfd_config *) (p->p.cf); struct bfd_iface_config *ic = bfd_find_iface_config(cf, iface); - ifa = mb_allocz(p->p.pool, sizeof(struct bfd_iface)); + ifa = mb_allocz(p->tpool, sizeof(struct bfd_iface)); ifa->local = local; ifa->iface = iface; ifa->cf = ic; @@ -595,6 +589,9 @@ bfd_get_iface(struct bfd_proto *p, ip_addr local, struct iface *iface) ifa->sk = bfd_open_tx_sk(p, local, iface); ifa->uc = 1; + if (cf->strict_bind) + ifa->rx = bfd_open_rx_sk_bound(p, local, iface); + add_tail(&p->iface_list, &ifa->n); return ifa; @@ -607,17 +604,17 @@ bfd_free_iface(struct bfd_iface *ifa) return; if (ifa->sk) - { - sk_stop(ifa->sk); rfree(ifa->sk); - } + + if (ifa->rx) + rfree(ifa->rx); rem_node(&ifa->n); mb_free(ifa); } static void -bfd_reconfigure_iface(struct bfd_proto *p UNUSED, struct bfd_iface *ifa, struct bfd_config *nc) +bfd_reconfigure_iface(struct bfd_proto *p, struct bfd_iface *ifa, struct bfd_config *nc) { struct bfd_iface_config *new = bfd_find_iface_config(nc, ifa->iface); struct bfd_iface_config *old = ifa->cf; @@ -631,7 +628,9 @@ bfd_reconfigure_iface(struct bfd_proto *p UNUSED, struct bfd_iface *ifa, struct (new->passive != old->passive); /* This should be probably changed to not access ifa->cf from the BFD thread */ + birdloop_enter(p->p.loop); ifa->cf = new; + birdloop_leave(p->p.loop); } @@ -640,55 +639,77 @@ bfd_reconfigure_iface(struct bfd_proto *p UNUSED, struct bfd_iface *ifa, struct */ static void -bfd_request_notify(void *data) +bfd_request_notify(struct bfd_request *req, u8 state, u8 diag) { - struct bfd_request *req = data; - struct bfd_session_state old = req->old_state; - - BFD_LOCK; /* Needed to safely access req->session */ - struct bfd_session_state new = atomic_load_explicit(&req->session->loc, memory_order_acquire); - BFD_UNLOCK; + u8 old_state = req->state; - if (new.state == old.state) + if (state == old_state) return; - req->state = new.state; - req->diag = new.diag; - req->old_state = new; - req->down = (old.state == BFD_STATE_UP) && (new.state == BFD_STATE_DOWN); + req->state = state; + req->diag = diag; + req->old_state = old_state; + req->down = (old_state == BFD_STATE_UP) && (state == BFD_STATE_DOWN); if (req->hook) + { + struct birdloop *target = !birdloop_inside(req->target) ? req->target : NULL; + + if (target) + birdloop_enter(target); + req->hook(req); + + if (target) + birdloop_leave(target); + } } static int bfd_add_request(struct bfd_proto *p, struct bfd_request *req) { - BFD_ASSERT_LOCKED; - ASSERT_DIE(req->session == &bfd_admin_down); - struct bfd_config *cf = (struct bfd_config *) (p->p.cf); if (p->p.vrf && (p->p.vrf != req->vrf)) + { + TRACE(D_EVENTS, "Not accepting request to %I with different VRF", req->addr); return 0; + } if (ipa_is_ip4(req->addr) ? !cf->accept_ipv4 : !cf->accept_ipv6) + { + TRACE(D_EVENTS, "Not accepting request to %I (AF limit)", req->addr); return 0; + } if (req->iface ? !cf->accept_direct : !cf->accept_multihop) + { + TRACE(D_EVENTS, "Not accepting %s request to %I", req->iface ? "direct" : "multihop", req->addr); return 0; + } uint ifindex = req->iface ? req->iface->index : 0; struct bfd_session *s = bfd_find_session_by_addr(p, req->addr, ifindex); - if (!s) + if (s) + TRACE(D_EVENTS, "Session to %I reused", s->addr); + else s = bfd_add_session(p, req->addr, req->local, req->iface, &req->opts); rem_node(&req->n); add_tail(&s->request_list, &req->n); req->session = s; - ev_send_self(&req->event); + bfd_lock_sessions(p); + + int notify = !NODE_VALID(&s->n); + if (notify) + add_tail(&p->notify_list, &s->n); + + bfd_unlock_sessions(p); + + if (notify) + ev_send(&global_event_list, &p->notify_event); return 1; } @@ -696,35 +717,89 @@ bfd_add_request(struct bfd_proto *p, struct bfd_request *req) static void bfd_pickup_requests(void *_data UNUSED) { - struct bfd_proto *p; - node *nn; - WALK_LIST2(p, nn, bfd_global.proto_list, bfd_node) - { - birdloop_enter(p->p.loop); + /* NOTE TO MY FUTURE SELF + * + * Functions bfd_take_requests() and bfd_drop_requests() need to have + * consistent &bfd_global.wait_list and this is ensured only by having these + * functions called from bfd_start() and bfd_shutdown() which are both called + * in PROTO_LOCKED_FROM_MAIN context, i.e. always from &main_birdloop. + * + * This pickup event is also called in &main_birdloop, therefore we can + * freely do BFD_LOCK/BFD_UNLOCK while processing all the requests. All BFD + * protocols capable of bfd_add_request() are either started before this code + * happens or after that. + * + * If BFD protocols could start in parallel with this routine, they might + * miss some of the waiting requests, thus if anybody tries to start + * protocols or run this pickup event outside &main_birdloop in future, they + * shall ensure that this race condition is mitigated somehow. + * + * Thank you, my future self, for understanding. Have a nice day! + */ + + DBG("BFD pickup loop starting"); + + BFD_LOCK; + do { + bfd_global.pickup_reload = 0; + BFD_UNLOCK; + + node *n; + WALK_LIST(n, bfd_global.proto_list) + { + struct bfd_proto *p = SKIP_BACK(struct bfd_proto, bfd_node, n); + birdloop_enter(p->p.loop); + BFD_LOCK; + + TRACE(D_EVENTS, "Picking up new requests (%d available)", list_length(&bfd_global.pickup_list)); + + node *rn, *rnxt; + WALK_LIST_DELSAFE(rn, rnxt, bfd_global.pickup_list) + bfd_add_request(p, SKIP_BACK(struct bfd_request, n, rn)); + + BFD_UNLOCK; + + /* Remove sessions with no requests */ + HASH_WALK_DELSAFE(p->session_hash_id, next_id, s) + { + if (EMPTY_LIST(s->request_list)) + bfd_remove_session_locked(p, s); + } + HASH_WALK_END; + + birdloop_leave(p->p.loop); + } + BFD_LOCK; + } while (bfd_global.pickup_reload); - struct bfd_request *req; - node *rn, *rnxt; - WALK_LIST2_DELSAFE(req, rn, rnxt, bfd_global.wait_list, n) - bfd_add_request(p, req); + list tmp_list; + init_list(&tmp_list); + add_tail_list(&tmp_list, &bfd_global.pickup_list); - BFD_UNLOCK; - birdloop_ping(p->p.loop); - birdloop_leave(p->p.loop); - } + init_list(&bfd_global.pickup_list); + BFD_UNLOCK; + + log(L_TRACE "No protocol for %d BFD requests", list_length(&tmp_list)); + + node *n; + WALK_LIST(n, tmp_list) + bfd_request_notify(SKIP_BACK(struct bfd_request, n, n), BFD_STATE_ADMIN_DOWN, 0); + + BFD_LOCK; + add_tail_list(&bfd_global.wait_list, &tmp_list); + BFD_UNLOCK; } static event bfd_pickup_event = { .hook = bfd_pickup_requests }; -#define bfd_schedule_pickup() ev_send(&global_event_list, &bfd_pickup_event) static void bfd_take_requests(struct bfd_proto *p) { - struct bfd_request *req; node *n, *nn; BFD_LOCK; - WALK_LIST2_DELSAFE(req, n, nn, bfd_global.wait_list, n) - bfd_add_request(p, req); + WALK_LIST_DELSAFE(n, nn, bfd_global.wait_list) + bfd_add_request(p, SKIP_BACK(struct bfd_request, n, n)); BFD_UNLOCK; } @@ -739,13 +814,13 @@ bfd_drop_requests(struct bfd_proto *p) { struct bfd_request *req = SKIP_BACK(struct bfd_request, n, n); rem_node(&req->n); - add_tail(&bfd_global.wait_list, &req->n); - req->session = &bfd_admin_down; - ev_send_self(&req->event); + add_tail(&bfd_global.pickup_list, &req->n); + req->session = NULL; } - bfd_schedule_pickup(); - bfd_remove_session(p, s); + ev_send(&global_event_list, &bfd_pickup_event); + + bfd_remove_session_locked(p, s); } HASH_WALK_END; BFD_UNLOCK; @@ -757,7 +832,7 @@ struct bfd_request * bfd_request_session(pool *p, ip_addr addr, ip_addr local, struct iface *iface, struct iface *vrf, void (*hook)(struct bfd_request *), void *data, - struct event_list *list, + struct birdloop *target, const struct bfd_options *opts) { struct bfd_request *req = ralloc(p, &bfd_request_class); @@ -770,18 +845,18 @@ bfd_request_session(pool *p, ip_addr addr, ip_addr local, if (opts) req->opts = *opts; + ASSERT_DIE(target || !hook); req->hook = hook; req->data = data; - req->event = (event) { - .hook = bfd_request_notify, - .data = req, - .list = list, - }; + req->target = target; + + req->session = NULL; BFD_LOCK; - req->session = &bfd_admin_down; - add_tail(&bfd_global.wait_list, &req->n); - bfd_schedule_pickup(); + bfd_global.pickup_reload++; + add_tail(&bfd_global.pickup_list, &req->n); + ev_send(&global_event_list, &bfd_pickup_event); + DBG("New BFD request enlisted.\n"); BFD_UNLOCK; return req; @@ -790,17 +865,15 @@ bfd_request_session(pool *p, ip_addr addr, ip_addr local, void bfd_update_request(struct bfd_request *req, const struct bfd_options *opts) { + struct bfd_session *s = req->session; + if (!memcmp(opts, &req->opts, sizeof(const struct bfd_options))) return; - BFD_LOCK; req->opts = *opts; - struct bfd_session *s = req->session; - if (s != &bfd_admin_down) - ev_send_self(&s->update_event); - - BFD_UNLOCK; + if (s) + bfd_reconfigure_session(s->ifa->bfd, s); } static void @@ -812,11 +885,11 @@ bfd_request_free(resource *r) rem_node(&req->n); BFD_UNLOCK; - ev_postpone(&req->event); + ev_send(&global_event_list, &bfd_pickup_event); } static void -bfd_request_dump(resource *r) +bfd_request_dump(resource *r, unsigned indent UNUSED) { struct bfd_request *req = (struct bfd_request *) r; @@ -849,7 +922,7 @@ bfd_neigh_notify(struct neighbor *nb) if ((nb->scope > 0) && !n->req) { ip_addr local = ipa_nonzero(n->local) ? n->local : nb->ifa->ip; - n->req = bfd_request_session(p->p.pool, n->addr, local, nb->iface, p->p.vrf, NULL, NULL, birdloop_event_list(p->p.loop), NULL); + n->req = bfd_request_session(p->p.pool, n->addr, local, nb->iface, p->p.vrf, NULL, NULL, NULL, NULL); } if ((nb->scope <= 0) && n->req) @@ -866,7 +939,7 @@ bfd_start_neighbor(struct bfd_proto *p, struct bfd_neighbor *n) if (n->multihop) { - n->req = bfd_request_session(p->p.pool, n->addr, n->local, NULL, p->p.vrf, NULL, NULL, birdloop_event_list(p->p.loop), NULL); + n->req = bfd_request_session(p->p.pool, n->addr, n->local, NULL, p->p.vrf, NULL, NULL, NULL, NULL); return; } @@ -941,23 +1014,53 @@ bfd_reconfigure_neighbors(struct bfd_proto *p, struct bfd_config *new) /* - * BFD protocol glue + * BFD notify socket */ -void -bfd_init_all(void) +/* This core notify code should be replaced after main loop transition to birdloop */ + +static void +bfd_notify_hook(void *data) { - bfd_global.lock = DOMAIN_NEW(rtable, "BFD Global"); - init_list(&bfd_global.wait_list); - init_list(&bfd_global.proto_list); + struct bfd_proto *p = data; + struct bfd_session *s; + list tmp_list; + u8 state, diag; + node *n, *nn; + + bfd_lock_sessions(p); + init_list(&tmp_list); + add_tail_list(&tmp_list, &p->notify_list); + init_list(&p->notify_list); + bfd_unlock_sessions(p); + + WALK_LIST_FIRST(s, tmp_list) + { + bfd_lock_sessions(p); + rem_node(&s->n); + state = s->loc_state; + diag = s->loc_diag; + bfd_unlock_sessions(p); + + WALK_LIST_DELSAFE(n, nn, s->request_list) + bfd_request_notify(SKIP_BACK(struct bfd_request, n, n), state, diag); + + /* Remove the session if all requests were removed in notify hooks */ + if (EMPTY_LIST(s->request_list)) + bfd_remove_session(p, s); + } } +/* + * BFD protocol glue + */ + static struct proto * bfd_init(struct proto_config *c) { struct proto *p = proto_new(c); - p->neigh_notify = bfd_neigh_notify; + p->iface_sub.neigh_notify = bfd_neigh_notify; return p; } @@ -968,25 +1071,38 @@ bfd_start(struct proto *P) struct bfd_proto *p = (struct bfd_proto *) P; struct bfd_config *cf = (struct bfd_config *) (P->cf); + pthread_spin_init(&p->lock, PTHREAD_PROCESS_PRIVATE); + + p->tpool = rp_new(P->pool, "BFD loop pool"); + p->session_slab = sl_new(P->pool, sizeof(struct bfd_session)); HASH_INIT(p->session_hash_id, P->pool, 8); HASH_INIT(p->session_hash_ip, P->pool, 8); init_list(&p->iface_list); + init_list(&p->notify_list); + p->notify_event = (event) { + .hook = bfd_notify_hook, + .data = p, + }; + add_tail(&bfd_global.proto_list, &p->bfd_node); - if (cf->accept_ipv4 && cf->accept_direct) - p->rx4_1 = bfd_open_rx_sk(p, 0, SK_IPV4); + if (!cf->strict_bind) + { + if (cf->accept_ipv4 && cf->accept_direct) + p->rx4_1 = bfd_open_rx_sk(p, 0, SK_IPV4); - if (cf->accept_ipv4 && cf->accept_multihop) - p->rx4_m = bfd_open_rx_sk(p, 1, SK_IPV4); + if (cf->accept_ipv4 && cf->accept_multihop) + p->rx4_m = bfd_open_rx_sk(p, 1, SK_IPV4); - if (cf->accept_ipv6 && cf->accept_direct) - p->rx6_1 = bfd_open_rx_sk(p, 0, SK_IPV6); + if (cf->accept_ipv6 && cf->accept_direct) + p->rx6_1 = bfd_open_rx_sk(p, 0, SK_IPV6); - if (cf->accept_ipv6 && cf->accept_multihop) - p->rx6_m = bfd_open_rx_sk(p, 1, SK_IPV6); + if (cf->accept_ipv6 && cf->accept_multihop) + p->rx6_m = bfd_open_rx_sk(p, 1, SK_IPV6); + } bfd_take_requests(p); @@ -1011,11 +1127,6 @@ bfd_shutdown(struct proto *P) bfd_drop_requests(p); - if (p->rx4_1) sk_stop(p->rx4_1); - if (p->rx4_m) sk_stop(p->rx4_m); - if (p->rx6_1) sk_stop(p->rx6_1); - if (p->rx6_m) sk_stop(p->rx6_m); - return PS_DOWN; } @@ -1027,13 +1138,12 @@ bfd_reconfigure(struct proto *P, struct proto_config *c) struct bfd_config *new = (struct bfd_config *) c; struct bfd_iface *ifa; - ASSERT_DIE(birdloop_inside(P->loop)); - /* TODO: Improve accept reconfiguration */ if ((new->accept_ipv4 != old->accept_ipv4) || (new->accept_ipv6 != old->accept_ipv6) || (new->accept_direct != old->accept_direct) || - (new->accept_multihop != old->accept_multihop)) + (new->accept_multihop != old->accept_multihop) || + (new->strict_bind != old->strict_bind)) return 0; birdloop_mask_wakeups(p->p.loop); @@ -1041,12 +1151,12 @@ bfd_reconfigure(struct proto *P, struct proto_config *c) WALK_LIST(ifa, p->iface_list) bfd_reconfigure_iface(p, ifa, new); - HASH_WALK_DELSAFE(p->session_hash_id, next_id, s) + HASH_WALK(p->session_hash_id, next_id, s) { if (s->ifa->changed) bfd_reconfigure_session(p, s); } - HASH_WALK_DELSAFE_END; + HASH_WALK_END; bfd_reconfigure_neighbors(p, new); @@ -1071,14 +1181,13 @@ bfd_show_sessions(struct proto *P) { byte tbuf[TM_DATETIME_BUFFER_SIZE]; struct bfd_proto *p = (struct bfd_proto *) P; + uint state, diag UNUSED; btime tx_int, timeout; const char *ifname; - birdloop_enter(P->loop); if (p->p.proto_state != PS_UP) { cli_msg(-1020, "%s: is not up", p->p.name); - birdloop_leave(P->loop); return; } @@ -1086,9 +1195,12 @@ bfd_show_sessions(struct proto *P) cli_msg(-1020, "%-25s %-10s %-10s %-12s %8s %8s", "IP address", "Interface", "State", "Since", "Interval", "Timeout"); + HASH_WALK(p->session_hash_id, next_id, s) { - uint state = BFD_LOC_STATE(s).state; + /* FIXME: this is thread-unsafe, but perhaps harmless */ + state = s->loc_state; + diag = s->loc_diag; ifname = (s->ifa && s->ifa->iface) ? s->ifa->iface->name : "---"; tx_int = s->last_tx ? MAX(s->des_min_tx_int, s->rem_min_rx_int) : 0; timeout = (btime) MAX(s->req_min_rx_int, s->rem_min_tx_int) * s->rem_detect_mult; @@ -1100,15 +1212,12 @@ bfd_show_sessions(struct proto *P) s->addr, ifname, bfd_state_names[state], tbuf, tx_int, timeout); } HASH_WALK_END; - - birdloop_leave(P->loop); } struct protocol proto_bfd = { .name = "BFD", .template = "bfd%d", - .class = PROTOCOL_BFD, .proto_size = sizeof(struct bfd_proto), .config_size = sizeof(struct bfd_config), .init = bfd_init, @@ -1117,3 +1226,14 @@ struct protocol proto_bfd = { .reconfigure = bfd_reconfigure, .copy_config = bfd_copy_config, }; + +void +bfd_build(void) +{ + proto_build(&proto_bfd); + + bfd_global.lock = DOMAIN_NEW(rtable, "BFD Global"); + init_list(&bfd_global.wait_list); + init_list(&bfd_global.pickup_list); + init_list(&bfd_global.proto_list); +} diff --git a/proto/bfd/bfd.h b/proto/bfd/bfd.h index ffb1c43f..a4b7d63c 100644 --- a/proto/bfd/bfd.h +++ b/proto/bfd/bfd.h @@ -13,7 +13,7 @@ #include "nest/cli.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/password.h" #include "conf/conf.h" #include "lib/hash.h" @@ -47,6 +47,7 @@ struct bfd_config u8 accept_ipv6; u8 accept_direct; u8 accept_multihop; + u8 strict_bind; }; struct bfd_iface_config @@ -88,12 +89,19 @@ struct bfd_proto { struct proto p; + pthread_spinlock_t lock; + + pool *tpool; + node bfd_node; slab *session_slab; HASH(struct bfd_session) session_hash_id; HASH(struct bfd_session) session_hash_ip; + event notify_event; + list notify_list; + sock *rx4_1; sock *rx6_1; sock *rx4_m; @@ -110,12 +118,14 @@ struct bfd_iface struct bfd_proto *bfd; sock *sk; + sock *rx; u32 uc; u8 changed; }; struct bfd_session { + node n; ip_addr addr; /* Address of session */ struct bfd_iface *ifa; /* Iface associated with session */ struct bfd_session *next_id; /* Next in bfd.session_hash_id */ @@ -126,15 +136,14 @@ struct bfd_session u8 poll_active; u8 poll_scheduled; - _Atomic struct bfd_session_state loc; - struct bfd_session_state rem; -#define BFD_LOC_STATE(s) ({ struct bfd_session_state _bss = atomic_load_explicit(&(s)->loc, memory_order_relaxed); _bss; }) - + u8 loc_state; + u8 rem_state; + u8 loc_diag; + u8 rem_diag; u32 loc_id; /* Local session ID (local discriminator) */ u32 rem_id; /* Remote session ID (remote discriminator) */ - struct bfd_session_config cf; /* Static configuration parameers */ - event update_event; /* Reconfiguration requested */ + struct bfd_session_config cf; /* Static configuration parameters */ u32 des_min_tx_int; /* Desired min rx interval, local option */ u32 des_min_tx_new; /* Used for des_min_tx_int change */ @@ -156,8 +165,6 @@ struct bfd_session list request_list; /* List of client requests (struct bfd_request) */ btime last_state_change; /* Time of last state change */ - btime last_reqlist_check; /* Time of last check whether the request list is not empty */ - u8 notify_running; /* 1 if notify hooks are running */ u8 rx_csn_known; /* Received crypto sequence number is known */ u32 rx_csn; /* Last received crypto sequence number */ @@ -203,6 +210,10 @@ extern const char *bfd_state_names[]; extern const u8 bfd_auth_type_to_hash_alg[]; + +static inline void bfd_lock_sessions(struct bfd_proto *p) { pthread_spin_lock(&p->lock); } +static inline void bfd_unlock_sessions(struct bfd_proto *p) { pthread_spin_unlock(&p->lock); } + /* bfd.c */ struct bfd_session * bfd_find_session_by_id(struct bfd_proto *p, u32 id); struct bfd_session * bfd_find_session_by_addr(struct bfd_proto *p, ip_addr addr, uint ifindex); @@ -212,6 +223,7 @@ void bfd_show_sessions(struct proto *P); /* packets.c */ void bfd_send_ctl(struct bfd_proto *p, struct bfd_session *s, int final); sock * bfd_open_rx_sk(struct bfd_proto *p, int multihop, int inet_version); +sock * bfd_open_rx_sk_bound(struct bfd_proto *p, ip_addr local, struct iface *ifa); sock * bfd_open_tx_sk(struct bfd_proto *p, ip_addr local, struct iface *ifa); diff --git a/proto/bfd/config.Y b/proto/bfd/config.Y index ed5479fb..8e608bda 100644 --- a/proto/bfd/config.Y +++ b/proto/bfd/config.Y @@ -23,7 +23,8 @@ CF_DECLS CF_KEYWORDS(BFD, MIN, IDLE, RX, TX, INTERVAL, MULTIPLIER, PASSIVE, INTERFACE, MULTIHOP, NEIGHBOR, DEV, LOCAL, AUTHENTICATION, - NONE, SIMPLE, METICULOUS, KEYED, MD5, SHA1, IPV4, IPV6, DIRECT) + NONE, SIMPLE, METICULOUS, KEYED, MD5, SHA1, IPV4, IPV6, DIRECT, + STRICT, BIND) %type <iface> bfd_neigh_iface %type <a> bfd_neigh_local @@ -37,6 +38,7 @@ bfd_proto_start: proto_start BFD { this_proto = proto_config_new(&proto_bfd, $1); this_proto->loop_order = DOMAIN_ORDER(proto); + this_proto->loop_max_latency = 10 MS_; init_list(&BFD_CFG->patt_list); init_list(&BFD_CFG->neigh_list); BFD_CFG->accept_ipv4 = BFD_CFG->accept_ipv6 = 1; @@ -49,6 +51,7 @@ bfd_proto_item: | INTERFACE bfd_iface | MULTIHOP bfd_multihop | NEIGHBOR bfd_neighbor + | STRICT BIND bool { BFD_CFG->strict_bind = $3; } ; bfd_proto_opts: diff --git a/proto/bfd/packets.c b/proto/bfd/packets.c index 893d582d..a22f223b 100644 --- a/proto/bfd/packets.c +++ b/proto/bfd/packets.c @@ -290,11 +290,9 @@ bfd_send_ctl(struct bfd_proto *p, struct bfd_session *s, int final) if (!sk) return; - struct bfd_session_state loc = BFD_LOC_STATE(s); - pkt = (struct bfd_ctl_packet *) sk->tbuf; - pkt->vdiag = bfd_pack_vdiag(1, loc.diag); - pkt->flags = bfd_pack_flags(loc.state, 0); + pkt->vdiag = bfd_pack_vdiag(1, s->loc_diag); + pkt->flags = bfd_pack_flags(s->loc_state, 0); pkt->detect_mult = s->detect_mult; pkt->length = BFD_BASE_LEN; pkt->snd_id = htonl(s->loc_id); @@ -315,7 +313,7 @@ bfd_send_ctl(struct bfd_proto *p, struct bfd_session *s, int final) log(L_WARN "%s: Old packet overwritten in TX buffer", p->p.name); TRACE(D_PACKETS, "Sending CTL to %I [%s%s]", s->addr, - bfd_state_names[loc.state], bfd_format_flags(pkt->flags, fb)); + bfd_state_names[s->loc_state], bfd_format_flags(pkt->flags, fb)); sk_send_to(sk, pkt->length, s->addr, sk->dport); } @@ -368,12 +366,18 @@ bfd_rx_hook(sock *sk, uint len) if (ps > BFD_STATE_DOWN) DROP("invalid init state", ps); - uint ifindex = (sk->sport == BFD_CONTROL_PORT) ? sk->lifindex : 0; + uint ifindex = (sk->sport == BFD_CONTROL_PORT) ? + (sk->iface ? sk->iface->index : sk->lifindex) : + 0; s = bfd_find_session_by_addr(p, sk->faddr, ifindex); /* FIXME: better session matching and message */ if (!s) return 1; + + /* For active sessions we require matching remote id */ + if ((s->loc_state == BFD_STATE_UP) && (ntohl(pkt->snd_id) != s->rem_id)) + DROP("mismatched remote id", ntohl(pkt->snd_id)); } /* bfd_check_authentication() has its own error logging */ @@ -384,17 +388,16 @@ bfd_rx_hook(sock *sk, uint len) u32 old_rx_int = s->rem_min_rx_int; s->rem_id= ntohl(pkt->snd_id); - s->rem.state = bfd_pkt_get_state(pkt); - s->rem.diag = bfd_pkt_get_diag(pkt); + s->rem_state = bfd_pkt_get_state(pkt); + s->rem_diag = bfd_pkt_get_diag(pkt); s->rem_demand_mode = pkt->flags & BFD_FLAG_DEMAND; s->rem_min_tx_int = ntohl(pkt->des_min_tx_int); s->rem_min_rx_int = ntohl(pkt->req_min_rx_int); s->rem_detect_mult = pkt->detect_mult; TRACE(D_PACKETS, "CTL received from %I [%s%s]", sk->faddr, - bfd_state_names[s->rem.state], bfd_format_flags(pkt->flags, fb)); + bfd_state_names[s->rem_state], bfd_format_flags(pkt->flags, fb)); - /* This call may drop the session, must be called in tail position */ bfd_session_process_ctl(s, pkt->flags, old_tx_int, old_rx_int); return 1; @@ -427,13 +430,42 @@ bfd_open_rx_sk(struct bfd_proto *p, int multihop, int af) /* TODO: configurable ToS and priority */ sk->tos = IP_PREC_INTERNET_CONTROL; sk->priority = sk_priority_control; - sk->flags = SKF_THREAD | SKF_LADDR_RX | (!multihop ? SKF_TTL_RX : 0); - sk->loop = p->p.loop; + sk->flags = SKF_LADDR_RX | (!multihop ? SKF_TTL_RX : 0); + + if (sk_open(sk, p->p.loop) < 0) + goto err; + + return sk; + + err: + sk_log_error(sk, p->p.name); + rfree(sk); + return NULL; +} + +sock * +bfd_open_rx_sk_bound(struct bfd_proto *p, ip_addr local, struct iface *ifa) +{ + sock *sk = sk_new(p->tpool); + sk->type = SK_UDP; + sk->saddr = local; + sk->sport = ifa ? BFD_CONTROL_PORT : BFD_MULTI_CTL_PORT; + sk->iface = ifa; + sk->vrf = p->p.vrf; + sk->data = p; + + sk->rbsize = BFD_MAX_LEN; + sk->rx_hook = bfd_rx_hook; + sk->err_hook = bfd_err_hook; + + /* TODO: configurable ToS and priority */ + sk->tos = IP_PREC_INTERNET_CONTROL; + sk->priority = sk_priority_control; + sk->flags = SKF_BIND | (ifa ? SKF_TTL_RX : 0); - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; - sk_start(sk); return sk; err: @@ -460,13 +492,11 @@ bfd_open_tx_sk(struct bfd_proto *p, ip_addr local, struct iface *ifa) sk->tos = IP_PREC_INTERNET_CONTROL; sk->priority = sk_priority_control; sk->ttl = ifa ? 255 : -1; - sk->flags = SKF_THREAD | SKF_BIND | SKF_HIGH_PORT; - sk->loop = p->p.loop; + sk->flags = SKF_BIND | SKF_HIGH_PORT; - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; - sk_start(sk); return sk; err: diff --git a/proto/bgp/Makefile b/proto/bgp/Makefile index 00aaef5e..f6a38678 100644 --- a/proto/bgp/Makefile +++ b/proto/bgp/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 02b07410..8bff4c78 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -15,12 +15,13 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "conf/conf.h" #include "lib/resource.h" #include "lib/string.h" #include "lib/unaligned.h" +#include "lib/macro.h" #include "bgp.h" @@ -45,9 +46,9 @@ * * export - Hook that validates and normalizes attribute during export phase. * Receives eattr, may modify it (e.g., sort community lists for canonical - * representation), UNSET() it (e.g., skip empty lists), or WITHDRAW() it if - * necessary. May assume that eattr has value valid w.r.t. its type, but may be - * invalid w.r.t. BGP constraints. Optional. + * representation), UNSET() it (e.g., skip empty lists), or REJECT() the route + * if necessary. May assume that eattr has value valid w.r.t. its type, but may + * be invalid w.r.t. BGP constraints. Optional. * * encode - Hook that converts internal representation to external one during * packet writing. Receives eattr and puts it in the buffer (including attribute @@ -64,37 +65,72 @@ * format - Optional hook that converts eattr to textual representation. */ - -struct bgp_attr_desc { - const char *name; - uint type; - uint flags; - void (*export)(struct bgp_export_state *s, eattr *a); - int (*encode)(struct bgp_write_state *s, eattr *a, byte *buf, uint size); - void (*decode)(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to); - void (*format)(const eattr *ea, byte *buf, uint size); +union bgp_attr_desc { + struct ea_class class; + struct { + EA_CLASS_INSIDE; + uint flags; + void (*export)(struct bgp_export_state *s, eattr *a); + int (*encode)(struct bgp_write_state *s, eattr *a, byte *buf, uint size); + void (*decode)(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to); + }; }; -static const struct bgp_attr_desc bgp_attr_table[]; +static union bgp_attr_desc bgp_attr_table[]; +static inline const union bgp_attr_desc *bgp_find_attr_desc(eattr *a) +{ + const struct ea_class *class = ea_class_find(a->id); -static inline int bgp_attr_known(uint code); + if ((class < &bgp_attr_table[0].class) || (class >= &bgp_attr_table[BGP_ATTR_MAX].class)) + return NULL; -eattr * -bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintptr_t val) + return (const union bgp_attr_desc *) class; +} + +#define BGP_EA_ID(code) (bgp_attr_table[code].id) +#define EA_BGP_ID(code) (((union bgp_attr_desc *) ea_class_find(code)) - bgp_attr_table) + +void bgp_set_attr_u32(ea_list **to, uint code, uint flags, u32 val) { - ASSERT(bgp_attr_known(code)); + const union bgp_attr_desc *desc = &bgp_attr_table[code]; - return ea_set_attr( - attrs, - pool, - EA_CODE(PROTOCOL_BGP, code), - flags & ~BAF_EXT_LEN, - bgp_attr_table[code].type, - val - ); + ea_set_attr(to, EA_LITERAL_EMBEDDED( + &desc->class, + flags & ~BAF_EXT_LEN, + val + )); } +void bgp_set_attr_ptr(ea_list **to, uint code, uint flags, const struct adata *ad) +{ + const union bgp_attr_desc *desc = &bgp_attr_table[code]; + ea_set_attr(to, EA_LITERAL_DIRECT_ADATA( + &desc->class, + flags & ~BAF_EXT_LEN, + ad + )); +} + +void +bgp_set_attr_data(ea_list **to, uint code, uint flags, void *data, uint len) +{ + const union bgp_attr_desc *desc = &bgp_attr_table[code]; + + ea_set_attr(to, EA_LITERAL_STORE_ADATA( + &desc->class, + flags & ~BAF_EXT_LEN, + data, + len + )); +} + +void +bgp_unset_attr(ea_list **to, uint code) +{ + const union bgp_attr_desc *desc = &bgp_attr_table[code]; + ea_unset_attr(to, 0, &desc->class); +} #define REPORT(msg, args...) \ ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); }) @@ -106,7 +142,10 @@ bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintp ({ REPORT(msg, ## args); s->err_withdraw = 1; return; }) #define UNSET(a) \ - ({ a->type = EAF_TYPE_UNDEF; return; }) + ({ a->undef = 1; return; }) + +#define REJECT(msg, args...) \ + ({ log(L_ERR "%s: " msg, s->proto->p.name, ## args); s->err_reject = 1; return; }) #define NEW_BGP "Discarding %s attribute received from AS4-aware neighbor" #define BAD_EBGP "Discarding %s attribute received from EBGP neighbor" @@ -148,7 +187,7 @@ bgp_encode_u8(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) if (size < (3+1)) return -1; - bgp_put_attr_hdr3(buf, EA_ID(a->id), a->flags, 1); + bgp_put_attr_hdr3(buf, EA_BGP_ID(a->id), a->flags, 1); buf[3] = a->u.data; return 3+1; @@ -160,7 +199,7 @@ bgp_encode_u32(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) if (size < (3+4)) return -1; - bgp_put_attr_hdr3(buf, EA_ID(a->id), a->flags, 4); + bgp_put_attr_hdr3(buf, EA_BGP_ID(a->id), a->flags, 4); put_u32(buf+3, a->u.data); return 3+4; @@ -174,7 +213,7 @@ bgp_encode_u32s(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size if (size < (4+len)) return -1; - uint hdr = bgp_put_attr_hdr(buf, EA_ID(a->id), a->flags, len); + uint hdr = bgp_put_attr_hdr(buf, EA_BGP_ID(a->id), a->flags, len); put_u32s(buf + hdr, (u32 *) a->u.ptr->data, len / 4); return hdr + len; @@ -195,7 +234,7 @@ bgp_put_attr(byte *buf, uint size, uint code, uint flags, const byte *data, uint static int bgp_encode_raw(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) { - return bgp_put_attr(buf, size, EA_ID(a->id), a->flags, a->u.ptr->data, a->u.ptr->length); + return bgp_put_attr(buf, size, EA_BGP_ID(a->id), a->flags, a->u.ptr->data, a->u.ptr->length); } @@ -333,26 +372,26 @@ bgp_aigp_set_metric(struct linpool *pool, const struct adata *ad, u64 metric) } int -bgp_total_aigp_metric_(struct rta *a, u64 *metric, const struct adata **ad) +bgp_total_aigp_metric_(const rte *e, u64 *metric, const struct adata **ad) { - eattr *ea = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AIGP)); - if (!ea) + eattr *a = ea_find(e->attrs, BGP_EA_ID(BA_AIGP)); + if (!a) return 0; - const byte *b = bgp_aigp_get_tlv(ea->u.ptr, BGP_AIGP_METRIC); + const byte *b = bgp_aigp_get_tlv(a->u.ptr, BGP_AIGP_METRIC); if (!b) return 0; u64 aigp = get_u64(b + 3); - u64 step = a->igp_metric; + u64 step = rt_get_igp_metric(e); - if (!rta_resolvable(a) || (step >= IGP_METRIC_UNKNOWN)) + if (!rte_resolvable(e) || (step >= IGP_METRIC_UNKNOWN)) step = BGP_AIGP_MAX; if (!step) step = 1; - *ad = ea->u.ptr; + *ad = a->u.ptr; *metric = aigp + step; if (*metric < aigp) *metric = BGP_AIGP_MAX; @@ -363,7 +402,7 @@ bgp_total_aigp_metric_(struct rta *a, u64 *metric, const struct adata **ad) static inline int bgp_init_aigp_metric(rte *e, u64 *metric, const struct adata **ad) { - if (e->attrs->source == RTS_BGP) + if (rt_get_source_attr(e) == RTS_BGP) return 0; *metric = rt_get_igp_metric(e); @@ -372,9 +411,9 @@ bgp_init_aigp_metric(rte *e, u64 *metric, const struct adata **ad) } u32 -bgp_rte_igp_metric(struct rte *rt) +bgp_rte_igp_metric(const rte *rt) { - u64 metric = bgp_total_aigp_metric(rt->attrs); + u64 metric = bgp_total_aigp_metric(rt); return (u32) MIN(metric, (u64) IGP_METRIC_UNKNOWN); } @@ -387,7 +426,7 @@ static void bgp_export_origin(struct bgp_export_state *s, eattr *a) { if (a->u.data > 2) - WITHDRAW(BAD_VALUE, "ORIGIN", a->u.data); + REJECT(BAD_VALUE, "ORIGIN", a->u.data); } static void @@ -399,7 +438,7 @@ bgp_decode_origin(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte if (data[0] > 2) WITHDRAW(BAD_VALUE, "ORIGIN", data[0]); - bgp_set_attr_u32(to, s->pool, BA_ORIGIN, flags, data[0]); + bgp_set_attr_u32(to, BA_ORIGIN, flags, data[0]); } static void @@ -467,7 +506,7 @@ bgp_decode_as_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte !bgp_as_path_first_as_equal(data, len, p->remote_as)) WITHDRAW("Malformed AS_PATH attribute - %s", "First AS differs from neigbor AS"); - bgp_set_attr_data(to, s->pool, BA_AS_PATH, flags, data, len); + bgp_set_attr_data(to, BA_AS_PATH, flags, data, len); } @@ -539,7 +578,7 @@ bgp_decode_med(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *da WITHDRAW(BAD_LENGTH, "MULTI_EXIT_DISC", len); u32 val = get_u32(data); - bgp_set_attr_u32(to, s->pool, BA_MULTI_EXIT_DISC, flags, val); + bgp_set_attr_u32(to, BA_MULTI_EXIT_DISC, flags, val); } @@ -560,7 +599,7 @@ bgp_decode_local_pref(struct bgp_parse_state *s, uint code UNUSED, uint flags, b WITHDRAW(BAD_LENGTH, "LOCAL_PREF", len); u32 val = get_u32(data); - bgp_set_attr_u32(to, s->pool, BA_LOCAL_PREF, flags, val); + bgp_set_attr_u32(to, BA_LOCAL_PREF, flags, val); } @@ -570,7 +609,7 @@ bgp_decode_atomic_aggr(struct bgp_parse_state *s, uint code UNUSED, uint flags, if (len != 0) DISCARD(BAD_LENGTH, "ATOMIC_AGGR", len); - bgp_set_attr_data(to, s->pool, BA_ATOMIC_AGGR, flags, NULL, 0); + bgp_set_attr_data(to, BA_ATOMIC_AGGR, flags, NULL, 0); } static int @@ -604,7 +643,7 @@ bgp_decode_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flags, b len = aggregator_16to32(data, src); } - bgp_set_attr_data(to, s->pool, BA_AGGREGATOR, flags, data, len); + bgp_set_attr_data(to, BA_AGGREGATOR, flags, data, len); } static void @@ -633,7 +672,7 @@ bgp_decode_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, by struct adata *ad = lp_alloc_adata(s->pool, len); get_u32s(data, (u32 *) ad->data, len / 4); - bgp_set_attr_ptr(to, s->pool, BA_COMMUNITY, flags, ad); + bgp_set_attr_ptr(to, BA_COMMUNITY, flags, ad); } @@ -654,7 +693,7 @@ bgp_decode_originator_id(struct bgp_parse_state *s, uint code UNUSED, uint flags WITHDRAW(BAD_LENGTH, "ORIGINATOR_ID", len); u32 val = get_u32(data); - bgp_set_attr_u32(to, s->pool, BA_ORIGINATOR_ID, flags, val); + bgp_set_attr_u32(to, BA_ORIGINATOR_ID, flags, val); } @@ -679,7 +718,7 @@ bgp_decode_cluster_list(struct bgp_parse_state *s, uint code UNUSED, uint flags, struct adata *ad = lp_alloc_adata(s->pool, len); get_u32s(data, (u32 *) ad->data, len / 4); - bgp_set_attr_ptr(to, s->pool, BA_CLUSTER_LIST, flags, ad); + bgp_set_attr_ptr(to, BA_CLUSTER_LIST, flags, ad); } static void @@ -798,7 +837,7 @@ bgp_decode_ext_community(struct bgp_parse_state *s, uint code UNUSED, uint flags struct adata *ad = lp_alloc_adata(s->pool, len); get_u32s(data, (u32 *) ad->data, len / 4); - bgp_set_attr_ptr(to, s->pool, BA_EXT_COMMUNITY, flags, ad); + bgp_set_attr_ptr(to, BA_EXT_COMMUNITY, flags, ad); } @@ -811,7 +850,7 @@ bgp_decode_as4_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flag if (len != 8) DISCARD(BAD_LENGTH, "AS4_AGGREGATOR", len); - bgp_set_attr_data(to, s->pool, BA_AS4_AGGREGATOR, flags, data, len); + bgp_set_attr_data(to, BA_AS4_AGGREGATOR, flags, data, len); } static void @@ -841,7 +880,7 @@ bgp_decode_as4_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byt a = as_path_strip_confed(s->pool, a); } - bgp_set_attr_ptr(to, s->pool, BA_AS4_PATH, flags, a); + bgp_set_attr_ptr(to, BA_AS4_PATH, flags, a); } @@ -865,7 +904,7 @@ bgp_decode_aigp(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *d if (!bgp_aigp_valid(data, len, err, sizeof(err))) DISCARD("Malformed AIGP attribute - %s", err); - bgp_set_attr_data(to, s->pool, BA_AIGP, flags, data, len); + bgp_set_attr_data(to, BA_AIGP, flags, data, len); } static void @@ -897,9 +936,21 @@ bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint fla struct adata *ad = lp_alloc_adata(s->pool, len); get_u32s(data, (u32 *) ad->data, len / 4); - bgp_set_attr_ptr(to, s->pool, BA_LARGE_COMMUNITY, flags, ad); + bgp_set_attr_ptr(to, BA_LARGE_COMMUNITY, flags, ad); +} + + +static void +bgp_decode_otc(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to) +{ + if (len != 4) + WITHDRAW(BAD_LENGTH, "OTC", len); + + u32 val = get_u32(data); + bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, flags, val); } + static void bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a) { @@ -909,20 +960,20 @@ bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a) /* Perhaps we should just ignore it? */ if (!s->mpls) - WITHDRAW("Unexpected MPLS stack"); + REJECT("Unexpected MPLS stack"); /* Empty MPLS stack is not allowed */ if (!lnum) - WITHDRAW("Malformed MPLS stack - empty"); + REJECT("Malformed MPLS stack - empty"); /* This is ugly, but we must ensure that labels fit into NLRI field */ if ((24*lnum + (net_is_vpn(n) ? 64 : 0) + net_pxlen(n)) > 255) - WITHDRAW("Malformed MPLS stack - too many labels (%u)", lnum); + REJECT("Malformed MPLS stack - too many labels (%u)", lnum); for (uint i = 0; i < lnum; i++) { if (labels[i] > 0xfffff) - WITHDRAW("Malformed MPLS stack - invalid label (%u)", labels[i]); + REJECT("Malformed MPLS stack - invalid label (%u)", labels[i]); /* TODO: Check for special-purpose label values? */ } @@ -970,10 +1021,29 @@ bgp_format_mpls_label_stack(const eattr *a, byte *buf, uint size) } static inline void -bgp_decode_unknown(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to) +bgp_export_unknown(struct bgp_export_state *s UNUSED, eattr *a) { + if (!(a->flags & BAF_TRANSITIVE)) + UNSET(a); + + a->flags |= BAF_PARTIAL; +} + +static inline void +bgp_decode_unknown(struct bgp_parse_state *s UNUSED, uint code, uint flags, byte *data, uint len, ea_list **to) +{ + if (!(flags & BAF_OPTIONAL)) + WITHDRAW("Unknown attribute (code %u) - conflicting flags (%02x)", code, flags); + /* Cannot use bgp_set_attr_data() as it works on known attributes only */ - ea_set_attr_data(to, s->pool, EA_CODE(PROTOCOL_BGP, code), flags, EAF_TYPE_OPAQUE, data, len); + ea_set_attr_data(to, &bgp_attr_table[code].class, flags, data, len); +} + +static inline void +bgp_format_unknown(const eattr *a, byte *buf, uint size) +{ + if (a->flags & BAF_TRANSITIVE) + bsnprintf(buf, size, "(transitive)"); } @@ -981,10 +1051,10 @@ bgp_decode_unknown(struct bgp_parse_state *s, uint code, uint flags, byte *data, * Attribute table */ -static const struct bgp_attr_desc bgp_attr_table[] = { +static union bgp_attr_desc bgp_attr_table[BGP_ATTR_MAX] = { [BA_ORIGIN] = { - .name = "origin", - .type = EAF_TYPE_INT, + .name = "bgp_origin", + .type = T_ENUM_BGP_ORIGIN, .flags = BAF_TRANSITIVE, .export = bgp_export_origin, .encode = bgp_encode_u8, @@ -992,69 +1062,69 @@ static const struct bgp_attr_desc bgp_attr_table[] = { .format = bgp_format_origin, }, [BA_AS_PATH] = { - .name = "as_path", - .type = EAF_TYPE_AS_PATH, + .name = "bgp_path", + .type = T_PATH, .flags = BAF_TRANSITIVE, .encode = bgp_encode_as_path, .decode = bgp_decode_as_path, }, [BA_NEXT_HOP] = { - .name = "next_hop", - .type = EAF_TYPE_IP_ADDRESS, + .name = "bgp_next_hop", + .type = T_IP, .flags = BAF_TRANSITIVE, .encode = bgp_encode_next_hop, .decode = bgp_decode_next_hop, .format = bgp_format_next_hop, }, [BA_MULTI_EXIT_DISC] = { - .name = "med", - .type = EAF_TYPE_INT, + .name = "bgp_med", + .type = T_INT, .flags = BAF_OPTIONAL, .encode = bgp_encode_u32, .decode = bgp_decode_med, }, [BA_LOCAL_PREF] = { - .name = "local_pref", - .type = EAF_TYPE_INT, + .name = "bgp_local_pref", + .type = T_INT, .flags = BAF_TRANSITIVE, .export = bgp_export_local_pref, .encode = bgp_encode_u32, .decode = bgp_decode_local_pref, }, [BA_ATOMIC_AGGR] = { - .name = "atomic_aggr", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_atomic_aggr", + .type = T_OPAQUE, .flags = BAF_TRANSITIVE, .encode = bgp_encode_raw, .decode = bgp_decode_atomic_aggr, }, [BA_AGGREGATOR] = { - .name = "aggregator", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_aggregator", + .type = T_OPAQUE, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .encode = bgp_encode_aggregator, .decode = bgp_decode_aggregator, .format = bgp_format_aggregator, }, [BA_COMMUNITY] = { - .name = "community", - .type = EAF_TYPE_INT_SET, + .name = "bgp_community", + .type = T_CLIST, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .export = bgp_export_community, .encode = bgp_encode_u32s, .decode = bgp_decode_community, }, [BA_ORIGINATOR_ID] = { - .name = "originator_id", - .type = EAF_TYPE_ROUTER_ID, + .name = "bgp_originator_id", + .type = T_QUAD, .flags = BAF_OPTIONAL, .export = bgp_export_originator_id, .encode = bgp_encode_u32, .decode = bgp_decode_originator_id, }, [BA_CLUSTER_LIST] = { - .name = "cluster_list", - .type = EAF_TYPE_INT_SET, + .name = "bgp_cluster_list", + .type = T_CLIST, .flags = BAF_OPTIONAL, .export = bgp_export_cluster_list, .encode = bgp_encode_u32s, @@ -1062,43 +1132,47 @@ static const struct bgp_attr_desc bgp_attr_table[] = { .format = bgp_format_cluster_list, }, [BA_MP_REACH_NLRI] = { - .name = "mp_reach_nlri", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_mp_reach_nlri", + .type = T_OPAQUE, + .hidden = 1, .flags = BAF_OPTIONAL, .decode = bgp_decode_mp_reach_nlri, }, [BA_MP_UNREACH_NLRI] = { - .name = "mp_unreach_nlri", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_mp_unreach_nlri", + .type = T_OPAQUE, + .hidden = 1, .flags = BAF_OPTIONAL, .decode = bgp_decode_mp_unreach_nlri, }, [BA_EXT_COMMUNITY] = { - .name = "ext_community", - .type = EAF_TYPE_EC_SET, + .name = "bgp_ext_community", + .type = T_ECLIST, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .export = bgp_export_ext_community, .encode = bgp_encode_u32s, .decode = bgp_decode_ext_community, }, [BA_AS4_PATH] = { - .name = "as4_path", - .type = EAF_TYPE_AS_PATH, + .name = "bgp_as4_path", + .type = T_PATH, + .hidden = 1, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .encode = bgp_encode_raw, .decode = bgp_decode_as4_path, }, [BA_AS4_AGGREGATOR] = { - .name = "as4_aggregator", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_as4_aggregator", + .type = T_OPAQUE, + .hidden = 1, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .encode = bgp_encode_raw, .decode = bgp_decode_as4_aggregator, .format = bgp_format_aggregator, }, [BA_AIGP] = { - .name = "aigp", - .type = EAF_TYPE_OPAQUE, + .name = "bgp_aigp", + .type = T_OPAQUE, .flags = BAF_OPTIONAL | BAF_DECODE_FLAGS, .export = bgp_export_aigp, .encode = bgp_encode_raw, @@ -1106,16 +1180,24 @@ static const struct bgp_attr_desc bgp_attr_table[] = { .format = bgp_format_aigp, }, [BA_LARGE_COMMUNITY] = { - .name = "large_community", - .type = EAF_TYPE_LC_SET, + .name = "bgp_large_community", + .type = T_LCLIST, .flags = BAF_OPTIONAL | BAF_TRANSITIVE, .export = bgp_export_large_community, .encode = bgp_encode_u32s, .decode = bgp_decode_large_community, }, + [BA_ONLY_TO_CUSTOMER] = { + .name = "otc", + .type = T_INT, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_u32, + .decode = bgp_decode_otc, + }, [BA_MPLS_LABEL_STACK] = { - .name = "mpls_label_stack", - .type = EAF_TYPE_INT_SET, + .name = "bgp_mpls_label_stack", + .type = T_CLIST, + .readonly = 1, .export = bgp_export_mpls_label_stack, .encode = bgp_encode_mpls_label_stack, .decode = bgp_decode_mpls_label_stack, @@ -1123,12 +1205,32 @@ static const struct bgp_attr_desc bgp_attr_table[] = { }, }; -static inline int -bgp_attr_known(uint code) +eattr * +bgp_find_attr(ea_list *attrs, uint code) { - return (code < ARRAY_SIZE(bgp_attr_table)) && bgp_attr_table[code].name; + return ea_find(attrs, BGP_EA_ID(code)); } +void +bgp_register_attrs(void) +{ + for (uint i=0; i<ARRAY_SIZE(bgp_attr_table); i++) + { + if (!bgp_attr_table[i].name) + bgp_attr_table[i] = (union bgp_attr_desc) { + .name = mb_sprintf(&root_pool, "bgp_unknown_0x%02x", i), + .type = T_OPAQUE, + .flags = BAF_OPTIONAL, + .readonly = 1, + .export = bgp_export_unknown, + .encode = bgp_encode_raw, + .decode = bgp_decode_unknown, + .format = bgp_format_unknown, + }; + + ea_register_init(&bgp_attr_table[i].class); + } +} /* * Attribute export @@ -1137,38 +1239,24 @@ bgp_attr_known(uint code) static inline void bgp_export_attr(struct bgp_export_state *s, eattr *a, ea_list *to) { - if (EA_PROTO(a->id) != PROTOCOL_BGP) + const union bgp_attr_desc *desc = bgp_find_attr_desc(a); + if (!desc) return; - uint code = EA_ID(a->id); + /* The flags might have been zero if the attr was added locally */ + a->flags = (a->flags & BAF_PARTIAL) | desc->flags; - if (bgp_attr_known(code)) - { - const struct bgp_attr_desc *desc = &bgp_attr_table[code]; - - /* The flags might have been zero if the attr was added by filters */ - a->flags = (a->flags & BAF_PARTIAL) | desc->flags; - - /* Set partial bit if new opt-trans attribute is attached to non-local route */ - if ((s->src != NULL) && (a->type & EAF_ORIGINATED) && - (a->flags & BAF_OPTIONAL) && (a->flags & BAF_TRANSITIVE)) - a->flags |= BAF_PARTIAL; - - /* Call specific hook */ - CALL(desc->export, s, a); + /* Set partial bit if new opt-trans attribute is attached to non-local route */ + if ((s->src != NULL) && (a->originated) && + (a->flags & BAF_OPTIONAL) && (a->flags & BAF_TRANSITIVE)) + a->flags |= BAF_PARTIAL; - /* Attribute might become undefined in hook */ - if ((a->type & EAF_TYPE_MASK) == EAF_TYPE_UNDEF) - return; - } - else - { - /* Don't re-export unknown non-transitive attributes */ - if (!(a->flags & BAF_TRANSITIVE)) - return; + /* Call specific hook */ + CALL(desc->export, s, a); - a->flags |= BAF_PARTIAL; - } + /* Attribute might become undefined in hook */ + if (a->undef) + return; /* Append updated attribute */ to->attrs[to->count++] = *a; @@ -1188,12 +1276,11 @@ bgp_export_attr(struct bgp_export_state *s, eattr *a, ea_list *to) * Result: one sorted attribute list segment, or NULL if attributes are unsuitable. */ static inline ea_list * -bgp_export_attrs(struct bgp_export_state *s, ea_list *attrs) +bgp_export_attrs(struct bgp_export_state *s, ea_list *a) { /* Merge the attribute list */ - ea_list *new = lp_alloc(s->pool, ea_scan(attrs)); - ea_merge(attrs, new); - ea_sort(new); + ea_list *new = ea_normalize(a, 0); + ASSERT_DIE(new); uint i, count; count = new->count; @@ -1203,7 +1290,7 @@ bgp_export_attrs(struct bgp_export_state *s, ea_list *attrs) for (i = 0; i < count; i++) bgp_export_attr(s, &new->attrs[i], new); - if (s->err_withdraw) + if (s->err_reject) return NULL; return new; @@ -1217,14 +1304,9 @@ bgp_export_attrs(struct bgp_export_state *s, ea_list *attrs) static inline int bgp_encode_attr(struct bgp_write_state *s, eattr *a, byte *buf, uint size) { - ASSERT(EA_PROTO(a->id) == PROTOCOL_BGP); - - uint code = EA_ID(a->id); - - if (bgp_attr_known(code)) - return bgp_attr_table[code].encode(s, a, buf, size); - else - return bgp_encode_raw(s, a, buf, size); + const union bgp_attr_desc *desc = bgp_find_attr_desc(a); + ASSERT_DIE(desc); + return desc->encode(s, a, buf, size); } /** @@ -1289,7 +1371,7 @@ bgp_cluster_list_loopy(struct bgp_proto *p, ea_list *attrs) } static inline void -bgp_decode_attr(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to) +bgp_decode_attr(struct bgp_parse_state *s, byte code, byte flags, byte *data, uint len, ea_list **to) { /* Handle duplicate attributes; RFC 7606 3 (g) */ if (BIT32_TEST(s->attrs_seen, code)) @@ -1301,24 +1383,15 @@ bgp_decode_attr(struct bgp_parse_state *s, uint code, uint flags, byte *data, ui } BIT32_SET(s->attrs_seen, code); - if (bgp_attr_known(code)) - { - const struct bgp_attr_desc *desc = &bgp_attr_table[code]; + ASSERT_DIE(bgp_attr_table[code].id); + const union bgp_attr_desc *desc = &bgp_attr_table[code]; - /* Handle conflicting flags; RFC 7606 3 (c) */ - if (((flags ^ desc->flags) & (BAF_OPTIONAL | BAF_TRANSITIVE)) && - !(desc->flags & BAF_DECODE_FLAGS)) - WITHDRAW("Malformed %s attribute - conflicting flags (%02x)", desc->name, flags); + /* Handle conflicting flags; RFC 7606 3 (c) */ + if (((flags ^ desc->flags) & (BAF_OPTIONAL | BAF_TRANSITIVE)) && + !(desc->flags & BAF_DECODE_FLAGS)) + WITHDRAW("Malformed %s attribute - conflicting flags (%02x, expected %02x)", desc->name, flags, desc->flags); - desc->decode(s, code, flags, data, len, to); - } - else /* Unknown attribute */ - { - if (!(flags & BAF_OPTIONAL)) - WITHDRAW("Unknown attribute (code %u) - conflicting flags (%02x)", code, flags); - - bgp_decode_unknown(s, code, flags, data, len, to); - } + desc->decode(s, code, flags, data, len, to); } /** @@ -1336,7 +1409,8 @@ bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len) { struct bgp_proto *p = s->proto; ea_list *attrs = NULL; - uint code, flags, alen; + uint alen; + byte code, flags; byte *pos = data; /* Parse the attributes */ @@ -1401,23 +1475,23 @@ bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len) /* Reject routes with our ASN in AS_PATH attribute */ if (bgp_as_path_loopy(p, attrs, p->local_as)) - goto withdraw; + goto loop; /* Reject routes with our Confederation ID in AS_PATH attribute; RFC 5065 4.0 */ if ((p->public_as != p->local_as) && bgp_as_path_loopy(p, attrs, p->public_as)) - goto withdraw; + goto loop; /* Reject routes with our Router ID in ORIGINATOR_ID attribute; RFC 4456 8 */ if (p->is_internal && bgp_originator_id_loopy(p, attrs)) - goto withdraw; + goto loop; /* Reject routes with our Cluster ID in CLUSTER_LIST attribute; RFC 4456 8 */ if (p->rr_client && bgp_cluster_list_loopy(p, attrs)) - goto withdraw; + goto loop; /* If there is no local preference, define one */ if (!BIT32_TEST(s->attrs_seen, BA_LOCAL_PREF)) - bgp_set_attr_u32(&attrs, s->pool, BA_LOCAL_PREF, 0, p->cf->default_local_pref); + bgp_set_attr_u32(&attrs, BA_LOCAL_PREF, 0, p->cf->default_local_pref); return attrs; @@ -1434,16 +1508,43 @@ withdraw: s->err_withdraw = 1; return NULL; + +loop: + /* Loops are handled as withdraws, but ignored silently. Do not set err_withdraw. */ + return NULL; } void -bgp_finish_attrs(struct bgp_parse_state *s, rta *a) +bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to) { /* AIGP test here instead of in bgp_decode_aigp() - we need to know channel */ if (BIT32_TEST(s->attrs_seen, BA_AIGP) && !s->channel->cf->aigp) { REPORT("Discarding AIGP attribute received on non-AIGP session"); - bgp_unset_attr(&a->eattrs, s->pool, BA_AIGP); + bgp_unset_attr(to, BA_AIGP); + } + + /* Handle OTC ingress procedure, RFC 9234 */ + if (bgp_channel_is_role_applicable(s->channel)) + { + struct bgp_proto *p = s->proto; + eattr *e = bgp_find_attr(*to, BA_ONLY_TO_CUSTOMER); + + /* Reject routes from downstream if they are leaked */ + if (e && (p->cf->local_role == BGP_ROLE_PROVIDER || + p->cf->local_role == BGP_ROLE_RS_SERVER)) + WITHDRAW("Route leak detected - OTC attribute from downstream"); + + /* Reject routes from peers if they are leaked */ + if (e && (p->cf->local_role == BGP_ROLE_PEER) && (e->u.data != p->cf->remote_as)) + WITHDRAW("Route leak detected - OTC attribute with mismatched ASN (%u)", + (uint) e->u.data); + + /* Mark routes from upstream if it did not happened before */ + if (!e && (p->cf->local_role == BGP_ROLE_CUSTOMER || + p->cf->local_role == BGP_ROLE_PEER || + p->cf->local_role == BGP_ROLE_RS_CLIENT)) + bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, 0, p->cf->remote_as); } } @@ -1458,13 +1559,13 @@ bgp_finish_attrs(struct bgp_parse_state *s, rta *a) #define RBH_FN(a,h) h #define RBH_REHASH bgp_rbh_rehash -#define RBH_PARAMS /8, *2, 2, 2, 8, 20 +#define RBH_PARAMS /8, *2, 2, 2, 12, 20 HASH_DEFINE_REHASH_FN(RBH, struct bgp_bucket) -void -bgp_init_bucket_table(struct bgp_channel *c) +static void +bgp_init_bucket_table(struct bgp_pending_tx *c) { HASH_INIT(c->bucket_hash, c->pool, 8); @@ -1472,24 +1573,8 @@ bgp_init_bucket_table(struct bgp_channel *c) c->withdraw_bucket = NULL; } -void -bgp_free_bucket_table(struct bgp_channel *c) -{ - HASH_FREE(c->bucket_hash); - - struct bgp_bucket *b; - WALK_LIST_FIRST(b, c->bucket_queue) - { - rem_node(&b->send_node); - mb_free(b); - } - - mb_free(c->withdraw_bucket); - c->withdraw_bucket = NULL; -} - static struct bgp_bucket * -bgp_get_bucket(struct bgp_channel *c, ea_list *new) +bgp_get_bucket(struct bgp_pending_tx *c, ea_list *new) { /* Hash and lookup */ u32 hash = ea_hash(new); @@ -1498,55 +1583,27 @@ bgp_get_bucket(struct bgp_channel *c, ea_list *new) if (b) return b; - uint ea_size = sizeof(ea_list) + new->count * sizeof(eattr); - uint ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN); - uint size = sizeof(struct bgp_bucket) + ea_size_aligned; - uint i; - byte *dest; - - /* Gather total size of non-inline attributes */ - for (i = 0; i < new->count; i++) - { - eattr *a = &new->attrs[i]; - - if (!(a->type & EAF_EMBEDDED)) - size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN); - } + /* Scan the list for total size */ + uint ea_size = BIRD_CPU_ALIGN(ea_list_size(new)); + uint size = sizeof(struct bgp_bucket) + ea_size; - /* Create the bucket */ + /* Allocate the bucket */ b = mb_alloc(c->pool, size); *b = (struct bgp_bucket) { }; init_list(&b->prefixes); b->hash = hash; - /* Copy list of extended attributes */ - memcpy(b->eattrs, new, ea_size); - dest = ((byte *) b->eattrs) + ea_size_aligned; - - /* Copy values of non-inline attributes */ - for (i = 0; i < new->count; i++) - { - eattr *a = &b->eattrs->attrs[i]; - - if (!(a->type & EAF_EMBEDDED)) - { - const struct adata *oa = a->u.ptr; - struct adata *na = (struct adata *) dest; - memcpy(na, oa, sizeof(struct adata) + oa->length); - a->u.ptr = na; - dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN); - } - } + /* Copy the ea_list */ + ea_list_copy(b->eattrs, new, ea_size); - /* Insert the bucket to send queue and bucket hash */ - add_tail(&c->bucket_queue, &b->send_node); + /* Insert the bucket to bucket hash */ HASH_INSERT2(c->bucket_hash, RBH, c->pool, b); return b; } static struct bgp_bucket * -bgp_get_withdraw_bucket(struct bgp_channel *c) +bgp_get_withdraw_bucket(struct bgp_pending_tx *c) { if (!c->withdraw_bucket) { @@ -1557,25 +1614,45 @@ bgp_get_withdraw_bucket(struct bgp_channel *c) return c->withdraw_bucket; } -void -bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b) +static void +bgp_free_bucket(struct bgp_pending_tx *c, struct bgp_bucket *b) { - rem_node(&b->send_node); HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b); mb_free(b); } +int +bgp_done_bucket(struct bgp_channel *bc, struct bgp_bucket *b) +{ + struct bgp_pending_tx *c = bc->ptx; + + /* Won't free the withdraw bucket */ + if (b == c->withdraw_bucket) + return 0; + + if (EMPTY_LIST(b->prefixes)) + rem_node(&b->send_node); + + if (b->px_uc || !EMPTY_LIST(b->prefixes)) + return 0; + + bgp_free_bucket(c, b); + return 1; +} + void -bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_defer_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { + struct bgp_pending_tx *c = bc->ptx; rem_node(&b->send_node); add_tail(&c->bucket_queue, &b->send_node); } void -bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) +bgp_withdraw_bucket(struct bgp_channel *bc, struct bgp_bucket *b) { - struct bgp_proto *p = (void *) c->c.proto; + struct bgp_proto *p = (void *) bc->c.proto; + struct bgp_pending_tx *c = bc->ptx; struct bgp_bucket *wb = bgp_get_withdraw_bucket(c); log(L_ERR "%s: Attribute list too long", p->p.name); @@ -1584,8 +1661,8 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) struct bgp_prefix *px = HEAD(b->prefixes); log(L_ERR "%s: - withdrawing %N", p->p.name, &px->net); - rem_node(&px->buck_node); - add_tail(&wb->prefixes, &px->buck_node); + rem_node(&px->buck_node_xx); + add_tail(&wb->prefixes, &px->buck_node_xx); } } @@ -1596,42 +1673,43 @@ bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) #define PXH_KEY(px) px->net, px->path_id, px->hash #define PXH_NEXT(px) px->next -#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && i1 == i2 && net_equal(n1, n2) +#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2) #define PXH_FN(n,i,h) h #define PXH_REHASH bgp_pxh_rehash -#define PXH_PARAMS /8, *2, 2, 2, 8, 24 +#define PXH_PARAMS /8, *2, 2, 2, 12, 24 HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix) -void -bgp_init_prefix_table(struct bgp_channel *c) +static void +bgp_init_prefix_table(struct bgp_channel *bc) { + struct bgp_pending_tx *c = bc->ptx; HASH_INIT(c->prefix_hash, c->pool, 8); - uint alen = net_addr_length[c->c.net_type]; + uint alen = net_addr_length[bc->c.net_type]; c->prefix_slab = alen ? sl_new(c->pool, sizeof(struct bgp_prefix) + alen) : NULL; } -void -bgp_free_prefix_table(struct bgp_channel *c) -{ - HASH_FREE(c->prefix_hash); - - rfree(c->prefix_slab); - c->prefix_slab = NULL; -} - static struct bgp_prefix * -bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id) +bgp_get_prefix(struct bgp_pending_tx *c, const net_addr *net, struct rte_src *src, int add_path_tx) { - u32 hash = net_hash(net) ^ u32_hash(path_id); - struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id, hash); + u32 path_id = src->global_id; + u32 path_id_hash = add_path_tx ? path_id : 0; + /* We must use a different hash function than the rtable */ + u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id_hash)); + struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id_hash, hash); if (px) { - rem_node(&px->buck_node); + if (!add_path_tx && (path_id != px->path_id)) + { + rt_unlock_source(rt_find_source_global(px->path_id)); + rt_lock_source(src); + px->path_id = path_id; + } + return px; } @@ -1644,34 +1722,321 @@ bgp_get_prefix(struct bgp_channel *c, const net_addr *net, u32 path_id) px->hash = hash; px->path_id = path_id; net_copy(px->net, net); + rt_lock_source(src); HASH_INSERT2(c->prefix_hash, PXH, c->pool, px); return px; } -void -bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px) +static void bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px); + +static inline int +bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *b) +{ +#define IS_WITHDRAW_BUCKET(b) ((b) == c->ptx->withdraw_bucket) +#define BPX_TRACE(what) do { \ + if (c->c.debug & D_ROUTES) log(L_TRACE "%s.%s < %s %N %uG %s", \ + c->c.proto->name, c->c.name, what, \ + px->net, px->path_id, IS_WITHDRAW_BUCKET(b) ? "withdraw" : "update"); } while (0) + px->lastmod = current_time(); + + /* Already queued for the same bucket */ + if (px->cur == b) + { + BPX_TRACE("already queued"); + return 0; + } + + /* Unqueue from the old bucket */ + if (px->cur) + { + rem_node(&px->buck_node_xx); + bgp_done_bucket(c, px->cur); + } + + /* The new bucket is the same as we sent before */ + if ((px->last == b) || c->c.out_table && !px->last && IS_WITHDRAW_BUCKET(b)) + { + if (px->cur) + BPX_TRACE("reverted"); + else + BPX_TRACE("already sent"); + + /* Well, we haven't sent anything yet */ + if (!px->last) + bgp_free_prefix(c->ptx, px); + + px->cur = NULL; + return 0; + } + + /* Enqueue the bucket if it has been empty */ + if (!IS_WITHDRAW_BUCKET(b) && EMPTY_LIST(b->prefixes)) + add_tail(&c->ptx->bucket_queue, &b->send_node); + + /* Enqueue to the new bucket and indicate the change */ + add_tail(&b->prefixes, &px->buck_node_xx); + px->cur = b; + + BPX_TRACE("queued"); + return 1; + +#undef BPX_TRACE +} + +static void +bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px) { - rem_node(&px->buck_node); HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px); + rt_unlock_source(rt_find_source_global(px->path_id)); + if (c->prefix_slab) - sl_free(c->prefix_slab, px); + sl_free(px); else mb_free(px); } +void +bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck) +{ + /* Cleanup: We're called from bucket senders. */ + ASSERT_DIE(px->cur == buck); + rem_node(&px->buck_node_xx); + + /* We may want to store the updates */ + if (c->c.out_table) + { + /* Nothing to be sent right now */ + px->cur = NULL; + + /* Unref the previous sent version */ + if (px->last) + px->last->px_uc--; + + /* Ref the current sent version */ + if (!IS_WITHDRAW_BUCKET(buck)) + { + px->last = buck; + px->last->px_uc++; + return; + } + + /* Prefixes belonging to the withdraw bucket are freed always */ + } + + bgp_free_prefix(c->ptx, px); +} + +static void +bgp_pending_tx_rfree(resource *r) +{ + struct bgp_pending_tx *ptx = SKIP_BACK(struct bgp_pending_tx, r, r); + + HASH_WALK(ptx->prefix_hash, next, n) + rt_unlock_source(rt_find_source_global(n->path_id)); + HASH_WALK_END; +} + +static void bgp_pending_tx_dump(resource *r UNUSED, unsigned indent UNUSED) { debug("\n"); } + +static struct resclass bgp_pending_tx_class = { + .name = "BGP Pending TX", + .size = sizeof(struct bgp_pending_tx), + .free = bgp_pending_tx_rfree, + .dump = bgp_pending_tx_dump, +}; + +void +bgp_init_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(!c->ptx); + + pool *p = rp_new(c->pool, "BGP Pending TX"); + c->ptx = ralloc(p, &bgp_pending_tx_class); + c->ptx->pool = p; + + bgp_init_bucket_table(c->ptx); + bgp_init_prefix_table(c); +} + +void +bgp_free_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(c->ptx); + ASSERT_DIE(c->ptx->pool); + + rfree(c->ptx->pool); + c->ptx = NULL; +} + + +/* + * Prefix hash table exporter + */ + +struct bgp_out_export_hook { + struct rt_export_hook h; + u32 hash_iter; /* Iterator over hash */ +}; + +static void +bgp_out_table_feed(void *data) +{ + struct bgp_out_export_hook *hook = data; + struct bgp_channel *bc = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->h.table); + struct bgp_pending_tx *c = bc->ptx; + + int max = 512; + + const net_addr *neq = (hook->h.req->addr_mode == TE_ADDR_EQUAL) ? hook->h.req->addr : NULL; + const net_addr *cand = NULL; + + do { + HASH_WALK_ITER(c->prefix_hash, PXH, n, hook->hash_iter) + { + switch (hook->h.req->addr_mode) + { + case TE_ADDR_IN: + if (!net_in_netX(n->net, hook->h.req->addr)) + continue; + /* fall through */ + case TE_ADDR_NONE: + /* Splitting only for multi-net exports */ + if (--max <= 0) + HASH_WALK_ITER_PUT; + break; + + case TE_ADDR_FOR: + if (!neq) + { + if (net_in_netX(hook->h.req->addr, n->net) && (!cand || (n->net->length > cand->length))) + cand = n->net; + continue; + } + /* fall through */ + case TE_ADDR_EQUAL: + if (!net_equal(n->net, neq)) + continue; + break; + } + + struct bgp_bucket *buck = n->cur ?: n->last; + ea_list *ea = NULL; + if (buck == c->withdraw_bucket) + ea_set_dest(&ea, 0, RTD_UNREACHABLE); + else + { + ea = buck->eattrs; + eattr *eanh = bgp_find_attr(ea, BA_NEXT_HOP); + ASSERT_DIE(eanh); + const ip_addr *nh = (const void *) eanh->u.ptr->data; + + struct nexthop_adata nhad = { + .ad = { .length = sizeof (struct nexthop_adata) - sizeof (struct adata), }, + .nh = { .gw = nh[0], }, + }; + + ea_set_attr(&ea, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, tmp_copy_adata(&nhad.ad))); + } + + struct rte_storage es = { + .rte = { + .attrs = ea, + .net = n->net, + .src = rt_find_source_global(n->path_id), + .sender = NULL, + .lastmod = n->lastmod, + .flags = n->cur ? REF_PENDING : 0, + }, + }; + + struct rt_pending_export rpe = { + .new = &es, .new_best = &es, + }; + + if (hook->h.req->export_bulk) + { + const rte *feed = &es.rte; + hook->h.req->export_bulk(hook->h.req, n->net, &rpe, &rpe, &feed, 1); + } + else if (hook->h.req->export_one) + hook->h.req->export_one(hook->h.req, n->net, &rpe); + else + bug("No export method in export request"); + } + HASH_WALK_ITER_END; + + neq = cand; + cand = NULL; + } while (neq); + + if (hook->hash_iter) + ev_schedule_work(&hook->h.event); + else + rt_set_export_state(&hook->h, TES_READY); +} + +static void +bgp_out_table_export_start(struct rt_exporter *re, struct rt_export_request *req) +{ + req->hook = rt_alloc_export(re, sizeof(struct bgp_out_export_hook)); + req->hook->req = req; + + struct bgp_out_export_hook *hook = SKIP_BACK(struct bgp_out_export_hook, h, req->hook); + + hook->h.event.hook = bgp_out_table_feed; + rt_init_export(re, req->hook); +} + +static void +bgp_out_table_export_done(void *data) +{ + struct bgp_out_export_hook *hook = data; + struct rt_export_request *req = hook->h.req; + void (*stopped)(struct rt_export_request *) = hook->h.stopped; + + rt_export_stopped(&hook->h); + CALL(stopped, req); +} + +static const struct rt_exporter_class bgp_out_table_export_class = { + .start = bgp_out_table_export_start, + .done = bgp_out_table_export_done, +}; + +void +bgp_setup_out_table(struct bgp_channel *c) +{ + ASSERT_DIE(c->c.out_table == NULL); + + c->prefix_exporter = (struct rt_exporter) { + .class = &bgp_out_table_export_class, + .addr_type = c->c.table->addr_type, + .rp = c->c.proto->pool, + }; + + rt_exporter_init(&c->prefix_exporter); + + c->c.out_table = &c->prefix_exporter; +} + /* * BGP protocol glue */ int -bgp_preexport(struct channel *c, rte *e) +bgp_preexport(struct channel *C, rte *e) { - struct bgp_proto *p = (struct bgp_proto *) (c->proto); + struct bgp_proto *p = (struct bgp_proto *) C->proto; struct bgp_proto *src = bgp_rte_proto(e); + struct bgp_channel *c = (struct bgp_channel *) C; + + /* Ignore non-BGP channels */ + if (C->channel != &channel_bgp) + return -1; /* Reject our routes */ if (src == p) @@ -1681,6 +2046,22 @@ bgp_preexport(struct channel *c, rte *e) if (src == NULL) return 0; + /* Reject flowspec that failed validation */ + if (net_is_flow(e->net)) + switch (rt_get_flowspec_valid(e)) + { + case FLOWSPEC_VALID: + break; + case FLOWSPEC_INVALID: + return -1; + case FLOWSPEC_UNKNOWN: + ASSUME((rt_get_source_attr(e) != RTS_BGP) || + !((struct bgp_channel *) SKIP_BACK(struct channel, in_req, e->sender->req))->base_table); + break; + case FLOWSPEC__MAX: + bug("This never happens."); + } + /* IBGP route reflection, RFC 4456 */ if (p->is_internal && src->is_internal && (p->local_as == src->local_as)) { @@ -1690,16 +2071,16 @@ bgp_preexport(struct channel *c, rte *e) /* Generally, this should be handled when path is received, but we check it also here as rr_cluster_id may be undefined or different in src. */ - if (p->rr_cluster_id && bgp_cluster_list_loopy(p, e->attrs->eattrs)) + if (p->rr_cluster_id && bgp_cluster_list_loopy(p, e->attrs)) return -1; } /* Handle well-known communities, RFC 1997 */ - struct eattr *com; + struct eattr *a; if (p->cf->interpret_communities && - (com = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)))) + (a = bgp_find_attr(e->attrs, BA_COMMUNITY))) { - const struct adata *d = com->u.ptr; + const struct adata *d = a->u.ptr; /* Do not export anywhere */ if (int_set_contains(d, BGP_COMM_NO_ADVERTISE)) @@ -1718,6 +2099,16 @@ bgp_preexport(struct channel *c, rte *e) return -1; } + /* Do not export routes marked with OTC to upstream, RFC 9234 */ + if (bgp_channel_is_role_applicable(c)) + { + a = bgp_find_attr(e->attrs, BA_ONLY_TO_CUSTOMER); + if (a && (p->cf->local_role==BGP_ROLE_CUSTOMER || + p->cf->local_role==BGP_ROLE_PEER || + p->cf->local_role==BGP_ROLE_RS_CLIENT)) + return -1; + } + return 0; } @@ -1732,7 +2123,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at /* ORIGIN attribute - mandatory, attach if missing */ if (! bgp_find_attr(attrs0, BA_ORIGIN)) - bgp_set_attr_u32(&attrs, pool, BA_ORIGIN, 0, src ? ORIGIN_INCOMPLETE : ORIGIN_IGP); + bgp_set_attr_u32(&attrs, BA_ORIGIN, 0, src ? ORIGIN_INCOMPLETE : ORIGIN_IGP); /* AS_PATH attribute - mandatory */ a = bgp_find_attr(attrs0, BA_AS_PATH); @@ -1747,24 +2138,24 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at { /* IBGP or route server -> just ensure there is one */ if (!a) - bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, &null_adata); + bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, &null_adata); } else if (p->is_interior) { /* Confederation -> prepend ASN as AS_CONFED_SEQUENCE */ ad = as_path_prepend2(pool, ad, AS_PATH_CONFED_SEQUENCE, p->public_as); - bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, ad); + bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, ad); } else /* Regular EBGP (no RS, no confederation) */ { /* Regular EBGP -> prepend ASN as regular sequence */ ad = as_path_prepend2(pool, ad, AS_PATH_SEQUENCE, p->public_as); - bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, ad); + bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, ad); /* MULTI_EXIT_DESC attribute - accept only if set in export filter */ a = bgp_find_attr(attrs0, BA_MULTI_EXIT_DISC); - if (a && !(a->type & EAF_FRESH)) - bgp_unset_attr(&attrs, pool, BA_MULTI_EXIT_DISC); + if (a && !a->fresh && !p->cf->allow_med) + bgp_unset_attr(&attrs, BA_MULTI_EXIT_DISC); } /* NEXT_HOP attribute - delegated to AF-specific hook */ @@ -1773,16 +2164,16 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at /* LOCAL_PREF attribute - required for IBGP, attach if missing */ if (p->is_interior && ! bgp_find_attr(attrs0, BA_LOCAL_PREF)) - bgp_set_attr_u32(&attrs, pool, BA_LOCAL_PREF, 0, p->cf->default_local_pref); + bgp_set_attr_u32(&attrs, BA_LOCAL_PREF, 0, p->cf->default_local_pref); /* AIGP attribute - accumulate local metric or originate new one */ u64 metric; if (s.local_next_hop && - (bgp_total_aigp_metric_(e->attrs, &metric, &ad) || + (bgp_total_aigp_metric_(e, &metric, &ad) || (c->cf->aigp_originate && bgp_init_aigp_metric(e, &metric, &ad)))) { ad = bgp_aigp_set_metric(pool, ad, metric); - bgp_set_attr_ptr(&attrs, pool, BA_AIGP, 0, ad); + bgp_set_attr_ptr(&attrs, BA_AIGP, 0, ad); } /* IBGP route reflection, RFC 4456 */ @@ -1790,7 +2181,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at { /* ORIGINATOR_ID attribute - attach if not already set */ if (! bgp_find_attr(attrs0, BA_ORIGINATOR_ID)) - bgp_set_attr_u32(&attrs, pool, BA_ORIGINATOR_ID, 0, src->remote_id); + bgp_set_attr_u32(&attrs, BA_ORIGINATOR_ID, 0, src->remote_id); /* CLUSTER_LIST attribute - prepend cluster ID */ a = bgp_find_attr(attrs0, BA_CLUSTER_LIST); @@ -1805,7 +2196,7 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at ad = int_set_prepend(pool, ad, p->rr_cluster_id); /* Should be at least one prepended cluster ID */ - bgp_set_attr_ptr(&attrs, pool, BA_CLUSTER_LIST, 0, ad); + bgp_set_attr_ptr(&attrs, BA_CLUSTER_LIST, 0, ad); } /* AS4_* transition attributes, RFC 6793 4.2.2 */ @@ -1814,18 +2205,28 @@ bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *at a = bgp_find_attr(attrs, BA_AS_PATH); if (a && as_path_contains_as4(a->u.ptr)) { - bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, as_path_to_old(pool, a->u.ptr)); - bgp_set_attr_ptr(&attrs, pool, BA_AS4_PATH, 0, as_path_strip_confed(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, as_path_to_old(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, BA_AS4_PATH, 0, as_path_strip_confed(pool, a->u.ptr)); } a = bgp_find_attr(attrs, BA_AGGREGATOR); if (a && aggregator_contains_as4(a->u.ptr)) { - bgp_set_attr_ptr(&attrs, pool, BA_AGGREGATOR, 0, aggregator_to_old(pool, a->u.ptr)); - bgp_set_attr_ptr(&attrs, pool, BA_AS4_AGGREGATOR, 0, a->u.ptr); + bgp_set_attr_ptr(&attrs, BA_AGGREGATOR, 0, aggregator_to_old(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, BA_AS4_AGGREGATOR, 0, a->u.ptr); } } + /* Mark routes for downstream with OTC, RFC 9234 */ + if (bgp_channel_is_role_applicable(c)) + { + a = bgp_find_attr(attrs, BA_ONLY_TO_CUSTOMER); + if (!a && (p->cf->local_role == BGP_ROLE_PROVIDER || + p->cf->local_role == BGP_ROLE_PEER || + p->cf->local_role == BGP_ROLE_RS_SERVER)) + bgp_set_attr_u32(&attrs, BA_ONLY_TO_CUSTOMER, 0, p->public_as); + } + /* * Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above * conditions. Presence and validity of quasi-mandatory NEXT_HOP attribute @@ -1842,34 +2243,39 @@ bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, c struct bgp_proto *p = (void *) P; struct bgp_channel *c = (void *) C; struct bgp_bucket *buck; - struct bgp_prefix *px; - u32 path; + struct rte_src *path; + + /* Ignore non-BGP channels */ + if (C->channel != &channel_bgp) + return; if (new) { - struct ea_list *attrs = bgp_update_attrs(p, c, new, new->attrs->eattrs, C->rte_update_pool); + struct ea_list *attrs = bgp_update_attrs(p, c, new, new->attrs, tmp_linpool); + + /* Error during attribute processing */ + if (!attrs) + log(L_ERR "%s: Invalid route %N withdrawn", p->p.name, n); /* If attributes are invalid, we fail back to withdraw */ - buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c); - path = new->src->global_id; + buck = attrs ? bgp_get_bucket(c->ptx, attrs) : bgp_get_withdraw_bucket(c->ptx); + path = new->src; } else { - buck = bgp_get_withdraw_bucket(c); - path = old->src->global_id; + buck = bgp_get_withdraw_bucket(c->ptx); + path = old->src; } - px = bgp_get_prefix(c, n, c->add_path_tx ? path : 0); - add_tail(&buck->prefixes, &px->buck_node); - - bgp_schedule_packet(p->conn, c, PKT_UPDATE); + if (bgp_update_prefix(c, bgp_get_prefix(c->ptx, n, path, c->add_path_tx), buck)) + bgp_schedule_packet(p->conn, c, PKT_UPDATE); } static inline u32 -bgp_get_neighbor(rte *r) +bgp_get_neighbor(const rte *r) { - eattr *e = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + eattr *e = ea_find(r->attrs, BGP_EA_ID(BA_AS_PATH)); u32 as; if (e && as_path_get_first_regular(e->u.ptr, &as)) @@ -1881,30 +2287,14 @@ bgp_get_neighbor(rte *r) } static inline int -rte_stale(rte *r) +rte_stale(const rte *r) { - if (r->pflags & BGP_REF_STALE) - return 1; - - if (r->pflags & BGP_REF_NOT_STALE) - return 0; - - /* If staleness is unknown, compute and cache it */ - eattr *a = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)); - if (a && int_set_contains(a->u.ptr, BGP_COMM_LLGR_STALE)) - { - r->pflags |= BGP_REF_STALE; - return 1; - } - else - { - r->pflags |= BGP_REF_NOT_STALE; - return 0; - } + eattr *a = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY)); + return a && int_set_contains(a->u.ptr, BGP_COMM_LLGR_STALE); } int -bgp_rte_better(rte *new, rte *old) +bgp_rte_better(const rte *new, const rte *old) { struct bgp_proto *new_bgp = bgp_rte_proto(new); struct bgp_proto *old_bgp = bgp_rte_proto(old); @@ -1920,8 +2310,8 @@ bgp_rte_better(rte *new, rte *old) return 1; /* RFC 4271 9.1.2.1. Route resolvability test */ - n = rta_resolvable(new->attrs); - o = rta_resolvable(old->attrs); + n = rte_resolvable(new); + o = rte_resolvable(old); if (n > o) return 1; if (n < o) @@ -1936,8 +2326,8 @@ bgp_rte_better(rte *new, rte *old) return 1; /* Start with local preferences */ - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); + x = ea_find(new->attrs, BGP_EA_ID(BA_LOCAL_PREF)); + y = ea_find(old->attrs, BGP_EA_ID(BA_LOCAL_PREF)); n = x ? x->u.data : new_bgp->cf->default_local_pref; o = y ? y->u.data : old_bgp->cf->default_local_pref; if (n > o) @@ -1946,8 +2336,8 @@ bgp_rte_better(rte *new, rte *old) return 0; /* RFC 7311 4.1 - Apply AIGP metric */ - u64 n2 = bgp_total_aigp_metric(new->attrs); - u64 o2 = bgp_total_aigp_metric(old->attrs); + u64 n2 = bgp_total_aigp_metric(new); + u64 o2 = bgp_total_aigp_metric(old); if (n2 < o2) return 1; if (n2 > o2) @@ -1956,8 +2346,8 @@ bgp_rte_better(rte *new, rte *old) /* RFC 4271 9.1.2.2. a) Use AS path lengths */ if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths) { - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + x = ea_find(new->attrs, BGP_EA_ID(BA_AS_PATH)); + y = ea_find(old->attrs, BGP_EA_ID(BA_AS_PATH)); n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; if (n < o) @@ -1967,8 +2357,8 @@ bgp_rte_better(rte *new, rte *old) } /* RFC 4271 9.1.2.2. b) Use origins */ - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); + x = ea_find(new->attrs, BGP_EA_ID(BA_ORIGIN)); + y = ea_find(old->attrs, BGP_EA_ID(BA_ORIGIN)); n = x ? x->u.data : ORIGIN_INCOMPLETE; o = y ? y->u.data : ORIGIN_INCOMPLETE; if (n < o) @@ -1990,8 +2380,8 @@ bgp_rte_better(rte *new, rte *old) if (new_bgp->cf->med_metric || old_bgp->cf->med_metric || (bgp_get_neighbor(new) == bgp_get_neighbor(old))) { - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); + x = ea_find(new->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC)); + y = ea_find(old->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC)); n = x ? x->u.data : new_bgp->cf->default_med; o = y ? y->u.data : old_bgp->cf->default_med; if (n < o) @@ -2007,8 +2397,8 @@ bgp_rte_better(rte *new, rte *old) return 1; /* RFC 4271 9.1.2.2. e) Compare IGP metrics */ - n = new_bgp->cf->igp_metric ? new->attrs->igp_metric : 0; - o = old_bgp->cf->igp_metric ? old->attrs->igp_metric : 0; + n = new_bgp->cf->igp_metric ? rt_get_igp_metric(new) : 0; + o = old_bgp->cf->igp_metric ? rt_get_igp_metric(old) : 0; if (n < o) return 1; if (n > o) @@ -2016,8 +2406,8 @@ bgp_rte_better(rte *new, rte *old) /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */ /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighbor ID */ - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID)); + x = ea_find(new->attrs, BGP_EA_ID(BA_ORIGINATOR_ID)); + y = ea_find(old->attrs, BGP_EA_ID(BA_ORIGINATOR_ID)); n = x ? x->u.data : new_bgp->remote_id; o = y ? y->u.data : old_bgp->remote_id; @@ -2034,8 +2424,8 @@ bgp_rte_better(rte *new, rte *old) return 0; /* RFC 4456 9. b) Compare cluster list lengths */ - x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_CLUSTER_LIST)); - y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_CLUSTER_LIST)); + x = ea_find(new->attrs, BGP_EA_ID(BA_CLUSTER_LIST)); + y = ea_find(old->attrs, BGP_EA_ID(BA_CLUSTER_LIST)); n = x ? int_set_get_size(x->u.ptr) : 0; o = y ? int_set_get_size(y->u.ptr) : 0; if (n < o) @@ -2049,7 +2439,7 @@ bgp_rte_better(rte *new, rte *old) int -bgp_rte_mergable(rte *pri, rte *sec) +bgp_rte_mergable(const rte *pri, const rte *sec) { struct bgp_proto *pri_bgp = bgp_rte_proto(pri); struct bgp_proto *sec_bgp = bgp_rte_proto(sec); @@ -2057,17 +2447,20 @@ bgp_rte_mergable(rte *pri, rte *sec) u32 p, s; /* Skip suppressed routes (see bgp_rte_recalculate()) */ - /* LLGR draft - depreference stale routes */ - if (pri->pflags != sec->pflags) + if ((pri->pflags ^ sec->pflags) & BGP_REF_SUPPRESSED) return 0; /* RFC 4271 9.1.2.1. Route resolvability test */ - if (rta_resolvable(pri->attrs) != rta_resolvable(sec->attrs)) + if (rte_resolvable(pri) != rte_resolvable(sec)) + return 0; + + /* LLGR draft - depreference stale routes */ + if (rte_stale(pri) != rte_stale(sec)) return 0; /* Start with local preferences */ - x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); - y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); + x = ea_find(pri->attrs, BGP_EA_ID(BA_LOCAL_PREF)); + y = ea_find(sec->attrs, BGP_EA_ID(BA_LOCAL_PREF)); p = x ? x->u.data : pri_bgp->cf->default_local_pref; s = y ? y->u.data : sec_bgp->cf->default_local_pref; if (p != s) @@ -2076,8 +2469,8 @@ bgp_rte_mergable(rte *pri, rte *sec) /* RFC 4271 9.1.2.2. a) Use AS path lengths */ if (pri_bgp->cf->compare_path_lengths || sec_bgp->cf->compare_path_lengths) { - x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); - y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); + x = ea_find(pri->attrs, BGP_EA_ID(BA_AS_PATH)); + y = ea_find(sec->attrs, BGP_EA_ID(BA_AS_PATH)); p = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; s = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; @@ -2089,8 +2482,8 @@ bgp_rte_mergable(rte *pri, rte *sec) } /* RFC 4271 9.1.2.2. b) Use origins */ - x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); - y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); + x = ea_find(pri->attrs, BGP_EA_ID(BA_ORIGIN)); + y = ea_find(sec->attrs, BGP_EA_ID(BA_ORIGIN)); p = x ? x->u.data : ORIGIN_INCOMPLETE; s = y ? y->u.data : ORIGIN_INCOMPLETE; if (p != s) @@ -2100,8 +2493,8 @@ bgp_rte_mergable(rte *pri, rte *sec) if (pri_bgp->cf->med_metric || sec_bgp->cf->med_metric || (bgp_get_neighbor(pri) == bgp_get_neighbor(sec))) { - x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); - y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); + x = ea_find(pri->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC)); + y = ea_find(sec->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC)); p = x ? x->u.data : pri_bgp->cf->default_med; s = y ? y->u.data : sec_bgp->cf->default_med; if (p != s) @@ -2113,8 +2506,8 @@ bgp_rte_mergable(rte *pri, rte *sec) return 0; /* RFC 4271 9.1.2.2. e) Compare IGP metrics */ - p = pri_bgp->cf->igp_metric ? pri->attrs->igp_metric : 0; - s = sec_bgp->cf->igp_metric ? sec->attrs->igp_metric : 0; + p = pri_bgp->cf->igp_metric ? rt_get_igp_metric(pri) : 0; + s = sec_bgp->cf->igp_metric ? rt_get_igp_metric(sec) : 0; if (p != s) return 0; @@ -2127,7 +2520,7 @@ bgp_rte_mergable(rte *pri, rte *sec) static inline int same_group(rte *r, u32 lpref, u32 lasn) { - return (r->attrs->pref == lpref) && (bgp_get_neighbor(r) == lasn); + return (rt_get_preference(r) == lpref) && (bgp_get_neighbor(r) == lasn); } static inline int @@ -2138,10 +2531,10 @@ use_deterministic_med(struct rte_storage *r) } int -bgp_rte_recalculate(rtable_private *table, net *net, rte *new, rte *old, rte *old_best) +bgp_rte_recalculate(struct rtable_private *table, net *net, rte *new, rte *old, rte *old_best) { rte *key = new ? new : old; - u32 lpref = key->attrs->pref; + u32 lpref = rt_get_preference(key); u32 lasn = bgp_get_neighbor(key); int old_suppressed = old ? !!(old->pflags & BGP_REF_SUPPRESSED) : 0; @@ -2208,7 +2601,7 @@ bgp_rte_recalculate(rtable_private *table, net *net, rte *new, rte *old, rte *ol /* The default case - find a new best-in-group route */ rte *r = new; /* new may not be in the list */ - for (struct rte_storage *s = net->routes; rte_is_valid(&s->rte); s = s->next) + for (struct rte_storage *s = net->routes; rte_is_valid(RTE_OR_NULL(s)); s = s->next) if (use_deterministic_med(s) && same_group(&s->rte, lpref, lasn)) { s->rte.pflags |= BGP_REF_SUPPRESSED; @@ -2225,7 +2618,7 @@ bgp_rte_recalculate(rtable_private *table, net *net, rte *new, rte *old, rte *ol new->pflags &= ~BGP_REF_SUPPRESSED; /* Found all existing routes mergable with best-in-group */ - for (struct rte_storage *s = net->routes; rte_is_valid(&s->rte); s = s->next) + for (struct rte_storage *s = net->routes; rte_is_valid(RTE_OR_NULL(s)); s = s->next) if (use_deterministic_med(s) && same_group(&s->rte, lpref, lasn)) if ((&s->rte != r) && bgp_rte_mergable(r, &s->rte)) s->rte.pflags &= ~BGP_REF_SUPPRESSED; @@ -2264,43 +2657,58 @@ bgp_rte_recalculate(rtable_private *table, net *net, rte *new, rte *old, rte *ol } void -bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe UNUSED, rte **feed, uint count) +bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, + struct rt_pending_export *first, struct rt_pending_export *last, + const rte **feed, uint count) { struct bgp_channel *c = SKIP_BACK(struct bgp_channel, stale_feed, req); + struct rt_import_hook *irh = c->c.in_req.hook; - do { - rte *r = feed[--count]; - if (r->sender != c->c.in_req.hook) - continue; + /* Find our routes among others */ + for (uint i=0; i<count; i++) + { + const rte *r = feed[i]; - /* A new route, do not mark as stale */ - if (r->stale_cycle == c->c.in_req.hook->stale_set) + if ( + !rte_is_valid(r) || /* Not a valid route */ + (r->sender != irh) || /* Not our route */ + (r->stale_cycle == irh->stale_set)) /* A new route, do not mark as stale */ continue; - eattr *ea = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)); + eattr *ea = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY)); const struct adata *ad = ea ? ea->u.ptr : NULL; uint flags = ea ? ea->flags : BAF_PARTIAL; - rte e0 = *r; - e0.flags |= REF_USE_STALE; - + /* LLGR not allowed, withdraw the route */ if (ad && int_set_contains(ad, BGP_COMM_NO_LLGR)) + { rte_import(&c->c.in_req, n, NULL, r->src); + continue; + } - else if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) - rte_import(&c->c.in_req, n, &e0, r->src); + /* Route already marked as LLGR, do nothing */ + if (ad && int_set_contains(ad, BGP_COMM_LLGR_STALE)) + continue; - else { - rta *a = e0.attrs = rta_do_cow(r->attrs, c->c.rte_update_pool); + /* Store the tmp_linpool state to aggresively save memory */ + struct lp_state tmpp; + lp_save(tmp_linpool, &tmpp); - bgp_set_attr_ptr(&(a->eattrs), c->c.rte_update_pool, BA_COMMUNITY, flags, - int_set_add(c->c.rte_update_pool, ad, BGP_COMM_LLGR_STALE)); - e0.pflags |= BGP_REF_STALE; + /* Mark the route as LLGR */ + rte e0 = *r; + bgp_set_attr_ptr(&e0.attrs, BA_COMMUNITY, flags, int_set_add(tmp_linpool, ad, BGP_COMM_LLGR_STALE)); - rte_import(&c->c.in_req, n, &e0, r->src); - lp_flush(c->c.rte_update_pool); - } - } while (count); + /* We need to update the route but keep it stale. */ + ASSERT_DIE(irh->stale_set == irh->stale_valid + 1); + irh->stale_set--; + rte_import(&c->c.in_req, n, &e0, r->src); + irh->stale_set++; + + /* Restore the memory state */ + lp_restore(tmp_linpool, &tmpp); + } + + rpe_mark_seen_all(req->hook, first, last, NULL); } @@ -2316,8 +2724,8 @@ bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool) eattr *a4 = bgp_find_attr(*attrs, BA_AS4_AGGREGATOR); /* First, unset AS4_* attributes */ - if (p4) bgp_unset_attr(attrs, pool, BA_AS4_PATH); - if (a4) bgp_unset_attr(attrs, pool, BA_AS4_AGGREGATOR); + if (p4) bgp_unset_attr(attrs, BA_AS4_PATH); + if (a4) bgp_unset_attr(attrs, BA_AS4_AGGREGATOR); /* Handle AGGREGATOR attribute */ if (a2 && a4) @@ -2350,60 +2758,37 @@ bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool) } } -int -bgp_get_attr(const eattr *a, byte *buf, int buflen) -{ - uint i = EA_ID(a->id); - const struct bgp_attr_desc *d; - int len; - - if (bgp_attr_known(i)) - { - d = &bgp_attr_table[i]; - len = bsprintf(buf, "%s", d->name); - buf += len; - if (d->format) - { - *buf++ = ':'; - *buf++ = ' '; - d->format(a, buf, buflen - len - 2); - return GA_FULL; - } - return GA_NAME; - } - - bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? " [t]" : ""); - return GA_NAME; -} - void -bgp_get_route_info(rte *e, byte *buf) +bgp_get_route_info(const rte *e, byte *buf) { - eattr *p = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); - eattr *o = ea_find(e->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); + eattr *p = ea_find(e->attrs, BGP_EA_ID(BA_AS_PATH)); + eattr *o = ea_find(e->attrs, BGP_EA_ID(BA_ORIGIN)); u32 origas; - buf += bsprintf(buf, " (%d", e->attrs->pref); + buf += bsprintf(buf, " (%d", rt_get_preference(e)); - if (e->pflags & BGP_REF_SUPPRESSED) - buf += bsprintf(buf, "-"); + if (!net_is_flow(e->net)) + { + if (e->pflags & BGP_REF_SUPPRESSED) + buf += bsprintf(buf, "-"); - if (rte_stale(e)) - buf += bsprintf(buf, "s"); + if (rte_stale(e)) + buf += bsprintf(buf, "s"); - u64 metric = bgp_total_aigp_metric(e->attrs); - if (metric < BGP_AIGP_MAX) - { - buf += bsprintf(buf, "/%lu", metric); - } - else if (e->attrs->igp_metric) - { - if (!rta_resolvable(e->attrs)) - buf += bsprintf(buf, "/-"); - else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN) - buf += bsprintf(buf, "/?"); - else - buf += bsprintf(buf, "/%d", e->attrs->igp_metric); + u64 metric = bgp_total_aigp_metric(e); + if (metric < BGP_AIGP_MAX) + { + buf += bsprintf(buf, "/%lu", metric); + } + else if (metric = rt_get_igp_metric(e)) + { + if (!rte_resolvable(e)) + buf += bsprintf(buf, "/-"); + else if (metric >= IGP_METRIC_UNKNOWN) + buf += bsprintf(buf, "/?"); + else + buf += bsprintf(buf, "/%d", metric); + } } buf += bsprintf(buf, ") ["); diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index e2754649..cda0eb8d 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -101,7 +101,9 @@ * RFC 8203 - BGP Administrative Shutdown Communication * RFC 8212 - Default EBGP Route Propagation Behavior without Policies * RFC 8654 - Extended Message Support for BGP - * draft-ietf-idr-ext-opt-param-07 + * RFC 9072 - Extended Optional Parameters Length for BGP OPEN Message + * RFC 9117 - Revised Validation Procedure for BGP Flow Specifications + * RFC 9234 - Route Leak Prevention and Detection Using Roles * draft-uttaro-idr-bgp-persistence-04 * draft-walton-bgp-hostname-capability-02 */ @@ -113,7 +115,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "nest/locks.h" #include "conf/conf.h" @@ -124,8 +126,13 @@ #include "bgp.h" -/* Global list of listening sockets */ -static list STATIC_LIST_INIT(bgp_sockets); +static void bgp_listen_create(void *); + +static list STATIC_LIST_INIT(bgp_sockets); /* Global list of listening sockets */ +static list STATIC_LIST_INIT(bgp_listen_pending); /* Global list of listening socket open requests */ +static event bgp_listen_event = { .hook = bgp_listen_create }; + +DOMAIN(rtable) bgp_listen_domain; static void bgp_connect(struct bgp_proto *p); static void bgp_active(struct bgp_proto *p); @@ -136,14 +143,69 @@ static void bgp_update_bfd(struct bgp_proto *p, const struct bfd_options *bfd); static int bgp_incoming_connection(sock *sk, uint dummy UNUSED); static void bgp_listen_sock_err(sock *sk UNUSED, int err); +static void bgp_initiate_disable(struct bgp_proto *p, int err_val); static void bgp_graceful_restart_feed(struct bgp_channel *c); -static inline void channel_refresh_end_reload(struct channel *c) + + +static inline int +bgp_setup_auth(struct bgp_proto *p, int enable) { - channel_refresh_end(c); + /* Beware. This is done from main_birdloop and protocol birdloop is NOT ENTERED. + * Anyway, we are only accessing: + * - protocol config which can be changed only from main_birdloop (reconfig) + * - protocol listen socket which is always driven by main_birdloop + * - protocol name which is set on reconfig + */ + + if (p->cf->password && p->listen.sock) + { + ip_addr prefix = p->cf->remote_ip; + int pxlen = -1; + + if (p->cf->remote_range) + { + prefix = net_prefix(p->cf->remote_range); + pxlen = net_pxlen(p->cf->remote_range); + } + + int rv = sk_set_md5_auth(p->listen.sock->sk, + p->cf->local_ip, prefix, pxlen, p->cf->iface, + enable ? p->cf->password : NULL, p->cf->setkey); + + if (rv < 0) + sk_log_error(p->listen.sock->sk, p->p.name); - if (c->in_table) - channel_request_reload(c); + return rv; + } + else + return 0; +} + +/** + * bgp_close - close a BGP instance + * @p: BGP instance + * + * This function frees and deconfigures shared BGP resources. + */ +static void +bgp_close(struct bgp_proto *p) +{ + LOCK_DOMAIN(rtable, bgp_listen_domain); + + struct bgp_listen_request *req = &p->listen; + struct bgp_socket *bs = req->sock; + + if (bs) + { + req->sock = NULL; + rem_node(&req->n); + + if (bs && EMPTY_LIST(bs->requests)) + ev_send(&global_event_list, &bgp_listen_event); + } + + UNLOCK_DOMAIN(rtable, bgp_listen_domain); } /** @@ -155,121 +217,136 @@ static inline void channel_refresh_end_reload(struct channel *c) * is acquired and neighbor is ready). When error, caller should change state to * PS_DOWN and return immediately. */ -static int +static void bgp_open(struct bgp_proto *p) { - ASSERT_DIE(birdloop_inside(&main_birdloop)); - - struct bgp_socket *bs = NULL; - struct iface *ifa = p->cf->strict_bind ? p->cf->iface : NULL; - ip_addr addr = p->cf->strict_bind ? p->cf->local_ip : - (p->ipv4 ? IPA_NONE4 : IPA_NONE6); - uint port = p->cf->local_port; - uint flags = p->cf->free_bind ? SKF_FREEBIND : 0; - uint flag_mask = SKF_FREEBIND; + LOCK_DOMAIN(rtable, bgp_listen_domain); + struct bgp_listen_request *req = &p->listen; /* We assume that cf->iface is defined iff cf->local_ip is link-local */ + req->iface = p->cf->strict_bind ? p->cf->iface : NULL; + req->vrf = p->p.vrf; + req->addr = p->cf->strict_bind ? p->cf->local_ip : + (p->ipv4 ? IPA_NONE4 : IPA_NONE6); + req->port = p->cf->local_port; + req->flags = p->cf->free_bind ? SKF_FREEBIND : 0; - WALK_LIST(bs, bgp_sockets) - if (ipa_equal(bs->sk->saddr, addr) && - (bs->sk->sport == port) && - (bs->sk->iface == ifa) && - (bs->sk->vrf == p->p.vrf) && - ((bs->sk->flags & flag_mask) == flags)) - { - bs->uc++; - p->sock = bs; - return 0; - } - - sock *sk = sk_new(proto_pool); - sk->type = SK_TCP_PASSIVE; - sk->ttl = 255; - sk->saddr = addr; - sk->sport = port; - sk->iface = ifa; - sk->vrf = p->p.vrf; - sk->flags = flags | SKF_PASSIVE_THREAD; - sk->tos = IP_PREC_INTERNET_CONTROL; - sk->rbsize = BGP_RX_BUFFER_SIZE; - sk->tbsize = BGP_TX_BUFFER_SIZE; - sk->rx_hook = bgp_incoming_connection; - sk->err_hook = bgp_listen_sock_err; - - if (sk_open(sk) < 0) - goto err; - - bs = mb_allocz(proto_pool, sizeof(struct bgp_socket)); - bs->sk = sk; - bs->uc = 1; - p->sock = bs; - sk->data = bs; + BGP_TRACE(D_EVENTS, "Requesting listen socket at %I%J port %u", req->addr, req->iface, req->port); - add_tail(&bgp_sockets, &bs->n); + add_tail(&bgp_listen_pending, &req->n); + ev_send(&global_event_list, &bgp_listen_event); - return 0; - -err: - sk_log_error(sk, p->p.name); - log(L_ERR "%s: Cannot open listening socket", p->p.name); - rfree(sk); - return -1; + UNLOCK_DOMAIN(rtable, bgp_listen_domain); } -/** - * bgp_close - close a BGP instance - * @p: BGP instance - * - * This function frees and deconfigures shared BGP resources. - */ static void -bgp_close(struct bgp_proto *p) +bgp_listen_create(void *_ UNUSED) { ASSERT_DIE(birdloop_inside(&main_birdloop)); - struct bgp_socket *bs = p->sock; - - ASSERT(bs && bs->uc); + uint flag_mask = SKF_FREEBIND; - if (--bs->uc) - return; + while (1) { + LOCK_DOMAIN(rtable, bgp_listen_domain); - rfree(bs->sk); - rem_node(&bs->n); - mb_free(bs); -} + if (EMPTY_LIST(bgp_listen_pending)) + { + UNLOCK_DOMAIN(rtable, bgp_listen_domain); + break; + } -static inline int -bgp_setup_auth(struct bgp_proto *p, int enable) -{ - if (p->cf->password) - { - ip_addr prefix = p->cf->remote_ip; - int pxlen = -1; + /* Get the first request to match */ + struct bgp_listen_request *req = HEAD(bgp_listen_pending); + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, listen, req); + rem_node(&req->n); + + /* First try to find existing socket */ + struct bgp_socket *bs; + WALK_LIST(bs, bgp_sockets) + if (ipa_equal(bs->sk->saddr, req->addr) && + (bs->sk->sport == req->port) && + (bs->sk->iface == req->iface) && + (bs->sk->vrf == req->vrf) && + ((bs->sk->flags & flag_mask) == req->flags)) + break; - if (p->cf->remote_range) + /* Not found any */ + if (NODE_VALID(bs)) + BGP_TRACE(D_EVENTS, "Found a listening socket: %p", bs); + else { - prefix = net_prefix(p->cf->remote_range); - pxlen = net_pxlen(p->cf->remote_range); + /* Allocating new socket from global protocol pool. + * We can do this in main_birdloop. */ + sock *sk = sk_new(proto_pool); + sk->type = SK_TCP_PASSIVE; + sk->ttl = 255; + sk->saddr = req->addr; + sk->sport = req->port; + sk->iface = req->iface; + sk->vrf = req->vrf; + sk->flags = req->flags; + sk->tos = IP_PREC_INTERNET_CONTROL; + sk->rbsize = BGP_RX_BUFFER_SIZE; + sk->tbsize = BGP_TX_BUFFER_SIZE; + sk->rx_hook = bgp_incoming_connection; + sk->err_hook = bgp_listen_sock_err; + + if (sk_open(sk, &main_birdloop) < 0) + { + sk_log_error(sk, p->p.name); + log(L_ERR "%s: Cannot open listening socket", p->p.name); + rfree(sk); + UNLOCK_DOMAIN(rtable, bgp_listen_domain); + + bgp_initiate_disable(p, BEM_NO_SOCKET); + continue; + } + + bs = mb_allocz(proto_pool, sizeof(struct bgp_socket)); + bs->sk = sk; + sk->data = bs; + + init_list(&bs->requests); + add_tail(&bgp_sockets, &bs->n); + + BGP_TRACE(D_EVENTS, "Created new listening socket: %p", bs); } - int rv = sk_set_md5_auth(p->sock->sk, - p->cf->local_ip, prefix, pxlen, p->cf->iface, - enable ? p->cf->password : NULL, p->cf->setkey); + req->sock = bs; + add_tail(&bs->requests, &req->n); - if (rv < 0) - sk_log_error(p->sock->sk, p->p.name); + if (bgp_setup_auth(p, 1) < 0) + { + rem_node(&req->n); + req->sock = NULL; - return rv; + UNLOCK_DOMAIN(rtable, bgp_listen_domain); + + bgp_initiate_disable(p, BEM_INVALID_MD5); + continue; + } + + UNLOCK_DOMAIN(rtable, bgp_listen_domain); } - else - return 0; + + /* Cleanup leftover listening sockets */ + LOCK_DOMAIN(rtable, bgp_listen_domain); + struct bgp_socket *bs; + node *nxt; + WALK_LIST_DELSAFE(bs, nxt, bgp_sockets) + if (EMPTY_LIST(bs->requests)) + { + rfree(bs->sk); + rem_node(&bs->n); + mb_free(bs); + } + UNLOCK_DOMAIN(rtable, bgp_listen_domain); } static inline struct bgp_channel * bgp_find_channel(struct bgp_proto *p, u32 afi) { struct bgp_channel *c; - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) if (c->afi == afi) return c; @@ -288,6 +365,8 @@ bgp_startup(struct bgp_proto *p) if (p->postponed_sk) { /* Apply postponed incoming connection */ + sk_reloop(p->postponed_sk, p->p.loop); + bgp_setup_conn(p, &p->incoming_conn); bgp_setup_sk(&p->incoming_conn, p->postponed_sk); bgp_send_open(&p->incoming_conn); @@ -305,13 +384,7 @@ bgp_startup_timeout(timer *t) static void bgp_initiate(struct bgp_proto *p) { - int err_val; - - if (bgp_open(p) < 0) - { err_val = BEM_NO_SOCKET; goto err1; } - - if (bgp_setup_auth(p, 1) < 0) - { err_val = BEM_INVALID_MD5; goto err2; } + bgp_open(p); if (p->cf->bfd) bgp_update_bfd(p, p->cf->bfd); @@ -324,24 +397,28 @@ bgp_initiate(struct bgp_proto *p) } else bgp_startup(p); +} - return; - -err2: - bgp_close(p); -err1: - p->p.disabled = 1; - bgp_store_error(p, NULL, BE_MISC, err_val); - - p->neigh = NULL; - proto_notify_state(&p->p, PS_DOWN); - - return; +static void +bgp_initiate_disable(struct bgp_proto *p, int err_val) +{ + PROTO_LOCKED_FROM_MAIN(&p->p) + { + /* The protocol may be already down for another reason. + * Shutdown the protocol only if it isn't already shutting down. */ + switch (p->p.proto_state) + { + case PS_START: + case PS_UP: + p->p.disabled = 1; + bgp_store_error(p, NULL, BE_MISC, err_val); + bgp_stop(p, err_val, NULL, 0); + } + } } /** * bgp_start_timer - start a BGP timer - * @p: bgp_proto which the timer belongs to * @t: timer * @value: time (in seconds) to fire (0 to disable the timer) * @@ -352,8 +429,6 @@ err1: void bgp_start_timer(struct bgp_proto *p, timer *t, uint value) { - BGP_ASSERT_INSIDE(p); - if (value) { /* The randomization procedure is specified in RFC 4271 section 10 */ @@ -375,7 +450,7 @@ bgp_start_timer(struct bgp_proto *p, timer *t, uint value) void bgp_close_conn(struct bgp_conn *conn) { - BGP_ASSERT_INSIDE(conn->bgp); + // struct bgp_proto *p = conn->bgp; DBG("BGP: Closing connection\n"); conn->packets_to_send = 0; @@ -386,8 +461,10 @@ bgp_close_conn(struct bgp_conn *conn) conn->keepalive_timer = NULL; rfree(conn->hold_timer); conn->hold_timer = NULL; + rfree(conn->tx_ev); conn->tx_ev = NULL; + rfree(conn->sk); conn->sk = NULL; @@ -467,8 +544,6 @@ bgp_graceful_close_conn(struct bgp_conn *conn, int subcode, byte *data, uint len static void bgp_down(struct bgp_proto *p) { - bgp_start_timer(p, p->startup_timer, 0); - if (p->start_state > BSS_PREPARE) { bgp_setup_auth(p, 0); @@ -482,34 +557,21 @@ bgp_down(struct bgp_proto *p) } static void -bgp_active_event(void *vp) +bgp_decision(void *vp) { struct bgp_proto *p = vp; - BGP_ASSERT_INSIDE(p); - - DBG("%s: Decision start\n", p->p.name); + DBG("BGP: Decision start\n"); if ((p->p.proto_state == PS_START) && (p->outgoing_conn.state == BS_IDLE) && (p->incoming_conn.state != BS_OPENCONFIRM) && !p->passive) bgp_active(p); -} - -static void -bgp_down_event(void *vp) -{ - struct bgp_proto *p = vp; - - BGP_ENTER(p); - DBG("%s: Down event\n", p->p.name); if ((p->p.proto_state == PS_STOP) && (p->outgoing_conn.state == BS_IDLE) && (p->incoming_conn.state == BS_IDLE)) bgp_down(p); - - BGP_LEAVE(p); } static struct bgp_proto * @@ -539,9 +601,16 @@ void bgp_stop(struct bgp_proto *p, int subcode, byte *data, uint len) { proto_notify_state(&p->p, PS_STOP); + p->uncork_ev->data = NULL; bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len); bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len); - ev_send_loop(&main_birdloop, p->down_event); + + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + if (c->ptx) + bgp_free_pending_tx(c); + + proto_send_event(&p->p, p->event); } static inline void @@ -588,7 +657,6 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) p->link_addr = p->neigh->iface->llv6->ip; conn->sk->fast_rx = 0; - conn->sk->cork = &rt_cork; p->conn = conn; p->last_error_class = 0; @@ -614,7 +682,7 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) /* Summary state of ADD_PATH RX for active channels */ uint summary_add_path_rx = 0; - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) { const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi); const struct bgp_af_caps *rem = bgp_find_af_caps(peer, c->afi); @@ -635,7 +703,7 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) int active = loc->ready && rem->ready; c->c.disabled = !active; - c->c.reloadable = p->route_refresh || c->cf->import_table; + c->c.reloadable = p->route_refresh || ((c->c.in_keep & RIK_PREFILTER) == RIK_PREFILTER); c->index = active ? num++ : 0; @@ -696,7 +764,7 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) p->channel_count = num; p->summary_add_path_rx = summary_add_path_rx; - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) { if (c->c.disabled) continue; @@ -747,8 +815,7 @@ bgp_conn_enter_idle_state(struct bgp_conn *conn) bgp_close_conn(conn); bgp_conn_set_state(conn, BS_IDLE); - ev_send_loop(p->p.loop, p->active_event); - ev_send_loop(&main_birdloop, p->down_event); + proto_send_event(&p->p, p->event); if (os == BS_ESTABLISHED) bgp_conn_leave_established_state(p); @@ -776,7 +843,7 @@ bgp_handle_graceful_restart(struct bgp_proto *p) p->gr_active_num = 0; struct bgp_channel *c; - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) { /* FIXME: perhaps check for channel state instead of disabled flag? */ if (c->c.disabled) @@ -790,16 +857,14 @@ bgp_handle_graceful_restart(struct bgp_proto *p) { case BGP_GRS_NONE: c->gr_active = BGP_GRS_ACTIVE; - channel_refresh_begin(&c->c); - break; + /* fall through */ case BGP_GRS_ACTIVE: - channel_refresh_end(&c->c); - channel_refresh_begin(&c->c); + rt_refresh_begin(&c->c.in_req); break; case BGP_GRS_LLGR: - channel_refresh_begin(&c->c); + rt_refresh_begin(&c->c.in_req); bgp_graceful_restart_feed(c); break; } @@ -807,15 +872,13 @@ bgp_handle_graceful_restart(struct bgp_proto *p) else { /* Just flush the routes */ - channel_refresh_begin(&c->c); - channel_refresh_end(&c->c); + rt_refresh_begin(&c->c.in_req); + rt_refresh_end(&c->c.in_req); } /* Reset bucket and prefix tables */ - bgp_free_bucket_table(c); - bgp_free_prefix_table(c); - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); + bgp_free_pending_tx(c); + bgp_init_pending_tx(c); c->packets_to_send = 0; } @@ -871,6 +934,8 @@ bgp_graceful_restart_feed(struct bgp_channel *c) } + + /** * bgp_graceful_restart_done - finish active BGP graceful restart * @c: BGP channel @@ -893,11 +958,8 @@ bgp_graceful_restart_done(struct bgp_channel *c) if (!p->gr_active_num) BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); - if (c->stale_feed.hook) - rt_stop_export(&c->stale_feed, bgp_graceful_restart_feed_done); - tm_stop(c->stale_timer); - channel_refresh_end_reload(&c->c); + rt_refresh_end(&c->c.in_req); } /** @@ -919,7 +981,7 @@ bgp_graceful_restart_timeout(timer *t) if (p->llgr_ready) { struct bgp_channel *c; - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) { /* Channel is not in GR and is already flushed */ if (!c->gr_active) @@ -976,11 +1038,8 @@ bgp_refresh_begin(struct bgp_channel *c) if (c->load_state == BFS_LOADING) { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } - if (c->load_state == BFS_REFRESHING) - channel_refresh_end(&c->c); - c->load_state = BFS_REFRESHING; - channel_refresh_begin(&c->c); + rt_refresh_begin(&c->c.in_req); } /** @@ -1001,7 +1060,7 @@ bgp_refresh_end(struct bgp_channel *c) { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } c->load_state = BFS_NONE; - channel_refresh_end_reload(&c->c); + rt_refresh_end(&c->c.in_req); } @@ -1137,12 +1196,10 @@ bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn) static void bgp_setup_sk(struct bgp_conn *conn, sock *s) { - ASSERT_DIE(s->flags & SKF_THREAD); s->data = conn; s->err_hook = bgp_sock_err; s->fast_rx = 1; conn->sk = s; - sk_start(s); } static void @@ -1151,8 +1208,6 @@ bgp_active(struct bgp_proto *p) int delay = MAX(1, p->cf->connect_delay_time); struct bgp_conn *conn = &p->outgoing_conn; - BGP_ASSERT_INSIDE(p); - BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay); bgp_setup_conn(p, conn); bgp_conn_set_state(conn, BS_ACTIVE); @@ -1173,12 +1228,9 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c struct bgp_conn *conn = &p->outgoing_conn; int hops = p->cf->multihop ? : 1; - BGP_ASSERT_INSIDE(p); - DBG("BGP: Connecting\n"); sock *s = sk_new(p->p.pool); s->type = SK_TCP_ACTIVE; - s->flags |= SKF_THREAD; s->saddr = p->local_ip; s->daddr = p->remote_ip; s->dport = p->cf->remote_port; @@ -1190,6 +1242,7 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c s->tos = IP_PREC_INTERNET_CONTROL; s->password = p->cf->password; s->tx_hook = bgp_connected; + s->flags = p->cf->free_bind ? SKF_FREEBIND : 0; BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J", s->daddr, ipa_is_link_local(s->daddr) ? p->cf->iface : NULL, s->saddr, ipa_is_link_local(s->saddr) ? s->iface : NULL); @@ -1197,7 +1250,7 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c bgp_setup_sk(conn, s); bgp_conn_set_state(conn, BS_CONNECT); - if (sk_open(s) < 0) + if (sk_open(s, p->p.loop) < 0) goto err; /* Set minimal receive TTL if needed */ @@ -1227,12 +1280,17 @@ static struct bgp_proto * bgp_find_proto(sock *sk) { struct bgp_proto *best = NULL; - struct bgp_proto *p; + struct bgp_socket *bs = sk->data; + struct bgp_listen_request *req; /* sk->iface is valid only if src or dst address is link-local */ int link = ipa_is_link_local(sk->saddr) || ipa_is_link_local(sk->daddr); - WALK_LIST(p, proto_list) + LOCK_DOMAIN(rtable, bgp_listen_domain); + + WALK_LIST(req, bs->requests) + { + struct bgp_proto *p = SKIP_BACK(struct bgp_proto, listen, req); if ((p->p.proto == &proto_bgp) && (ipa_equal(p->remote_ip, sk->daddr) || bgp_is_dynamic(p)) && (!p->cf->remote_range || ipa_in_netX(sk->daddr, p->cf->remote_range)) && @@ -1246,7 +1304,9 @@ bgp_find_proto(sock *sk) if (!bgp_is_dynamic(p)) break; } + } + UNLOCK_DOMAIN(rtable, bgp_listen_domain); return best; } @@ -1265,6 +1325,8 @@ bgp_find_proto(sock *sk) static int bgp_incoming_connection(sock *sk, uint dummy UNUSED) { + ASSERT_DIE(birdloop_inside(&main_birdloop)); + struct bgp_proto *p; int acc, hops; @@ -1278,17 +1340,7 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) return 0; } - if (p->p.loop == &main_birdloop) - { - /* Protocol is down for whatever reason. No need for locking. */ - BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) rejected (protocol is down)", - sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, - sk->dport); - rfree(sk); - return 0; - } - - BGP_ENTER(p); + birdloop_enter(p->p.loop); /* * BIRD should keep multiple incoming connections in OpenSent state (for @@ -1319,8 +1371,7 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) if (!acc) { rfree(sk); - BGP_LEAVE(p); - return 0; + goto leave; } hops = p->cf->multihop ? : 1; @@ -1345,22 +1396,24 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) p = bgp_spawn(p, sk->daddr); p->postponed_sk = sk; rmove(sk, p->p.pool); - BGP_LEAVE(p); - return 0; + goto leave; } rmove(sk, p->p.pool); + sk_reloop(sk, p->p.loop); + bgp_setup_conn(p, &p->incoming_conn); bgp_setup_sk(&p->incoming_conn, sk); bgp_send_open(&p->incoming_conn); - BGP_LEAVE(p); - return 0; + goto leave; err: sk_log_error(sk, p->p.name); log(L_ERR "%s: Incoming connection aborted", p->p.name); rfree(sk); - BGP_LEAVE(p); + +leave: + birdloop_leave(p->p.loop); return 0; } @@ -1395,9 +1448,10 @@ bgp_neigh_notify(neighbor *n) struct bgp_proto *p = (struct bgp_proto *) n->proto; int ps = p->p.proto_state; - BGP_ASSERT_INSIDE(p); + if (n != p->neigh) + return; - if ((n != p->neigh) || (ps == PS_DOWN) || (ps == PS_STOP)) + if ((ps == PS_DOWN) || (ps == PS_STOP)) return; int prepare = (ps == PS_START) && (p->start_state == BSS_PREPARE); @@ -1470,15 +1524,22 @@ static void bgp_update_bfd(struct bgp_proto *p, const struct bfd_options *bfd) { if (bfd && p->bfd_req) + { + BGP_TRACE(D_EVENTS, "Updating existing BFD request"); bfd_update_request(p->bfd_req, bfd); + } if (bfd && !p->bfd_req && !bgp_is_dynamic(p)) + { p->bfd_req = bfd_request_session(p->p.pool, p->remote_ip, p->local_ip, p->cf->multihop ? NULL : p->neigh->iface, - p->p.vrf, bgp_bfd_notify, p, birdloop_event_list(p->p.loop), bfd); + p->p.vrf, bgp_bfd_notify, p, p->p.loop, bfd); + BGP_TRACE(D_EVENTS, "Requesting a new BFD session"); + } if (!bfd && p->bfd_req) { + BGP_TRACE(D_EVENTS, "Retracting the BFD request"); rfree(p->bfd_req); p->bfd_req = NULL; } @@ -1490,8 +1551,11 @@ bgp_reload_routes(struct channel *C) struct bgp_proto *p = (void *) C->proto; struct bgp_channel *c = (void *) C; - ASSERT(p->conn && (p->route_refresh)); + /* Ignore non-BGP channels */ + if (C->channel != &channel_bgp) + return; + ASSERT(p->conn && p->route_refresh); bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); } @@ -1501,6 +1565,10 @@ bgp_feed_begin(struct channel *C, int initial) struct bgp_proto *p = (void *) C->proto; struct bgp_channel *c = (void *) C; + /* Ignore non-BGP channels */ + if (C->channel != &channel_bgp) + return; + /* This should not happen */ if (!p->conn) return; @@ -1508,6 +1576,12 @@ bgp_feed_begin(struct channel *C, int initial) if (initial && p->cf->gr_mode) c->feed_state = BFS_LOADING; + if (!initial && C->out_table) + { + c->feed_out_table = 1; + return; + } + /* It is refeed and both sides support enhanced route refresh */ if (!initial && p->enhanced_refresh) { @@ -1526,6 +1600,16 @@ bgp_feed_end(struct channel *C) struct bgp_proto *p = (void *) C->proto; struct bgp_channel *c = (void *) C; + /* Ignore non-BGP channels */ + if (C->channel != &channel_bgp) + return; + + if (c->feed_out_table) + { + c->feed_out_table = 0; + return; + } + /* This should not happen */ if (!p->conn) return; @@ -1548,17 +1632,14 @@ bgp_feed_end(struct channel *C) static void -bgp_start_locked(struct object_lock *lock) +bgp_start_locked(void *_p) { - struct bgp_proto *p = lock->data; + struct bgp_proto *p = _p; const struct bgp_config *cf = p->cf; - BGP_ENTER(p); - if (p->p.proto_state != PS_START) { DBG("BGP: Got lock in different state %d\n", p->p.proto_state); - BGP_LEAVE(p); return; } @@ -1568,11 +1649,10 @@ bgp_start_locked(struct object_lock *lock) { /* Multi-hop sessions do not use neighbor entries */ bgp_initiate(p); - BGP_LEAVE(p); return; } - neighbor *n = neigh_find(&p->p, p->remote_ip, cf->iface, NEF_STICKY | NEF_NOTIFY_MAIN); + neighbor *n = neigh_find(&p->p, p->remote_ip, cf->iface, NEF_STICKY); if (!n) { log(L_ERR "%s: Invalid remote address %I%J", p->p.name, p->remote_ip, cf->iface); @@ -1580,7 +1660,6 @@ bgp_start_locked(struct object_lock *lock) p->p.disabled = 1; bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP); proto_notify_state(&p->p, PS_DOWN); - BGP_LEAVE(p); return; } @@ -1592,8 +1671,6 @@ bgp_start_locked(struct object_lock *lock) BGP_TRACE(D_EVENTS, "Waiting for link on %s", n->iface->name); else bgp_start_neighbor(p); - - BGP_LEAVE(p); } static int @@ -1632,13 +1709,12 @@ bgp_start(struct proto *P) p->stats.rx_bytes = p->stats.tx_bytes = 0; p->last_rx_update = 0; - p->active_event = ev_new_init(p->p.pool, bgp_active_event, p); - p->down_event = ev_new_init(p->p.pool, bgp_down_event, p); + p->event = ev_new_init(p->p.pool, bgp_decision, p); + p->uncork_ev = ev_new_init(p->p.pool, bgp_uncork, p); + p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0); p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0); - p->rx_lp = lp_new_default(p->p.pool); - p->local_id = proto_get_router_id(P->cf); if (p->rr_client) p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id; @@ -1650,7 +1726,7 @@ bgp_start(struct proto *P) if (p->p.gr_recovery && p->cf->gr_mode) { struct bgp_channel *c; - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) channel_graceful_restart_lock(&c->c); } @@ -1665,8 +1741,11 @@ bgp_start(struct proto *P) lock->iface = p->cf->iface; lock->vrf = p->cf->iface ? NULL : p->p.vrf; lock->type = OBJLOCK_TCP; - lock->hook = bgp_start_locked; - lock->data = p; + lock->event = (event) { + .hook = bgp_start_locked, + .data = p, + }; + lock->target = proto_event_list(P); /* For dynamic BGP, we use inst 1 to avoid collisions with regular BGP */ if (bgp_is_dynamic(p)) @@ -1782,7 +1861,7 @@ bgp_init(struct proto_config *CF) P->rt_notify = bgp_rt_notify; P->preexport = bgp_preexport; - P->neigh_notify = bgp_neigh_notify; + P->iface_sub.neigh_notify = bgp_neigh_notify; P->reload_routes = bgp_reload_routes; P->feed_begin = bgp_feed_begin; P->feed_end = bgp_feed_end; @@ -1809,7 +1888,7 @@ bgp_init(struct proto_config *CF) /* Add all channels */ struct bgp_channel_config *cc; - WALK_LIST(cc, CF->channels) + BGP_CF_WALK_CHANNELS(cf, cc) proto_add_channel(P, &cc->c); return P; @@ -1830,6 +1909,9 @@ bgp_channel_init(struct channel *C, struct channel_config *CF) if (cf->igp_table_ip6) c->igp_table_ip6 = cf->igp_table_ip6->table; + + if (cf->base_table) + c->base_table = cf->base_table->table; } static int @@ -1840,22 +1922,23 @@ bgp_channel_start(struct channel *C) ip_addr src = p->local_ip; if (c->igp_table_ip4) - RT_LOCKED(c->igp_table_ip4, t) - rt_lock_table(t); + rt_lock_table(c->igp_table_ip4); if (c->igp_table_ip6) - RT_LOCKED(c->igp_table_ip6, t) - rt_lock_table(t); + rt_lock_table(c->igp_table_ip6); - c->pool = p->p.pool; // XXXX - bgp_init_bucket_table(c); - bgp_init_prefix_table(c); + if (c->base_table) + { + rt_lock_table(c->base_table); + rt_flowspec_link(c->base_table, c->c.table); + } - if (c->cf->import_table) - channel_setup_in_table(C, 0); + c->pool = p->p.pool; // XXXX if (c->cf->export_table) - channel_setup_out_table(C); + bgp_setup_out_table(c); + + bgp_init_pending_tx(c); c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0); @@ -1926,12 +2009,16 @@ bgp_channel_cleanup(struct channel *C) struct bgp_channel *c = (void *) C; if (c->igp_table_ip4) - RT_LOCKED(c->igp_table_ip4, t) - rt_unlock_table(t); + rt_unlock_table(c->igp_table_ip4); if (c->igp_table_ip6) - RT_LOCKED(c->igp_table_ip6, t) - rt_unlock_table(t); + rt_unlock_table(c->igp_table_ip6); + + if (c->base_table) + { + rt_flowspec_unlink(c->base_table, c->c.table); + rt_unlock_table(c->base_table); + } c->index = 0; @@ -1944,7 +2031,7 @@ bgp_find_channel_config(struct bgp_config *cf, u32 afi) { struct bgp_channel_config *cc; - WALK_LIST(cc, cf->c.channels) + BGP_CF_WALK_CHANNELS(cf, cc) if (cc->afi == afi) return cc; @@ -1974,12 +2061,31 @@ bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 return cc2->c.table; /* Last, try default table of given type */ - if (tab = cf->c.global->def_tables[type]) + if (tab = rt_get_default_table(cf->c.global, type)) return tab; cf_error("Undefined IGP table"); } +static struct rtable_config * +bgp_default_base_table(struct bgp_config *cf, struct bgp_channel_config *cc) +{ + /* Expected table type */ + u32 type = (cc->afi == BGP_AF_FLOW4) ? NET_IP4 : NET_IP6; + + /* First, try appropriate IP channel */ + u32 afi2 = BGP_AF(BGP_AFI(cc->afi), BGP_SAFI_UNICAST); + struct bgp_channel_config *cc2 = bgp_find_channel_config(cf, afi2); + if (cc2 && (cc2->c.table->addr_type == type)) + return cc2->c.table; + + /* Last, try default table of given type */ + struct rtable_config *tab = rt_get_default_table(cf->c.global, type); + if (tab) + return tab; + + cf_error("Undefined base table"); +} void bgp_postconfig(struct proto_config *CF) @@ -2042,6 +2148,15 @@ bgp_postconfig(struct proto_config *CF) if (internal && cf->rs_client) cf_error("Only external neighbor can be RS client"); + if (internal && (cf->local_role != BGP_ROLE_UNDEFINED)) + cf_error("Local role cannot be set on IBGP sessions"); + + if (interior && (cf->local_role != BGP_ROLE_UNDEFINED)) + log(L_WARN "BGP roles are not recommended to be used within AS confederations"); + + if (cf->require_roles && (cf->local_role == BGP_ROLE_UNDEFINED)) + cf_error("Local role must be set if roles are required"); + if (!cf->confederation && cf->confederation_member) cf_error("Confederation ID must be set for member sessions"); @@ -2064,9 +2179,24 @@ bgp_postconfig(struct proto_config *CF) if (internal && cf->enforce_first_as) cf_error("Enforce first AS check is requires EBGP sessions"); + if (cf->keepalive_time > cf->hold_time) + cf_error("Keepalive time must be at most hold time"); + + if (cf->keepalive_time > (cf->hold_time / 2)) + log(L_WARN "Keepalive time should be at most 1/2 of hold time"); + + if (cf->min_hold_time > cf->hold_time) + cf_error("Min hold time (%u) exceeds hold time (%u)", + cf->min_hold_time, cf->hold_time); + + uint keepalive_time = cf->keepalive_time ?: cf->hold_time / 3; + if (cf->min_keepalive_time > keepalive_time) + cf_error("Min keepalive time (%u) exceeds keepalive time (%u)", + cf->min_keepalive_time, keepalive_time); + struct bgp_channel_config *cc; - WALK_LIST(cc, CF->channels) + BGP_CF_WALK_CHANNELS(cf, cc) { /* Handle undefined import filter */ if (cc->c.in_filter == FILTER_UNDEF) @@ -2094,6 +2224,10 @@ bgp_postconfig(struct proto_config *CF) if (!cc->gw_mode) cc->gw_mode = cf->multihop ? GW_RECURSIVE : GW_DIRECT; + /* Different default for next_hop_prefer */ + if (!cc->next_hop_prefer) + cc->next_hop_prefer = (cc->gw_mode == GW_DIRECT) ? NHP_GLOBAL : NHP_LOCAL; + /* Defaults based on proto config */ if (cc->gr_able == 0xff) cc->gr_able = (cf->gr_mode == BGP_GR_ABLE); @@ -2124,6 +2258,14 @@ bgp_postconfig(struct proto_config *CF) cf_error("Mismatched IGP table type"); } + /* Default value of base table */ + if ((BGP_SAFI(cc->afi) == BGP_SAFI_FLOW) && cc->validate && !cc->base_table) + cc->base_table = bgp_default_base_table(cf, cc); + + if (cc->base_table && !cc->base_table->trie_used) + cf_error("Flowspec validation requires base table (%s) with trie", + cc->base_table->name); + if (cf->multihop && (cc->gw_mode == GW_DIRECT)) cf_error("Multihop BGP cannot use direct gateway mode"); @@ -2165,20 +2307,16 @@ bgp_reconfigure(struct proto *P, struct proto_config *CF) WALK_LIST(C, p->p.channels) C->stale = 1; - WALK_LIST(cc, new->c.channels) + BGP_CF_WALK_CHANNELS(new, cc) { C = (struct channel *) bgp_find_channel(p, cc->afi); same = proto_configure_channel(P, &C, &cc->c) && same; - - if (C) - C->stale = 0; } WALK_LIST_DELSAFE(C, C2, p->p.channels) if (C->stale) same = proto_configure_channel(P, &C, NULL) && same; - if (same && (p->start_state > BSS_PREPARE)) bgp_update_bfd(p, new->bfd); @@ -2192,7 +2330,7 @@ bgp_reconfigure(struct proto *P, struct proto_config *CF) return same; } -#define IGP_TABLE(cf, sym) ((cf)->igp_table_##sym ? (cf)->igp_table_##sym ->table : NULL ) +#define TABLE(cf, NAME) ((cf)->NAME ? (cf)->NAME->table : NULL ) static int bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *import_changed, int *export_changed) @@ -2203,29 +2341,33 @@ bgp_channel_reconfigure(struct channel *C, struct channel_config *CC, int *impor struct bgp_channel_config *old = c->cf; if ((new->secondary != old->secondary) || + (new->validate != old->validate) || (new->gr_able != old->gr_able) || (new->llgr_able != old->llgr_able) || (new->llgr_time != old->llgr_time) || (new->ext_next_hop != old->ext_next_hop) || (new->add_path != old->add_path) || - (new->import_table != old->import_table) || (new->export_table != old->export_table) || - (IGP_TABLE(new, ip4) != IGP_TABLE(old, ip4)) || - (IGP_TABLE(new, ip6) != IGP_TABLE(old, ip6))) + (TABLE(new, igp_table_ip4) != TABLE(old, igp_table_ip4)) || + (TABLE(new, igp_table_ip6) != TABLE(old, igp_table_ip6)) || + (TABLE(new, base_table) != TABLE(old, base_table))) return 0; if (new->mandatory && !old->mandatory && (C->channel_state != CS_UP)) return 0; if ((new->gw_mode != old->gw_mode) || + (new->next_hop_prefer != old->next_hop_prefer) || (new->aigp != old->aigp) || (new->cost != old->cost)) { - /* import_changed itself does not force ROUTE_REFRESH when import_table is active */ - if (c->c.in_table && (c->c.channel_state == CS_UP)) + /* If import table is active and route refresh is possible, we just ask for route refresh */ + if ((c->c.in_keep & RIK_PREFILTER) && (c->c.channel_state == CS_UP) && p->route_refresh) bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); - *import_changed = 1; + /* Otherwise we do complete reload */ + else + *import_changed = 1; } if (!ipa_equal(new->next_hop_addr, old->next_hop_addr) || @@ -2394,6 +2536,15 @@ bgp_show_afis(int code, char *s, u32 *afis, uint count) cli_msg(code, b.start); } +const char * +bgp_format_role_name(u8 role) +{ + static const char *bgp_role_names[] = { "provider", "rs_server", "rs_client", "customer", "peer" }; + if (role == BGP_ROLE_UNDEFINED) return "undefined"; + if (role < ARRAY_SIZE(bgp_role_names)) return bgp_role_names[role]; + return "?"; +} + static void bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) { @@ -2522,6 +2673,9 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) if (caps->hostname) cli_msg(-1006, " Hostname: %s", caps->hostname); + + if (caps->role != BGP_ROLE_UNDEFINED) + cli_msg(-1006, " Role: %s", bgp_format_role_name(caps->role)); } static void @@ -2529,9 +2683,6 @@ bgp_show_proto_info(struct proto *P) { struct bgp_proto *p = (struct bgp_proto *) P; - if (p->p.proto_state != PS_DOWN) - BGP_ASSERT_INSIDE(p); - cli_msg(-1006, " BGP state: %s", bgp_state_dsc(p)); if (bgp_is_dynamic(p) && p->cf->remote_range) @@ -2539,6 +2690,9 @@ bgp_show_proto_info(struct proto *P) else cli_msg(-1006, " Neighbor address: %I%J", p->remote_ip, p->cf->iface); + if ((p->conn == &p->outgoing_conn) && (p->cf->remote_port != BGP_PORT)) + cli_msg(-1006, " Neighbor port: %u", p->cf->remote_port); + cli_msg(-1006, " Neighbor AS: %u", p->remote_as); cli_msg(-1006, " Local AS: %u", p->cf->local_as); @@ -2581,6 +2735,9 @@ bgp_show_proto_info(struct proto *P) tm_remains(p->conn->hold_timer), p->conn->hold_time); cli_msg(-1006, " Keepalive timer: %t/%u", tm_remains(p->conn->keepalive_timer), p->conn->keepalive_time); + cli_msg(-1006, " TX pending: %d bytes%s", + p->conn->sk->tpos - p->conn->sk->ttx, + ev_active(p->conn->tx_ev) ? " (refill scheduled)" : ""); } #if 0 @@ -2609,6 +2766,9 @@ bgp_show_proto_info(struct proto *P) { channel_show_info(&c->c); + if (c->c.channel != &channel_bgp) + continue; + if (p->gr_active_num) cli_msg(-1006, " Neighbor GR: %s", bgp_gr_states[c->gr_active]); @@ -2628,6 +2788,25 @@ bgp_show_proto_info(struct proto *P) if (c->igp_table_ip6) cli_msg(-1006, " IGP IPv6 table: %s", c->igp_table_ip6->name); + + if (c->base_table) + cli_msg(-1006, " Base table: %s", c->base_table->name); + + uint bucket_cnt = 0; + uint prefix_cnt = 0; + struct bgp_bucket *buck; + struct bgp_prefix *px; + if (c->ptx) + WALK_LIST(buck, c->ptx->bucket_queue) + { + bucket_cnt++; + WALK_LIST(px, buck->prefixes) + if (px->cur) + prefix_cnt++; + } + + cli_msg(-1006, " Pending %u attribute sets with total %u prefixes to send", + bucket_cnt, prefix_cnt); } } } @@ -2645,7 +2824,6 @@ struct channel_class channel_bgp = { struct protocol proto_bgp = { .name = "BGP", .template = "bgp%d", - .class = PROTOCOL_BGP, .preference = DEF_PREF_BGP, .channel_mask = NB_IP | NB_VPN | NB_FLOW, .proto_size = sizeof(struct bgp_proto), @@ -2657,6 +2835,12 @@ struct protocol proto_bgp = { .reconfigure = bgp_reconfigure, .copy_config = bgp_copy_config, .get_status = bgp_get_status, - .get_attr = bgp_get_attr, .show_proto_info = bgp_show_proto_info }; + +void bgp_build(void) +{ + proto_build(&proto_bgp); + bgp_register_attrs(); + bgp_listen_domain = DOMAIN_NEW(rtable, "BGP Listen Sockets"); +} diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 2a2fe689..6f75874f 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -14,13 +14,12 @@ #include <stdint.h> #include <setjmp.h> #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/bfd.h" //#include "lib/lists.h" #include "lib/hash.h" #include "lib/socket.h" -struct linpool; struct eattr; @@ -68,10 +67,10 @@ struct bgp_af_desc { u8 no_igp; const char *name; uint (*encode_nlri)(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size); - void (*decode_nlri)(struct bgp_parse_state *s, byte *pos, uint len, rta *a); + void (*decode_nlri)(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a); void (*update_next_hop)(struct bgp_export_state *s, eattr *nh, ea_list **to); uint (*encode_next_hop)(struct bgp_write_state *s, eattr *nh, byte *buf, uint size); - void (*decode_next_hop)(struct bgp_parse_state *s, byte *pos, uint len, rta *a); + void (*decode_next_hop)(struct bgp_parse_state *s, byte *pos, uint len, ea_list **to); }; @@ -98,7 +97,7 @@ struct bgp_config { int capabilities; /* Enable capability handshake [RFC 5492] */ int enable_refresh; /* Enable local support for route refresh [RFC 2918] */ int enable_as4; /* Enable local support for 4B AS numbers [RFC 6793] */ - int enable_extended_messages; /* Enable local support for extended messages [draft] */ + int enable_extended_messages; /* Enable local support for extended messages [RFC 8654] */ int enable_hostname; /* Enable local support for hostname [draft] */ u32 rr_cluster_id; /* Route reflector cluster ID, if different from local ID */ int rr_client; /* Whether neighbor is RR client of me */ @@ -109,18 +108,24 @@ struct bgp_config { int interpret_communities; /* Hardwired handling of well-known communities */ int allow_local_as; /* Allow that number of local ASNs in incoming AS_PATHs */ int allow_local_pref; /* Allow LOCAL_PREF in EBGP sessions */ + int allow_med; /* Allow BGP_MED in EBGP sessions */ int allow_as_sets; /* Allow AS_SETs in incoming AS_PATHs */ int enforce_first_as; /* Enable check for neighbor AS as first AS in AS_PATH */ int gr_mode; /* Graceful restart mode (BGP_GR_*) */ int llgr_mode; /* Long-lived graceful restart mode (BGP_LLGR_*) */ int setkey; /* Set MD5 password to system SA/SP database */ + u8 local_role; /* Set peering role with neighbor [RFC 9234] */ + int require_roles; /* Require configured roles on both sides */ /* Times below are in seconds */ unsigned gr_time; /* Graceful restart timeout */ unsigned llgr_time; /* Long-lived graceful restart stale time */ unsigned connect_delay_time; /* Minimum delay between connect attempts */ unsigned connect_retry_time; /* Timeout for connect attempts */ - unsigned hold_time, initial_hold_time; + unsigned hold_time; + unsigned min_hold_time; /* Minimum accepted hold time */ + unsigned initial_hold_time; unsigned keepalive_time; + unsigned min_keepalive_time; /* Minimum accepted keepalive time */ unsigned error_amnesia_time; /* Errors are forgotten after */ unsigned error_delay_time_min; /* Time to wait after an error is detected */ unsigned error_delay_time_max; @@ -144,9 +149,11 @@ struct bgp_channel_config { ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */ u8 next_hop_self; /* Always set next hop to local IP address (NH_*) */ u8 next_hop_keep; /* Do not modify next hop attribute (NH_*) */ + u8 next_hop_prefer; /* Prefer global or link-local next hop (NHP_*) */ u8 mandatory; /* Channel is mandatory in capability negotiation */ u8 gw_mode; /* How we compute route gateway from next_hop attr, see GW_* */ u8 secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */ + u8 validate; /* Validate Flowspec per RFC 8955 (6) */ u8 gr_able; /* Allow full graceful restart for the channel */ u8 llgr_able; /* Allow full long-lived GR for the channel */ uint llgr_time; /* Long-lived graceful restart stale time */ @@ -156,15 +163,23 @@ struct bgp_channel_config { u8 aigp_originate; /* AIGP is originated automatically */ u32 cost; /* IGP cost for direct next hops */ u8 import_table; /* Use c.in_table as Adj-RIB-In */ - u8 export_table; /* Use c.out_table as Adj-RIB-Out */ + u8 export_table; /* Keep Adj-RIB-Out and export it */ struct rtable_config *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ struct rtable_config *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ + struct rtable_config *base_table; /* Base table for Flowspec validation */ }; #define BGP_PT_INTERNAL 1 #define BGP_PT_EXTERNAL 2 +#define BGP_ROLE_UNDEFINED 255 +#define BGP_ROLE_PROVIDER 0 +#define BGP_ROLE_RS_SERVER 1 +#define BGP_ROLE_RS_CLIENT 2 +#define BGP_ROLE_CUSTOMER 3 +#define BGP_ROLE_PEER 4 + #define NH_NO 0 #define NH_ALL 1 #define NH_IBGP 2 @@ -177,6 +192,9 @@ struct bgp_channel_config { #define GW_DIRECT 1 #define GW_RECURSIVE 2 +#define NHP_GLOBAL 1 +#define NHP_LOCAL 2 + #define BGP_ADD_PATH_RX 1 #define BGP_ADD_PATH_TX 2 #define BGP_ADD_PATH_FULL 3 @@ -203,8 +221,6 @@ struct bgp_channel_config { /* rte->pflags */ #define BGP_REF_SUPPRESSED 0x1 /* Used for deterministic MED comparison */ -#define BGP_REF_STALE 0x2 /* Route is LLGR_STATE */ -#define BGP_REF_NOT_STALE 0x4 /* Route is NOT LLGR_STATE */ struct bgp_af_caps { u32 afi; @@ -222,9 +238,10 @@ struct bgp_caps { u32 as4_number; /* Announced ASN */ u8 as4_support; /* Four-octet AS capability, RFC 6793 */ - u8 ext_messages; /* Extended message length, RFC draft */ + u8 ext_messages; /* Extended message length, RFC 8654 */ u8 route_refresh; /* Route refresh capability, RFC 2918 */ u8 enhanced_refresh; /* Enhanced route refresh, RFC 7313 */ + u8 role; /* BGP role capability, RFC 9234 */ u8 gr_aware; /* Graceful restart capability, RFC 4724 */ u8 gr_flags; /* Graceful restart flags */ @@ -248,8 +265,8 @@ struct bgp_caps { struct bgp_socket { node n; /* Node in global bgp_sockets */ + list requests; /* Listen requests */ sock *sk; /* Real listening socket */ - u32 uc; /* Use count */ }; struct bgp_stats { @@ -284,6 +301,16 @@ struct bgp_conn { uint hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */ }; +struct bgp_listen_request { + node n; /* Node in bgp_socket / pending list */ + struct bgp_socket *sock; /* Assigned socket */ + ip_addr addr; + struct iface *iface; + struct iface *vrf; + uint port; + uint flags; +}; + struct bgp_proto { struct proto p; const struct bgp_config *cf; /* Shortcut to BGP configuration */ @@ -313,18 +340,17 @@ struct bgp_proto { struct bgp_conn *conn; /* Connection we have established */ struct bgp_conn outgoing_conn; /* Outgoing connection we're working with */ struct bgp_conn incoming_conn; /* Incoming connection we have neither accepted nor rejected yet */ - struct linpool *rx_lp; /* Linpool for parsing received updates */ struct object_lock *lock; /* Lock for neighbor connection */ struct neighbor *neigh; /* Neighbor entry corresponding to remote ip, NULL if multihop */ - struct bgp_socket *sock; /* Shared listening socket */ + struct bgp_listen_request listen; /* Shared listening socket */ struct bfd_request *bfd_req; /* BFD request, if BFD is used */ struct birdsock *postponed_sk; /* Postponed incoming socket for dynamic BGP */ + event *uncork_ev; /* Uncork event in case of congestion */ struct bgp_stats stats; /* BGP statistics */ btime last_established; /* Last time of enter/leave of established state */ btime last_rx_update; /* Last time of RX update */ ip_addr link_addr; /* Link-local version of local_ip */ - event *active_event; /* Event for respawning */ - event *down_event; /* Event to shut down */ + event *event; /* Event for respawning and shutting process */ timer *startup_timer; /* Timer used to delay protocol startup due to previous errors (startup_delay) */ timer *gr_timer; /* Timer waiting for reestablishment after graceful restart */ int dynamic_name_counter; /* Counter for dynamic BGP names */ @@ -347,15 +373,12 @@ struct bgp_channel { rtable *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ rtable *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ + rtable *base_table; /* Base table for Flowspec validation */ /* Rest are zeroed when down */ pool *pool; - HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ - struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ - list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ - - HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ - slab *prefix_slab; /* Slab holding prefix nodes */ + struct bgp_pending_tx *ptx; /* Routes waiting to be sent */ + struct rt_exporter prefix_exporter; /* Table-like exporter for ptx */ ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */ ip_addr link_addr; /* Link-local version of next_hop_addr */ @@ -376,11 +399,16 @@ struct bgp_channel { u8 feed_state; /* Feed state (TX) for EoR, RR packets, see BFS_* */ u8 load_state; /* Load state (RX) for EoR, RR packets, see BFS_* */ + + u8 feed_out_table; /* Refeed into out_table */ }; struct bgp_prefix { - node buck_node; /* Node in per-bucket list */ + node buck_node_xx; /* Node in per-bucket list */ struct bgp_prefix *next; /* Node in prefix hash table */ + struct bgp_bucket *last; /* Last bucket sent with this prefix */ + struct bgp_bucket *cur; /* Current bucket (cur == last) if no update is required */ + btime lastmod; /* Last modification of this prefix */ u32 hash; u32 path_id; net_addr net[0]; @@ -389,11 +417,24 @@ struct bgp_prefix { struct bgp_bucket { node send_node; /* Node in send queue */ struct bgp_bucket *next; /* Node in bucket hash table */ - list prefixes; /* Prefixes in this bucket (struct bgp_prefix) */ + list prefixes; /* Prefixes to send in this bucket (struct bgp_prefix) */ u32 hash; /* Hash over extended attributes */ + u32 px_uc; /* How many prefixes are linking this bucket */ ea_list eattrs[0]; /* Per-bucket extended attributes */ }; +struct bgp_pending_tx { + resource r; + pool *pool; + + HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ + struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ + list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ + + HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ + slab *prefix_slab; /* Slab holding prefix nodes */ +}; + struct bgp_export_state { struct bgp_proto *proto; struct bgp_channel *channel; @@ -404,7 +445,7 @@ struct bgp_export_state { int mpls; u32 attrs_seen[1]; - uint err_withdraw; + uint err_reject; uint local_next_hop; }; @@ -430,6 +471,7 @@ struct bgp_parse_state { int as4_session; int add_path; int mpls; + int reach_nlri_step; u32 attrs_seen[256/32]; @@ -456,13 +498,12 @@ struct bgp_parse_state { uint err_subcode; jmp_buf err_jmpbuf; - struct hostentry *hostentry; adata *mpls_labels; /* Cached state for bgp_rte_update() */ u32 last_id; struct rte_src *last_src; - rta *cached_rta; + ea_list *cached_ea; }; #define BGP_PORT 179 @@ -475,6 +516,9 @@ struct bgp_parse_state { #define BGP_RX_BUFFER_EXT_SIZE 65535 #define BGP_TX_BUFFER_EXT_SIZE 65535 +#define BGP_CF_WALK_CHANNELS(P,C) WALK_LIST(C, P->c.channels) if (C->c.channel == &channel_bgp) +#define BGP_WALK_CHANNELS(P,C) WALK_LIST(C, P->p.channels) if (C->c.channel == &channel_bgp) + static inline int bgp_channel_is_ipv4(struct bgp_channel *c) { return BGP_AFI(c->afi) == BGP_AFI_IPV4; } @@ -487,6 +531,12 @@ static inline int bgp_cc_is_ipv4(struct bgp_channel_config *c) static inline int bgp_cc_is_ipv6(struct bgp_channel_config *c) { return BGP_AFI(c->afi) == BGP_AFI_IPV6; } +static inline int bgp_channel_is_role_applicable(struct bgp_channel *c) +{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); } + +static inline int bgp_cc_is_role_applicable(struct bgp_channel_config *c) +{ return (c->afi == BGP_AF_IPV4 || c->afi == BGP_AF_IPV6); } + static inline uint bgp_max_packet_length(struct bgp_conn *conn) { return conn->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; } @@ -513,14 +563,17 @@ void bgp_refresh_begin(struct bgp_channel *c); void bgp_refresh_end(struct bgp_channel *c); void bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code); void bgp_stop(struct bgp_proto *p, int subcode, byte *data, uint len); - -struct rte_source *bgp_find_source(struct bgp_proto *p, u32 path_id); -struct rte_source *bgp_get_source(struct bgp_proto *p, u32 path_id); +const char *bgp_format_role_name(u8 role); static inline int -rta_resolvable(rta *a) +rte_resolvable(const rte *rt) { - return a->dest == RTD_UNICAST; + eattr *nhea = ea_find(rt->attrs, &ea_gen_nexthop); + if (!nhea) + return 0; + + struct nexthop_adata *nhad = (void *) nhea->u.ptr; + return NEXTHOP_IS_REACHABLE(nhad) || (nhad->dest != RTD_UNREACHABLE); } extern struct rte_owner_class bgp_rte_owner_class; @@ -539,63 +592,41 @@ extern struct rte_owner_class bgp_rte_owner_class; /* attrs.c */ -static inline eattr * -bgp_find_attr(ea_list *attrs, uint code) -{ - return ea_find(attrs, EA_CODE(PROTOCOL_BGP, code)); -} - eattr * -bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintptr_t val); - -static inline void -bgp_set_attr_u32(ea_list **to, struct linpool *pool, uint code, uint flags, u32 val) -{ bgp_set_attr(to, pool, code, flags, (uintptr_t) val); } - -static inline void -bgp_set_attr_ptr(ea_list **to, struct linpool *pool, uint code, uint flags, const struct adata *val) -{ bgp_set_attr(to, pool, code, flags, (uintptr_t) val); } +bgp_find_attr(ea_list *attrs, uint code); -static inline void -bgp_set_attr_data(ea_list **to, struct linpool *pool, uint code, uint flags, void *data, uint len) -{ - struct adata *a = lp_alloc_adata(pool, len); - bmemcpy(a->data, data, len); - bgp_set_attr(to, pool, code, flags, (uintptr_t) a); -} - -static inline void -bgp_unset_attr(ea_list **to, struct linpool *pool, uint code) -{ eattr *e = bgp_set_attr(to, pool, code, 0, 0); e->type = EAF_TYPE_UNDEF; } +void bgp_set_attr_u32(ea_list **to, uint code, uint flags, u32 val); +void bgp_set_attr_ptr(ea_list **to, uint code, uint flags, const struct adata *ad); +void bgp_set_attr_data(ea_list **to, uint code, uint flags, void *data, uint len); +void bgp_unset_attr(ea_list **to, uint code); int bgp_encode_mp_reach_mrt(struct bgp_write_state *s, eattr *a, byte *buf, uint size); int bgp_encode_attrs(struct bgp_write_state *s, ea_list *attrs, byte *buf, byte *end); ea_list * bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len); -void bgp_finish_attrs(struct bgp_parse_state *s, rta *a); +void bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to); + +void bgp_setup_out_table(struct bgp_channel *c); + +void bgp_init_pending_tx(struct bgp_channel *c); +void bgp_free_pending_tx(struct bgp_channel *c); -void bgp_init_bucket_table(struct bgp_channel *c); -void bgp_free_bucket_table(struct bgp_channel *c); -void bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b); -void bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b); void bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b); +int bgp_done_bucket(struct bgp_channel *c, struct bgp_bucket *b); -void bgp_init_prefix_table(struct bgp_channel *c); -void bgp_free_prefix_table(struct bgp_channel *c); -void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *bp); +void bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck); -int bgp_rte_better(struct rte *, struct rte *); -int bgp_rte_mergable(rte *pri, rte *sec); -int bgp_rte_recalculate(rtable_private *table, net *net, rte *new, rte *old, rte *old_best); -void bgp_rte_modify_stale(struct rt_export_request *, const net_addr *, struct rt_pending_export *, rte **, uint); -u32 bgp_rte_igp_metric(struct rte *); +int bgp_rte_better(const rte *, const rte *); +int bgp_rte_mergable(const rte *pri, const rte *sec); +int bgp_rte_recalculate(struct rtable_private *table, net *net, rte *new, rte *old, rte *old_best); +void bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *first, struct rt_pending_export *last, const rte **feed, uint count); +u32 bgp_rte_igp_metric(const rte *); void bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, const rte *old); int bgp_preexport(struct channel *, struct rte *); -int bgp_get_attr(const struct eattr *e, byte *buf, int buflen); -void bgp_get_route_info(struct rte *, byte *); -int bgp_total_aigp_metric_(rta *a, u64 *metric, const struct adata **ad); +void bgp_get_route_info(const rte *, byte *); +int bgp_total_aigp_metric_(const rte *e, u64 *metric, const struct adata **ad); -static inline struct bgp_proto *bgp_rte_proto(struct rte *rte) +static inline struct bgp_proto *bgp_rte_proto(const rte *rte) { return (rte->src->owner->class == &bgp_rte_owner_class) ? SKIP_BACK(struct bgp_proto, p.sources, rte->src->owner) : NULL; @@ -605,15 +636,17 @@ static inline struct bgp_proto *bgp_rte_proto(struct rte *rte) #define BGP_AIGP_MAX U64(0xffffffffffffffff) static inline u64 -bgp_total_aigp_metric(rta *a) +bgp_total_aigp_metric(const rte *e) { u64 metric = BGP_AIGP_MAX; const struct adata *ad; - bgp_total_aigp_metric_(a, &metric, &ad); + bgp_total_aigp_metric_(e, &metric, &ad); return metric; } +void bgp_register_attrs(void); + /* packets.c */ @@ -625,6 +658,7 @@ void bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) void bgp_kick_tx(void *vconn); void bgp_tx(struct birdsock *sk); int bgp_rx(struct birdsock *sk, uint size); +void bgp_uncork(void *vp); const char * bgp_error_dsc(unsigned code, unsigned subcode); void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len); @@ -650,26 +684,32 @@ void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to); #define BAF_DECODE_FLAGS 0x0100 /* Private flag - attribute flags are handled by the decode hook */ -#define BA_ORIGIN 0x01 /* RFC 4271 */ /* WM */ -#define BA_AS_PATH 0x02 /* WM */ -#define BA_NEXT_HOP 0x03 /* WM */ -#define BA_MULTI_EXIT_DISC 0x04 /* ON */ -#define BA_LOCAL_PREF 0x05 /* WD */ -#define BA_ATOMIC_AGGR 0x06 /* WD */ -#define BA_AGGREGATOR 0x07 /* OT */ -#define BA_COMMUNITY 0x08 /* RFC 1997 */ /* OT */ -#define BA_ORIGINATOR_ID 0x09 /* RFC 4456 */ /* ON */ -#define BA_CLUSTER_LIST 0x0a /* RFC 4456 */ /* ON */ -#define BA_MP_REACH_NLRI 0x0e /* RFC 4760 */ -#define BA_MP_UNREACH_NLRI 0x0f /* RFC 4760 */ -#define BA_EXT_COMMUNITY 0x10 /* RFC 4360 */ -#define BA_AS4_PATH 0x11 /* RFC 6793 */ -#define BA_AS4_AGGREGATOR 0x12 /* RFC 6793 */ -#define BA_AIGP 0x1a /* RFC 7311 */ -#define BA_LARGE_COMMUNITY 0x20 /* RFC 8092 */ +enum bgp_attr_id { + BA_ORIGIN = 0x01, /* RFC 4271 */ /* WM */ + BA_AS_PATH = 0x02, /* WM */ + BA_NEXT_HOP = 0x03, /* WM */ + BA_MULTI_EXIT_DISC = 0x04, /* ON */ + BA_LOCAL_PREF = 0x05, /* WD */ + BA_ATOMIC_AGGR = 0x06, /* WD */ + BA_AGGREGATOR = 0x07, /* OT */ + BA_COMMUNITY = 0x08, /* RFC 1997 */ /* OT */ + BA_ORIGINATOR_ID = 0x09, /* RFC 4456 */ /* ON */ + BA_CLUSTER_LIST = 0x0a, /* RFC 4456 */ /* ON */ + BA_MP_REACH_NLRI = 0x0e, /* RFC 4760 */ + BA_MP_UNREACH_NLRI = 0x0f, /* RFC 4760 */ + BA_EXT_COMMUNITY = 0x10, /* RFC 4360 */ + BA_AS4_PATH = 0x11, /* RFC 6793 */ + BA_AS4_AGGREGATOR = 0x12, /* RFC 6793 */ + BA_AIGP = 0x1a, /* RFC 7311 */ + BA_LARGE_COMMUNITY = 0x20, /* RFC 8092 */ +#define BA_ONLY_TO_CUSTOMER 0x23 /* RFC 9234 */ /* Bird's private internal BGP attributes */ -#define BA_MPLS_LABEL_STACK 0xfe /* MPLS label stack transfer attribute */ + BA_MPLS_LABEL_STACK = 0x100, /* MPLS label stack transfer attribute */ + +/* Maximum */ + BGP_ATTR_MAX, +}; /* BGP connection states */ @@ -762,10 +802,5 @@ void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to); #define ORIGIN_EGP 1 #define ORIGIN_INCOMPLETE 2 -/* Loop */ - -#define BGP_ENTER(bgp) birdloop_enter(bgp->p.loop) -#define BGP_LEAVE(bgp) birdloop_leave(bgp->p.loop) -#define BGP_ASSERT_INSIDE(bgp) ASSERT_DIE((bgp->p.loop != &main_birdloop) && birdloop_inside(bgp->p.loop)) #endif diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 7c0d8d65..a37bb27a 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -19,19 +19,19 @@ CF_DECLS CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE, MULTIHOP, STARTUP, VIA, NEXT, HOP, SELF, DEFAULT, PATH, METRIC, ERROR, - START, DELAY, FORGET, WAIT, ENABLE, DISABLE, AFTER, BGP_PATH, - BGP_LOCAL_PREF, BGP_MED, BGP_ORIGIN, BGP_NEXT_HOP, BGP_ATOMIC_AGGR, - BGP_AGGREGATOR, BGP_COMMUNITY, BGP_EXT_COMMUNITY, BGP_LARGE_COMMUNITY, + START, DELAY, FORGET, WAIT, ENABLE, DISABLE, AFTER, + BGP_LOCAL_PREF, BGP_MED, SOURCE, ADDRESS, PASSWORD, RR, RS, CLIENT, CLUSTER, ID, AS4, ADVERTISE, IPV4, CAPABILITIES, LIMIT, PASSIVE, PREFER, OLDER, MISSING, LLADDR, - DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, - BGP_CLUSTER_LIST, IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, + DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, + IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC, SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX, GRACEFUL, RESTART, AWARE, CHECK, LINK, PORT, EXTENDED, MESSAGES, SETKEY, STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG, LIVED, STALE, IMPORT, IBGP, EBGP, MANDATORY, INTERNAL, EXTERNAL, SETS, - DYNAMIC, RANGE, NAME, DIGITS, BGP_AIGP, AIGP, ORIGINATE, COST, ENFORCE, - FIRST, FREE) + DYNAMIC, RANGE, NAME, DIGITS, AIGP, ORIGINATE, COST, ENFORCE, + FIRST, FREE, VALIDATE, BASE, ROLE, ROLES, PEER, PROVIDER, CUSTOMER, + RS_SERVER, RS_CLIENT, REQUIRE, BGP_OTC, GLOBAL) %type <i> bgp_nh %type <i32> bgp_afi @@ -40,10 +40,14 @@ CF_KEYWORDS(CEASE, PREFIX, LIMIT, HIT, ADMINISTRATIVE, SHUTDOWN, RESET, PEER, CONFIGURATION, CHANGE, DECONFIGURED, CONNECTION, REJECTED, COLLISION, OUT, OF, RESOURCES) -%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag +%type<i> bgp_cease_mask bgp_cease_list bgp_cease_flag bgp_role_name CF_GRAMMAR +/* Workaround for collisions between keywords and symbols */ +toksym: ROLE | PEER | PROVIDER | CUSTOMER | RS_SERVER | RS_CLIENT ; +toksym: BGP_MED | BGP_LOCAL_PREF | SOURCE ; + proto: bgp_proto '}' ; bgp_proto_start: proto_start BGP { @@ -73,6 +77,7 @@ bgp_proto_start: proto_start BGP { BGP_CFG->llgr_mode = -1; BGP_CFG->llgr_time = 3600; BGP_CFG->setkey = 1; + BGP_CFG->local_role = BGP_ROLE_UNDEFINED; BGP_CFG->dynamic_name = "dynbgp"; BGP_CFG->check_link = -1; } @@ -115,6 +120,14 @@ bgp_cease_flag: | OUT OF RESOURCES { $$ = 1 << 8; } ; +bgp_role_name: + PEER { $$ = BGP_ROLE_PEER; } + | PROVIDER { $$ = BGP_ROLE_PROVIDER; } + | CUSTOMER { $$ = BGP_ROLE_CUSTOMER; } + | RS_SERVER { $$ = BGP_ROLE_RS_SERVER; } + | RS_CLIENT { $$ = BGP_ROLE_RS_CLIENT; } + ; + bgp_proto: bgp_proto_start proto_name '{' | bgp_proto proto_item ';' @@ -144,7 +157,8 @@ bgp_proto: | bgp_proto RS CLIENT bool ';' { BGP_CFG->rs_client = $4; } | bgp_proto CONFEDERATION expr ';' { BGP_CFG->confederation = $3; } | bgp_proto CONFEDERATION MEMBER bool ';' { BGP_CFG->confederation_member = $4; } - | bgp_proto HOLD TIME expr ';' { BGP_CFG->hold_time = $4; } + | bgp_proto HOLD TIME expr ';' { BGP_CFG->hold_time = $4; if (($4 && $4<3) || ($4>65535)) cf_error("Hold time must be in range 3-65535 or zero"); } + | bgp_proto MIN HOLD TIME expr ';' { BGP_CFG->min_hold_time = $5; } | bgp_proto STARTUP HOLD TIME expr ';' { BGP_CFG->initial_hold_time = $5; } | bgp_proto DIRECT ';' { BGP_CFG->multihop = 0; } | bgp_proto MULTIHOP ';' { BGP_CFG->multihop = 64; } @@ -168,7 +182,8 @@ bgp_proto: | bgp_proto START DELAY TIME expr ';' { BGP_CFG->connect_delay_time = $5; log(L_WARN "%s: Start delay time option is deprecated, use connect delay time", this_proto->name); } | bgp_proto CONNECT DELAY TIME expr ';' { BGP_CFG->connect_delay_time = $5; } | bgp_proto CONNECT RETRY TIME expr ';' { BGP_CFG->connect_retry_time = $5; } - | bgp_proto KEEPALIVE TIME expr ';' { BGP_CFG->keepalive_time = $4; } + | bgp_proto KEEPALIVE TIME expr ';' { BGP_CFG->keepalive_time = $4; if (($4<1) || ($4>65535)) cf_error("Keepalive time must be in range 1-65535"); } + | bgp_proto MIN KEEPALIVE TIME expr ';' { BGP_CFG->min_keepalive_time = $5; } | bgp_proto ERROR FORGET TIME expr ';' { BGP_CFG->error_amnesia_time = $5; } | bgp_proto ERROR WAIT TIME expr ',' expr ';' { BGP_CFG->error_delay_time_min = $5; BGP_CFG->error_delay_time_max = $7; } | bgp_proto DISABLE AFTER ERROR bool ';' { BGP_CFG->disable_after_error = $5; } @@ -185,6 +200,7 @@ bgp_proto: | bgp_proto ALLOW LOCAL AS ';' { BGP_CFG->allow_local_as = -1; } | bgp_proto ALLOW LOCAL AS expr ';' { BGP_CFG->allow_local_as = $5; } | bgp_proto ALLOW BGP_LOCAL_PREF bool ';' { BGP_CFG->allow_local_pref = $4; } + | bgp_proto ALLOW BGP_MED bool ';' { BGP_CFG->allow_med = $4; } | bgp_proto ALLOW AS SETS bool ';' { BGP_CFG->allow_as_sets = $5; } | bgp_proto GRACEFUL RESTART bool ';' { BGP_CFG->gr_mode = $4; } | bgp_proto GRACEFUL RESTART AWARE ';' { BGP_CFG->gr_mode = BGP_GR_AWARE; } @@ -198,6 +214,8 @@ bgp_proto: | bgp_proto BFD GRACEFUL ';' { init_bfd_opts(&BGP_CFG->bfd); BGP_CFG->bfd->mode = BGP_BFD_GRACEFUL; } | bgp_proto BFD { open_bfd_opts(&BGP_CFG->bfd); } bfd_opts { close_bfd_opts(); } ';' | bgp_proto ENFORCE FIRST AS bool ';' { BGP_CFG->enforce_first_as = $5; } + | bgp_proto LOCAL ROLE bgp_role_name ';' { BGP_CFG->local_role = $4; } + | bgp_proto REQUIRE ROLES bool ';' { BGP_CFG->require_roles = $4; } ; bgp_afi: @@ -252,11 +270,17 @@ bgp_channel_item: | NEXT HOP ADDRESS ipa { BGP_CC->next_hop_addr = $4; } | NEXT HOP SELF bgp_nh { BGP_CC->next_hop_self = $4; } | NEXT HOP KEEP bgp_nh { BGP_CC->next_hop_keep = $4; } + | NEXT HOP PREFER GLOBAL { BGP_CC->next_hop_prefer = NHP_GLOBAL; } | MANDATORY bool { BGP_CC->mandatory = $2; } | MISSING LLADDR bgp_lladdr { log(L_WARN "%s.%s: Missing lladdr option is deprecated and ignored, remove it", this_proto->name, this_channel->name); } | GATEWAY DIRECT { BGP_CC->gw_mode = GW_DIRECT; } | GATEWAY RECURSIVE { BGP_CC->gw_mode = GW_RECURSIVE; } | SECONDARY bool { BGP_CC->secondary = $2; } + | VALIDATE bool { + BGP_CC->validate = $2; + if (BGP_SAFI(BGP_CC->afi) != BGP_SAFI_FLOW) + cf_error("Validate option limited to flowspec channels"); + } | GRACEFUL RESTART bool { BGP_CC->gr_able = $3; } | LONG LIVED GRACEFUL RESTART bool { BGP_CC->llgr_able = $5; } | LONG LIVED STALE TIME expr { BGP_CC->llgr_time = $5; } @@ -280,6 +304,16 @@ bgp_channel_item: else cf_error("Mismatched IGP table type"); } + | BASE TABLE rtable { + if (BGP_SAFI(BGP_CC->afi) != BGP_SAFI_FLOW) + cf_error("Base table option limited to flowspec channels"); + + if (((BGP_CC->afi == BGP_AF_FLOW4) && ($3->addr_type == NET_IP4)) || + ((BGP_CC->afi == BGP_AF_FLOW6) && ($3->addr_type == NET_IP6))) + BGP_CC->base_table = $3; + else + cf_error("Mismatched base table type"); + } ; bgp_channel_opts: @@ -297,41 +331,14 @@ bgp_channel_end: if (!this_channel->table) cf_error("Routing table not specified"); + if (BGP_CC->import_table) + this_channel->in_keep |= RIK_PREFILTER; + this_channel = NULL; }; bgp_proto_channel: bgp_channel_start bgp_channel_opt_list bgp_channel_end; - -dynamic_attr: BGP_ORIGIN - { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_ENUM_BGP_ORIGIN, EA_CODE(PROTOCOL_BGP, BA_ORIGIN)); } ; -dynamic_attr: BGP_PATH - { $$ = f_new_dynamic_attr(EAF_TYPE_AS_PATH, T_PATH, EA_CODE(PROTOCOL_BGP, BA_AS_PATH)); } ; -dynamic_attr: BGP_NEXT_HOP - { $$ = f_new_dynamic_attr(EAF_TYPE_IP_ADDRESS, T_IP, EA_CODE(PROTOCOL_BGP, BA_NEXT_HOP)); } ; -dynamic_attr: BGP_MED - { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_CODE(PROTOCOL_BGP, BA_MULTI_EXIT_DISC)); } ; -dynamic_attr: BGP_LOCAL_PREF - { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF)); } ; -dynamic_attr: BGP_ATOMIC_AGGR - { $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_ATOMIC_AGGR)); } ; -dynamic_attr: BGP_AGGREGATOR - { $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_AGGREGATOR)); } ; -dynamic_attr: BGP_COMMUNITY - { $$ = f_new_dynamic_attr(EAF_TYPE_INT_SET, T_CLIST, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY)); } ; -dynamic_attr: BGP_ORIGINATOR_ID - { $$ = f_new_dynamic_attr(EAF_TYPE_ROUTER_ID, T_QUAD, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID)); } ; -dynamic_attr: BGP_CLUSTER_LIST - { $$ = f_new_dynamic_attr(EAF_TYPE_INT_SET, T_CLIST, EA_CODE(PROTOCOL_BGP, BA_CLUSTER_LIST)); } ; -dynamic_attr: BGP_EXT_COMMUNITY - { $$ = f_new_dynamic_attr(EAF_TYPE_EC_SET, T_ECLIST, EA_CODE(PROTOCOL_BGP, BA_EXT_COMMUNITY)); } ; -dynamic_attr: BGP_AIGP - { $$ = f_new_dynamic_attr(EAF_TYPE_OPAQUE, T_ENUM_EMPTY, EA_CODE(PROTOCOL_BGP, BA_AIGP)); } ; -dynamic_attr: BGP_LARGE_COMMUNITY - { $$ = f_new_dynamic_attr(EAF_TYPE_LC_SET, T_LCLIST, EA_CODE(PROTOCOL_BGP, BA_LARGE_COMMUNITY)); } ; - - - CF_ENUM(T_ENUM_BGP_ORIGIN, ORIGIN_, IGP, EGP, INCOMPLETE) CF_CODE diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index 88b66040..978a1891 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -15,8 +15,8 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" -#include "nest/attrs.h" +#include "nest/rt.h" +#include "lib/attrs.h" #include "proto/mrt/mrt.h" #include "conf/conf.h" #include "lib/unaligned.h" @@ -238,6 +238,7 @@ bgp_prepare_capabilities(struct bgp_conn *conn) caps->ext_messages = p->cf->enable_extended_messages; caps->route_refresh = p->cf->enable_refresh; caps->enhanced_refresh = p->cf->enable_refresh; + caps->role = p->cf->local_role; if (caps->as4_support) caps->as4_number = p->public_as; @@ -261,7 +262,7 @@ bgp_prepare_capabilities(struct bgp_conn *conn) } /* Allocate and fill per-AF fields */ - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) { ac = &caps->af_data[caps->af_count++]; ac->afi = c->afi; @@ -350,6 +351,13 @@ bgp_write_capabilities(struct bgp_conn *conn, byte *buf) *buf++ = 0; /* Capability data length */ } + if (caps->role != BGP_ROLE_UNDEFINED) + { + *buf++ = 9; /* Capability 9: Announce chosen BGP role */ + *buf++ = 1; /* Capability data length */ + *buf++ = caps->role; + } + if (caps->gr_aware) { *buf++ = 64; /* Capability 64: Support for graceful restart */ @@ -449,11 +457,15 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) struct bgp_proto *p = conn->bgp; struct bgp_caps *caps; struct bgp_af_caps *ac; + uint err_subcode = 0; int i, cl; u32 af; if (!conn->remote_caps) + { caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + sizeof(struct bgp_af_caps)); + caps->role = BGP_ROLE_UNDEFINED; + } else { caps = conn->remote_caps; @@ -506,13 +518,28 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) } break; - case 6: /* Extended message length capability, RFC draft */ + case 6: /* Extended message length capability, RFC 8654 */ if (cl != 0) goto err; caps->ext_messages = 1; break; + case 9: /* BGP role capability, RFC 9234 */ + if (cl != 1) + goto err; + + /* Reserved value */ + if (pos[2] == BGP_ROLE_UNDEFINED) + { err_subcode = 11; goto err; } + + /* Multiple inconsistent values */ + if ((caps->role != BGP_ROLE_UNDEFINED) && (caps->role != pos[2])) + { err_subcode = 11; goto err; } + + caps->role = pos[2]; + break; + case 64: /* Graceful restart capability, RFC 4724 */ if (cl % 4 != 2) goto err; @@ -638,7 +665,7 @@ bgp_read_capabilities(struct bgp_conn *conn, byte *pos, int len) err: mb_free(caps); - bgp_error(conn, 2, 0, NULL, 0); + bgp_error(conn, 2, err_subcode, NULL, 0); return -1; } @@ -654,7 +681,7 @@ bgp_check_capabilities(struct bgp_conn *conn) /* This is partially overlapping with bgp_conn_enter_established_state(), but we need to run this just after we receive OPEN message */ - WALK_LIST(c, p->p.channels) + BGP_WALK_CHANNELS(p, c) { const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi); const struct bgp_af_caps *rem = bgp_find_af_caps(remote, c->afi); @@ -684,7 +711,7 @@ bgp_read_options(struct bgp_conn *conn, byte *pos, uint len, uint rest) struct bgp_proto *p = conn->bgp; int ext = 0; - /* Handle extended length (draft-ietf-idr-ext-opt-param-07) */ + /* Handle extended length, RFC 9072 */ if ((len > 0) && (rest > 0) && (pos[0] == 255)) { if (rest < 3) @@ -769,7 +796,7 @@ bgp_create_open(struct bgp_conn *conn, byte *buf) buf[10] = 2; /* Option 2: Capability list */ buf[11] = len; /* Option data length */ } - else /* draft-ietf-idr-ext-opt-param-07 */ + else /* Extended length, RFC 9072 */ { /* Move capabilities 4 B forward */ memmove(buf + 16, pos, len); @@ -820,9 +847,25 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) if (bgp_read_options(conn, pkt+29, pkt[28], len-29) < 0) return; + /* RFC 4271 4.2 - hold time must be either 0 or at least 3 */ if (hold > 0 && hold < 3) { bgp_error(conn, 2, 6, pkt+22, 2); return; } + /* Compute effective hold and keepalive times */ + uint hold_time = MIN(hold, p->cf->hold_time); + uint keepalive_time = p->cf->keepalive_time ? + (p->cf->keepalive_time * hold_time / p->cf->hold_time) : + hold_time / 3; + + /* Keepalive time might be rounded down to zero */ + if (hold_time && !keepalive_time) + keepalive_time = 1; + + /* Check effective values against configured minimums */ + if ((hold_time < p->cf->min_hold_time) || + (keepalive_time < p->cf->min_keepalive_time)) + { bgp_error(conn, 2, 6, pkt+22, 2); return; } + /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */ if (!id || (p->is_internal && id == p->local_id)) { bgp_error(conn, 2, 3, pkt+24, -4); return; } @@ -854,6 +897,22 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) conn->received_as = asn; } + /* RFC 9234 4.2 - check role agreement */ + u8 local_role = p->cf->local_role; + u8 neigh_role = caps->role; + + if ((local_role != BGP_ROLE_UNDEFINED) && + (neigh_role != BGP_ROLE_UNDEFINED) && + !((local_role == BGP_ROLE_PEER && neigh_role == BGP_ROLE_PEER) || + (local_role == BGP_ROLE_CUSTOMER && neigh_role == BGP_ROLE_PROVIDER) || + (local_role == BGP_ROLE_PROVIDER && neigh_role == BGP_ROLE_CUSTOMER) || + (local_role == BGP_ROLE_RS_CLIENT && neigh_role == BGP_ROLE_RS_SERVER) || + (local_role == BGP_ROLE_RS_SERVER && neigh_role == BGP_ROLE_RS_CLIENT))) + { bgp_error(conn, 2, 11, &neigh_role, -1); return; } + + if ((p->cf->require_roles) && (neigh_role == BGP_ROLE_UNDEFINED)) + { bgp_error(conn, 2, 11, &neigh_role, -1); return; } + /* Check the other connection */ other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; switch (other->state) @@ -904,8 +963,8 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) } /* Update our local variables */ - conn->hold_time = MIN(hold, p->cf->hold_time); - conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3; + conn->hold_time = hold_time; + conn->keepalive_time = keepalive_time; conn->as4_session = conn->local_caps->as4_support && caps->as4_support; conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages; p->remote_id = id; @@ -932,14 +991,18 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) #define WITHDRAW(msg, args...) \ ({ REPORT(msg, ## args); s->err_withdraw = 1; return; }) +#define REJECT(msg, args...) \ + ({ log(L_ERR "%s: " msg, s->proto->p.name, ## args); s->err_reject = 1; return; }) + #define BAD_AFI "Unexpected AF <%u/%u> in UPDATE" #define BAD_NEXT_HOP "Invalid NEXT_HOP attribute" #define NO_NEXT_HOP "Missing NEXT_HOP attribute" #define NO_LABEL_STACK "Missing MPLS stack" +#define MISMATCHED_AF " - mismatched address family (%I for %s)" static void -bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll) +bgp_apply_next_hop(struct bgp_parse_state *s, ea_list **to, ip_addr gw, ip_addr ll) { struct bgp_proto *p = s->proto; struct bgp_channel *c = s->channel; @@ -949,67 +1012,88 @@ bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll) neighbor *nbr = NULL; /* GW_DIRECT -> single_hop -> p->neigh != NULL */ - if (ipa_nonzero(gw)) + if (ipa_nonzero2(gw)) nbr = neigh_find(&p->p, gw, NULL, 0); else if (ipa_nonzero(ll)) nbr = neigh_find(&p->p, ll, p->neigh->iface, 0); + else + WITHDRAW(BAD_NEXT_HOP " - zero address"); - if (!nbr || (nbr->scope == SCOPE_HOST)) - WITHDRAW(BAD_NEXT_HOP); + if (!nbr) + WITHDRAW(BAD_NEXT_HOP " - address %I not directly reachable", ipa_nonzero(gw) ? gw : ll); - a->dest = RTD_UNICAST; - a->nh.gw = nbr->addr; - a->nh.iface = nbr->iface; - a->igp_metric = c->cf->cost; + if (nbr->scope == SCOPE_HOST) + WITHDRAW(BAD_NEXT_HOP " - address %I is local", nbr->addr); + + ea_set_attr_u32(to, &ea_gen_igp_metric, 0, c->cf->cost); + + struct nexthop_adata nhad = { + .nh = { + .gw = nbr->addr, + .iface = nbr->iface, + }, + .ad = { + .length = sizeof nhad - sizeof nhad.ad, + }, + }; + ea_set_attr_data(to, &ea_gen_nexthop, 0, nhad.ad.data, nhad.ad.length); } else /* GW_RECURSIVE */ { - if (ipa_zero(gw)) - WITHDRAW(BAD_NEXT_HOP); + if (ipa_zero2(gw)) + WITHDRAW(BAD_NEXT_HOP " - zero address"); rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6; - s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table); - - if (!s->mpls) - rta_apply_hostentry(a, s->hostentry, NULL, s->pool); + ip_addr lla = (c->cf->next_hop_prefer == NHP_LOCAL) ? ll : IPA_NONE; - /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */ + if (s->mpls) + { + u32 labels[BGP_MPLS_MAX]; + ea_set_hostentry(to, c->c.table, tab, gw, lla, BGP_MPLS_MAX, labels); + } + else + ea_set_hostentry(to, c->c.table, tab, gw, lla, 0, NULL); } } static void -bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum) +bgp_apply_mpls_labels(struct bgp_parse_state *s, ea_list **to, u32 lnum, u32 labels[lnum]) { if (lnum > MPLS_MAX_LABEL_STACK) { REPORT("Too many MPLS labels ($u)", lnum); - a->dest = RTD_UNREACHABLE; - a->hostentry = NULL; - a->nh = (struct nexthop) { }; + ea_set_dest(to, 0, RTD_UNREACHABLE); return; } /* Handle implicit NULL as empty MPLS stack */ if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL)) - lnum = 0; + lnum = s->mpls_labels->length = 0; if (s->channel->cf->gw_mode == GW_DIRECT) { - a->nh.labels = lnum; - memcpy(a->nh.label, labels, 4*lnum); + eattr *e = ea_find(*to, &ea_gen_nexthop); + struct { + struct nexthop_adata nhad; + u32 labels[MPLS_MAX_LABEL_STACK]; + } nh; + + memcpy(&nh.nhad, e->u.ptr, sizeof(struct adata) + e->u.ptr->length); + nh.nhad.nh.labels = lnum; + memcpy(nh.labels, labels, lnum * sizeof(u32)); + nh.nhad.ad.length = sizeof nh.nhad + lnum * sizeof(u32); } else /* GW_RECURSIVE */ { - mpls_label_stack ms; - - ms.len = lnum; - memcpy(ms.stack, labels, 4*lnum); - rta_apply_hostentry(a, s->hostentry, &ms, s->pool); + eattr *e = ea_find(*to, &ea_gen_hostentry); + ASSERT_DIE(e); + struct hostentry_adata *head = (void *) e->u.ptr; + memcpy(&head->labels, labels, lnum * sizeof(u32)); + head->ad.length = (void *)(&head->labels[lnum]) - (void *) head->ad.data; } } - static int bgp_match_src(struct bgp_export_state *s, int mode) { @@ -1039,13 +1123,17 @@ bgp_use_next_hop(struct bgp_export_state *s, eattr *a) return 1; /* Keep it when explicitly set in export filter */ - if (a->type & EAF_FRESH) + if (a->fresh) return 1; /* Check for non-matching AF */ if ((ipa_is_ip4(*nh) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop) return 0; + /* Do not pass NEXT_HOP between different VRFs */ + if (p->p.vrf && s->src && s->src->p.vrf && (p->p.vrf != s->src->p.vrf)) + return 0; + /* Keep it when exported to internal peers */ if (p->is_interior && ipa_nonzero(*nh)) return 1; @@ -1056,31 +1144,45 @@ bgp_use_next_hop(struct bgp_export_state *s, eattr *a) return p->neigh && (p->neigh->iface == ifa); } -static inline int +static inline struct nexthop * bgp_use_gateway(struct bgp_export_state *s) { struct bgp_proto *p = s->proto; struct bgp_channel *c = s->channel; - rta *ra = s->route->attrs; + ea_list *ra = s->route->attrs; /* Handle next hop self option - also applies to gateway */ if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self)) - return 0; + return NULL; + + eattr *nhea = ea_find(ra, &ea_gen_nexthop); + if (!nhea) + return NULL; /* We need one valid global gateway */ - if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw)) - return 0; + struct nexthop_adata *nhad = (struct nexthop_adata *) nhea->u.ptr; + if (!NEXTHOP_IS_REACHABLE(nhad) || + !NEXTHOP_ONE(nhad) || ipa_zero(nhad->nh.gw) || + ipa_is_link_local(nhad->nh.gw)) + return NULL; /* Check for non-matching AF */ - if ((ipa_is_ip4(ra->nh.gw) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop) + if ((ipa_is_ip4(nhad->nh.gw) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop) + return NULL; + + /* Do not use gateway from different VRF */ + if (p->p.vrf && nhad->nh.iface && (p->p.vrf != nhad->nh.iface->master)) return 0; /* Use it when exported to internal peers */ if (p->is_interior) - return 1; + return &nhad->nh; /* Use it when forwarded to single-hop BGP peer on on the same iface */ - return p->neigh && (p->neigh->iface == ra->nh.iface); + if (p->neigh && (p->neigh->iface == nhad->nh.iface)) + return &nhad->nh; + + return NULL; } static void @@ -1088,60 +1190,64 @@ bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to) { if (!a || !bgp_use_next_hop(s, a)) { - if (bgp_use_gateway(s)) + struct nexthop *nhloc; + if (nhloc = bgp_use_gateway(s)) { - rta *ra = s->route->attrs; - ip_addr nh[1] = { ra->nh.gw }; - bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16); + ip_addr nh[1] = { nhloc->gw }; + bgp_set_attr_data(to, BA_NEXT_HOP, 0, nh, 16); if (s->mpls) { u32 implicit_null = BGP_MPLS_NULL; - u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null; - uint lnum = ra->nh.labels ? ra->nh.labels : 1; - bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4); + u32 *labels = nhloc->labels ? nhloc->label : &implicit_null; + uint lnum = nhloc->labels ? nhloc->labels : 1; + bgp_set_attr_data(to, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4); } + else + bgp_unset_attr(to, BA_MPLS_LABEL_STACK); } else { ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr }; - bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16); + bgp_set_attr_data(to, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16); s->local_next_hop = 1; /* TODO: Use local MPLS assigned label */ if (s->mpls) { u32 implicit_null = BGP_MPLS_NULL; - bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, &implicit_null, 4); + bgp_set_attr_data(to, BA_MPLS_LABEL_STACK, 0, &implicit_null, 4); } + else + bgp_unset_attr(to, BA_MPLS_LABEL_STACK); } } /* Check if next hop is valid */ a = bgp_find_attr(*to, BA_NEXT_HOP); if (!a) - WITHDRAW(NO_NEXT_HOP); + REJECT(NO_NEXT_HOP); ip_addr *nh = (void *) a->u.ptr->data; ip_addr peer = s->proto->remote_ip; uint len = a->u.ptr->length; /* Forbid zero next hop */ - if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1]))) - WITHDRAW(BAD_NEXT_HOP); + if (ipa_zero2(nh[0]) && ((len != 32) || ipa_zero(nh[1]))) + REJECT(BAD_NEXT_HOP " - zero address"); /* Forbid next hop equal to neighbor IP */ if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1]))) - WITHDRAW(BAD_NEXT_HOP); + REJECT(BAD_NEXT_HOP " - neighbor address %I", peer); /* Forbid next hop with non-matching AF */ if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) && !s->channel->ext_next_hop) - WITHDRAW(BAD_NEXT_HOP); + REJECT(BAD_NEXT_HOP MISMATCHED_AF, nh[0], s->channel->desc->name); /* Just check if MPLS stack */ if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK)) - WITHDRAW(NO_LABEL_STACK); + REJECT(NO_LABEL_STACK); } static uint @@ -1175,7 +1281,7 @@ bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size } static void -bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a) +bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, ea_list **to) { struct bgp_channel *c = s->channel; struct adata *ad = lp_alloc_adata(s->pool, 32); @@ -1212,12 +1318,12 @@ bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a) ad->length = 16; if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop) - WITHDRAW(BAD_NEXT_HOP); + WITHDRAW(BAD_NEXT_HOP MISMATCHED_AF, nh[0], c->desc->name); // XXXX validate next hop - bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad); - bgp_apply_next_hop(s, a, nh[0], nh[1]); + bgp_set_attr_ptr(to, BA_NEXT_HOP, 0, ad); + bgp_apply_next_hop(s, to, nh[0], nh[1]); } static uint @@ -1255,7 +1361,7 @@ bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint siz } static void -bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a) +bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, ea_list **to) { struct bgp_channel *c = s->channel; struct adata *ad = lp_alloc_adata(s->pool, 32); @@ -1293,12 +1399,12 @@ bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a) bgp_parse_error(s, 9); if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop) - WITHDRAW(BAD_NEXT_HOP); + WITHDRAW(BAD_NEXT_HOP MISMATCHED_AF, nh[0], c->desc->name); // XXXX validate next hop - bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad); - bgp_apply_next_hop(s, a, nh[0], nh[1]); + bgp_set_attr_ptr(to, BA_NEXT_HOP, 0, ad); + bgp_apply_next_hop(s, to, nh[0], nh[1]); } @@ -1310,7 +1416,7 @@ bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte } static void -bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED) +bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, ea_list **to UNUSED) { /* * Although we expect no next hop and RFC 7606 7.11 states that attribute @@ -1322,11 +1428,11 @@ bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, ui } static void -bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to) +bgp_update_next_hop_none(struct bgp_export_state *s UNUSED, eattr *a, ea_list **to) { /* NEXT_HOP shall not pass */ if (a) - bgp_unset_attr(to, s->pool, BA_NEXT_HOP); + bgp_unset_attr(to, BA_NEXT_HOP); } @@ -1335,7 +1441,7 @@ bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to) */ static void -bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0) +bgp_rte_update(struct bgp_parse_state *s, const net_addr *n, u32 path_id, ea_list *a0) { if (path_id != s->last_id) { @@ -1344,28 +1450,27 @@ bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0) s->last_src = rt_get_source(&s->proto->p, path_id); s->last_id = path_id; - rta_free(s->cached_rta); - s->cached_rta = NULL; + ea_free(s->cached_ea); + s->cached_ea = NULL; } if (!a0) { + /* Route update was changed to withdraw */ + if (s->err_withdraw && s->reach_nlri_step) + REPORT("Invalid route %N withdrawn", n); + /* Route withdraw */ rte_update(&s->channel->c, n, NULL, s->last_src); return; } /* Prepare cached route attributes */ - if (s->cached_rta == NULL) - { - /* Workaround for rta_lookup() breaking eattrs */ - ea_list *ea = a0->eattrs; - s->cached_rta = rta_lookup(a0); - a0->eattrs = ea; - } + if (s->cached_ea == NULL) + s->cached_ea = ea_lookup(a0, 0); rte e0 = { - .attrs = s->cached_rta, + .attrs = s->cached_ea, .src = s->last_src, }; @@ -1392,9 +1497,10 @@ bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, const adata *mpls, byte } static void -bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a) +bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, ea_list **to) { - u32 labels[BGP_MPLS_MAX], label; + u32 labels[BGP_MPLS_MAX]; + u32 label; uint lnum = 0; do { @@ -1408,31 +1514,20 @@ bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *p /* RFC 8277 2.4 - withdraw does not have variable-size MPLS stack but fixed-size 24-bit Compatibility field, which MUST be ignored */ - if (!a && !s->err_withdraw) + if (!s->reach_nlri_step) return; } while (!(label & BGP_MPLS_BOS)); - if (!a) + if (!*to) return; - /* Attach MPLS attribute unless we already have one */ - if (!s->mpls_labels) - { - s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX); - bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels); - } - - /* Overwrite data in the attribute */ - s->mpls_labels->length = 4*lnum; - memcpy(s->mpls_labels->data, labels, 4*lnum); - /* Update next hop entry in rta */ - bgp_apply_mpls_labels(s, a, labels, lnum); + bgp_apply_mpls_labels(s, to, lnum, labels); /* Attributes were changed, invalidate cached entry */ - rta_free(s->cached_rta); - s->cached_rta = NULL; + rta_free(s->cached_ea); + s->cached_ea = NULL; return; } @@ -1468,14 +1563,14 @@ bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1501,7 +1596,7 @@ bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) /* Decode MPLS labels */ if (s->mpls) - bgp_decode_mpls_labels(s, &pos, &len, &l, a); + bgp_decode_mpls_labels(s, &pos, &len, &l, &a); if (l > IP4_MAX_PREFIX_LENGTH) bgp_parse_error(s, 10); @@ -1553,14 +1648,14 @@ bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1586,7 +1681,7 @@ bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) /* Decode MPLS labels */ if (s->mpls) - bgp_decode_mpls_labels(s, &pos, &len, &l, a); + bgp_decode_mpls_labels(s, &pos, &len, &l, &a); if (l > IP6_MAX_PREFIX_LENGTH) bgp_parse_error(s, 10); @@ -1641,14 +1736,14 @@ bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *b memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1674,7 +1769,7 @@ bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) /* Decode MPLS labels */ if (s->mpls) - bgp_decode_mpls_labels(s, &pos, &len, &l, a); + bgp_decode_mpls_labels(s, &pos, &len, &l, &a); /* Decode route distinguisher */ if (l < 64) @@ -1738,14 +1833,14 @@ bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *b memcpy(pos, &a, b); ADVANCE(pos, size, b); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1771,7 +1866,7 @@ bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) /* Decode MPLS labels */ if (s->mpls) - bgp_decode_mpls_labels(s, &pos, &len, &l, a); + bgp_decode_mpls_labels(s, &pos, &len, &l, &a); /* Decode route distinguisher */ if (l < 64) @@ -1825,14 +1920,14 @@ bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte * memcpy(pos, net->data, flen); ADVANCE(pos, size, flen); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -1913,14 +2008,14 @@ bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte * memcpy(pos, net->data, flen); ADVANCE(pos, size, flen); - bgp_free_prefix(s->channel, px); + bgp_done_prefix(s->channel, px, buck); } return pos - buf; } static void -bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, ea_list *a) { while (len) { @@ -2147,6 +2242,8 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu * var IPv4 Network Layer Reachability Information */ + ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck); + int lr, la; la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH); @@ -2168,6 +2265,8 @@ bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *bu static byte * bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { + ASSERT_DIE(s->channel->ptx->withdraw_bucket != buck); + /* * 2 B IPv4 Withdrawn Routes Length (zero) * --- IPv4 Withdrawn Routes NLRI (unused) @@ -2291,11 +2390,14 @@ bgp_create_update(struct bgp_channel *c, byte *buf) again: ; + struct lp_state tmpp; + lp_save(tmp_linpool, &tmpp); + /* Initialize write state */ struct bgp_write_state s = { .proto = p, .channel = c, - .pool = c->c.rte_update_pool, + .pool = tmp_linpool, .mp_reach = (c->afi != BGP_AF_IPV4) || c->ext_next_hop, .as4_session = p->as4_session, .add_path = c->add_path_tx, @@ -2303,7 +2405,7 @@ again: ; }; /* Try unreachable bucket */ - if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) + if ((buck = c->ptx->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) { res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ? bgp_create_ip_unreach(&s, buck, buf, end): @@ -2313,14 +2415,14 @@ again: ; } /* Try reachable buckets */ - if (!EMPTY_LIST(c->bucket_queue)) + if (!EMPTY_LIST(c->ptx->bucket_queue)) { - buck = HEAD(c->bucket_queue); + buck = HEAD(c->ptx->bucket_queue); /* Cleanup empty buckets */ - if (EMPTY_LIST(buck->prefixes)) + if (bgp_done_bucket(c, buck)) { - bgp_free_bucket(c, buck); + lp_restore(tmp_linpool, &tmpp); goto again; } @@ -2328,13 +2430,13 @@ again: ; bgp_create_ip_reach(&s, buck, buf, end): bgp_create_mp_reach(&s, buck, buf, end); - if (EMPTY_LIST(buck->prefixes)) - bgp_free_bucket(c, buck); - else - bgp_defer_bucket(c, buck); + bgp_done_bucket(c, buck); if (!res) + { + lp_restore(tmp_linpool, &tmpp); goto again; + } goto done; } @@ -2345,7 +2447,7 @@ again: ; done: BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE"); p->stats.tx_updates++; - lp_flush(s.pool); + lp_restore(tmp_linpool, &tmpp); return res; } @@ -2412,7 +2514,6 @@ static inline void bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len) { struct bgp_channel *c = bgp_get_channel(s->proto, afi); - rta *a = NULL; if (!c) DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi)); @@ -2434,26 +2535,22 @@ bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_lis if (ea) { - a = allocz(RTA_MAX_SIZE); - - a->source = RTS_BGP; - a->scope = SCOPE_UNIVERSE; - a->from = s->proto->remote_ip; - a->eattrs = ea; - a->pref = c->c.preference; + ea_set_attr_data(&ea, &ea_gen_from, 0, &s->proto->remote_ip, sizeof(ip_addr)); + ea_set_attr_u32(&ea, &ea_gen_preference, 0, c->c.preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_BGP); - c->desc->decode_next_hop(s, nh, nh_len, a); - bgp_finish_attrs(s, a); + c->desc->decode_next_hop(s, nh, nh_len, &ea); + bgp_finish_attrs(s, &ea); /* Handle withdraw during next hop decoding */ if (s->err_withdraw) - a = NULL; + ea = NULL; } - c->desc->decode_nlri(s, nlri, len, a); + c->desc->decode_nlri(s, nlri, len, ea); - rta_free(s->cached_rta); - s->cached_rta = NULL; + rta_free(s->cached_ea); + s->cached_ea = NULL; rt_unlock_source(s->last_src); } @@ -2477,10 +2574,13 @@ bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) bgp_start_timer(p, conn->hold_timer, conn->hold_time); + struct lp_state tmpp; + lp_save(tmp_linpool, &tmpp); + /* Initialize parse state */ struct bgp_parse_state s = { .proto = p, - .pool = p->rx_lp, + .pool = tmp_linpool, .as4_session = p->as4_session, }; @@ -2546,6 +2646,8 @@ bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) if (s.mp_unreach_len) bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0); + s.reach_nlri_step = 1; + if (s.ip_reach_len) bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len, ea, s.ip_next_hop_data, s.ip_next_hop_len); @@ -2555,8 +2657,8 @@ bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) ea, s.mp_next_hop_data, s.mp_next_hop_len); done: - rta_free(s.cached_rta); - lp_flush(s.pool); + rta_free(s.cached_ea); + lp_restore(tmp_linpool, &tmpp); return; } @@ -2879,7 +2981,11 @@ bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) { ASSERT(conn->sk); - DBG("BGP: Scheduling packet type %d\n", type); + struct bgp_proto *p = conn->bgp; + if (c) + BGP_TRACE(D_PACKETS, "Scheduling packet type %d for channel %s", type, c->c.name); + else + BGP_TRACE(D_PACKETS, "Scheduling packet type %d", type); if (c) { @@ -2896,7 +3002,7 @@ bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) conn->packets_to_send |= 1 << type; if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev)) - ev_send_loop(conn->bgp->p.loop, conn->tx_ev); + proto_send_event(&p->p, conn->tx_ev); } void bgp_kick_tx(void *vconn) @@ -2909,7 +3015,7 @@ bgp_kick_tx(void *vconn) ; if (!max && !ev_active(conn->tx_ev)) - ev_send_loop(conn->bgp->p.loop, conn->tx_ev); + proto_send_event(&conn->bgp->p, conn->tx_ev); } void @@ -2917,13 +3023,14 @@ bgp_tx(sock *sk) { struct bgp_conn *conn = sk->data; + ASSERT_DIE(birdloop_inside(conn->bgp->p.loop)); DBG("BGP: TX hook\n"); uint max = 1024; while (--max && (bgp_fire_tx(conn) > 0)) ; if (!max && !ev_active(conn->tx_ev)) - ev_send_loop(conn->bgp->p.loop, conn->tx_ev); + proto_send_event(&conn->bgp->p, conn->tx_ev); } @@ -2944,6 +3051,7 @@ static struct { { 2, 6, "Unacceptable hold time" }, { 2, 7, "Required capability missing" }, /* [RFC5492] */ { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */ + { 2,11, "Role mismatch" }, /* From Open Policy, RFC 9234 */ { 3, 0, "Invalid UPDATE message" }, { 3, 1, "Malformed attribute list" }, { 3, 2, "Unrecognized well-known attribute" }, @@ -3038,13 +3146,19 @@ bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, if (len) { - /* Bad peer AS - we would like to print the AS */ - if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4))) + /* Bad peer AS / unacceptable hold time - print the value as decimal number */ + if ((code == 2) && ((subcode == 2) || (subcode == 6)) && ((len == 2) || (len == 4))) { t += bsprintf(t, ": %u", (len == 2) ? get_u16(data) : get_u32(data)); goto done; } + if ((code == 2) && (subcode == 11) && (len == 1)) + { + t += bsprintf(t, " (%s)", bgp_format_role_name(get_u8(data))); + goto done; + } + /* RFC 8203 - shutdown communication */ if (((code == 6) && ((subcode == 2) || (subcode == 4)))) if (bgp_handle_message(p, data, len, &t)) @@ -3147,6 +3261,30 @@ bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len) } } +void +bgp_uncork(void *vp) +{ + /* The uncork event is run from &main_birdloop and there is no useful way how + * to assign the target loop to it, thus we have to lock it ourselves. */ + + struct bgp_proto *p = vp; + if (!p) + return; + + birdloop_enter(p->p.loop); + + if (p && p->conn && (p->conn->state == BS_ESTABLISHED) && !p->conn->sk->rx_hook) + { + struct birdsock *sk = p->conn->sk; + ASSERT_DIE(sk->rpos > sk->rbuf); + sk_resume_rx(p->p.loop, sk, bgp_rx); + bgp_rx(sk, sk->rpos - sk->rbuf); + BGP_TRACE(D_PACKETS, "Uncorked"); + } + + birdloop_leave(p->p.loop); +} + /** * bgp_rx - handle received data * @sk: socket @@ -3161,6 +3299,7 @@ int bgp_rx(sock *sk, uint size) { struct bgp_conn *conn = sk->data; + struct bgp_proto *p = conn->bgp; byte *pkt_start = sk->rbuf; byte *end = pkt_start + size; uint i, len; @@ -3170,6 +3309,12 @@ bgp_rx(sock *sk, uint size) { if ((conn->state == BS_CLOSE) || (conn->sk != sk)) return 0; + if ((conn->state == BS_ESTABLISHED) && rt_cork_check(conn->bgp->uncork_ev)) + { + sk_pause_rx(p->p.loop, sk); + BGP_TRACE(D_PACKETS, "Corked"); + return 0; + } for(i=0; i<16; i++) if (pkt_start[i] != 0xff) { diff --git a/proto/mrt/Makefile b/proto/mrt/Makefile index 925fb102..8cd44ac1 100644 --- a/proto/mrt/Makefile +++ b/proto/mrt/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/mrt/mrt.c b/proto/mrt/mrt.c index e12f7743..82fd426a 100644 --- a/proto/mrt/mrt.c +++ b/proto/mrt/mrt.c @@ -113,13 +113,13 @@ mrt_buffer_flush(buffer *b) } #define MRT_DEFINE_TYPE(S, T) \ - UNUSED static inline void mrt_put_##S##_(buffer *b, T x) \ + static inline void UNUSED mrt_put_##S##_(buffer *b, T x) \ { \ put_##S(b->pos, x); \ b->pos += sizeof(T); \ } \ \ - UNUSED static inline void mrt_put_##S(buffer *b, T x) \ + static inline void UNUSED mrt_put_##S(buffer *b, T x) \ { \ mrt_buffer_need(b, sizeof(T)); \ put_##S(b->pos, x); \ @@ -243,21 +243,15 @@ mrt_next_table(struct mrt_table_dump_state *s) rtable *tab = mrt_next_table_(s->table, s->table_ptr, s->table_expr); if (s->table) - { - RT_LOCK(s->table); - rt_unlock_table(RT_PRIV(s->table)); - RT_UNLOCK(s->table); - } + RT_LOCKED(s->table, tab) + rt_unlock_table(tab); s->table = tab; s->ipv4 = tab ? (tab->addr_type == NET_IP4) : 0; if (s->table) - { - RT_LOCK(s->table); - rt_lock_table(RT_PRIV(s->table)); - RT_UNLOCK(s->table); - } + RT_LOCKED(s->table, tab) + rt_lock_table(tab); return s->table; } @@ -431,7 +425,7 @@ mrt_rib_table_header(struct mrt_table_dump_state *s, net_addr *n) static void mrt_rib_table_entry_bgp_attrs(struct mrt_table_dump_state *s, rte *r) { - struct ea_list *eattrs = r->attrs->eattrs; + struct ea_list *eattrs = r->attrs; buffer *b = &s->buf; if (!eattrs) @@ -439,7 +433,7 @@ mrt_rib_table_entry_bgp_attrs(struct mrt_table_dump_state *s, rte *r) /* Attribute list must be normalized for bgp_encode_attrs() */ if (!rta_is_cached(r->attrs)) - ea_normalize(eattrs); + eattrs = ea_normalize(eattrs, 0); mrt_buffer_need(b, MRT_ATTR_BUFFER_SIZE); byte *pos = b->pos; @@ -533,7 +527,7 @@ mrt_rib_table_dump(struct mrt_table_dump_state *s, net *n, int add_path) } rte e = rt->rte; - if (f_run(s->filter, &e, s->linpool, 0) <= F_ACCEPT) + if (f_run(s->filter, &e, 0) <= F_ACCEPT) mrt_rib_table_entry(s, &e); lp_flush(s->linpool); @@ -561,13 +555,12 @@ mrt_rib_table_dump(struct mrt_table_dump_state *s, net *n, int add_path) static struct mrt_table_dump_state * mrt_table_dump_init(pool *pp) { - pool *pool = rp_new(pp, &main_birdloop, "MRT Table Dump"); + pool *pool = rp_new(pp, "MRT Table Dump"); struct mrt_table_dump_state *s = mb_allocz(pool, sizeof(struct mrt_table_dump_state)); s->pool = pool; - s->parent = pp; - s->linpool = lp_new(pool, 4080); - s->peer_lp = lp_new(pool, 4080); + s->linpool = lp_new(pool); + s->peer_lp = lp_new(pool); mrt_buffer_init(&s->buf, pool, 2 * MRT_ATTR_BUFFER_SIZE); /* We lock the current config as we may reference it indirectly by filter */ @@ -583,26 +576,21 @@ static void mrt_table_dump_free(struct mrt_table_dump_state *s) { if (s->table) - { - RT_LOCK(s->table); - - if (s->table_open) - FIB_ITERATE_UNLINK(&s->fit, &RT_PRIV(s->table)->fib); + RT_LOCKED(s->table, tab) + { + if (s->table_open) + FIB_ITERATE_UNLINK(&s->fit, &tab->fib); - rt_unlock_table(RT_PRIV(s->table)); - RT_UNLOCK(s->table); - } + rt_unlock_table(tab); + } if (s->table_ptr) - { - RT_LOCK(s->table_ptr); - rt_unlock_table(RT_PRIV(s->table_ptr)); - RT_UNLOCK(s->table_ptr); - } + RT_LOCKED(s->table_ptr, tab) + rt_unlock_table(tab); config_del_obstacle(s->config); - rp_free(s->pool, s->parent); + rfree(s->pool); } @@ -614,14 +602,8 @@ mrt_table_dump_step(struct mrt_table_dump_state *s) s->max = 2048; s->bws = &bws; - rtable_private *tab; - if (s->table_open) - { - RT_LOCK(s->table); - tab = RT_PRIV(s->table); goto step; - } while (mrt_next_table(s)) { @@ -630,8 +612,9 @@ mrt_table_dump_step(struct mrt_table_dump_state *s) mrt_peer_table_dump(s); - RT_LOCK(s->table); - tab = RT_PRIV(s->table); + RT_LOCKED(s->table, tab) + { + FIB_ITERATE_INIT(&s->fit, &tab->fib); s->table_open = 1; @@ -641,8 +624,7 @@ mrt_table_dump_step(struct mrt_table_dump_state *s) if (s->max < 0) { FIB_ITERATE_PUT(&s->fit); - RT_UNLOCK(s->table); - return 0; + RT_RETURN(tab, 0); } /* With Always ADD_PATH option, we jump directly to second phase */ @@ -657,11 +639,12 @@ mrt_table_dump_step(struct mrt_table_dump_state *s) FIB_ITERATE_END; s->table_open = 0; + } + mrt_close_file(s); mrt_peer_table_flush(s); } - RT_UNLOCK(s->table); return 1; } @@ -689,11 +672,8 @@ mrt_timer(timer *t) s->always_add_path = cf->always_add_path; if (s->table_ptr) - { - RT_LOCK(s->table_ptr); - rt_lock_table(RT_PRIV(s->table_ptr)); - RT_UNLOCK(s->table_ptr); - } + RT_LOCKED(s->table_ptr, tab) + rt_lock_table(tab); p->table_dump = s; ev_schedule(p->event); @@ -735,14 +715,17 @@ mrt_dump_cont(struct cli *c) cli_printf(c, 0, ""); mrt_table_dump_free(c->rover); - c->cont = c->cleanup = c->rover = NULL; + c->cont = NULL; + c->cleanup = NULL; + c->rover = NULL; } -static void +static int mrt_dump_cleanup(struct cli *c) { mrt_table_dump_free(c->rover); c->rover = NULL; + return 0; } void @@ -766,11 +749,8 @@ mrt_dump_cmd(struct mrt_dump_data *d) s->filename = d->filename; if (s->table_ptr) - { - RT_LOCK(s->table_ptr); - rt_lock_table(RT_PRIV(s->table_ptr)); - RT_UNLOCK(s->table_ptr); - } + RT_LOCKED(s->table_ptr, tab) + rt_lock_table(tab); this_cli->cont = mrt_dump_cont; this_cli->cleanup = mrt_dump_cleanup; @@ -940,7 +920,6 @@ mrt_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUSE struct protocol proto_mrt = { .name = "MRT", .template = "mrt%d", - .class = PROTOCOL_MRT, .proto_size = sizeof(struct mrt_proto), .config_size = sizeof(struct mrt_config), .init = mrt_init, @@ -949,3 +928,9 @@ struct protocol proto_mrt = { .reconfigure = mrt_reconfigure, .copy_config = mrt_copy_config, }; + +void +mrt_build(void) +{ + proto_build(&proto_mrt); +} diff --git a/proto/mrt/mrt.h b/proto/mrt/mrt.h index 2e616f6f..f535a391 100644 --- a/proto/mrt/mrt.h +++ b/proto/mrt/mrt.h @@ -13,7 +13,7 @@ #include "nest/bird.h" #include "nest/protocol.h" #include "lib/lists.h" -#include "nest/route.h" +#include "nest/rt.h" #include "lib/event.h" #include "lib/hash.h" @@ -67,7 +67,6 @@ struct mrt_table_dump_state { /* Allocated by mrt_table_dump_init() */ pool *pool; /* Pool for table dump */ - pool *parent; /* Parent pool for cleanup */ linpool *linpool; /* Temporary linear pool */ linpool *peer_lp; /* Linear pool for peer entries in peer_hash */ buffer buf; /* Buffer for MRT messages */ diff --git a/proto/ospf/Makefile b/proto/ospf/Makefile index 39e74f71..015f394a 100644 --- a/proto/ospf/Makefile +++ b/proto/ospf/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/ospf/config.Y b/proto/ospf/config.Y index 4b7d5a36..bc3df8db 100644 --- a/proto/ospf/config.Y +++ b/proto/ospf/config.Y @@ -190,7 +190,7 @@ ospf_check_auth(void) CF_DECLS -CF_KEYWORDS(OSPF, V2, V3, OSPF_METRIC1, OSPF_METRIC2, OSPF_TAG, OSPF_ROUTER_ID) +CF_KEYWORDS(OSPF, V2, V3) CF_KEYWORDS(AREA, NEIGHBORS, RFC1583COMPAT, STUB, TICK, COST, COST2, RETRANSMIT) CF_KEYWORDS(HELLO, TRANSMIT, PRIORITY, DEAD, TYPE, BROADCAST, BCAST, DEFAULT) CF_KEYWORDS(NONBROADCAST, NBMA, POINTOPOINT, PTP, POINTOMULTIPOINT, PTMP) @@ -505,11 +505,6 @@ ospf_iface: ospf_iface_start ospf_iface_patt_list ospf_iface_opt_list { ospf_iface_finish(); } ; -dynamic_attr: OSPF_METRIC1 { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_OSPF_METRIC1); } ; -dynamic_attr: OSPF_METRIC2 { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_OSPF_METRIC2); } ; -dynamic_attr: OSPF_TAG { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_OSPF_TAG); } ; -dynamic_attr: OSPF_ROUTER_ID { $$ = f_new_dynamic_attr(EAF_TYPE_ROUTER_ID, T_QUAD, EA_OSPF_ROUTER_ID); } ; - CF_CLI_HELP(SHOW OSPF, ..., [[Show information about OSPF protocol]]); CF_CLI(SHOW OSPF, optproto, [<name>], [[Show information about OSPF protocol]]) { PROTO_WALK_CMD($3, &proto_ospf, p) ospf_sh(p); }; diff --git a/proto/ospf/iface.c b/proto/ospf/iface.c index 049030ac..0aa7fa00 100644 --- a/proto/ospf/iface.c +++ b/proto/ospf/iface.c @@ -136,7 +136,7 @@ ospf_sk_open(struct ospf_iface *ifa) sk->flags = SKF_LADDR_RX | (ifa->check_ttl ? SKF_TTL_RX : 0); sk->ttl = ifa->cf->ttl_security ? 255 : 1; - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; /* 12 is an offset of the checksum in an OSPFv3 packet */ @@ -220,7 +220,7 @@ ospf_open_vlink_sk(struct ospf_proto *p) sk->data = (void *) p; sk->flags = 0; - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; /* 12 is an offset of the checksum in an OSPFv3 packet */ @@ -311,7 +311,7 @@ ospf_iface_remove(struct ospf_iface *ifa) ospf_iface_sm(ifa, ISM_DOWN); rem_node(NODE ifa); - rp_free(ifa->pool, p->p.pool); + rfree(ifa->pool); } void @@ -484,9 +484,9 @@ ospf_iface_find(struct ospf_proto *p, struct iface *what) } static void -ospf_iface_add(struct object_lock *lock) +ospf_iface_add(void *_ifa) { - struct ospf_iface *ifa = lock->data; + struct ospf_iface *ifa = _ifa; struct ospf_proto *p = ifa->oa->po; /* Open socket if interface is not stub */ @@ -567,7 +567,7 @@ ospf_iface_new(struct ospf_area *oa, struct ifa *addr, struct ospf_iface_patt *i OSPF_TRACE(D_EVENTS, "Adding interface %s (%N) to area %R", iface->name, &addr->prefix, oa->areaid); - pool = rp_new(p->p.pool, p->p.loop, "OSPF Interface"); + pool = rp_new(p->p.pool, "OSPF Interface"); ifa = mb_allocz(pool, sizeof(struct ospf_iface)); ifa->iface = iface; ifa->addr = addr; @@ -668,8 +668,11 @@ ospf_iface_new(struct ospf_area *oa, struct ifa *addr, struct ospf_iface_patt *i lock->port = OSPF_PROTO; lock->inst = ifa->instance_id; lock->iface = iface; - lock->data = ifa; - lock->hook = ospf_iface_add; + lock->event = (event) { + .hook = ospf_iface_add, + .data = ifa, + }; + lock->target = &global_event_list; olock_acquire(lock); } @@ -687,7 +690,7 @@ ospf_iface_new_vlink(struct ospf_proto *p, struct ospf_iface_patt *ip) /* Vlink ifname is stored just after the ospf_iface structure */ - pool = rp_new(p->p.pool, p->p.loop, "OSPF Vlink"); + pool = rp_new(p->p.pool, "OSPF Vlink"); ifa = mb_allocz(pool, sizeof(struct ospf_iface) + 16); ifa->oa = p->backbone; ifa->cf = ip; @@ -1222,12 +1225,13 @@ ospf_ifa_notify3(struct proto *P, uint flags, struct ifa *a) static void ospf_reconfigure_ifaces2(struct ospf_proto *p) { - struct iface *iface; struct ifa *a; - IFACE_LEGACY_ACCESS; - WALK_LIST(iface, global_iface_list) + IFACE_WALK(iface) { + if (p->p.vrf && p->p.vrf != iface->master) + continue; + if (! (iface->flags & IF_UP)) continue; @@ -1269,12 +1273,13 @@ ospf_reconfigure_ifaces2(struct ospf_proto *p) static void ospf_reconfigure_ifaces3(struct ospf_proto *p) { - struct iface *iface; struct ifa *a; - IFACE_LEGACY_ACCESS; - WALK_LIST(iface, global_iface_list) + IFACE_WALK(iface) { + if (p->p.vrf && p->p.vrf != iface->master) + continue; + if (! (iface->flags & IF_UP)) continue; diff --git a/proto/ospf/neighbor.c b/proto/ospf/neighbor.c index 4ae0d3fa..b0fdc42f 100644 --- a/proto/ospf/neighbor.c +++ b/proto/ospf/neighbor.c @@ -80,7 +80,7 @@ struct ospf_neighbor * ospf_neighbor_new(struct ospf_iface *ifa) { struct ospf_proto *p = ifa->oa->po; - struct pool *pool = rp_new(p->p.pool, p->p.loop, "OSPF Neighbor"); + struct pool *pool = rp_new(p->p.pool, "OSPF Neighbor"); struct ospf_neighbor *n = mb_allocz(pool, sizeof(struct ospf_neighbor)); n->pool = pool; @@ -120,7 +120,7 @@ ospf_neigh_down(struct ospf_neighbor *n) s_get(&(n->dbsi)); release_lsrtl(p, n); rem_node(NODE n); - rp_free(n->pool, p->p.pool); + rfree(n->pool); OSPF_TRACE(D_EVENTS, "Neighbor %R on %s removed", rid, ifa->ifname); } @@ -777,7 +777,7 @@ ospf_neigh_update_bfd(struct ospf_neighbor *n, int use_bfd) if (use_bfd && !n->bfd_req) n->bfd_req = bfd_request_session(n->pool, n->ip, n->ifa->addr->ip, n->ifa->iface, p->p.vrf, - ospf_neigh_bfd_hook, n, birdloop_event_list(p->p.loop), NULL); + ospf_neigh_bfd_hook, n, p->p.loop, NULL); if (!use_bfd && n->bfd_req) { diff --git a/proto/ospf/ospf.c b/proto/ospf/ospf.c index 16774df6..896bf5a3 100644 --- a/proto/ospf/ospf.c +++ b/proto/ospf/ospf.c @@ -106,11 +106,12 @@ #include <stdlib.h> #include "ospf.h" +#include "lib/macro.h" static int ospf_preexport(struct channel *C, rte *new); static void ospf_reload_routes(struct channel *C); -static int ospf_rte_better(struct rte *new, struct rte *old); -static u32 ospf_rte_igp_metric(struct rte *rt); +static int ospf_rte_better(const rte *new, const rte *old); +static u32 ospf_rte_igp_metric(const rte *rt); static void ospf_disp(timer *timer); @@ -299,7 +300,7 @@ ospf_start(struct proto *P) p->lsab_size = 256; p->lsab_used = 0; p->lsab = mb_alloc(P->pool, p->lsab_size); - p->nhpool = lp_new(P->pool, 12*sizeof(struct nexthop)); + p->nhpool = lp_new(P->pool); init_list(&(p->iface_list)); init_list(&(p->area_list)); fib_init(&p->rtf, P->pool, ospf_get_af(p), sizeof(ort), OFFSETOF(ort, fn), 0, NULL); @@ -370,8 +371,8 @@ ospf_init(struct proto_config *CF) P->main_channel = proto_add_channel(P, proto_cf_main_channel(CF)); P->rt_notify = ospf_rt_notify; - P->if_notify = ospf_if_notify; - P->ifa_notify = cf->ospf2 ? ospf_ifa_notify2 : ospf_ifa_notify3; + P->iface_sub.if_notify = ospf_if_notify; + P->iface_sub.ifa_notify = cf->ospf2 ? ospf_ifa_notify2 : ospf_ifa_notify3; P->preexport = ospf_preexport; P->reload_routes = ospf_reload_routes; P->feed_begin = ospf_feed_begin; @@ -384,25 +385,28 @@ ospf_init(struct proto_config *CF) /* If new is better return 1 */ static int -ospf_rte_better(struct rte *new, struct rte *old) +ospf_rte_better(const rte *new, const rte *old) { - u32 new_metric1 = ea_get_int(new->attrs->eattrs, EA_OSPF_METRIC1, LSINFINITY); + u32 new_metric1 = ea_get_int(new->attrs, &ea_ospf_metric1, LSINFINITY); if (new_metric1 == LSINFINITY) return 0; - if(new->attrs->source < old->attrs->source) return 1; - if(new->attrs->source > old->attrs->source) return 0; + u32 ns = rt_get_source_attr(new); + u32 os = rt_get_source_attr(old); - if(new->attrs->source == RTS_OSPF_EXT2) + if (ns < os) return 1; + if (ns > os) return 0; + + if (ns == RTS_OSPF_EXT2) { - u32 old_metric2 = ea_get_int(old->attrs->eattrs, EA_OSPF_METRIC2, LSINFINITY); - u32 new_metric2 = ea_get_int(new->attrs->eattrs, EA_OSPF_METRIC2, LSINFINITY); - if(new_metric2 < old_metric2) return 1; - if(new_metric2 > old_metric2) return 0; + u32 old_metric2 = ea_get_int(old->attrs, &ea_ospf_metric2, LSINFINITY); + u32 new_metric2 = ea_get_int(new->attrs, &ea_ospf_metric2, LSINFINITY); + if (new_metric2 < old_metric2) return 1; + if (new_metric2 > old_metric2) return 0; } - u32 old_metric1 = ea_get_int(old->attrs->eattrs, EA_OSPF_METRIC1, LSINFINITY); + u32 old_metric1 = ea_get_int(old->attrs, &ea_ospf_metric1, LSINFINITY); if (new_metric1 < old_metric1) return 1; @@ -410,12 +414,12 @@ ospf_rte_better(struct rte *new, struct rte *old) } static u32 -ospf_rte_igp_metric(struct rte *rt) +ospf_rte_igp_metric(const rte *rt) { - if (rt->attrs->source == RTS_OSPF_EXT2) + if (rt_get_source_attr(rt) == RTS_OSPF_EXT2) return IGP_METRIC_UNKNOWN; - return ea_get_int(rt->attrs->eattrs, EA_OSPF_METRIC1, LSINFINITY); + return ea_get_int(rt->attrs, &ea_ospf_metric1, LSINFINITY); } void @@ -482,13 +486,13 @@ ospf_disp(timer * timer) * import to the filters. */ static int -ospf_preexport(struct channel *c, rte *e) +ospf_preexport(struct channel *C, rte *e) { - struct ospf_proto *p = (struct ospf_proto *) c->proto; + struct ospf_proto *p = (struct ospf_proto *) C->proto; struct ospf_area *oa = ospf_main_area(p); /* Reject our own routes */ - if (e->sender == c->in_req.hook) + if (e->sender == C->in_req.hook) return -1; /* Do not export routes to stub areas */ @@ -531,7 +535,7 @@ ospf_shutdown(struct proto *P) /* Cleanup locked rta entries */ FIB_WALK(&p->rtf, ort, nf) { - rta_free(nf->old_rta); + ea_free(nf->old_ea); } FIB_WALK_END; @@ -566,11 +570,12 @@ ospf_get_status(struct proto *P, byte * buf) } static void -ospf_get_route_info(rte * rte, byte * buf) +ospf_get_route_info(const rte * rte, byte * buf) { char *type = "<bug>"; - switch (rte->attrs->source) + uint source = rt_get_source_attr(rte); + switch (source) { case RTS_OSPF: type = "I"; @@ -587,42 +592,26 @@ ospf_get_route_info(rte * rte, byte * buf) } buf += bsprintf(buf, " %s", type); - buf += bsprintf(buf, " (%d/%d", rte->attrs->pref, ea_get_int(rte->attrs->eattrs, EA_OSPF_METRIC1, LSINFINITY)); - if (rte->attrs->source == RTS_OSPF_EXT2) - buf += bsprintf(buf, "/%d", ea_get_int(rte->attrs->eattrs, EA_OSPF_METRIC2, LSINFINITY)); + buf += bsprintf(buf, " (%d/%d", rt_get_preference(rte), ea_get_int(rte->attrs, &ea_ospf_metric1, LSINFINITY)); + if (source == RTS_OSPF_EXT2) + buf += bsprintf(buf, "/%d", ea_get_int(rte->attrs, &ea_ospf_metric2, LSINFINITY)); buf += bsprintf(buf, ")"); - if (rte->attrs->source == RTS_OSPF_EXT1 || rte->attrs->source == RTS_OSPF_EXT2) + if (source == RTS_OSPF_EXT1 || source == RTS_OSPF_EXT2) { - eattr *ea = ea_find(rte->attrs->eattrs, EA_OSPF_TAG); + eattr *ea = ea_find(rte->attrs, &ea_ospf_tag); if (ea && (ea->u.data > 0)) buf += bsprintf(buf, " [%x]", ea->u.data); } - eattr *ea = ea_find(rte->attrs->eattrs, EA_OSPF_ROUTER_ID); + eattr *ea = ea_find(rte->attrs, &ea_ospf_router_id); if (ea) buf += bsprintf(buf, " [%R]", ea->u.data); } -static int -ospf_get_attr(const eattr * a, byte * buf, int buflen UNUSED) +static void +ospf_tag_format(const eattr * a, byte * buf, uint buflen) { - switch (a->id) - { - case EA_OSPF_METRIC1: - bsprintf(buf, "metric1"); - return GA_NAME; - case EA_OSPF_METRIC2: - bsprintf(buf, "metric2"); - return GA_NAME; - case EA_OSPF_TAG: - bsprintf(buf, "tag: 0x%08x", a->u.data); - return GA_FULL; - case EA_OSPF_ROUTER_ID: - bsprintf(buf, "router_id"); - return GA_NAME; - default: - return GA_UNKNOWN; - } + bsnprintf(buf, buflen, "0x%08x", a->u.data); } static void @@ -1526,7 +1515,6 @@ struct rte_owner_class ospf_rte_owner_class = { struct protocol proto_ospf = { .name = "OSPF", .template = "ospf%d", - .class = PROTOCOL_OSPF, .preference = DEF_PREF_OSPF, .channel_mask = NB_IP, .proto_size = sizeof(struct ospf_proto), @@ -1537,5 +1525,38 @@ struct protocol proto_ospf = { .shutdown = ospf_shutdown, .reconfigure = ospf_reconfigure, .get_status = ospf_get_status, - .get_attr = ospf_get_attr, }; + +struct ea_class ea_ospf_metric1 = { + .name = "ospf_metric1", + .type = T_INT, +}; + +struct ea_class ea_ospf_metric2 = { + .name = "ospf_metric2", + .type = T_INT, +}; + +struct ea_class ea_ospf_tag = { + .name = "ospf_tag", + .type = T_INT, + .format = ospf_tag_format, +}; + +struct ea_class ea_ospf_router_id = { + .name = "ospf_router_id", + .type = T_QUAD, +}; + +void +ospf_build(void) +{ + proto_build(&proto_ospf); + + EA_REGISTER_ALL( + &ea_ospf_metric1, + &ea_ospf_metric2, + &ea_ospf_tag, + &ea_ospf_router_id + ); +} diff --git a/proto/ospf/ospf.h b/proto/ospf/ospf.h index a5f83e79..3477ba5a 100644 --- a/proto/ospf/ospf.h +++ b/proto/ospf/ospf.h @@ -22,7 +22,7 @@ #include "lib/resource.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "nest/locks.h" #include "nest/bfd.h" @@ -939,12 +939,7 @@ struct lsadb_show_data { u32 router; /* Advertising router, 0 -> all */ }; - -#define EA_OSPF_METRIC1 EA_CODE(PROTOCOL_OSPF, 0) -#define EA_OSPF_METRIC2 EA_CODE(PROTOCOL_OSPF, 1) -#define EA_OSPF_TAG EA_CODE(PROTOCOL_OSPF, 2) -#define EA_OSPF_ROUTER_ID EA_CODE(PROTOCOL_OSPF, 3) - +extern struct ea_class ea_ospf_metric1, ea_ospf_metric2, ea_ospf_tag, ea_ospf_router_id; /* * For regular networks, neighbor address must match network prefix. diff --git a/proto/ospf/rt.c b/proto/ospf/rt.c index 3e208023..69c2907d 100644 --- a/proto/ospf/rt.c +++ b/proto/ospf/rt.c @@ -28,24 +28,30 @@ nh_is_vlink(struct nexthop *nhs) static inline int unresolved_vlink(ort *ort) { - return ort->n.nhs && nh_is_vlink(ort->n.nhs); + return ort->n.nhs && nh_is_vlink(&ort->n.nhs->nh); } -static inline struct nexthop * +static inline struct nexthop_adata * new_nexthop(struct ospf_proto *p, ip_addr gw, struct iface *iface, byte weight) { - struct nexthop *nh = lp_allocz(p->nhpool, sizeof(struct nexthop)); - nh->gw = gw; - nh->iface = iface; - nh->weight = weight; - return nh; + struct nexthop_adata *nhad = lp_alloc(p->nhpool, sizeof(struct nexthop_adata)); + *nhad = (struct nexthop_adata) { + .ad = { .length = sizeof *nhad - sizeof nhad->ad, }, + .nh = { + .gw = gw, + .iface = iface, + .weight = weight, + }, + }; + + return nhad; } /* Returns true if there are device nexthops in n */ static inline int -has_device_nexthops(const struct nexthop *n) +has_device_nexthops(struct nexthop_adata *nhad) { - for (; n; n = n->next) + NEXTHOP_WALK(n, nhad) if (ipa_zero(n->gw)) return 1; @@ -53,38 +59,22 @@ has_device_nexthops(const struct nexthop *n) } /* Replace device nexthops with nexthops to gw */ -static struct nexthop * -fix_device_nexthops(struct ospf_proto *p, const struct nexthop *n, ip_addr gw) +static struct nexthop_adata * +fix_device_nexthops(struct ospf_proto *p, struct nexthop_adata *old, ip_addr gw) { - struct nexthop *root1 = NULL; - struct nexthop *root2 = NULL; - struct nexthop **nn1 = &root1; - struct nexthop **nn2 = &root2; - if (!p->ecmp) - return new_nexthop(p, gw, n->iface, n->weight); - - /* This is a bit tricky. We cannot just copy the list and update n->gw, - because the list should stay sorted, so we create two lists, one with new - gateways and one with old ones, and then merge them. */ - - for (; n; n = n->next) { - struct nexthop *nn = new_nexthop(p, ipa_zero(n->gw) ? gw : n->gw, n->iface, n->weight); + struct nexthop_adata *new = (struct nexthop_adata *) lp_store_adata(p->nhpool, old->ad.data, old->ad.length); + new->nh.gw = gw; + return new; + } + struct nexthop_adata *tmp = (struct nexthop_adata *) tmp_copy_adata(&old->ad); + NEXTHOP_WALK(n, tmp) if (ipa_zero(n->gw)) - { - *nn1 = nn; - nn1 = &(nn->next); - } - else - { - *nn2 = nn; - nn2 = &(nn->next); - } - } + n->gw = gw; - return nexthop_merge(root1, root2, 1, 1, p->ecmp, p->nhpool); + return nexthop_sort(tmp, p->nhpool); } @@ -169,9 +159,9 @@ orta_compare(const struct ospf_proto *p, const orta *new, const orta *old) return -1; if (!new->nhs) return 1; - if (nh_is_vlink(new->nhs)) + if (nh_is_vlink(&new->nhs->nh)) return -1; - if (nh_is_vlink(old->nhs)) + if (nh_is_vlink(&old->nhs->nh)) return 1; @@ -279,11 +269,7 @@ ort_merge(struct ospf_proto *p, ort *o, const orta *new) orta *old = &o->n; if (old->nhs != new->nhs) - { - old->nhs = nexthop_merge(old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse, - p->ecmp, p->nhpool); - old->nhs_reuse = 1; - } + old->nhs = nexthop_merge(old->nhs, new->nhs, p->ecmp, p->nhpool); if (old->rid < new->rid) old->rid = new->rid; @@ -295,11 +281,7 @@ ort_merge_ext(struct ospf_proto *p, ort *o, const orta *new) orta *old = &o->n; if (old->nhs != new->nhs) - { - old->nhs = nexthop_merge(old->nhs, new->nhs, old->nhs_reuse, new->nhs_reuse, - p->ecmp, p->nhpool); - old->nhs_reuse = 1; - } + old->nhs = nexthop_merge(old->nhs, new->nhs, p->ecmp, p->nhpool); if (old->tag != new->tag) old->tag = 0; @@ -1165,7 +1147,7 @@ ospf_check_vlinks(struct ospf_proto *p) if (tmp && (tmp->color == INSPF) && ipa_nonzero(tmp->lb) && tmp->nhs) { - struct ospf_iface *nhi = ospf_iface_find(p, tmp->nhs->iface); + struct ospf_iface *nhi = ospf_iface_find(p, tmp->nhs->nh.iface); if ((ifa->state != OSPF_IS_PTP) || (ifa->vifa != nhi) @@ -1579,10 +1561,7 @@ ospf_ext_spf(struct ospf_proto *p) /* Replace device nexthops with nexthops to forwarding address from LSA */ if (has_device_nexthops(nfa.nhs)) - { nfa.nhs = fix_device_nexthops(p, nfa.nhs, rt.fwaddr); - nfa.nhs_reuse = 1; - } } if (rt.ebit) @@ -1726,10 +1705,10 @@ ospf_rt_spf(struct ospf_proto *p) static inline int -inherit_nexthops(struct nexthop *pn) +inherit_nexthops(struct nexthop_adata *pn) { /* Proper nexthops (with defined GW) or dummy vlink nexthops (without iface) */ - return pn && (ipa_nonzero(pn->gw) || !pn->iface); + return pn && (ipa_nonzero(pn->nh.gw) || !pn->nh.iface); } static inline ip_addr @@ -1744,12 +1723,12 @@ link_lsa_lladdr(struct ospf_proto *p, struct top_hash_entry *en) return ospf_is_ip4(p) ? ipa_from_ip4(ospf3_6to4(ll)) : ipa_from_ip6(ll); } -static struct nexthop * +static struct nexthop_adata * calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry *par, int pos, uint data, uint lif, uint nif) { struct ospf_proto *p = oa->po; - struct nexthop *pn = par->nhs; + struct nexthop_adata *pn = par->nhs; struct top_hash_entry *link = NULL; struct ospf_iface *ifa = NULL; ip_addr nh = IPA_NONE; @@ -1827,10 +1806,10 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, return NULL; } - struct nexthop *nhs = new_nexthop(p, nh, ifa->iface, ifa->ecmp_weight); + struct nexthop_adata *nhs = new_nexthop(p, nh, ifa->iface, ifa->ecmp_weight); if (ifa->addr->flags & IA_HOST) - nhs->flags = RNF_ONLINK; + nhs->nh.flags = RNF_ONLINK; return nhs; } @@ -1851,7 +1830,7 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, if (ipa_zero(en->lb)) goto bad; - return new_nexthop(p, en->lb, pn->iface, pn->weight); + return new_nexthop(p, en->lb, pn->nh.iface, pn->nh.weight); } else /* OSPFv3 */ { @@ -1859,7 +1838,7 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, * Next-hop is taken from lladdr field of Link-LSA, en->lb_id * is computed in link_back(). */ - link = ospf_hash_find(p->gr, pn->iface->index, en->lb_id, rid, LSA_T_LINK); + link = ospf_hash_find(p->gr, pn->nh.iface->index, en->lb_id, rid, LSA_T_LINK); if (!link) return NULL; @@ -1867,7 +1846,7 @@ calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en, if (ipa_zero(nh)) return NULL; - return new_nexthop(p, nh, pn->iface, pn->weight); + return new_nexthop(p, nh, pn->nh.iface, pn->nh.weight); } } @@ -1914,7 +1893,7 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry if (!link_back(oa, en, par, lif, nif)) return; - struct nexthop *nhs = calc_next_hop(oa, en, par, pos, data, lif, nif); + struct nexthop_adata *nhs = calc_next_hop(oa, en, par, pos, data, lif, nif); if (!nhs) { log(L_WARN "%s: Cannot find next hop for LSA (Type: %04x, Id: %R, Rt: %R)", @@ -1923,7 +1902,7 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry } /* If en->dist > 0, we know that en->color == CANDIDATE and en->nhs is defined. */ - if ((dist == en->dist) && !nh_is_vlink(en->nhs)) + if ((dist == en->dist) && !nh_is_vlink(&en->nhs->nh)) { /* * For multipath, we should merge nexthops. We merge regular nexthops only. @@ -1947,13 +1926,11 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry */ /* Keep old ones */ - if (!p->ecmp || nh_is_vlink(nhs) || (nhs == en->nhs)) + if (!p->ecmp || nh_is_vlink(&nhs->nh) || (nhs == en->nhs)) return; /* Merge old and new */ - int new_reuse = (par->nhs != nhs); - en->nhs = nexthop_merge(en->nhs, nhs, en->nhs_reuse, new_reuse, p->ecmp, p->nhpool); - en->nhs_reuse = 1; + en->nhs = nexthop_merge(en->nhs, nhs, p->ecmp, p->nhpool); return; } @@ -1967,7 +1944,6 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry en->nhs = nhs; en->dist = dist; en->color = CANDIDATE; - en->nhs_reuse = (par->nhs != nhs); prev = NULL; @@ -2001,14 +1977,34 @@ add_cand(struct ospf_area *oa, struct top_hash_entry *en, struct top_hash_entry } static inline int -ort_changed(ort *nf, rta *nr) +ort_changed(ort *nf, ea_list *nr) { - rta *or = nf->old_rta; - return !or || + ea_list *or = nf->old_ea; + + if (!or || (nf->n.metric1 != nf->old_metric1) || (nf->n.metric2 != nf->old_metric2) || - (nf->n.tag != nf->old_tag) || (nf->n.rid != nf->old_rid) || - (nr->source != or->source) || (nr->dest != or->dest) || - !nexthop_same(&(nr->nh), &(or->nh)); + (nf->n.tag != nf->old_tag) || (nf->n.rid != nf->old_rid)) + return 1; + + eattr *nhea_n = ea_find(nr, &ea_gen_nexthop); + eattr *nhea_o = ea_find(or, &ea_gen_nexthop); + if (!nhea_n != !nhea_o) + return 1; + + if (nhea_n && nhea_o) + { + struct nexthop_adata *nhad_n = (struct nexthop_adata *) nhea_n->u.ptr; + struct nexthop_adata *nhad_o = (struct nexthop_adata *) nhea_o->u.ptr; + + if (!nexthop_same(nhad_n, nhad_o)) + return 1; + } + + if ( ea_get_int(nr, &ea_gen_source, 0) + != ea_get_int(or, &ea_gen_source, 0)) + return 1; + + return 0; } static void @@ -2030,10 +2026,9 @@ again1: FIB_ITERATE_START(fib, &fit, ort, nf) { /* Sanity check of next-hop addresses, failure should not happen */ - if (nf->n.type) + if (nf->n.type && nf->n.nhs) { - struct nexthop *nh; - for (nh = nf->n.nhs; nh; nh = nh->next) + NEXTHOP_WALK(nh, nf->n.nhs) if (ipa_nonzero(nh->gw)) { neighbor *nbr = neigh_find(&p->p, nh->gw, nh->iface, @@ -2052,69 +2047,67 @@ again1: if (nf->n.type) /* Add the route */ { - rta a0 = { - .source = nf->n.type, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, - .nh = *(nf->n.nhs), - .pref = p->p.main_channel->preference, - }; - - if (reload || ort_changed(nf, &a0)) - { - a0.eattrs = alloca(sizeof(ea_list) + 4 * sizeof(eattr)); - memset(a0.eattrs, 0, sizeof(ea_list)); + struct { + ea_list l; + eattr a[7]; + } eattrs; + eattrs.l = (ea_list) {}; + + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_gen_preference, 0, p->p.main_channel->preference); + + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_gen_source, 0, nf->n.type); + + eattrs.a[eattrs.l.count++] = + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nf->n.nhs->ad); + + if (reload || ort_changed(nf, &eattrs.l)) + { nf->old_metric1 = nf->n.metric1; nf->old_metric2 = nf->n.metric2; nf->old_tag = nf->n.tag; nf->old_rid = nf->n.rid; - a0.eattrs->attrs[a0.eattrs->count++] = (eattr) { - .id = EA_OSPF_METRIC1, - .type = EAF_TYPE_INT, - .u.data = nf->n.metric1, - }; + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_ospf_metric1, 0, nf->n.metric1); if (nf->n.type == RTS_OSPF_EXT2) - a0.eattrs->attrs[a0.eattrs->count++] = (eattr) { - .id = EA_OSPF_METRIC2, - .type = EAF_TYPE_INT, - .u.data = nf->n.metric2, - }; + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_ospf_metric2, 0, nf->n.metric2); if ((nf->n.type == RTS_OSPF_EXT1) || (nf->n.type == RTS_OSPF_EXT2)) - a0.eattrs->attrs[a0.eattrs->count++] = (eattr) { - .id = EA_OSPF_TAG, - .type = EAF_TYPE_INT, - .u.data = nf->n.tag, - }; - - a0.eattrs->attrs[a0.eattrs->count++] = (eattr) { - .id = EA_OSPF_ROUTER_ID, - .type = EAF_TYPE_ROUTER_ID, - .u.data = nf->n.rid, - }; + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_ospf_tag, 0, nf->n.tag); + + eattrs.a[eattrs.l.count++] = + EA_LITERAL_EMBEDDED(&ea_ospf_router_id, 0, nf->n.rid); - rta_free(nf->old_rta); - nf->old_rta = rta_lookup(&a0); + ASSERT_DIE(ARRAY_SIZE(eattrs.a) >= eattrs.l.count); + + ea_list *eal = ea_lookup(&eattrs.l, 0); + ea_free(nf->old_ea); + nf->old_ea = eal; rte e0 = { - .attrs = nf->old_rta, + .attrs = eal, .src = p->p.main_source, }; + /* DBG("Mod rte type %d - %N via %I on iface %s, met %d\n", a0.source, nf->fn.addr, a0.gw, a0.iface ? a0.iface->name : "(none)", nf->n.metric1); + */ rte_update(p->p.main_channel, nf->fn.addr, &e0, p->p.main_source); } } - else if (nf->old_rta) + else if (nf->old_ea) { /* Remove the route */ - rta_free(nf->old_rta); - nf->old_rta = NULL; + rta_free(nf->old_ea); + nf->old_ea = NULL; rte_update(p->p.main_channel, nf->fn.addr, NULL, p->p.main_source); } diff --git a/proto/ospf/rt.h b/proto/ospf/rt.h index 094e125b..88eefef9 100644 --- a/proto/ospf/rt.h +++ b/proto/ospf/rt.h @@ -18,8 +18,6 @@ typedef struct orta { u8 type; /* RTS_OSPF_* */ - u8 nhs_reuse; /* Whether nhs nodes can be reused during merging. - See a note in rt.c:add_cand() */ u32 options; /* * For ORT_ROUTER routes, options field are router-LSA style @@ -53,7 +51,7 @@ typedef struct orta struct ospf_area *oa; struct ospf_area *voa; /* Used when route is replaced in ospf_rt_sum_tr(), NULL otherwise */ - struct nexthop *nhs; /* Next hops computed during SPF */ + struct nexthop_adata *nhs; /* Next hops computed during SPF */ struct top_hash_entry *en; /* LSA responsible for this orta */ } orta; @@ -80,7 +78,7 @@ typedef struct ort */ orta n; u32 old_metric1, old_metric2, old_tag, old_rid; - rta *old_rta; + ea_list *old_ea; u32 lsa_id; u8 external_rte; u8 area_net; diff --git a/proto/ospf/topology.c b/proto/ospf/topology.c index bb88d20a..85bce03d 100644 --- a/proto/ospf/topology.c +++ b/proto/ospf/topology.c @@ -1337,9 +1337,9 @@ ospf_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *n, rt ASSERT(p->asbr); /* Get route attributes */ - rta *a = new->attrs; - eattr *m1a = ea_find(a->eattrs, EA_OSPF_METRIC1); - eattr *m2a = ea_find(a->eattrs, EA_OSPF_METRIC2); + ea_list *a = new->attrs; + eattr *m1a = ea_find(a, &ea_ospf_metric1); + eattr *m2a = ea_find(a, &ea_ospf_metric2); uint m1 = m1a ? m1a->u.data : 0; uint m2 = m2a ? m2a->u.data : 10000; @@ -1363,11 +1363,14 @@ ospf_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *n, rt uint ebit = m2a || !m1a; uint metric = ebit ? m2 : m1; - uint tag = ea_get_int(a->eattrs, EA_OSPF_TAG, 0); + uint tag = ea_get_int(a, &ea_ospf_tag, 0); ip_addr fwd = IPA_NONE; - if ((a->dest == RTD_UNICAST) && use_gw_for_fwaddr(p, a->nh.gw, a->nh.iface)) - fwd = a->nh.gw; + eattr *nhea = ea_find(a, &ea_gen_nexthop); + struct nexthop_adata *nhad = (struct nexthop_adata *) nhea->u.ptr; + if (NEXTHOP_IS_REACHABLE(nhad)) + if (use_gw_for_fwaddr(p, nhad->nh.gw, nhad->nh.iface)) + fwd = nhad->nh.gw; /* NSSA-LSA with P-bit set must have non-zero forwarding address */ if (oa && ipa_zero(fwd)) @@ -2135,7 +2138,7 @@ ospf_hash_delete(struct top_graph *f, struct top_hash_entry *e) if (*ee == e) { *ee = e->next; - sl_free(f->hash_slab, e); + sl_free(e); if (f->hash_entries-- < f->hash_entries_min) ospf_top_rehash(f, -HASH_LO_STEP); return; diff --git a/proto/ospf/topology.h b/proto/ospf/topology.h index c36d0b50..3c92b431 100644 --- a/proto/ospf/topology.h +++ b/proto/ospf/topology.h @@ -28,7 +28,7 @@ struct top_hash_entry u16 next_lsa_opts; /* For postponed LSA origination */ btime inst_time; /* Time of installation into DB */ struct ort *nf; /* Reference fibnode for sum and ext LSAs, NULL for otherwise */ - struct nexthop *nhs; /* Computed nexthops - valid only in ospf_rt_spf() */ + struct nexthop_adata *nhs; /* Computed nexthops - valid only in ospf_rt_spf() */ ip_addr lb; /* In OSPFv2, link back address. In OSPFv3, any global address in the area useful for vlinks */ u32 lb_id; /* Interface ID of link back iface (for bcast or NBMA networks) */ u32 dist; /* Distance from the root */ @@ -39,8 +39,6 @@ struct top_hash_entry #define CANDIDATE 1 #define INSPF 2 u8 mode; /* LSA generated during RT calculation (LSA_RTCALC or LSA_STALE)*/ - u8 nhs_reuse; /* Whether nhs nodes can be reused during merging. - See a note in rt.c:add_cand() */ }; diff --git a/proto/perf/perf.c b/proto/perf/perf.c index aa688d88..dc5bbf2f 100644 --- a/proto/perf/perf.c +++ b/proto/perf/perf.c @@ -18,7 +18,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "conf/conf.h" #include "filter/filter.h" @@ -85,7 +85,7 @@ random_net_ip4(void) } struct perf_random_routes { - struct rta *a; + ea_list *a; net_addr net; }; @@ -142,17 +142,21 @@ perf_loop(void *data) *((net_addr_ip4 *) &(p->data[i].net)) = random_net_ip4(); if (!p->attrs_per_rte || !(i % p->attrs_per_rte)) { - struct rta a0 = { - .source = RTS_PERF, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, - .pref = p->p.main_channel->preference, + ea_list *ea = NULL; + + ea_set_attr_u32(&ea, &ea_gen_preference, 0, p->p.main_channel->preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_PERF); + + struct nexthop_adata nhad = { .nh.iface = p->ifa->iface, .nh.gw = gw, .nh.weight = 1, }; - p->data[i].a = rta_lookup(&a0); + ea_set_attr_data(&ea, &ea_gen_nexthop, 0, + &nhad.ad.data, sizeof nhad - sizeof nhad.ad); + + p->data[i].a = rta_lookup(ea, 0); } else p->data[i].a = rta_clone(p->data[i-1].a); @@ -198,9 +202,9 @@ perf_loop(void *data) p->exp++; } - RT_LOCK(P->main_channel->table); - rt_schedule_prune(RT_PRIV(P->main_channel->table)); - RT_UNLOCK(P->main_channel->table); + RT_LOCKED(P->main_channel->table, tab) + rt_schedule_prune(tab); + ev_schedule(p->loop); } @@ -268,7 +272,7 @@ perf_init(struct proto_config *CF) switch (p->mode) { case PERF_MODE_IMPORT: - P->ifa_notify = perf_ifa_notify; + P->iface_sub.ifa_notify = perf_ifa_notify; break; case PERF_MODE_EXPORT: P->rt_notify = perf_rt_notify; @@ -307,7 +311,6 @@ perf_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUS struct protocol proto_perf = { .name = "Perf", .template = "perf%d", - .class = PROTOCOL_PERF, .channel_mask = NB_IP, .proto_size = sizeof(struct perf_proto), .config_size = sizeof(struct perf_config), @@ -316,3 +319,9 @@ struct protocol proto_perf = { .reconfigure = perf_reconfigure, .copy_config = perf_copy_config, }; + +void +perf_build(void) +{ + proto_build(&proto_perf); +} diff --git a/proto/pipe/Makefile b/proto/pipe/Makefile index 5093da98..0d68db4c 100644 --- a/proto/pipe/Makefile +++ b/proto/pipe/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/pipe/config.Y b/proto/pipe/config.Y index fc08445f..444de127 100644 --- a/proto/pipe/config.Y +++ b/proto/pipe/config.Y @@ -42,6 +42,12 @@ pipe_proto: pipe_proto_start '{' | pipe_proto proto_item ';' | pipe_proto channel_item_ ';' + | pipe_proto IMPORT IN net_any imexport ';' { + if (this_channel->net_type && ($4->type != this_channel->net_type)) + cf_error("Incompatible export prefilter type"); + PIPE_CFG->in_subprefix = $4; + this_channel->in_filter = $5; + } | pipe_proto PEER TABLE rtable ';' { PIPE_CFG->peer = $4; } | pipe_proto MAX GENERATION expr ';' { if (($4 < 1) || ($4 > 254)) cf_error("Max generation must be in range 1..254, got %u", $4); diff --git a/proto/pipe/pipe.c b/proto/pipe/pipe.c index 270f7b92..b2083010 100644 --- a/proto/pipe/pipe.c +++ b/proto/pipe/pipe.c @@ -35,7 +35,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "conf/conf.h" #include "filter/filter.h" @@ -43,32 +43,29 @@ #include "pipe.h" -#ifdef CONFIG_BGP -#include "proto/bgp/bgp.h" -#endif - static void pipe_rt_notify(struct proto *P, struct channel *src_ch, const net_addr *n, rte *new, const rte *old) { struct pipe_proto *p = (void *) P; struct channel *dst = (src_ch == p->pri) ? p->sec : p->pri; + uint *flags = (src_ch == p->pri) ? &p->sec_flags : &p->pri_flags; if (!new && !old) return; + /* Start the route refresh if requested to */ + if (*flags & PIPE_FL_RR_BEGIN_PENDING) + { + *flags &= ~PIPE_FL_RR_BEGIN_PENDING; + rt_refresh_begin(&dst->in_req); + } + if (new) { - rta *a = alloca(rta_size(new->attrs)); - memcpy(a, new->attrs, rta_size(new->attrs)); + rte e0 = rte_init_from(new); - a->cached = 0; - a->hostentry = NULL; - - rte e0 = { - .attrs = a, - .src = new->src, - .generation = new->generation + 1, - }; + e0.generation = new->generation + 1; + ea_unset_attr(&e0.attrs, 0, &ea_gen_hostentry); rte_update(dst, n, &e0, new->src); } @@ -77,12 +74,12 @@ pipe_rt_notify(struct proto *P, struct channel *src_ch, const net_addr *n, rte * } static int -pipe_preexport(struct channel *c, rte *e) +pipe_preexport(struct channel *C, rte *e) { - struct pipe_proto *p = (void *) c->proto; + struct pipe_proto *p = (void *) C->proto; /* Avoid direct loopbacks */ - if (e->sender == c->in_req.hook) + if (e->sender == C->in_req.hook) return -1; /* Indirection check */ @@ -90,8 +87,8 @@ pipe_preexport(struct channel *c, rte *e) if (e->generation >= max_generation) { log_rl(&p->rl_gen, L_ERR "Route overpiped (%u hops of %u configured in %s) in table %s: %N %s/%u:%u", - e->generation, max_generation, c->proto->name, - c->table->name, e->net, e->src->owner->name, e->src->private_id, e->src->global_id); + e->generation, max_generation, C->proto->name, + C->table->name, e->net, e->src->owner->name, e->src->private_id, e->src->global_id); return -1; } @@ -109,12 +106,12 @@ pipe_reload_routes(struct channel *C) } static void -pipe_feed_begin(struct channel *C, int refeeding UNUSED) +pipe_feed_begin(struct channel *C, int initial UNUSED) { struct pipe_proto *p = (void *) C->proto; - struct channel *dst = (C == p->pri) ? p->sec : p->pri; + uint *flags = (C == p->pri) ? &p->sec_flags : &p->pri_flags; - channel_refresh_begin(dst); + *flags |= PIPE_FL_RR_BEGIN_PENDING; } static void @@ -122,8 +119,17 @@ pipe_feed_end(struct channel *C) { struct pipe_proto *p = (void *) C->proto; struct channel *dst = (C == p->pri) ? p->sec : p->pri; + uint *flags = (C == p->pri) ? &p->sec_flags : &p->pri_flags; + + /* If not even started, start the RR now */ + if (*flags & PIPE_FL_RR_BEGIN_PENDING) + { + *flags &= ~PIPE_FL_RR_BEGIN_PENDING; + rt_refresh_begin(&dst->in_req); + } - channel_refresh_end(dst); + /* Finish RR always */ + rt_refresh_end(&dst->in_req); } static void @@ -144,10 +150,16 @@ pipe_postconfig(struct proto_config *CF) if (cc->table->addr_type != cf->peer->addr_type) cf_error("Primary table and peer table must have the same type"); + if (cc->out_subprefix && (cc->table->addr_type != cc->out_subprefix->type)) + cf_error("Export subprefix must match table type"); + + if (cf->in_subprefix && (cc->table->addr_type != cf->in_subprefix->type)) + cf_error("Import subprefix must match table type"); + if (cc->rx_limit.action) cf_error("Pipe protocol does not support receive limits"); - if (cc->in_keep_filtered) + if (cc->in_keep) cf_error("Pipe protocol prohibits keeping filtered routes"); cc->debug = cf->c.debug; @@ -163,6 +175,7 @@ pipe_configure_channels(struct pipe_proto *p, struct pipe_config *cf) .channel = cc->channel, .table = cc->table, .out_filter = cc->out_filter, + .out_subprefix = cc->out_subprefix, .in_limit = cc->in_limit, .ra_mode = RA_ANY, .debug = cc->debug, @@ -174,6 +187,7 @@ pipe_configure_channels(struct pipe_proto *p, struct pipe_config *cf) .channel = cc->channel, .table = cf->peer, .out_filter = cc->in_filter, + .out_subprefix = cf->in_subprefix, .in_limit = cc->out_limit, .ra_mode = RA_ANY, .debug = cc->debug, @@ -318,7 +332,6 @@ pipe_update_debug(struct proto *P) struct protocol proto_pipe = { .name = "Pipe", .template = "pipe%d", - .class = PROTOCOL_PIPE, .proto_size = sizeof(struct pipe_proto), .config_size = sizeof(struct pipe_config), .postconfig = pipe_postconfig, @@ -328,3 +341,9 @@ struct protocol proto_pipe = { .get_status = pipe_get_status, .show_proto_info = pipe_show_proto_info }; + +void +pipe_build(void) +{ + proto_build(&proto_pipe); +} diff --git a/proto/pipe/pipe.h b/proto/pipe/pipe.h index 60c857eb..501b8565 100644 --- a/proto/pipe/pipe.h +++ b/proto/pipe/pipe.h @@ -12,6 +12,7 @@ struct pipe_config { struct proto_config c; struct rtable_config *peer; /* Table we're connected to */ + const net_addr *in_subprefix; u8 max_generation; }; @@ -19,7 +20,11 @@ struct pipe_proto { struct proto p; struct channel *pri; struct channel *sec; + uint pri_flags; + uint sec_flags; struct tbf rl_gen; }; +#define PIPE_FL_RR_BEGIN_PENDING 1 /* Route refresh should start with the first route notified */ + #endif diff --git a/proto/radv/Makefile b/proto/radv/Makefile index 05317eff..5c56fbf3 100644 --- a/proto/radv/Makefile +++ b/proto/radv/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/radv/config.Y b/proto/radv/config.Y index 8d4a3ab9..fb68d2e5 100644 --- a/proto/radv/config.Y +++ b/proto/radv/config.Y @@ -33,7 +33,7 @@ CF_KEYWORDS(RADV, PREFIX, INTERFACE, MIN, MAX, RA, DELAY, INTERVAL, SOLICITED, RETRANS, TIMER, CURRENT, HOP, LIMIT, DEFAULT, VALID, PREFERRED, MULT, LIFETIME, SKIP, ONLINK, AUTONOMOUS, RDNSS, DNSSL, NS, DOMAIN, LOCAL, TRIGGER, SENSITIVE, PREFERENCE, LOW, MEDIUM, HIGH, PROPAGATE, ROUTE, - ROUTES, RA_PREFERENCE, RA_LIFETIME) + ROUTES) CF_ENUM(T_ENUM_RA_PREFERENCE, RA_PREF_, LOW, MEDIUM, HIGH) @@ -336,9 +336,6 @@ radv_sensitive: | SENSITIVE bool { $$ = $2; } ; -dynamic_attr: RA_PREFERENCE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_ENUM_RA_PREFERENCE, EA_RA_PREFERENCE); } ; -dynamic_attr: RA_LIFETIME { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_RA_LIFETIME); } ; - CF_CODE CF_END diff --git a/proto/radv/packets.c b/proto/radv/packets.c index 5cd8b2de..c6b565d2 100644 --- a/proto/radv/packets.c +++ b/proto/radv/packets.c @@ -493,7 +493,7 @@ radv_sk_open(struct radv_iface *ifa) sk->data = ifa; sk->flags = SKF_LADDR_RX; - if (sk_open(sk) < 0) + if (sk_open(sk, ifa->ra->p.loop) < 0) goto err; /* We want listen just to ICMPv6 messages of type RS and RA */ diff --git a/proto/radv/radv.c b/proto/radv/radv.c index 15673555..434155dc 100644 --- a/proto/radv/radv.c +++ b/proto/radv/radv.c @@ -10,6 +10,7 @@ #include <stdlib.h> #include "radv.h" +#include "lib/macro.h" /** * DOC: Router Advertisements @@ -42,6 +43,8 @@ * RFC 6106 - DNS extensions (RDDNS, DNSSL) */ +static struct ea_class ea_radv_preference, ea_radv_lifetime; + static void radv_prune_prefixes(struct radv_iface *ifa); static void radv_prune_routes(struct radv_proto *p); @@ -263,9 +266,9 @@ radv_iface_find(struct radv_proto *p, struct iface *what) } static void -radv_iface_add(struct object_lock *lock) +radv_iface_add(void *_ifa) { - struct radv_iface *ifa = lock->data; + struct radv_iface *ifa = _ifa; struct radv_proto *p = ifa->ra; if (! radv_sk_open(ifa)) @@ -284,7 +287,7 @@ radv_iface_new(struct radv_proto *p, struct iface *iface, struct radv_iface_conf RADV_TRACE(D_EVENTS, "Adding interface %s", iface->name); - pool *pool = rp_new(p->p.pool, p->p.loop, iface->name); + pool *pool = rp_new(p->p.pool, iface->name); ifa = mb_allocz(pool, sizeof(struct radv_iface)); ifa->pool = pool; ifa->ra = p; @@ -302,8 +305,11 @@ radv_iface_new(struct radv_proto *p, struct iface *iface, struct radv_iface_conf lock->type = OBJLOCK_IP; lock->port = ICMPV6_PROTO; lock->iface = iface; - lock->data = ifa; - lock->hook = radv_iface_add; + lock->event = (event) { + .hook = radv_iface_add, + .data = ifa, + }; + lock->target = &global_event_list; ifa->lock = lock; olock_acquire(lock); @@ -317,7 +323,7 @@ radv_iface_remove(struct radv_iface *ifa) rem_node(NODE ifa); - rp_free(ifa->pool, p->p.pool); + rfree(ifa->pool); } static void @@ -391,10 +397,10 @@ radv_net_match_trigger(struct radv_config *cf, const net_addr *n) } int -radv_preexport(struct channel *c, rte *new) +radv_preexport(struct channel *C, rte *new) { // struct radv_proto *p = (struct radv_proto *) P; - struct radv_config *cf = (struct radv_config *) (c->proto->cf); + struct radv_config *cf = (struct radv_config *) (C->proto->cf); if (radv_net_match_trigger(cf, new->net)) return RIC_PROCESS; @@ -444,11 +450,11 @@ radv_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *n, rt { /* Update */ - ea = ea_find(new->attrs->eattrs, EA_RA_PREFERENCE); + ea = ea_find(new->attrs, &ea_radv_preference); uint preference = ea ? ea->u.data : RA_PREF_MEDIUM; uint preference_set = !!ea; - ea = ea_find(new->attrs->eattrs, EA_RA_LIFETIME); + ea = ea_find(new->attrs, &ea_radv_lifetime); uint lifetime = ea ? ea->u.data : 0; uint lifetime_set = !!ea; @@ -555,10 +561,7 @@ radv_check_active(struct radv_proto *p) return 1; struct channel *c = p->p.main_channel; - RT_LOCK(c->table); - int active = rt_examine(RT_PRIV(c->table), &cf->trigger, c, c->out_filter); - RT_UNLOCK(c->table); - return active; + return rt_examine(c->table, &cf->trigger, c, c->out_filter); } static void @@ -580,8 +583,8 @@ radv_init(struct proto_config *CF) P->preexport = radv_preexport; P->rt_notify = radv_rt_notify; - P->if_notify = radv_if_notify; - P->ifa_notify = radv_ifa_notify; + P->iface_sub.if_notify = radv_if_notify; + P->iface_sub.ifa_notify = radv_ifa_notify; return P; } @@ -663,10 +666,11 @@ radv_reconfigure(struct proto *P, struct proto_config *CF) if (!old->propagate_routes && new->propagate_routes) channel_request_feeding(p->p.main_channel); - IFACE_LEGACY_ACCESS; - struct iface *iface; - WALK_LIST(iface, global_iface_list) + IFACE_WALK(iface) { + if (p->p.vrf && p->p.vrf != iface->master) + continue; + if (!(iface->flags & IF_UP)) continue; @@ -742,27 +746,26 @@ radv_pref_str(u32 pref) } } -/* The buffer has some minimal size */ -static int -radv_get_attr(const eattr *a, byte *buf, int buflen UNUSED) +static void +radv_preference_format(const eattr *a, byte *buf, uint buflen) { - switch (a->id) - { - case EA_RA_PREFERENCE: - bsprintf(buf, "preference: %s", radv_pref_str(a->u.data)); - return GA_FULL; - case EA_RA_LIFETIME: - bsprintf(buf, "lifetime"); - return GA_NAME; - default: - return GA_UNKNOWN; - } + bsnprintf(buf, buflen, "%s", radv_pref_str(a->u.data)); } +static struct ea_class ea_radv_preference = { + .name = "radv_preference", + .type = T_ENUM_RA_PREFERENCE, + .format = radv_preference_format, +}; + +static struct ea_class ea_radv_lifetime = { + .name = "radv_lifetime", + .type = T_INT, +}; + struct protocol proto_radv = { .name = "RAdv", .template = "radv%d", - .class = PROTOCOL_RADV, .channel_mask = NB_IP6, .proto_size = sizeof(struct radv_proto), .config_size = sizeof(struct radv_config), @@ -773,5 +776,15 @@ struct protocol proto_radv = { .reconfigure = radv_reconfigure, .copy_config = radv_copy_config, .get_status = radv_get_status, - .get_attr = radv_get_attr }; + +void +radv_build(void) +{ + proto_build(&proto_radv); + + EA_REGISTER_ALL( + &ea_radv_preference, + &ea_radv_lifetime + ); +} diff --git a/proto/radv/radv.h b/proto/radv/radv.h index 14d40f8a..c9219bda 100644 --- a/proto/radv/radv.h +++ b/proto/radv/radv.h @@ -19,7 +19,7 @@ #include "lib/resource.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "nest/locks.h" #include "conf/conf.h" @@ -195,10 +195,6 @@ struct radv_iface #define RA_PREF_HIGH 0x08 #define RA_PREF_MASK 0x18 -/* Attributes */ -#define EA_RA_PREFERENCE EA_CODE(PROTOCOL_RADV, 0) -#define EA_RA_LIFETIME EA_CODE(PROTOCOL_RADV, 1) - #ifdef LOCAL_DEBUG #define RADV_FORCE_DEBUG 1 #else diff --git a/proto/rip/Makefile b/proto/rip/Makefile index 7feabcd8..f4a6fa72 100644 --- a/proto/rip/Makefile +++ b/proto/rip/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/rip/config.Y b/proto/rip/config.Y index 28ee9609..3c0973b1 100644 --- a/proto/rip/config.Y +++ b/proto/rip/config.Y @@ -37,7 +37,7 @@ CF_KEYWORDS(RIP, NG, ECMP, LIMIT, WEIGHT, INFINITY, METRIC, UPDATE, TIMEOUT, PASSIVE, VERSION, SPLIT, HORIZON, POISON, REVERSE, CHECK, ZERO, TIME, BFD, AUTHENTICATION, NONE, PLAINTEXT, CRYPTOGRAPHIC, MD5, TTL, SECURITY, RX, TX, BUFFER, LENGTH, PRIORITY, ONLY, LINK, - DEMAND, CIRCUIT, RIP_METRIC, RIP_TAG) + DEMAND, CIRCUIT) %type <i> rip_variant rip_auth @@ -190,9 +190,6 @@ rip_iface: rip_iface_start iface_patt_list_nopx rip_iface_opt_list rip_iface_finish; -dynamic_attr: RIP_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_RIP_METRIC); } ; -dynamic_attr: RIP_TAG { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_RIP_TAG); } ; - CF_CLI_HELP(SHOW RIP, ..., [[Show information about RIP protocol]]); CF_CLI(SHOW RIP INTERFACES, optproto opttext, [<name>] [\"<interface>\"], [[Show information about RIP interfaces]]) diff --git a/proto/rip/packets.c b/proto/rip/packets.c index 9c3bd7a3..fecdf896 100644 --- a/proto/rip/packets.c +++ b/proto/rip/packets.c @@ -1012,7 +1012,7 @@ rip_open_socket(struct rip_iface *ifa) /* sk->rbsize and sk->tbsize are handled in rip_iface_update_buffers() */ - if (sk_open(sk) < 0) + if (sk_open(sk, p->p.loop) < 0) goto err; if (ifa->cf->mode == RIP_IM_MULTICAST) diff --git a/proto/rip/rip.c b/proto/rip/rip.c index 0f8b10ad..d15177da 100644 --- a/proto/rip/rip.c +++ b/proto/rip/rip.c @@ -78,6 +78,7 @@ #include <stdlib.h> #include "rip.h" +#include "lib/macro.h" static inline void rip_lock_neighbor(struct rip_neighbor *n); @@ -88,6 +89,7 @@ static inline void rip_iface_kick_timer(struct rip_iface *ifa); static void rip_iface_timer(timer *timer); static void rip_trigger_update(struct rip_proto *p); +static struct ea_class ea_rip_metric, ea_rip_tag, ea_rip_from; /* * RIP routes @@ -108,14 +110,14 @@ rip_add_rte(struct rip_proto *p, struct rip_rte **rp, struct rip_rte *src) } static inline void -rip_remove_rte(struct rip_proto *p, struct rip_rte **rp) +rip_remove_rte(struct rip_proto *p UNUSED, struct rip_rte **rp) { struct rip_rte *rt = *rp; rip_unlock_neighbor(rt->from); *rp = rt->next; - sl_free(p->rte_slab, rt); + sl_free(rt); } static inline int rip_same_rte(struct rip_rte *a, struct rip_rte *b) @@ -124,6 +126,11 @@ static inline int rip_same_rte(struct rip_rte *a, struct rip_rte *b) static inline int rip_valid_rte(struct rip_rte *rt) { return rt->from->ifa != NULL; } +struct rip_iface_adata { + struct adata ad; + struct iface *iface; +}; + /** * rip_announce_rte - announce route from RIP routing table to the core * @p: RIP instance @@ -144,71 +151,87 @@ rip_announce_rte(struct rip_proto *p, struct rip_entry *en) if (rt) { /* Update */ - rta a0 = { - .pref = p->p.main_channel->preference, - .source = RTS_RIP, - .scope = SCOPE_UNIVERSE, - .dest = RTD_UNICAST, + struct { + ea_list l; + eattr a[3]; + } ea_block = { + .l.count = ARRAY_SIZE(ea_block.a), + .a = { + EA_LITERAL_EMBEDDED(&ea_gen_preference, 0, p->p.main_channel->preference), + EA_LITERAL_EMBEDDED(&ea_gen_source, 0, RTS_RIP), + EA_LITERAL_EMBEDDED(&ea_rip_metric, 0, rt->metric), + }, }; - u8 rt_metric = rt->metric; + ea_list *ea = &ea_block.l; + u16 rt_tag = rt->tag; + struct iface *rt_from = NULL; if (p->ecmp) { /* ECMP route */ - struct nexthop *nhs = NULL; int num = 0; for (rt = en->routes; rt && (num < p->ecmp); rt = rt->next) + if (rip_valid_rte(rt)) + num++; + + struct nexthop_adata *nhad = (struct nexthop_adata *) tmp_alloc_adata((num+1) * sizeof(struct nexthop)); + struct nexthop *nh = &nhad->nh; + + for (rt = en->routes; rt && (num < p->ecmp); rt = rt->next) { if (!rip_valid_rte(rt)) - continue; + continue; - struct nexthop *nh = allocz(sizeof(struct nexthop)); + *nh = (struct nexthop) { + .gw = rt->next_hop, + .iface = rt->from->ifa->iface, + .weight = rt->from->ifa->cf->ecmp_weight, + }; - nh->gw = rt->next_hop; - nh->iface = rt->from->ifa->iface; - nh->weight = rt->from->ifa->cf->ecmp_weight; + if (!rt_from) + rt_from = rt->from->ifa->iface; - nexthop_insert(&nhs, nh); - num++; + nh = NEXTHOP_NEXT(nh); if (rt->tag != rt_tag) rt_tag = 0; } - a0.nh = *nhs; + nhad->ad.length = ((void *) nh - (void *) nhad->ad.data); + + ea_set_attr(&ea, + EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, + &(nexthop_sort(nhad, tmp_linpool)->ad))); } else { /* Unipath route */ - a0.from = rt->from->nbr->addr; - a0.nh.gw = rt->next_hop; - a0.nh.iface = rt->from->ifa->iface; + rt_from = rt->from->ifa->iface; + + struct nexthop_adata nhad = { + .nh.gw = rt->next_hop, + .nh.iface = rt->from->ifa->iface, + }; + + ea_set_attr_data(&ea, &ea_gen_nexthop, 0, + &nhad.ad.data, sizeof nhad - sizeof nhad.ad); + ea_set_attr_data(&ea, &ea_gen_from, 0, &rt->from->nbr->addr, sizeof(ip_addr)); } - a0.eattrs = alloca(sizeof(ea_list) + 3*sizeof(eattr)); - memset(a0.eattrs, 0, sizeof(ea_list)); /* Zero-ing only the ea_list header */ - a0.eattrs->count = 3; - a0.eattrs->attrs[0] = (eattr) { - .id = EA_RIP_METRIC, - .type = EAF_TYPE_INT, - .u.data = rt_metric, - }; - a0.eattrs->attrs[1] = (eattr) { - .id = EA_RIP_TAG, - .type = EAF_TYPE_INT, - .u.data = rt_tag, - }; - a0.eattrs->attrs[2] = (eattr) { - .id = EA_RIP_FROM, - .type = EAF_TYPE_PTR, - .u.data = (uintptr_t) a0.nh.iface, + ea_set_attr_u32(&ea, &ea_rip_tag, 0, rt_tag); + + struct rip_iface_adata riad = { + .ad = { .length = sizeof(struct rip_iface_adata) - sizeof(struct adata) }, + .iface = rt_from, }; + ea_set_attr(&ea, + EA_LITERAL_DIRECT_ADATA(&ea_rip_from, 0, &riad.ad)); rte e0 = { - .attrs = &a0, + .attrs = ea, .src = p->p.main_source, }; @@ -320,9 +343,10 @@ rip_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net, s if (new) { /* Update */ - u32 rt_tag = ea_get_int(new->attrs->eattrs, EA_RIP_TAG, 0); - u32 rt_metric = ea_get_int(new->attrs->eattrs, EA_RIP_METRIC, 1); - struct iface *rt_from = (struct iface *) ea_get_int(new->attrs->eattrs, EA_RIP_FROM, 0); + u32 rt_tag = ea_get_int(new->attrs, &ea_rip_tag, 0); + u32 rt_metric = ea_get_int(new->attrs, &ea_rip_metric, 1); + const eattr *rie = ea_find(new->attrs, &ea_rip_from); + struct iface *rt_from = rie ? ((struct rip_iface_adata *) rie->u.ptr)->iface : NULL; if (rt_metric > p->infinity) { @@ -354,8 +378,14 @@ rip_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net, s en->metric = rt_metric; en->tag = rt_tag; en->from = (new->src->owner == &P->sources) ? rt_from : NULL; - en->iface = new->attrs->nh.iface; - en->next_hop = new->attrs->nh.gw; + + eattr *nhea = ea_find(new->attrs, &ea_gen_nexthop); + if (nhea) + { + struct nexthop_adata *nhad = (struct nexthop_adata *) nhea->u.ptr; + en->iface = nhad->nh.iface; + en->next_hop = nhad->nh.gw; + } } else { @@ -513,7 +543,7 @@ rip_update_bfd(struct rip_proto *p, struct rip_neighbor *n) ip_addr saddr = rip_is_v2(p) ? n->ifa->sk->saddr : n->nbr->ifa->ip; n->bfd_req = bfd_request_session(p->p.pool, n->nbr->addr, saddr, n->nbr->iface, p->p.vrf, - rip_bfd_notify, n, birdloop_event_list(p->p.loop), NULL); + rip_bfd_notify, n, p->p.loop, NULL); } if (!use_bfd && n->bfd_req) @@ -637,9 +667,9 @@ rip_iface_update_bfd(struct rip_iface *ifa) static void -rip_iface_locked(struct object_lock *lock) +rip_iface_locked(void *_ifa) { - struct rip_iface *ifa = lock->data; + struct rip_iface *ifa = _ifa; struct rip_proto *p = ifa->rip; if (!rip_open_socket(ifa)) @@ -701,8 +731,11 @@ rip_add_iface(struct rip_proto *p, struct iface *iface, struct rip_iface_config lock->type = OBJLOCK_UDP; lock->port = ic->port; lock->iface = iface; - lock->data = ifa; - lock->hook = rip_iface_locked; + lock->event = (event) { + .hook = rip_iface_locked, + .data = ifa, + }; + lock->target = &global_event_list; ifa->lock = lock; olock_acquire(lock); @@ -774,11 +807,11 @@ rip_reconfigure_iface(struct rip_proto *p, struct rip_iface *ifa, struct rip_ifa static void rip_reconfigure_ifaces(struct rip_proto *p, struct rip_config *cf) { - struct iface *iface; - - IFACE_LEGACY_ACCESS; - WALK_LIST(iface, global_iface_list) + IFACE_WALK(iface) { + if (p->p.vrf && p->p.vrf != iface->master) + continue; + if (!(iface->flags & IF_UP)) continue; @@ -1086,28 +1119,22 @@ rip_reload_routes(struct channel *C) static struct rte_owner_class rip_rte_owner_class; static inline struct rip_proto * -rip_rte_proto(struct rte *rte) +rip_rte_proto(const rte *rte) { return (rte->src->owner->class == &rip_rte_owner_class) ? SKIP_BACK(struct rip_proto, p.sources, rte->src->owner) : NULL; } -static int -rip_rte_better(struct rte *new, struct rte *old) +static u32 +rip_rte_igp_metric(const rte *rt) { - ASSERT_DIE(new->src == old->src); - struct rip_proto *p = rip_rte_proto(new); - - u32 new_metric = ea_get_int(new->attrs->eattrs, EA_RIP_METRIC, p->infinity); - u32 old_metric = ea_get_int(old->attrs->eattrs, EA_RIP_METRIC, p->infinity); - - return new_metric < old_metric; + return ea_get_int(rt->attrs, &ea_rip_metric, IGP_METRIC_UNKNOWN); } -static u32 -rip_rte_igp_metric(struct rte *rt) +static int +rip_rte_better(const rte *new, const rte *old) { - return ea_get_int(rt->attrs->eattrs, EA_RIP_METRIC, IGP_METRIC_UNKNOWN); + return rip_rte_igp_metric(new) < rip_rte_igp_metric(old); } static void @@ -1127,9 +1154,9 @@ rip_init(struct proto_config *CF) P->main_channel = proto_add_channel(P, proto_cf_main_channel(CF)); - P->if_notify = rip_if_notify; + P->iface_sub.if_notify = rip_if_notify; P->rt_notify = rip_rt_notify; - P->neigh_notify = rip_neigh_notify; + P->iface_sub.neigh_notify = rip_neigh_notify; P->reload_routes = rip_reload_routes; P->sources.class = &rip_rte_owner_class; @@ -1204,35 +1231,41 @@ rip_reconfigure(struct proto *P, struct proto_config *CF) } static void -rip_get_route_info(rte *rte, byte *buf) +rip_get_route_info(const rte *rte, byte *buf) { struct rip_proto *p = rip_rte_proto(rte); - u32 rt_metric = ea_get_int(rte->attrs->eattrs, EA_RIP_METRIC, p->infinity); - u32 rt_tag = ea_get_int(rte->attrs->eattrs, EA_RIP_TAG, 0); + u32 rt_metric = ea_get_int(rte->attrs, &ea_rip_metric, p->infinity); + u32 rt_tag = ea_get_int(rte->attrs, &ea_rip_tag, 0); - buf += bsprintf(buf, " (%d/%d)", rte->attrs->pref, rt_metric); + buf += bsprintf(buf, " (%d/%d)", rt_get_preference(rte), rt_metric); if (rt_tag) bsprintf(buf, " [%04x]", rt_tag); } -static int -rip_get_attr(const eattr *a, byte *buf, int buflen UNUSED) +static void +rip_tag_format(const eattr *a, byte *buf, uint buflen) { - switch (a->id) - { - case EA_RIP_METRIC: - bsprintf(buf, "metric: %d", a->u.data); - return GA_FULL; + bsnprintf(buf, buflen, "%04x", a->u.data); +} + +static struct ea_class ea_rip_metric = { + .name = "rip_metric", + .type = T_INT, +}; - case EA_RIP_TAG: - bsprintf(buf, "tag: %04x", a->u.data); - return GA_FULL; +static struct ea_class ea_rip_tag = { + .name = "rip_tag", + .type = T_INT, + .format = rip_tag_format, +}; - default: - return GA_UNKNOWN; - } -} +static struct ea_class ea_rip_from = { + .name = "rip_from", + .type = T_IFACE, + .readonly = 1, + .hidden = 1, +}; void rip_show_interfaces(struct proto *P, const char *iff) @@ -1342,7 +1375,6 @@ static struct rte_owner_class rip_rte_owner_class = { struct protocol proto_rip = { .name = "RIP", .template = "rip%d", - .class = PROTOCOL_RIP, .preference = DEF_PREF_RIP, .channel_mask = NB_IP, .proto_size = sizeof(struct rip_proto), @@ -1353,5 +1385,16 @@ struct protocol proto_rip = { .start = rip_start, .shutdown = rip_shutdown, .reconfigure = rip_reconfigure, - .get_attr = rip_get_attr }; + +void +rip_build(void) +{ + proto_build(&proto_rip); + + EA_REGISTER_ALL( + &ea_rip_metric, + &ea_rip_tag, + &ea_rip_from + ); +} diff --git a/proto/rip/rip.h b/proto/rip/rip.h index f8713c4a..a01f8d3b 100644 --- a/proto/rip/rip.h +++ b/proto/rip/rip.h @@ -16,7 +16,7 @@ #include "nest/cli.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/password.h" #include "nest/locks.h" #include "nest/bfd.h" @@ -195,10 +195,6 @@ struct rip_rte #define RIP_ENTRY_VALID 1 /* Valid outgoing route */ #define RIP_ENTRY_STALE 2 /* Stale outgoing route, waiting for GC */ -#define EA_RIP_METRIC EA_CODE(PROTOCOL_RIP, 0) -#define EA_RIP_TAG EA_CODE(PROTOCOL_RIP, 1) -#define EA_RIP_FROM EA_CODE(PROTOCOL_RIP, 2) - static inline int rip_is_v2(struct rip_proto *p) { return p->rip2; } diff --git a/proto/rpki/Makefile b/proto/rpki/Makefile index eb09b7df..0f60b2a0 100644 --- a/proto/rpki/Makefile +++ b/proto/rpki/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/rpki/rpki.c b/proto/rpki/rpki.c index afba2216..e5638aff 100644 --- a/proto/rpki/rpki.c +++ b/proto/rpki/rpki.c @@ -121,14 +121,11 @@ rpki_table_add_roa(struct rpki_cache *cache, struct channel *channel, const net_ { struct rpki_proto *p = cache->p; - rta a0 = { - .pref = channel->preference, - .source = RTS_RPKI, - .scope = SCOPE_UNIVERSE, - .dest = RTD_NONE, - }; + ea_list *ea = NULL; + ea_set_attr_u32(&ea, &ea_gen_preference, 0, channel->preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_RPKI); - rte e0 = { .attrs = &a0, .src = p->p.main_source, }; + rte e0 = { .attrs = ea, .src = p->p.main_source, }; rte_update(channel, &pfxr->n, &e0, p->p.main_source); } @@ -302,12 +299,13 @@ rpki_cache_change_state(struct rpki_cache *cache, const enum rpki_cache_state ne case RPKI_CS_NO_INCR_UPDATE_AVAIL: /* Server was unable to answer the last Serial Query and sent Cache Reset. */ - rpki_cache_change_state(cache, RPKI_CS_RESET); - break; - case RPKI_CS_ERROR_NO_DATA_AVAIL: /* No validation records are available on the cache server. */ - rpki_cache_change_state(cache, RPKI_CS_RESET); + + if (old_state == RPKI_CS_ESTABLISHED) + rpki_cache_change_state(cache, RPKI_CS_RESET); + else + rpki_schedule_next_retry(cache); break; case RPKI_CS_ERROR_FATAL: @@ -491,6 +489,11 @@ rpki_retry_hook(timer *tm) } break; + case RPKI_CS_NO_INCR_UPDATE_AVAIL: + case RPKI_CS_ERROR_NO_DATA_AVAIL: + rpki_cache_change_state(cache, RPKI_CS_RESET); + break; + default: rpki_cache_change_state(cache, RPKI_CS_CONNECTING); break; @@ -596,7 +599,7 @@ rpki_check_expire_interval(uint seconds) static struct rpki_cache * rpki_init_cache(struct rpki_proto *p, struct rpki_config *cf) { - pool *pool = rp_new(p->p.pool, p->p.loop, cf->hostname); + pool *pool = rp_new(p->p.pool, cf->hostname); struct rpki_cache *cache = mb_allocz(pool, sizeof(struct rpki_cache)); @@ -870,16 +873,27 @@ rpki_show_proto_info(struct proto *P) if (cache) { const char *transport_name = "---"; + uint default_port = 0; switch (cf->tr_config.type) { #if HAVE_LIBSSH - case RPKI_TR_SSH: transport_name = "SSHv2"; break; + case RPKI_TR_SSH: + transport_name = "SSHv2"; + default_port = RPKI_SSH_PORT; + break; #endif - case RPKI_TR_TCP: transport_name = "Unprotected over TCP"; break; + case RPKI_TR_TCP: + transport_name = "Unprotected over TCP"; + default_port = RPKI_TCP_PORT; + break; }; cli_msg(-1006, " Cache server: %s", cf->hostname); + + if (cf->port != default_port) + cli_msg(-1006, " Cache port: %u", cf->port); + cli_msg(-1006, " Status: %s", rpki_cache_state_to_str(cache->state)); cli_msg(-1006, " Transport: %s", transport_name); cli_msg(-1006, " Protocol version: %u", cache->version); @@ -977,7 +991,6 @@ rpki_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUS struct protocol proto_rpki = { .name = "RPKI", .template = "rpki%d", - .class = PROTOCOL_RPKI, .preference = DEF_PREF_RPKI, .proto_size = sizeof(struct rpki_proto), .config_size = sizeof(struct rpki_config), @@ -991,3 +1004,9 @@ struct protocol proto_rpki = { .reconfigure = rpki_reconfigure, .get_status = rpki_get_status, }; + +void +rpki_build(void) +{ + proto_build(&proto_rpki); +} diff --git a/proto/rpki/rpki.h b/proto/rpki/rpki.h index a70a2027..20253844 100644 --- a/proto/rpki/rpki.h +++ b/proto/rpki/rpki.h @@ -13,7 +13,7 @@ #define _BIRD_RPKI_H_ #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "lib/socket.h" #include "lib/ip.h" diff --git a/proto/rpki/ssh_transport.c b/proto/rpki/ssh_transport.c index 223afa80..425ad460 100644 --- a/proto/rpki/ssh_transport.c +++ b/proto/rpki/ssh_transport.c @@ -35,11 +35,9 @@ rpki_tr_ssh_open(struct rpki_tr_sock *tr) sk->ssh->subsystem = "rpki-rtr"; sk->ssh->state = SK_SSH_CONNECT; - if (sk_open(sk) != 0) + if (sk_open(sk, cache->p->p.loop) != 0) return RPKI_TR_ERROR; - sk_start(sk); - return RPKI_TR_SUCCESS; } diff --git a/proto/rpki/tcp_transport.c b/proto/rpki/tcp_transport.c index 4e850c44..ebb8030f 100644 --- a/proto/rpki/tcp_transport.c +++ b/proto/rpki/tcp_transport.c @@ -28,11 +28,9 @@ rpki_tr_tcp_open(struct rpki_tr_sock *tr) sk->type = SK_TCP_ACTIVE; - if (sk_open(sk) != 0) + if (sk_open(sk, tr->cache->p->p.loop) != 0) return RPKI_TR_ERROR; - sk_start(sk); - return RPKI_TR_SUCCESS; } diff --git a/proto/rpki/transport.c b/proto/rpki/transport.c index 26609764..81bd6dd8 100644 --- a/proto/rpki/transport.c +++ b/proto/rpki/transport.c @@ -85,8 +85,7 @@ rpki_tr_open(struct rpki_tr_sock *tr) sk->rbsize = RPKI_RX_BUFFER_SIZE; sk->tbsize = RPKI_TX_BUFFER_SIZE; sk->tos = IP_PREC_INTERNET_CONTROL; - sk->flags |= SKF_THREAD; - sk->loop = cache->p->p.loop; + sk->vrf = cache->p->p.vrf; if (ipa_zero(sk->daddr) && sk->host) { @@ -121,7 +120,6 @@ rpki_tr_close(struct rpki_tr_sock *tr) if (tr->sk) { - sk_stop(tr->sk); rfree(tr->sk); tr->sk = NULL; } diff --git a/proto/static/Makefile b/proto/static/Makefile index e38f9b74..de6e819b 100644 --- a/proto/static/Makefile +++ b/proto/static/Makefile @@ -3,4 +3,4 @@ obj := $(src-o-files) $(all-daemon) $(cf-local) -tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file +tests_objs := $(tests_objs) $(src-o-files) diff --git a/proto/static/config.Y b/proto/static/config.Y index 41e10dbf..9d26ee82 100644 --- a/proto/static/config.Y +++ b/proto/static/config.Y @@ -40,7 +40,7 @@ static_route_finish(void) if (net_type_match(this_srt->net, NB_DEST) == !this_srt->dest) cf_error("Unexpected or missing nexthop/type"); - this_srt->cmds = f_linearize(this_srt_cmds); + this_srt->cmds = f_linearize(this_srt_cmds, 0); } CF_DECLS diff --git a/proto/static/static.c b/proto/static/static.c index d89ca8b0..6bae827b 100644 --- a/proto/static/static.c +++ b/proto/static/static.c @@ -38,7 +38,7 @@ #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/cli.h" #include "conf/conf.h" #include "filter/filter.h" @@ -47,8 +47,6 @@ #include "static.h" -static linpool *static_lp; - static inline struct rte_src * static_get_source(struct static_proto *p, uint i) { return i ? rt_get_source(&p->p, i) : p->p.main_source; } @@ -59,68 +57,77 @@ static void static_announce_rte(struct static_proto *p, struct static_route *r) { struct rte_src *src; - rta *a = allocz(RTA_MAX_SIZE); - a->source = RTS_STATIC; - a->scope = SCOPE_UNIVERSE; - a->dest = r->dest; - a->pref = p->p.main_channel->preference; + ea_list *ea = NULL; + ea_set_attr_u32(&ea, &ea_gen_preference, 0, p->p.main_channel->preference); + ea_set_attr_u32(&ea, &ea_gen_source, 0, RTS_STATIC); if (r->dest == RTD_UNICAST) { - struct static_route *r2; - struct nexthop *nhs = NULL; + uint sz = 0; + for (struct static_route *r2 = r; r2; r2 = r2->mp_next) + if (r2->active) + sz += NEXTHOP_SIZE_CNT(r2->mls ? r2->mls->length / sizeof(u32) : 0); - for (r2 = r; r2; r2 = r2->mp_next) + if (!sz) + goto withdraw; + + struct nexthop_adata *nhad = allocz(sz + sizeof *nhad); + struct nexthop *nh = &nhad->nh; + + for (struct static_route *r2 = r; r2; r2 = r2->mp_next) { if (!r2->active) continue; - struct nexthop *nh = allocz(NEXTHOP_MAX_SIZE); - nh->gw = r2->via; - nh->iface = r2->neigh->iface; - nh->flags = r2->onlink ? RNF_ONLINK : 0; - nh->weight = r2->weight; + *nh = (struct nexthop) { + .gw = r2->via, + .iface = r2->neigh->iface, + .flags = r2->onlink ? RNF_ONLINK : 0, + .weight = r2->weight, + }; + if (r2->mls) { - nh->labels = r2->mls->len; - memcpy(nh->label, r2->mls->stack, r2->mls->len * sizeof(u32)); + nh->labels = r2->mls->length / sizeof(u32); + memcpy(nh->label, r2->mls->data, r2->mls->length); } - nexthop_insert(&nhs, nh); + nh = NEXTHOP_NEXT(nh); } - if (!nhs) - goto withdraw; - - nexthop_link(a, nhs); + ea_set_attr_data(&ea, &ea_gen_nexthop, 0, + nhad->ad.data, (void *) nh - (void *) nhad->ad.data); } - if (r->dest == RTDX_RECURSIVE) + else if (r->dest == RTDX_RECURSIVE) { rtable *tab = ipa_is_ip4(r->via) ? p->igp_table_ip4 : p->igp_table_ip6; - rta_set_recursive_next_hop(p->p.main_channel->table, a, tab, r->via, IPA_NONE, r->mls, static_lp); + u32 *labels = r->mls ? (void *) r->mls->data : NULL; + u32 lnum = r->mls ? r->mls->length / sizeof(u32) : 0; + + ea_set_hostentry(&ea, p->p.main_channel->table, tab, + r->via, IPA_NONE, lnum, labels); } + else if (r->dest) + ea_set_dest(&ea, 0, r->dest); + /* Already announced */ if (r->state == SRS_CLEAN) return; /* We skip rta_lookup() here */ src = static_get_source(p, r->index); - rte e0 = { .attrs = a, .src = src, .net = r->net, }, *e = &e0; + rte e0 = { .attrs = ea, .src = src, .net = r->net, }, *e = &e0; /* Evaluate the filter */ if (r->cmds) - f_eval_rte(r->cmds, e, static_lp); + f_eval_rte(r->cmds, e); rte_update(p->p.main_channel, r->net, e, src); static_free_source(src, r->index); r->state = SRS_CLEAN; - - if (r->cmds) - lp_flush(static_lp); - return; withdraw: @@ -206,7 +213,7 @@ static_update_bfd(struct static_proto *p, struct static_route *r) // ip_addr local = ipa_nonzero(r->local) ? r->local : nb->ifa->ip; r->bfd_req = bfd_request_session(p->p.pool, r->via, nb->ifa->ip, nb->iface, p->p.vrf, - static_bfd_notify, r, birdloop_event_list(p->p.loop), NULL); + static_bfd_notify, r, p->p.loop, NULL); } if (!bfd_up && r->bfd_req) @@ -322,31 +329,17 @@ static_same_dest(struct static_route *x, struct static_route *y) (x->weight != y->weight) || (x->use_bfd != y->use_bfd) || (!x->mls != !y->mls) || - ((x->mls) && (y->mls) && (x->mls->len != y->mls->len))) + ((x->mls) && (y->mls) && adata_same(x->mls, y->mls))) return 0; - - if (!x->mls) - continue; - - for (uint i = 0; i < x->mls->len; i++) - if (x->mls->stack[i] != y->mls->stack[i]) - return 0; } return !x && !y; case RTDX_RECURSIVE: if (!ipa_equal(x->via, y->via) || (!x->mls != !y->mls) || - ((x->mls) && (y->mls) && (x->mls->len != y->mls->len))) + ((x->mls) && (y->mls) && adata_same(x->mls, y->mls))) return 0; - if (!x->mls) - return 1; - - for (uint i = 0; i < x->mls->len; i++) - if (x->mls->stack[i] != y->mls->stack[i]) - return 0; - return 1; default: @@ -413,18 +406,18 @@ static_reload_routes(struct channel *C) } static int -static_rte_better(rte *new, rte *old) +static_rte_better(const rte *new, const rte *old) { - u32 n = ea_get_int(new->attrs->eattrs, EA_GEN_IGP_METRIC, IGP_METRIC_UNKNOWN); - u32 o = ea_get_int(old->attrs->eattrs, EA_GEN_IGP_METRIC, IGP_METRIC_UNKNOWN); + u32 n = ea_get_int(new->attrs, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN); + u32 o = ea_get_int(old->attrs, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN); return n < o; } static int -static_rte_mergable(rte *pri, rte *sec) +static_rte_mergable(const rte *pri, const rte *sec) { - u32 a = ea_get_int(pri->attrs->eattrs, EA_GEN_IGP_METRIC, IGP_METRIC_UNKNOWN); - u32 b = ea_get_int(sec->attrs->eattrs, EA_GEN_IGP_METRIC, IGP_METRIC_UNKNOWN); + u32 a = ea_get_int(pri->attrs, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN); + u32 b = ea_get_int(sec->attrs, &ea_gen_igp_metric, IGP_METRIC_UNKNOWN); return a == b; } @@ -443,11 +436,11 @@ static_postconfig(struct proto_config *CF) if (!cf->igp_table_ip4) cf->igp_table_ip4 = (cc->table->addr_type == NET_IP4) ? - cc->table : cf->c.global->def_tables[NET_IP4]; + cc->table : rt_get_default_table(cf->c.global, NET_IP4); if (!cf->igp_table_ip6) cf->igp_table_ip6 = (cc->table->addr_type == NET_IP6) ? - cc->table : cf->c.global->def_tables[NET_IP6]; + cc->table : rt_get_default_table(cf->c.global, NET_IP6); WALK_LIST(r, cf->routes) if (r->net && (r->net->type != CF->net_type)) @@ -467,7 +460,7 @@ static_init(struct proto_config *CF) P->main_channel = proto_add_channel(P, proto_cf_main_channel(CF)); - P->neigh_notify = static_neigh_notify; + P->iface_sub.neigh_notify = static_neigh_notify; P->reload_routes = static_reload_routes; P->sources.class = &static_rte_owner_class; @@ -487,16 +480,11 @@ static_start(struct proto *P) struct static_config *cf = (void *) P->cf; struct static_route *r; - if (!static_lp) - static_lp = lp_new(&root_pool, LP_GOOD_SIZE(1024)); - if (p->igp_table_ip4) - RT_LOCKED(p->igp_table_ip4, t) - rt_lock_table(t); + rt_lock_table(p->igp_table_ip4); if (p->igp_table_ip6) - RT_LOCKED(p->igp_table_ip6, t) - rt_lock_table(t); + rt_lock_table(p->igp_table_ip6); p->event = ev_new_init(p->p.pool, static_announce_marked, p); @@ -506,7 +494,12 @@ static_start(struct proto *P) proto_notify_state(P, PS_UP); WALK_LIST(r, cf->routes) + { + struct lp_state lps; + lp_save(tmp_linpool, &lps); static_add_rte(p, r); + lp_restore(tmp_linpool, &lps); + } return PS_UP; } @@ -523,12 +516,10 @@ static_shutdown(struct proto *P) static_reset_rte(p, r); if (p->igp_table_ip4) - RT_LOCKED(p->igp_table_ip4, t) - rt_unlock_table(t); + rt_unlock_table(p->igp_table_ip4); if (p->igp_table_ip6) - RT_LOCKED(p->igp_table_ip6, t) - rt_unlock_table(t); + rt_unlock_table(p->igp_table_ip6); return PS_DOWN; } @@ -718,13 +709,14 @@ static_copy_config(struct proto_config *dest, struct proto_config *src) } static void -static_get_route_info(rte *rte, byte *buf) +static_get_route_info(const rte *rte, byte *buf) { - eattr *a = ea_find(rte->attrs->eattrs, EA_GEN_IGP_METRIC); - if (a) - buf += bsprintf(buf, " (%d/%u)", rte->attrs->pref, a->u.data); + eattr *a = ea_find(rte->attrs, &ea_gen_igp_metric); + u32 pref = rt_get_preference(rte); + if (a && (a->u.data < IGP_METRIC_UNKNOWN)) + buf += bsprintf(buf, " (%d/%u)", pref, a->u.data); else - buf += bsprintf(buf, " (%d)", rte->attrs->pref); + buf += bsprintf(buf, " (%d)", pref); } static void @@ -783,7 +775,6 @@ static struct rte_owner_class static_rte_owner_class = { struct protocol proto_static = { .name = "Static", .template = "static%d", - .class = PROTOCOL_STATIC, .preference = DEF_PREF_STATIC, .channel_mask = NB_ANY, .proto_size = sizeof(struct static_proto), @@ -796,3 +787,9 @@ struct protocol proto_static = { .reconfigure = static_reconfigure, .copy_config = static_copy_config, }; + +void +static_build(void) +{ + proto_build(&proto_static); +} diff --git a/proto/static/static.h b/proto/static/static.h index fc91f71c..ea7ca33b 100644 --- a/proto/static/static.h +++ b/proto/static/static.h @@ -9,7 +9,7 @@ #ifndef _BIRD_STATIC_H_ #define _BIRD_STATIC_H_ -#include "nest/route.h" +#include "nest/rt.h" #include "nest/bfd.h" #include "lib/buffer.h" @@ -49,7 +49,7 @@ struct static_route { byte weight; /* Multipath next hop weight */ byte use_bfd; /* Configured to use BFD */ struct bfd_request *bfd_req; /* BFD request, if BFD is used */ - mpls_label_stack *mls; /* MPLS label stack; may be NULL */ + struct adata *mls; /* MPLS label stack; may be NULL */ }; /* diff --git a/sysdep/bsd-netlink/Makefile b/sysdep/bsd-netlink/Makefile new file mode 100644 index 00000000..188ac8de --- /dev/null +++ b/sysdep/bsd-netlink/Makefile @@ -0,0 +1,6 @@ +src := netlink.c +obj := $(src-o-files) +$(all-daemon) +$(conf-y-targets): $(s)netlink.Y + +tests_objs := $(tests_objs) $(src-o-files) diff --git a/sysdep/bsd-netlink/netlink-sys.h b/sysdep/bsd-netlink/netlink-sys.h new file mode 100644 index 00000000..a6e5052b --- /dev/null +++ b/sysdep/bsd-netlink/netlink-sys.h @@ -0,0 +1,29 @@ +/* + * Netlink FreeBSD-specific functions + * + * (c) 2022 Alexander Chernikov <melifaro@FreeBSD.org> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_NETLINK_SYS_H_ +#define _BIRD_NETLINK_SYS_H_ + +#include <netlink/netlink.h> +#include <netlink/netlink_route.h> + +#ifndef AF_MPLS +#define AF_MPLS 39 +#endif + +#ifndef SO_RCVBUFFORCE +#define SO_RCVBUFFORCE SO_RCVBUF +#endif + +static inline int +netlink_error_to_os(int error) +{ + return (error); +} + +#endif diff --git a/sysdep/bsd-netlink/netlink.Y b/sysdep/bsd-netlink/netlink.Y new file mode 120000 index 00000000..cde7f012 --- /dev/null +++ b/sysdep/bsd-netlink/netlink.Y @@ -0,0 +1 @@ +../linux/netlink.Y
\ No newline at end of file diff --git a/sysdep/bsd-netlink/netlink.c b/sysdep/bsd-netlink/netlink.c new file mode 120000 index 00000000..d9257568 --- /dev/null +++ b/sysdep/bsd-netlink/netlink.c @@ -0,0 +1 @@ +../linux/netlink.c
\ No newline at end of file diff --git a/sysdep/bsd/krt-sock.Y b/sysdep/bsd/krt-sock.Y index 8581bd43..a03d6df5 100644 --- a/sysdep/bsd/krt-sock.Y +++ b/sysdep/bsd/krt-sock.Y @@ -10,7 +10,7 @@ CF_HDR CF_DECLS -CF_KEYWORDS(KERNEL, TABLE) +CF_KEYWORDS(KERNEL, TABLE, METRIC) CF_GRAMMAR @@ -25,6 +25,14 @@ kern_sys_item: THIS_KRT->sys.table_id = $3; } + | METRIC expr { + if ($2 && !krt_max_metric) + cf_error("Kernel route metric not supported"); + if ($2 > krt_max_metric) + cf_error("Kernel table id must be in range 0-%u", krt_max_metric); + + THIS_KRT->sys.metric = $2; + } ; CF_CODE diff --git a/sysdep/bsd/krt-sock.c b/sysdep/bsd/krt-sock.c index fb407739..094268b7 100644 --- a/sysdep/bsd/krt-sock.c +++ b/sysdep/bsd/krt-sock.c @@ -25,7 +25,7 @@ #include "nest/bird.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "sysdep/unix/unix.h" @@ -47,6 +47,11 @@ const int rt_default_ecmp = 0; * table_id is specified explicitly as sysctl scan argument, while in FreeBSD it * is handled implicitly by changing default table using setfib() syscall. * + * OpenBSD allows to use route metric. The behavior is controlled by these macro + * KRT_USE_METRIC, which enables use of rtm_priority in route send/recevive. + * There is also KRT_DEFAULT_METRIC and KRT_MAX_METRIC for default and maximum + * metric values. + * * KRT_SHARED_SOCKET - use shared kernel socked instead of one for each krt_proto * KRT_USE_SETFIB_SCAN - use setfib() for sysctl() route scan * KRT_USE_SETFIB_SOCK - use SO_SETFIB socket option for kernel sockets @@ -63,6 +68,9 @@ const int rt_default_ecmp = 0; #ifdef __OpenBSD__ #define KRT_MAX_TABLES (RT_TABLEID_MAX+1) +#define KRT_USE_METRIC +#define KRT_MAX_METRIC 255 +#define KRT_DEFAULT_METRIC 56 #define KRT_SHARED_SOCKET #define KRT_USE_SYSCTL_7 #endif @@ -71,6 +79,14 @@ const int rt_default_ecmp = 0; #define KRT_MAX_TABLES 1 #endif +#ifndef KRT_MAX_METRIC +#define KRT_MAX_METRIC 0 +#endif + +#ifndef KRT_DEFAULT_METRIC +#define KRT_DEFAULT_METRIC 0 +#endif + /* Dynamic max number of tables */ @@ -143,20 +159,27 @@ static struct krt_proto *krt_table_map[KRT_MAX_TABLES][2]; #endif +/* Make it available to parser code */ +const uint krt_max_metric = KRT_MAX_METRIC; + + /* Route socket message processing */ int krt_capable(rte *e) { - rta *a = e->attrs; + ea_list *eattrs = e->attrs; + eattr *nhea = ea_find(eattrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); return - ((a->dest == RTD_UNICAST && !a->nh.next) /* No multipath support */ + ((dest == RTD_UNICAST && !NEXTHOP_ONE(nh)) /* No multipath support */ #ifdef RTF_REJECT - || a->dest == RTD_UNREACHABLE + || dest == RTD_UNREACHABLE #endif #ifdef RTF_BLACKHOLE - || a->dest == RTD_BLACKHOLE + || dest == RTD_BLACKHOLE #endif ); } @@ -186,10 +209,14 @@ struct ks_msg memcpy(p, body, (l > sizeof(*p) ? sizeof(*p) : l));\ body += l;} -static inline void +static inline void UNUSED sockaddr_fill_dl(struct sockaddr_dl *sa, struct iface *ifa) { uint len = OFFSETOF(struct sockaddr_dl, sdl_data); + + /* Workaround for FreeBSD 13.0 */ + len = MAX(len, sizeof(struct sockaddr)); + memset(sa, 0, len); sa->sdl_len = len; sa->sdl_family = AF_LINK; @@ -200,15 +227,19 @@ static int krt_send_route(struct krt_proto *p, int cmd, const rte *e) { const net_addr *net = e->net; - rta *a = e->attrs; + ea_list *eattrs = e->attrs; + eattr *nhea = ea_find(eattrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); + static int msg_seq; - struct iface *j, *i = a->nh.iface; + struct iface *j, *i = (dest == RTD_UNICAST) ? nh->nh.iface : NULL; int l; struct ks_msg msg; char *body = (char *)msg.buf; sockaddr gate, mask, dst; -// DBG("krt-sock: send %I/%d via %I\n", net->prefix, net->pxlen, a->gw); + DBG("krt-sock: send %N via %I\n", net, nh->nh.gw); bzero(&msg,sizeof (struct rt_msghdr)); msg.rtm.rtm_version = RTM_VERSION; @@ -218,7 +249,7 @@ krt_send_route(struct krt_proto *p, int cmd, const rte *e) msg.rtm.rtm_flags = RTF_UP | RTF_PROTO1; /* XXXX */ - if (net_pxlen(net) == net_max_prefix_length[net->type]) + if (net_pxlen(e->net) == net_max_prefix_length[net->type]) msg.rtm.rtm_flags |= RTF_HOST; else msg.rtm.rtm_addrs |= RTA_NETMASK; @@ -227,12 +258,16 @@ krt_send_route(struct krt_proto *p, int cmd, const rte *e) msg.rtm.rtm_tableid = KRT_CF->sys.table_id; #endif +#ifdef KRT_USE_METRIC + msg.rtm.rtm_priority = KRT_CF->sys.metric; +#endif + #ifdef RTF_REJECT - if(a->dest == RTD_UNREACHABLE) + if(dest == RTD_UNREACHABLE) msg.rtm.rtm_flags |= RTF_REJECT; #endif #ifdef RTF_BLACKHOLE - if(a->dest == RTD_BLACKHOLE) + if(dest == RTD_BLACKHOLE) msg.rtm.rtm_flags |= RTF_BLACKHOLE; #endif @@ -242,16 +277,17 @@ krt_send_route(struct krt_proto *p, int cmd, const rte *e) */ if (!i) { - IFACE_LOCK; - WALK_LIST(j, global_iface_list) + j = if_walk_first(); + while (j) { if (j->flags & IF_LOOPBACK) { i = j; break; } + j = if_walk_next(j); } - IFACE_UNLOCK; + if_walk_done(); if (!i) { @@ -277,12 +313,12 @@ krt_send_route(struct krt_proto *p, int cmd, const rte *e) sockaddr_fill(&dst, af, net_prefix(net), NULL, 0); sockaddr_fill(&mask, af, net_pxmask(net), NULL, 0); - switch (a->dest) + switch (dest) { case RTD_UNICAST: - if (ipa_nonzero(a->nh.gw)) + if (ipa_nonzero(nh->nh.gw)) { - ip_addr gw = a->nh.gw; + ip_addr gw = nh->nh.gw; /* Embed interface ID to link-local address */ if (ipa_is_link_local(gw)) @@ -294,6 +330,8 @@ krt_send_route(struct krt_proto *p, int cmd, const rte *e) break; } + /* Fall through */ + #ifdef RTF_REJECT case RTD_UNREACHABLE: #endif @@ -515,79 +553,87 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan) src = KRT_SRC_ALIEN; else src = KRT_SRC_KERNEL; + + union { + struct { + struct adata ad; + struct nexthop nh; + u32 labels[MPLS_MAX_LABEL_STACK]; + }; + struct nexthop_adata nhad; + } nhad = {}; + + ea_list *eattrs = NULL; - rta a = { - .source = RTS_INHERIT, - .scope = SCOPE_UNIVERSE, - }; + ea_set_attr_u32(&eattrs, &ea_gen_source, 0, RTS_INHERIT); /* reject/blackhole routes have also set RTF_GATEWAY, we wil check them first. */ #ifdef RTF_REJECT if(flags & RTF_REJECT) { - a.dest = RTD_UNREACHABLE; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_UNREACHABLE); goto done; } #endif #ifdef RTF_BLACKHOLE if(flags & RTF_BLACKHOLE) { - a.dest = RTD_BLACKHOLE; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_BLACKHOLE); goto done; } #endif - a.nh.iface = if_find_by_index(msg->rtm.rtm_index); - if (!a.nh.iface) + nhad.nh.iface = if_find_by_index(msg->rtm.rtm_index); + if (!nhad.nh.iface) { log(L_ERR "KRT: Received route %N with unknown ifindex %u", &ndst, msg->rtm.rtm_index); return; } - a.dest = RTD_UNICAST; if (flags & RTF_GATEWAY) { - a.nh.gw = igate; + nhad.nh.gw = igate; /* Clean up embedded interface ID returned in link-local address */ - if (ipa_is_link_local(a.nh.gw)) - _I0(a.nh.gw) = 0xfe800000; + if (ipa_is_link_local(nhad.nh.gw)) + _I0(nhad.nh.gw) = 0xfe800000; /* The BSD kernel does not support an onlink flag. We heuristically set the onlink flag, if the iface has only host addresses. */ - if (krt_assume_onlink(a.nh.iface, ipv6)) - a.nh.flags |= RNF_ONLINK; + if (krt_assume_onlink(nhad.nh.iface, ipv6)) + nhad.nh.flags |= RNF_ONLINK; neighbor *nbr; - nbr = neigh_find(&p->p, a.nh.gw, a.nh.iface, - (a.nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); + nbr = neigh_find(&p->p, nhad.nh.gw, nhad.nh.iface, + (nhad.nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); if (!nbr || (nbr->scope == SCOPE_HOST)) { /* Ignore routes with next-hop 127.0.0.1, host routes with such next-hop appear on OpenBSD for address aliases. */ - if (ipa_classify(a.nh.gw) == (IADDR_HOST | SCOPE_HOST)) + if (ipa_classify(nhad.nh.gw) == (IADDR_HOST | SCOPE_HOST)) return; log(L_ERR "KRT: Received route %N with strange next-hop %I", - &ndst, a.nh.gw); + &ndst, nhad.nh.gw); return; } } - done:; - rte e0 = { .attrs = &a, .net = &ndst, }; + nhad.ad.length = (void *) NEXTHOP_NEXT(&nhad.nh) - (void *) nhad.ad.data; - ea_list *ea = alloca(sizeof(ea_list) + 1 * sizeof(eattr)); - *ea = (ea_list) { .count = 1, .next = e0.attrs->eattrs }; - e0.attrs->eattrs = ea; + done: + ea_set_attr(&eattrs, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhad.ad)); + rte e0 = { .attrs = eattrs, .net = &ndst, }; - ea->attrs[0] = (eattr) { - .id = EA_KRT_SOURCE, - .type = EAF_TYPE_INT, - .u.data = src2, - }; + ea_set_attr(&e0.attrs, + EA_LITERAL_EMBEDDED(&ea_krt_source, 0, src2)); + +#ifdef KRT_USE_METRIC + ea_set_attr(&e0.attrs, + EA_LITERAL_EMBEDDED(&ea_krt_metric, 0, msg->rtm.rtm_priority)); +#endif if (scan) krt_got_route(p, &e0, src); @@ -828,6 +874,7 @@ krt_read_msg(struct proto *p, struct ks_msg *msg, int scan) { case RTM_GET: if(!scan) return; + /* Fall through */ case RTM_ADD: case RTM_DELETE: case RTM_CHANGE: @@ -1041,7 +1088,7 @@ krt_sock_open(pool *pool, void *data, int table_id UNUSED) sk->fd = fd; sk->data = data; - if (sk_open(sk) < 0) + if (sk_open(sk, &main_birdloop) < 0) bug("krt-sock: sk_open failed"); return sk; @@ -1148,7 +1195,7 @@ krt_sys_shutdown(struct krt_proto *p) int krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt_config *o) { - return n->sys.table_id == o->sys.table_id; + return (n->sys.table_id == o->sys.table_id) && (n->sys.metric == o->sys.metric); } void @@ -1161,11 +1208,13 @@ krt_sys_preconfig(struct config *c UNUSED) void krt_sys_init_config(struct krt_config *c) { c->sys.table_id = 0; /* Default table */ + c->sys.metric = KRT_DEFAULT_METRIC; } void krt_sys_copy_config(struct krt_config *d, struct krt_config *s) { d->sys.table_id = s->sys.table_id; + d->sys.metric = s->sys.metric; } diff --git a/sysdep/bsd/krt-sys.h b/sysdep/bsd/krt-sys.h index 57501884..198373c0 100644 --- a/sysdep/bsd/krt-sys.h +++ b/sysdep/bsd/krt-sys.h @@ -32,9 +32,11 @@ static inline void kif_sys_copy_config(struct kif_config *d UNUSED, struct kif_c /* Kernel routes */ extern uint krt_max_tables; +extern const uint krt_max_metric; struct krt_params { int table_id; /* Kernel table ID we sync with */ + u32 metric; /* Kernel metric used for all routes */ }; struct krt_state { diff --git a/sysdep/bsd/sysio.h b/sysdep/bsd/sysio.h index f1887fb4..b6b42b1e 100644 --- a/sysdep/bsd/sysio.h +++ b/sysdep/bsd/sysio.h @@ -15,8 +15,23 @@ #ifdef __FreeBSD__ /* Should be defined in sysdep/cf/bsd.h, but it is flavor-specific */ #define CONFIG_DONTROUTE_UNICAST + +#if __FreeBSD_version >= 1201000 +#define CONFIG_USE_IP_MREQN +#endif + #endif + +#ifdef __OpenBSD__ + +#if OpenBSD >= 202105 +#define CONFIG_USE_IP_MREQN +#endif + +#endif + + #ifdef __NetBSD__ #ifndef IP_RECVTTL @@ -29,6 +44,7 @@ #endif + #ifdef __DragonFly__ #define TCP_MD5SIG TCP_SIGNATURE_ENABLE #endif @@ -45,13 +61,21 @@ #define INIT_MREQ4(maddr,ifa) \ { .imr_multiaddr = ipa_to_in4(maddr), .imr_interface = ip4_to_in4(ifa->sysdep) } +#define INIT_MREQN4(maddr,ifa) \ + { .imr_multiaddr = ipa_to_in4(maddr), .imr_ifindex = ifa->index } + static inline int sk_setup_multicast4(sock *s) { - struct in_addr ifa = ip4_to_in4(s->iface->sysdep); u8 ttl = s->ttl; u8 n = 0; +#ifdef CONFIG_USE_IP_MREQN + struct ip_mreqn ifa = { .imr_ifindex = s->iface->index }; +#else + struct in_addr ifa = ip4_to_in4(s->iface->sysdep); +#endif + /* This defines where should we send _outgoing_ multicasts */ if (setsockopt(s->fd, IPPROTO_IP, IP_MULTICAST_IF, &ifa, sizeof(ifa)) < 0) ERR("IP_MULTICAST_IF"); @@ -68,7 +92,11 @@ sk_setup_multicast4(sock *s) static inline int sk_join_group4(sock *s, ip_addr maddr) { +#ifdef CONFIG_USE_IP_MREQN + struct ip_mreqn mr = INIT_MREQN4(maddr, s->iface); +#else struct ip_mreq mr = INIT_MREQ4(maddr, s->iface); +#endif if (setsockopt(s->fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mr, sizeof(mr)) < 0) ERR("IP_ADD_MEMBERSHIP"); @@ -79,7 +107,11 @@ sk_join_group4(sock *s, ip_addr maddr) static inline int sk_leave_group4(sock *s, ip_addr maddr) { +#ifdef CONFIG_USE_IP_MREQN + struct ip_mreqn mr = INIT_MREQN4(maddr, s->iface); +#else struct ip_mreq mr = INIT_MREQ4(maddr, s->iface); +#endif if (setsockopt(s->fd, IPPROTO_IP, IP_DROP_MEMBERSHIP, &mr, sizeof(mr)) < 0) ERR("IP_ADD_MEMBERSHIP"); diff --git a/sysdep/cf/README b/sysdep/cf/README index 9a7a4afa..af65aaec 100644 --- a/sysdep/cf/README +++ b/sysdep/cf/README @@ -4,7 +4,6 @@ Available configuration variables: CONFIG_AUTO_ROUTES Device routes are added automagically by the kernel CONFIG_SELF_CONSCIOUS We're able to recognize whether route was installed by us CONFIG_MULTIPLE_TABLES The kernel supports multiple routing tables -CONFIG_ALL_TABLES_AT_ONCE Kernel scanner wants to process all tables at once CONFIG_SINGLE_ROUTE There is only one route per network CONFIG_MC_PROPER_SRC Multicast packets have source address according to socket saddr field @@ -15,3 +14,6 @@ CONFIG_DONTROUTE_UNICAST Use MSG_DONTROUTE flag for unicast packets (def for Fre CONFIG_USE_HDRINCL Use IP_HDRINCL instead of control messages for source address on raw IP sockets. CONFIG_RESTRICTED_PRIVILEGES Implements restricted privileges using drop_uid() + +CONFIG_MADV_DONTNEED_TO_FREE To free pages, use MADV_DONTNEED instead of MADV_FREE (linux) +CONFIG_DISABLE_THP Disable transparent huge pages (linux) diff --git a/sysdep/cf/bsd-netlink.h b/sysdep/cf/bsd-netlink.h new file mode 100644 index 00000000..69297867 --- /dev/null +++ b/sysdep/cf/bsd-netlink.h @@ -0,0 +1,25 @@ +/* + * Configuration for FreeBSD based systems with netlink support + * + * (c) 2022 Alexander Chernikov <melifaro@FreeBSD.org> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#define CONFIG_AUTO_ROUTES +#define CONFIG_SELF_CONSCIOUS +#define CONFIG_MULTIPLE_TABLES +#define CONFIG_SINGLE_ROUTE + +#define CONFIG_SKIP_MC_BIND +#define CONFIG_NO_IFACE_BIND +#define CONFIG_USE_HDRINCL + +#define CONFIG_INCLUDE_SYSIO_H "sysdep/bsd/sysio.h" +#define CONFIG_INCLUDE_KRTSYS_H "sysdep/linux/krt-sys.h" +#define CONFIG_INCLUDE_NLSYS_H "sysdep/bsd-netlink/netlink-sys.h" + +/* +Link: sysdep/unix +Link: sysdep/bsd-netlink + */ diff --git a/sysdep/cf/linux.h b/sysdep/cf/linux.h index 047d3764..5edc4969 100644 --- a/sysdep/cf/linux.h +++ b/sysdep/cf/linux.h @@ -9,7 +9,6 @@ #define CONFIG_AUTO_ROUTES #define CONFIG_SELF_CONSCIOUS #define CONFIG_MULTIPLE_TABLES -#define CONFIG_ALL_TABLES_AT_ONCE #define CONFIG_IP6_SADR_KERNEL #define CONFIG_MC_PROPER_SRC @@ -17,10 +16,15 @@ #define CONFIG_INCLUDE_SYSIO_H "sysdep/linux/sysio.h" #define CONFIG_INCLUDE_KRTSYS_H "sysdep/linux/krt-sys.h" +#define CONFIG_INCLUDE_NLSYS_H "sysdep/linux/netlink-sys.h" + +#define CONFIG_LINUX_NETLINK #define CONFIG_RESTRICTED_PRIVILEGES #define CONFIG_INCLUDE_SYSPRIV_H "sysdep/linux/syspriv.h" +#define CONFIG_MADV_DONTNEED_TO_FREE +#define CONFIG_DISABLE_THP #ifndef AF_MPLS #define AF_MPLS 28 diff --git a/sysdep/config.h b/sysdep/config.h index 2425b666..861e52bc 100644 --- a/sysdep/config.h +++ b/sysdep/config.h @@ -13,7 +13,7 @@ #ifdef GIT_LABEL #define BIRD_VERSION XSTR1(GIT_LABEL) #else -#define BIRD_VERSION "3.0-alpha0" +#define BIRD_VERSION "2.0.12" #endif /* Include parameters determined by configure script */ diff --git a/sysdep/linux/krt-sys.h b/sysdep/linux/krt-sys.h index 8897f889..aa90f6e4 100644 --- a/sysdep/linux/krt-sys.h +++ b/sysdep/linux/krt-sys.h @@ -34,38 +34,6 @@ static inline struct ifa * kif_get_primary_ip(struct iface *i UNUSED) { return N #define KRT_ALLOW_MERGE_PATHS 1 -#define EA_KRT_PREFSRC EA_CODE(PROTOCOL_KERNEL, 0x10) -#define EA_KRT_REALM EA_CODE(PROTOCOL_KERNEL, 0x11) -#define EA_KRT_SCOPE EA_CODE(PROTOCOL_KERNEL, 0x12) - - -#define KRT_METRICS_MAX 0x10 /* RTAX_QUICKACK+1 */ -#define KRT_METRICS_OFFSET 0x20 /* Offset of EA_KRT_* vs RTAX_* */ - -#define KRT_FEATURES_MAX 4 - -/* - * Following attributes are parts of RTA_METRICS kernel route attribute, their - * ids must be consistent with their RTAX_* constants (+ KRT_METRICS_OFFSET) - */ -#define EA_KRT_METRICS EA_CODE(PROTOCOL_KERNEL, 0x20) /* Dummy one */ -#define EA_KRT_LOCK EA_CODE(PROTOCOL_KERNEL, 0x21) -#define EA_KRT_MTU EA_CODE(PROTOCOL_KERNEL, 0x22) -#define EA_KRT_WINDOW EA_CODE(PROTOCOL_KERNEL, 0x23) -#define EA_KRT_RTT EA_CODE(PROTOCOL_KERNEL, 0x24) -#define EA_KRT_RTTVAR EA_CODE(PROTOCOL_KERNEL, 0x25) -#define EA_KRT_SSTRESH EA_CODE(PROTOCOL_KERNEL, 0x26) -#define EA_KRT_CWND EA_CODE(PROTOCOL_KERNEL, 0x27) -#define EA_KRT_ADVMSS EA_CODE(PROTOCOL_KERNEL, 0x28) -#define EA_KRT_REORDERING EA_CODE(PROTOCOL_KERNEL, 0x29) -#define EA_KRT_HOPLIMIT EA_CODE(PROTOCOL_KERNEL, 0x2a) -#define EA_KRT_INITCWND EA_CODE(PROTOCOL_KERNEL, 0x2b) -#define EA_KRT_FEATURES EA_CODE(PROTOCOL_KERNEL, 0x2c) -#define EA_KRT_RTO_MIN EA_CODE(PROTOCOL_KERNEL, 0x2d) -#define EA_KRT_INITRWND EA_CODE(PROTOCOL_KERNEL, 0x2e) -#define EA_KRT_QUICKACK EA_CODE(PROTOCOL_KERNEL, 0x2f) - - struct krt_params { u32 table_id; /* Kernel table ID we sync with */ u32 metric; /* Kernel metric used for all routes */ diff --git a/sysdep/linux/netlink-sys.h b/sysdep/linux/netlink-sys.h new file mode 100644 index 00000000..4c99307b --- /dev/null +++ b/sysdep/linux/netlink-sys.h @@ -0,0 +1,63 @@ +/* + * BIRD -- Linux Netlink Interface + * + * (c) 1999--2000 Martin Mares <mj@ucw.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_NETLINK_SYS_H_ +#define _BIRD_NETLINK_SYS_H_ + +#include <asm/types.h> +#include <linux/if.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> + +#ifdef HAVE_MPLS_KERNEL +#include <linux/lwtunnel.h> +#endif + +#ifndef MSG_TRUNC /* Hack: Several versions of glibc miss this one :( */ +#define MSG_TRUNC 0x20 +#endif + +#ifndef IFA_FLAGS +#define IFA_FLAGS 8 +#endif + +#ifndef IFF_LOWER_UP +#define IFF_LOWER_UP 0x10000 +#endif + +#ifndef RTA_TABLE +#define RTA_TABLE 15 +#endif + +#ifndef RTA_VIA +#define RTA_VIA 18 +#endif + +#ifndef RTA_NEWDST +#define RTA_NEWDST 19 +#endif + +#ifndef RTA_ENCAP_TYPE +#define RTA_ENCAP_TYPE 21 +#endif + +#ifndef RTA_ENCAP +#define RTA_ENCAP 22 +#endif + +#ifndef NETLINK_GET_STRICT_CHK +#define NETLINK_GET_STRICT_CHK 12 +#endif + +static inline int +netlink_error_to_os(int error) +{ + return -error; +} + +#endif diff --git a/sysdep/linux/netlink.Y b/sysdep/linux/netlink.Y index 487ad1d8..9b07e9bb 100644 --- a/sysdep/linux/netlink.Y +++ b/sysdep/linux/netlink.Y @@ -11,13 +11,12 @@ CF_HDR CF_DECLS CF_KEYWORDS(KERNEL, TABLE, METRIC, NETLINK, RX, BUFFER, - KRT_PREFSRC, KRT_REALM, KRT_SCOPE, KRT_MTU, KRT_WINDOW, - KRT_RTT, KRT_RTTVAR, KRT_SSTRESH, KRT_CWND, KRT_ADVMSS, KRT_REORDERING, - KRT_HOPLIMIT, KRT_INITCWND, KRT_RTO_MIN, KRT_INITRWND, KRT_QUICKACK, KRT_LOCK_MTU, KRT_LOCK_WINDOW, KRT_LOCK_RTT, KRT_LOCK_RTTVAR, KRT_LOCK_SSTRESH, KRT_LOCK_CWND, KRT_LOCK_ADVMSS, KRT_LOCK_REORDERING, KRT_LOCK_HOPLIMIT, KRT_LOCK_RTO_MIN, KRT_FEATURE_ECN, KRT_FEATURE_ALLFRAG) +%type <fab> attr_bit + CF_GRAMMAR kern_proto: kern_proto kern_sys_item ';' ; @@ -28,40 +27,42 @@ kern_sys_item: | NETLINK RX BUFFER expr { THIS_KRT->sys.netlink_rx_buffer = $4; } ; -dynamic_attr: KRT_PREFSRC { $$ = f_new_dynamic_attr(EAF_TYPE_IP_ADDRESS, T_IP, EA_KRT_PREFSRC); } ; -dynamic_attr: KRT_REALM { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_REALM); } ; -dynamic_attr: KRT_SCOPE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SCOPE); } ; - -dynamic_attr: KRT_MTU { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_MTU); } ; -dynamic_attr: KRT_WINDOW { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_WINDOW); } ; -dynamic_attr: KRT_RTT { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTT); } ; -dynamic_attr: KRT_RTTVAR { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTTVAR); } ; -dynamic_attr: KRT_SSTRESH { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SSTRESH); } ; -dynamic_attr: KRT_CWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_CWND); } ; -dynamic_attr: KRT_ADVMSS { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_ADVMSS); } ; -dynamic_attr: KRT_REORDERING { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_REORDERING); } ; -dynamic_attr: KRT_HOPLIMIT { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_HOPLIMIT); } ; -dynamic_attr: KRT_INITCWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_INITCWND); } ; -dynamic_attr: KRT_RTO_MIN { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTO_MIN); } ; -dynamic_attr: KRT_INITRWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_INITRWND); } ; -dynamic_attr: KRT_QUICKACK { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_QUICKACK); } ; - /* Bits of EA_KRT_LOCK, based on RTAX_* constants */ -dynamic_attr: KRT_LOCK_MTU { $$ = f_new_dynamic_attr_bit(2, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_WINDOW { $$ = f_new_dynamic_attr_bit(3, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_RTT { $$ = f_new_dynamic_attr_bit(4, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_RTTVAR { $$ = f_new_dynamic_attr_bit(5, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_SSTRESH { $$ = f_new_dynamic_attr_bit(6, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_CWND { $$ = f_new_dynamic_attr_bit(7, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_ADVMSS { $$ = f_new_dynamic_attr_bit(8, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_REORDERING { $$ = f_new_dynamic_attr_bit(9, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_HOPLIMIT { $$ = f_new_dynamic_attr_bit(10, T_BOOL, EA_KRT_LOCK); } ; -dynamic_attr: KRT_LOCK_RTO_MIN { $$ = f_new_dynamic_attr_bit(13, T_BOOL, EA_KRT_LOCK); } ; +attr_bit: KRT_LOCK_MTU { $$ = f_new_dynamic_attr_bit(2, "krt_lock"); } ; +attr_bit: KRT_LOCK_WINDOW { $$ = f_new_dynamic_attr_bit(3, "krt_lock"); } ; +attr_bit: KRT_LOCK_RTT { $$ = f_new_dynamic_attr_bit(4, "krt_lock"); } ; +attr_bit: KRT_LOCK_RTTVAR { $$ = f_new_dynamic_attr_bit(5, "krt_lock"); } ; +attr_bit: KRT_LOCK_SSTRESH { $$ = f_new_dynamic_attr_bit(6, "krt_lock"); } ; +attr_bit: KRT_LOCK_CWND { $$ = f_new_dynamic_attr_bit(7, "krt_lock"); } ; +attr_bit: KRT_LOCK_ADVMSS { $$ = f_new_dynamic_attr_bit(8, "krt_lock"); } ; +attr_bit: KRT_LOCK_REORDERING { $$ = f_new_dynamic_attr_bit(9, "krt_lock"); } ; +attr_bit: KRT_LOCK_HOPLIMIT { $$ = f_new_dynamic_attr_bit(10, "krt_lock"); } ; +attr_bit: KRT_LOCK_RTO_MIN { $$ = f_new_dynamic_attr_bit(13, "krt_lock"); } ; -dynamic_attr: KRT_FEATURE_ECN { $$ = f_new_dynamic_attr_bit(0, T_BOOL, EA_KRT_FEATURES); } ; -dynamic_attr: KRT_FEATURE_ALLFRAG { $$ = f_new_dynamic_attr(3, T_BOOL, EA_KRT_FEATURES); } ; +/* Bits of EA_KRT_FEATURES */ +attr_bit: KRT_FEATURE_ECN { $$ = f_new_dynamic_attr_bit(0, "krt_features"); } ; +attr_bit: KRT_FEATURE_ALLFRAG { $$ = f_new_dynamic_attr_bit(3, "krt_features"); } ; +/* Getting attribute bits (moved here to not confuse Bison on *BSD) */ +term: + attr_bit { + struct f_inst *c = f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = (1U << $1.bit)}); + $$ = f_new_inst(FI_EQ, c, f_new_inst(FI_BITAND, f_new_inst(FI_EA_GET, $1.class), c)); + } + ; + +/* Setting attribute bits (moved here to not confuse Bison on *BSD) */ +cmd: + attr_bit '=' term ';' { + $$ = f_new_inst(FI_CONDITION, $3, + f_generate_complex_default(FI_BITOR, $1.class, + f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = (1U << $1.bit)}), 0), + f_generate_complex_default(FI_BITAND, $1.class, + f_new_inst(FI_CONSTANT, (struct f_val) { .type = T_INT, .val.i = ~(1U << $1.bit)}), 0) + ); + } + ; CF_CODE diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 7984acd0..e8a86ce4 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -6,7 +6,6 @@ * Can be freely distributed and used under the terms of the GNU GPL. */ -#include <alloca.h> #include <stdio.h> #include <unistd.h> #include <fcntl.h> @@ -17,7 +16,7 @@ #undef LOCAL_DEBUG #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "lib/alloca.h" @@ -26,100 +25,118 @@ #include "lib/socket.h" #include "lib/string.h" #include "lib/hash.h" +#include "lib/macro.h" #include "conf/conf.h" -#include <asm/types.h> -#include <linux/if.h> -#include <linux/netlink.h> -#include <linux/rtnetlink.h> +#include CONFIG_INCLUDE_NLSYS_H -#ifdef HAVE_MPLS_KERNEL -#include <linux/lwtunnel.h> -#endif +#define krt_ipv4(p) ((p)->af == AF_INET) -#ifndef MSG_TRUNC /* Hack: Several versions of glibc miss this one :( */ -#define MSG_TRUNC 0x20 -#endif +const int rt_default_ecmp = 16; -#ifndef IFA_FLAGS -#define IFA_FLAGS 8 -#endif +struct nl_parse_state +{ + struct krt_proto *proto; + struct linpool *pool; + int scan; -#ifndef IFF_LOWER_UP -#define IFF_LOWER_UP 0x10000 -#endif + u32 rta_flow; +}; -#ifndef RTA_TABLE -#define RTA_TABLE 15 -#endif +/* + * Netlink eattr definitions + */ -#ifndef RTA_VIA -#define RTA_VIA 18 -#endif +#define KRT_METRICS_MAX ARRAY_SIZE(ea_krt_metrics) +#define KRT_FEATURES_MAX 4 + +static void krt_bitfield_format(const eattr *e, byte *buf, uint buflen); + +static struct ea_class + ea_krt_prefsrc = { + .name = "krt_prefsrc", + .type = T_IP, + }, + ea_krt_realm = { + .name = "krt_realm", + .type = T_INT, + }, + ea_krt_scope = { + .name = "krt_scope", + .type = T_INT, + }; -#ifndef RTA_NEWDST -#define RTA_NEWDST 19 -#endif +static struct ea_class ea_krt_metrics[] = { + [RTAX_LOCK] = { + .name = "krt_lock", + .type = T_INT, + .format = krt_bitfield_format, + }, + [RTAX_FEATURES] = { + .name = "krt_features", + .type = T_INT, + .format = krt_bitfield_format, + }, +#define KRT_METRIC_INT(_rtax, _name) [_rtax] = { .name = _name, .type = T_INT } + KRT_METRIC_INT(RTAX_MTU, "krt_mtu"), + KRT_METRIC_INT(RTAX_WINDOW, "krt_window"), + KRT_METRIC_INT(RTAX_RTT, "krt_rtt"), + KRT_METRIC_INT(RTAX_RTTVAR, "krt_rttvar"), + KRT_METRIC_INT(RTAX_SSTHRESH, "krt_sstresh"), + KRT_METRIC_INT(RTAX_CWND, "krt_cwnd"), + KRT_METRIC_INT(RTAX_ADVMSS, "krt_advmss"), + KRT_METRIC_INT(RTAX_REORDERING, "krt_reordering"), + KRT_METRIC_INT(RTAX_HOPLIMIT, "krt_hoplimit"), + KRT_METRIC_INT(RTAX_INITCWND, "krt_initcwnd"), + KRT_METRIC_INT(RTAX_RTO_MIN, "krt_rto_min"), + KRT_METRIC_INT(RTAX_INITRWND, "krt_initrwnd"), + KRT_METRIC_INT(RTAX_QUICKACK, "krt_quickack"), +#undef KRT_METRIC_INT +}; -#ifndef RTA_ENCAP_TYPE -#define RTA_ENCAP_TYPE 21 -#endif +static const char *krt_metrics_names[KRT_METRICS_MAX] = { + NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss", + "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack" +}; -#ifndef RTA_ENCAP -#define RTA_ENCAP 22 -#endif +static const char *krt_features_names[KRT_FEATURES_MAX] = { + "ecn", NULL, NULL, "allfrag" +}; -#ifndef NETLINK_GET_STRICT_CHK -#define NETLINK_GET_STRICT_CHK 12 -#endif +static void +krt_bitfield_format(const eattr *a, byte *buf, uint buflen) +{ + if (a->id == ea_krt_metrics[RTAX_LOCK].id) + ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX); + else if (a->id == ea_krt_metrics[RTAX_FEATURES].id) + ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX); +} -#define krt_ipv4(p) ((p)->af == AF_INET) -#define krt_ecmp6(p) ((p)->af == AF_INET6) +static void +nl_ea_register(void) +{ + EA_REGISTER_ALL( + &ea_krt_prefsrc, + &ea_krt_realm, + &ea_krt_scope + ); -const int rt_default_ecmp = 16; + for (uint i = 0; i < KRT_METRICS_MAX; i++) + { + if (!ea_krt_metrics[i].name) + ea_krt_metrics[i] = (struct ea_class) { + .name = mb_sprintf(&root_pool, "krt_metric_%d", i), + .type = T_INT, + }; -/* - * Structure nl_parse_state keeps state of received route processing. Ideally, - * we could just independently parse received Netlink messages and immediately - * propagate received routes to the rest of BIRD, but older Linux kernel (before - * version 4.11) represents and announces IPv6 ECMP routes not as one route with - * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of - * routes with the same prefix. More recent kernels work as with IPv4. - * - * Therefore, BIRD keeps currently processed route in nl_parse_state structure - * and postpones its propagation until we expect it to be final; i.e., when - * non-matching route is received or when the scan ends. When another matching - * route is received, it is merged with the already processed route to form an - * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the - * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6 - * routes with RTA_MULTIPATH set are just considered non-matching. - * - * This is ignored for asynchronous notifications (every notification is handled - * as a separate route). It is not an issue for our routes, as we ignore such - * notifications anyways. But importing alien IPv6 ECMP routes does not work - * properly with older kernels. - * - * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes - * for the same prefix. - */ + ea_register_init(&ea_krt_metrics[i]); + } -struct nl_parse_state -{ - struct linpool *pool; - int scan; - int merge; + for (uint i = 1; i < KRT_METRICS_MAX; i++) + ASSERT_DIE(ea_krt_metrics[i].id == ea_krt_metrics[0].id + i); +} - net_addr *net; - rta *attrs; - struct krt_proto *proto; - s8 new; - s8 krt_src; - u8 krt_type; - u8 krt_proto; - u32 krt_metric; - u32 rta_flow; /* Used during parsing */ -}; /* * Synchronous Netlink interface @@ -161,16 +178,13 @@ nl_open_sock(struct nl_sock *nl) } } -static void -nl_set_strict_dump(struct nl_sock *nl, int strict) +static int +nl_set_strict_dump(struct nl_sock *nl UNUSED, int strict UNUSED) { - /* - * Strict checking is not necessary, it improves behavior on newer kernels. - * If it is not available (missing SOL_NETLINK compile-time, or ENOPROTOOPT - * run-time), we can just ignore it. - */ #ifdef SOL_NETLINK - setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); + return setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +#else + return -1; #endif } @@ -198,10 +212,17 @@ nl_cfg_rx_buffer_size(struct config *cfg) static void nl_open(void) { + if ((nl_scan.fd >= 0) && (nl_req.fd >= 0)) + return; + nl_open_sock(&nl_scan); nl_open_sock(&nl_req); - nl_set_strict_dump(&nl_scan, 1); + if (nl_set_strict_dump(&nl_scan, 1) < 0) + { + log(L_WARN "KRT: Netlink strict checking failed, will scan all tables at once"); + krt_use_shared_scan(); + } } static void @@ -256,11 +277,13 @@ nl_request_dump_addr(int af) } static void -nl_request_dump_route(int af) +nl_request_dump_route(int af, int table_id) { struct { struct nlmsghdr nh; struct rtmsg rtm; + struct rtattr rta; + u32 table_id; } req = { .nh.nlmsg_type = RTM_GETROUTE, .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), @@ -269,7 +292,17 @@ nl_request_dump_route(int af) .rtm.rtm_family = af, }; - send(nl_scan.fd, &req, sizeof(req), 0); + if (table_id < 256) + req.rtm.rtm_table = table_id; + else + { + req.rta.rta_type = RTA_TABLE; + req.rta.rta_len = RTA_LENGTH(4); + req.table_id = table_id; + req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + req.rta.rta_len; + } + + send(nl_scan.fd, &req, req.nh.nlmsg_len, 0); nl_scan.last_hdr = NULL; } @@ -334,7 +367,7 @@ nl_error(struct nlmsghdr *h, int ignore_esrch) return ENOBUFS; } e = (struct nlmsgerr *) NLMSG_DATA(h); - ec = -e->error; + ec = netlink_error_to_os(e->error); if (ec && !(ignore_esrch && (ec == ESRCH))) log_rl(&rl_netlink_err, L_WARN "Netlink: %s", strerror(ec)); return ec; @@ -731,12 +764,12 @@ nl_add_nexthop(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af UNUS } static void -nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, ea_list *eattrs) +nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop_adata *nhad, int af, ea_list *eattrs) { struct rtattr *a = nl_open_attr(h, bufsize, RTA_MULTIPATH); - eattr *flow = ea_find(eattrs, EA_KRT_REALM); + eattr *flow = ea_find(eattrs, &ea_krt_realm); - for (; nh; nh = nh->next) + NEXTHOP_WALK(nh, nhad) { struct rtnexthop *rtnh = nl_open_nexthop(h, bufsize); @@ -760,31 +793,44 @@ nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, e nl_close_attr(h, a); } -static struct nexthop * +static struct nexthop_adata * nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr *n, struct rtattr *ra, int af, int krt_src) { struct rtattr *a[BIRD_RTA_MAX]; - struct rtnexthop *nh = RTA_DATA(ra); - struct nexthop *rv, *first, **last; - unsigned len = RTA_PAYLOAD(ra); + struct rtnexthop *nh, *orig_nh = RTA_DATA(ra); + unsigned len, orig_len = RTA_PAYLOAD(ra); + uint cnt = 0; + + /* First count the nexthops */ + for (len = orig_len, nh = orig_nh; len; len -= NLMSG_ALIGN(nh->rtnh_len), nh = RTNH_NEXT(nh)) + { + /* Use RTNH_OK(nh,len) ?? */ + if ((len < sizeof(*nh)) || (len < nh->rtnh_len)) + goto err; + + if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) + ; + else + cnt++; + } - first = NULL; - last = &first; + struct nexthop_adata *nhad = lp_allocz(s->pool, cnt * NEXTHOP_MAX_SIZE + sizeof *nhad); + struct nexthop *rv = &nhad->nh; - while (len) + for (len = orig_len, nh = orig_nh; len; len -= NLMSG_ALIGN(nh->rtnh_len), nh = RTNH_NEXT(nh)) { /* Use RTNH_OK(nh,len) ?? */ if ((len < sizeof(*nh)) || (len < nh->rtnh_len)) goto err; if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) - goto next; + continue; - *last = rv = lp_allocz(s->pool, NEXTHOP_MAX_SIZE); - last = &(rv->next); + *rv = (struct nexthop) { + .weight = nh->rtnh_hops, + .iface = if_find_by_index(nh->rtnh_ifindex), + }; - rv->weight = nh->rtnh_hops; - rv->iface = if_find_by_index(nh->rtnh_ifindex); if (!rv->iface) { log(L_ERR "KRT: Received route %N with unknown ifindex %u", n, nh->rtnh_ifindex); @@ -831,11 +877,11 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr rv->gw = rta_get_via(a[RTA_VIA]); #endif + if (nh->rtnh_flags & RTNH_F_ONLINK) + rv->flags |= RNF_ONLINK; + if (ipa_nonzero(rv->gw)) { - if (nh->rtnh_flags & RTNH_F_ONLINK) - rv->flags |= RNF_ONLINK; - neighbor *nbr; nbr = neigh_find(&p->p, rv->gw, rv->iface, (rv->flags & RNF_ONLINK) ? NEF_ONLINK : 0); @@ -863,16 +909,14 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr } #endif - next: - len -= NLMSG_ALIGN(nh->rtnh_len); - nh = RTNH_NEXT(nh); + rv = NEXTHOP_NEXT(rv); } - /* Ensure nexthops are sorted to satisfy nest invariant */ - if (!nexthop_is_sorted(first)) - first = nexthop_sort(first); + /* Store final length */ + nhad->ad.length = (void *) rv - (void *) nhad->ad.data; - return first; + /* Ensure nexthops are sorted to satisfy nest invariant */ + return nexthop_is_sorted(nhad) ? nhad : nexthop_sort(nhad, s->pool); err: log(L_ERR "KRT: Received strange multipath route %N", n); @@ -1239,9 +1283,7 @@ kif_do_scan(struct kif_proto *p UNUSED) log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type); /* Re-resolve master interface for slaves */ - IFACE_LEGACY_ACCESS; - struct iface *i; - WALK_LIST(i, global_iface_list) + IFACE_WALK(i) if (i->master_index) { struct iface f = { @@ -1249,13 +1291,13 @@ kif_do_scan(struct kif_proto *p UNUSED) .mtu = i->mtu, .index = i->index, .master_index = i->master_index, - .master = if_find_by_index(i->master_index) + .master = if_find_by_index_locked(i->master_index) }; if (f.master != i->master) { memcpy(f.name, i->name, sizeof(f.name)); - if_update(&f); + if_update_locked(&f); } } @@ -1301,11 +1343,16 @@ HASH_DEFINE_REHASH_FN(RTH, struct krt_proto) int krt_capable(rte *e) { - rta *a = e->attrs; + eattr *ea = ea_find(e->attrs, &ea_gen_nexthop); + if (!ea) + return 0; + + struct nexthop_adata *nhad = (void *) ea->u.ptr; + if (NEXTHOP_IS_REACHABLE(nhad)) + return 1; - switch (a->dest) + switch (nhad->dest) { - case RTD_UNICAST: case RTD_BLACKHOLE: case RTD_UNREACHABLE: case RTD_PROHIBIT: @@ -1317,21 +1364,24 @@ krt_capable(rte *e) } static inline int -nh_bufsize(struct nexthop *nh) +nh_bufsize(struct nexthop_adata *nhad) { int rv = 0; - for (; nh != NULL; nh = nh->next) + NEXTHOP_WALK(nh, nhad) rv += RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr))); return rv; } static int -nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nexthop *nh) +nl_send_route(struct krt_proto *p, const rte *e, int op) { eattr *ea; - rta *a = e->attrs; - ea_list *eattrs = a->eattrs; - int bufsize = 128 + KRT_METRICS_MAX*8 + nh_bufsize(&(a->nh)); + ea_list *eattrs = e->attrs; + eattr *nhea = ea_find(eattrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); + + int bufsize = 128 + KRT_METRICS_MAX*8 + (nh ? nh_bufsize(nh) : 0); u32 priority = 0; struct { @@ -1399,7 +1449,7 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho priority = 0; else if (KRT_CF->sys.metric) priority = KRT_CF->sys.metric; - else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC))) + else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, &ea_krt_metric))) priority = ea->u.data; if (priority) @@ -1407,20 +1457,22 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho /* For route delete, we do not specify remaining route attributes */ if (op == NL_OP_DELETE) - goto dest; + goto done; /* Default scope is LINK for device routes, UNIVERSE otherwise */ if (p->af == AF_MPLS) r->r.rtm_scope = RT_SCOPE_UNIVERSE; - else if (ea = ea_find(eattrs, EA_KRT_SCOPE)) + else if (ea = ea_find(eattrs, &ea_krt_scope)) r->r.rtm_scope = ea->u.data; + else if (dest == RTD_UNICAST && ipa_zero(nh->nh.gw)) + r->r.rtm_scope = RT_SCOPE_LINK; else - r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; + r->r.rtm_scope = RT_SCOPE_UNIVERSE; - if (ea = ea_find(eattrs, EA_KRT_PREFSRC)) + if (ea = ea_find(eattrs, &ea_krt_prefsrc)) nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data); - if (ea = ea_find(eattrs, EA_KRT_REALM)) + if (ea = ea_find(eattrs, &ea_krt_realm)) nl_add_attr_u32(&r->h, rsize, RTA_FLOW, ea->u.data); @@ -1428,9 +1480,9 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho metrics[0] = 0; struct ea_walk_state ews = { .eattrs = eattrs }; - while (ea = ea_walk(&ews, EA_KRT_METRICS, KRT_METRICS_MAX)) + while (ea = ea_walk(&ews, ea_krt_metrics[0].id, KRT_METRICS_MAX)) { - int id = ea->id - EA_KRT_METRICS; + int id = ea->id - ea_krt_metrics[0].id; metrics[0] |= 1 << id; metrics[id] = ea->u.data; } @@ -1438,20 +1490,18 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho if (metrics[0]) nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX); - -dest: switch (dest) { case RTD_UNICAST: r->r.rtm_type = RTN_UNICAST; - if (nh->next && !krt_ecmp6(p)) + if (!NEXTHOP_ONE(nh)) nl_add_multipath(&r->h, rsize, nh, p->af, eattrs); else { - nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->iface->index); - nl_add_nexthop(&r->h, rsize, nh, p->af); + nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->nh.iface->index); + nl_add_nexthop(&r->h, rsize, &nh->nh, p->af); - if (nh->flags & RNF_ONLINK) + if (nh->nh.flags & RNF_ONLINK) r->r.rtm_flags |= RTNH_F_ONLINK; } break; @@ -1470,82 +1520,56 @@ dest: bug("krt_capable inconsistent with nl_send_route"); } +done: /* Ignore missing for DELETE */ return nl_exchange(&r->h, (op == NL_OP_DELETE)); } static inline int -nl_add_rte(struct krt_proto *p, rte *e) -{ - rta *a = e->attrs; - int err = 0; - - if (krt_ecmp6(p) && a->nh.next) - { - struct nexthop *nh = &(a->nh); - - err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, nh); - if (err < 0) - return err; - - for (nh = nh->next; nh; nh = nh->next) - err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, nh); - - return err; - } - - return nl_send_route(p, e, NL_OP_ADD, a->dest, &(a->nh)); -} - -static inline int -nl_delete_rte(struct krt_proto *p, const rte *e) +nl_allow_replace(struct krt_proto *p, rte *new) { - int err = 0; + /* + * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for + * matching rtm_protocol, but that is OK when dedicated priority is used. + * + * For IPv6, the NL_OP_REPLACE is still broken even in Linux 4.19 LTS + * (although it seems to be fixed in Linux 5.10 LTS) for sequence: + * + * ip route add 2001:db8::/32 via fe80::1 dev eth0 + * ip route replace 2001:db8::/32 dev eth0 + * + * (it ends with two routes instead of replacing the first by the second one) + * + * Replacing with direct and special type (e.g. unreachable) routes does not + * work, but replacing with regular routes work reliably + */ - /* For IPv6, we just repeatedly request DELETE until we get error */ - do - err = nl_send_route(p, e, NL_OP_DELETE, RTD_NONE, NULL); - while (krt_ecmp6(p) && !err); + if (krt_ipv4(p)) + return 1; - return err; -} + eattr *nhea = ea_find(new->attrs, &ea_gen_nexthop); + struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL; + int dest = nhea_dest(nhea); -static inline int -nl_replace_rte(struct krt_proto *p, rte *e) -{ - rta *a = e->attrs; - return nl_send_route(p, e, NL_OP_REPLACE, a->dest, &(a->nh)); + return (dest == RTD_UNICAST) && ipa_nonzero(nh->nh.gw); } - void krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const rte *old) { int err = 0; - /* - * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for - * matching rtm_protocol, but that is OK when dedicated priority is used. - * - * We do not use NL_OP_REPLACE for IPv6, as it has broken semantics for ECMP - * and with some kernel versions ECMP replace crashes kernel. Would need more - * testing and checks for kernel versions. - * - * For IPv6, we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the - * old route value, so we do not try to optimize IPv6 ECMP reconfigurations. - */ - - if (krt_ipv4(p) && old && new) + if (old && new && nl_allow_replace(p, new)) { - err = nl_replace_rte(p, new); + err = nl_send_route(p, new, NL_OP_REPLACE); } else { if (old) - nl_delete_rte(p, old); + nl_send_route(p, old, NL_OP_DELETE); if (new) - err = nl_add_rte(p, new); + err = nl_send_route(p, new, NL_OP_ADD); } if (new) @@ -1557,73 +1581,6 @@ krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const r } } -static int -nl_mergable_route(struct nl_parse_state *s, const net_addr *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family) -{ - /* Route merging is used for IPv6 scans */ - if (!s->scan || (rtm_family != AF_INET6)) - return 0; - - /* Saved and new route must have same network, proto/table, and priority */ - if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority)) - return 0; - - /* Both must be regular unicast routes */ - if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST)) - return 0; - - return 1; -} - -static void -nl_announce_route(struct nl_parse_state *s) -{ - rte e0 = { - .attrs = s->attrs, - .net = s->net, - }; - - ea_list *ea = alloca(sizeof(ea_list) + 2 * sizeof(eattr)); - *ea = (ea_list) { .count = 2, .next = e0.attrs->eattrs }; - e0.attrs->eattrs = ea; - - ea->attrs[0] = (eattr) { - .id = EA_KRT_SOURCE, - .type = EAF_TYPE_INT, - .u.data = s->krt_proto, - }; - ea->attrs[1] = (eattr) { - .id = EA_KRT_METRIC, - .type = EAF_TYPE_INT, - .u.data = s->krt_metric, - }; - - if (s->scan) - krt_got_route(s->proto, &e0, s->krt_src); - else - krt_got_route_async(s->proto, &e0, s->new, s->krt_src); - - s->net = NULL; - s->attrs = NULL; - s->proto = NULL; - lp_flush(s->pool); -} - -static inline void -nl_parse_begin(struct nl_parse_state *s, int scan) -{ - memset(s, 0, sizeof (struct nl_parse_state)); - s->pool = nl_linpool; - s->scan = scan; -} - -static inline void -nl_parse_end(struct nl_parse_state *s) -{ - if (s->net) - nl_announce_route(s); -} - #define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0) #define SKIP(ARG, ...) do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0) @@ -1759,80 +1716,112 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) net6_prefix(&src), net6_pxlen(&src)); } - if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type, i->rtm_family)) - nl_announce_route(s); - - rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE); - ra->source = RTS_INHERIT; - ra->scope = SCOPE_UNIVERSE; + ea_list *ra = NULL; + ea_set_attr_u32(&ra, &ea_gen_source, 0, RTS_INHERIT); + ea_set_attr_u32(&ra, &ea_krt_source, 0, i->rtm_protocol); + ea_set_attr_u32(&ra, &ea_krt_metric, 0, priority); if (a[RTA_FLOW]) s->rta_flow = rta_get_u32(a[RTA_FLOW]); else s->rta_flow = 0; + union { + struct { + struct adata ad; + struct nexthop nh; + u32 labels[MPLS_MAX_LABEL_STACK]; + }; + struct nexthop_adata nhad; + } nhad = {}; + switch (i->rtm_type) { case RTN_UNICAST: - ra->dest = RTD_UNICAST; - if (a[RTA_MULTIPATH]) { - struct nexthop *nh = nl_parse_multipath(s, p, net, a[RTA_MULTIPATH], i->rtm_family, krt_src); + struct nexthop_adata *nh = nl_parse_multipath(s, p, net, a[RTA_MULTIPATH], i->rtm_family, krt_src); if (!nh) SKIP("strange RTA_MULTIPATH\n"); - nexthop_link(ra, nh); + ea_set_attr(&ra, EA_LITERAL_DIRECT_ADATA( + &ea_gen_nexthop, 0, &nh->ad)); break; } if ((i->rtm_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) SKIP("ignore RTNH_F_DEAD\n"); - ra->nh.iface = if_find_by_index(oif); - if (!ra->nh.iface) + nhad.nh.iface = if_find_by_index(oif); + if (!nhad.nh.iface) { log(L_ERR "KRT: Received route %N with unknown ifindex %u", net, oif); return; } if (a[RTA_GATEWAY]) - ra->nh.gw = rta_get_ipa(a[RTA_GATEWAY]); + nhad.nh.gw = rta_get_ipa(a[RTA_GATEWAY]); #ifdef HAVE_MPLS_KERNEL if (a[RTA_VIA]) - ra->nh.gw = rta_get_via(a[RTA_VIA]); + nhad.nh.gw = rta_get_via(a[RTA_VIA]); #endif - if (ipa_nonzero(ra->nh.gw)) + if (i->rtm_flags & RTNH_F_ONLINK) + nhad.nh.flags |= RNF_ONLINK; + + if (ipa_nonzero(nhad.nh.gw)) { /* Silently skip strange 6to4 routes */ const net_addr_ip6 sit = NET_ADDR_IP6(IP6_NONE, 96); - if ((i->rtm_family == AF_INET6) && ipa_in_netX(ra->nh.gw, (net_addr *) &sit)) + if ((i->rtm_family == AF_INET6) && ipa_in_netX(nhad.nh.gw, (net_addr *) &sit)) return; - if (i->rtm_flags & RTNH_F_ONLINK) - ra->nh.flags |= RNF_ONLINK; - neighbor *nbr; - nbr = neigh_find(&p->p, ra->nh.gw, ra->nh.iface, - (ra->nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); + nbr = neigh_find(&p->p, nhad.nh.gw, nhad.nh.iface, + (nhad.nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); if (!nbr || (nbr->scope == SCOPE_HOST)) { - log(L_ERR "KRT: Received route %N with strange next-hop %I", net, ra->nh.gw); + log(L_ERR "KRT: Received route %N with strange next-hop %I", net, + nhad.nh.gw); return; } } +#ifdef HAVE_MPLS_KERNEL + if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !a[RTA_MULTIPATH]) + nhad.nh.labels = rta_get_mpls(a[RTA_NEWDST], nhad.nh.label); + + if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !a[RTA_MULTIPATH]) + { + switch (rta_get_u16(a[RTA_ENCAP_TYPE])) + { + case LWTUNNEL_ENCAP_MPLS: + { + struct rtattr *enca[BIRD_RTA_MAX]; + nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]); + nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca)); + nhad.nh.labels = rta_get_mpls(enca[RTA_DST], nhad.nh.label); + break; + } + default: + SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE])); + break; + } + } +#endif + + /* Finalize the nexthop */ + nhad.ad.length = (void *) NEXTHOP_NEXT(&nhad.nh) - (void *) nhad.ad.data; break; case RTN_BLACKHOLE: - ra->dest = RTD_BLACKHOLE; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_BLACKHOLE); break; case RTN_UNREACHABLE: - ra->dest = RTD_UNREACHABLE; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_UNREACHABLE); break; case RTN_PROHIBIT: - ra->dest = RTD_PROHIBIT; + nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_PROHIBIT); break; /* FIXME: What about RTN_THROW? */ default: @@ -1840,163 +1829,77 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) return; } -#ifdef HAVE_MPLS_KERNEL - if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !ra->nh.next) - ra->nh.labels = rta_get_mpls(a[RTA_NEWDST], ra->nh.label); - - if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !ra->nh.next) - { - switch (rta_get_u16(a[RTA_ENCAP_TYPE])) - { - case LWTUNNEL_ENCAP_MPLS: - { - struct rtattr *enca[BIRD_RTA_MAX]; - nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]); - nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca)); - ra->nh.labels = rta_get_mpls(enca[RTA_DST], ra->nh.label); - break; - } - default: - SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE])); - break; - } - } -#endif + if (nhad.ad.length) + ea_set_attr(&ra, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, &nhad.ad)); if (i->rtm_scope != def_scope) - { - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = ra->eattrs; - ra->eattrs = ea; - ea->flags = EALF_SORTED; - ea->count = 1; - ea->attrs[0].id = EA_KRT_SCOPE; - ea->attrs[0].flags = 0; - ea->attrs[0].type = EAF_TYPE_INT; - ea->attrs[0].u.data = i->rtm_scope; - } + ea_set_attr(&ra, + EA_LITERAL_EMBEDDED(&ea_krt_scope, 0, i->rtm_scope)); if (a[RTA_PREFSRC]) - { - ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]); - - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = ra->eattrs; - ra->eattrs = ea; - ea->flags = EALF_SORTED; - ea->count = 1; - ea->attrs[0].id = EA_KRT_PREFSRC; - ea->attrs[0].flags = 0; - ea->attrs[0].type = EAF_TYPE_IP_ADDRESS; - - struct adata *ad = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps)); - ad->length = sizeof(ps); - memcpy(ad->data, &ps, sizeof(ps)); - - ea->attrs[0].u.ptr = ad; - } + { + ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]); + + ea_set_attr(&ra, + EA_LITERAL_STORE_ADATA(&ea_krt_prefsrc, 0, &ps, sizeof(ps))); + } /* Can be set per-route or per-nexthop */ if (s->rta_flow) - { - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = ra->eattrs; - ra->eattrs = ea; - ea->flags = EALF_SORTED; - ea->count = 1; - ea->attrs[0].id = EA_KRT_REALM; - ea->attrs[0].flags = 0; - ea->attrs[0].type = EAF_TYPE_INT; - ea->attrs[0].u.data = s->rta_flow; - } + ea_set_attr(&ra, + EA_LITERAL_EMBEDDED(&ea_krt_realm, 0, s->rta_flow)); if (a[RTA_METRICS]) { u32 metrics[KRT_METRICS_MAX]; - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr)); - int t, n = 0; - if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0) { log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net); return; } - for (t = 1; t < KRT_METRICS_MAX; t++) + for (uint t = 1; t < KRT_METRICS_MAX; t++) if (metrics[0] & (1 << t)) - { - ea->attrs[n].id = EA_CODE(PROTOCOL_KERNEL, KRT_METRICS_OFFSET + t); - ea->attrs[n].flags = 0; - ea->attrs[n].type = EAF_TYPE_INT; /* FIXME: Some are EAF_TYPE_BITFIELD */ - ea->attrs[n].u.data = metrics[t]; - n++; - } - - if (n > 0) - { - ea->next = ra->eattrs; - ea->flags = EALF_SORTED; - ea->count = n; - ra->eattrs = ea; - } + ea_set_attr(&ra, + EA_LITERAL_EMBEDDED(&ea_krt_metrics[t], 0, metrics[t])); } - /* - * Ideally, now we would send the received route to the rest of kernel code. - * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we - * postpone it and merge next hops until the end of the sequence. Note that - * when doing merging of next hops, we expect the new route to be unipath. - * Otherwise, we ignore additional next hops in nexthop_insert(). - */ + rte e0 = { + .net = net, + .attrs = ra, + }; - if (!s->net) - { - /* Store the new route */ - s->net = lp_alloc(s->pool, net->length); - net_copy(s->net, net); - - s->attrs = ra; - s->proto = p; - s->new = new; - s->krt_src = krt_src; - s->krt_type = i->rtm_type; - s->krt_proto = i->rtm_protocol; - s->krt_metric = priority; - } + if (s->scan) + krt_got_route(p, &e0, krt_src); else - { - /* Merge next hops with the stored route */ - rta *oa = s->attrs; - - struct nexthop *nhs = &oa->nh; - nexthop_insert(&nhs, &ra->nh); + krt_got_route_async(p, &e0, new, krt_src); - /* Perhaps new nexthop is inserted at the first position */ - if (nhs == &ra->nh) - { - /* Swap rtas */ - s->attrs = ra; - - /* Keep old eattrs */ - ra->eattrs = oa->eattrs; - } - } + lp_flush(s->pool); } void -krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */ +krt_do_scan(struct krt_proto *p) { - struct nlmsghdr *h; - struct nl_parse_state s; + struct nl_parse_state s = { + .proto = p, + .pool = nl_linpool, + .scan = 1, + }; - nl_parse_begin(&s, 1); - nl_request_dump_route(AF_UNSPEC); + /* Table-specific scan or shared scan */ + if (p) + nl_request_dump_route(p->af, krt_table_id(p)); + else + nl_request_dump_route(AF_UNSPEC, 0); + + struct nlmsghdr *h; while (h = nl_get_scan()) + { if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) nl_parse_route(&s, h); else log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); - nl_parse_end(&s); + } } /* @@ -2011,16 +1914,18 @@ static struct config *nl_last_config; /* For tracking changes to nl_async_bufsiz static void nl_async_msg(struct nlmsghdr *h) { - struct nl_parse_state s; + struct nl_parse_state s = { + .proto = NULL, + .pool = nl_linpool, + .scan = 0, + }; switch (h->nlmsg_type) { case RTM_NEWROUTE: case RTM_DELROUTE: DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type); - nl_parse_begin(&s, 0); nl_parse_route(&s, h); - nl_parse_end(&s); break; case RTM_NEWLINK: case RTM_DELLINK: @@ -2138,7 +2043,7 @@ nl_open_async(void) sk->rx_hook = nl_async_hook; sk->err_hook = nl_async_err_hook; sk->fd = fd; - if (sk_open(sk) < 0) + if (sk_open(sk, &main_birdloop) < 0) bug("Netlink: sk_open failed"); } @@ -2178,6 +2083,8 @@ krt_sys_io_init(void) { nl_linpool = lp_new_default(krt_pool); HASH_INIT(nl_table_map, krt_pool, 6); + + nl_ea_register(); } int @@ -2231,56 +2138,6 @@ krt_sys_copy_config(struct krt_config *d, struct krt_config *s) d->sys.metric = s->sys.metric; } -static const char *krt_metrics_names[KRT_METRICS_MAX] = { - NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss", - "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack" -}; - -static const char *krt_features_names[KRT_FEATURES_MAX] = { - "ecn", NULL, NULL, "allfrag" -}; - -int -krt_sys_get_attr(const eattr *a, byte *buf, int buflen UNUSED) -{ - switch (a->id) - { - case EA_KRT_PREFSRC: - bsprintf(buf, "prefsrc"); - return GA_NAME; - - case EA_KRT_REALM: - bsprintf(buf, "realm"); - return GA_NAME; - - case EA_KRT_SCOPE: - bsprintf(buf, "scope"); - return GA_NAME; - - case EA_KRT_LOCK: - buf += bsprintf(buf, "lock:"); - ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX); - return GA_FULL; - - case EA_KRT_FEATURES: - buf += bsprintf(buf, "features:"); - ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX); - return GA_FULL; - - default:; - int id = (int)EA_ID(a->id) - KRT_METRICS_OFFSET; - if (id > 0 && id < KRT_METRICS_MAX) - { - bsprintf(buf, "%s", krt_metrics_names[id]); - return GA_NAME; - } - - return GA_UNKNOWN; - } -} - - - void kif_sys_start(struct kif_proto *p UNUSED) { diff --git a/sysdep/linux/netlink.c.orig b/sysdep/linux/netlink.c.orig deleted file mode 100644 index 7cea5322..00000000 --- a/sysdep/linux/netlink.c.orig +++ /dev/null @@ -1,2179 +0,0 @@ -/* - * BIRD -- Linux Netlink Interface - * - * (c) 1999--2000 Martin Mares <mj@ucw.cz> - * - * Can be freely distributed and used under the terms of the GNU GPL. - */ - -#include <alloca.h> -#include <stdio.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/socket.h> -#include <sys/uio.h> -#include <errno.h> - -#undef LOCAL_DEBUG - -#include "nest/bird.h" -#include "nest/route.h" -#include "nest/protocol.h" -#include "nest/iface.h" -#include "lib/alloca.h" -#include "sysdep/unix/unix.h" -#include "sysdep/unix/krt.h" -#include "lib/socket.h" -#include "lib/string.h" -#include "lib/hash.h" -#include "conf/conf.h" - -#include <asm/types.h> -#include <linux/if.h> -#include <linux/netlink.h> -#include <linux/rtnetlink.h> - -#ifdef HAVE_MPLS_KERNEL -#include <linux/lwtunnel.h> -#endif - -#ifndef MSG_TRUNC /* Hack: Several versions of glibc miss this one :( */ -#define MSG_TRUNC 0x20 -#endif - -#ifndef IFA_FLAGS -#define IFA_FLAGS 8 -#endif - -#ifndef IFF_LOWER_UP -#define IFF_LOWER_UP 0x10000 -#endif - -#ifndef RTA_TABLE -#define RTA_TABLE 15 -#endif - -#ifndef RTA_VIA -#define RTA_VIA 18 -#endif - -#ifndef RTA_NEWDST -#define RTA_NEWDST 19 -#endif - -#ifndef RTA_ENCAP_TYPE -#define RTA_ENCAP_TYPE 21 -#endif - -#ifndef RTA_ENCAP -#define RTA_ENCAP 22 -#endif - -#define krt_ipv4(p) ((p)->af == AF_INET) -#define krt_ecmp6(p) ((p)->af == AF_INET6) - -const int rt_default_ecmp = 16; - -/* - * Structure nl_parse_state keeps state of received route processing. Ideally, - * we could just independently parse received Netlink messages and immediately - * propagate received routes to the rest of BIRD, but older Linux kernel (before - * version 4.11) represents and announces IPv6 ECMP routes not as one route with - * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of - * routes with the same prefix. More recent kernels work as with IPv4. - * - * Therefore, BIRD keeps currently processed route in nl_parse_state structure - * and postpones its propagation until we expect it to be final; i.e., when - * non-matching route is received or when the scan ends. When another matching - * route is received, it is merged with the already processed route to form an - * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the - * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6 - * routes with RTA_MULTIPATH set are just considered non-matching. - * - * This is ignored for asynchronous notifications (every notification is handled - * as a separate route). It is not an issue for our routes, as we ignore such - * notifications anyways. But importing alien IPv6 ECMP routes does not work - * properly with older kernels. - * - * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes - * for the same prefix. - */ - -struct nl_parse_state -{ - struct linpool *pool; - int scan; - int merge; - - net *net; - rta *attrs; - struct krt_proto *proto; - s8 new; - s8 krt_src; - u8 krt_type; - u8 krt_proto; - u32 krt_metric; - - u32 rta_flow; /* Used during parsing */ -}; - -/* - * Synchronous Netlink interface - */ - -struct nl_sock -{ - int fd; - u32 seq; - byte *rx_buffer; /* Receive buffer */ - struct nlmsghdr *last_hdr; /* Recently received packet */ - uint last_size; -}; - -#define NL_RX_SIZE 8192 - -#define NL_OP_DELETE 0 -#define NL_OP_ADD (NLM_F_CREATE|NLM_F_EXCL) -#define NL_OP_REPLACE (NLM_F_CREATE|NLM_F_REPLACE) -#define NL_OP_APPEND (NLM_F_CREATE|NLM_F_APPEND) - -static linpool *nl_linpool; - -static struct nl_sock nl_scan = {.fd = -1}; /* Netlink socket for synchronous scan */ -static struct nl_sock nl_req = {.fd = -1}; /* Netlink socket for requests */ - -static void -nl_open_sock(struct nl_sock *nl) -{ - if (nl->fd < 0) - { - nl->fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); - if (nl->fd < 0) - die("Unable to open rtnetlink socket: %m"); - nl->seq = (u32) (current_time() TO_S); /* Or perhaps random_u32() ? */ - nl->rx_buffer = xmalloc(NL_RX_SIZE); - nl->last_hdr = NULL; - nl->last_size = 0; - } -} - -static void -nl_open(void) -{ - nl_open_sock(&nl_scan); - nl_open_sock(&nl_req); -} - -static void -nl_send(struct nl_sock *nl, struct nlmsghdr *nh) -{ - struct sockaddr_nl sa; - - memset(&sa, 0, sizeof(sa)); - sa.nl_family = AF_NETLINK; - nh->nlmsg_pid = 0; - nh->nlmsg_seq = ++(nl->seq); - nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len); - if (sendto(nl->fd, nh, nh->nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa)) < 0) - die("rtnetlink sendto: %m"); - nl->last_hdr = NULL; -} - -static void -nl_request_dump(int af, int cmd) -{ - struct { - struct nlmsghdr nh; - struct rtgenmsg g; - } req = { - .nh.nlmsg_type = cmd, - .nh.nlmsg_len = sizeof(req), - .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, - .g.rtgen_family = af - }; - nl_send(&nl_scan, &req.nh); -} - -static struct nlmsghdr * -nl_get_reply(struct nl_sock *nl) -{ - for(;;) - { - if (!nl->last_hdr) - { - struct iovec iov = { nl->rx_buffer, NL_RX_SIZE }; - struct sockaddr_nl sa; - struct msghdr m = { - .msg_name = &sa, - .msg_namelen = sizeof(sa), - .msg_iov = &iov, - .msg_iovlen = 1, - }; - int x = recvmsg(nl->fd, &m, 0); - if (x < 0) - die("nl_get_reply: %m"); - if (sa.nl_pid) /* It isn't from the kernel */ - { - DBG("Non-kernel packet\n"); - continue; - } - nl->last_size = x; - nl->last_hdr = (void *) nl->rx_buffer; - if (m.msg_flags & MSG_TRUNC) - bug("nl_get_reply: got truncated reply which should be impossible"); - } - if (NLMSG_OK(nl->last_hdr, nl->last_size)) - { - struct nlmsghdr *h = nl->last_hdr; - nl->last_hdr = NLMSG_NEXT(h, nl->last_size); - if (h->nlmsg_seq != nl->seq) - { - log(L_WARN "nl_get_reply: Ignoring out of sequence netlink packet (%x != %x)", - h->nlmsg_seq, nl->seq); - continue; - } - return h; - } - if (nl->last_size) - log(L_WARN "nl_get_reply: Found packet remnant of size %d", nl->last_size); - nl->last_hdr = NULL; - } -} - -static struct tbf rl_netlink_err = TBF_DEFAULT_LOG_LIMITS; - -static int -nl_error(struct nlmsghdr *h, int ignore_esrch) -{ - struct nlmsgerr *e; - int ec; - - if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) - { - log(L_WARN "Netlink: Truncated error message received"); - return ENOBUFS; - } - e = (struct nlmsgerr *) NLMSG_DATA(h); - ec = -e->error; - if (ec && !(ignore_esrch && (ec == ESRCH))) - log_rl(&rl_netlink_err, L_WARN "Netlink: %s", strerror(ec)); - return ec; -} - -static struct nlmsghdr * -nl_get_scan(void) -{ - struct nlmsghdr *h = nl_get_reply(&nl_scan); - - if (h->nlmsg_type == NLMSG_DONE) - return NULL; - if (h->nlmsg_type == NLMSG_ERROR) - { - nl_error(h, 0); - return NULL; - } - return h; -} - -static int -nl_exchange(struct nlmsghdr *pkt, int ignore_esrch) -{ - struct nlmsghdr *h; - - nl_send(&nl_req, pkt); - for(;;) - { - h = nl_get_reply(&nl_req); - if (h->nlmsg_type == NLMSG_ERROR) - break; - log(L_WARN "nl_exchange: Unexpected reply received"); - } - return nl_error(h, ignore_esrch) ? -1 : 0; -} - -/* - * Netlink attributes - */ - -static int nl_attr_len; - -static void * -nl_checkin(struct nlmsghdr *h, int lsize) -{ - nl_attr_len = h->nlmsg_len - NLMSG_LENGTH(lsize); - if (nl_attr_len < 0) - { - log(L_ERR "nl_checkin: underrun by %d bytes", -nl_attr_len); - return NULL; - } - return NLMSG_DATA(h); -} - -struct nl_want_attrs { - u8 defined:1; - u8 checksize:1; - u8 size; -}; - - -#define BIRD_IFLA_MAX (IFLA_WIRELESS+1) - -static struct nl_want_attrs ifla_attr_want[BIRD_IFLA_MAX] = { - [IFLA_IFNAME] = { 1, 0, 0 }, - [IFLA_MTU] = { 1, 1, sizeof(u32) }, - [IFLA_MASTER] = { 1, 1, sizeof(u32) }, - [IFLA_WIRELESS] = { 1, 0, 0 }, -}; - - -#define BIRD_IFA_MAX (IFA_FLAGS+1) - -static struct nl_want_attrs ifa_attr_want4[BIRD_IFA_MAX] = { - [IFA_ADDRESS] = { 1, 1, sizeof(ip4_addr) }, - [IFA_LOCAL] = { 1, 1, sizeof(ip4_addr) }, - [IFA_BROADCAST] = { 1, 1, sizeof(ip4_addr) }, - [IFA_FLAGS] = { 1, 1, sizeof(u32) }, -}; - -static struct nl_want_attrs ifa_attr_want6[BIRD_IFA_MAX] = { - [IFA_ADDRESS] = { 1, 1, sizeof(ip6_addr) }, - [IFA_LOCAL] = { 1, 1, sizeof(ip6_addr) }, - [IFA_FLAGS] = { 1, 1, sizeof(u32) }, -}; - - -#define BIRD_RTA_MAX (RTA_ENCAP+1) - -static struct nl_want_attrs nexthop_attr_want4[BIRD_RTA_MAX] = { - [RTA_GATEWAY] = { 1, 1, sizeof(ip4_addr) }, - [RTA_VIA] = { 1, 0, 0 }, - [RTA_FLOW] = { 1, 1, sizeof(u32) }, - [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) }, - [RTA_ENCAP] = { 1, 0, 0 }, -}; - -static struct nl_want_attrs nexthop_attr_want6[BIRD_RTA_MAX] = { - [RTA_GATEWAY] = { 1, 1, sizeof(ip6_addr) }, - [RTA_VIA] = { 1, 0, 0 }, - [RTA_FLOW] = { 1, 1, sizeof(u32) }, - [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) }, - [RTA_ENCAP] = { 1, 0, 0 }, -}; - -#ifdef HAVE_MPLS_KERNEL -static struct nl_want_attrs nexthop_attr_want_mpls[BIRD_RTA_MAX] = { - [RTA_VIA] = { 1, 0, 0 }, - [RTA_NEWDST] = { 1, 0, 0 }, -}; - -static struct nl_want_attrs encap_mpls_want[BIRD_RTA_MAX] = { - [RTA_DST] = { 1, 0, 0 }, -}; -#endif - -static struct nl_want_attrs rtm_attr_want4[BIRD_RTA_MAX] = { - [RTA_DST] = { 1, 1, sizeof(ip4_addr) }, - [RTA_OIF] = { 1, 1, sizeof(u32) }, - [RTA_GATEWAY] = { 1, 1, sizeof(ip4_addr) }, - [RTA_PRIORITY] = { 1, 1, sizeof(u32) }, - [RTA_PREFSRC] = { 1, 1, sizeof(ip4_addr) }, - [RTA_METRICS] = { 1, 0, 0 }, - [RTA_MULTIPATH] = { 1, 0, 0 }, - [RTA_FLOW] = { 1, 1, sizeof(u32) }, - [RTA_TABLE] = { 1, 1, sizeof(u32) }, - [RTA_VIA] = { 1, 0, 0 }, - [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) }, - [RTA_ENCAP] = { 1, 0, 0 }, -}; - -static struct nl_want_attrs rtm_attr_want6[BIRD_RTA_MAX] = { - [RTA_DST] = { 1, 1, sizeof(ip6_addr) }, - [RTA_SRC] = { 1, 1, sizeof(ip6_addr) }, - [RTA_IIF] = { 1, 1, sizeof(u32) }, - [RTA_OIF] = { 1, 1, sizeof(u32) }, - [RTA_GATEWAY] = { 1, 1, sizeof(ip6_addr) }, - [RTA_PRIORITY] = { 1, 1, sizeof(u32) }, - [RTA_PREFSRC] = { 1, 1, sizeof(ip6_addr) }, - [RTA_METRICS] = { 1, 0, 0 }, - [RTA_MULTIPATH] = { 1, 0, 0 }, - [RTA_FLOW] = { 1, 1, sizeof(u32) }, - [RTA_TABLE] = { 1, 1, sizeof(u32) }, - [RTA_VIA] = { 1, 0, 0 }, - [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) }, - [RTA_ENCAP] = { 1, 0, 0 }, -}; - -#ifdef HAVE_MPLS_KERNEL -static struct nl_want_attrs rtm_attr_want_mpls[BIRD_RTA_MAX] = { - [RTA_DST] = { 1, 1, sizeof(u32) }, - [RTA_IIF] = { 1, 1, sizeof(u32) }, - [RTA_OIF] = { 1, 1, sizeof(u32) }, - [RTA_PRIORITY] = { 1, 1, sizeof(u32) }, - [RTA_METRICS] = { 1, 0, 0 }, - [RTA_MULTIPATH] = { 1, 0, 0 }, - [RTA_FLOW] = { 1, 1, sizeof(u32) }, - [RTA_TABLE] = { 1, 1, sizeof(u32) }, - [RTA_VIA] = { 1, 0, 0 }, - [RTA_NEWDST] = { 1, 0, 0 }, -}; -#endif - - -static int -nl_parse_attrs(struct rtattr *a, struct nl_want_attrs *want, struct rtattr **k, int ksize) -{ - int max = ksize / sizeof(struct rtattr *); - bzero(k, ksize); - - for ( ; RTA_OK(a, nl_attr_len); a = RTA_NEXT(a, nl_attr_len)) - { - if ((a->rta_type >= max) || !want[a->rta_type].defined) - continue; - - if (want[a->rta_type].checksize && (RTA_PAYLOAD(a) != want[a->rta_type].size)) - { - log(L_ERR "nl_parse_attrs: Malformed attribute received"); - return 0; - } - - k[a->rta_type] = a; - } - - if (nl_attr_len) - { - log(L_ERR "nl_parse_attrs: remnant of size %d", nl_attr_len); - return 0; - } - - return 1; -} - -static inline u16 rta_get_u16(struct rtattr *a) -{ return *(u16 *) RTA_DATA(a); } - -static inline u32 rta_get_u32(struct rtattr *a) -{ return *(u32 *) RTA_DATA(a); } - -static inline ip4_addr rta_get_ip4(struct rtattr *a) -{ return ip4_ntoh(*(ip4_addr *) RTA_DATA(a)); } - -static inline ip6_addr rta_get_ip6(struct rtattr *a) -{ return ip6_ntoh(*(ip6_addr *) RTA_DATA(a)); } - -static inline ip_addr rta_get_ipa(struct rtattr *a) -{ - if (RTA_PAYLOAD(a) == sizeof(ip4_addr)) - return ipa_from_ip4(rta_get_ip4(a)); - else - return ipa_from_ip6(rta_get_ip6(a)); -} - -#ifdef HAVE_MPLS_KERNEL -static inline ip_addr rta_get_via(struct rtattr *a) -{ - struct rtvia *v = RTA_DATA(a); - switch(v->rtvia_family) { - case AF_INET: return ipa_from_ip4(ip4_ntoh(*(ip4_addr *) v->rtvia_addr)); - case AF_INET6: return ipa_from_ip6(ip6_ntoh(*(ip6_addr *) v->rtvia_addr)); - } - return IPA_NONE; -} - -static u32 rta_mpls_stack[MPLS_MAX_LABEL_STACK]; -static inline int rta_get_mpls(struct rtattr *a, u32 *stack) -{ - if (!a) - return 0; - - if (RTA_PAYLOAD(a) % 4) - log(L_WARN "KRT: Strange length of received MPLS stack: %u", RTA_PAYLOAD(a)); - - int labels = mpls_get(RTA_DATA(a), RTA_PAYLOAD(a) & ~0x3, stack); - - if (labels < 0) - { - log(L_WARN "KRT: Too long MPLS stack received, ignoring"); - labels = 0; - } - - return labels; -} -#endif - -struct rtattr * -nl_add_attr(struct nlmsghdr *h, uint bufsize, uint code, const void *data, uint dlen) -{ - uint pos = NLMSG_ALIGN(h->nlmsg_len); - uint len = RTA_LENGTH(dlen); - - if (pos + len > bufsize) - bug("nl_add_attr: packet buffer overflow"); - - struct rtattr *a = (struct rtattr *)((char *)h + pos); - a->rta_type = code; - a->rta_len = len; - h->nlmsg_len = pos + len; - - if (dlen > 0) - memcpy(RTA_DATA(a), data, dlen); - - return a; -} - -static inline struct rtattr * -nl_open_attr(struct nlmsghdr *h, uint bufsize, uint code) -{ - return nl_add_attr(h, bufsize, code, NULL, 0); -} - -static inline void -nl_close_attr(struct nlmsghdr *h, struct rtattr *a) -{ - a->rta_len = (void *)h + NLMSG_ALIGN(h->nlmsg_len) - (void *)a; -} - -static inline void -nl_add_attr_u16(struct nlmsghdr *h, uint bufsize, int code, u16 data) -{ - nl_add_attr(h, bufsize, code, &data, 2); -} - -static inline void -nl_add_attr_u32(struct nlmsghdr *h, uint bufsize, int code, u32 data) -{ - nl_add_attr(h, bufsize, code, &data, 4); -} - -static inline void -nl_add_attr_ip4(struct nlmsghdr *h, uint bufsize, int code, ip4_addr ip4) -{ - ip4 = ip4_hton(ip4); - nl_add_attr(h, bufsize, code, &ip4, sizeof(ip4)); -} - -static inline void -nl_add_attr_ip6(struct nlmsghdr *h, uint bufsize, int code, ip6_addr ip6) -{ - ip6 = ip6_hton(ip6); - nl_add_attr(h, bufsize, code, &ip6, sizeof(ip6)); -} - -static inline void -nl_add_attr_ipa(struct nlmsghdr *h, uint bufsize, int code, ip_addr ipa) -{ - if (ipa_is_ip4(ipa)) - nl_add_attr_ip4(h, bufsize, code, ipa_to_ip4(ipa)); - else - nl_add_attr_ip6(h, bufsize, code, ipa_to_ip6(ipa)); -} - -#ifdef HAVE_MPLS_KERNEL -static inline void -nl_add_attr_mpls(struct nlmsghdr *h, uint bufsize, int code, int len, u32 *stack) -{ - char buf[len*4]; - mpls_put(buf, len, stack); - nl_add_attr(h, bufsize, code, buf, len*4); -} - -static inline void -nl_add_attr_mpls_encap(struct nlmsghdr *h, uint bufsize, int len, u32 *stack) -{ - nl_add_attr_u16(h, bufsize, RTA_ENCAP_TYPE, LWTUNNEL_ENCAP_MPLS); - - struct rtattr *nest = nl_open_attr(h, bufsize, RTA_ENCAP); - nl_add_attr_mpls(h, bufsize, RTA_DST, len, stack); - nl_close_attr(h, nest); -} - -static inline void -nl_add_attr_via(struct nlmsghdr *h, uint bufsize, ip_addr ipa) -{ - struct rtvia *via = alloca(sizeof(struct rtvia) + 16); - - if (ipa_is_ip4(ipa)) - { - via->rtvia_family = AF_INET; - put_ip4(via->rtvia_addr, ipa_to_ip4(ipa)); - nl_add_attr(h, bufsize, RTA_VIA, via, sizeof(struct rtvia) + 4); - } - else - { - via->rtvia_family = AF_INET6; - put_ip6(via->rtvia_addr, ipa_to_ip6(ipa)); - nl_add_attr(h, bufsize, RTA_VIA, via, sizeof(struct rtvia) + 16); - } -} -#endif - -static inline struct rtnexthop * -nl_open_nexthop(struct nlmsghdr *h, uint bufsize) -{ - uint pos = NLMSG_ALIGN(h->nlmsg_len); - uint len = RTNH_LENGTH(0); - - if (pos + len > bufsize) - bug("nl_open_nexthop: packet buffer overflow"); - - h->nlmsg_len = pos + len; - - return (void *)h + pos; -} - -static inline void -nl_close_nexthop(struct nlmsghdr *h, struct rtnexthop *nh) -{ - nh->rtnh_len = (void *)h + NLMSG_ALIGN(h->nlmsg_len) - (void *)nh; -} - -static inline void -nl_add_nexthop(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af UNUSED) -{ -#ifdef HAVE_MPLS_KERNEL - if (nh->labels > 0) - if (af == AF_MPLS) - nl_add_attr_mpls(h, bufsize, RTA_NEWDST, nh->labels, nh->label); - else - nl_add_attr_mpls_encap(h, bufsize, nh->labels, nh->label); - - if (ipa_nonzero(nh->gw)) - { - if (af == (ipa_is_ip4(nh->gw) ? AF_INET : AF_INET6)) - nl_add_attr_ipa(h, bufsize, RTA_GATEWAY, nh->gw); - else - nl_add_attr_via(h, bufsize, nh->gw); - } -#else - - if (ipa_nonzero(nh->gw)) - nl_add_attr_ipa(h, bufsize, RTA_GATEWAY, nh->gw); -#endif -} - -static void -nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, ea_list *eattrs) -{ - struct rtattr *a = nl_open_attr(h, bufsize, RTA_MULTIPATH); - eattr *flow = ea_find(eattrs, EA_KRT_REALM); - - for (; nh; nh = nh->next) - { - struct rtnexthop *rtnh = nl_open_nexthop(h, bufsize); - - rtnh->rtnh_flags = 0; - rtnh->rtnh_hops = nh->weight; - rtnh->rtnh_ifindex = nh->iface->index; - - nl_add_nexthop(h, bufsize, nh, af); - - if (nh->flags & RNF_ONLINK) - rtnh->rtnh_flags |= RTNH_F_ONLINK; - - /* Our KRT_REALM is per-route, but kernel RTA_FLOW is per-nexthop. - Therefore, we need to attach the same attribute to each nexthop. */ - if (flow) - nl_add_attr_u32(h, bufsize, RTA_FLOW, flow->u.data); - - nl_close_nexthop(h, rtnh); - } - - nl_close_attr(h, a); -} - -static struct nexthop * -nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr *n, struct rtattr *ra, int af, int krt_src) -{ - struct rtattr *a[BIRD_RTA_MAX]; - struct rtnexthop *nh = RTA_DATA(ra); - struct nexthop *rv, *first, **last; - unsigned len = RTA_PAYLOAD(ra); - - first = NULL; - last = &first; - - while (len) - { - /* Use RTNH_OK(nh,len) ?? */ - if ((len < sizeof(*nh)) || (len < nh->rtnh_len)) - goto err; - - if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) - goto next; - - *last = rv = lp_allocz(s->pool, NEXTHOP_MAX_SIZE); - last = &(rv->next); - - rv->weight = nh->rtnh_hops; - rv->iface = if_find_by_index(nh->rtnh_ifindex); - if (!rv->iface) - { - log(L_ERR "KRT: Received route %N with unknown ifindex %u", n, nh->rtnh_ifindex); - return NULL; - } - - /* Nonexistent RTNH_PAYLOAD ?? */ - nl_attr_len = nh->rtnh_len - RTNH_LENGTH(0); - switch (af) - { - case AF_INET: - if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want4, a, sizeof(a))) - goto err; - break; - - case AF_INET6: - if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want6, a, sizeof(a))) - goto err; - break; - -#ifdef HAVE_MPLS_KERNEL - case AF_MPLS: - if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want_mpls, a, sizeof(a))) - goto err; - - if (a[RTA_NEWDST]) - rv->labels = rta_get_mpls(a[RTA_NEWDST], rv->label); - - break; -#endif - - default: - goto err; - } - - if (a[RTA_GATEWAY]) - rv->gw = rta_get_ipa(a[RTA_GATEWAY]); - - if (a[RTA_FLOW]) - s->rta_flow = rta_get_u32(a[RTA_FLOW]); - -#ifdef HAVE_MPLS_KERNEL - if (a[RTA_VIA]) - rv->gw = rta_get_via(a[RTA_VIA]); -#endif - - if (ipa_nonzero(rv->gw)) - { - if (nh->rtnh_flags & RTNH_F_ONLINK) - rv->flags |= RNF_ONLINK; - - neighbor *nbr; - nbr = neigh_find(&p->p, rv->gw, rv->iface, - (rv->flags & RNF_ONLINK) ? NEF_ONLINK : 0); - if (!nbr || (nbr->scope == SCOPE_HOST)) - { - log(L_ERR "KRT: Received route %N with strange next-hop %I", n, rv->gw); - return NULL; - } - } - -#ifdef HAVE_MPLS_KERNEL - if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE]) - { - if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS) - { - log(L_WARN "KRT: Received route %N with unknown encapsulation method %d", - n, rta_get_u16(a[RTA_ENCAP_TYPE])); - return NULL; - } - - struct rtattr *enca[BIRD_RTA_MAX]; - nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]); - nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca)); - rv->labels = rta_get_mpls(enca[RTA_DST], rv->label); - } -#endif - - next: - len -= NLMSG_ALIGN(nh->rtnh_len); - nh = RTNH_NEXT(nh); - } - - /* Ensure nexthops are sorted to satisfy nest invariant */ - if (!nexthop_is_sorted(first)) - first = nexthop_sort(first); - - return first; - -err: - log(L_ERR "KRT: Received strange multipath route %N", n); - return NULL; -} - -static void -nl_add_metrics(struct nlmsghdr *h, uint bufsize, u32 *metrics, int max) -{ - struct rtattr *a = nl_open_attr(h, bufsize, RTA_METRICS); - int t; - - for (t = 1; t < max; t++) - if (metrics[0] & (1 << t)) - nl_add_attr_u32(h, bufsize, t, metrics[t]); - - nl_close_attr(h, a); -} - -static int -nl_parse_metrics(struct rtattr *hdr, u32 *metrics, int max) -{ - struct rtattr *a = RTA_DATA(hdr); - int len = RTA_PAYLOAD(hdr); - - metrics[0] = 0; - for (; RTA_OK(a, len); a = RTA_NEXT(a, len)) - { - if (a->rta_type == RTA_UNSPEC) - continue; - - if (a->rta_type >= max) - continue; - - if (RTA_PAYLOAD(a) != 4) - return -1; - - metrics[0] |= 1 << a->rta_type; - metrics[a->rta_type] = rta_get_u32(a); - } - - if (len > 0) - return -1; - - return 0; -} - - -/* - * Scanning of interfaces - */ - -static void -nl_parse_link(struct nlmsghdr *h, int scan) -{ - struct ifinfomsg *i; - struct rtattr *a[BIRD_IFLA_MAX]; - int new = h->nlmsg_type == RTM_NEWLINK; - struct iface f = {}; - struct iface *ifi; - char *name; - u32 mtu, master = 0; - uint fl; - - if (!(i = nl_checkin(h, sizeof(*i))) || !nl_parse_attrs(IFLA_RTA(i), ifla_attr_want, a, sizeof(a))) - return; - if (!a[IFLA_IFNAME] || (RTA_PAYLOAD(a[IFLA_IFNAME]) < 2) || !a[IFLA_MTU]) - { - /* - * IFLA_IFNAME and IFLA_MTU are required, in fact, but there may also come - * a message with IFLA_WIRELESS set, where (e.g.) no IFLA_IFNAME exists. - * We simply ignore all such messages with IFLA_WIRELESS without notice. - */ - - if (a[IFLA_WIRELESS]) - return; - - log(L_ERR "KIF: Malformed message received"); - return; - } - - name = RTA_DATA(a[IFLA_IFNAME]); - mtu = rta_get_u32(a[IFLA_MTU]); - - if (a[IFLA_MASTER]) - master = rta_get_u32(a[IFLA_MASTER]); - - ifi = if_find_by_index(i->ifi_index); - if (!new) - { - DBG("KIF: IF%d(%s) goes down\n", i->ifi_index, name); - if (!ifi) - return; - - if_delete(ifi); - } - else - { - DBG("KIF: IF%d(%s) goes up (mtu=%d,flg=%x)\n", i->ifi_index, name, mtu, i->ifi_flags); - if (ifi && strncmp(ifi->name, name, sizeof(ifi->name)-1)) - if_delete(ifi); - - strncpy(f.name, name, sizeof(f.name)-1); - f.index = i->ifi_index; - f.mtu = mtu; - - f.master_index = master; - f.master = if_find_by_index(master); - - fl = i->ifi_flags; - if (fl & IFF_UP) - f.flags |= IF_ADMIN_UP; - if (fl & IFF_LOWER_UP) - f.flags |= IF_LINK_UP; - if (fl & IFF_LOOPBACK) /* Loopback */ - f.flags |= IF_MULTIACCESS | IF_LOOPBACK | IF_IGNORE; - else if (fl & IFF_POINTOPOINT) /* PtP */ - f.flags |= IF_MULTICAST; - else if (fl & IFF_BROADCAST) /* Broadcast */ - f.flags |= IF_MULTIACCESS | IF_BROADCAST | IF_MULTICAST; - else - f.flags |= IF_MULTIACCESS; /* NBMA */ - - if (fl & IFF_MULTICAST) - f.flags |= IF_MULTICAST; - - ifi = if_update(&f); - - if (!scan) - if_end_partial_update(ifi); - } -} - -static void -nl_parse_addr4(struct ifaddrmsg *i, int scan, int new) -{ - struct rtattr *a[BIRD_IFA_MAX]; - struct iface *ifi; - u32 ifa_flags; - int scope; - - if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want4, a, sizeof(a))) - return; - - if (!a[IFA_LOCAL]) - { - log(L_ERR "KIF: Malformed message received (missing IFA_LOCAL)"); - return; - } - if (!a[IFA_ADDRESS]) - { - log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)"); - return; - } - - ifi = if_find_by_index(i->ifa_index); - if (!ifi) - { - log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index); - return; - } - - if (a[IFA_FLAGS]) - ifa_flags = rta_get_u32(a[IFA_FLAGS]); - else - ifa_flags = i->ifa_flags; - - struct ifa ifa; - bzero(&ifa, sizeof(ifa)); - ifa.iface = ifi; - if (ifa_flags & IFA_F_SECONDARY) - ifa.flags |= IA_SECONDARY; - - ifa.ip = rta_get_ipa(a[IFA_LOCAL]); - - if (i->ifa_prefixlen > IP4_MAX_PREFIX_LENGTH) - { - log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen); - new = 0; - } - if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH) - { - ifa.brd = rta_get_ipa(a[IFA_ADDRESS]); - net_fill_ip4(&ifa.prefix, rta_get_ip4(a[IFA_ADDRESS]), i->ifa_prefixlen); - - /* It is either a host address or a peer address */ - if (ipa_equal(ifa.ip, ifa.brd)) - ifa.flags |= IA_HOST; - else - { - ifa.flags |= IA_PEER; - ifa.opposite = ifa.brd; - } - } - else - { - net_fill_ip4(&ifa.prefix, ipa_to_ip4(ifa.ip), i->ifa_prefixlen); - net_normalize(&ifa.prefix); - - if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 1) - ifa.opposite = ipa_opposite_m1(ifa.ip); - - if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 2) - ifa.opposite = ipa_opposite_m2(ifa.ip); - - if (ifi->flags & IF_BROADCAST) - { - /* If kernel offers us a broadcast address, we trust it */ - if (a[IFA_BROADCAST]) - ifa.brd = ipa_from_ip4(rta_get_ip4(a[IFA_BROADCAST])); - /* Otherwise we create one (except for /31) */ - else if (i->ifa_prefixlen < (IP4_MAX_PREFIX_LENGTH - 1)) - ifa.brd = ipa_from_ip4(ip4_or(ipa_to_ip4(ifa.ip), - ip4_not(ip4_mkmask(i->ifa_prefixlen)))); - } - } - - scope = ipa_classify(ifa.ip); - if (scope < 0) - { - log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name); - return; - } - ifa.scope = scope & IADDR_SCOPE_MASK; - - DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n", - ifi->index, ifi->name, - new ? "added" : "removed", - ifa.ip, ifa.flags, &ifa.prefix, ifa.brd, ifa.opposite); - - if (new) - ifa_update(&ifa); - else - ifa_delete(&ifa); - - if (!scan) - if_end_partial_update(ifi); -} - -static void -nl_parse_addr6(struct ifaddrmsg *i, int scan, int new) -{ - struct rtattr *a[BIRD_IFA_MAX]; - struct iface *ifi; - u32 ifa_flags; - int scope; - - if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want6, a, sizeof(a))) - return; - - if (!a[IFA_ADDRESS]) - { - log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)"); - return; - } - - ifi = if_find_by_index(i->ifa_index); - if (!ifi) - { - log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index); - return; - } - - if (a[IFA_FLAGS]) - ifa_flags = rta_get_u32(a[IFA_FLAGS]); - else - ifa_flags = i->ifa_flags; - - struct ifa ifa; - bzero(&ifa, sizeof(ifa)); - ifa.iface = ifi; - if (ifa_flags & IFA_F_SECONDARY) - ifa.flags |= IA_SECONDARY; - - /* Ignore tentative addresses silently */ - if (ifa_flags & IFA_F_TENTATIVE) - return; - - /* IFA_LOCAL can be unset for IPv6 interfaces */ - ifa.ip = rta_get_ipa(a[IFA_LOCAL] ? : a[IFA_ADDRESS]); - - if (i->ifa_prefixlen > IP6_MAX_PREFIX_LENGTH) - { - log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen); - new = 0; - } - if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH) - { - ifa.brd = rta_get_ipa(a[IFA_ADDRESS]); - net_fill_ip6(&ifa.prefix, rta_get_ip6(a[IFA_ADDRESS]), i->ifa_prefixlen); - - /* It is either a host address or a peer address */ - if (ipa_equal(ifa.ip, ifa.brd)) - ifa.flags |= IA_HOST; - else - { - ifa.flags |= IA_PEER; - ifa.opposite = ifa.brd; - } - } - else - { - net_fill_ip6(&ifa.prefix, ipa_to_ip6(ifa.ip), i->ifa_prefixlen); - net_normalize(&ifa.prefix); - - if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH - 1) - ifa.opposite = ipa_opposite_m1(ifa.ip); - } - - scope = ipa_classify(ifa.ip); - if (scope < 0) - { - log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name); - return; - } - ifa.scope = scope & IADDR_SCOPE_MASK; - - DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n", - ifi->index, ifi->name, - new ? "added" : "removed", - ifa.ip, ifa.flags, &ifa.prefix, ifa.brd, ifa.opposite); - - if (new) - ifa_update(&ifa); - else - ifa_delete(&ifa); - - if (!scan) - if_end_partial_update(ifi); -} - -static void -nl_parse_addr(struct nlmsghdr *h, int scan) -{ - struct ifaddrmsg *i; - - if (!(i = nl_checkin(h, sizeof(*i)))) - return; - - int new = (h->nlmsg_type == RTM_NEWADDR); - - switch (i->ifa_family) - { - case AF_INET: - return nl_parse_addr4(i, scan, new); - - case AF_INET6: - return nl_parse_addr6(i, scan, new); - } -} - -void -kif_do_scan(struct kif_proto *p UNUSED) -{ - struct nlmsghdr *h; - - if_start_update(); - - nl_request_dump(AF_UNSPEC, RTM_GETLINK); - while (h = nl_get_scan()) - if (h->nlmsg_type == RTM_NEWLINK || h->nlmsg_type == RTM_DELLINK) - nl_parse_link(h, 1); - else - log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type); - - /* Re-resolve master interface for slaves */ - struct iface *i; - WALK_LIST(i, iface_list) - if (i->master_index) - { - struct iface f = { - .flags = i->flags, - .mtu = i->mtu, - .index = i->index, - .master_index = i->master_index, - .master = if_find_by_index(i->master_index) - }; - - if (f.master != i->master) - { - memcpy(f.name, i->name, sizeof(f.name)); - if_update(&f); - } - } - - nl_request_dump(AF_INET, RTM_GETADDR); - while (h = nl_get_scan()) - if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR) - nl_parse_addr(h, 1); - else - log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type); - - nl_request_dump(AF_INET6, RTM_GETADDR); - while (h = nl_get_scan()) - if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR) - nl_parse_addr(h, 1); - else - log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type); - - if_end_update(); -} - -/* - * Routes - */ - -static inline u32 -krt_table_id(struct krt_proto *p) -{ - return KRT_CF->sys.table_id; -} - -static HASH(struct krt_proto) nl_table_map; - -#define RTH_KEY(p) p->af, krt_table_id(p) -#define RTH_NEXT(p) p->sys.hash_next -#define RTH_EQ(a1,i1,a2,i2) a1 == a2 && i1 == i2 -#define RTH_FN(a,i) a ^ u32_hash(i) - -#define RTH_REHASH rth_rehash -#define RTH_PARAMS /8, *2, 2, 2, 6, 20 - -HASH_DEFINE_REHASH_FN(RTH, struct krt_proto) - -int -krt_capable(rte *e) -{ - rta *a = e->attrs; - - switch (a->dest) - { - case RTD_UNICAST: - case RTD_BLACKHOLE: - case RTD_UNREACHABLE: - case RTD_PROHIBIT: - return 1; - - default: - return 0; - } -} - -static inline int -nh_bufsize(struct nexthop *nh) -{ - int rv = 0; - for (; nh != NULL; nh = nh->next) - rv += RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr))); - return rv; -} - -static int -nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh) -{ - eattr *ea; - net *net = e->net; - rta *a = e->attrs; - ea_list *eattrs = a->eattrs; - int bufsize = 128 + KRT_METRICS_MAX*8 + nh_bufsize(&(a->nh)); - u32 priority = 0; - - struct { - struct nlmsghdr h; - struct rtmsg r; - char buf[0]; - } *r; - - int rsize = sizeof(*r) + bufsize; - r = alloca(rsize); - - DBG("nl_send_route(%N,op=%x)\n", net->n.addr, op); - - bzero(&r->h, sizeof(r->h)); - bzero(&r->r, sizeof(r->r)); - r->h.nlmsg_type = op ? RTM_NEWROUTE : RTM_DELROUTE; - r->h.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); - r->h.nlmsg_flags = op | NLM_F_REQUEST | NLM_F_ACK; - - r->r.rtm_family = p->af; - r->r.rtm_dst_len = net_pxlen(net->n.addr); - r->r.rtm_protocol = RTPROT_BIRD; - r->r.rtm_scope = RT_SCOPE_NOWHERE; -#ifdef HAVE_MPLS_KERNEL - if (p->af == AF_MPLS) - { - /* - * Kernel MPLS code is a bit picky. We must: - * 1) Always set RT_SCOPE_UNIVERSE and RTN_UNICAST (even for RTM_DELROUTE) - * 2) Never use RTA_PRIORITY - */ - - u32 label = net_mpls(net->n.addr); - nl_add_attr_mpls(&r->h, rsize, RTA_DST, 1, &label); - r->r.rtm_scope = RT_SCOPE_UNIVERSE; - r->r.rtm_type = RTN_UNICAST; - } - else -#endif - { - nl_add_attr_ipa(&r->h, rsize, RTA_DST, net_prefix(net->n.addr)); - - /* Add source address for IPv6 SADR routes */ - if (net->n.addr->type == NET_IP6_SADR) - { - net_addr_ip6_sadr *a = (void *) &net->n.addr; - nl_add_attr_ip6(&r->h, rsize, RTA_SRC, a->src_prefix); - r->r.rtm_src_len = a->src_pxlen; - } - } - - /* - * Strange behavior for RTM_DELROUTE: - * 1) rtm_family is ignored in IPv6, works for IPv4 - * 2) not setting RTA_PRIORITY is different from setting default value (on IPv6) - * 3) not setting RTA_PRIORITY is equivalent to setting 0, which is wildcard - */ - - if (krt_table_id(p) < 256) - r->r.rtm_table = krt_table_id(p); - else - nl_add_attr_u32(&r->h, rsize, RTA_TABLE, krt_table_id(p)); - - if (p->af == AF_MPLS) - priority = 0; - else if (a->source == RTS_DUMMY) - priority = e->u.krt.metric; - else if (KRT_CF->sys.metric) - priority = KRT_CF->sys.metric; - else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC))) - priority = ea->u.data; - - if (priority) - nl_add_attr_u32(&r->h, rsize, RTA_PRIORITY, priority); - - /* For route delete, we do not specify remaining route attributes */ - if (op == NL_OP_DELETE) - goto dest; - - /* Default scope is LINK for device routes, UNIVERSE otherwise */ - if (p->af == AF_MPLS) - r->r.rtm_scope = RT_SCOPE_UNIVERSE; - else if (ea = ea_find(eattrs, EA_KRT_SCOPE)) - r->r.rtm_scope = ea->u.data; - else - r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; - - if (ea = ea_find(eattrs, EA_KRT_PREFSRC)) - nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data); - - if (ea = ea_find(eattrs, EA_KRT_REALM)) - nl_add_attr_u32(&r->h, rsize, RTA_FLOW, ea->u.data); - - - u32 metrics[KRT_METRICS_MAX]; - metrics[0] = 0; - - struct ea_walk_state ews = { .eattrs = eattrs }; - while (ea = ea_walk(&ews, EA_KRT_METRICS, KRT_METRICS_MAX)) - { - int id = ea->id - EA_KRT_METRICS; - metrics[0] |= 1 << id; - metrics[id] = ea->u.data; - } - - if (metrics[0]) - nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX); - - -dest: - switch (dest) - { - case RTD_UNICAST: - r->r.rtm_type = RTN_UNICAST; - if (nh->next && !krt_ecmp6(p)) - nl_add_multipath(&r->h, rsize, nh, p->af, eattrs); - else - { - nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->iface->index); - nl_add_nexthop(&r->h, rsize, nh, p->af); - - if (nh->flags & RNF_ONLINK) - r->r.rtm_flags |= RTNH_F_ONLINK; - } - break; - case RTD_BLACKHOLE: - r->r.rtm_type = RTN_BLACKHOLE; - break; - case RTD_UNREACHABLE: - r->r.rtm_type = RTN_UNREACHABLE; - break; - case RTD_PROHIBIT: - r->r.rtm_type = RTN_PROHIBIT; - break; - case RTD_NONE: - break; - default: - bug("krt_capable inconsistent with nl_send_route"); - } - - /* Ignore missing for DELETE */ - return nl_exchange(&r->h, (op == NL_OP_DELETE)); -} - -static inline int -nl_add_rte(struct krt_proto *p, rte *e) -{ - rta *a = e->attrs; - int err = 0; - - if (krt_ecmp6(p) && a->nh.next) - { - struct nexthop *nh = &(a->nh); - - err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, nh); - if (err < 0) - return err; - - for (nh = nh->next; nh; nh = nh->next) - err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, nh); - - return err; - } - - return nl_send_route(p, e, NL_OP_ADD, a->dest, &(a->nh)); -} - -static inline int -nl_delete_rte(struct krt_proto *p, rte *e) -{ - int err = 0; - - /* For IPv6, we just repeatedly request DELETE until we get error */ - do - err = nl_send_route(p, e, NL_OP_DELETE, RTD_NONE, NULL); - while (krt_ecmp6(p) && !err); - - return err; -} - -static inline int -nl_replace_rte(struct krt_proto *p, rte *e) -{ - rta *a = e->attrs; - return nl_send_route(p, e, NL_OP_REPLACE, a->dest, &(a->nh)); -} - - -void -krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old) -{ - int err = 0; - - /* - * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for - * matching rtm_protocol, but that is OK when dedicated priority is used. - * - * We do not use NL_OP_REPLACE for IPv6, as it has broken semantics for ECMP - * and with some kernel versions ECMP replace crashes kernel. Would need more - * testing and checks for kernel versions. - * - * For IPv6, we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the - * old route value, so we do not try to optimize IPv6 ECMP reconfigurations. - */ - - if (krt_ipv4(p) && old && new) - { - err = nl_replace_rte(p, new); - } - else - { - if (old) - nl_delete_rte(p, old); - - if (new) - err = nl_add_rte(p, new); - } - - if (new) - { - if (err < 0) - bmap_clear(&p->sync_map, new->id); - else - bmap_set(&p->sync_map, new->id); - } -} - -static int -nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family) -{ - /* Route merging is used for IPv6 scans */ - if (!s->scan || (rtm_family != AF_INET6)) - return 0; - - /* Saved and new route must have same network, proto/table, and priority */ - if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority)) - return 0; - - /* Both must be regular unicast routes */ - if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST)) - return 0; - - return 1; -} - -static void -nl_announce_route(struct nl_parse_state *s) -{ - rte *e = rte_get_temp(s->attrs); - e->net = s->net; - e->u.krt.src = s->krt_src; - e->u.krt.proto = s->krt_proto; - e->u.krt.seen = 0; - e->u.krt.best = 0; - e->u.krt.metric = s->krt_metric; - - if (s->scan) - krt_got_route(s->proto, e); - else - krt_got_route_async(s->proto, e, s->new); - - s->net = NULL; - s->attrs = NULL; - s->proto = NULL; - lp_flush(s->pool); -} - -static inline void -nl_parse_begin(struct nl_parse_state *s, int scan) -{ - memset(s, 0, sizeof (struct nl_parse_state)); - s->pool = nl_linpool; - s->scan = scan; -} - -static inline void -nl_parse_end(struct nl_parse_state *s) -{ - if (s->net) - nl_announce_route(s); -} - - -#define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0) -#define SKIP(ARG, ...) do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0) - -static void -nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) -{ - struct krt_proto *p; - struct rtmsg *i; - struct rtattr *a[BIRD_RTA_MAX]; - int new = h->nlmsg_type == RTM_NEWROUTE; - - net_addr dst, src = {}; - u32 oif = ~0; - u32 table_id; - u32 priority = 0; - u32 def_scope = RT_SCOPE_UNIVERSE; - int krt_src; - - if (!(i = nl_checkin(h, sizeof(*i)))) - return; - - switch (i->rtm_family) - { - case AF_INET: - if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want4, a, sizeof(a))) - return; - - if (a[RTA_DST]) - net_fill_ip4(&dst, rta_get_ip4(a[RTA_DST]), i->rtm_dst_len); - else - net_fill_ip4(&dst, IP4_NONE, 0); - break; - - case AF_INET6: - if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want6, a, sizeof(a))) - return; - - if (a[RTA_DST]) - net_fill_ip6(&dst, rta_get_ip6(a[RTA_DST]), i->rtm_dst_len); - else - net_fill_ip6(&dst, IP6_NONE, 0); - - if (a[RTA_SRC]) - net_fill_ip6(&src, rta_get_ip6(a[RTA_SRC]), i->rtm_src_len); - else - net_fill_ip6(&src, IP6_NONE, 0); - break; - -#ifdef HAVE_MPLS_KERNEL - case AF_MPLS: - if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want_mpls, a, sizeof(a))) - return; - - if (!a[RTA_DST]) - SKIP0("MPLS route without RTA_DST\n"); - - if (rta_get_mpls(a[RTA_DST], rta_mpls_stack) != 1) - SKIP0("MPLS route with multi-label RTA_DST\n"); - - net_fill_mpls(&dst, rta_mpls_stack[0]); - break; -#endif - - default: - return; - } - - if (a[RTA_OIF]) - oif = rta_get_u32(a[RTA_OIF]); - - if (a[RTA_TABLE]) - table_id = rta_get_u32(a[RTA_TABLE]); - else - table_id = i->rtm_table; - - if (i->rtm_flags & RTM_F_CLONED) - SKIP("cloned\n"); - - /* Do we know this table? */ - p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id); - if (!p) - SKIP("unknown table %u\n", table_id); - - if (a[RTA_SRC] && (p->p.net_type != NET_IP6_SADR)) - SKIP("src prefix for non-SADR channel\n"); - - if (a[RTA_IIF]) - SKIP("IIF set\n"); - - if (i->rtm_tos != 0) /* We don't support TOS */ - SKIP("TOS %02x\n", i->rtm_tos); - - if (s->scan && !new) - SKIP("RTM_DELROUTE in scan\n"); - - if (a[RTA_PRIORITY]) - priority = rta_get_u32(a[RTA_PRIORITY]); - - int c = net_classify(&dst); - if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK)) - SKIP("strange class/scope\n"); - - switch (i->rtm_protocol) - { - case RTPROT_UNSPEC: - SKIP("proto unspec\n"); - - case RTPROT_REDIRECT: - krt_src = KRT_SRC_REDIRECT; - break; - - case RTPROT_KERNEL: - krt_src = KRT_SRC_KERNEL; - return; - - case RTPROT_BIRD: - if (!s->scan) - SKIP("echo\n"); - krt_src = KRT_SRC_BIRD; - break; - - case RTPROT_BOOT: - default: - krt_src = KRT_SRC_ALIEN; - } - - net_addr *n = &dst; - if (p->p.net_type == NET_IP6_SADR) - { - n = alloca(sizeof(net_addr_ip6_sadr)); - net_fill_ip6_sadr(n, net6_prefix(&dst), net6_pxlen(&dst), - net6_prefix(&src), net6_pxlen(&src)); - } - - net *net = net_get(p->p.main_channel->table, n); - - if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type, i->rtm_family)) - nl_announce_route(s); - - rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE); - ra->src = p->p.main_source; - ra->source = RTS_INHERIT; - ra->scope = SCOPE_UNIVERSE; - - if (a[RTA_FLOW]) - s->rta_flow = rta_get_u32(a[RTA_FLOW]); - else - s->rta_flow = 0; - - switch (i->rtm_type) - { - case RTN_UNICAST: - ra->dest = RTD_UNICAST; - - if (a[RTA_MULTIPATH]) - { - struct nexthop *nh = nl_parse_multipath(s, p, n, a[RTA_MULTIPATH], i->rtm_family, krt_src); - if (!nh) - SKIP("strange RTA_MULTIPATH\n"); - - nexthop_link(ra, nh); - break; - } - - if ((i->rtm_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) - SKIP("ignore RTNH_F_DEAD\n"); - - ra->nh.iface = if_find_by_index(oif); - if (!ra->nh.iface) - { - log(L_ERR "KRT: Received route %N with unknown ifindex %u", net->n.addr, oif); - return; - } - - if (a[RTA_GATEWAY]) - ra->nh.gw = rta_get_ipa(a[RTA_GATEWAY]); - -#ifdef HAVE_MPLS_KERNEL - if (a[RTA_VIA]) - ra->nh.gw = rta_get_via(a[RTA_VIA]); -#endif - - if (ipa_nonzero(ra->nh.gw)) - { - /* Silently skip strange 6to4 routes */ - const net_addr_ip6 sit = NET_ADDR_IP6(IP6_NONE, 96); - if ((i->rtm_family == AF_INET6) && ipa_in_netX(ra->nh.gw, (net_addr *) &sit)) - return; - - if (i->rtm_flags & RTNH_F_ONLINK) - ra->nh.flags |= RNF_ONLINK; - - neighbor *nbr; - nbr = neigh_find(&p->p, ra->nh.gw, ra->nh.iface, - (ra->nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0); - if (!nbr || (nbr->scope == SCOPE_HOST)) - { - log(L_ERR "KRT: Received route %N with strange next-hop %I", net->n.addr, - ra->nh.gw); - return; - } - } - - break; - case RTN_BLACKHOLE: - ra->dest = RTD_BLACKHOLE; - break; - case RTN_UNREACHABLE: - ra->dest = RTD_UNREACHABLE; - break; - case RTN_PROHIBIT: - ra->dest = RTD_PROHIBIT; - break; - /* FIXME: What about RTN_THROW? */ - default: - SKIP("type %d\n", i->rtm_type); - return; - } - -#ifdef HAVE_MPLS_KERNEL - if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !ra->nh.next) - ra->nh.labels = rta_get_mpls(a[RTA_NEWDST], ra->nh.label); - - if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !ra->nh.next) - { - switch (rta_get_u16(a[RTA_ENCAP_TYPE])) - { - case LWTUNNEL_ENCAP_MPLS: - { - struct rtattr *enca[BIRD_RTA_MAX]; - nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]); - nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca)); - ra->nh.labels = rta_get_mpls(enca[RTA_DST], ra->nh.label); - break; - } - default: - SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE])); - break; - } - } -#endif - - if (i->rtm_scope != def_scope) - { - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = ra->eattrs; - ra->eattrs = ea; - ea->flags = EALF_SORTED; - ea->count = 1; - ea->attrs[0].id = EA_KRT_SCOPE; - ea->attrs[0].flags = 0; - ea->attrs[0].type = EAF_TYPE_INT; - ea->attrs[0].u.data = i->rtm_scope; - } - - if (a[RTA_PREFSRC]) - { - ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]); - - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = ra->eattrs; - ra->eattrs = ea; - ea->flags = EALF_SORTED; - ea->count = 1; - ea->attrs[0].id = EA_KRT_PREFSRC; - ea->attrs[0].flags = 0; - ea->attrs[0].type = EAF_TYPE_IP_ADDRESS; - - struct adata *ad = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps)); - ad->length = sizeof(ps); - memcpy(ad->data, &ps, sizeof(ps)); - - ea->attrs[0].u.ptr = ad; - } - - /* Can be set per-route or per-nexthop */ - if (s->rta_flow) - { - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = ra->eattrs; - ra->eattrs = ea; - ea->flags = EALF_SORTED; - ea->count = 1; - ea->attrs[0].id = EA_KRT_REALM; - ea->attrs[0].flags = 0; - ea->attrs[0].type = EAF_TYPE_INT; - ea->attrs[0].u.data = s->rta_flow; - } - - if (a[RTA_METRICS]) - { - u32 metrics[KRT_METRICS_MAX]; - ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr)); - int t, n = 0; - - if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0) - { - log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net->n.addr); - return; - } - - for (t = 1; t < KRT_METRICS_MAX; t++) - if (metrics[0] & (1 << t)) - { - ea->attrs[n].id = EA_CODE(PROTOCOL_KERNEL, KRT_METRICS_OFFSET + t); - ea->attrs[n].flags = 0; - ea->attrs[n].type = EAF_TYPE_INT; /* FIXME: Some are EAF_TYPE_BITFIELD */ - ea->attrs[n].u.data = metrics[t]; - n++; - } - - if (n > 0) - { - ea->next = ra->eattrs; - ea->flags = EALF_SORTED; - ea->count = n; - ra->eattrs = ea; - } - } - - /* - * Ideally, now we would send the received route to the rest of kernel code. - * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we - * postpone it and merge next hops until the end of the sequence. Note that - * when doing merging of next hops, we expect the new route to be unipath. - * Otherwise, we ignore additional next hops in nexthop_insert(). - */ - - if (!s->net) - { - /* Store the new route */ - s->net = net; - s->attrs = ra; - s->proto = p; - s->new = new; - s->krt_src = krt_src; - s->krt_type = i->rtm_type; - s->krt_proto = i->rtm_protocol; - s->krt_metric = priority; - } - else - { - /* Merge next hops with the stored route */ - rta *oa = s->attrs; - - struct nexthop *nhs = &oa->nh; - nexthop_insert(&nhs, &ra->nh); - - /* Perhaps new nexthop is inserted at the first position */ - if (nhs == &ra->nh) - { - /* Swap rtas */ - s->attrs = ra; - - /* Keep old eattrs */ - ra->eattrs = oa->eattrs; - } - } -} - -void -krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */ -{ - struct nlmsghdr *h; - struct nl_parse_state s; - - nl_parse_begin(&s, 1); - nl_request_dump(AF_UNSPEC, RTM_GETROUTE); - while (h = nl_get_scan()) - if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) - nl_parse_route(&s, h); - else - log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); - nl_parse_end(&s); -} - -/* - * Asynchronous Netlink interface - */ - -static sock *nl_async_sk; /* BIRD socket for asynchronous notifications */ -static byte *nl_async_rx_buffer; /* Receive buffer */ - -static void -nl_async_msg(struct nlmsghdr *h) -{ - struct nl_parse_state s; - - switch (h->nlmsg_type) - { - case RTM_NEWROUTE: - case RTM_DELROUTE: - DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type); - nl_parse_begin(&s, 0); - nl_parse_route(&s, h); - nl_parse_end(&s); - break; - case RTM_NEWLINK: - case RTM_DELLINK: - DBG("KRT: Received async link notification (%d)\n", h->nlmsg_type); - if (kif_proto) - nl_parse_link(h, 0); - break; - case RTM_NEWADDR: - case RTM_DELADDR: - DBG("KRT: Received async address notification (%d)\n", h->nlmsg_type); - if (kif_proto) - nl_parse_addr(h, 0); - break; - default: - DBG("KRT: Received unknown async notification (%d)\n", h->nlmsg_type); - } -} - -static int -nl_async_hook(sock *sk, uint size UNUSED) -{ - struct iovec iov = { nl_async_rx_buffer, NL_RX_SIZE }; - struct sockaddr_nl sa; - struct msghdr m = { - .msg_name = &sa, - .msg_namelen = sizeof(sa), - .msg_iov = &iov, - .msg_iovlen = 1, - }; - struct nlmsghdr *h; - int x; - uint len; - - x = recvmsg(sk->fd, &m, 0); - if (x < 0) - { - if (errno == ENOBUFS) - { - /* - * Netlink reports some packets have been thrown away. - * One day we might react to it by asking for route table - * scan in near future. - */ - log(L_WARN "Kernel dropped some netlink messages, will resync on next scan."); - return 1; /* More data are likely to be ready */ - } - else if (errno != EWOULDBLOCK) - log(L_ERR "Netlink recvmsg: %m"); - return 0; - } - if (sa.nl_pid) /* It isn't from the kernel */ - { - DBG("Non-kernel packet\n"); - return 1; - } - h = (void *) nl_async_rx_buffer; - len = x; - if (m.msg_flags & MSG_TRUNC) - { - log(L_WARN "Netlink got truncated asynchronous message"); - return 1; - } - while (NLMSG_OK(h, len)) - { - nl_async_msg(h); - h = NLMSG_NEXT(h, len); - } - if (len) - log(L_WARN "nl_async_hook: Found packet remnant of size %d", len); - return 1; -} - -static void -nl_async_err_hook(sock *sk, int e UNUSED) -{ - nl_async_hook(sk, 0); -} - -static void -nl_open_async(void) -{ - sock *sk; - struct sockaddr_nl sa; - int fd; - - if (nl_async_sk) - return; - - DBG("KRT: Opening async netlink socket\n"); - - fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE); - if (fd < 0) - { - log(L_ERR "Unable to open asynchronous rtnetlink socket: %m"); - return; - } - - bzero(&sa, sizeof(sa)); - sa.nl_family = AF_NETLINK; - sa.nl_groups = RTMGRP_LINK | - RTMGRP_IPV4_IFADDR | RTMGRP_IPV4_ROUTE | - RTMGRP_IPV6_IFADDR | RTMGRP_IPV6_ROUTE; - - if (bind(fd, (struct sockaddr *) &sa, sizeof(sa)) < 0) - { - log(L_ERR "Unable to bind asynchronous rtnetlink socket: %m"); - close(fd); - return; - } - - nl_async_rx_buffer = xmalloc(NL_RX_SIZE); - - sk = nl_async_sk = sk_new(krt_pool); - sk->type = SK_MAGIC; - sk->rx_hook = nl_async_hook; - sk->err_hook = nl_async_err_hook; - sk->fd = fd; - if (sk_open(sk) < 0) - bug("Netlink: sk_open failed"); -} - - -/* - * Interface to the UNIX krt module - */ - -void -krt_sys_io_init(void) -{ - nl_linpool = lp_new_default(krt_pool); - HASH_INIT(nl_table_map, krt_pool, 6); -} - -int -krt_sys_start(struct krt_proto *p) -{ - struct krt_proto *old = HASH_FIND(nl_table_map, RTH, p->af, krt_table_id(p)); - - if (old) - { - log(L_ERR "%s: Kernel table %u already registered by %s", - p->p.name, krt_table_id(p), old->p.name); - return 0; - } - - HASH_INSERT2(nl_table_map, RTH, krt_pool, p); - - nl_open(); - nl_open_async(); - - return 1; -} - -void -krt_sys_shutdown(struct krt_proto *p) -{ - HASH_REMOVE2(nl_table_map, RTH, krt_pool, p); -} - -int -krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt_config *o) -{ - return (n->sys.table_id == o->sys.table_id) && (n->sys.metric == o->sys.metric); -} - -void -krt_sys_init_config(struct krt_config *cf) -{ - cf->sys.table_id = RT_TABLE_MAIN; - cf->sys.metric = 32; -} - -void -krt_sys_copy_config(struct krt_config *d, struct krt_config *s) -{ - d->sys.table_id = s->sys.table_id; - d->sys.metric = s->sys.metric; -} - -static const char *krt_metrics_names[KRT_METRICS_MAX] = { - NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss", - "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack" -}; - -static const char *krt_features_names[KRT_FEATURES_MAX] = { - "ecn", NULL, NULL, "allfrag" -}; - -int -krt_sys_get_attr(const eattr *a, byte *buf, int buflen UNUSED) -{ - switch (a->id) - { - case EA_KRT_PREFSRC: - bsprintf(buf, "prefsrc"); - return GA_NAME; - - case EA_KRT_REALM: - bsprintf(buf, "realm"); - return GA_NAME; - - case EA_KRT_SCOPE: - bsprintf(buf, "scope"); - return GA_NAME; - - case EA_KRT_LOCK: - buf += bsprintf(buf, "lock:"); - ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX); - return GA_FULL; - - case EA_KRT_FEATURES: - buf += bsprintf(buf, "features:"); - ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX); - return GA_FULL; - - default:; - int id = (int)EA_ID(a->id) - KRT_METRICS_OFFSET; - if (id > 0 && id < KRT_METRICS_MAX) - { - bsprintf(buf, "%s", krt_metrics_names[id]); - return GA_NAME; - } - - return GA_UNKNOWN; - } -} - - - -void -kif_sys_start(struct kif_proto *p UNUSED) -{ - nl_open(); - nl_open_async(); -} - -void -kif_sys_shutdown(struct kif_proto *p UNUSED) -{ -} - -int -kif_update_sysdep_addr(struct iface *i UNUSED) -{ - return 0; -} diff --git a/sysdep/unix/Makefile b/sysdep/unix/Makefile index 07f454ab..6f6b0d26 100644 --- a/sysdep/unix/Makefile +++ b/sysdep/unix/Makefile @@ -1,4 +1,4 @@ -src := alloc.c io.c io-loop.c krt.c log.c main.c random.c coroutine.c +src := alloc.c io.c io-loop.c krt.c log.c main.c random.c domain.c obj := $(src-o-files) $(all-daemon) $(cf-local) diff --git a/sysdep/unix/alloc.c b/sysdep/unix/alloc.c index c09a8356..cafcc8dd 100644 --- a/sysdep/unix/alloc.c +++ b/sysdep/unix/alloc.c @@ -10,239 +10,324 @@ #include "lib/resource.h" #include "lib/lists.h" #include "lib/event.h" +#include "lib/rcu.h" -#include "sysdep/unix/io-loop.h" - +#include <errno.h> #include <stdlib.h> #include <unistd.h> -#include <stdatomic.h> -#include <errno.h> #ifdef HAVE_MMAP #include <sys/mman.h> #endif +#ifdef CONFIG_DISABLE_THP +#include <sys/prctl.h> +#ifndef PR_SET_THP_DISABLE +#define PR_SET_THP_DISABLE 41 +#endif +#endif + long page_size = 0; #ifdef HAVE_MMAP -#if DEBUGGING -#define FP_NODE_OFFSET 42 -#else -#define FP_NODE_OFFSET 1 -#endif +#define KEEP_PAGES_MAX 512 +#define KEEP_PAGES_MIN 32 +#define KEEP_PAGES_MAX_LOCAL 16 +#define ALLOC_PAGES_AT_ONCE 8 + +STATIC_ASSERT(KEEP_PAGES_MIN * 4 < KEEP_PAGES_MAX); +STATIC_ASSERT(ALLOC_PAGES_AT_ONCE < KEEP_PAGES_MAX_LOCAL); + static _Bool use_fake = 0; +static _Bool initialized = 0; + +#if DEBUGGING +struct free_page { + node unused[42]; + struct free_page * _Atomic next; +}; #else -static _Bool use_fake = 1; +struct free_page { + struct free_page * _Atomic next; +}; #endif +#define EP_POS_MAX ((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *)) + +struct empty_pages { + struct empty_pages *next; + uint pos; + void *pages[0]; +}; + +DEFINE_DOMAIN(resource); +static DOMAIN(resource) empty_pages_domain; +static struct empty_pages *empty_pages = NULL; + +static struct free_page * _Atomic page_stack = NULL; +static _Thread_local struct free_page * local_page_stack = NULL; + +static void page_cleanup(void *); +static event page_cleanup_event = { .hook = page_cleanup, }; +#define SCHEDULE_CLEANUP do if (initialized && !shutting_down) ev_send(&global_event_list, &page_cleanup_event); while (0) + +_Atomic int pages_kept = 0; +_Atomic int pages_kept_locally = 0; +static _Thread_local int pages_kept_here = 0; + static void * alloc_sys_page(void) { - void *ptr = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + void *ptr = mmap(NULL, page_size * ALLOC_PAGES_AT_ONCE, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (ptr == MAP_FAILED) - bug("mmap(%lu) failed: %m", page_size); + die("mmap(%ld) failed: %m", (s64) page_size); return ptr; } extern int shutting_down; /* Shutdown requested. */ +#else // ! HAVE_MMAP +#define use_fake 1 +#endif + void * alloc_page(void) { -#ifdef HAVE_MMAP - if (!use_fake) + /* If the system page allocator is goofy, we use posix_memalign to get aligned blocks of memory. */ + if (use_fake) { - struct free_pages *fp = &birdloop_current->pages; - if (!fp->cnt) - return alloc_sys_page(); - - node *n = HEAD(fp->list); - rem_node(n); - if ((--fp->cnt < fp->min) && !shutting_down) - ev_send(fp->cleanup->list, fp->cleanup); - - void *ptr = n - FP_NODE_OFFSET; - memset(ptr, 0, page_size); + void *ptr = NULL; + int err = posix_memalign(&ptr, page_size, page_size); + + if (err || !ptr) + die("posix_memalign(%ld) failed", (s64) page_size); + return ptr; } - else -#endif - { -#ifdef HAVE_ALIGNED_ALLOC - void *ret = aligned_alloc(page_size, page_size); - if (!ret) - bug("aligned_alloc(%lu) failed", page_size); - return ret; -#else - bug("BIRD should have already died on fatal error."); -#endif - } -} -void -free_page(void *ptr) -{ #ifdef HAVE_MMAP - if (!use_fake) + /* If there is any free page kept hot in this thread, we use it. */ + struct free_page *fp = local_page_stack; + if (fp) { - struct free_pages *fp = &birdloop_current->pages; - struct node *n = ptr; - n += FP_NODE_OFFSET; - - memset(n, 0, sizeof(node)); - add_tail(&fp->list, n); - if ((++fp->cnt > fp->max) && !shutting_down) - ev_send(fp->cleanup->list, fp->cleanup); + local_page_stack = atomic_load_explicit(&fp->next, memory_order_acquire); + atomic_fetch_sub_explicit(&pages_kept_locally, 1, memory_order_relaxed); + pages_kept_here--; + return fp; } - else -#endif - free(ptr); -} -#ifdef HAVE_MMAP + ASSERT_DIE(pages_kept_here == 0); -#define GFP (&main_birdloop.pages) + /* If there is any free page kept hot in global storage, we use it. */ + rcu_read_lock(); + fp = atomic_load_explicit(&page_stack, memory_order_acquire); + while (fp && !atomic_compare_exchange_strong_explicit( + &page_stack, &fp, atomic_load_explicit(&fp->next, memory_order_acquire), + memory_order_acq_rel, memory_order_acquire)) + ; + rcu_read_unlock(); -void -flush_pages(struct birdloop *loop) -{ - ASSERT_DIE(birdloop_inside(loop->parent->loop)); + if (fp) + { + atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed); + return fp; + } - struct free_pages *fp = &loop->pages; - struct free_pages *pfp = &loop->parent->loop->pages; + /* If there is any free page kept cold, we use that. */ + LOCK_DOMAIN(resource, empty_pages_domain); + if (empty_pages) { + if (empty_pages->pos) + /* Either the keeper page contains at least one cold page pointer, return that */ + fp = empty_pages->pages[--empty_pages->pos]; + else + { + /* Or the keeper page has no more cold page pointer, return the keeper page */ + fp = (struct free_page *) empty_pages; + empty_pages = empty_pages->next; + } + } + UNLOCK_DOMAIN(resource, empty_pages_domain); - add_tail_list(&pfp->list, &fp->list); - pfp->cnt += fp->cnt; - - fp->cnt = 0; - fp->list = (list) {}; - fp->min = 0; - fp->max = 0; + if (fp) + return fp; - rfree(fp->cleanup); - fp->cleanup = NULL; + /* And in the worst case, allocate some new pages by mmap() */ + void *ptr = alloc_sys_page(); + for (int i=1; i<ALLOC_PAGES_AT_ONCE; i++) + free_page(ptr + page_size * i); + + return ptr; +#endif } -static void -cleanup_pages(void *data) +void +free_page(void *ptr) { - struct birdloop *loop = data; - birdloop_enter(loop); - - ASSERT_DIE(birdloop_inside(loop->parent->loop)); - - struct free_pages *fp = &loop->pages; - struct free_pages *pfp = &loop->parent->loop->pages; - - while ((fp->cnt < fp->min) && (pfp->cnt > pfp->min)) + /* If the system page allocator is goofy, we just free the block and care no more. */ + if (use_fake) { - node *n = HEAD(pfp->list); - rem_node(n); - add_tail(&fp->list, n); - fp->cnt++; - pfp->cnt--; - } - - while (fp->cnt < fp->min) - { - node *n = alloc_sys_page(); - add_tail(&fp->list, n + FP_NODE_OFFSET); - fp->cnt++; + free(ptr); + return; } - while (fp->cnt > fp->max) +#ifdef HAVE_MMAP + /* We primarily try to keep the pages locally. */ + struct free_page *fp = ptr; + if (shutting_down || (pages_kept_here < KEEP_PAGES_MAX_LOCAL)) { - node *n = HEAD(fp->list); - rem_node(n); - add_tail(&pfp->list, n); - fp->cnt--; - pfp->cnt++; + atomic_store_explicit(&fp->next, local_page_stack, memory_order_relaxed); + local_page_stack = fp; + + atomic_fetch_add_explicit(&pages_kept_locally, 1, memory_order_relaxed); + pages_kept_here++; + return; } - birdloop_leave(loop); + /* If there are too many local pages, we add the free page to the global hot-free-page list */ + rcu_read_lock(); + struct free_page *next = atomic_load_explicit(&page_stack, memory_order_acquire); - if (!shutting_down && (pfp->cnt > pfp->max)) - ev_send(pfp->cleanup->list, pfp->cleanup); + do atomic_store_explicit(&fp->next, next, memory_order_release); + while (!atomic_compare_exchange_strong_explicit( + &page_stack, &next, fp, + memory_order_acq_rel, memory_order_acquire)); + rcu_read_unlock(); + + /* And if there are too many global hot free pages, we ask for page cleanup */ + if (atomic_fetch_add_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX) + SCHEDULE_CLEANUP; +#endif } -static void -cleanup_global_pages(void *data UNUSED) +/* When the routine is going to sleep for a long time, we flush the local + * hot page cache to not keep dirty pages for nothing. */ +void +flush_local_pages(void) { - while (GFP->cnt < GFP->max) - { - node *n = alloc_sys_page(); - add_tail(&GFP->list, n + FP_NODE_OFFSET); - GFP->cnt++; - } + if (use_fake || !local_page_stack || shutting_down) + return; - for (uint limit = GFP->cnt; (limit > 0) && (GFP->cnt > GFP->max); limit--) + /* We first count the pages to enable consistency checking. + * Also, we need to know the last page. */ + struct free_page *last = local_page_stack, *next; + int check_count = 1; + while (next = atomic_load_explicit(&last->next, memory_order_acquire)) { - node *n = TAIL(GFP->list); - rem_node(n); - - if (munmap(n - FP_NODE_OFFSET, page_size) == 0) - GFP->cnt--; - else if (errno == ENOMEM) - add_head(&GFP->list, n); - else - bug("munmap(%p) failed: %m", n - FP_NODE_OFFSET); + check_count++; + last = next; } + + /* The actual number of pages must be equal to the counter value. */ + ASSERT_DIE(check_count == pages_kept_here); + + /* Repeatedly trying to insert the whole page list into global page stack at once. */ + rcu_read_lock(); + next = atomic_load_explicit(&page_stack, memory_order_acquire); + + /* First we set the outwards pointer (from our last), + * then we try to set the inwards pointer to our first page. */ + do atomic_store_explicit(&last->next, next, memory_order_release); + while (!atomic_compare_exchange_strong_explicit( + &page_stack, &next, local_page_stack, + memory_order_acq_rel, memory_order_acquire)); + rcu_read_unlock(); + + /* Finished. Now the local stack is empty. */ + local_page_stack = NULL; + pages_kept_here = 0; + + /* Check the state of global page cache and maybe schedule its cleanup. */ + atomic_fetch_sub_explicit(&pages_kept_locally, check_count, memory_order_relaxed); + if (atomic_fetch_add_explicit(&pages_kept, check_count, memory_order_relaxed) >= KEEP_PAGES_MAX) + SCHEDULE_CLEANUP; } -void -init_pages(struct birdloop *loop) +#ifdef HAVE_MMAP +static void +page_cleanup(void *_ UNUSED) { - struct free_pages *fp = &loop->pages; + /* Cleanup on shutdown is ignored. All pages may be kept hot, OS will take care. */ + if (shutting_down) + return; + + struct free_page *stack = atomic_exchange_explicit(&page_stack, NULL, memory_order_acq_rel); + if (!stack) + return; - init_list(&fp->list); - fp->cleanup = ev_new_init(loop->parent->loop->pool, cleanup_pages, loop); - fp->cleanup->list = (loop->parent->loop == &main_birdloop) ? &global_work_list : birdloop_event_list(loop->parent->loop); - fp->min = 4; - fp->max = 16; - for (fp->cnt = 0; fp->cnt < fp->min; fp->cnt++) + do { + synchronize_rcu(); + struct free_page *fp = stack; + stack = atomic_load_explicit(&fp->next, memory_order_acquire); + + LOCK_DOMAIN(resource, empty_pages_domain); + /* Empty pages are stored as pointers. To store them, we need a pointer block. */ + if (!empty_pages || (empty_pages->pos == EP_POS_MAX)) + { + /* There is either no pointer block or the last block is full. We use this block as a pointer block. */ + empty_pages = (struct empty_pages *) fp; + *empty_pages = (struct empty_pages) {}; + } + else + { + /* We store this block as a pointer into the first free place + * and tell the OS that the underlying memory is trash. */ + empty_pages->pages[empty_pages->pos++] = fp; + if (madvise(fp, page_size, +#ifdef CONFIG_MADV_DONTNEED_TO_FREE + MADV_DONTNEED +#else + MADV_FREE +#endif + ) < 0) + bug("madvise(%p) failed: %m", fp); + } + UNLOCK_DOMAIN(resource, empty_pages_domain); + } + while ((atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX / 2) && stack); + + while (stack) { - node *n = alloc_sys_page(); - add_tail(&fp->list, n + FP_NODE_OFFSET); + struct free_page *f = stack; + stack = atomic_load_explicit(&f->next, memory_order_acquire); + free_page(f); + + atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed); } } +#endif -static event global_free_pages_cleanup_event = { .hook = cleanup_global_pages, .list = &global_work_list }; - -void resource_sys_init(void) +void +resource_sys_init(void) { +#ifdef CONFIG_DISABLE_THP + /* Disable transparent huge pages, they do not work properly with madvice(MADV_DONTNEED) */ + if (prctl(PR_SET_THP_DISABLE, (unsigned long) 1, (unsigned long) 0, (unsigned long) 0, (unsigned long) 0) < 0) + log(L_WARN "Cannot disable transparent huge pages: prctl(PR_SET_THP_DISABLE) failed: %m"); +#endif + +#ifdef HAVE_MMAP + /* Check what page size the system supports */ if (!(page_size = sysconf(_SC_PAGESIZE))) die("System page size must be non-zero"); - if (u64_popcount(page_size) == 1) + if ((u64_popcount(page_size) == 1) && (page_size >= (1 << 10)) && (page_size <= (1 << 18))) { - init_list(&GFP->list); - GFP->cleanup = &global_free_pages_cleanup_event; - GFP->min = 0; - GFP->max = 256; + /* We assume that page size has only one bit and is between 1K and 256K (incl.). + * Otherwise, the assumptions in lib/slab.c (sl_head's num_full range) aren't met. */ + + empty_pages_domain = DOMAIN_NEW(resource, "Empty Pages"); + initialized = 1; return; } -#ifdef HAVE_ALIGNED_ALLOC - log(L_WARN "Got strange memory page size (%lu), using the aligned allocator instead", page_size); -#else - die("Got strange memory page size (%lu) and aligned_alloc is not available", page_size); -#endif - /* Too big or strange page, use the aligned allocator instead */ - page_size = 4096; + log(L_WARN "Got strange memory page size (%ld), using the aligned allocator instead", (s64) page_size); use_fake = 1; -} - -#else +#endif -void -resource_sys_init(void) -{ page_size = 4096; - use_fake = 1; + initialized = 1; } - -#endif diff --git a/sysdep/unix/config.Y b/sysdep/unix/config.Y index 5c4b5bef..a50ec757 100644 --- a/sysdep/unix/config.Y +++ b/sysdep/unix/config.Y @@ -101,6 +101,12 @@ mrtdump_base: ; +conf: THREADS expr { + if ($2 < 1) cf_error("Number of threads must be at least one."); + new_config->thread_count = $2; +} + + conf: debug_unix ; debug_unix: @@ -145,6 +151,11 @@ CF_CLI_HELP(GRACEFUL, restart, [[Shut the daemon down for graceful restart]]) CF_CLI(GRACEFUL RESTART,,, [[Shut the daemon down for graceful restart]]) { cmd_graceful_restart(); } ; +CF_CLI(SHOW THREADS,,, [[Write out thread information]]) +{ cmd_show_threads(0); } ; + +CF_CLI(SHOW THREADS ALL,,, [[Write out thread and IO loop information]]) +{ cmd_show_threads(1); } ; cfg_name: /* empty */ { $$ = NULL; } diff --git a/sysdep/unix/coroutine.c b/sysdep/unix/domain.c index 2068afd5..f4ee595d 100644 --- a/sysdep/unix/coroutine.c +++ b/sysdep/unix/domain.c @@ -1,7 +1,6 @@ /* - * BIRD Coroutines + * BIRD Locking * - * (c) 2017 Martin Mares <mj@ucw.cz> * (c) 2020 Maria Matejka <mq@jmq.cz> * * Can be freely distributed and used under the terms of the GNU GPL. @@ -17,19 +16,11 @@ #include "lib/birdlib.h" #include "lib/locking.h" -#include "lib/coro.h" -#include "lib/rcu.h" #include "lib/resource.h" #include "lib/timer.h" #include "conf/conf.h" -#define CORO_STACK_SIZE 65536 - -/* - * Implementation of coroutines based on POSIX threads - */ - #include <errno.h> #include <fcntl.h> #include <poll.h> @@ -79,6 +70,12 @@ domain_free(struct domain_generic *dg) xfree(dg); } +const char * +domain_name(struct domain_generic *dg) +{ + return dg->name; +} + uint dg_order(struct domain_generic *dg) { return dg->order; @@ -86,11 +83,15 @@ uint dg_order(struct domain_generic *dg) void do_lock(struct domain_generic *dg, struct domain_generic **lsp) { - if ((char *) lsp - (char *) &locking_stack != (int) dg->order) + struct lock_order stack_copy; + memcpy(&stack_copy, &locking_stack, sizeof(stack_copy)); + struct domain_generic **lll = last_locked; + + if ((char *) lsp - (char *) &locking_stack != dg->order) bug("Trying to lock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack); if (lsp <= last_locked) - bug("Trying to lock in a bad order"); + bug("Trying to lock in a bad order: %p %p", &stack_copy, lll); if (*lsp) bug("Inconsistent locking stack state on lock"); @@ -110,7 +111,7 @@ void do_lock(struct domain_generic *dg, struct domain_generic **lsp) void do_unlock(struct domain_generic *dg, struct domain_generic **lsp) { - if ((char *) lsp - (char *) &locking_stack != (int) dg->order) + if ((char *) lsp - (char *) &locking_stack != dg->order) bug("Trying to unlock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack); if (dg->locked_by != &locking_stack) @@ -123,84 +124,3 @@ void do_unlock(struct domain_generic *dg, struct domain_generic **lsp) dg->prev = NULL; pthread_mutex_unlock(&dg->mutex); } - -/* Coroutines */ -struct coroutine { - resource r; - pthread_t id; - pthread_attr_t attr; - struct rcu_coro rcu; - void (*entry)(void *); - void *data; -}; - -static _Thread_local _Bool coro_cleaned_up = 0; - -static void coro_free(resource *r) -{ - struct coroutine *c = (void *) r; - rcu_coro_stop(&c->rcu); - ASSERT_DIE(pthread_equal(pthread_self(), c->id)); - pthread_attr_destroy(&c->attr); - coro_cleaned_up = 1; -} - -static void coro_dump(resource *r UNUSED) { } - -static struct resclass coro_class = { - .name = "Coroutine", - .size = sizeof(struct coroutine), - .free = coro_free, - .dump = coro_dump, -}; - -_Thread_local struct coroutine *this_coro = NULL; - -static void *coro_entry(void *p) -{ - struct coroutine *c = p; - - ASSERT_DIE(c->entry); - - this_coro = c; - rcu_coro_start(&c->rcu); - - c->entry(c->data); - ASSERT_DIE(coro_cleaned_up); - - return NULL; -} - -struct coroutine *coro_run(pool *p, void (*entry)(void *), void *data) -{ - ASSERT_DIE(entry); - ASSERT_DIE(p); - - struct coroutine *c = ralloc(p, &coro_class); - - c->entry = entry; - c->data = data; - - int e = 0; - - if (e = pthread_attr_init(&c->attr)) - die("pthread_attr_init() failed: %M", e); - - if (e = pthread_attr_setstacksize(&c->attr, CORO_STACK_SIZE)) - die("pthread_attr_setstacksize(%u) failed: %M", CORO_STACK_SIZE, e); - - if (e = pthread_attr_setdetachstate(&c->attr, PTHREAD_CREATE_DETACHED)) - die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e); - - if (e = pthread_create(&c->id, &c->attr, coro_entry, c)) - die("pthread_create() failed: %M", e); - - return c; -} - -void -coro_yield(void) -{ - const struct timespec req = { .tv_nsec = 100 }; - nanosleep(&req, NULL); -} diff --git a/sysdep/unix/io-loop.c b/sysdep/unix/io-loop.c index 9b107a1a..efb408e0 100644 --- a/sysdep/unix/io-loop.c +++ b/sysdep/unix/io-loop.c @@ -17,8 +17,8 @@ #include "nest/bird.h" #include "lib/buffer.h" -#include "lib/coro.h" #include "lib/lists.h" +#include "lib/locking.h" #include "lib/resource.h" #include "lib/event.h" #include "lib/timer.h" @@ -27,15 +27,54 @@ #include "lib/io-loop.h" #include "sysdep/unix/io-loop.h" #include "conf/conf.h" +#include "nest/cli.h" + +#define THREAD_STACK_SIZE 65536 /* To be lowered in near future */ + +static struct birdloop *birdloop_new_internal(pool *pp, uint order, const char *name, int request_pickup, struct birdloop_pickup_group *group); + +/* + * Nanosecond time for accounting purposes + * + * A fixed point on startup is set as zero, all other values are relative to that. + * Caution: this overflows after like 500 years or so. If you plan to run + * BIRD for such a long time, please implement some means of overflow prevention. + */ + +static struct timespec ns_begin; + +static void ns_init(void) +{ + if (clock_gettime(CLOCK_MONOTONIC, &ns_begin)) + bug("clock_gettime: %m"); +} + +static u64 ns_now(void) +{ + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC, &ts)) + bug("clock_gettime: %m"); + + return (u64) (ts.tv_sec - ns_begin.tv_sec) * 1000000000 + ts.tv_nsec - ns_begin.tv_nsec; +} + /* * Current thread context */ -_Thread_local struct birdloop *birdloop_current = NULL; +_Thread_local struct birdloop *birdloop_current; static _Thread_local struct birdloop *birdloop_wakeup_masked; static _Thread_local uint birdloop_wakeup_masked_count; +#define LOOP_NAME(loop) domain_name((loop)->time.domain) + +#define LOOP_TRACE(loop, fmt, args...) do { if (config && config->latency_debug) log(L_TRACE "%s (%p): " fmt, LOOP_NAME(loop), (loop), ##args); } while (0) +#define THREAD_TRACE(...) do { if (config && config->latency_debug) log(L_TRACE "Thread: " __VA_ARGS__); } while (0) + +#define LOOP_WARN(loop, fmt, args...) log(L_TRACE "%s (%p): " fmt, LOOP_NAME(loop), (loop), ##args) + + event_list * birdloop_event_list(struct birdloop *loop) { @@ -48,12 +87,6 @@ birdloop_time_loop(struct birdloop *loop) return &loop->time; } -pool * -birdloop_pool(struct birdloop *loop) -{ - return loop->pool; -} - _Bool birdloop_inside(struct birdloop *loop) { @@ -64,91 +97,210 @@ birdloop_inside(struct birdloop *loop) return 0; } +_Bool +birdloop_in_this_thread(struct birdloop *loop) +{ + return pthread_equal(pthread_self(), loop->thread->thread_id); +} + +void +birdloop_flag(struct birdloop *loop, u32 flag) +{ + atomic_fetch_or_explicit(&loop->flags, flag, memory_order_acq_rel); + birdloop_ping(loop); +} + +void +birdloop_flag_set_handler(struct birdloop *loop, struct birdloop_flag_handler *fh) +{ + ASSERT_DIE(birdloop_inside(loop)); + loop->flag_handler = fh; +} + +static int +birdloop_process_flags(struct birdloop *loop) +{ + if (!loop->flag_handler) + return 0; + + u32 flags = atomic_exchange_explicit(&loop->flags, 0, memory_order_acq_rel); + if (!flags) + return 0; + + loop->flag_handler->hook(loop->flag_handler, flags); + return 1; +} + /* * Wakeup code for birdloop */ -static void -pipe_new(int *pfds) +void +pipe_new(struct pipe *p) { - int rv = pipe(pfds); + int rv = pipe(p->fd); if (rv < 0) die("pipe: %m"); - if (fcntl(pfds[0], F_SETFL, O_NONBLOCK) < 0) + if (fcntl(p->fd[0], F_SETFL, O_NONBLOCK) < 0) die("fcntl(O_NONBLOCK): %m"); - if (fcntl(pfds[1], F_SETFL, O_NONBLOCK) < 0) + if (fcntl(p->fd[1], F_SETFL, O_NONBLOCK) < 0) die("fcntl(O_NONBLOCK): %m"); } void -pipe_drain(int fd) +pipe_drain(struct pipe *p) { - char buf[64]; - int rv; - - try: - rv = read(fd, buf, 64); - if (rv < 0) - { - if (errno == EINTR) - goto try; - if (errno == EAGAIN) + while (1) { + char buf[64]; + int rv = read(p->fd[0], buf, sizeof(buf)); + if ((rv < 0) && (errno == EAGAIN)) return; - die("wakeup read: %m"); + + if (rv == 0) + bug("wakeup read eof"); + if ((rv < 0) && (errno != EINTR)) + bug("wakeup read: %m"); + } +} + +int +pipe_read_one(struct pipe *p) +{ + while (1) { + char v; + int rv = read(p->fd[0], &v, sizeof(v)); + if (rv == 1) + return 1; + if ((rv < 0) && (errno == EAGAIN)) + return 0; + if (rv > 1) + bug("wakeup read more bytes than expected: %d", rv); + if (rv == 0) + bug("wakeup read eof"); + if (errno != EINTR) + bug("wakeup read: %m"); } - if (rv == 64) - goto try; } void -pipe_kick(int fd) +pipe_kick(struct pipe *p) { - u64 v = 1; + char v = 1; int rv; - try: - rv = write(fd, &v, sizeof(u64)); - if (rv < 0) - { - if (errno == EINTR) - goto try; - if (errno == EAGAIN) + while (1) { + rv = write(p->fd[1], &v, sizeof(v)); + if ((rv >= 0) || (errno == EAGAIN)) return; - die("wakeup write: %m"); + if (errno != EINTR) + bug("wakeup write: %m"); } } +void +pipe_pollin(struct pipe *p, struct pfd *pfd) +{ + BUFFER_PUSH(pfd->pfd) = (struct pollfd) { + .fd = p->fd[0], + .events = POLLIN, + }; + BUFFER_PUSH(pfd->loop) = NULL; +} + static inline void -wakeup_init(struct birdloop *loop) +wakeup_init(struct bird_thread *loop) { - pipe_new(loop->wakeup_fds); + pipe_new(&loop->wakeup); } static inline void -wakeup_drain(struct birdloop *loop) +wakeup_drain(struct bird_thread *loop) { - pipe_drain(loop->wakeup_fds[0]); + pipe_drain(&loop->wakeup); } static inline void -wakeup_do_kick(struct birdloop *loop) +wakeup_do_kick(struct bird_thread *loop) { - pipe_kick(loop->wakeup_fds[1]); + pipe_kick(&loop->wakeup); } -void -birdloop_ping(struct birdloop *loop) +static inline _Bool +birdloop_try_ping(struct birdloop *loop, u32 ltt) { - u32 ping_sent = atomic_fetch_add_explicit(&loop->ping_sent, 1, memory_order_acq_rel); - if (ping_sent) - return; + /* Somebody else is already pinging, be idempotent */ + if (ltt & LTT_PING) + { + LOOP_TRACE(loop, "already being pinged"); + return 0; + } + + /* Thread moving is an implicit ping */ + if (ltt & LTT_MOVE) + { + LOOP_TRACE(loop, "ping while moving"); + return 1; + } + /* No more flags allowed */ + ASSERT_DIE(!ltt); + + /* No ping when not picked up */ + if (!loop->thread) + { + LOOP_TRACE(loop, "not picked up yet, can't ping"); + return 1; + } + + /* No ping when masked */ if (loop == birdloop_wakeup_masked) + { + LOOP_TRACE(loop, "wakeup masked, can't ping"); birdloop_wakeup_masked_count++; + return 1; + } + + /* Send meta event to ping */ + if ((loop != loop->thread->meta) && (loop != &main_birdloop)) + { + LOOP_TRACE(loop, "Ping by meta event to %p", loop->thread->meta); + ev_send_loop(loop->thread->meta, &loop->event); + return 1; + } + + /* Do the real ping */ + LOOP_TRACE(loop, "sending pipe ping"); + wakeup_do_kick(loop->thread); + return 0; +} + +static inline void +birdloop_do_ping(struct birdloop *loop) +{ + /* Register our ping effort */ + u32 ltt = atomic_fetch_or_explicit(&loop->thread_transition, LTT_PING, memory_order_acq_rel); + + /* Try to ping in multiple ways */ + if (birdloop_try_ping(loop, ltt)) + atomic_fetch_and_explicit(&loop->thread_transition, ~LTT_PING, memory_order_acq_rel); +} + +void +birdloop_ping(struct birdloop *loop) +{ + if (!birdloop_inside(loop)) + { + LOOP_TRACE(loop, "ping from outside"); + birdloop_do_ping(loop); + } else - wakeup_do_kick(loop); + { + LOOP_TRACE(loop, "ping from inside, pending=%d", loop->ping_pending); + if (!loop->ping_pending) + loop->ping_pending++; + } } @@ -161,204 +313,748 @@ sockets_init(struct birdloop *loop) { init_list(&loop->sock_list); loop->sock_num = 0; +} + +void +socket_changed(sock *s) +{ + struct birdloop *loop = s->loop; + ASSERT_DIE(birdloop_inside(loop)); - BUFFER_INIT(loop->poll_sk, loop->pool, 4); - BUFFER_INIT(loop->poll_fd, loop->pool, 4); - loop->poll_changed = 1; /* add wakeup fd */ + loop->sock_changed++; + birdloop_ping(loop); } -static void -sockets_add(struct birdloop *loop, sock *s) +void +birdloop_add_socket(struct birdloop *loop, sock *s) { - ASSERT_DIE(!enlisted(&s->n)); + ASSERT_DIE(birdloop_inside(loop)); + ASSERT_DIE(!s->loop); + LOOP_TRACE(loop, "adding socket %p (total=%d)", s, loop->sock_num); add_tail(&loop->sock_list, &s->n); loop->sock_num++; s->loop = loop; s->index = -1; - loop->poll_changed = 1; - birdloop_ping(loop); + socket_changed(s); } +extern sock *stored_sock; /* mainloop hack */ + void -sk_start(sock *s) +birdloop_remove_socket(struct birdloop *loop, sock *s) { - ASSERT_DIE(birdloop_current != &main_birdloop); - sockets_add(birdloop_current, s); -} + ASSERT_DIE(!enlisted(&s->n) == !s->loop); -static void -sockets_remove(struct birdloop *loop, sock *s) -{ + if (!s->loop) + return; + + ASSERT_DIE(birdloop_inside(loop)); ASSERT_DIE(s->loop == loop); - if (!enlisted(&s->n)) - return; + /* Decouple the socket from the loop at all. */ + LOOP_TRACE(loop, "removing socket %p (total=%d)", s, loop->sock_num); + + if (loop->sock_active == s) + loop->sock_active = sk_next(s); + + if ((loop == &main_birdloop) && (s == stored_sock)) + stored_sock = sk_next(s); rem_node(&s->n); loop->sock_num--; - if (s->index >= 0) - { - loop->poll_sk.data[s->index] = NULL; - s->index = -1; - loop->poll_changed = 1; - birdloop_ping(loop); - } + socket_changed(s); s->loop = NULL; + s->index = -1; } void -sk_stop(sock *s) +sk_reloop(sock *s, struct birdloop *loop) { - sockets_remove(birdloop_current, s); + ASSERT_DIE(birdloop_inside(loop)); + ASSERT_DIE(birdloop_inside(s->loop)); + + if (loop == s->loop) + return; + + birdloop_remove_socket(s->loop, s); + birdloop_add_socket(loop, s); +} + +void +sk_pause_rx(struct birdloop *loop, sock *s) +{ + ASSERT_DIE(birdloop_inside(loop)); + s->rx_hook = NULL; + socket_changed(s); +} + +void +sk_resume_rx(struct birdloop *loop, sock *s, int (*hook)(sock *, uint)) +{ + ASSERT_DIE(birdloop_inside(loop)); + ASSERT_DIE(hook); + s->rx_hook = hook; + socket_changed(s); } static inline uint sk_want_events(sock *s) +{ return (s->rx_hook ? POLLIN : 0) | (sk_tx_pending(s) ? POLLOUT : 0); } + +void +sockets_prepare(struct birdloop *loop, struct pfd *pfd) { - uint out = ((s->ttx != s->tpos) ? POLLOUT : 0); - if (s->rx_hook) - if (s->cork) + node *n; + WALK_LIST(n, loop->sock_list) + { + sock *s = SKIP_BACK(sock, n, n); + uint w = sk_want_events(s); + + if (!w) { - LOCK_DOMAIN(cork, s->cork->lock); - if (!enlisted(&s->cork_node)) - if (s->cork->count) - { -// log(L_TRACE "Socket %p corked", s); - add_tail(&s->cork->sockets, &s->cork_node); - } - else - out |= POLLIN; - UNLOCK_DOMAIN(cork, s->cork->lock); + s->index = -1; + continue; } - else - out |= POLLIN; -// log(L_TRACE "sk_want_events(%p) = %x", s, out); - return out; + s->index = pfd->pfd.used; + LOOP_TRACE(loop, "socket %p poll index is %d", s, s->index); + + BUFFER_PUSH(pfd->pfd) = (struct pollfd) { + .fd = s->fd, + .events = sk_want_events(s), + }; + BUFFER_PUSH(pfd->loop) = loop; + } } +int sk_read(sock *s, int revents); +int sk_write(sock *s); +void sk_err(sock *s, int revents); -void -sk_ping(sock *s) +static int +sockets_fire(struct birdloop *loop) { - s->loop->poll_changed = 1; - birdloop_ping(s->loop); + if (EMPTY_LIST(loop->sock_list)) + return 0; + + int repeat = 0; + + times_update(); + + struct pollfd *pfd = loop->thread->pfd->pfd.data; + loop->sock_active = SKIP_BACK(sock, n, HEAD(loop->sock_list)); + + while (loop->sock_active) + { + sock *s = loop->sock_active; + + int rev; + if ((s->index >= 0) && (rev = pfd[s->index].revents) && !(rev & POLLNVAL)) + { + int e = 1; + + if (rev & POLLOUT) + { + /* Write everything. */ + while ((s == loop->sock_active) && (e = sk_write(s))) + ; + + if (s != loop->sock_active) + continue; + + if (!sk_tx_pending(s)) + loop->thread->sock_changed++; + } + + if (rev & POLLIN) + /* Read just one packet and request repeat. */ + if ((s == loop->sock_active) && s->rx_hook) + if (sk_read(s, rev)) + repeat++; + + if (s != loop->sock_active) + continue; + + if (!(rev & (POLLOUT | POLLIN)) && (rev & POLLERR)) + sk_err(s, rev); + + if (s != loop->sock_active) + continue; + } + + loop->sock_active = sk_next(s); + } + + return repeat; } /* -FIXME: this should be called from sock code + * Threads + */ + +DEFINE_DOMAIN(resource); + +struct birdloop_pickup_group { + DOMAIN(resource) domain; + list loops; + list threads; + btime max_latency; +} pickup_groups[2] = { + { + /* all zeroes */ + }, + { + /* FIXME: make this dynamic, now it copies the loop_max_latency value from proto/bfd/config.Y */ + .max_latency = 10 MS, + }, +}; + +static _Thread_local struct bird_thread *this_thread; static void -sockets_update(struct birdloop *loop, sock *s) +birdloop_set_thread(struct birdloop *loop, struct bird_thread *thr, struct birdloop_pickup_group *group) +{ + struct bird_thread *old = loop->thread; + ASSERT_DIE(!thr != !old); + + /* Signal our moving effort */ + u32 ltt = atomic_fetch_or_explicit(&loop->thread_transition, LTT_MOVE, memory_order_acq_rel); + ASSERT_DIE((ltt & LTT_MOVE) == 0); + + while (ltt & LTT_PING) + { + birdloop_yield(); + ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire); + ASSERT_DIE(ltt & LTT_MOVE); + } + /* Now we are free of running pings */ + + if (loop->thread = thr) + { + add_tail(&thr->loops, &loop->n); + thr->loop_count++; + } + else + { + old->loop_count--; + + LOCK_DOMAIN(resource, group->domain); + add_tail(&group->loops, &loop->n); + UNLOCK_DOMAIN(resource, group->domain); + } + + /* Finished */ + atomic_fetch_and_explicit(&loop->thread_transition, ~LTT_MOVE, memory_order_acq_rel); + + /* Request to run by force */ + ev_send_loop(loop->thread->meta, &loop->event); +} + +static struct birdloop * +birdloop_take(struct birdloop_pickup_group *group) { - if (s->index >= 0) - loop->poll_fd.data[s->index].events = sk_want_events(s); + struct birdloop *loop = NULL; + + LOCK_DOMAIN(resource, group->domain); + if (!EMPTY_LIST(group->loops)) + { + /* Take the first loop from the pickup list and unlock */ + loop = SKIP_BACK(struct birdloop, n, HEAD(group->loops)); + rem_node(&loop->n); + UNLOCK_DOMAIN(resource, group->domain); + + birdloop_set_thread(loop, this_thread, group); + + /* This thread goes to the end of the pickup list */ + LOCK_DOMAIN(resource, group->domain); + rem_node(&this_thread->n); + add_tail(&group->threads, &this_thread->n); + + /* If there are more loops to be picked up, wakeup the next thread in order */ + if (!EMPTY_LIST(group->loops)) + wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(group->threads))); + } + UNLOCK_DOMAIN(resource, group->domain); + + return loop; } -*/ static void -sockets_prepare(struct birdloop *loop) +birdloop_drop(struct birdloop *loop, struct birdloop_pickup_group *group) { - BUFFER_SET(loop->poll_sk, loop->sock_num + 1); - BUFFER_SET(loop->poll_fd, loop->sock_num + 1); + /* Remove loop from this thread's list */ + rem_node(&loop->n); - struct pollfd *pfd = loop->poll_fd.data; - sock **psk = loop->poll_sk.data; - uint i = 0; - node *n; + /* Unset loop's thread */ + if (birdloop_inside(loop)) + birdloop_set_thread(loop, NULL, group); + else + { + birdloop_enter(loop); + birdloop_set_thread(loop, NULL, group); + birdloop_leave(loop); + } - WALK_LIST(n, loop->sock_list) + /* Put loop into pickup list */ + LOCK_DOMAIN(resource, group->domain); + add_tail(&group->loops, &loop->n); + UNLOCK_DOMAIN(resource, group->domain); +} + +static int +poll_timeout(struct birdloop *loop) +{ + timer *t = timers_first(&loop->time); + if (!t) + return -1; + + btime remains = tm_remains(t); + return remains TO_MS + ((remains TO_MS) MS < remains); +} + +static void * +bird_thread_main(void *arg) +{ + struct bird_thread *thr = this_thread = arg; + + rcu_thread_start(&thr->rcu); + synchronize_rcu(); + + tmp_init(thr->pool); + init_list(&thr->loops); + + thr->meta = birdloop_new_internal(thr->pool, DOMAIN_ORDER(meta), "Thread Meta", 0, thr->group); + thr->meta->thread = thr; + birdloop_enter(thr->meta); + + thr->sock_changed = 1; + + struct pfd pfd; + BUFFER_INIT(pfd.pfd, thr->pool, 16); + BUFFER_INIT(pfd.loop, thr->pool, 16); + thr->pfd = &pfd; + + while (1) { - sock *s = SKIP_BACK(sock, n, n); + u64 thr_loop_start = ns_now(); + int timeout; + + /* Pickup new loops */ + struct birdloop *loop = birdloop_take(thr->group); + if (loop) + { + birdloop_enter(loop); + if (!EMPTY_LIST(loop->sock_list)) + thr->sock_changed = 1; + birdloop_leave(loop); + } + + /* Schedule all loops with timed out timers */ + timers_fire(&thr->meta->time, 0); + + /* Compute maximal time per loop */ + u64 thr_before_run = ns_now(); + if (thr->loop_count > 0) + thr->max_loop_time_ns = (thr->max_latency_ns / 2 - (thr_before_run - thr_loop_start)) / (u64) thr->loop_count; + + /* Run all scheduled loops */ + int more_events = ev_run_list(&thr->meta->event_list); + if (more_events) + { + THREAD_TRACE("More events to run"); + timeout = 0; + } + else + { + timeout = poll_timeout(thr->meta); + if (timeout == -1) + THREAD_TRACE("No timers, no events"); + else + THREAD_TRACE("Next timer in %d ms", timeout); + } + + /* Run priority events before sleeping */ + ev_run_list(&thr->priority_events); + + /* Do we have to refresh sockets? */ + if (thr->sock_changed) + { + thr->sock_changed = 0; + + BUFFER_FLUSH(pfd.pfd); + BUFFER_FLUSH(pfd.loop); + + pipe_pollin(&thr->wakeup, &pfd); + + node *nn; + WALK_LIST2(loop, nn, thr->loops, n) + { + birdloop_enter(loop); + sockets_prepare(loop, &pfd); + birdloop_leave(loop); + } + + ASSERT_DIE(pfd.loop.used == pfd.pfd.used); + } + /* Nothing to do in at least 5 seconds, flush local hot page cache */ + else if (timeout > 5000) + flush_local_pages(); + +poll_retry:; + int rv = poll(pfd.pfd.data, pfd.pfd.used, timeout); + if (rv < 0) + { + if (errno == EINTR || errno == EAGAIN) + goto poll_retry; + bug("poll in %p: %m", thr); + } - ASSERT(i < loop->sock_num); + /* Drain wakeup fd */ + if (pfd.pfd.data[0].revents & POLLIN) + { + ASSERT_DIE(rv > 0); + rv--; + wakeup_drain(thr); + } - s->index = i; - *psk = s; - pfd->fd = s->fd; - pfd->events = sk_want_events(s); - pfd->revents = 0; + atomic_fetch_and_explicit(&thr->meta->thread_transition, ~LTT_PING, memory_order_acq_rel); - pfd++; - psk++; - i++; + /* Schedule loops with active sockets */ + if (rv) + for (uint i = 1; i < pfd.pfd.used; i++) + if (pfd.pfd.data[i].revents) + { + LOOP_TRACE(pfd.loop.data[i], "socket id %d got revents=%d", i, pfd.pfd.data[i].revents); + ev_send_loop(thr->meta, &pfd.loop.data[i]->event); + } } - ASSERT(i == loop->sock_num); + bug("An infinite loop has ended."); +} + +static void +bird_thread_cleanup(void *_thr) +{ + struct bird_thread *thr = _thr; + ASSERT_DIE(birdloop_inside(&main_birdloop)); + + /* Thread attributes no longer needed */ + pthread_attr_destroy(&thr->thread_attr); + + /* Free all remaining memory */ + rfree(thr->pool); +} + +static struct bird_thread * +bird_thread_start(struct birdloop_pickup_group *group) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); + ASSERT_DIE(DOMAIN_IS_LOCKED(resource, group->domain)); + + pool *p = rp_new(&root_pool, "Thread"); + + struct bird_thread *thr = mb_allocz(p, sizeof(*thr)); + thr->pool = p; + thr->cleanup_event = (event) { .hook = bird_thread_cleanup, .data = thr, }; + thr->group = group; + thr->max_latency_ns = (group->max_latency ?: 5 S) TO_NS; + + wakeup_init(thr); + ev_init_list(&thr->priority_events, NULL, "Thread direct event list"); - /* Add internal wakeup fd */ - *psk = NULL; - pfd->fd = loop->wakeup_fds[0]; - pfd->events = POLLIN; - pfd->revents = 0; + add_tail(&group->threads, &thr->n); - loop->poll_changed = 0; + int e = 0; + + if (e = pthread_attr_init(&thr->thread_attr)) + die("pthread_attr_init() failed: %M", e); + + /* We don't have to worry about thread stack size so much. + if (e = pthread_attr_setstacksize(&thr->thread_attr, THREAD_STACK_SIZE)) + die("pthread_attr_setstacksize(%u) failed: %M", THREAD_STACK_SIZE, e); + */ + + if (e = pthread_attr_setdetachstate(&thr->thread_attr, PTHREAD_CREATE_DETACHED)) + die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e); + + if (e = pthread_create(&thr->thread_id, &thr->thread_attr, bird_thread_main, thr)) + die("pthread_create() failed: %M", e); + + return thr; } -int sk_read(sock *s, int revents); -int sk_write(sock *s); +static struct birdloop *thread_dropper; +static event *thread_dropper_event; +static uint thread_dropper_goal; static void -sockets_fire(struct birdloop *loop) +bird_thread_shutdown(void * _ UNUSED) { - struct pollfd *pfd = loop->poll_fd.data; - sock **psk = loop->poll_sk.data; - int poll_num = loop->poll_fd.used - 1; + struct birdloop_pickup_group *group = this_thread->group; + LOCK_DOMAIN(resource, group->domain); + int dif = list_length(&group->threads) - thread_dropper_goal; + struct birdloop *tdl_stop = NULL; - times_update(); + if (dif > 0) + ev_send_loop(thread_dropper, thread_dropper_event); + else + { + tdl_stop = thread_dropper; + thread_dropper = NULL; + } + + UNLOCK_DOMAIN(resource, group->domain); - /* Last fd is internal wakeup fd */ - if (pfd[poll_num].revents & POLLIN) + DBG("Thread pickup size differs from dropper goal by %d%s\n", dif, tdl_stop ? ", stopping" : ""); + + if (tdl_stop) { - wakeup_drain(loop); - loop->poll_changed = 1; + birdloop_stop_self(tdl_stop, NULL, NULL); + return; } - int i; - for (i = 0; i < poll_num; pfd++, psk++, i++) + struct bird_thread *thr = this_thread; + + /* Leave the thread-picker list to get no more loops */ + LOCK_DOMAIN(resource, group->domain); + rem_node(&thr->n); + UNLOCK_DOMAIN(resource, group->domain); + + /* Drop loops including the thread dropper itself */ + while (!EMPTY_LIST(thr->loops)) + birdloop_drop(HEAD(thr->loops), group); + + /* Let others know about new loops */ + if (!EMPTY_LIST(group->loops)) + wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(group->threads))); + UNLOCK_DOMAIN(resource, group->domain); + + /* Leave the thread-dropper loop as we aren't going to return. */ + birdloop_leave(thread_dropper); + + /* Stop the meta loop */ + birdloop_leave(thr->meta); + domain_free(thr->meta->time.domain); + rfree(thr->meta->pool); + + /* Local pages not needed anymore */ + flush_local_pages(); + + /* Unregister from RCU */ + rcu_thread_stop(&thr->rcu); + + /* Request thread cleanup from main loop */ + ev_send_loop(&main_birdloop, &thr->cleanup_event); + + /* Exit! */ + pthread_exit(NULL); +} + + +void +bird_thread_commit(struct config *new, struct config *old UNUSED) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); + + if (new->shutdown) + return; + + if (!new->thread_count) + new->thread_count = 1; + + while (1) { - if (!*psk) - continue; + struct birdloop_pickup_group *group = &pickup_groups[0]; + LOCK_DOMAIN(resource, group->domain); + + int dif = list_length(&group->threads) - (thread_dropper_goal = new->thread_count); + _Bool thread_dropper_running = !!thread_dropper; - if (! pfd->revents) + if (dif < 0) + { + bird_thread_start(group); + UNLOCK_DOMAIN(resource, group->domain); continue; + } + + UNLOCK_DOMAIN(resource, group->domain); + + if ((dif > 0) && !thread_dropper_running) + { + struct birdloop *tdl = birdloop_new(&root_pool, DOMAIN_ORDER(control), "Thread dropper", group->max_latency); + event *tde = ev_new_init(tdl->pool, bird_thread_shutdown, NULL); + + LOCK_DOMAIN(resource, group->domain); + thread_dropper = tdl; + thread_dropper_event = tde; + UNLOCK_DOMAIN(resource, group->domain); + + ev_send_loop(thread_dropper, thread_dropper_event); + } + + return; + } +} + + +DEFINE_DOMAIN(control); + +struct bird_thread_show_data { + cli *cli; + pool *pool; + DOMAIN(control) lock; + uint total; + uint done; + u8 show_loops; +}; - if (pfd->revents & POLLNVAL) - bug("poll: invalid fd %d", pfd->fd); +static void +bird_thread_show_cli_cont(struct cli *c UNUSED) +{ + /* Explicitly do nothing to prevent CLI from trying to parse another command. */ +} + +static int +bird_thread_show_cli_cleanup(struct cli *c UNUSED) +{ + return 1; /* Defer the cleanup until the writeout is finished. */ +} - int e = 1; +static void +bird_thread_show(void *data) +{ + struct bird_thread_show_data *tsd = data; - if (pfd->revents & POLLIN) - while (e && *psk && (*psk)->rx_hook) - e = sk_read(*psk, pfd->revents); + LOCK_DOMAIN(control, tsd->lock); + if (tsd->show_loops) + cli_printf(tsd->cli, -1026, "Thread %p", this_thread); + + u64 total_time_ns = 0; + struct birdloop *loop; + WALK_LIST(loop, this_thread->loops) + { + if (tsd->show_loops) + cli_printf(tsd->cli, -1026, " Loop %s time: %t", domain_name(loop->time.domain), loop->total_time_spent_ns NS); + total_time_ns += loop->total_time_spent_ns; + } + + tsd->done++; + int last = (tsd->done == tsd->total); + + if (last) + { + tsd->cli->cont = NULL; + tsd->cli->cleanup = NULL; + } - e = 1; - if (pfd->revents & POLLOUT) + if (tsd->show_loops) + cli_printf(tsd->cli, (last ? 1 : -1) * 1026, " Total time: %t", total_time_ns NS); + else + cli_printf(tsd->cli, (last ? 1 : -1) * 1026, "Thread %p time %t", this_thread, total_time_ns NS); + + UNLOCK_DOMAIN(control, tsd->lock); + + if (last) + { + the_bird_lock(); + + for (int i=0; i<2; i++) { - loop->poll_changed = 1; - while (e && *psk) - e = sk_write(*psk); + struct birdloop_pickup_group *group = &pickup_groups[i]; + + LOCK_DOMAIN(resource, group->domain); + if (!EMPTY_LIST(group->loops)) + if (tsd->show_loops) + { + cli_printf(tsd->cli, -1026, "Unassigned loops"); + WALK_LIST(loop, group->loops) + cli_printf(tsd->cli, -1026, " Loop %s time: %t", domain_name(loop->time.domain), loop->total_time_spent_ns NS); + } + else + { + uint count = 0; + u64 total_time_ns = 0; + WALK_LIST(loop, group->loops) + { + count++; + total_time_ns += loop->total_time_spent_ns; + } + cli_printf(tsd->cli, -1026, "Unassigned loops: %d, total time %t", count, total_time_ns NS); + } + UNLOCK_DOMAIN(resource, group->domain); } + + cli_write_trigger(tsd->cli); + DOMAIN_FREE(control, tsd->lock); + rfree(tsd->pool); + + the_bird_unlock(); } } +void +cmd_show_threads(int show_loops) +{ + pool *p = rp_new(&root_pool, "Show Threads"); + + struct bird_thread_show_data *tsd = mb_allocz(p, sizeof(struct bird_thread_show_data)); + tsd->lock = DOMAIN_NEW(control, "Show Threads"); + tsd->cli = this_cli; + tsd->pool = p; + tsd->show_loops = show_loops; + + this_cli->cont = bird_thread_show_cli_cont; + this_cli->cleanup = bird_thread_show_cli_cleanup; + + for (int i=0; i<2; i++) + { + struct birdloop_pickup_group *group = &pickup_groups[i]; + + LOCK_DOMAIN(control, tsd->lock); + LOCK_DOMAIN(resource, group->domain); + + struct bird_thread *thr; + WALK_LIST(thr, group->threads) + { + tsd->total++; + ev_send(&thr->priority_events, ev_new_init(p, bird_thread_show, tsd)); + wakeup_do_kick(thr); + } + + UNLOCK_DOMAIN(resource, group->domain); + UNLOCK_DOMAIN(control, tsd->lock); + } +} + /* * Birdloop */ -struct birdloop main_birdloop; +static struct bird_thread main_thread; +struct birdloop main_birdloop = { .thread = &main_thread, }; static void birdloop_enter_locked(struct birdloop *loop); void birdloop_init(void) { - wakeup_init(&main_birdloop); + ns_init(); + + for (int i=0; i<2; i++) + { + struct birdloop_pickup_group *group = &pickup_groups[i]; + + group->domain = DOMAIN_NEW(resource, "Loop Pickup"); + init_list(&group->loops); + init_list(&group->threads); + } + + wakeup_init(main_birdloop.thread); main_birdloop.time.domain = the_bird_domain.the_bird; main_birdloop.time.loop = &main_birdloop; @@ -366,85 +1062,177 @@ birdloop_init(void) times_update(); timers_init(&main_birdloop.time, &root_pool); - root_pool.loop = &main_birdloop; - main_birdloop.pool = &root_pool; - birdloop_enter_locked(&main_birdloop); } -static void birdloop_main(void *arg); - -void -birdloop_free(resource *r) +static void +birdloop_stop_internal(struct birdloop *loop) { - struct birdloop *loop = (void *) r; + LOOP_TRACE(loop, "Stopping"); - ASSERT_DIE(loop->links == 0); - domain_free(loop->time.domain); -} + /* Block incoming pings */ + u32 ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire); + while (!atomic_compare_exchange_strong_explicit( + &loop->thread_transition, <t, LTT_PING, + memory_order_acq_rel, memory_order_acquire)) + ; -void -birdloop_dump(resource *r) -{ - struct birdloop *loop = (void *) r; + /* Flush remaining events */ + ASSERT_DIE(!ev_run_list(&loop->event_list)); + + /* Drop timers */ + timer *t; + while (t = timers_first(&loop->time)) + tm_stop(t); - debug("%s\n", loop->pool->name); + /* Drop sockets */ + sock *s; + WALK_LIST_FIRST2(s, n, loop->sock_list) + birdloop_remove_socket(loop, s); + + /* Unschedule from Meta */ + ev_postpone(&loop->event); + tm_stop(&loop->timer); + + /* Remove from thread loop list */ + rem_node(&loop->n); + loop->thread = NULL; + + /* Leave the loop context without causing any other fuss */ + ASSERT_DIE(!ev_active(&loop->event)); + loop->ping_pending = 0; + birdloop_leave(loop); + + /* Request local socket reload */ + this_thread->sock_changed++; + + /* Tail-call the stopped hook */ + loop->stopped(loop->stop_data); } -struct resmem birdloop_memsize(resource *r) +static void +birdloop_run(void *_loop) { - struct birdloop *loop = (void *) r; + /* Run priority events before the loop is executed */ + ev_run_list(&this_thread->priority_events); + + u64 start_time = ns_now(); + u64 end_time = start_time + this_thread->max_loop_time_ns; + + struct birdloop *loop = _loop; + birdloop_enter(loop); + + u64 locked_time = ns_now(), task_done_time; + if (locked_time > end_time) + LOOP_WARN(loop, "locked %luns after its scheduled end time", locked_time - end_time); + + uint repeat, loop_runs = 0; + do { + repeat = 0; + LOOP_TRACE(loop, "Regular run"); + loop_runs++; + + if (loop->stopped) + /* Birdloop left inside the helper function */ + return birdloop_stop_internal(loop); + + /* Process sockets */ + repeat += sockets_fire(loop); - return (struct resmem) { - .effective = sizeof(struct birdloop) - sizeof(resource) - ALLOC_OVERHEAD, - .overhead = ALLOC_OVERHEAD + sizeof(resource) + page_size * list_length(&loop->pages.list), - }; + /* Run timers */ + timers_fire(&loop->time, 0); + + /* Run flag handlers */ + repeat += birdloop_process_flags(loop); + + /* Run events */ + repeat += ev_run_list(&loop->event_list); + + /* Check end time */ + } while (((task_done_time = ns_now()) < end_time) && repeat); + + /* Request meta timer */ + timer *t = timers_first(&loop->time); + if (t) + tm_start_in(&loop->timer, tm_remains(t), this_thread->meta); + else + tm_stop(&loop->timer); + + /* Request re-run if needed */ + if (repeat) + ev_send_loop(this_thread->meta, &loop->event); + + /* Collect socket change requests */ + this_thread->sock_changed += loop->sock_changed; + loop->sock_changed = 0; + + birdloop_leave(loop); } -struct resclass birdloop_class = { - .name = "IO Loop", - .size = sizeof(struct birdloop), - .free = birdloop_free, - .dump = birdloop_dump, - .memsize = birdloop_memsize, -}; +static void +birdloop_run_timer(timer *tm) +{ + struct birdloop *loop = tm->data; + LOOP_TRACE(loop, "Timer ready, requesting run"); + ev_send_loop(loop->thread->meta, &loop->event); +} -struct birdloop * -birdloop_new(pool *pp, uint order, const char *name) +static struct birdloop * +birdloop_new_internal(pool *pp, uint order, const char *name, int request_pickup, struct birdloop_pickup_group *group) { struct domain_generic *dg = domain_new(name, order); - struct birdloop *loop = ralloc(pp, &birdloop_class); + pool *p = rp_new(pp, name); + struct birdloop *loop = mb_allocz(p, sizeof(struct birdloop)); + loop->pool = p; loop->time.domain = dg; loop->time.loop = loop; - birdloop_enter(loop); + atomic_store_explicit(&loop->thread_transition, 0, memory_order_relaxed); - loop->pool = rp_new(pp, loop, name); - loop->parent = pp; - rmove(&loop->r, loop->pool); + birdloop_enter(loop); - wakeup_init(loop); ev_init_list(&loop->event_list, loop, name); - timers_init(&loop->time, loop->pool); + timers_init(&loop->time, p); sockets_init(loop); - init_pages(loop); + loop->event = (event) { .hook = birdloop_run, .data = loop, }; + loop->timer = (timer) { .hook = birdloop_run_timer, .data = loop, }; - loop->time.coro = coro_run(loop->pool, birdloop_main, loop); + if (request_pickup) + { + LOCK_DOMAIN(resource, group->domain); + add_tail(&group->loops, &loop->n); + if (EMPTY_LIST(group->threads)) + bird_thread_start(group); + + wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(group->threads))); + UNLOCK_DOMAIN(resource, group->domain); + } + else + loop->n.next = loop->n.prev = &loop->n; birdloop_leave(loop); return loop; } +struct birdloop * +birdloop_new(pool *pp, uint order, const char *name, btime max_latency) +{ + return birdloop_new_internal(pp, order, name, 1, max_latency ? &pickup_groups[1] : &pickup_groups[0]); +} + static void birdloop_do_stop(struct birdloop *loop, void (*stopped)(void *data), void *data) { + LOOP_TRACE(loop, "Stop requested"); + loop->stopped = stopped; loop->stop_data = data; - wakeup_do_kick(loop); + + birdloop_do_ping(loop); } void @@ -464,6 +1252,15 @@ birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *dat birdloop_do_stop(loop, stopped, data); } +void +birdloop_free(struct birdloop *loop) +{ + ASSERT_DIE(loop->thread == NULL); + + domain_free(loop->time.domain); + rfree(loop->pool); +} + static void birdloop_enter_locked(struct birdloop *loop) { @@ -490,6 +1287,14 @@ birdloop_leave_locked(struct birdloop *loop) /* Check the current context */ ASSERT_DIE(birdloop_current == loop); + /* Send pending pings */ + if (loop->ping_pending) + { + LOOP_TRACE(loop, "sending pings on leave"); + loop->ping_pending = 0; + birdloop_do_ping(loop); + } + /* Restore the old context */ birdloop_current = loop->prev_loop; } @@ -514,107 +1319,13 @@ birdloop_unmask_wakeups(struct birdloop *loop) ASSERT_DIE(birdloop_wakeup_masked == loop); birdloop_wakeup_masked = NULL; if (birdloop_wakeup_masked_count) - wakeup_do_kick(loop); + wakeup_do_kick(loop->thread); birdloop_wakeup_masked_count = 0; } void -birdloop_link(struct birdloop *loop) -{ - ASSERT_DIE(birdloop_inside(loop)); - loop->links++; -} - -void -birdloop_unlink(struct birdloop *loop) -{ - ASSERT_DIE(birdloop_inside(loop)); - ASSERT_DIE(loop->links); - if (!--loop->links) - birdloop_ping(loop); -} - -static void -birdloop_main(void *arg) +birdloop_yield(void) { - struct birdloop *loop = arg; - timer *t; - int rv, timeout; - - btime loop_begin = current_time(); - - birdloop_enter(loop); - while (1) - { - timers_fire(&loop->time, 0); - if (ev_run_list(&loop->event_list)) - timeout = 0; - else if (t = timers_first(&loop->time)) - timeout = (tm_remains(t) TO_MS) + 1; - else - timeout = -1; - - if (loop->poll_changed) - sockets_prepare(loop); - - btime duration = current_time_update() - loop_begin; - if (duration > config->watchdog_warning) - log(L_WARN "I/O loop cycle took %d ms", (int) (duration TO_MS)); - - birdloop_leave(loop); - - try: - rv = poll(loop->poll_fd.data, loop->poll_fd.used, timeout); - if (rv < 0) - { - if (errno == EINTR || errno == EAGAIN) - goto try; - die("poll: %m"); - } - - birdloop_enter(loop); - - if (loop->stopped && !loop->links) - break; - - loop_begin = current_time_update(); - - if (rv) - sockets_fire(loop); - - atomic_exchange_explicit(&loop->ping_sent, 0, memory_order_acq_rel); - } - - /* Flush remaining events */ - ASSERT_DIE(!ev_run_list(&loop->event_list)); - - /* No timers allowed */ - ASSERT_DIE(timers_count(&loop->time) == 0); - ASSERT_DIE(EMPTY_LIST(loop->sock_list)); - ASSERT_DIE(loop->sock_num == 0); - - birdloop_leave(loop); - - /* Lock parent loop */ - pool *parent = loop->parent; - birdloop_enter(parent->loop); - - /* Move the loop temporarily to parent pool */ - birdloop_enter(loop); - rmove(&loop->r, parent); - birdloop_leave(loop); - - /* Announce loop stop */ - loop->stopped(loop->stop_data); - - /* Free the pool and loop */ - birdloop_enter(loop); - rp_free(loop->pool, parent); - flush_pages(loop); - birdloop_leave(loop); - rfree(&loop->r); - - /* And finally leave the parent loop before finishing */ - birdloop_leave(parent->loop); + usleep(100); } diff --git a/sysdep/unix/io-loop.h b/sysdep/unix/io-loop.h index 1727637a..2b0b7ebf 100644 --- a/sysdep/unix/io-loop.h +++ b/sysdep/unix/io-loop.h @@ -7,52 +7,88 @@ #ifndef _BIRD_SYSDEP_UNIX_IO_LOOP_H_ #define _BIRD_SYSDEP_UNIX_IO_LOOP_H_ -#include "nest/bird.h" +#include "lib/rcu.h" -#include "lib/lists.h" -#include "lib/event.h" -#include "lib/timer.h" +#include <pthread.h> -struct free_pages +struct pipe { - list list; /* List of empty pages */ - event *cleanup; /* Event to call when number of pages is outside bounds */ - u16 min, max; /* Minimal and maximal number of free pages kept */ - uint cnt; /* Number of empty pages */ + int fd[2]; }; +struct pfd { + BUFFER(struct pollfd) pfd; + BUFFER(struct birdloop *) loop; +}; + +void sockets_prepare(struct birdloop *, struct pfd *); +void socket_changed(struct birdsock *); + +void pipe_new(struct pipe *); +void pipe_pollin(struct pipe *, struct pfd *); +void pipe_drain(struct pipe *); +void pipe_kick(struct pipe *); + struct birdloop { - resource r; + node n; + + event event; + timer timer; pool *pool; - pool *parent; struct timeloop time; event_list event_list; list sock_list; - uint sock_num; + struct birdsock *sock_active; + int sock_num; + uint sock_changed; - BUFFER(sock *) poll_sk; - BUFFER(struct pollfd) poll_fd; - u8 poll_changed; + uint ping_pending; - _Atomic u32 ping_sent; - int wakeup_fds[2]; - - uint links; - - struct free_pages pages; + _Atomic u32 thread_transition; +#define LTT_PING 1 +#define LTT_MOVE 2 + _Atomic u32 flags; + struct birdloop_flag_handler *flag_handler; void (*stopped)(void *data); void *stop_data; struct birdloop *prev_loop; + + struct bird_thread *thread; + + u64 total_time_spent_ns; }; -extern _Thread_local struct birdloop *birdloop_current; +struct bird_thread +{ + node n; -void init_pages(struct birdloop *loop); -void flush_pages(struct birdloop *loop); + struct pipe wakeup; + event_list priority_events; + + struct birdloop *meta; + + pthread_t thread_id; + pthread_attr_t thread_attr; + + struct rcu_thread rcu; + + list loops; + struct birdloop_pickup_group *group; + pool *pool; + struct pfd *pfd; + + event cleanup_event; + + int sock_changed; + uint loop_count; + + u64 max_latency_ns; + u64 max_loop_time_ns; +}; #endif diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index 0e5adc14..88d187a4 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -76,7 +76,7 @@ rf_free(resource *r) } static void -rf_dump(resource *r) +rf_dump(resource *r, unsigned indent UNUSED) { struct rfile *a = (struct rfile *) r; @@ -140,7 +140,7 @@ times_update(void) if ((ts.tv_sec < 0) || (((u64) ts.tv_sec) > ((u64) 1 << 40))) log(L_WARN "Monotonic clock is crazy"); - + btime new_time = ts.tv_sec S + ts.tv_nsec NS; if (new_time < old_time) @@ -722,11 +722,7 @@ sk_log_error(sock *s, const char *p) * Actual struct birdsock code */ -static list sock_list; -static struct birdsock *current_sock; -static struct birdsock *stored_sock; - -static inline sock * +sock * sk_next(sock *s) { if (!s->n.next->next) @@ -774,8 +770,7 @@ sk_ssh_free(sock *s) if (ssh->channel) { - if (ssh_channel_is_open(ssh->channel)) - ssh_channel_close(ssh->channel); + ssh_channel_close(ssh->channel); ssh_channel_free(ssh->channel); ssh->channel = NULL; } @@ -789,12 +784,12 @@ sk_ssh_free(sock *s) } #endif + static void sk_free(resource *r) { sock *s = (sock *) r; - ASSERT_DIE(!s->loop || birdloop_inside(s->loop)); sk_free_bufs(s); #ifdef HAVE_LIBSSH @@ -802,30 +797,10 @@ sk_free(resource *r) sk_ssh_free(s); #endif - if (s->cork) - { - LOCK_DOMAIN(cork, s->cork->lock); - if (enlisted(&s->cork_node)) - rem_node(&s->cork_node); - UNLOCK_DOMAIN(cork, s->cork->lock); - } - - if (!s->loop) - ; - else if (s->flags & SKF_THREAD) - sk_stop(s); - else - { - if (s == current_sock) - current_sock = sk_next(s); - if (s == stored_sock) - stored_sock = sk_next(s); - - if (enlisted(&s->n)) - rem_node(&s->n); - } + if (s->loop) + birdloop_remove_socket(s->loop, s); - if (s->type != SK_SSH && s->type != SK_SSH_ACTIVE && s->fd != -1) + if (s->fd >= 0 && s->type != SK_SSH && s->type != SK_SSH_ACTIVE) close(s->fd); s->fd = -1; @@ -876,7 +851,7 @@ sk_reallocate(sock *s) } static void -sk_dump(resource *r) +sk_dump(resource *r, unsigned indent UNUSED) { sock *s = (sock *) r; static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "SSH>", "SSH", "DEL!" }; @@ -1038,12 +1013,6 @@ sk_setup(sock *s) } static void -sk_insert(sock *s) -{ - add_tail(&sock_list, &s->n); -} - -static void sk_tcp_connected(sock *s) { sockaddr sa; @@ -1116,14 +1085,7 @@ sk_passive_connected(sock *s, int type) return 1; } - if (s->flags & SKF_PASSIVE_THREAD) - t->flags |= SKF_THREAD; - else - { - ASSERT_DIE(s->loop == &main_birdloop); - t->loop = &main_birdloop; - sk_insert(t); - } + birdloop_add_socket(s->loop, t); sk_alloc_bufs(t); s->rx_hook(t, 0); @@ -1169,34 +1131,45 @@ sk_ssh_connect(sock *s) { int server_identity_is_ok = 1; +#ifdef HAVE_SSH_OLD_SERVER_VALIDATION_API +#define ssh_session_is_known_server ssh_is_server_known +#define SSH_KNOWN_HOSTS_OK SSH_SERVER_KNOWN_OK +#define SSH_KNOWN_HOSTS_UNKNOWN SSH_SERVER_NOT_KNOWN +#define SSH_KNOWN_HOSTS_CHANGED SSH_SERVER_KNOWN_CHANGED +#define SSH_KNOWN_HOSTS_NOT_FOUND SSH_SERVER_FILE_NOT_FOUND +#define SSH_KNOWN_HOSTS_ERROR SSH_SERVER_ERROR +#define SSH_KNOWN_HOSTS_OTHER SSH_SERVER_FOUND_OTHER +#endif + /* Check server identity */ - switch (ssh_is_server_known(s->ssh->session)) + switch (ssh_session_is_known_server(s->ssh->session)) { #define LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s,msg,args...) log(L_WARN "SSH Identity %s@%s:%u: " msg, (s)->ssh->username, (s)->host, (s)->dport, ## args); - case SSH_SERVER_KNOWN_OK: + case SSH_KNOWN_HOSTS_OK: /* The server is known and has not changed. */ break; - case SSH_SERVER_NOT_KNOWN: + case SSH_KNOWN_HOSTS_UNKNOWN: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server is unknown, its public key was not found in the known host file %s", s->ssh->server_hostkey_path); + server_identity_is_ok = 0; break; - case SSH_SERVER_KNOWN_CHANGED: + case SSH_KNOWN_HOSTS_CHANGED: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server key has changed. Either you are under attack or the administrator changed the key."); server_identity_is_ok = 0; break; - case SSH_SERVER_FILE_NOT_FOUND: + case SSH_KNOWN_HOSTS_NOT_FOUND: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The known host file %s does not exist", s->ssh->server_hostkey_path); server_identity_is_ok = 0; break; - case SSH_SERVER_ERROR: + case SSH_KNOWN_HOSTS_ERROR: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "Some error happened"); server_identity_is_ok = 0; break; - case SSH_SERVER_FOUND_OTHER: + case SSH_KNOWN_HOSTS_OTHER: LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server gave use a key of a type while we had an other type recorded. " \ "It is a possible attack."); server_identity_is_ok = 0; @@ -1327,6 +1300,7 @@ sk_open_ssh(sock *s) /** * sk_open - open a socket + * @loop: loop * @s: socket * * This function takes a socket resource created by sk_new() and @@ -1336,7 +1310,7 @@ sk_open_ssh(sock *s) * Result: 0 for success, -1 for an error. */ int -sk_open(sock *s) +sk_open(sock *s, struct birdloop *loop) { int af = AF_UNSPEC; int fd = -1; @@ -1345,17 +1319,6 @@ sk_open(sock *s) ip_addr bind_addr = IPA_NONE; sockaddr sa; - if (s->flags & SKF_THREAD) - { - ASSERT_DIE(s->loop && (s->loop != &main_birdloop)); - ASSERT_DIE(birdloop_inside(s->loop)); - } - else - { - ASSERT_DIE(!s->loop); - s->loop = &main_birdloop; - } - if (s->type <= SK_IP) { /* @@ -1500,9 +1463,7 @@ sk_open(sock *s) sk_alloc_bufs(s); } - if (!(s->flags & SKF_THREAD)) - sk_insert(s); - + birdloop_add_socket(loop, s); return 0; err: @@ -1512,7 +1473,7 @@ err: } int -sk_open_unix(sock *s, char *name) +sk_open_unix(sock *s, struct birdloop *loop, char *name) { struct sockaddr_un sa; int fd; @@ -1539,41 +1500,10 @@ sk_open_unix(sock *s, char *name) return -1; s->fd = fd; - s->loop = &main_birdloop; - sk_insert(s); + birdloop_add_socket(loop, s); return 0; } -static void -sk_reloop_hook(void *_vs) -{ - sock *s = _vs; - if (birdloop_inside(&main_birdloop)) - { - s->flags &= ~SKF_THREAD; - sk_insert(s); - } - else - { - s->flags |= SKF_THREAD; - sk_start(s); - } -} - -void -sk_reloop(sock *s, struct birdloop *loop) -{ - if (enlisted(&s->n)) - rem_node(&s->n); - - s->reloop = (event) { - .hook = sk_reloop_hook, - .data = s, - }; - - ev_send_loop(loop, &s->reloop); -} - #define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \ CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL) @@ -1696,6 +1626,13 @@ sk_recvmsg(sock *s) static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; } +_Bool +sk_tx_pending(sock *s) +{ + return s->ttx != s->tpos; +} + + static int sk_maybe_write(sock *s) { @@ -1706,7 +1643,7 @@ sk_maybe_write(sock *s) case SK_TCP: case SK_MAGIC: case SK_UNIX: - while (s->ttx != s->tpos) + while (sk_tx_pending(s)) { e = write(s->fd, s->ttx, s->tpos - s->ttx); @@ -1719,7 +1656,6 @@ sk_maybe_write(sock *s) s->err_hook(s, (errno != EPIPE) ? errno : 0); return -1; } - sk_ping(s); return 0; } s->ttx += e; @@ -1729,7 +1665,7 @@ sk_maybe_write(sock *s) #ifdef HAVE_LIBSSH case SK_SSH: - while (s->ttx != s->tpos) + while (sk_tx_pending(s)) { e = ssh_channel_write(s->ssh->channel, s->ttx, s->tpos - s->ttx); @@ -1812,7 +1748,12 @@ sk_send(sock *s, unsigned len) { s->ttx = s->tbuf; s->tpos = s->tbuf + len; - return sk_maybe_write(s); + + int e = sk_maybe_write(s); + if (e == 0) /* Trigger thread poll reload to poll this socket's write. */ + socket_changed(s); + + return e; } /** @@ -1859,7 +1800,7 @@ call_rx_hook(sock *s, int size) if (s->rx_hook(s, size)) { /* We need to be careful since the socket could have been deleted by the hook */ - if (current_sock == s) + if (s->loop->sock_active == s) s->rpos = s->rbuf; } } @@ -1913,8 +1854,8 @@ sk_read_ssh(sock *s) /* sk_read() and sk_write() are called from BFD's event loop */ -int -sk_read(sock *s, int revents) +static inline int +sk_read_noflush(sock *s, int revents) { switch (s->type) { @@ -1927,25 +1868,6 @@ sk_read(sock *s, int revents) case SK_TCP: case SK_UNIX: { - if (s->cork) - { - int cont = 0; - LOCK_DOMAIN(cork, s->cork->lock); - if (!enlisted(&s->cork_node)) - if (s->cork->count) - { -// log(L_TRACE "Socket %p corked", s); - add_tail(&s->cork->sockets, &s->cork_node); - sk_ping(s); - } - else - cont = 1; - UNLOCK_DOMAIN(cork, s->cork->lock); - - if (!cont) - return 0; - } - int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos); if (c < 0) @@ -1996,7 +1918,15 @@ sk_read(sock *s, int revents) } int -sk_write(sock *s) +sk_read(sock *s, int revents) +{ + int e = sk_read_noflush(s, revents); + tmp_flush(); + return e; +} + +static inline int +sk_write_noflush(sock *s) { switch (s->type) { @@ -2034,7 +1964,7 @@ sk_write(sock *s) #endif default: - if (s->ttx != s->tpos && sk_maybe_write(s) > 0) + if (sk_tx_pending(s) && sk_maybe_write(s) > 0) { if (s->tx_hook) s->tx_hook(s); @@ -2044,6 +1974,14 @@ sk_write(sock *s) } } +int +sk_write(sock *s) +{ + int e = sk_write_noflush(s); + tmp_flush(); + return e; +} + int sk_is_ipv4(sock *s) { return s->af == AF_INET; } @@ -2062,6 +2000,7 @@ sk_err(sock *s, int revents) } s->err_hook(s, se); + tmp_flush(); } void @@ -2071,11 +2010,11 @@ sk_dump_all(void) sock *s; debug("Open sockets:\n"); - WALK_LIST(n, sock_list) + WALK_LIST(n, main_birdloop.sock_list) { s = SKIP_BACK(sock, n, n); debug("%p ", s); - sk_dump(&s->r); + sk_dump(&s->r, 3); } debug("\n"); } @@ -2104,15 +2043,15 @@ static btime loop_time; static void io_update_time(void) { - last_io_time = current_time_update(); + last_io_time = current_time(); if (event_open) { event_open->duration = last_io_time - event_open->timestamp; if (event_open->duration > config->latency_limit) - log(L_WARN "Event 0x%p 0x%p took %d ms", - event_open->hook, event_open->data, (int) (event_open->duration TO_MS)); + log(L_WARN "Event 0x%p 0x%p took %u.%03u ms", + event_open->hook, event_open->data, (uint) (event_open->duration TO_MS), (uint) (event_open->duration % 1000)); event_open = NULL; } @@ -2176,6 +2115,8 @@ watchdog_sigalrm(int sig UNUSED) config->latency_limit = 0xffffffff; io_update_time(); + debug_safe("Watchdog timer timed out\n"); + /* We want core dump */ abort(); } @@ -2216,8 +2157,8 @@ watchdog_stop(void) btime duration = last_io_time - loop_time; if (duration > config->watchdog_warning) - log(L_WARN "I/O loop cycle took %d ms for %d events", - (int) (duration TO_MS), event_log_num); + log(L_WARN "I/O loop cycle took %u.%03u ms for %d events", + (uint) (duration TO_MS), (uint) (duration % 1000), event_log_num); } @@ -2228,7 +2169,7 @@ watchdog_stop(void) void io_init(void) { - init_list(&sock_list); + init_list(&main_birdloop.sock_list); ev_init_list(&global_event_list, &main_birdloop, "Global event list"); ev_init_list(&global_work_list, &main_birdloop, "Global work list"); ev_init_list(&main_birdloop.event_list, &main_birdloop, "Global fast event list"); @@ -2245,20 +2186,17 @@ static int short_loops = 0; #define SHORT_LOOP_MAX 10 #define WORK_EVENTS_MAX 10 -void pipe_drain(int fd); -void check_stored_pages(void); +sock *stored_sock; void io_loop(void) { int poll_tout, timeout; - int nfds, events, pout; - int reload_requested = 0; + int events, pout; timer *t; - sock *s; - node *n; - int fdmax = 256; - struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd)); + struct pfd pfd; + BUFFER_INIT(pfd.pfd, &root_pool, 16); + BUFFER_INIT(pfd.loop, &root_pool, 16); watchdog_start1(); for(;;) @@ -2270,14 +2208,8 @@ io_loop(void) timers_fire(&main_birdloop.time, 1); io_close_event(); -#if DEBUGGING -#define PERIODIC_WAKEUP 86400000 -#else -#define PERIODIC_WAKEUP 3000 -#endif -restart_poll: // FIXME - poll_tout = ((reload_requested || events) ? 0 : PERIODIC_WAKEUP); /* Time in milliseconds */ + poll_tout = (events ? 0 : 3000); /* Time in milliseconds */ if (t = timers_first(&main_birdloop.time)) { times_update(); @@ -2285,40 +2217,11 @@ restart_poll: poll_tout = MIN(poll_tout, timeout); } - /* A hack to reload main io_loop() when something has changed asynchronously. */ - pfd[0].fd = main_birdloop.wakeup_fds[0]; - pfd[0].events = POLLIN; - - nfds = 1; - - WALK_LIST(n, sock_list) - { - pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */ - s = SKIP_BACK(sock, n, n); - if (s->rx_hook && !ev_corked(s->cork)) - { - pfd[nfds].fd = s->fd; - pfd[nfds].events |= POLLIN; - } - if (s->tx_hook && s->ttx != s->tpos) - { - pfd[nfds].fd = s->fd; - pfd[nfds].events |= POLLOUT; - } - if (pfd[nfds].fd != -1) - { - s->index = nfds; - nfds++; - } - else - s->index = -1; + BUFFER_FLUSH(pfd.pfd); + BUFFER_FLUSH(pfd.loop); - if (nfds >= fdmax) - { - fdmax *= 2; - pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd)); - } - } + pipe_pollin(&main_birdloop.thread->wakeup, &pfd); + sockets_prepare(&main_birdloop, &pfd); /* * Yes, this is racy. But even if the signal comes before this test @@ -2350,7 +2253,7 @@ restart_poll: /* And finally enter poll() to find active sockets */ watchdog_stop(); birdloop_leave(&main_birdloop); - pout = poll(pfd, nfds, poll_tout); + pout = poll(pfd.pfd.data, pfd.pfd.used, poll_tout); birdloop_enter(&main_birdloop); watchdog_start(); @@ -2358,111 +2261,99 @@ restart_poll: { if (errno == EINTR || errno == EAGAIN) continue; - die("poll: %m"); + bug("poll: %m"); } - - if (pout && (pfd[0].revents & POLLIN)) - { - /* IO loop reload requested */ - pipe_drain(main_birdloop.wakeup_fds[0]); - reload_requested = 1; - goto restart_poll; - } - - if (reload_requested) - { - reload_requested = 0; - atomic_exchange_explicit(&main_birdloop.ping_sent, 0, memory_order_acq_rel); - } - if (pout) { + if (pfd.pfd.data[0].revents & POLLIN) + { + /* IO loop reload requested */ + pipe_drain(&main_birdloop.thread->wakeup); + atomic_fetch_and_explicit(&main_birdloop.thread_transition, ~LTT_PING, memory_order_acq_rel); + continue; + } + times_update(); /* guaranteed to be non-empty */ - current_sock = SKIP_BACK(sock, n, HEAD(sock_list)); + main_birdloop.sock_active = SKIP_BACK(sock, n, HEAD(main_birdloop.sock_list)); - while (current_sock) + while (main_birdloop.sock_active) + { + sock *s = main_birdloop.sock_active; + if (s->index != -1) { - sock *s = current_sock; - if (s->index == -1) - { - current_sock = sk_next(s); - goto next; - } - int e; int steps; steps = MAX_STEPS; - if (s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook) + if (s->fast_rx && (pfd.pfd.data[s->index].revents & POLLIN) && s->rx_hook) do { steps--; io_log_event(s->rx_hook, s->data); - e = sk_read(s, pfd[s->index].revents); - if (s != current_sock) - goto next; + e = sk_read(s, pfd.pfd.data[s->index].revents); } - while (e && s->rx_hook && steps); + while (e && (main_birdloop.sock_active == s) && s->rx_hook && steps); + + if (s != main_birdloop.sock_active) + continue; steps = MAX_STEPS; - if (pfd[s->index].revents & POLLOUT) + if (pfd.pfd.data[s->index].revents & POLLOUT) do { steps--; io_log_event(s->tx_hook, s->data); e = sk_write(s); - if (s != current_sock) - goto next; } - while (e && steps); + while (e && (main_birdloop.sock_active == s) && steps); - current_sock = sk_next(s); - next: ; + if (s != main_birdloop.sock_active) + continue; } + main_birdloop.sock_active = sk_next(s); + } + short_loops++; if (events && (short_loops < SHORT_LOOP_MAX)) continue; short_loops = 0; int count = 0; - current_sock = stored_sock; - if (current_sock == NULL) - current_sock = SKIP_BACK(sock, n, HEAD(sock_list)); + main_birdloop.sock_active = stored_sock; + if (main_birdloop.sock_active == NULL) + main_birdloop.sock_active = SKIP_BACK(sock, n, HEAD(main_birdloop.sock_list)); - while (current_sock && count < MAX_RX_STEPS) + while (main_birdloop.sock_active && count < MAX_RX_STEPS) { - sock *s = current_sock; + sock *s = main_birdloop.sock_active; if (s->index == -1) - { - current_sock = sk_next(s); - goto next2; - } + goto next2; - if (!s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook) + if (!s->fast_rx && (pfd.pfd.data[s->index].revents & POLLIN) && s->rx_hook) { count++; io_log_event(s->rx_hook, s->data); - sk_read(s, pfd[s->index].revents); - if (s != current_sock) - goto next2; + sk_read(s, pfd.pfd.data[s->index].revents); + if (s != main_birdloop.sock_active) + continue; } - if (pfd[s->index].revents & (POLLHUP | POLLERR)) + if (pfd.pfd.data[s->index].revents & (POLLHUP | POLLERR)) { - sk_err(s, pfd[s->index].revents); - if (s != current_sock) - goto next2; + sk_err(s, pfd.pfd.data[s->index].revents); + if (s != main_birdloop.sock_active) + continue; } - current_sock = sk_next(s); next2: ; + main_birdloop.sock_active = sk_next(s); } - stored_sock = current_sock; + stored_sock = main_birdloop.sock_active; } } } diff --git a/sysdep/unix/krt.Y b/sysdep/unix/krt.Y index 95b54d65..4ce9a328 100644 --- a/sysdep/unix/krt.Y +++ b/sysdep/unix/krt.Y @@ -29,7 +29,7 @@ kif_set_preferred(ip_addr ip) CF_DECLS -CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, KRT_SOURCE, KRT_METRIC, MERGE, PATHS) +CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, MERGE, PATHS) CF_KEYWORDS(INTERFACE, PREFERRED) %type <i> kern_mp_limit @@ -122,9 +122,6 @@ kif_iface: kif_iface_start iface_patt_list_nopx kif_iface_opt_list; -dynamic_attr: KRT_SOURCE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SOURCE); } ; -dynamic_attr: KRT_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_METRIC); } ; - CF_CODE CF_END diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index 0cb86213..9e6ddb45 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -53,7 +53,7 @@ #include "nest/bird.h" #include "nest/iface.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "filter/filter.h" #include "conf/conf.h" @@ -74,7 +74,7 @@ static list krt_proto_list; void krt_io_init(void) { - krt_pool = rp_new(&root_pool, &main_birdloop, "Kernel Syncer"); + krt_pool = rp_new(&root_pool, "Kernel Syncer"); krt_filter_lp = lp_new_default(krt_pool); init_list(&krt_proto_list); krt_sys_io_init(); @@ -163,6 +163,15 @@ kif_shutdown(struct proto *P) return PS_DOWN; } +static void +kif_cleanup(struct proto *p) +{ + if (p->debug & D_EVENTS) + log(L_TRACE "%s: Flushing interfaces", p->name); + if_start_update(); + if_end_update(); +} + static int kif_reconfigure(struct proto *p, struct proto_config *new) { @@ -232,17 +241,24 @@ kif_copy_config(struct proto_config *dest, struct proto_config *src) struct protocol proto_unix_iface = { .name = "Device", .template = "device%d", - .class = PROTOCOL_DEVICE, .proto_size = sizeof(struct kif_proto), .config_size = sizeof(struct kif_config), .preconfig = kif_preconfig, .init = kif_init, .start = kif_start, .shutdown = kif_shutdown, + .cleanup = kif_cleanup, .reconfigure = kif_reconfigure, .copy_config = kif_copy_config }; +void +kif_build(void) +{ + proto_build(&proto_unix_iface); +} + + /* * Tracing of routes */ @@ -280,30 +296,40 @@ static struct tbf rl_alien = TBF_DEFAULT_LOG_LIMITS; static inline u32 krt_metric(rte *a) { - eattr *ea = ea_find(a->attrs->eattrs, EA_KRT_METRIC); + eattr *ea = ea_find(a->attrs, &ea_krt_metric); return ea ? ea->u.data : 0; } -static inline int -krt_rte_better(rte *a, rte *b) +static void +krt_learn_alien_attr(struct channel *c, rte *e) { - return (krt_metric(a) > krt_metric(b)); + ea_set_attr_u32(&e->attrs, &ea_gen_preference, 0, c->preference); } /* Called when alien route is discovered during scan */ static void -krt_learn_rte(struct krt_proto *p, rte *e) +krt_learn_scan(struct krt_proto *p, rte *e) { - struct rte_src *src = e->src = rt_get_source(&p->p, krt_metric(e)); - rte_update(p->p.main_channel, e->net, e, e->src); - rt_unlock_source(src); + rte e0 = { + .attrs = e->attrs, + .src = rt_get_source(&p->p, krt_metric(e)), + }; + + krt_learn_alien_attr(p->p.main_channel, &e0); + + rte_update(p->p.main_channel, e->net, &e0, e0.src); + rt_unlock_source(e0.src); } static void -krt_learn_init(struct krt_proto *p) +krt_learn_async(struct krt_proto *p, rte *e, int new) { - if (KRT_CF->learn) - channel_setup_in_table(p->p.main_channel, 1); + if (new) + return krt_learn_scan(p, e); + + struct rte_src *src = rt_get_source(&p->p, krt_metric(e)); + rte_update(p->p.main_channel, e->net, NULL, src); + rt_unlock_source(src); } #endif @@ -323,17 +349,17 @@ rte_feed_count(net *n) { uint count = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) + if (rte_is_valid(RTE_OR_NULL(e))) count++; return count; } static void -rte_feed_obtain(net *n, rte **feed, uint count) +rte_feed_obtain(net *n, const rte **feed, uint count) { uint i = 0; for (struct rte_storage *e = n->routes; e; e = e->next) - if (rte_is_valid(RTES_OR_NULL(e))) + if (rte_is_valid(RTE_OR_NULL(e))) { ASSERT_DIE(i < count); feed[i++] = &e->rte; @@ -344,6 +370,13 @@ rte_feed_obtain(net *n, rte **feed, uint count) static struct rte * krt_export_net(struct krt_proto *p, net *net) { + /* FIXME: Here we are calling filters in table-locked context when exporting + * to kernel. Here BIRD can crash if the user requested ROA check in kernel + * export filter. It doesn't make much sense to write the filters like this, + * therefore we may keep this unfinished piece of work here for later as it + * won't really affect anybody. */ + ASSERT_DIE(RT_IS_LOCKED(p->p.main_channel->table)); + struct channel *c = p->p.main_channel; const struct filter *filter = c->out_filter; @@ -353,7 +386,7 @@ krt_export_net(struct krt_proto *p, net *net) if (!count) return NULL; - rte **feed = alloca(count * sizeof(rte *)); + const rte **feed = alloca(count * sizeof(rte *)); rte_feed_obtain(net, feed, count); return rt_export_merged(c, feed, count, krt_filter_lp, 1); } @@ -372,7 +405,7 @@ krt_export_net(struct krt_proto *p, net *net) if (filter == FILTER_ACCEPT) goto accept; - if (f_run(filter, &rt, krt_filter_lp, FF_SILENT) > F_ACCEPT) + if (f_run(filter, &rt, FF_SILENT) > F_ACCEPT) goto reject; @@ -386,15 +419,12 @@ reject: static int krt_same_dest(rte *k, rte *e) { - rta *ka = k->attrs, *ea = e->attrs; + ea_list *ka = k->attrs, *ea = e->attrs; - if (ka->dest != ea->dest) - return 0; + eattr *nhea_k = ea_find(ka, &ea_gen_nexthop); + eattr *nhea_e = ea_find(ea, &ea_gen_nexthop); - if (ka->dest == RTD_UNICAST) - return nexthop_same(&(ka->nh), &(ea->nh)); - - return 1; + return (!nhea_k == !nhea_e) && adata_same(nhea_k->u.ptr, nhea_e->u.ptr); } /* @@ -412,22 +442,28 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src) switch (src) { case KRT_SRC_KERNEL: - goto ignore; + krt_trace_in(p, e, "ignored"); + return; case KRT_SRC_REDIRECT: - goto delete; + krt_trace_in(p, e, "deleting"); + krt_replace_rte(p, e->net, NULL, e); + return; case KRT_SRC_ALIEN: if (KRT_CF->learn) - krt_learn_rte(p, e); + krt_learn_scan(p, e); else krt_trace_in_rl(&rl_alien, p, e, "[alien] ignored"); return; } #endif + /* The rest is for KRT_SRC_BIRD (or KRT_SRC_UNKNOWN) */ - RT_LOCK(p->p.main_channel->table); + RT_LOCKED(p->p.main_channel->table, tab) + { + /* Deleting all routes if flush is requested */ if (p->flush_routes) goto delete; @@ -436,7 +472,7 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src) if (!p->ready) goto ignore; - net *net = net_find(RT_PRIV(p->p.main_channel->table), e->net); + net *net = net_find(tab, e->net); if (!net || !krt_is_installed(p, net)) goto delete; @@ -481,8 +517,9 @@ delete: krt_replace_rte(p, e->net, NULL, e); goto done; -done: - RT_UNLOCK(p->p.main_channel->table); +done:; + } + lp_flush(krt_filter_lp); } @@ -490,18 +527,13 @@ static void krt_init_scan(struct krt_proto *p) { bmap_reset(&p->seen_map, 1024); - -#ifdef KRT_ALLOW_LEARN - if (KRT_CF->learn) - channel_refresh_begin(p->p.main_channel); -#endif } static void krt_prune(struct krt_proto *p) { - RT_LOCK(p->p.main_channel->table); - rtable_private *t = RT_PRIV(p->p.main_channel->table); + RT_LOCKED(p->p.main_channel->table, t) + { KRT_TRACE(p, D_EVENTS, "Pruning table %s", t->name); FIB_WALK(&t->fib, net, n) @@ -521,15 +553,10 @@ krt_prune(struct krt_proto *p) } FIB_WALK_END; - RT_UNLOCK(p->p.main_channel->table); - -#ifdef KRT_ALLOW_LEARN - if (KRT_CF->learn) - channel_refresh_end(p->p.main_channel); -#endif - if (p->ready) p->initialized = 1; + + } } static void @@ -567,25 +594,24 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src) case KRT_SRC_ALIEN: if (KRT_CF->learn) { - krt_learn_rte(p, e); + krt_learn_async(p, e, new); return; } #endif } } + /* * Periodic scanning */ - -#ifdef CONFIG_ALL_TABLES_AT_ONCE - -static timer *krt_scan_timer; -static int krt_scan_count; +static timer *krt_scan_all_timer; +static int krt_scan_all_count; +static _Bool krt_scan_all_tables; static void -krt_scan(timer *t UNUSED) +krt_scan_all(timer *t UNUSED) { struct krt_proto *p; node *n; @@ -606,35 +632,42 @@ krt_scan(timer *t UNUSED) } static void -krt_scan_timer_start(struct krt_proto *p) +krt_scan_all_timer_start(struct krt_proto *p) { - if (!krt_scan_count) - krt_scan_timer = tm_new_init(krt_pool, krt_scan, NULL, KRT_CF->scan_time, 0); + if (!krt_scan_all_count) + krt_scan_all_timer = tm_new_init(krt_pool, krt_scan_all, NULL, KRT_CF->scan_time, 0); - krt_scan_count++; + krt_scan_all_count++; - tm_start(krt_scan_timer, 1 S); + tm_start(krt_scan_all_timer, 1 S); } static void -krt_scan_timer_stop(struct krt_proto *p UNUSED) +krt_scan_all_timer_stop(void) { - krt_scan_count--; + ASSERT(krt_scan_all_count > 0); + + krt_scan_all_count--; - if (!krt_scan_count) + if (!krt_scan_all_count) { - rfree(krt_scan_timer); - krt_scan_timer = NULL; + rfree(krt_scan_all_timer); + krt_scan_all_timer = NULL; } } static void -krt_scan_timer_kick(struct krt_proto *p UNUSED) +krt_scan_all_timer_kick(void) { - tm_start(krt_scan_timer, 0); + tm_start(krt_scan_all_timer, 0); +} + +void +krt_use_shared_scan(void) +{ + krt_scan_all_tables = 1; } -#else static void krt_scan(timer *t) @@ -652,37 +685,44 @@ krt_scan(timer *t) static void krt_scan_timer_start(struct krt_proto *p) { - p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0); - tm_start(p->scan_timer, 1 S); + if (krt_scan_all_tables) + krt_scan_all_timer_start(p); + else + { + p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0); + tm_start(p->scan_timer, 1 S); + } } static void krt_scan_timer_stop(struct krt_proto *p) { - tm_stop(p->scan_timer); + if (krt_scan_all_tables) + krt_scan_all_timer_stop(); + else + tm_stop(p->scan_timer); } static void krt_scan_timer_kick(struct krt_proto *p) { - tm_start(p->scan_timer, 0); + if (krt_scan_all_tables) + krt_scan_all_timer_kick(); + else + tm_start(p->scan_timer, 0); } -#endif - - - /* * Updates */ static int -krt_preexport(struct channel *c, rte *e) +krt_preexport(struct channel *C, rte *e) { - if (e->src->owner == &c->proto->sources) + if (e->src->owner == &C->proto->sources) #ifdef CONFIG_SINGLE_ROUTE - return 1; /* Passing the route directly for rt_notify() to ignore */ + return 1; #else return -1; #endif @@ -703,11 +743,8 @@ krt_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net, return; #ifdef CONFIG_SINGLE_ROUTE - /* - * When the imported kernel route becomes the best one, we get it directly and - * we simply know that it is already there. Nothing to do. - */ - if (new->src->owner == &P->sources) + /* Got the same route as we imported. Keep it, do nothing. */ + if (new && new->src->owner == &P->sources) return; #endif @@ -755,6 +792,14 @@ krt_feed_end(struct channel *C) krt_scan_timer_kick(p); } +static int +krt_rte_better(const rte *new, const rte *old) +{ + u32 n = ea_get_int(new->attrs, &ea_krt_metric, IGP_METRIC_UNKNOWN); + u32 o = ea_get_int(old->attrs, &ea_krt_metric, IGP_METRIC_UNKNOWN); + + return (n < o); +} /* * Protocol glue @@ -781,11 +826,6 @@ krt_postconfig(struct proto_config *CF) if (! proto_cf_main_channel(CF)) cf_error("Channel not specified"); -#ifdef CONFIG_ALL_TABLES_AT_ONCE - if (krt_cf->scan_time != cf->scan_time) - cf_error("All kernel syncers must use the same table scan interval"); -#endif - struct channel_config *cc = proto_cf_main_channel(CF); struct rtable_config *tab = cc->table; if (tab->krt_attached) @@ -801,6 +841,10 @@ krt_postconfig(struct proto_config *CF) krt_sys_postconfig(cf); } +struct rte_owner_class krt_rte_owner_class = { + .rte_better = krt_rte_better, +}; + static struct proto * krt_init(struct proto_config *CF) { @@ -811,10 +855,11 @@ krt_init(struct proto_config *CF) p->p.preexport = krt_preexport; p->p.rt_notify = krt_rt_notify; - p->p.if_notify = krt_if_notify; + p->p.iface_sub.if_notify = krt_if_notify; p->p.reload_routes = krt_reload_routes; p->p.feed_end = krt_feed_end; - p->p.rte_better = krt_rte_better; + + p->p.sources.class = &krt_rte_owner_class; krt_sys_init(p); return &p->p; @@ -840,10 +885,6 @@ krt_start(struct proto *P) bmap_init(&p->seen_map, p->p.pool, 1024); add_tail(&krt_proto_list, &p->krt_node); -#ifdef KRT_ALLOW_LEARN - krt_learn_init(p); -#endif - if (!krt_sys_start(p)) { rem_node(&p->krt_node); @@ -923,24 +964,15 @@ krt_copy_config(struct proto_config *dest, struct proto_config *src) krt_sys_copy_config(d, s); } -static int -krt_get_attr(const eattr *a, byte *buf, int buflen) -{ - switch (a->id) - { - case EA_KRT_SOURCE: - bsprintf(buf, "source"); - return GA_NAME; - - case EA_KRT_METRIC: - bsprintf(buf, "metric"); - return GA_NAME; - - default: - return krt_sys_get_attr(a, buf, buflen); - } -} +struct ea_class ea_krt_source = { + .name = "krt_source", + .type = T_INT, +}; +struct ea_class ea_krt_metric = { + .name = "krt_metric", + .type = T_INT, +}; #ifdef CONFIG_IP6_SADR_KERNEL #define MAYBE_IP6_SADR NB_IP6_SADR @@ -957,7 +989,6 @@ krt_get_attr(const eattr *a, byte *buf, int buflen) struct protocol proto_unix_kernel = { .name = "Kernel", .template = "kernel%d", - .class = PROTOCOL_KERNEL, .preference = DEF_PREF_INHERITED, .channel_mask = NB_IP | MAYBE_IP6_SADR | MAYBE_MPLS, .proto_size = sizeof(struct krt_proto), @@ -969,5 +1000,15 @@ struct protocol proto_unix_kernel = { .shutdown = krt_shutdown, .reconfigure = krt_reconfigure, .copy_config = krt_copy_config, - .get_attr = krt_get_attr, }; + +void +krt_build(void) +{ + proto_build(&proto_unix_kernel); + + EA_REGISTER_ALL( + &ea_krt_source, + &ea_krt_metric, + ); +} diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index 968c5b16..9f7ebb4f 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -21,8 +21,7 @@ struct kif_proto; #define KRT_DEFAULT_ECMP_LIMIT 16 -#define EA_KRT_SOURCE EA_CODE(PROTOCOL_KERNEL, 0) -#define EA_KRT_METRIC EA_CODE(PROTOCOL_KERNEL, 1) +extern struct ea_class ea_krt_source, ea_krt_metric; #define KRT_REF_SEEN 0x1 /* Seen in table */ #define KRT_REF_BEST 0x2 /* Best in table */ @@ -51,10 +50,7 @@ struct krt_proto { struct proto p; struct krt_state sys; /* Sysdep state */ -#ifndef CONFIG_ALL_TABLES_AT_ONCE timer *scan_timer; -#endif - struct bmap sync_map; /* Keeps track which exported routes were successfully written to kernel */ struct bmap seen_map; /* Routes seen during last periodic scan */ node krt_node; /* Node in krt_proto_list */ @@ -76,6 +72,7 @@ extern pool *krt_pool; struct proto_config * kif_init_config(int class); void kif_request_scan(void); +void krt_use_shared_scan(void); void krt_got_route(struct krt_proto *p, struct rte *e, s8 src); void krt_got_route_async(struct krt_proto *p, struct rte *e, int new, s8 src); diff --git a/sysdep/unix/log.c b/sysdep/unix/log.c index 68a04e78..be7f8adf 100644 --- a/sysdep/unix/log.c +++ b/sysdep/unix/log.c @@ -32,14 +32,15 @@ #include "lib/lists.h" #include "sysdep/unix/unix.h" +static int dbg_fd = -1; static FILE *dbgf; static list *current_log_list; static char *current_syslog_name; /* NULL -> syslog closed */ -static _Atomic uint max_coro_id = ATOMIC_VAR_INIT(1); -static _Thread_local uint this_coro_id; +static _Atomic uint max_thread_id = ATOMIC_VAR_INIT(1); +static _Thread_local uint this_thread_id; -#define THIS_CORO_ID (this_coro_id ?: (this_coro_id = atomic_fetch_add_explicit(&max_coro_id, 1, memory_order_acq_rel))) +#define THIS_THREAD_ID (this_thread_id ?: (this_thread_id = atomic_fetch_add_explicit(&max_thread_id, 1, memory_order_acq_rel))) #include <pthread.h> @@ -183,7 +184,7 @@ log_commit(int class, buffer *buf) l->pos += msg_len; } - fprintf(l->fh, "%s [%04x] <%s> ", tbuf, THIS_CORO_ID, class_names[class]); + fprintf(l->fh, "%s [%04x] <%s> ", tbuf, THIS_THREAD_ID, class_names[class]); } fputs(buf->start, l->fh); fputc('\n', l->fh); @@ -313,6 +314,7 @@ debug(const char *msg, ...) va_start(args, msg); if (dbgf) { +#if 0 struct timespec dbg_time; clock_gettime(CLOCK_MONOTONIC, &dbg_time); uint nsec; @@ -329,10 +331,10 @@ debug(const char *msg, ...) sec = dbg_time.tv_sec - dbg_time_start.tv_sec - 1; } - int n = bsnprintf(pos, max, "%u.%09u: [%04x] ", sec, nsec, THIS_CORO_ID); + int n = bsnprintf(pos, max, "%u.%09u: [%04x] ", sec, nsec, THIS_THREAD_ID); pos += n; max -= n; - +#endif if (bvsnprintf(pos, max, msg, args) < 0) bug("Extremely long debug output, split it."); @@ -341,6 +343,21 @@ debug(const char *msg, ...) va_end(args); } +/** + * debug_safe - async-safe write to debug output + * @msg: a string message + * + * This function prints the message @msg to the debugging output in a + * way that is async safe and can be used in signal handlers. No newline + * character is appended. + */ +void +debug_safe(const char *msg) +{ + if (dbg_fd >= 0) + write(dbg_fd, msg, strlen(msg)); +} + static list * default_log_list(int initial, const char **syslog_name) { @@ -388,21 +405,6 @@ default_log_list(int initial, const char **syslog_name) } void -log_cleanup(int syslog) -{ - struct log_config *l; - - if (current_log_list) - WALK_LIST(l, *current_log_list) - if (l->rf) - log_close(l); - - if (syslog && current_syslog_name) - closelog(); -} - - -void log_switch(int initial, list *logs, const char *new_syslog_name) { struct log_config *l; @@ -414,7 +416,10 @@ log_switch(int initial, list *logs, const char *new_syslog_name) logs = default_log_list(initial, &new_syslog_name); /* Close the logs to avoid pinning them on disk when deleted */ - log_cleanup(0); + if (current_log_list) + WALK_LIST(l, *current_log_list) + if (l->rf) + log_close(l); /* Reopen the logs, needed for 'configure undo' */ if (logs) @@ -453,8 +458,10 @@ log_init_debug(char *f) { clock_gettime(CLOCK_MONOTONIC, &dbg_time_start); + dbg_fd = -1; if (dbgf && dbgf != stderr) fclose(dbgf); + if (!f) dbgf = NULL; else if (!*f) @@ -465,6 +472,10 @@ log_init_debug(char *f) fprintf(stderr, "bird: Unable to open debug file %s: %s\n", f, strerror(errno)); exit(1); } + if (dbgf) + { setvbuf(dbgf, NULL, _IONBF, 0); + dbg_fd = fileno(dbgf); + } } diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index c326ba2b..0337c755 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -31,7 +31,7 @@ #include "lib/locking.h" #include "lib/timer.h" #include "lib/string.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "nest/iface.h" #include "nest/cli.h" @@ -52,12 +52,12 @@ async_dump(void) { debug("INTERNAL STATE DUMP\n\n"); - rp_dump(&root_pool); + rdump(&root_pool, 0); sk_dump_all(); // XXXX tm_dump_all(); if_dump_all(); neigh_dump_all(); - rta_dump_all(); + ea_dump_all(); rt_dump_all(); protos_dump_all(); @@ -117,7 +117,7 @@ add_num_const(char *name, int val, const char *file, const uint line) struct f_val *v = cfg_alloc(sizeof(struct f_val)); *v = (struct f_val) { .type = T_INT, .val.i = val }; struct symbol *sym = cf_get_symbol(name); - if (sym->class && (sym->scope == conf_this_scope)) + if (sym->class && cf_symbol_is_local(sym)) cf_error("Error reading value for %s from %s:%d: already defined", name, file, line); cf_define_symbol(sym, SYM_CONSTANT | T_INT, val, v); @@ -200,10 +200,10 @@ sysdep_preconfig(struct config *c) } int -sysdep_commit(struct config *new, struct config *old UNUSED) +sysdep_commit(struct config *new, struct config *old) { - if (!new->shutdown) - log_switch(0, &new->logfiles, new->syslog_name); + log_switch(0, &new->logfiles, new->syslog_name); + bird_thread_commit(new, old); return 0; } @@ -245,6 +245,8 @@ async_config(void) { struct config *conf; + config_free_old(); + log(L_INFO "Reconfiguration requested by SIGHUP"); if (!unix_read_config(&conf, config_name)) { @@ -283,6 +285,9 @@ cmd_read_config(const char *name) void cmd_check_config(const char *name) { + if (cli_access_restricted()) + return; + struct config *conf = cmd_read_config(name); if (!conf) return; @@ -327,6 +332,8 @@ cmd_reconfig(const char *name, int type, uint timeout) if (cli_access_restricted()) return; + config_free_old(); + struct config *conf = cmd_read_config(name); if (!conf) return; @@ -397,7 +404,8 @@ static char *path_control_socket = PATH_CONTROL_SOCKET; static void cli_write(cli *c) { - sock *s = c->priv; + sock *s = c->sock; + ASSERT_DIE(c->sock); while (c->tx_pos) { @@ -421,7 +429,9 @@ cli_write(cli *c) void cli_write_trigger(cli *c) { - sock *s = c->priv; + sock *s = c->sock; + if (!s) + return; if (s->tbuf == NULL) cli_write(c); @@ -436,8 +446,9 @@ cli_tx(sock *s) int cli_get_command(cli *c) { - sock *s = c->priv; - byte *t = c->rx_aux ? : s->rbuf; + sock *s = c->sock; + ASSERT_DIE(c->sock); + byte *t = s->rbuf; byte *tend = s->rpos; byte *d = c->rx_pos; byte *dend = c->rx_buf + CLI_RX_BUF_SIZE - 2; @@ -448,16 +459,22 @@ cli_get_command(cli *c) t++; else if (*t == '\n') { + *d = 0; t++; + + /* Move remaining data and reset pointers */ + uint rest = (t < tend) ? (tend - t) : 0; + memmove(s->rbuf, t, rest); + s->rpos = s->rbuf + rest; c->rx_pos = c->rx_buf; - c->rx_aux = t; - *d = 0; + return (d < dend) ? 1 : -1; } else if (d < dend) *d++ = *t++; } - c->rx_aux = s->rpos = s->rbuf; + + s->rpos = s->rbuf; c->rx_pos = d; return 0; } @@ -479,6 +496,7 @@ cli_err(sock *s, int err) else log(L_INFO "CLI connection closed"); } + cli_free(s->data); } @@ -504,7 +522,6 @@ cli_connect(sock *s, uint size UNUSED) s->pool = c->pool; /* We need to have all the socket buffers allocated in the cli pool */ s->fast_rx = 1; c->rx_pos = c->rx_buf; - c->rx_aux = NULL; rmove(s, c->pool); return 1; } @@ -525,7 +542,7 @@ cli_init_unix(uid_t use_uid, gid_t use_gid) /* Return value intentionally ignored */ unlink(path_control_socket); - if (sk_open_unix(s, path_control_socket) < 0) + if (sk_open_unix(s, &main_birdloop, path_control_socket) < 0) die("Cannot create control socket %s: %m", path_control_socket); if (use_uid || use_gid) @@ -615,7 +632,6 @@ sysdep_shutdown_done(void) unlink_pid_file(); unlink(path_control_socket); log_msg(L_FATAL "Shutdown completed"); - log_cleanup(1); exit(0); } @@ -686,7 +702,7 @@ signal_init(void) * Parsing of command-line arguments */ -static char *opt_list = "c:dD:ps:P:u:g:flRh"; +static char *opt_list = "bc:dD:ps:P:u:g:flRh"; int parse_and_exit; char *bird_name; static char *use_user; @@ -865,8 +881,6 @@ parse_args(int argc, char **argv) } } -void resource_sys_init(void); - /* * Hic Est main() */ @@ -880,19 +894,17 @@ main(int argc, char **argv) #endif times_update(); - resource_sys_init(); parse_args(argc, argv); log_switch(1, NULL, NULL); the_bird_lock(); random_init(); - net_init(); resource_init(); birdloop_init(); olock_init(); - io_init(); rt_init(); + io_init(); if_init(); // roa_init(); config_init(); @@ -916,14 +928,14 @@ main(int argc, char **argv) open_pid_file(); protos_build(); - proto_build(&proto_unix_kernel); - proto_build(&proto_unix_iface); struct config *conf = read_config(); if (parse_and_exit) exit(0); + flush_local_pages(); + if (!run_in_foreground) { pid_t pid = fork(); diff --git a/sysdep/unix/unix.h b/sysdep/unix/unix.h index 51ec1b1e..606b79cd 100644 --- a/sysdep/unix/unix.h +++ b/sysdep/unix/unix.h @@ -9,6 +9,9 @@ #ifndef _BIRD_UNIX_H_ #define _BIRD_UNIX_H_ +#include "nest/bird.h" +#include "lib/io-loop.h" + #include <sys/socket.h> #include <signal.h> @@ -16,6 +19,7 @@ struct pool; struct iface; struct birdsock; struct rfile; +struct config; /* main.c */ @@ -32,6 +36,8 @@ void cmd_reconfig_undo(void); void cmd_reconfig_status(void); void cmd_shutdown(void); void cmd_graceful_restart(void); +void cmd_show_threads(int); +void bird_thread_commit(struct config *new, struct config *old); #define UNIX_DEFAULT_CONFIGURE_TIMEOUT 300 @@ -107,7 +113,7 @@ extern volatile sig_atomic_t async_shutdown_flag; void io_init(void); void io_loop(void); void io_log_dump(void); -int sk_open_unix(struct birdsock *s, char *name); +int sk_open_unix(struct birdsock *s, struct birdloop *, char *name); struct rfile *rf_open(struct pool *, const char *name, const char *mode); void *rf_file(struct rfile *f); int rf_fileno(struct rfile *f); @@ -122,7 +128,6 @@ void krt_io_init(void); void main_thread_init(void); void log_init_debug(char *); /* Initialize debug dump to given file (NULL=stderr, ""=off) */ void log_switch(int initial, list *l, const char *); -void log_cleanup(int syslog); struct log_config { node n; diff --git a/test/birdtest.c b/test/birdtest.c index c6a09684..cdca3829 100644 --- a/test/birdtest.c +++ b/test/birdtest.c @@ -20,6 +20,8 @@ #include "test/birdtest.h" #include "lib/string.h" +#include "lib/event.h" +#include "lib/io-loop.h" #ifdef HAVE_EXECINFO_H #include <execinfo.h> @@ -58,14 +60,14 @@ u64 bt_random_state[] = { 0x53d9772877c1b647, 0xab8ce3eb466de6c5, 0xad02844c8a8e865f, 0xe8cc78080295065d }; -void resource_sys_init(void); - void bt_init(int argc, char *argv[]) { int c; - resource_sys_init(); + /* We have no interest in stdin */ + close(0); + initstate(BT_RANDOM_SEED, (char *) bt_random_state, sizeof(bt_random_state)); bt_verbose = 0; @@ -122,6 +124,11 @@ bt_init(int argc, char *argv[]) clock_gettime(CLOCK_MONOTONIC, &bt_begin); bt_suite_case_begin = bt_suite_begin = bt_begin; + the_bird_lock(); + resource_init(); + ev_init_list(&global_event_list, &main_birdloop, "Global event list in unit tests"); + ev_init_list(&global_work_list, &main_birdloop, "Global work list in unit tests"); + birdloop_init(); return; usage: @@ -175,6 +182,8 @@ int bt_run_test_fn(int (*fn)(const void *), const void *fn_arg, int timeout) if (!bt_suite_result) result = 0; + tmp_flush(); + return result; } @@ -312,6 +321,12 @@ bt_log_suite_case_result(int result, const char *fmt, ...) } } +void +bt_reset_suite_case_timer(void) +{ + clock_gettime(CLOCK_MONOTONIC, &bt_suite_case_begin); +} + int bt_test_suite_base(int (*fn)(const void *), const char *id, const void *fn_arg, int forked, int timeout, const char *dsc, ...) { @@ -504,6 +519,15 @@ bt_fmt_ipa(char *buf, size_t size, const void *data) bsnprintf(buf, size, "(null)"); } +void +bt_format_net(char *buf, size_t size, const void *data) +{ + if (data) + bsnprintf(buf, size, "%N", (const net_addr *) data); + else + bsnprintf(buf, size, "(null)"); +} + int bt_is_char(byte c) { @@ -533,7 +557,14 @@ void cmd_reconfig_undo_notify(void) {} #include "lib/net.h" #include "conf/conf.h" void sysdep_preconfig(struct config *c UNUSED) {} -int sysdep_commit(struct config *new UNUSED, struct config *old UNUSED) { return 0; } + +void bird_thread_commit(struct config *new, struct config *old); +int sysdep_commit(struct config *new, struct config *old) +{ + bird_thread_commit(new, old); + return 0; +} + void sysdep_shutdown_done(void) {} #include "nest/cli.h" diff --git a/test/birdtest.h b/test/birdtest.h index 23557122..cfeebb98 100644 --- a/test/birdtest.h +++ b/test/birdtest.h @@ -32,6 +32,7 @@ extern const char *bt_test_id; void bt_init(int argc, char *argv[]); int bt_exit_value(void); +void bt_reset_suite_case_timer(void); int bt_test_suite_base(int (*test_fn)(const void *), const char *test_id, const void *test_fn_argument, int forked, int timeout, const char *dsc, ...); static inline u64 bt_random(void) { return ((u64) random() & 0xffffffff) | ((u64) random() << 32); } @@ -39,7 +40,7 @@ static inline u64 bt_random(void) void bt_log_suite_result(int result, const char *fmt, ...); void bt_log_suite_case_result(int result, const char *fmt, ...); -#define BT_TIMEOUT 30 /* Default timeout in seconds */ +#define BT_TIMEOUT 60 /* Default timeout in seconds */ #define BT_FORKING 1 /* Forking is enabled in default */ #define BT_RANDOM_SEED 0x5097d2bb @@ -165,6 +166,8 @@ struct bt_batch { void bt_fmt_str(char *buf, size_t size, const void *data); void bt_fmt_unsigned(char *buf, size_t size, const void *data); void bt_fmt_ipa(char *buf, size_t size, const void *data); +void bt_format_net(char *buf, size_t size, const void *data); + int bt_assert_batch__(struct bt_batch *opts); int bt_is_char(byte c); diff --git a/test/bt-utils.c b/test/bt-utils.c index 98aaed3d..d497840f 100644 --- a/test/bt-utils.c +++ b/test/bt-utils.c @@ -14,7 +14,7 @@ #include "test/bt-utils.h" #include "nest/bird.h" -#include "nest/route.h" +#include "nest/rt.h" #include "nest/protocol.h" #include "sysdep/unix/unix.h" @@ -58,32 +58,17 @@ void resource_sys_init(void); void bt_bird_init(void) { - resource_sys_init(); if(bt_verbose) log_init_debug(""); log_switch(bt_verbose != 0, NULL, NULL); - the_bird_lock(); - resource_init(); - birdloop_init(); olock_init(); - io_init(); rt_init(); + io_init(); if_init(); config_init(); protos_build(); - proto_build(&proto_unix_kernel); - proto_build(&proto_unix_iface); -} - -void bt_bird_cleanup(void) -{ - for (int i = 0; i < PROTOCOL__MAX; i++) - class_to_protocol[i] = NULL; - - config = new_config = NULL; - the_bird_unlock(); } static char * diff --git a/test/bt-utils.h b/test/bt-utils.h index 13d267cc..758dbf48 100644 --- a/test/bt-utils.h +++ b/test/bt-utils.h @@ -28,7 +28,6 @@ uint bt_naive_pow(uint base, uint power); void bt_bytes_to_hex(char *buf, const byte *in_data, size_t size); void bt_bird_init(void); -void bt_bird_cleanup(void); struct config *bt_config_parse(const char *cfg); struct config *bt_config_file_parse(const char *filepath); |