summaryrefslogtreecommitdiff
path: root/sysdep
diff options
context:
space:
mode:
Diffstat (limited to 'sysdep')
-rw-r--r--sysdep/bsd/krt-sock.c53
-rw-r--r--sysdep/bsd/sysio.h6
-rw-r--r--sysdep/cf/README1
-rw-r--r--sysdep/cf/linux.h1
-rw-r--r--sysdep/config.h2
-rw-r--r--sysdep/linux/krt-sys.h33
-rw-r--r--sysdep/linux/netlink.Y50
-rw-r--r--sysdep/linux/netlink.c890
-rw-r--r--sysdep/linux/sysio.h19
-rw-r--r--sysdep/unix/Makefile2
-rw-r--r--sysdep/unix/alloc.c226
-rw-r--r--sysdep/unix/domain.c (renamed from sysdep/unix/coroutine.c)78
-rw-r--r--sysdep/unix/io-loop.c97
-rw-r--r--sysdep/unix/io-loop.h11
-rw-r--r--sysdep/unix/io.c37
-rw-r--r--sysdep/unix/krt.Y5
-rw-r--r--sysdep/unix/krt.c222
-rw-r--r--sysdep/unix/krt.h7
-rw-r--r--sysdep/unix/log.c10
-rw-r--r--sysdep/unix/main.c42
20 files changed, 938 insertions, 854 deletions
diff --git a/sysdep/bsd/krt-sock.c b/sysdep/bsd/krt-sock.c
index 6f788ac2..1c1bd50c 100644
--- a/sysdep/bsd/krt-sock.c
+++ b/sysdep/bsd/krt-sock.c
@@ -25,7 +25,7 @@
#include "nest/bird.h"
#include "nest/iface.h"
-#include "nest/route.h"
+#include "nest/rt.h"
#include "nest/protocol.h"
#include "nest/iface.h"
#include "sysdep/unix/unix.h"
@@ -366,6 +366,30 @@ krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old)
}
}
+/**
+ * krt_assume_onlink - check if routes on interface are considered onlink
+ * @iface: The interface of the next hop
+ * @ipv6: Switch to only consider IPv6 or IPv4 addresses.
+ *
+ * The BSD kernel does not support an onlink flag. If the interface has only
+ * host addresses configured, all routes should be considered as onlink and
+ * the function returns 1.
+ */
+static int
+krt_assume_onlink(struct iface *iface, int ipv6)
+{
+ const u8 type = ipv6 ? NET_IP6 : NET_IP4;
+
+ struct ifa *ifa;
+ WALK_LIST(ifa, iface->addrs)
+ {
+ if ((ifa->prefix.type == type) && !(ifa->flags & IA_HOST))
+ return 0;
+ }
+
+ return 1;
+}
+
#define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0)
static void
@@ -494,10 +518,10 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan)
net = net_get(p->p.main_channel->table, &ndst);
rta a = {
- .source = RTS_INHERIT,
- .scope = SCOPE_UNIVERSE,
};
+ ea_set_attr_u32(&a->eattrs, &ea_gen_source, 0, RTS_INHERIT);
+
/* reject/blackhole routes have also set RTF_GATEWAY,
we wil check them first. */
@@ -526,15 +550,21 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan)
a.dest = RTD_UNICAST;
if (flags & RTF_GATEWAY)
{
- neighbor *ng;
a.nh.gw = igate;
/* Clean up embedded interface ID returned in link-local address */
if (ipa_is_link_local(a.nh.gw))
_I0(a.nh.gw) = 0xfe800000;
- ng = neigh_find(&p->p, a.nh.gw, a.nh.iface, 0);
- if (!ng || (ng->scope == SCOPE_HOST))
+ /* The BSD kernel does not support an onlink flag. We heuristically
+ set the onlink flag, if the iface has only host addresses. */
+ if (krt_assume_onlink(a.nh.iface, ipv6))
+ a.nh.flags |= RNF_ONLINK;
+
+ neighbor *nbr;
+ nbr = neigh_find(&p->p, a.nh.gw, a.nh.iface,
+ (a.nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0);
+ if (!nbr || (nbr->scope == SCOPE_HOST))
{
/* Ignore routes with next-hop 127.0.0.1, host routes with such
next-hop appear on OpenBSD for address aliases. */
@@ -550,15 +580,8 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan)
done:;
rte e0 = { .attrs = &a, .net = net, };
- ea_list *ea = alloca(sizeof(ea_list) + 1 * sizeof(eattr));
- *ea = (ea_list) { .count = 1, .next = e0.attrs->eattrs };
- e0.attrs->eattrs = ea;
-
- ea->attrs[0] = (eattr) {
- .id = EA_KRT_SOURCE,
- .type = EAF_TYPE_INT,
- .u.data = src2,
- };
+ ea_set_attr(e0.attrs->eattrs,
+ EA_LITERAL_EMBEDDED(EA_KRT_SOURCE, T_INT, 0, src2));
if (scan)
krt_got_route(p, &e0, src);
diff --git a/sysdep/bsd/sysio.h b/sysdep/bsd/sysio.h
index c757960a..f1887fb4 100644
--- a/sysdep/bsd/sysio.h
+++ b/sysdep/bsd/sysio.h
@@ -271,3 +271,9 @@ sk_set_priority(sock *s, int prio UNUSED)
{
ERR_MSG("Socket priority not supported");
}
+
+static inline int
+sk_set_freebind(sock *s)
+{
+ ERR_MSG("Freebind is not supported");
+}
diff --git a/sysdep/cf/README b/sysdep/cf/README
index 9a7a4afa..68078bbe 100644
--- a/sysdep/cf/README
+++ b/sysdep/cf/README
@@ -4,7 +4,6 @@ Available configuration variables:
CONFIG_AUTO_ROUTES Device routes are added automagically by the kernel
CONFIG_SELF_CONSCIOUS We're able to recognize whether route was installed by us
CONFIG_MULTIPLE_TABLES The kernel supports multiple routing tables
-CONFIG_ALL_TABLES_AT_ONCE Kernel scanner wants to process all tables at once
CONFIG_SINGLE_ROUTE There is only one route per network
CONFIG_MC_PROPER_SRC Multicast packets have source address according to socket saddr field
diff --git a/sysdep/cf/linux.h b/sysdep/cf/linux.h
index 047d3764..c640bef4 100644
--- a/sysdep/cf/linux.h
+++ b/sysdep/cf/linux.h
@@ -9,7 +9,6 @@
#define CONFIG_AUTO_ROUTES
#define CONFIG_SELF_CONSCIOUS
#define CONFIG_MULTIPLE_TABLES
-#define CONFIG_ALL_TABLES_AT_ONCE
#define CONFIG_IP6_SADR_KERNEL
#define CONFIG_MC_PROPER_SRC
diff --git a/sysdep/config.h b/sysdep/config.h
index 55be90f0..5cdadbb0 100644
--- a/sysdep/config.h
+++ b/sysdep/config.h
@@ -13,7 +13,7 @@
#ifdef GIT_LABEL
#define BIRD_VERSION XSTR1(GIT_LABEL)
#else
-#define BIRD_VERSION "2.0.8"
+#define BIRD_VERSION "2.0.10"
#endif
/* Include parameters determined by configure script */
diff --git a/sysdep/linux/krt-sys.h b/sysdep/linux/krt-sys.h
index a8af4c95..aa90f6e4 100644
--- a/sysdep/linux/krt-sys.h
+++ b/sysdep/linux/krt-sys.h
@@ -34,41 +34,10 @@ static inline struct ifa * kif_get_primary_ip(struct iface *i UNUSED) { return N
#define KRT_ALLOW_MERGE_PATHS 1
-#define EA_KRT_PREFSRC EA_CODE(PROTOCOL_KERNEL, 0x10)
-#define EA_KRT_REALM EA_CODE(PROTOCOL_KERNEL, 0x11)
-#define EA_KRT_SCOPE EA_CODE(PROTOCOL_KERNEL, 0x12)
-
-
-#define KRT_METRICS_MAX 0x10 /* RTAX_QUICKACK+1 */
-#define KRT_METRICS_OFFSET 0x20 /* Offset of EA_KRT_* vs RTAX_* */
-
-#define KRT_FEATURES_MAX 4
-
-/*
- * Following attributes are parts of RTA_METRICS kernel route attribute, their
- * ids must be consistent with their RTAX_* constants (+ KRT_METRICS_OFFSET)
- */
-#define EA_KRT_METRICS EA_CODE(PROTOCOL_KERNEL, 0x20) /* Dummy one */
-#define EA_KRT_LOCK EA_CODE(PROTOCOL_KERNEL, 0x21)
-#define EA_KRT_MTU EA_CODE(PROTOCOL_KERNEL, 0x22)
-#define EA_KRT_WINDOW EA_CODE(PROTOCOL_KERNEL, 0x23)
-#define EA_KRT_RTT EA_CODE(PROTOCOL_KERNEL, 0x24)
-#define EA_KRT_RTTVAR EA_CODE(PROTOCOL_KERNEL, 0x25)
-#define EA_KRT_SSTRESH EA_CODE(PROTOCOL_KERNEL, 0x26)
-#define EA_KRT_CWND EA_CODE(PROTOCOL_KERNEL, 0x27)
-#define EA_KRT_ADVMSS EA_CODE(PROTOCOL_KERNEL, 0x28)
-#define EA_KRT_REORDERING EA_CODE(PROTOCOL_KERNEL, 0x29)
-#define EA_KRT_HOPLIMIT EA_CODE(PROTOCOL_KERNEL, 0x2a)
-#define EA_KRT_INITCWND EA_CODE(PROTOCOL_KERNEL, 0x2b)
-#define EA_KRT_FEATURES EA_CODE(PROTOCOL_KERNEL, 0x2c)
-#define EA_KRT_RTO_MIN EA_CODE(PROTOCOL_KERNEL, 0x2d)
-#define EA_KRT_INITRWND EA_CODE(PROTOCOL_KERNEL, 0x2e)
-#define EA_KRT_QUICKACK EA_CODE(PROTOCOL_KERNEL, 0x2f)
-
-
struct krt_params {
u32 table_id; /* Kernel table ID we sync with */
u32 metric; /* Kernel metric used for all routes */
+ uint netlink_rx_buffer; /* Rx buffer size for the netlink socket */
};
struct krt_state {
diff --git a/sysdep/linux/netlink.Y b/sysdep/linux/netlink.Y
index 7097f577..7ba8c7c9 100644
--- a/sysdep/linux/netlink.Y
+++ b/sysdep/linux/netlink.Y
@@ -10,9 +10,7 @@ CF_HDR
CF_DECLS
-CF_KEYWORDS(KERNEL, TABLE, METRIC, KRT_PREFSRC, KRT_REALM, KRT_SCOPE, KRT_MTU, KRT_WINDOW,
- KRT_RTT, KRT_RTTVAR, KRT_SSTRESH, KRT_CWND, KRT_ADVMSS, KRT_REORDERING,
- KRT_HOPLIMIT, KRT_INITCWND, KRT_RTO_MIN, KRT_INITRWND, KRT_QUICKACK,
+CF_KEYWORDS(KERNEL, TABLE, METRIC, NETLINK, RX, BUFFER,
KRT_LOCK_MTU, KRT_LOCK_WINDOW, KRT_LOCK_RTT, KRT_LOCK_RTTVAR,
KRT_LOCK_SSTRESH, KRT_LOCK_CWND, KRT_LOCK_ADVMSS, KRT_LOCK_REORDERING,
KRT_LOCK_HOPLIMIT, KRT_LOCK_RTO_MIN, KRT_FEATURE_ECN, KRT_FEATURE_ALLFRAG)
@@ -24,41 +22,25 @@ kern_proto: kern_proto kern_sys_item ';' ;
kern_sys_item:
KERNEL TABLE expr { THIS_KRT->sys.table_id = $3; }
| METRIC expr { THIS_KRT->sys.metric = $2; }
+ | NETLINK RX BUFFER expr { THIS_KRT->sys.netlink_rx_buffer = $4; }
;
-dynamic_attr: KRT_PREFSRC { $$ = f_new_dynamic_attr(EAF_TYPE_IP_ADDRESS, T_IP, EA_KRT_PREFSRC); } ;
-dynamic_attr: KRT_REALM { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_REALM); } ;
-dynamic_attr: KRT_SCOPE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SCOPE); } ;
-
-dynamic_attr: KRT_MTU { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_MTU); } ;
-dynamic_attr: KRT_WINDOW { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_WINDOW); } ;
-dynamic_attr: KRT_RTT { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTT); } ;
-dynamic_attr: KRT_RTTVAR { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTTVAR); } ;
-dynamic_attr: KRT_SSTRESH { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SSTRESH); } ;
-dynamic_attr: KRT_CWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_CWND); } ;
-dynamic_attr: KRT_ADVMSS { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_ADVMSS); } ;
-dynamic_attr: KRT_REORDERING { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_REORDERING); } ;
-dynamic_attr: KRT_HOPLIMIT { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_HOPLIMIT); } ;
-dynamic_attr: KRT_INITCWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_INITCWND); } ;
-dynamic_attr: KRT_RTO_MIN { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTO_MIN); } ;
-dynamic_attr: KRT_INITRWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_INITRWND); } ;
-dynamic_attr: KRT_QUICKACK { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_QUICKACK); } ;
-
/* Bits of EA_KRT_LOCK, based on RTAX_* constants */
-dynamic_attr: KRT_LOCK_MTU { $$ = f_new_dynamic_attr_bit(2, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_WINDOW { $$ = f_new_dynamic_attr_bit(3, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_RTT { $$ = f_new_dynamic_attr_bit(4, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_RTTVAR { $$ = f_new_dynamic_attr_bit(5, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_SSTRESH { $$ = f_new_dynamic_attr_bit(6, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_CWND { $$ = f_new_dynamic_attr_bit(7, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_ADVMSS { $$ = f_new_dynamic_attr_bit(8, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_REORDERING { $$ = f_new_dynamic_attr_bit(9, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_HOPLIMIT { $$ = f_new_dynamic_attr_bit(10, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_RTO_MIN { $$ = f_new_dynamic_attr_bit(13, T_BOOL, EA_KRT_LOCK); } ;
-
-dynamic_attr: KRT_FEATURE_ECN { $$ = f_new_dynamic_attr_bit(0, T_BOOL, EA_KRT_FEATURES); } ;
-dynamic_attr: KRT_FEATURE_ALLFRAG { $$ = f_new_dynamic_attr(3, T_BOOL, EA_KRT_FEATURES); } ;
+attr_bit: KRT_LOCK_MTU { $$ = f_new_dynamic_attr_bit(2, "krt_lock"); } ;
+attr_bit: KRT_LOCK_WINDOW { $$ = f_new_dynamic_attr_bit(3, "krt_lock"); } ;
+attr_bit: KRT_LOCK_RTT { $$ = f_new_dynamic_attr_bit(4, "krt_lock"); } ;
+attr_bit: KRT_LOCK_RTTVAR { $$ = f_new_dynamic_attr_bit(5, "krt_lock"); } ;
+attr_bit: KRT_LOCK_SSTRESH { $$ = f_new_dynamic_attr_bit(6, "krt_lock"); } ;
+attr_bit: KRT_LOCK_CWND { $$ = f_new_dynamic_attr_bit(7, "krt_lock"); } ;
+attr_bit: KRT_LOCK_ADVMSS { $$ = f_new_dynamic_attr_bit(8, "krt_lock"); } ;
+attr_bit: KRT_LOCK_REORDERING { $$ = f_new_dynamic_attr_bit(9, "krt_lock"); } ;
+attr_bit: KRT_LOCK_HOPLIMIT { $$ = f_new_dynamic_attr_bit(10, "krt_lock"); } ;
+attr_bit: KRT_LOCK_RTO_MIN { $$ = f_new_dynamic_attr_bit(13, "krt_lock"); } ;
+
+/* Bits of EA_KRT_FEATURES */
+attr_bit: KRT_FEATURE_ECN { $$ = f_new_dynamic_attr_bit(0, "krt_features"); } ;
+attr_bit: KRT_FEATURE_ALLFRAG { $$ = f_new_dynamic_attr_bit(3, "krt_features"); } ;
CF_CODE
diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c
index bff2d579..94a37a73 100644
--- a/sysdep/linux/netlink.c
+++ b/sysdep/linux/netlink.c
@@ -17,7 +17,7 @@
#undef LOCAL_DEBUG
#include "nest/bird.h"
-#include "nest/route.h"
+#include "nest/rt.h"
#include "nest/protocol.h"
#include "nest/iface.h"
#include "lib/alloca.h"
@@ -26,6 +26,7 @@
#include "lib/socket.h"
#include "lib/string.h"
#include "lib/hash.h"
+#include "lib/macro.h"
#include "conf/conf.h"
#include <asm/types.h>
@@ -69,54 +70,118 @@
#define RTA_ENCAP 22
#endif
+#ifndef NETLINK_GET_STRICT_CHK
+#define NETLINK_GET_STRICT_CHK 12
+#endif
+
#define krt_ipv4(p) ((p)->af == AF_INET)
-#define krt_ecmp6(p) ((p)->af == AF_INET6)
const int rt_default_ecmp = 16;
-/*
- * Structure nl_parse_state keeps state of received route processing. Ideally,
- * we could just independently parse received Netlink messages and immediately
- * propagate received routes to the rest of BIRD, but older Linux kernel (before
- * version 4.11) represents and announces IPv6 ECMP routes not as one route with
- * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of
- * routes with the same prefix. More recent kernels work as with IPv4.
- *
- * Therefore, BIRD keeps currently processed route in nl_parse_state structure
- * and postpones its propagation until we expect it to be final; i.e., when
- * non-matching route is received or when the scan ends. When another matching
- * route is received, it is merged with the already processed route to form an
- * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the
- * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6
- * routes with RTA_MULTIPATH set are just considered non-matching.
- *
- * This is ignored for asynchronous notifications (every notification is handled
- * as a separate route). It is not an issue for our routes, as we ignore such
- * notifications anyways. But importing alien IPv6 ECMP routes does not work
- * properly with older kernels.
- *
- * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes
- * for the same prefix.
- */
-
struct nl_parse_state
{
+ struct krt_proto *proto;
struct linpool *pool;
int scan;
- int merge;
- net_addr *net;
- rta *attrs;
- struct krt_proto *proto;
- s8 new;
- s8 krt_src;
- u8 krt_type;
- u8 krt_proto;
- u32 krt_metric;
+ u32 rta_flow;
+};
+
+/*
+ * Netlink eattr definitions
+ */
+
+#define KRT_METRICS_MAX ARRAY_SIZE(ea_krt_metrics)
+#define KRT_FEATURES_MAX 4
+
+static void krt_bitfield_format(const eattr *e, byte *buf, uint buflen);
+
+static struct ea_class
+ ea_krt_prefsrc = {
+ .name = "krt_prefsrc",
+ .type = T_IP,
+ },
+ ea_krt_realm = {
+ .name = "krt_realm",
+ .type = T_INT,
+ },
+ ea_krt_scope = {
+ .name = "krt_scope",
+ .type = T_INT,
+ };
+
+static struct ea_class ea_krt_metrics[] = {
+ [RTAX_LOCK] = {
+ .name = "krt_lock",
+ .type = T_INT,
+ .format = krt_bitfield_format,
+ },
+ [RTAX_FEATURES] = {
+ .name = "krt_features",
+ .type = T_INT,
+ .format = krt_bitfield_format,
+ },
+#define KRT_METRIC_INT(_rtax, _name) [_rtax] = { .name = _name, .type = T_INT }
+ KRT_METRIC_INT(RTAX_MTU, "krt_mtu"),
+ KRT_METRIC_INT(RTAX_WINDOW, "krt_window"),
+ KRT_METRIC_INT(RTAX_RTT, "krt_rtt"),
+ KRT_METRIC_INT(RTAX_RTTVAR, "krt_rttvar"),
+ KRT_METRIC_INT(RTAX_SSTHRESH, "krt_sstresh"),
+ KRT_METRIC_INT(RTAX_CWND, "krt_cwnd"),
+ KRT_METRIC_INT(RTAX_ADVMSS, "krt_advmss"),
+ KRT_METRIC_INT(RTAX_REORDERING, "krt_reordering"),
+ KRT_METRIC_INT(RTAX_HOPLIMIT, "krt_hoplimit"),
+ KRT_METRIC_INT(RTAX_INITCWND, "krt_initcwnd"),
+ KRT_METRIC_INT(RTAX_RTO_MIN, "krt_rto_min"),
+ KRT_METRIC_INT(RTAX_INITRWND, "krt_initrwnd"),
+ KRT_METRIC_INT(RTAX_QUICKACK, "krt_quickack"),
+#undef KRT_METRIC_INT
+};
- u32 rta_flow; /* Used during parsing */
+static const char *krt_metrics_names[KRT_METRICS_MAX] = {
+ NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss",
+ "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack"
};
+static const char *krt_features_names[KRT_FEATURES_MAX] = {
+ "ecn", NULL, NULL, "allfrag"
+};
+
+static void
+krt_bitfield_format(const eattr *a, byte *buf, uint buflen)
+{
+ if (a->id == ea_krt_metrics[RTAX_LOCK].id)
+ ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX);
+ else if (a->id == ea_krt_metrics[RTAX_FEATURES].id)
+ ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX);
+}
+
+static void
+nl_ea_register(void)
+{
+ EA_REGISTER_ALL(
+ &ea_krt_prefsrc,
+ &ea_krt_realm,
+ &ea_krt_scope
+ );
+
+ for (uint i = 0; i < KRT_METRICS_MAX; i++)
+ {
+ if (!ea_krt_metrics[i].name)
+ ea_krt_metrics[i] = (struct ea_class) {
+ .name = mb_sprintf(&root_pool, "krt_metric_%d", i),
+ .type = T_INT,
+ };
+
+ ea_register_init(&ea_krt_metrics[i]);
+ }
+
+ for (uint i = 1; i < KRT_METRICS_MAX; i++)
+ ASSERT_DIE(ea_krt_metrics[i].id == ea_krt_metrics[0].id + i);
+}
+
+
+
/*
* Synchronous Netlink interface
*/
@@ -130,7 +195,7 @@ struct nl_sock
uint last_size;
};
-#define NL_RX_SIZE 8192
+#define NL_RX_SIZE 32768
#define NL_OP_DELETE 0
#define NL_OP_ADD (NLM_F_CREATE|NLM_F_EXCL)
@@ -157,11 +222,51 @@ nl_open_sock(struct nl_sock *nl)
}
}
+static int
+nl_set_strict_dump(struct nl_sock *nl UNUSED, int strict UNUSED)
+{
+#ifdef SOL_NETLINK
+ return setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict));
+#else
+ return -1;
+#endif
+}
+
+static void
+nl_set_rcvbuf(int fd, uint val)
+{
+ if (setsockopt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &val, sizeof(val)) < 0)
+ log(L_WARN "KRT: Cannot set netlink rx buffer size to %u: %m", val);
+}
+
+static uint
+nl_cfg_rx_buffer_size(struct config *cfg)
+{
+ uint bufsize = 0;
+
+ struct proto_config *pc;
+ WALK_LIST(pc, cfg->protos)
+ if ((pc->protocol == &proto_unix_kernel) && !pc->disabled)
+ bufsize = MAX(bufsize, ((struct krt_config *) pc)->sys.netlink_rx_buffer);
+
+ return bufsize;
+}
+
+
static void
nl_open(void)
{
+ if ((nl_scan.fd >= 0) && (nl_req.fd >= 0))
+ return;
+
nl_open_sock(&nl_scan);
nl_open_sock(&nl_req);
+
+ if (nl_set_strict_dump(&nl_scan, 1) < 0)
+ {
+ log(L_WARN "KRT: Netlink strict checking failed, will scan all tables at once");
+ krt_use_shared_scan();
+ }
}
static void
@@ -180,20 +285,72 @@ nl_send(struct nl_sock *nl, struct nlmsghdr *nh)
}
static void
-nl_request_dump(int af, int cmd)
+nl_request_dump_link(void)
{
struct {
struct nlmsghdr nh;
- struct rtgenmsg g;
+ struct ifinfomsg ifi;
} req = {
- .nh.nlmsg_type = cmd,
- .nh.nlmsg_len = sizeof(req),
+ .nh.nlmsg_type = RTM_GETLINK,
+ .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
- .g.rtgen_family = af
+ .nh.nlmsg_seq = ++(nl_scan.seq),
+ .ifi.ifi_family = AF_UNSPEC,
};
- nl_send(&nl_scan, &req.nh);
+
+ send(nl_scan.fd, &req, sizeof(req), 0);
+ nl_scan.last_hdr = NULL;
}
+static void
+nl_request_dump_addr(int af)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct ifaddrmsg ifa;
+ } req = {
+ .nh.nlmsg_type = RTM_GETADDR,
+ .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
+ .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
+ .nh.nlmsg_seq = ++(nl_scan.seq),
+ .ifa.ifa_family = af,
+ };
+
+ send(nl_scan.fd, &req, sizeof(req), 0);
+ nl_scan.last_hdr = NULL;
+}
+
+static void
+nl_request_dump_route(int af, int table_id)
+{
+ struct {
+ struct nlmsghdr nh;
+ struct rtmsg rtm;
+ struct rtattr rta;
+ u32 table_id;
+ } req = {
+ .nh.nlmsg_type = RTM_GETROUTE,
+ .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
+ .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
+ .nh.nlmsg_seq = ++(nl_scan.seq),
+ .rtm.rtm_family = af,
+ };
+
+ if (table_id < 256)
+ req.rtm.rtm_table = table_id;
+ else
+ {
+ req.rta.rta_type = RTA_TABLE;
+ req.rta.rta_len = RTA_LENGTH(4);
+ req.table_id = table_id;
+ req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + req.rta.rta_len;
+ }
+
+ send(nl_scan.fd, &req, req.nh.nlmsg_len, 0);
+ nl_scan.last_hdr = NULL;
+}
+
+
static struct nlmsghdr *
nl_get_reply(struct nl_sock *nl)
{
@@ -651,12 +808,12 @@ nl_add_nexthop(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af UNUS
}
static void
-nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, ea_list *eattrs)
+nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop_adata *nhad, int af, ea_list *eattrs)
{
struct rtattr *a = nl_open_attr(h, bufsize, RTA_MULTIPATH);
- eattr *flow = ea_find(eattrs, EA_KRT_REALM);
+ eattr *flow = ea_find(eattrs, &ea_krt_realm);
- for (; nh; nh = nh->next)
+ NEXTHOP_WALK(nh, nhad)
{
struct rtnexthop *rtnh = nl_open_nexthop(h, bufsize);
@@ -680,33 +837,49 @@ nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, e
nl_close_attr(h, a);
}
-static struct nexthop *
-nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr *ra, int af)
+static struct nexthop_adata *
+nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr *n, struct rtattr *ra, int af, int krt_src)
{
struct rtattr *a[BIRD_RTA_MAX];
- struct rtnexthop *nh = RTA_DATA(ra);
- struct nexthop *rv, *first, **last;
- unsigned len = RTA_PAYLOAD(ra);
+ struct rtnexthop *nh, *orig_nh = RTA_DATA(ra);
+ unsigned len, orig_len = RTA_PAYLOAD(ra);
+ uint cnt = 0;
- first = NULL;
- last = &first;
+ /* First count the nexthops */
+ for (len = orig_len, nh = orig_nh; len; len -= NLMSG_ALIGN(nh->rtnh_len), nh = RTNH_NEXT(nh))
+ {
+ /* Use RTNH_OK(nh,len) ?? */
+ if ((len < sizeof(*nh)) || (len < nh->rtnh_len))
+ goto err;
- while (len)
+ if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD))
+ ;
+ else
+ cnt++;
+ }
+
+ struct nexthop_adata *nhad = lp_allocz(s->pool, cnt * NEXTHOP_MAX_SIZE + sizeof *nhad);
+ struct nexthop *rv = &nhad->nh;
+
+ for (len = orig_len, nh = orig_nh; len; len -= NLMSG_ALIGN(nh->rtnh_len), nh = RTNH_NEXT(nh))
{
/* Use RTNH_OK(nh,len) ?? */
if ((len < sizeof(*nh)) || (len < nh->rtnh_len))
- return NULL;
+ goto err;
- if (nh->rtnh_flags & RTNH_F_DEAD)
- goto next;
+ if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD))
+ continue;
- *last = rv = lp_allocz(s->pool, NEXTHOP_MAX_SIZE);
- last = &(rv->next);
+ *rv = (struct nexthop) {
+ .weight = nh->rtnh_hops,
+ .iface = if_find_by_index(nh->rtnh_ifindex),
+ };
- rv->weight = nh->rtnh_hops;
- rv->iface = if_find_by_index(nh->rtnh_ifindex);
if (!rv->iface)
- return NULL;
+ {
+ log(L_ERR "KRT: Received route %N with unknown ifindex %u", n, nh->rtnh_ifindex);
+ return NULL;
+ }
/* Nonexistent RTNH_PAYLOAD ?? */
nl_attr_len = nh->rtnh_len - RTNH_LENGTH(0);
@@ -714,18 +887,18 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr
{
case AF_INET:
if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want4, a, sizeof(a)))
- return NULL;
+ goto err;
break;
case AF_INET6:
if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want6, a, sizeof(a)))
- return NULL;
+ goto err;
break;
#ifdef HAVE_MPLS_KERNEL
case AF_MPLS:
if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want_mpls, a, sizeof(a)))
- return NULL;
+ goto err;
if (a[RTA_NEWDST])
rv->labels = rta_get_mpls(a[RTA_NEWDST], rv->label);
@@ -734,7 +907,7 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr
#endif
default:
- return NULL;
+ goto err;
}
if (a[RTA_GATEWAY])
@@ -757,14 +930,19 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr
nbr = neigh_find(&p->p, rv->gw, rv->iface,
(rv->flags & RNF_ONLINK) ? NEF_ONLINK : 0);
if (!nbr || (nbr->scope == SCOPE_HOST))
- return NULL;
+ {
+ log(L_ERR "KRT: Received route %N with strange next-hop %I", n, rv->gw);
+ return NULL;
+ }
}
#ifdef HAVE_MPLS_KERNEL
if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE])
{
- if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS) {
- log(L_WARN "KRT: Unknown encapsulation method %d in multipath", rta_get_u16(a[RTA_ENCAP_TYPE]));
+ if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS)
+ {
+ log(L_WARN "KRT: Received route %N with unknown encapsulation method %d",
+ n, rta_get_u16(a[RTA_ENCAP_TYPE]));
return NULL;
}
@@ -775,16 +953,18 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr
}
#endif
- next:
- len -= NLMSG_ALIGN(nh->rtnh_len);
- nh = RTNH_NEXT(nh);
+ rv = NEXTHOP_NEXT(rv);
}
+ /* Store final length */
+ nhad->ad.length = (void *) rv - (void *) nhad->ad.data;
+
/* Ensure nexthops are sorted to satisfy nest invariant */
- if (!nexthop_is_sorted(first))
- first = nexthop_sort(first);
+ return nexthop_is_sorted(nhad) ? nhad : nexthop_sort(nhad, s->pool);
- return first;
+err:
+ log(L_ERR "KRT: Received strange multipath route %N", n);
+ return NULL;
}
static void
@@ -1139,7 +1319,7 @@ kif_do_scan(struct kif_proto *p UNUSED)
if_start_update();
- nl_request_dump(AF_UNSPEC, RTM_GETLINK);
+ nl_request_dump_link();
while (h = nl_get_scan())
if (h->nlmsg_type == RTM_NEWLINK || h->nlmsg_type == RTM_DELLINK)
nl_parse_link(h, 1);
@@ -1166,14 +1346,14 @@ kif_do_scan(struct kif_proto *p UNUSED)
}
}
- nl_request_dump(AF_INET, RTM_GETADDR);
+ nl_request_dump_addr(AF_INET);
while (h = nl_get_scan())
if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
nl_parse_addr(h, 1);
else
log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
- nl_request_dump(AF_INET6, RTM_GETADDR);
+ nl_request_dump_addr(AF_INET6);
while (h = nl_get_scan())
if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
nl_parse_addr(h, 1);
@@ -1208,11 +1388,16 @@ HASH_DEFINE_REHASH_FN(RTH, struct krt_proto)
int
krt_capable(rte *e)
{
- rta *a = e->attrs;
+ eattr *ea = ea_find(e->attrs, &ea_gen_nexthop);
+ if (!ea)
+ return 0;
+
+ struct nexthop_adata *nhad = (void *) ea->u.ptr;
+ if (NEXTHOP_IS_REACHABLE(nhad))
+ return 1;
- switch (a->dest)
+ switch (nhad->dest)
{
- case RTD_UNICAST:
case RTD_BLACKHOLE:
case RTD_UNREACHABLE:
case RTD_PROHIBIT:
@@ -1224,21 +1409,24 @@ krt_capable(rte *e)
}
static inline int
-nh_bufsize(struct nexthop *nh)
+nh_bufsize(struct nexthop_adata *nhad)
{
int rv = 0;
- for (; nh != NULL; nh = nh->next)
+ NEXTHOP_WALK(nh, nhad)
rv += RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr)));
return rv;
}
static int
-nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nexthop *nh)
+nl_send_route(struct krt_proto *p, const rte *e, int op)
{
eattr *ea;
- rta *a = e->attrs;
- ea_list *eattrs = a->eattrs;
- int bufsize = 128 + KRT_METRICS_MAX*8 + nh_bufsize(&(a->nh));
+ ea_list *eattrs = e->attrs;
+ eattr *nhea = ea_find(eattrs, &ea_gen_nexthop);
+ struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL;
+ int dest = nhea_dest(nhea);
+
+ int bufsize = 128 + KRT_METRICS_MAX*8 + (nh ? nh_bufsize(nh) : 0);
u32 priority = 0;
struct {
@@ -1306,7 +1494,7 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho
priority = 0;
else if (KRT_CF->sys.metric)
priority = KRT_CF->sys.metric;
- else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC)))
+ else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, &ea_krt_metric)))
priority = ea->u.data;
if (priority)
@@ -1314,20 +1502,22 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho
/* For route delete, we do not specify remaining route attributes */
if (op == NL_OP_DELETE)
- goto dest;
+ goto done;
/* Default scope is LINK for device routes, UNIVERSE otherwise */
if (p->af == AF_MPLS)
r->r.rtm_scope = RT_SCOPE_UNIVERSE;
- else if (ea = ea_find(eattrs, EA_KRT_SCOPE))
+ else if (ea = ea_find(eattrs, &ea_krt_scope))
r->r.rtm_scope = ea->u.data;
+ else if (dest == RTD_UNICAST && ipa_zero(nh->nh.gw))
+ r->r.rtm_scope = RT_SCOPE_LINK;
else
- r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
+ r->r.rtm_scope = RT_SCOPE_UNIVERSE;
- if (ea = ea_find(eattrs, EA_KRT_PREFSRC))
+ if (ea = ea_find(eattrs, &ea_krt_prefsrc))
nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data);
- if (ea = ea_find(eattrs, EA_KRT_REALM))
+ if (ea = ea_find(eattrs, &ea_krt_realm))
nl_add_attr_u32(&r->h, rsize, RTA_FLOW, ea->u.data);
@@ -1335,9 +1525,9 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho
metrics[0] = 0;
struct ea_walk_state ews = { .eattrs = eattrs };
- while (ea = ea_walk(&ews, EA_KRT_METRICS, KRT_METRICS_MAX))
+ while (ea = ea_walk(&ews, ea_krt_metrics[0].id, KRT_METRICS_MAX))
{
- int id = ea->id - EA_KRT_METRICS;
+ int id = ea->id - ea_krt_metrics[0].id;
metrics[0] |= 1 << id;
metrics[id] = ea->u.data;
}
@@ -1345,20 +1535,18 @@ nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nextho
if (metrics[0])
nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX);
-
-dest:
switch (dest)
{
case RTD_UNICAST:
r->r.rtm_type = RTN_UNICAST;
- if (nh->next && !krt_ecmp6(p))
+ if (!NEXTHOP_ONE(nh))
nl_add_multipath(&r->h, rsize, nh, p->af, eattrs);
else
{
- nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->iface->index);
- nl_add_nexthop(&r->h, rsize, nh, p->af);
+ nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->nh.iface->index);
+ nl_add_nexthop(&r->h, rsize, &nh->nh, p->af);
- if (nh->flags & RNF_ONLINK)
+ if (nh->nh.flags & RNF_ONLINK)
r->r.rtm_flags |= RTNH_F_ONLINK;
}
break;
@@ -1377,82 +1565,56 @@ dest:
bug("krt_capable inconsistent with nl_send_route");
}
+done:
/* Ignore missing for DELETE */
return nl_exchange(&r->h, (op == NL_OP_DELETE));
}
static inline int
-nl_add_rte(struct krt_proto *p, rte *e)
+nl_allow_replace(struct krt_proto *p, rte *new)
{
- rta *a = e->attrs;
- int err = 0;
-
- if (krt_ecmp6(p) && a->nh.next)
- {
- struct nexthop *nh = &(a->nh);
-
- err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, nh);
- if (err < 0)
- return err;
-
- for (nh = nh->next; nh; nh = nh->next)
- err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, nh);
-
- return err;
- }
-
- return nl_send_route(p, e, NL_OP_ADD, a->dest, &(a->nh));
-}
-
-static inline int
-nl_delete_rte(struct krt_proto *p, const rte *e)
-{
- int err = 0;
+ /*
+ * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for
+ * matching rtm_protocol, but that is OK when dedicated priority is used.
+ *
+ * For IPv6, the NL_OP_REPLACE is still broken even in Linux 4.19 LTS
+ * (although it seems to be fixed in Linux 5.10 LTS) for sequence:
+ *
+ * ip route add 2001:db8::/32 via fe80::1 dev eth0
+ * ip route replace 2001:db8::/32 dev eth0
+ *
+ * (it ends with two routes instead of replacing the first by the second one)
+ *
+ * Replacing with direct and special type (e.g. unreachable) routes does not
+ * work, but replacing with regular routes work reliably
+ */
- /* For IPv6, we just repeatedly request DELETE until we get error */
- do
- err = nl_send_route(p, e, NL_OP_DELETE, RTD_NONE, NULL);
- while (krt_ecmp6(p) && !err);
+ if (krt_ipv4(p))
+ return 1;
- return err;
-}
+ eattr *nhea = ea_find(new->attrs, &ea_gen_nexthop);
+ struct nexthop_adata *nh = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL;
+ int dest = nhea_dest(nhea);
-static inline int
-nl_replace_rte(struct krt_proto *p, rte *e)
-{
- rta *a = e->attrs;
- return nl_send_route(p, e, NL_OP_REPLACE, a->dest, &(a->nh));
+ return (dest == RTD_UNICAST) && ipa_nonzero(nh->nh.gw);
}
-
void
krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const rte *old)
{
int err = 0;
- /*
- * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for
- * matching rtm_protocol, but that is OK when dedicated priority is used.
- *
- * We do not use NL_OP_REPLACE for IPv6, as it has broken semantics for ECMP
- * and with some kernel versions ECMP replace crashes kernel. Would need more
- * testing and checks for kernel versions.
- *
- * For IPv6, we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the
- * old route value, so we do not try to optimize IPv6 ECMP reconfigurations.
- */
-
- if (krt_ipv4(p) && old && new)
+ if (old && new && nl_allow_replace(p, new))
{
- err = nl_replace_rte(p, new);
+ err = nl_send_route(p, new, NL_OP_REPLACE);
}
else
{
if (old)
- nl_delete_rte(p, old);
+ nl_send_route(p, old, NL_OP_DELETE);
if (new)
- err = nl_add_rte(p, new);
+ err = nl_send_route(p, new, NL_OP_ADD);
}
if (new)
@@ -1464,75 +1626,9 @@ krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const r
}
}
-static int
-nl_mergable_route(struct nl_parse_state *s, const net_addr *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family)
-{
- /* Route merging is used for IPv6 scans */
- if (!s->scan || (rtm_family != AF_INET6))
- return 0;
-
- /* Saved and new route must have same network, proto/table, and priority */
- if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority))
- return 0;
-
- /* Both must be regular unicast routes */
- if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST))
- return 0;
- return 1;
-}
-
-static void
-nl_announce_route(struct nl_parse_state *s)
-{
- rte e0 = {
- .attrs = s->attrs,
- .net = s->net,
- };
-
- ea_list *ea = alloca(sizeof(ea_list) + 2 * sizeof(eattr));
- *ea = (ea_list) { .count = 2, .next = e0.attrs->eattrs };
- e0.attrs->eattrs = ea;
-
- ea->attrs[0] = (eattr) {
- .id = EA_KRT_SOURCE,
- .type = EAF_TYPE_INT,
- .u.data = s->krt_proto,
- };
- ea->attrs[1] = (eattr) {
- .id = EA_KRT_METRIC,
- .type = EAF_TYPE_INT,
- .u.data = s->krt_metric,
- };
-
- if (s->scan)
- krt_got_route(s->proto, &e0, s->krt_src);
- else
- krt_got_route_async(s->proto, &e0, s->new, s->krt_src);
-
- s->net = NULL;
- s->attrs = NULL;
- s->proto = NULL;
- lp_flush(s->pool);
-}
-
-static inline void
-nl_parse_begin(struct nl_parse_state *s, int scan)
-{
- memset(s, 0, sizeof (struct nl_parse_state));
- s->pool = nl_linpool;
- s->scan = scan;
-}
-
-static inline void
-nl_parse_end(struct nl_parse_state *s)
-{
- if (s->net)
- nl_announce_route(s);
-}
-
-
-#define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0)
+#define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0)
+#define SKIP(ARG, ...) do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0)
static void
nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
@@ -1585,10 +1681,10 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
return;
if (!a[RTA_DST])
- SKIP("MPLS route without RTA_DST");
+ SKIP0("MPLS route without RTA_DST\n");
if (rta_get_mpls(a[RTA_DST], rta_mpls_stack) != 1)
- SKIP("MPLS route with multi-label RTA_DST");
+ SKIP0("MPLS route with multi-label RTA_DST\n");
net_fill_mpls(&dst, rta_mpls_stack[0]);
break;
@@ -1606,6 +1702,9 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
else
table_id = i->rtm_table;
+ if (i->rtm_flags & RTM_F_CLONED)
+ SKIP("cloned\n");
+
/* Do we know this table? */
p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id);
if (!p)
@@ -1662,83 +1761,112 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
net6_prefix(&src), net6_pxlen(&src));
}
- if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type, i->rtm_family))
- nl_announce_route(s);
-
- rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE);
- ra->source = RTS_INHERIT;
- ra->scope = SCOPE_UNIVERSE;
+ ea_list *ra = NULL;
+ ea_set_attr_u32(&ra, &ea_gen_source, 0, RTS_INHERIT);
+ ea_set_attr_u32(&ra, &ea_krt_source, 0, i->rtm_protocol);
+ ea_set_attr_u32(&ra, &ea_krt_metric, 0, priority);
if (a[RTA_FLOW])
s->rta_flow = rta_get_u32(a[RTA_FLOW]);
else
s->rta_flow = 0;
+ union {
+ struct {
+ struct adata ad;
+ struct nexthop nh;
+ u32 labels[MPLS_MAX_LABEL_STACK];
+ };
+ struct nexthop_adata nhad;
+ } nhad = {};
+
switch (i->rtm_type)
{
case RTN_UNICAST:
- ra->dest = RTD_UNICAST;
-
if (a[RTA_MULTIPATH])
{
- struct nexthop *nh = nl_parse_multipath(s, p, a[RTA_MULTIPATH], i->rtm_family);
+ struct nexthop_adata *nh = nl_parse_multipath(s, p, net, a[RTA_MULTIPATH], i->rtm_family, krt_src);
if (!nh)
- {
- log(L_ERR "KRT: Received strange multipath route %N", net);
- return;
- }
+ SKIP("strange RTA_MULTIPATH\n");
- nexthop_link(ra, nh);
+ ea_set_attr(&ra, EA_LITERAL_DIRECT_ADATA(
+ &ea_gen_nexthop, 0, &nh->ad));
break;
}
- if (i->rtm_flags & RTNH_F_DEAD)
- return;
+ if ((i->rtm_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD))
+ SKIP("ignore RTNH_F_DEAD\n");
- ra->nh.iface = if_find_by_index(oif);
- if (!ra->nh.iface)
+ nhad.nh.iface = if_find_by_index(oif);
+ if (!nhad.nh.iface)
{
log(L_ERR "KRT: Received route %N with unknown ifindex %u", net, oif);
return;
}
if (a[RTA_GATEWAY])
- ra->nh.gw = rta_get_ipa(a[RTA_GATEWAY]);
+ nhad.nh.gw = rta_get_ipa(a[RTA_GATEWAY]);
#ifdef HAVE_MPLS_KERNEL
if (a[RTA_VIA])
- ra->nh.gw = rta_get_via(a[RTA_VIA]);
+ nhad.nh.gw = rta_get_via(a[RTA_VIA]);
#endif
- if (ipa_nonzero(ra->nh.gw))
+ if (ipa_nonzero(nhad.nh.gw))
{
/* Silently skip strange 6to4 routes */
const net_addr_ip6 sit = NET_ADDR_IP6(IP6_NONE, 96);
- if ((i->rtm_family == AF_INET6) && ipa_in_netX(ra->nh.gw, (net_addr *) &sit))
+ if ((i->rtm_family == AF_INET6) && ipa_in_netX(nhad.nh.gw, (net_addr *) &sit))
return;
if (i->rtm_flags & RTNH_F_ONLINK)
- ra->nh.flags |= RNF_ONLINK;
+ nhad.nh.flags |= RNF_ONLINK;
neighbor *nbr;
- nbr = neigh_find(&p->p, ra->nh.gw, ra->nh.iface,
- (ra->nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0);
+ nbr = neigh_find(&p->p, nhad.nh.gw, nhad.nh.iface,
+ (nhad.nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0);
if (!nbr || (nbr->scope == SCOPE_HOST))
{
- log(L_ERR "KRT: Received route %N with strange next-hop %I", net, ra->nh.gw);
+ log(L_ERR "KRT: Received route %N with strange next-hop %I", net,
+ nhad.nh.gw);
return;
}
}
+#ifdef HAVE_MPLS_KERNEL
+ if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !a[RTA_MULTIPATH])
+ nhad.nh.labels = rta_get_mpls(a[RTA_NEWDST], nhad.nh.label);
+
+ if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !a[RTA_MULTIPATH])
+ {
+ switch (rta_get_u16(a[RTA_ENCAP_TYPE]))
+ {
+ case LWTUNNEL_ENCAP_MPLS:
+ {
+ struct rtattr *enca[BIRD_RTA_MAX];
+ nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
+ nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
+ nhad.nh.labels = rta_get_mpls(enca[RTA_DST], nhad.nh.label);
+ break;
+ }
+ default:
+ SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE]));
+ break;
+ }
+ }
+#endif
+
+ /* Finalize the nexthop */
+ nhad.ad.length = (void *) NEXTHOP_NEXT(&nhad.nh) - (void *) nhad.ad.data;
break;
case RTN_BLACKHOLE:
- ra->dest = RTD_BLACKHOLE;
+ nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_BLACKHOLE);
break;
case RTN_UNREACHABLE:
- ra->dest = RTD_UNREACHABLE;
+ nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_UNREACHABLE);
break;
case RTN_PROHIBIT:
- ra->dest = RTD_PROHIBIT;
+ nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_PROHIBIT);
break;
/* FIXME: What about RTN_THROW? */
default:
@@ -1746,163 +1874,74 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
return;
}
-#ifdef HAVE_MPLS_KERNEL
- if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !ra->nh.next)
- ra->nh.labels = rta_get_mpls(a[RTA_NEWDST], ra->nh.label);
-
- if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !ra->nh.next)
- {
- switch (rta_get_u16(a[RTA_ENCAP_TYPE]))
- {
- case LWTUNNEL_ENCAP_MPLS:
- {
- struct rtattr *enca[BIRD_RTA_MAX];
- nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
- nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
- ra->nh.labels = rta_get_mpls(enca[RTA_DST], ra->nh.label);
- break;
- }
- default:
- SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE]));
- break;
- }
- }
-#endif
-
if (i->rtm_scope != def_scope)
- {
- ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
- ea->next = ra->eattrs;
- ra->eattrs = ea;
- ea->flags = EALF_SORTED;
- ea->count = 1;
- ea->attrs[0].id = EA_KRT_SCOPE;
- ea->attrs[0].flags = 0;
- ea->attrs[0].type = EAF_TYPE_INT;
- ea->attrs[0].u.data = i->rtm_scope;
- }
+ ea_set_attr(&ra,
+ EA_LITERAL_EMBEDDED(&ea_krt_scope, 0, i->rtm_scope));
if (a[RTA_PREFSRC])
- {
- ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]);
-
- ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
- ea->next = ra->eattrs;
- ra->eattrs = ea;
- ea->flags = EALF_SORTED;
- ea->count = 1;
- ea->attrs[0].id = EA_KRT_PREFSRC;
- ea->attrs[0].flags = 0;
- ea->attrs[0].type = EAF_TYPE_IP_ADDRESS;
-
- struct adata *ad = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps));
- ad->length = sizeof(ps);
- memcpy(ad->data, &ps, sizeof(ps));
-
- ea->attrs[0].u.ptr = ad;
- }
+ {
+ ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]);
+
+ ea_set_attr(&ra,
+ EA_LITERAL_STORE_ADATA(&ea_krt_prefsrc, 0, &ps, sizeof(ps)));
+ }
/* Can be set per-route or per-nexthop */
if (s->rta_flow)
- {
- ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
- ea->next = ra->eattrs;
- ra->eattrs = ea;
- ea->flags = EALF_SORTED;
- ea->count = 1;
- ea->attrs[0].id = EA_KRT_REALM;
- ea->attrs[0].flags = 0;
- ea->attrs[0].type = EAF_TYPE_INT;
- ea->attrs[0].u.data = s->rta_flow;
- }
+ ea_set_attr(&ra,
+ EA_LITERAL_EMBEDDED(&ea_krt_realm, 0, s->rta_flow));
if (a[RTA_METRICS])
{
u32 metrics[KRT_METRICS_MAX];
- ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr));
- int t, n = 0;
-
if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0)
{
log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net);
return;
}
- for (t = 1; t < KRT_METRICS_MAX; t++)
+ for (uint t = 1; t < KRT_METRICS_MAX; t++)
if (metrics[0] & (1 << t))
- {
- ea->attrs[n].id = EA_CODE(PROTOCOL_KERNEL, KRT_METRICS_OFFSET + t);
- ea->attrs[n].flags = 0;
- ea->attrs[n].type = EAF_TYPE_INT; /* FIXME: Some are EAF_TYPE_BITFIELD */
- ea->attrs[n].u.data = metrics[t];
- n++;
- }
-
- if (n > 0)
- {
- ea->next = ra->eattrs;
- ea->flags = EALF_SORTED;
- ea->count = n;
- ra->eattrs = ea;
- }
+ ea_set_attr(&ra,
+ EA_LITERAL_EMBEDDED(&ea_krt_metrics[t], 0, metrics[t]));
}
- /*
- * Ideally, now we would send the received route to the rest of kernel code.
- * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we
- * postpone it and merge next hops until the end of the sequence. Note that
- * when doing merging of next hops, we expect the new route to be unipath.
- * Otherwise, we ignore additional next hops in nexthop_insert().
- */
+ rte e0 = {
+ .net = net,
+ .attrs = ra,
+ };
- if (!s->net)
- {
- /* Store the new route */
- s->net = lp_alloc(s->pool, net->length);
- net_copy(s->net, net);
-
- s->attrs = ra;
- s->proto = p;
- s->new = new;
- s->krt_src = krt_src;
- s->krt_type = i->rtm_type;
- s->krt_proto = i->rtm_protocol;
- s->krt_metric = priority;
- }
+ if (s->scan)
+ krt_got_route(p, &e0, krt_src);
else
- {
- /* Merge next hops with the stored route */
- rta *oa = s->attrs;
+ krt_got_route_async(p, &e0, new, krt_src);
- struct nexthop *nhs = &oa->nh;
- nexthop_insert(&nhs, &ra->nh);
-
- /* Perhaps new nexthop is inserted at the first position */
- if (nhs == &ra->nh)
- {
- /* Swap rtas */
- s->attrs = ra;
-
- /* Keep old eattrs */
- ra->eattrs = oa->eattrs;
- }
- }
+ lp_flush(s->pool);
}
void
-krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
+krt_do_scan(struct krt_proto *p)
{
- struct nlmsghdr *h;
- struct nl_parse_state s;
+ struct nl_parse_state s = {
+ .proto = p,
+ .pool = nl_linpool,
+ .scan = 1,
+ };
- nl_parse_begin(&s, 1);
- nl_request_dump(AF_UNSPEC, RTM_GETROUTE);
+ /* Table-specific scan or shared scan */
+ if (p)
+ nl_request_dump_route(p->af, krt_table_id(p));
+ else
+ nl_request_dump_route(AF_UNSPEC, 0);
+
+ struct nlmsghdr *h;
while (h = nl_get_scan())
+ {
if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
nl_parse_route(&s, h);
else
log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
- nl_parse_end(&s);
+ }
}
/*
@@ -1911,20 +1950,24 @@ krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NUL
static sock *nl_async_sk; /* BIRD socket for asynchronous notifications */
static byte *nl_async_rx_buffer; /* Receive buffer */
+static uint nl_async_bufsize; /* Kernel rx buffer size for the netlink socket */
+static struct config *nl_last_config; /* For tracking changes to nl_async_bufsize */
static void
nl_async_msg(struct nlmsghdr *h)
{
- struct nl_parse_state s;
+ struct nl_parse_state s = {
+ .proto = NULL,
+ .pool = nl_linpool,
+ .scan = 0,
+ };
switch (h->nlmsg_type)
{
case RTM_NEWROUTE:
case RTM_DELROUTE:
DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type);
- nl_parse_begin(&s, 0);
nl_parse_route(&s, h);
- nl_parse_end(&s);
break;
case RTM_NEWLINK:
case RTM_DELLINK:
@@ -2046,6 +2089,32 @@ nl_open_async(void)
bug("Netlink: sk_open failed");
}
+static void
+nl_update_async_bufsize(void)
+{
+ /* No async socket */
+ if (!nl_async_sk)
+ return;
+
+ /* Already reconfigured */
+ if (nl_last_config == config)
+ return;
+
+ /* Update netlink buffer size */
+ uint bufsize = nl_cfg_rx_buffer_size(config);
+ if (bufsize && (bufsize != nl_async_bufsize))
+ {
+ /* Log message for reconfigurations only */
+ if (nl_last_config)
+ log(L_INFO "KRT: Changing netlink rx buffer size to %u", bufsize);
+
+ nl_set_rcvbuf(nl_async_sk->fd, bufsize);
+ nl_async_bufsize = bufsize;
+ }
+
+ nl_last_config = config;
+}
+
/*
* Interface to the UNIX krt module
@@ -2056,6 +2125,8 @@ krt_sys_io_init(void)
{
nl_linpool = lp_new_default(krt_pool);
HASH_INIT(nl_table_map, krt_pool, 6);
+
+ nl_ea_register();
}
int
@@ -2074,6 +2145,7 @@ krt_sys_start(struct krt_proto *p)
nl_open();
nl_open_async();
+ nl_update_async_bufsize();
return 1;
}
@@ -2081,12 +2153,16 @@ krt_sys_start(struct krt_proto *p)
void
krt_sys_shutdown(struct krt_proto *p)
{
+ nl_update_async_bufsize();
+
HASH_REMOVE2(nl_table_map, RTH, krt_pool, p);
}
int
krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt_config *o)
{
+ nl_update_async_bufsize();
+
return (n->sys.table_id == o->sys.table_id) && (n->sys.metric == o->sys.metric);
}
@@ -2104,56 +2180,6 @@ krt_sys_copy_config(struct krt_config *d, struct krt_config *s)
d->sys.metric = s->sys.metric;
}
-static const char *krt_metrics_names[KRT_METRICS_MAX] = {
- NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss",
- "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack"
-};
-
-static const char *krt_features_names[KRT_FEATURES_MAX] = {
- "ecn", NULL, NULL, "allfrag"
-};
-
-int
-krt_sys_get_attr(const eattr *a, byte *buf, int buflen UNUSED)
-{
- switch (a->id)
- {
- case EA_KRT_PREFSRC:
- bsprintf(buf, "prefsrc");
- return GA_NAME;
-
- case EA_KRT_REALM:
- bsprintf(buf, "realm");
- return GA_NAME;
-
- case EA_KRT_SCOPE:
- bsprintf(buf, "scope");
- return GA_NAME;
-
- case EA_KRT_LOCK:
- buf += bsprintf(buf, "lock:");
- ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX);
- return GA_FULL;
-
- case EA_KRT_FEATURES:
- buf += bsprintf(buf, "features:");
- ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX);
- return GA_FULL;
-
- default:;
- int id = (int)EA_ID(a->id) - KRT_METRICS_OFFSET;
- if (id > 0 && id < KRT_METRICS_MAX)
- {
- bsprintf(buf, "%s", krt_metrics_names[id]);
- return GA_NAME;
- }
-
- return GA_UNKNOWN;
- }
-}
-
-
-
void
kif_sys_start(struct kif_proto *p UNUSED)
{
diff --git a/sysdep/linux/sysio.h b/sysdep/linux/sysio.h
index e21ff487..f13eda7c 100644
--- a/sysdep/linux/sysio.h
+++ b/sysdep/linux/sysio.h
@@ -10,6 +10,10 @@
#define IPV6_MINHOPCOUNT 73
#endif
+#ifndef IPV6_FREEBIND
+#define IPV6_FREEBIND 78
+#endif
+
#ifndef TCP_MD5SIG_EXT
#define TCP_MD5SIG_EXT 32
#endif
@@ -266,3 +270,18 @@ sk_set_priority(sock *s, int prio)
return 0;
}
+static inline int
+sk_set_freebind(sock *s)
+{
+ int y = 1;
+
+ if (sk_is_ipv4(s))
+ if (setsockopt(s->fd, SOL_IP, IP_FREEBIND, &y, sizeof(y)) < 0)
+ ERR("IP_FREEBIND");
+
+ if (sk_is_ipv6(s))
+ if (setsockopt(s->fd, SOL_IPV6, IPV6_FREEBIND, &y, sizeof(y)) < 0)
+ ERR("IPV6_FREEBIND");
+
+ return 0;
+}
diff --git a/sysdep/unix/Makefile b/sysdep/unix/Makefile
index 07f454ab..6f6b0d26 100644
--- a/sysdep/unix/Makefile
+++ b/sysdep/unix/Makefile
@@ -1,4 +1,4 @@
-src := alloc.c io.c io-loop.c krt.c log.c main.c random.c coroutine.c
+src := alloc.c io.c io-loop.c krt.c log.c main.c random.c domain.c
obj := $(src-o-files)
$(all-daemon)
$(cf-local)
diff --git a/sysdep/unix/alloc.c b/sysdep/unix/alloc.c
index 4ae1a9db..47cd4624 100644
--- a/sysdep/unix/alloc.c
+++ b/sysdep/unix/alloc.c
@@ -8,130 +8,180 @@
#include "nest/bird.h"
#include "lib/resource.h"
+#include "lib/lists.h"
+#include "lib/event.h"
+#include "lib/rcu.h"
+#include <errno.h>
#include <stdlib.h>
#include <unistd.h>
-#include <stdatomic.h>
-#include <errno.h>
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif
long page_size = 0;
-_Bool alloc_multipage = 0;
-static _Atomic int global_page_list_not_empty;
-static list global_page_list;
-static _Atomic int global_page_spinlock;
+#ifdef HAVE_MMAP
+#define KEEP_PAGES_MAX 256
+#define KEEP_PAGES_MIN 8
-#define GLOBAL_PAGE_SPIN_LOCK for (int v = 0; !atomic_compare_exchange_weak_explicit(&global_page_spinlock, &v, 1, memory_order_acq_rel, memory_order_acquire); v = 0)
-#define GLOBAL_PAGE_SPIN_UNLOCK do { int v = 1; ASSERT_DIE(atomic_compare_exchange_strong_explicit(&global_page_spinlock, &v, 0, memory_order_acq_rel, memory_order_acquire)); } while (0)
+STATIC_ASSERT(KEEP_PAGES_MIN * 4 < KEEP_PAGES_MAX);
-#ifdef HAVE_MMAP
static _Bool use_fake = 0;
+static _Bool initialized = 0;
+
+#if DEBUGGING
+struct free_page {
+ node unused[42];
+ struct free_page * _Atomic next;
+};
#else
-static _Bool use_fake = 1;
+struct free_page {
+ struct free_page * _Atomic next;
+};
#endif
-void resource_sys_init(void)
+static struct free_page * _Atomic page_stack = NULL;
+
+static void page_cleanup(void *);
+static event page_cleanup_event = { .hook = page_cleanup, };
+#define SCHEDULE_CLEANUP do if (initialized && !shutting_down) ev_send(&global_event_list, &page_cleanup_event); while (0)
+
+_Atomic int pages_kept = 0;
+
+static void *
+alloc_sys_page(void)
{
-#ifdef HAVE_MMAP
- init_list(&global_page_list);
+ void *ptr = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (!(page_size = sysconf(_SC_PAGESIZE)))
- die("System page size must be non-zero");
+ if (ptr == MAP_FAILED)
+ bug("mmap(%lu) failed: %m", page_size);
- if ((u64_popcount(page_size) > 1) || (page_size > 16384))
+ return ptr;
+}
+
+extern int shutting_down; /* Shutdown requested. */
+
+#else // ! HAVE_MMAP
+#define use_fake 1
#endif
+
+void *
+alloc_page(void)
+{
+ if (use_fake)
{
- /* Too big or strange page, use the aligned allocator instead */
- page_size = 4096;
- use_fake = 1;
+ void *ptr = NULL;
+ int err = posix_memalign(&ptr, page_size, page_size);
+
+ if (err || !ptr)
+ bug("posix_memalign(%lu) failed", (long unsigned int) page_size);
+
+ return ptr;
}
+
+#ifdef HAVE_MMAP
+ rcu_read_lock();
+ struct free_page *fp = atomic_load_explicit(&page_stack, memory_order_acquire);
+ while (fp && !atomic_compare_exchange_strong_explicit(
+ &page_stack, &fp, atomic_load_explicit(&fp->next, memory_order_acquire),
+ memory_order_acq_rel, memory_order_acquire))
+ ;
+ rcu_read_unlock();
+
+ if (!fp)
+ return alloc_sys_page();
+
+ if (atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed) <= KEEP_PAGES_MIN)
+ SCHEDULE_CLEANUP;
+
+ return fp;
+#endif
}
-void *
-alloc_sys_page(void)
+void
+free_page(void *ptr)
{
-#ifdef HAVE_MMAP
- if (!use_fake)
+ if (use_fake)
{
- if (atomic_load_explicit(&global_page_list_not_empty, memory_order_relaxed))
- {
- GLOBAL_PAGE_SPIN_LOCK;
- if (!EMPTY_LIST(global_page_list))
- {
- node *ret = HEAD(global_page_list);
- rem_node(ret);
- if (EMPTY_LIST(global_page_list))
- atomic_store_explicit(&global_page_list_not_empty, 0, memory_order_relaxed);
- GLOBAL_PAGE_SPIN_UNLOCK;
- memset(ret, 0, sizeof(node));
- return (void *) ret;
- }
- GLOBAL_PAGE_SPIN_UNLOCK;
- }
-
- if (alloc_multipage)
- {
- void *big = mmap(NULL, page_size * 2, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (big == MAP_FAILED)
- bug("mmap(%lu) failed: %m", page_size);
-
- uintptr_t offset = ((uintptr_t) big) % page_size;
- if (offset)
- {
- void *ret = big + page_size - offset;
- munmap(big, page_size - offset);
- munmap(ret + page_size, offset);
- return ret;
- }
- else
- {
- munmap(big + page_size, page_size);
- return big;
- }
- }
-
- void *ret = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (ret == MAP_FAILED)
- bug("mmap(%lu) failed: %m", page_size);
-
- return ret;
+ free(ptr);
+ return;
}
- else
+
+#ifdef HAVE_MMAP
+ rcu_read_lock();
+ struct free_page *fp = ptr;
+ struct free_page *next = atomic_load_explicit(&page_stack, memory_order_acquire);
+
+ do atomic_store_explicit(&fp->next, next, memory_order_release);
+ while (!atomic_compare_exchange_strong_explicit(
+ &page_stack, &next, fp,
+ memory_order_acq_rel, memory_order_acquire));
+ rcu_read_unlock();
+
+ if (atomic_fetch_add_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX)
+ SCHEDULE_CLEANUP;
#endif
+}
+
+#ifdef HAVE_MMAP
+static void
+page_cleanup(void *_ UNUSED)
+{
+ struct free_page *stack = atomic_exchange_explicit(&page_stack, NULL, memory_order_acq_rel);
+ if (!stack)
+ return;
+
+ synchronize_rcu();
+
+ do {
+ struct free_page *f = stack;
+ stack = atomic_load_explicit(&f->next, memory_order_acquire);
+
+ if (munmap(f, page_size) == 0)
+ continue;
+ else if (errno != ENOMEM)
+ bug("munmap(%p) failed: %m", f);
+ else
+ free_page(f);
+ }
+ while (stack && (atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX / 2));
+
+ while (stack)
{
- void *ret = aligned_alloc(page_size, page_size);
- if (!ret)
- bug("aligned_alloc(%lu) failed", page_size);
- return ret;
+ atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed);
+
+ struct free_page *f = stack;
+ stack = atomic_load_explicit(&f->next, memory_order_acquire);
+ free_page(f);
}
}
+#endif
void
-free_sys_page(void *ptr)
+resource_sys_init(void)
{
#ifdef HAVE_MMAP
- if (!use_fake)
+ if (!(page_size = sysconf(_SC_PAGESIZE)))
+ die("System page size must be non-zero");
+
+ if (u64_popcount(page_size) == 1)
{
- if (munmap(ptr, page_size) < 0)
-#ifdef ENOMEM
- if (errno == ENOMEM)
- {
- memset(ptr, 0, page_size);
-
- GLOBAL_PAGE_SPIN_LOCK;
- add_tail(&global_page_list, (node *) ptr);
- atomic_store_explicit(&global_page_list_not_empty, 1, memory_order_relaxed);
- GLOBAL_PAGE_SPIN_UNLOCK;
- }
- else
-#endif
- bug("munmap(%p) failed: %m", ptr);
+
+ for (int i = 0; i < (KEEP_PAGES_MIN * 2); i++)
+ free_page(alloc_page());
+
+ page_cleanup(NULL);
+ initialized = 1;
+ return;
}
- else
+
+ /* Too big or strange page, use the aligned allocator instead */
+ log(L_WARN "Got strange memory page size (%lu), using the aligned allocator instead", page_size);
+ use_fake = 1;
#endif
- free(ptr);
+
+ page_size = 4096;
+ initialized = 1;
}
diff --git a/sysdep/unix/coroutine.c b/sysdep/unix/domain.c
index 4758c056..0a5858a6 100644
--- a/sysdep/unix/coroutine.c
+++ b/sysdep/unix/domain.c
@@ -1,7 +1,6 @@
/*
- * BIRD Coroutines
+ * BIRD Locking
*
- * (c) 2017 Martin Mares <mj@ucw.cz>
* (c) 2020 Maria Matejka <mq@jmq.cz>
*
* Can be freely distributed and used under the terms of the GNU GPL.
@@ -17,18 +16,11 @@
#include "lib/birdlib.h"
#include "lib/locking.h"
-#include "lib/coro.h"
#include "lib/resource.h"
#include "lib/timer.h"
#include "conf/conf.h"
-#define CORO_STACK_SIZE 65536
-
-/*
- * Implementation of coroutines based on POSIX threads
- */
-
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
@@ -122,71 +114,3 @@ void do_unlock(struct domain_generic *dg, struct domain_generic **lsp)
dg->prev = NULL;
pthread_mutex_unlock(&dg->mutex);
}
-
-/* Coroutines */
-struct coroutine {
- resource r;
- pthread_t id;
- pthread_attr_t attr;
- void (*entry)(void *);
- void *data;
-};
-
-static _Thread_local _Bool coro_cleaned_up = 0;
-
-static void coro_free(resource *r)
-{
- struct coroutine *c = (void *) r;
- ASSERT_DIE(pthread_equal(pthread_self(), c->id));
- pthread_attr_destroy(&c->attr);
- coro_cleaned_up = 1;
-}
-
-static struct resclass coro_class = {
- .name = "Coroutine",
- .size = sizeof(struct coroutine),
- .free = coro_free,
-};
-
-_Thread_local struct coroutine *this_coro = NULL;
-
-static void *coro_entry(void *p)
-{
- struct coroutine *c = p;
-
- ASSERT_DIE(c->entry);
-
- this_coro = c;
-
- c->entry(c->data);
- ASSERT_DIE(coro_cleaned_up);
-
- return NULL;
-}
-
-struct coroutine *coro_run(pool *p, void (*entry)(void *), void *data)
-{
- ASSERT_DIE(entry);
- ASSERT_DIE(p);
-
- struct coroutine *c = ralloc(p, &coro_class);
-
- c->entry = entry;
- c->data = data;
-
- int e = 0;
-
- if (e = pthread_attr_init(&c->attr))
- die("pthread_attr_init() failed: %M", e);
-
- if (e = pthread_attr_setstacksize(&c->attr, CORO_STACK_SIZE))
- die("pthread_attr_setstacksize(%u) failed: %M", CORO_STACK_SIZE, e);
-
- if (e = pthread_attr_setdetachstate(&c->attr, PTHREAD_CREATE_DETACHED))
- die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e);
-
- if (e = pthread_create(&c->id, &c->attr, coro_entry, c))
- die("pthread_create() failed: %M", e);
-
- return c;
-}
diff --git a/sysdep/unix/io-loop.c b/sysdep/unix/io-loop.c
index c7cf4ad2..575e5403 100644
--- a/sysdep/unix/io-loop.c
+++ b/sysdep/unix/io-loop.c
@@ -17,7 +17,6 @@
#include "nest/bird.h"
#include "lib/buffer.h"
-#include "lib/coro.h"
#include "lib/lists.h"
#include "lib/resource.h"
#include "lib/event.h"
@@ -28,6 +27,8 @@
#include "sysdep/unix/io-loop.h"
#include "conf/conf.h"
+#define THREAD_STACK_SIZE 65536 /* To be lowered in near future */
+
/*
* Current thread context
*/
@@ -58,6 +59,31 @@ birdloop_inside(struct birdloop *loop)
return 0;
}
+void
+birdloop_flag(struct birdloop *loop, u32 flag)
+{
+ atomic_fetch_or_explicit(&loop->flags, flag, memory_order_acq_rel);
+ birdloop_ping(loop);
+}
+
+void
+birdloop_flag_set_handler(struct birdloop *loop, struct birdloop_flag_handler *fh)
+{
+ ASSERT_DIE(birdloop_inside(loop));
+ loop->flag_handler = fh;
+}
+
+static int
+birdloop_process_flags(struct birdloop *loop)
+{
+ if (!loop->flag_handler)
+ return 0;
+
+ u32 flags = atomic_exchange_explicit(&loop->flags, 0, memory_order_acq_rel);
+ loop->flag_handler->hook(loop->flag_handler, flags);
+ return !!flags;
+}
+
/*
* Wakeup code for birdloop
*/
@@ -132,11 +158,10 @@ wakeup_do_kick(struct birdloop *loop)
pipe_kick(loop->wakeup_fds[1]);
}
-void
-birdloop_ping(struct birdloop *loop)
+static inline void
+birdloop_do_ping(struct birdloop *loop)
{
- u32 ping_sent = atomic_fetch_add_explicit(&loop->ping_sent, 1, memory_order_acq_rel);
- if (ping_sent)
+ if (atomic_fetch_add_explicit(&loop->ping_sent, 1, memory_order_acq_rel))
return;
if (loop == birdloop_wakeup_masked)
@@ -145,6 +170,15 @@ birdloop_ping(struct birdloop *loop)
wakeup_do_kick(loop);
}
+void
+birdloop_ping(struct birdloop *loop)
+{
+ if (birdloop_inside(loop) && !loop->ping_pending)
+ loop->ping_pending++;
+ else
+ birdloop_do_ping(loop);
+}
+
/*
* Sockets
@@ -205,7 +239,7 @@ sk_stop(sock *s)
}
static inline uint sk_want_events(sock *s)
-{ return ((s->rx_hook && !ev_corked(s->cork)) ? POLLIN : 0) | ((s->ttx != s->tpos) ? POLLOUT : 0); }
+{ return (s->rx_hook ? POLLIN : 0) | ((s->ttx != s->tpos) ? POLLOUT : 0); }
/*
FIXME: this should be called from sock code
@@ -336,7 +370,7 @@ birdloop_init(void)
birdloop_enter_locked(&main_birdloop);
}
-static void birdloop_main(void *arg);
+static void *birdloop_main(void *arg);
struct birdloop *
birdloop_new(pool *pp, uint order, const char *name)
@@ -357,7 +391,19 @@ birdloop_new(pool *pp, uint order, const char *name)
timers_init(&loop->time, p);
sockets_init(loop);
- loop->time.coro = coro_run(p, birdloop_main, loop);
+ int e = 0;
+
+ if (e = pthread_attr_init(&loop->thread_attr))
+ die("pthread_attr_init() failed: %M", e);
+
+ if (e = pthread_attr_setstacksize(&loop->thread_attr, THREAD_STACK_SIZE))
+ die("pthread_attr_setstacksize(%u) failed: %M", THREAD_STACK_SIZE, e);
+
+ if (e = pthread_attr_setdetachstate(&loop->thread_attr, PTHREAD_CREATE_DETACHED))
+ die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e);
+
+ if (e = pthread_create(&loop->thread_id, &loop->thread_attr, birdloop_main, loop))
+ die("pthread_create() failed: %M", e);
birdloop_leave(loop);
@@ -393,6 +439,11 @@ void
birdloop_free(struct birdloop *loop)
{
ASSERT_DIE(loop->links == 0);
+ ASSERT_DIE(pthread_equal(pthread_self(), loop->thread_id));
+
+ rcu_birdloop_stop(&loop->rcu);
+ pthread_attr_destroy(&loop->thread_attr);
+
domain_free(loop->time.domain);
rfree(loop->pool);
}
@@ -423,6 +474,13 @@ birdloop_leave_locked(struct birdloop *loop)
/* Check the current context */
ASSERT_DIE(birdloop_current == loop);
+ /* Send pending pings */
+ if (loop->ping_pending)
+ {
+ loop->ping_pending = 0;
+ birdloop_do_ping(loop);
+ }
+
/* Restore the old context */
birdloop_current = loop->prev_loop;
}
@@ -466,20 +524,24 @@ birdloop_unlink(struct birdloop *loop)
loop->links--;
}
-static void
+static void *
birdloop_main(void *arg)
{
struct birdloop *loop = arg;
timer *t;
int rv, timeout;
+ rcu_birdloop_start(&loop->rcu);
+
btime loop_begin = current_time();
+ tmp_init(loop->pool);
+
birdloop_enter(loop);
while (1)
{
timers_fire(&loop->time, 0);
- if (ev_run_list(&loop->event_list))
+ if (birdloop_process_flags(loop) + ev_run_list(&loop->event_list))
timeout = 0;
else if (t = timers_first(&loop->time))
timeout = (tm_remains(t) TO_MS) + 1;
@@ -523,13 +585,22 @@ birdloop_main(void *arg)
/* Flush remaining events */
ASSERT_DIE(!ev_run_list(&loop->event_list));
- /* No timers allowed */
- ASSERT_DIE(timers_count(&loop->time) == 0);
+ /* Drop timers */
+ while (t = timers_first(&loop->time))
+ tm_stop(t);
+
+ /* No sockets allowed */
ASSERT_DIE(EMPTY_LIST(loop->sock_list));
ASSERT_DIE(loop->sock_num == 0);
birdloop_leave(loop);
loop->stopped(loop->stop_data);
-}
+ return NULL;
+}
+void
+birdloop_yield(void)
+{
+ usleep(100);
+}
diff --git a/sysdep/unix/io-loop.h b/sysdep/unix/io-loop.h
index 4024b6c5..aec7a409 100644
--- a/sysdep/unix/io-loop.h
+++ b/sysdep/unix/io-loop.h
@@ -7,6 +7,8 @@
#ifndef _BIRD_SYSDEP_UNIX_IO_LOOP_H_
#define _BIRD_SYSDEP_UNIX_IO_LOOP_H_
+#include "lib/rcu.h"
+
struct birdloop
{
pool *pool;
@@ -21,11 +23,20 @@ struct birdloop
u8 poll_changed;
u8 close_scheduled;
+ uint ping_pending;
_Atomic u32 ping_sent;
int wakeup_fds[2];
+ pthread_t thread_id;
+ pthread_attr_t thread_attr;
+
+ struct rcu_birdloop rcu;
+
uint links;
+ _Atomic u32 flags;
+ struct birdloop_flag_handler *flag_handler;
+
void (*stopped)(void *data);
void *stop_data;
diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c
index dd385c80..23baffb2 100644
--- a/sysdep/unix/io.c
+++ b/sysdep/unix/io.c
@@ -1435,6 +1435,10 @@ sk_open(sock *s)
if (sk_set_high_port(s) < 0)
log(L_WARN "Socket error: %s%#m", s->err);
+ if (s->flags & SKF_FREEBIND)
+ if (sk_set_freebind(s) < 0)
+ log(L_WARN "Socket error: %s%#m", s->err);
+
sockaddr_fill(&sa, s->af, bind_addr, s->iface, bind_port);
if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
ERR2("bind");
@@ -1879,8 +1883,8 @@ sk_read_ssh(sock *s)
/* sk_read() and sk_write() are called from BFD's event loop */
-int
-sk_read(sock *s, int revents)
+static inline int
+sk_read_noflush(sock *s, int revents)
{
switch (s->type)
{
@@ -1943,7 +1947,15 @@ sk_read(sock *s, int revents)
}
int
-sk_write(sock *s)
+sk_read(sock *s, int revents)
+{
+ int e = sk_read_noflush(s, revents);
+ tmp_flush();
+ return e;
+}
+
+static inline int
+sk_write_noflush(sock *s)
{
switch (s->type)
{
@@ -1991,6 +2003,14 @@ sk_write(sock *s)
}
}
+int
+sk_write(sock *s)
+{
+ int e = sk_write_noflush(s);
+ tmp_flush();
+ return e;
+}
+
int sk_is_ipv4(sock *s)
{ return s->af == AF_INET; }
@@ -2009,6 +2029,7 @@ sk_err(sock *s, int revents)
}
s->err_hook(s, se);
+ tmp_flush();
}
void
@@ -2058,8 +2079,8 @@ io_update_time(void)
event_open->duration = last_io_time - event_open->timestamp;
if (event_open->duration > config->latency_limit)
- log(L_WARN "Event 0x%p 0x%p took %d ms",
- event_open->hook, event_open->data, (int) (event_open->duration TO_MS));
+ log(L_WARN "Event 0x%p 0x%p took %u.%03u ms",
+ event_open->hook, event_open->data, (uint) (event_open->duration TO_MS), (uint) (event_open->duration % 1000));
event_open = NULL;
}
@@ -2163,8 +2184,8 @@ watchdog_stop(void)
btime duration = last_io_time - loop_time;
if (duration > config->watchdog_warning)
- log(L_WARN "I/O loop cycle took %d ms for %d events",
- (int) (duration TO_MS), event_log_num);
+ log(L_WARN "I/O loop cycle took %u.%03u ms for %d events",
+ (uint) (duration TO_MS), (uint) (duration % 1000), event_log_num);
}
@@ -2234,7 +2255,7 @@ io_loop(void)
{
pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */
s = SKIP_BACK(sock, n, n);
- if (s->rx_hook && !ev_corked(s->cork))
+ if (s->rx_hook)
{
pfd[nfds].fd = s->fd;
pfd[nfds].events |= POLLIN;
diff --git a/sysdep/unix/krt.Y b/sysdep/unix/krt.Y
index 95b54d65..4ce9a328 100644
--- a/sysdep/unix/krt.Y
+++ b/sysdep/unix/krt.Y
@@ -29,7 +29,7 @@ kif_set_preferred(ip_addr ip)
CF_DECLS
-CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, KRT_SOURCE, KRT_METRIC, MERGE, PATHS)
+CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, MERGE, PATHS)
CF_KEYWORDS(INTERFACE, PREFERRED)
%type <i> kern_mp_limit
@@ -122,9 +122,6 @@ kif_iface:
kif_iface_start iface_patt_list_nopx kif_iface_opt_list;
-dynamic_attr: KRT_SOURCE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SOURCE); } ;
-dynamic_attr: KRT_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_METRIC); } ;
-
CF_CODE
CF_END
diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c
index 98c56391..d507c133 100644
--- a/sysdep/unix/krt.c
+++ b/sysdep/unix/krt.c
@@ -53,7 +53,7 @@
#include "nest/bird.h"
#include "nest/iface.h"
-#include "nest/route.h"
+#include "nest/rt.h"
#include "nest/protocol.h"
#include "filter/filter.h"
#include "conf/conf.h"
@@ -232,7 +232,6 @@ kif_copy_config(struct proto_config *dest, struct proto_config *src)
struct protocol proto_unix_iface = {
.name = "Device",
.template = "device%d",
- .class = PROTOCOL_DEVICE,
.proto_size = sizeof(struct kif_proto),
.config_size = sizeof(struct kif_config),
.preconfig = kif_preconfig,
@@ -243,6 +242,13 @@ struct protocol proto_unix_iface = {
.copy_config = kif_copy_config
};
+void
+kif_build(void)
+{
+ proto_build(&proto_unix_iface);
+}
+
+
/*
* Tracing of routes
*/
@@ -280,30 +286,46 @@ static struct tbf rl_alien = TBF_DEFAULT_LOG_LIMITS;
static inline u32
krt_metric(rte *a)
{
- eattr *ea = ea_find(a->attrs->eattrs, EA_KRT_METRIC);
+ eattr *ea = ea_find(a->attrs, &ea_krt_metric);
return ea ? ea->u.data : 0;
}
static inline int
-krt_rte_better(rte *a, rte *b)
+krt_same_key(rte *a, rte *b)
+{
+ return (krt_metric(a) == krt_metric(b));
+}
+
+static inline int
+krt_uptodate(rte *a, rte *b)
{
- return (krt_metric(a) > krt_metric(b));
+ return (a->attrs == b->attrs);
}
/* Called when alien route is discovered during scan */
static void
-krt_learn_rte(struct krt_proto *p, rte *e)
+krt_learn_scan(struct krt_proto *p, rte *e)
{
- struct rte_src *src = e->src = rt_get_source(&p->p, krt_metric(e));
- rte_update(p->p.main_channel, e->net, e, e->src);
- rt_unlock_source(src);
+ rte e0 = {
+ .attrs = e->attrs,
+ .src = rt_get_source(&p->p, krt_metric(e)),
+ };
+
+ ea_set_attr_u32(&e0.attrs, &ea_gen_preference, 0, p->p.main_channel->preference);
+
+ rte_update(p->p.main_channel, e->net, &e0, e0.src);
+ rt_unlock_source(e0.src);
}
static void
-krt_learn_init(struct krt_proto *p)
+krt_learn_async(struct krt_proto *p, rte *e, int new)
{
- if (KRT_CF->learn)
- channel_setup_in_table(p->p.main_channel, 1);
+ if (new)
+ return krt_learn_scan(p, e);
+
+ struct rte_src *src = rt_get_source(&p->p, krt_metric(e));
+ rte_update(p->p.main_channel, e->net, NULL, src);
+ rt_unlock_source(src);
}
#endif
@@ -323,7 +345,7 @@ rte_feed_count(net *n)
{
uint count = 0;
for (struct rte_storage *e = n->routes; e; e = e->next)
- if (rte_is_valid(RTES_OR_NULL(e)))
+ if (rte_is_valid(RTE_OR_NULL(e)))
count++;
return count;
}
@@ -333,7 +355,7 @@ rte_feed_obtain(net *n, rte **feed, uint count)
{
uint i = 0;
for (struct rte_storage *e = n->routes; e; e = e->next)
- if (rte_is_valid(RTES_OR_NULL(e)))
+ if (rte_is_valid(RTE_OR_NULL(e)))
{
ASSERT_DIE(i < count);
feed[i++] = &e->rte;
@@ -344,6 +366,13 @@ rte_feed_obtain(net *n, rte **feed, uint count)
static struct rte *
krt_export_net(struct krt_proto *p, net *net)
{
+ /* FIXME: Here we are calling filters in table-locked context when exporting
+ * to kernel. Here BIRD can crash if the user requested ROA check in kernel
+ * export filter. It doesn't make much sense to write the filters like this,
+ * therefore we may keep this unfinished piece of work here for later as it
+ * won't really affect anybody. */
+ ASSERT_DIE(RT_IS_LOCKED(p->p.main_channel->table));
+
struct channel *c = p->p.main_channel;
const struct filter *filter = c->out_filter;
@@ -372,7 +401,7 @@ krt_export_net(struct krt_proto *p, net *net)
if (filter == FILTER_ACCEPT)
goto accept;
- if (f_run(filter, &rt, krt_filter_lp, FF_SILENT) > F_ACCEPT)
+ if (f_run(filter, &rt, FF_SILENT) > F_ACCEPT)
goto reject;
@@ -386,15 +415,12 @@ reject:
static int
krt_same_dest(rte *k, rte *e)
{
- rta *ka = k->attrs, *ea = e->attrs;
-
- if (ka->dest != ea->dest)
- return 0;
+ ea_list *ka = k->attrs, *ea = e->attrs;
- if (ka->dest == RTD_UNICAST)
- return nexthop_same(&(ka->nh), &(ea->nh));
+ eattr *nhea_k = ea_find(ka, &ea_gen_nexthop);
+ eattr *nhea_e = ea_find(ea, &ea_gen_nexthop);
- return 1;
+ return (!nhea_k == !nhea_e) && adata_same(nhea_k->u.ptr, nhea_e->u.ptr);
}
/*
@@ -419,7 +445,7 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src)
case KRT_SRC_ALIEN:
if (KRT_CF->learn)
- krt_learn_rte(p, e);
+ krt_learn_scan(p, e);
else
krt_trace_in_rl(&rl_alien, p, e, "[alien] ignored");
return;
@@ -427,7 +453,9 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src)
#endif
/* The rest is for KRT_SRC_BIRD (or KRT_SRC_UNKNOWN) */
- RT_LOCK(p->p.main_channel->table);
+ RT_LOCKED(p->p.main_channel->table, tab)
+ {
+
/* Deleting all routes if flush is requested */
if (p->flush_routes)
goto delete;
@@ -436,7 +464,7 @@ krt_got_route(struct krt_proto *p, rte *e, s8 src)
if (!p->ready)
goto ignore;
- net *net = net_find(RT_PRIV(p->p.main_channel->table), e->net);
+ net *net = net_find(tab, e->net);
if (!net || !krt_is_installed(p, net))
goto delete;
@@ -481,8 +509,9 @@ delete:
krt_replace_rte(p, e->net, NULL, e);
goto done;
-done:
- RT_UNLOCK(p->p.main_channel->table);
+done:;
+ }
+
lp_flush(krt_filter_lp);
}
@@ -490,18 +519,13 @@ static void
krt_init_scan(struct krt_proto *p)
{
bmap_reset(&p->seen_map, 1024);
-
-#ifdef KRT_ALLOW_LEARN
- if (KRT_CF->learn)
- channel_refresh_begin(p->p.main_channel);
-#endif
}
static void
krt_prune(struct krt_proto *p)
{
- RT_LOCK(p->p.main_channel->table);
- rtable_private *t = RT_PRIV(p->p.main_channel->table);
+ RT_LOCKED(p->p.main_channel->table, t)
+ {
KRT_TRACE(p, D_EVENTS, "Pruning table %s", t->name);
FIB_WALK(&t->fib, net, n)
@@ -521,15 +545,10 @@ krt_prune(struct krt_proto *p)
}
FIB_WALK_END;
- RT_UNLOCK(p->p.main_channel->table);
-
-#ifdef KRT_ALLOW_LEARN
- if (KRT_CF->learn)
- channel_refresh_end(p->p.main_channel);
-#endif
-
if (p->ready)
p->initialized = 1;
+
+ }
}
static void
@@ -567,25 +586,24 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src)
case KRT_SRC_ALIEN:
if (KRT_CF->learn)
{
- krt_learn_rte(p, e);
+ krt_learn_async(p, e, new);
return;
}
#endif
}
}
+
/*
* Periodic scanning
*/
-
-#ifdef CONFIG_ALL_TABLES_AT_ONCE
-
-static timer *krt_scan_timer;
-static int krt_scan_count;
+static timer *krt_scan_all_timer;
+static int krt_scan_all_count;
+static _Bool krt_scan_all_tables;
static void
-krt_scan(timer *t UNUSED)
+krt_scan_all(timer *t UNUSED)
{
struct krt_proto *p;
node *n;
@@ -606,35 +624,42 @@ krt_scan(timer *t UNUSED)
}
static void
-krt_scan_timer_start(struct krt_proto *p)
+krt_scan_all_timer_start(struct krt_proto *p)
{
- if (!krt_scan_count)
- krt_scan_timer = tm_new_init(krt_pool, krt_scan, NULL, KRT_CF->scan_time, 0);
+ if (!krt_scan_all_count)
+ krt_scan_all_timer = tm_new_init(krt_pool, krt_scan_all, NULL, KRT_CF->scan_time, 0);
- krt_scan_count++;
+ krt_scan_all_count++;
- tm_start(krt_scan_timer, 1 S);
+ tm_start(krt_scan_all_timer, 1 S);
}
static void
-krt_scan_timer_stop(struct krt_proto *p UNUSED)
+krt_scan_all_timer_stop(void)
{
- krt_scan_count--;
+ ASSERT(krt_scan_all_count > 0);
+
+ krt_scan_all_count--;
- if (!krt_scan_count)
+ if (!krt_scan_all_count)
{
- rfree(krt_scan_timer);
- krt_scan_timer = NULL;
+ rfree(krt_scan_all_timer);
+ krt_scan_all_timer = NULL;
}
}
static void
-krt_scan_timer_kick(struct krt_proto *p UNUSED)
+krt_scan_all_timer_kick(void)
{
- tm_start(krt_scan_timer, 0);
+ tm_start(krt_scan_all_timer, 0);
+}
+
+void
+krt_use_shared_scan(void)
+{
+ krt_scan_all_tables = 1;
}
-#else
static void
krt_scan(timer *t)
@@ -652,35 +677,42 @@ krt_scan(timer *t)
static void
krt_scan_timer_start(struct krt_proto *p)
{
- p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0);
- tm_start(p->scan_timer, 1 S);
+ if (krt_scan_all_tables)
+ krt_scan_all_timer_start(p);
+ else
+ {
+ p->scan_timer = tm_new_init(p->p.pool, krt_scan, p, KRT_CF->scan_time, 0);
+ tm_start(p->scan_timer, 1 S);
+ }
}
static void
krt_scan_timer_stop(struct krt_proto *p)
{
- tm_stop(p->scan_timer);
+ if (krt_scan_all_tables)
+ krt_scan_all_timer_stop();
+ else
+ tm_stop(p->scan_timer);
}
static void
krt_scan_timer_kick(struct krt_proto *p)
{
- tm_start(p->scan_timer, 0);
+ if (krt_scan_all_tables)
+ krt_scan_all_timer_kick();
+ else
+ tm_start(p->scan_timer, 0);
}
-#endif
-
-
-
/*
* Updates
*/
static int
-krt_preexport(struct channel *c, rte *e)
+krt_preexport(struct channel *C, rte *e)
{
- if (e->src->owner == &c->proto->sources)
+ if (e->src->owner == &C->proto->sources)
return -1;
if (!krt_capable(e))
@@ -780,11 +812,6 @@ krt_postconfig(struct proto_config *CF)
if (! proto_cf_main_channel(CF))
cf_error("Channel not specified");
-#ifdef CONFIG_ALL_TABLES_AT_ONCE
- if (krt_cf->scan_time != cf->scan_time)
- cf_error("All kernel syncers must use the same table scan interval");
-#endif
-
struct channel_config *cc = proto_cf_main_channel(CF);
struct rtable_config *tab = cc->table;
if (tab->krt_attached)
@@ -813,7 +840,6 @@ krt_init(struct proto_config *CF)
p->p.if_notify = krt_if_notify;
p->p.reload_routes = krt_reload_routes;
p->p.feed_end = krt_feed_end;
- p->p.rte_better = krt_rte_better;
krt_sys_init(p);
return &p->p;
@@ -839,10 +865,6 @@ krt_start(struct proto *P)
bmap_init(&p->seen_map, p->p.pool, 1024);
add_tail(&krt_proto_list, &p->krt_node);
-#ifdef KRT_ALLOW_LEARN
- krt_learn_init(p);
-#endif
-
if (!krt_sys_start(p))
{
rem_node(&p->krt_node);
@@ -922,24 +944,15 @@ krt_copy_config(struct proto_config *dest, struct proto_config *src)
krt_sys_copy_config(d, s);
}
-static int
-krt_get_attr(const eattr *a, byte *buf, int buflen)
-{
- switch (a->id)
- {
- case EA_KRT_SOURCE:
- bsprintf(buf, "source");
- return GA_NAME;
-
- case EA_KRT_METRIC:
- bsprintf(buf, "metric");
- return GA_NAME;
-
- default:
- return krt_sys_get_attr(a, buf, buflen);
- }
-}
+struct ea_class ea_krt_source = {
+ .name = "krt_source",
+ .type = T_INT,
+};
+struct ea_class ea_krt_metric = {
+ .name = "krt_metric",
+ .type = T_INT,
+};
#ifdef CONFIG_IP6_SADR_KERNEL
#define MAYBE_IP6_SADR NB_IP6_SADR
@@ -956,7 +969,6 @@ krt_get_attr(const eattr *a, byte *buf, int buflen)
struct protocol proto_unix_kernel = {
.name = "Kernel",
.template = "kernel%d",
- .class = PROTOCOL_KERNEL,
.preference = DEF_PREF_INHERITED,
.channel_mask = NB_IP | MAYBE_IP6_SADR | MAYBE_MPLS,
.proto_size = sizeof(struct krt_proto),
@@ -968,5 +980,15 @@ struct protocol proto_unix_kernel = {
.shutdown = krt_shutdown,
.reconfigure = krt_reconfigure,
.copy_config = krt_copy_config,
- .get_attr = krt_get_attr,
};
+
+void
+krt_build(void)
+{
+ proto_build(&proto_unix_kernel);
+
+ EA_REGISTER_ALL(
+ &ea_krt_source,
+ &ea_krt_metric,
+ );
+}
diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h
index 968c5b16..9f7ebb4f 100644
--- a/sysdep/unix/krt.h
+++ b/sysdep/unix/krt.h
@@ -21,8 +21,7 @@ struct kif_proto;
#define KRT_DEFAULT_ECMP_LIMIT 16
-#define EA_KRT_SOURCE EA_CODE(PROTOCOL_KERNEL, 0)
-#define EA_KRT_METRIC EA_CODE(PROTOCOL_KERNEL, 1)
+extern struct ea_class ea_krt_source, ea_krt_metric;
#define KRT_REF_SEEN 0x1 /* Seen in table */
#define KRT_REF_BEST 0x2 /* Best in table */
@@ -51,10 +50,7 @@ struct krt_proto {
struct proto p;
struct krt_state sys; /* Sysdep state */
-#ifndef CONFIG_ALL_TABLES_AT_ONCE
timer *scan_timer;
-#endif
-
struct bmap sync_map; /* Keeps track which exported routes were successfully written to kernel */
struct bmap seen_map; /* Routes seen during last periodic scan */
node krt_node; /* Node in krt_proto_list */
@@ -76,6 +72,7 @@ extern pool *krt_pool;
struct proto_config * kif_init_config(int class);
void kif_request_scan(void);
+void krt_use_shared_scan(void);
void krt_got_route(struct krt_proto *p, struct rte *e, s8 src);
void krt_got_route_async(struct krt_proto *p, struct rte *e, int new, s8 src);
diff --git a/sysdep/unix/log.c b/sysdep/unix/log.c
index f48588b6..185231e8 100644
--- a/sysdep/unix/log.c
+++ b/sysdep/unix/log.c
@@ -36,10 +36,10 @@ static FILE *dbgf;
static list *current_log_list;
static char *current_syslog_name; /* NULL -> syslog closed */
-static _Atomic uint max_coro_id = ATOMIC_VAR_INIT(1);
-static _Thread_local uint this_coro_id;
+static _Atomic uint max_thread_id = ATOMIC_VAR_INIT(1);
+static _Thread_local uint this_thread_id;
-#define THIS_CORO_ID (this_coro_id ?: (this_coro_id = atomic_fetch_add_explicit(&max_coro_id, 1, memory_order_acq_rel)))
+#define THIS_THREAD_ID (this_thread_id ?: (this_thread_id = atomic_fetch_add_explicit(&max_thread_id, 1, memory_order_acq_rel)))
#include <pthread.h>
@@ -183,7 +183,7 @@ log_commit(int class, buffer *buf)
l->pos += msg_len;
}
- fprintf(l->fh, "%s [%04x] <%s> ", tbuf, THIS_CORO_ID, class_names[class]);
+ fprintf(l->fh, "%s [%04x] <%s> ", tbuf, THIS_THREAD_ID, class_names[class]);
}
fputs(buf->start, l->fh);
fputc('\n', l->fh);
@@ -329,7 +329,7 @@ debug(const char *msg, ...)
sec = dbg_time.tv_sec - dbg_time_start.tv_sec - 1;
}
- int n = bsnprintf(pos, max, "%u.%09u: [%04x] ", sec, nsec, THIS_CORO_ID);
+ int n = bsnprintf(pos, max, "%u.%09u: [%04x] ", sec, nsec, THIS_THREAD_ID);
pos += n;
max -= n;
diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c
index 5da27cb6..bf9f2be0 100644
--- a/sysdep/unix/main.c
+++ b/sysdep/unix/main.c
@@ -31,7 +31,7 @@
#include "lib/locking.h"
#include "lib/timer.h"
#include "lib/string.h"
-#include "nest/route.h"
+#include "nest/rt.h"
#include "nest/protocol.h"
#include "nest/iface.h"
#include "nest/cli.h"
@@ -57,7 +57,7 @@ async_dump(void)
// XXXX tm_dump_all();
if_dump_all();
neigh_dump_all();
- rta_dump_all();
+ ea_dump_all();
rt_dump_all();
protos_dump_all();
@@ -117,7 +117,7 @@ add_num_const(char *name, int val, const char *file, const uint line)
struct f_val *v = cfg_alloc(sizeof(struct f_val));
*v = (struct f_val) { .type = T_INT, .val.i = val };
struct symbol *sym = cf_get_symbol(name);
- if (sym->class && (sym->scope == conf_this_scope))
+ if (sym->class && cf_symbol_is_local(sym))
cf_error("Error reading value for %s from %s:%d: already defined", name, file, line);
cf_define_symbol(sym, SYM_CONSTANT | T_INT, val, v);
@@ -683,7 +683,7 @@ signal_init(void)
* Parsing of command-line arguments
*/
-static char *opt_list = "B:c:dD:ps:P:u:g:flRh";
+static char *opt_list = "bc:dD:ps:P:u:g:flRh";
int parse_and_exit;
char *bird_name;
static char *use_user;
@@ -704,7 +704,6 @@ display_help(void)
fprintf(stderr,
"\n"
"Options: \n"
- " -B <block-size> Use 2^this number as memory allocation block size (default: 12)\n"
" -c <config-file> Use given configuration file instead of\n"
" " PATH_CONFIG_FILE "\n"
" -d Enable debug messages and run bird in foreground\n"
@@ -791,15 +790,12 @@ get_gid(const char *s)
return gr->gr_gid;
}
-extern _Bool alloc_multipage;
-
static void
parse_args(int argc, char **argv)
{
int config_changed = 0;
int socket_changed = 0;
int c;
- int bp;
bird_name = get_bird_name(argv[0], "bird");
if (argc == 2)
@@ -812,29 +808,6 @@ parse_args(int argc, char **argv)
while ((c = getopt(argc, argv, opt_list)) >= 0)
switch (c)
{
- case 'B':
- bp = atoi(optarg);
- if (bp < 1)
- {
- fprintf(stderr, "Strange block size power %d\n\n", bp);
- display_usage();
- exit(1);
- }
-
- if ((1 << bp) < page_size)
- {
- fprintf(stderr, "Requested block size %ld is lesser than page size %ld\n\n", (1L<<bp), page_size);
- display_usage();
- exit(1);
- }
-
- if ((1L << bp) > page_size)
- {
- alloc_multipage = 1;
- page_size = (1L << bp);
- }
-
- break;
case 'c':
config_name = optarg;
config_changed = 1;
@@ -889,8 +862,6 @@ parse_args(int argc, char **argv)
}
}
-void resource_sys_init(void);
-
/*
* Hic Est main()
*/
@@ -904,7 +875,6 @@ main(int argc, char **argv)
#endif
times_update();
- resource_sys_init();
parse_args(argc, argv);
log_switch(1, NULL, NULL);
@@ -915,8 +885,8 @@ main(int argc, char **argv)
resource_init();
birdloop_init();
olock_init();
- io_init();
rt_init();
+ io_init();
if_init();
// roa_init();
config_init();
@@ -940,8 +910,6 @@ main(int argc, char **argv)
open_pid_file();
protos_build();
- proto_build(&proto_unix_kernel);
- proto_build(&proto_unix_iface);
struct config *conf = read_config();