summaryrefslogtreecommitdiff
path: root/sysdep
diff options
context:
space:
mode:
Diffstat (limited to 'sysdep')
-rw-r--r--sysdep/bsd/krt-sock.c25
-rw-r--r--sysdep/linux/krt-sys.h32
-rw-r--r--sysdep/linux/netlink.Y48
-rw-r--r--sysdep/linux/netlink.c558
-rw-r--r--sysdep/unix/Makefile4
-rw-r--r--sysdep/unix/alloc.c190
-rw-r--r--sysdep/unix/domain.c116
-rw-r--r--sysdep/unix/io-loop.c578
-rw-r--r--sysdep/unix/io-loop.h43
-rw-r--r--sysdep/unix/io.c196
-rw-r--r--sysdep/unix/krt.Y5
-rw-r--r--sysdep/unix/krt.c485
-rw-r--r--sysdep/unix/krt.h17
-rw-r--r--sysdep/unix/log.c56
-rw-r--r--sysdep/unix/main.c24
15 files changed, 1497 insertions, 880 deletions
diff --git a/sysdep/bsd/krt-sock.c b/sysdep/bsd/krt-sock.c
index 521f43f3..1c1bd50c 100644
--- a/sysdep/bsd/krt-sock.c
+++ b/sysdep/bsd/krt-sock.c
@@ -25,7 +25,7 @@
#include "nest/bird.h"
#include "nest/iface.h"
-#include "nest/route.h"
+#include "nest/rt.h"
#include "nest/protocol.h"
#include "nest/iface.h"
#include "sysdep/unix/unix.h"
@@ -398,7 +398,6 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan)
/* p is NULL iff KRT_SHARED_SOCKET and !scan */
int ipv6;
- rte *e;
net *net;
sockaddr dst, gate, mask;
ip_addr idst, igate, imask;
@@ -519,11 +518,10 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan)
net = net_get(p->p.main_channel->table, &ndst);
rta a = {
- .src = p->p.main_source,
- .source = RTS_INHERIT,
- .scope = SCOPE_UNIVERSE,
};
+ ea_set_attr_u32(&a->eattrs, &ea_gen_source, 0, RTS_INHERIT);
+
/* reject/blackhole routes have also set RTF_GATEWAY,
we wil check them first. */
@@ -579,19 +577,16 @@ krt_read_route(struct ks_msg *msg, struct krt_proto *p, int scan)
}
}
- done:
- e = rte_get_temp(&a);
- e->net = net;
- e->u.krt.src = src;
- e->u.krt.proto = src2;
- e->u.krt.seen = 0;
- e->u.krt.best = 0;
- e->u.krt.metric = 0;
+ done:;
+ rte e0 = { .attrs = &a, .net = net, };
+
+ ea_set_attr(e0.attrs->eattrs,
+ EA_LITERAL_EMBEDDED(EA_KRT_SOURCE, T_INT, 0, src2));
if (scan)
- krt_got_route(p, e);
+ krt_got_route(p, &e0, src);
else
- krt_got_route_async(p, e, new);
+ krt_got_route_async(p, &e0, new, src);
}
static void
diff --git a/sysdep/linux/krt-sys.h b/sysdep/linux/krt-sys.h
index 8897f889..aa90f6e4 100644
--- a/sysdep/linux/krt-sys.h
+++ b/sysdep/linux/krt-sys.h
@@ -34,38 +34,6 @@ static inline struct ifa * kif_get_primary_ip(struct iface *i UNUSED) { return N
#define KRT_ALLOW_MERGE_PATHS 1
-#define EA_KRT_PREFSRC EA_CODE(PROTOCOL_KERNEL, 0x10)
-#define EA_KRT_REALM EA_CODE(PROTOCOL_KERNEL, 0x11)
-#define EA_KRT_SCOPE EA_CODE(PROTOCOL_KERNEL, 0x12)
-
-
-#define KRT_METRICS_MAX 0x10 /* RTAX_QUICKACK+1 */
-#define KRT_METRICS_OFFSET 0x20 /* Offset of EA_KRT_* vs RTAX_* */
-
-#define KRT_FEATURES_MAX 4
-
-/*
- * Following attributes are parts of RTA_METRICS kernel route attribute, their
- * ids must be consistent with their RTAX_* constants (+ KRT_METRICS_OFFSET)
- */
-#define EA_KRT_METRICS EA_CODE(PROTOCOL_KERNEL, 0x20) /* Dummy one */
-#define EA_KRT_LOCK EA_CODE(PROTOCOL_KERNEL, 0x21)
-#define EA_KRT_MTU EA_CODE(PROTOCOL_KERNEL, 0x22)
-#define EA_KRT_WINDOW EA_CODE(PROTOCOL_KERNEL, 0x23)
-#define EA_KRT_RTT EA_CODE(PROTOCOL_KERNEL, 0x24)
-#define EA_KRT_RTTVAR EA_CODE(PROTOCOL_KERNEL, 0x25)
-#define EA_KRT_SSTRESH EA_CODE(PROTOCOL_KERNEL, 0x26)
-#define EA_KRT_CWND EA_CODE(PROTOCOL_KERNEL, 0x27)
-#define EA_KRT_ADVMSS EA_CODE(PROTOCOL_KERNEL, 0x28)
-#define EA_KRT_REORDERING EA_CODE(PROTOCOL_KERNEL, 0x29)
-#define EA_KRT_HOPLIMIT EA_CODE(PROTOCOL_KERNEL, 0x2a)
-#define EA_KRT_INITCWND EA_CODE(PROTOCOL_KERNEL, 0x2b)
-#define EA_KRT_FEATURES EA_CODE(PROTOCOL_KERNEL, 0x2c)
-#define EA_KRT_RTO_MIN EA_CODE(PROTOCOL_KERNEL, 0x2d)
-#define EA_KRT_INITRWND EA_CODE(PROTOCOL_KERNEL, 0x2e)
-#define EA_KRT_QUICKACK EA_CODE(PROTOCOL_KERNEL, 0x2f)
-
-
struct krt_params {
u32 table_id; /* Kernel table ID we sync with */
u32 metric; /* Kernel metric used for all routes */
diff --git a/sysdep/linux/netlink.Y b/sysdep/linux/netlink.Y
index 487ad1d8..7ba8c7c9 100644
--- a/sysdep/linux/netlink.Y
+++ b/sysdep/linux/netlink.Y
@@ -11,9 +11,6 @@ CF_HDR
CF_DECLS
CF_KEYWORDS(KERNEL, TABLE, METRIC, NETLINK, RX, BUFFER,
- KRT_PREFSRC, KRT_REALM, KRT_SCOPE, KRT_MTU, KRT_WINDOW,
- KRT_RTT, KRT_RTTVAR, KRT_SSTRESH, KRT_CWND, KRT_ADVMSS, KRT_REORDERING,
- KRT_HOPLIMIT, KRT_INITCWND, KRT_RTO_MIN, KRT_INITRWND, KRT_QUICKACK,
KRT_LOCK_MTU, KRT_LOCK_WINDOW, KRT_LOCK_RTT, KRT_LOCK_RTTVAR,
KRT_LOCK_SSTRESH, KRT_LOCK_CWND, KRT_LOCK_ADVMSS, KRT_LOCK_REORDERING,
KRT_LOCK_HOPLIMIT, KRT_LOCK_RTO_MIN, KRT_FEATURE_ECN, KRT_FEATURE_ALLFRAG)
@@ -28,39 +25,22 @@ kern_sys_item:
| NETLINK RX BUFFER expr { THIS_KRT->sys.netlink_rx_buffer = $4; }
;
-dynamic_attr: KRT_PREFSRC { $$ = f_new_dynamic_attr(EAF_TYPE_IP_ADDRESS, T_IP, EA_KRT_PREFSRC); } ;
-dynamic_attr: KRT_REALM { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_REALM); } ;
-dynamic_attr: KRT_SCOPE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SCOPE); } ;
-
-dynamic_attr: KRT_MTU { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_MTU); } ;
-dynamic_attr: KRT_WINDOW { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_WINDOW); } ;
-dynamic_attr: KRT_RTT { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTT); } ;
-dynamic_attr: KRT_RTTVAR { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTTVAR); } ;
-dynamic_attr: KRT_SSTRESH { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SSTRESH); } ;
-dynamic_attr: KRT_CWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_CWND); } ;
-dynamic_attr: KRT_ADVMSS { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_ADVMSS); } ;
-dynamic_attr: KRT_REORDERING { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_REORDERING); } ;
-dynamic_attr: KRT_HOPLIMIT { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_HOPLIMIT); } ;
-dynamic_attr: KRT_INITCWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_INITCWND); } ;
-dynamic_attr: KRT_RTO_MIN { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_RTO_MIN); } ;
-dynamic_attr: KRT_INITRWND { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_INITRWND); } ;
-dynamic_attr: KRT_QUICKACK { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_QUICKACK); } ;
-
/* Bits of EA_KRT_LOCK, based on RTAX_* constants */
-dynamic_attr: KRT_LOCK_MTU { $$ = f_new_dynamic_attr_bit(2, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_WINDOW { $$ = f_new_dynamic_attr_bit(3, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_RTT { $$ = f_new_dynamic_attr_bit(4, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_RTTVAR { $$ = f_new_dynamic_attr_bit(5, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_SSTRESH { $$ = f_new_dynamic_attr_bit(6, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_CWND { $$ = f_new_dynamic_attr_bit(7, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_ADVMSS { $$ = f_new_dynamic_attr_bit(8, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_REORDERING { $$ = f_new_dynamic_attr_bit(9, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_HOPLIMIT { $$ = f_new_dynamic_attr_bit(10, T_BOOL, EA_KRT_LOCK); } ;
-dynamic_attr: KRT_LOCK_RTO_MIN { $$ = f_new_dynamic_attr_bit(13, T_BOOL, EA_KRT_LOCK); } ;
-
-dynamic_attr: KRT_FEATURE_ECN { $$ = f_new_dynamic_attr_bit(0, T_BOOL, EA_KRT_FEATURES); } ;
-dynamic_attr: KRT_FEATURE_ALLFRAG { $$ = f_new_dynamic_attr(3, T_BOOL, EA_KRT_FEATURES); } ;
+attr_bit: KRT_LOCK_MTU { $$ = f_new_dynamic_attr_bit(2, "krt_lock"); } ;
+attr_bit: KRT_LOCK_WINDOW { $$ = f_new_dynamic_attr_bit(3, "krt_lock"); } ;
+attr_bit: KRT_LOCK_RTT { $$ = f_new_dynamic_attr_bit(4, "krt_lock"); } ;
+attr_bit: KRT_LOCK_RTTVAR { $$ = f_new_dynamic_attr_bit(5, "krt_lock"); } ;
+attr_bit: KRT_LOCK_SSTRESH { $$ = f_new_dynamic_attr_bit(6, "krt_lock"); } ;
+attr_bit: KRT_LOCK_CWND { $$ = f_new_dynamic_attr_bit(7, "krt_lock"); } ;
+attr_bit: KRT_LOCK_ADVMSS { $$ = f_new_dynamic_attr_bit(8, "krt_lock"); } ;
+attr_bit: KRT_LOCK_REORDERING { $$ = f_new_dynamic_attr_bit(9, "krt_lock"); } ;
+attr_bit: KRT_LOCK_HOPLIMIT { $$ = f_new_dynamic_attr_bit(10, "krt_lock"); } ;
+attr_bit: KRT_LOCK_RTO_MIN { $$ = f_new_dynamic_attr_bit(13, "krt_lock"); } ;
+
+/* Bits of EA_KRT_FEATURES */
+attr_bit: KRT_FEATURE_ECN { $$ = f_new_dynamic_attr_bit(0, "krt_features"); } ;
+attr_bit: KRT_FEATURE_ALLFRAG { $$ = f_new_dynamic_attr_bit(3, "krt_features"); } ;
CF_CODE
diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c
index bc65e0d2..099ae6e9 100644
--- a/sysdep/linux/netlink.c
+++ b/sysdep/linux/netlink.c
@@ -17,7 +17,7 @@
#undef LOCAL_DEBUG
#include "nest/bird.h"
-#include "nest/route.h"
+#include "nest/rt.h"
#include "nest/protocol.h"
#include "nest/iface.h"
#include "lib/alloca.h"
@@ -26,6 +26,7 @@
#include "lib/socket.h"
#include "lib/string.h"
#include "lib/hash.h"
+#include "lib/macro.h"
#include "conf/conf.h"
#include <asm/types.h>
@@ -109,8 +110,8 @@ struct nl_parse_state
int scan;
int merge;
- net *net;
- rta *attrs;
+ net_addr *net;
+ ea_list *attrs;
struct krt_proto *proto;
s8 new;
s8 krt_src;
@@ -122,6 +123,101 @@ struct nl_parse_state
};
/*
+ * Netlink eattr definitions
+ */
+
+#define KRT_METRICS_MAX ARRAY_SIZE(ea_krt_metrics)
+#define KRT_FEATURES_MAX 4
+
+static void krt_bitfield_format(const eattr *e, byte *buf, uint buflen);
+
+static struct ea_class
+ ea_krt_prefsrc = {
+ .name = "krt_prefsrc",
+ .type = T_IP,
+ },
+ ea_krt_realm = {
+ .name = "krt_realm",
+ .type = T_INT,
+ },
+ ea_krt_scope = {
+ .name = "krt_scope",
+ .type = T_INT,
+ };
+
+static struct ea_class ea_krt_metrics[] = {
+ [RTAX_LOCK] = {
+ .name = "krt_lock",
+ .type = T_INT,
+ .format = krt_bitfield_format,
+ },
+ [RTAX_FEATURES] = {
+ .name = "krt_features",
+ .type = T_INT,
+ .format = krt_bitfield_format,
+ },
+#define KRT_METRIC_INT(_rtax, _name) [_rtax] = { .name = _name, .type = T_INT }
+ KRT_METRIC_INT(RTAX_MTU, "krt_mtu"),
+ KRT_METRIC_INT(RTAX_WINDOW, "krt_window"),
+ KRT_METRIC_INT(RTAX_RTT, "krt_rtt"),
+ KRT_METRIC_INT(RTAX_RTTVAR, "krt_rttvar"),
+ KRT_METRIC_INT(RTAX_SSTHRESH, "krt_sstresh"),
+ KRT_METRIC_INT(RTAX_CWND, "krt_cwnd"),
+ KRT_METRIC_INT(RTAX_ADVMSS, "krt_advmss"),
+ KRT_METRIC_INT(RTAX_REORDERING, "krt_reordering"),
+ KRT_METRIC_INT(RTAX_HOPLIMIT, "krt_hoplimit"),
+ KRT_METRIC_INT(RTAX_INITCWND, "krt_initcwnd"),
+ KRT_METRIC_INT(RTAX_RTO_MIN, "krt_rto_min"),
+ KRT_METRIC_INT(RTAX_INITRWND, "krt_initrwnd"),
+ KRT_METRIC_INT(RTAX_QUICKACK, "krt_quickack"),
+#undef KRT_METRIC_INT
+};
+
+static const char *krt_metrics_names[KRT_METRICS_MAX] = {
+ NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss",
+ "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack"
+};
+
+static const char *krt_features_names[KRT_FEATURES_MAX] = {
+ "ecn", NULL, NULL, "allfrag"
+};
+
+static void
+krt_bitfield_format(const eattr *a, byte *buf, uint buflen)
+{
+ if (a->id == ea_krt_metrics[RTAX_LOCK].id)
+ ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX);
+ else if (a->id == ea_krt_metrics[RTAX_FEATURES].id)
+ ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX);
+}
+
+static void
+nl_ea_register(void)
+{
+ EA_REGISTER_ALL(
+ &ea_krt_prefsrc,
+ &ea_krt_realm,
+ &ea_krt_scope
+ );
+
+ for (uint i = 0; i < KRT_METRICS_MAX; i++)
+ {
+ if (!ea_krt_metrics[i].name)
+ ea_krt_metrics[i] = (struct ea_class) {
+ .name = mb_sprintf(&root_pool, "krt_metric_%d", i),
+ .type = T_INT,
+ };
+
+ ea_register_init(&ea_krt_metrics[i]);
+ }
+
+ for (uint i = 1; i < KRT_METRICS_MAX; i++)
+ ASSERT_DIE(ea_krt_metrics[i].id == ea_krt_metrics[0].id + i);
+}
+
+
+
+/*
* Synchronous Netlink interface
*/
@@ -747,12 +843,12 @@ nl_add_nexthop(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af UNUS
}
static void
-nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, ea_list *eattrs)
+nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop_adata *nhad, int af, ea_list *eattrs)
{
struct rtattr *a = nl_open_attr(h, bufsize, RTA_MULTIPATH);
- eattr *flow = ea_find(eattrs, EA_KRT_REALM);
+ eattr *flow = ea_find(eattrs, &ea_krt_realm);
- for (; nh; nh = nh->next)
+ NEXTHOP_WALK(nh, nhad)
{
struct rtnexthop *rtnh = nl_open_nexthop(h, bufsize);
@@ -776,31 +872,44 @@ nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, e
nl_close_attr(h, a);
}
-static struct nexthop *
+static struct nexthop_adata *
nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr *n, struct rtattr *ra, int af, int krt_src)
{
struct rtattr *a[BIRD_RTA_MAX];
- struct rtnexthop *nh = RTA_DATA(ra);
- struct nexthop *rv, *first, **last;
- unsigned len = RTA_PAYLOAD(ra);
+ struct rtnexthop *nh, *orig_nh = RTA_DATA(ra);
+ unsigned len, orig_len = RTA_PAYLOAD(ra);
+ uint cnt = 0;
- first = NULL;
- last = &first;
+ /* First count the nexthops */
+ for (len = orig_len, nh = orig_nh; len; len -= NLMSG_ALIGN(nh->rtnh_len), nh = RTNH_NEXT(nh))
+ {
+ /* Use RTNH_OK(nh,len) ?? */
+ if ((len < sizeof(*nh)) || (len < nh->rtnh_len))
+ goto err;
+
+ if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD))
+ ;
+ else
+ cnt++;
+ }
+
+ struct nexthop_adata *nhad = lp_allocz(s->pool, cnt * NEXTHOP_MAX_SIZE + sizeof *nhad);
+ struct nexthop *rv = &nhad->nh;
- while (len)
+ for (len = orig_len, nh = orig_nh; len; len -= NLMSG_ALIGN(nh->rtnh_len), nh = RTNH_NEXT(nh))
{
/* Use RTNH_OK(nh,len) ?? */
if ((len < sizeof(*nh)) || (len < nh->rtnh_len))
goto err;
if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD))
- goto next;
+ continue;
- *last = rv = lp_allocz(s->pool, NEXTHOP_MAX_SIZE);
- last = &(rv->next);
+ *rv = (struct nexthop) {
+ .weight = nh->rtnh_hops,
+ .iface = if_find_by_index(nh->rtnh_ifindex),
+ };
- rv->weight = nh->rtnh_hops;
- rv->iface = if_find_by_index(nh->rtnh_ifindex);
if (!rv->iface)
{
log(L_ERR "KRT: Received route %N with unknown ifindex %u", n, nh->rtnh_ifindex);
@@ -879,16 +988,14 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr
}
#endif
- next:
- len -= NLMSG_ALIGN(nh->rtnh_len);
- nh = RTNH_NEXT(nh);
+ rv = NEXTHOP_NEXT(rv);
}
- /* Ensure nexthops are sorted to satisfy nest invariant */
- if (!nexthop_is_sorted(first))
- first = nexthop_sort(first);
+ /* Store final length */
+ nhad->ad.length = (void *) rv - (void *) nhad->ad.data;
- return first;
+ /* Ensure nexthops are sorted to satisfy nest invariant */
+ return nexthop_is_sorted(nhad) ? nhad : nexthop_sort(nhad, s->pool);
err:
log(L_ERR "KRT: Received strange multipath route %N", n);
@@ -1316,11 +1423,16 @@ HASH_DEFINE_REHASH_FN(RTH, struct krt_proto)
int
krt_capable(rte *e)
{
- rta *a = e->attrs;
+ eattr *ea = ea_find(e->attrs, &ea_gen_nexthop);
+ if (!ea)
+ return 0;
+
+ struct nexthop_adata *nhad = (void *) ea->u.ptr;
+ if (NEXTHOP_IS_REACHABLE(nhad))
+ return 1;
- switch (a->dest)
+ switch (nhad->dest)
{
- case RTD_UNICAST:
case RTD_BLACKHOLE:
case RTD_UNREACHABLE:
case RTD_PROHIBIT:
@@ -1332,22 +1444,21 @@ krt_capable(rte *e)
}
static inline int
-nh_bufsize(struct nexthop *nh)
+nh_bufsize(struct nexthop_adata *nhad)
{
int rv = 0;
- for (; nh != NULL; nh = nh->next)
+ NEXTHOP_WALK(nh, nhad)
rv += RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr)));
return rv;
}
static int
-nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
+nl_send_route(struct krt_proto *p, const rte *e, int op, int dest, struct nexthop_adata *nh)
{
eattr *ea;
- net *net = e->net;
- rta *a = e->attrs;
- ea_list *eattrs = a->eattrs;
- int bufsize = 128 + KRT_METRICS_MAX*8 + nh_bufsize(&(a->nh));
+ ea_list *eattrs = e->attrs;
+
+ int bufsize = 128 + KRT_METRICS_MAX*8 + (nh ? nh_bufsize(nh) : 0);
u32 priority = 0;
struct {
@@ -1359,7 +1470,7 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
int rsize = sizeof(*r) + bufsize;
r = alloca(rsize);
- DBG("nl_send_route(%N,op=%x)\n", net->n.addr, op);
+ DBG("nl_send_route(%N,op=%x)\n", e->net, op);
bzero(&r->h, sizeof(r->h));
bzero(&r->r, sizeof(r->r));
@@ -1368,7 +1479,7 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
r->h.nlmsg_flags = op | NLM_F_REQUEST | NLM_F_ACK;
r->r.rtm_family = p->af;
- r->r.rtm_dst_len = net_pxlen(net->n.addr);
+ r->r.rtm_dst_len = net_pxlen(e->net);
r->r.rtm_protocol = RTPROT_BIRD;
r->r.rtm_scope = RT_SCOPE_NOWHERE;
#ifdef HAVE_MPLS_KERNEL
@@ -1380,7 +1491,7 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
* 2) Never use RTA_PRIORITY
*/
- u32 label = net_mpls(net->n.addr);
+ u32 label = net_mpls(e->net);
nl_add_attr_mpls(&r->h, rsize, RTA_DST, 1, &label);
r->r.rtm_scope = RT_SCOPE_UNIVERSE;
r->r.rtm_type = RTN_UNICAST;
@@ -1388,12 +1499,12 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
else
#endif
{
- nl_add_attr_ipa(&r->h, rsize, RTA_DST, net_prefix(net->n.addr));
+ nl_add_attr_ipa(&r->h, rsize, RTA_DST, net_prefix(e->net));
/* Add source address for IPv6 SADR routes */
- if (net->n.addr->type == NET_IP6_SADR)
+ if (e->net->type == NET_IP6_SADR)
{
- net_addr_ip6_sadr *a = (void *) &net->n.addr;
+ net_addr_ip6_sadr *a = (void *) &e->net;
nl_add_attr_ip6(&r->h, rsize, RTA_SRC, a->src_prefix);
r->r.rtm_src_len = a->src_pxlen;
}
@@ -1413,11 +1524,9 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
if (p->af == AF_MPLS)
priority = 0;
- else if (a->source == RTS_DUMMY)
- priority = e->u.krt.metric;
else if (KRT_CF->sys.metric)
priority = KRT_CF->sys.metric;
- else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC)))
+ else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, &ea_krt_metric)))
priority = ea->u.data;
if (priority)
@@ -1430,15 +1539,15 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
/* Default scope is LINK for device routes, UNIVERSE otherwise */
if (p->af == AF_MPLS)
r->r.rtm_scope = RT_SCOPE_UNIVERSE;
- else if (ea = ea_find(eattrs, EA_KRT_SCOPE))
+ else if (ea = ea_find(eattrs, &ea_krt_scope))
r->r.rtm_scope = ea->u.data;
else
- r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
+ r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->nh.gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
- if (ea = ea_find(eattrs, EA_KRT_PREFSRC))
+ if (ea = ea_find(eattrs, &ea_krt_prefsrc))
nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data);
- if (ea = ea_find(eattrs, EA_KRT_REALM))
+ if (ea = ea_find(eattrs, &ea_krt_realm))
nl_add_attr_u32(&r->h, rsize, RTA_FLOW, ea->u.data);
@@ -1446,9 +1555,9 @@ nl_send_route(struct krt_proto *p, rte *e, int op, int dest, struct nexthop *nh)
metrics[0] = 0;
struct ea_walk_state ews = { .eattrs = eattrs };
- while (ea = ea_walk(&ews, EA_KRT_METRICS, KRT_METRICS_MAX))
+ while (ea = ea_walk(&ews, ea_krt_metrics[0].id, KRT_METRICS_MAX))
{
- int id = ea->id - EA_KRT_METRICS;
+ int id = ea->id - ea_krt_metrics[0].id;
metrics[0] |= 1 << id;
metrics[id] = ea->u.data;
}
@@ -1462,14 +1571,14 @@ dest:
{
case RTD_UNICAST:
r->r.rtm_type = RTN_UNICAST;
- if (nh->next && !krt_ecmp6(p))
+ if (!NEXTHOP_ONE(nh) && !krt_ecmp6(p))
nl_add_multipath(&r->h, rsize, nh, p->af, eattrs);
else
{
- nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->iface->index);
- nl_add_nexthop(&r->h, rsize, nh, p->af);
+ nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->nh.iface->index);
+ nl_add_nexthop(&r->h, rsize, &nh->nh, p->af);
- if (nh->flags & RNF_ONLINK)
+ if (nh->nh.flags & RNF_ONLINK)
r->r.rtm_flags |= RTNH_F_ONLINK;
}
break;
@@ -1495,28 +1604,43 @@ dest:
static inline int
nl_add_rte(struct krt_proto *p, rte *e)
{
- rta *a = e->attrs;
+ ea_list *ea = e->attrs;
int err = 0;
- if (krt_ecmp6(p) && a->nh.next)
- {
- struct nexthop *nh = &(a->nh);
-
- err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, nh);
- if (err < 0)
- return err;
+ eattr *nhea = ea_find(ea, &ea_gen_nexthop);
+ struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL;
- for (nh = nh->next; nh; nh = nh->next)
- err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, nh);
+ if (krt_ecmp6(p) && nhad && NEXTHOP_IS_REACHABLE(nhad) && !NEXTHOP_ONE(nhad))
+ {
+ uint cnt = 0;
+ NEXTHOP_WALK(nh, nhad)
+ {
+ struct {
+ struct nexthop_adata nhad;
+ u32 labels[MPLS_MAX_LABEL_STACK];
+ } nhx;
+ memcpy(&nhx.nhad.nh, nh, NEXTHOP_SIZE(nh));
+ nhx.nhad.ad.length = (void *) NEXTHOP_NEXT(&nhx.nhad.nh) - (void *) nhx.nhad.ad.data;
+
+ if (!cnt++)
+ {
+ err = nl_send_route(p, e, NL_OP_ADD, RTD_UNICAST, &nhx.nhad);
+ if (err < 0)
+ return err;
+ }
+ else
+ err += nl_send_route(p, e, NL_OP_APPEND, RTD_UNICAST, &nhx.nhad);
+ }
return err;
}
- return nl_send_route(p, e, NL_OP_ADD, a->dest, &(a->nh));
+ return nl_send_route(p, e, NL_OP_ADD,
+ NEXTHOP_IS_REACHABLE(nhad) ? RTD_UNICAST : nhad->dest, nhad);
}
static inline int
-nl_delete_rte(struct krt_proto *p, rte *e)
+nl_delete_rte(struct krt_proto *p, const rte *e)
{
int err = 0;
@@ -1531,13 +1655,15 @@ nl_delete_rte(struct krt_proto *p, rte *e)
static inline int
nl_replace_rte(struct krt_proto *p, rte *e)
{
- rta *a = e->attrs;
- return nl_send_route(p, e, NL_OP_REPLACE, a->dest, &(a->nh));
+ eattr *nhea = ea_find(e->attrs, &ea_gen_nexthop);
+ struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL;
+ return nl_send_route(p, e, NL_OP_REPLACE,
+ NEXTHOP_IS_REACHABLE(nhad) ? RTD_UNICAST : nhad->dest, nhad);
}
void
-krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old)
+krt_replace_rte(struct krt_proto *p, const net_addr *n UNUSED, rte *new, const rte *old)
{
int err = 0;
@@ -1576,7 +1702,7 @@ krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old)
}
static int
-nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family)
+nl_mergable_route(struct nl_parse_state *s, const net_addr *net, struct krt_proto *p, uint priority, uint krt_type, uint rtm_family)
{
/* Route merging is used for IPv6 scans */
if (!s->scan || (rtm_family != AF_INET6))
@@ -1596,18 +1722,25 @@ nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint
static void
nl_announce_route(struct nl_parse_state *s)
{
- rte *e = rte_get_temp(s->attrs);
- e->net = s->net;
- e->u.krt.src = s->krt_src;
- e->u.krt.proto = s->krt_proto;
- e->u.krt.seen = 0;
- e->u.krt.best = 0;
- e->u.krt.metric = s->krt_metric;
+ rte e0 = {
+ .attrs = s->attrs,
+ .net = s->net,
+ };
+
+ EA_LOCAL_LIST(2) ea = {
+ .l = { .count = 2, .next = e0.attrs },
+ .a = {
+ EA_LITERAL_EMBEDDED(&ea_krt_source, 0, s->krt_proto),
+ EA_LITERAL_EMBEDDED(&ea_krt_metric, 0, s->krt_metric),
+ },
+ };
+
+ e0.attrs = &ea.l;
if (s->scan)
- krt_got_route(s->proto, e);
+ krt_got_route(s->proto, &e0, s->krt_src);
else
- krt_got_route_async(s->proto, e, s->new);
+ krt_got_route_async(s->proto, &e0, s->new, s->krt_src);
s->net = NULL;
s->attrs = NULL;
@@ -1757,92 +1890,121 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
krt_src = KRT_SRC_ALIEN;
}
- net_addr *n = &dst;
+ net_addr *net = &dst;
if (p->p.net_type == NET_IP6_SADR)
{
- n = alloca(sizeof(net_addr_ip6_sadr));
- net_fill_ip6_sadr(n, net6_prefix(&dst), net6_pxlen(&dst),
+ net = alloca(sizeof(net_addr_ip6_sadr));
+ net_fill_ip6_sadr(net, net6_prefix(&dst), net6_pxlen(&dst),
net6_prefix(&src), net6_pxlen(&src));
}
- net *net = net_get(p->p.main_channel->table, n);
-
if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type, i->rtm_family))
nl_announce_route(s);
- rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE);
- ra->src = p->p.main_source;
- ra->source = RTS_INHERIT;
- ra->scope = SCOPE_UNIVERSE;
+ ea_list *ra = NULL;
+ ea_set_attr_u32(&ra, &ea_gen_source, 0, RTS_INHERIT);
if (a[RTA_FLOW])
s->rta_flow = rta_get_u32(a[RTA_FLOW]);
else
s->rta_flow = 0;
+ union {
+ struct {
+ struct adata ad;
+ struct nexthop nh;
+ u32 labels[MPLS_MAX_LABEL_STACK];
+ };
+ struct nexthop_adata nhad;
+ } nhad = {};
+
switch (i->rtm_type)
{
case RTN_UNICAST:
- ra->dest = RTD_UNICAST;
-
if (a[RTA_MULTIPATH])
{
- struct nexthop *nh = nl_parse_multipath(s, p, n, a[RTA_MULTIPATH], i->rtm_family, krt_src);
+ struct nexthop_adata *nh = nl_parse_multipath(s, p, net, a[RTA_MULTIPATH], i->rtm_family, krt_src);
if (!nh)
SKIP("strange RTA_MULTIPATH\n");
- nexthop_link(ra, nh);
+ ea_set_attr(&ra, EA_LITERAL_DIRECT_ADATA(
+ &ea_gen_nexthop, 0, &nh->ad));
break;
}
if ((i->rtm_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD))
SKIP("ignore RTNH_F_DEAD\n");
- ra->nh.iface = if_find_by_index(oif);
- if (!ra->nh.iface)
+ nhad.nh.iface = if_find_by_index(oif);
+ if (!nhad.nh.iface)
{
- log(L_ERR "KRT: Received route %N with unknown ifindex %u", net->n.addr, oif);
+ log(L_ERR "KRT: Received route %N with unknown ifindex %u", net, oif);
return;
}
if (a[RTA_GATEWAY])
- ra->nh.gw = rta_get_ipa(a[RTA_GATEWAY]);
+ nhad.nh.gw = rta_get_ipa(a[RTA_GATEWAY]);
#ifdef HAVE_MPLS_KERNEL
if (a[RTA_VIA])
- ra->nh.gw = rta_get_via(a[RTA_VIA]);
+ nhad.nh.gw = rta_get_via(a[RTA_VIA]);
#endif
- if (ipa_nonzero(ra->nh.gw))
+ if (ipa_nonzero(nhad.nh.gw))
{
/* Silently skip strange 6to4 routes */
const net_addr_ip6 sit = NET_ADDR_IP6(IP6_NONE, 96);
- if ((i->rtm_family == AF_INET6) && ipa_in_netX(ra->nh.gw, (net_addr *) &sit))
+ if ((i->rtm_family == AF_INET6) && ipa_in_netX(nhad.nh.gw, (net_addr *) &sit))
return;
if (i->rtm_flags & RTNH_F_ONLINK)
- ra->nh.flags |= RNF_ONLINK;
+ nhad.nh.flags |= RNF_ONLINK;
neighbor *nbr;
- nbr = neigh_find(&p->p, ra->nh.gw, ra->nh.iface,
- (ra->nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0);
+ nbr = neigh_find(&p->p, nhad.nh.gw, nhad.nh.iface,
+ (nhad.nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0);
if (!nbr || (nbr->scope == SCOPE_HOST))
{
- log(L_ERR "KRT: Received route %N with strange next-hop %I", net->n.addr,
- ra->nh.gw);
+ log(L_ERR "KRT: Received route %N with strange next-hop %I", net,
+ nhad.nh.gw);
return;
}
}
+#ifdef HAVE_MPLS_KERNEL
+ if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !a[RTA_MULTIPATH])
+ nhad.nh.labels = rta_get_mpls(a[RTA_NEWDST], nhad.nh.label);
+
+ if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !a[RTA_MULTIPATH])
+ {
+ switch (rta_get_u16(a[RTA_ENCAP_TYPE]))
+ {
+ case LWTUNNEL_ENCAP_MPLS:
+ {
+ struct rtattr *enca[BIRD_RTA_MAX];
+ nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
+ nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
+ nhad.nh.labels = rta_get_mpls(enca[RTA_DST], nhad.nh.label);
+ break;
+ }
+ default:
+ SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE]));
+ break;
+ }
+ }
+#endif
+
+ /* Finalize the nexthop */
+ nhad.ad.length = (void *) NEXTHOP_NEXT(&nhad.nh) - (void *) nhad.ad.data;
break;
case RTN_BLACKHOLE:
- ra->dest = RTD_BLACKHOLE;
+ nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_BLACKHOLE);
break;
case RTN_UNREACHABLE:
- ra->dest = RTD_UNREACHABLE;
+ nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_UNREACHABLE);
break;
case RTN_PROHIBIT:
- ra->dest = RTD_PROHIBIT;
+ nhad.nhad = NEXTHOP_DEST_LITERAL(RTD_PROHIBIT);
break;
/* FIXME: What about RTN_THROW? */
default:
@@ -1850,105 +2012,36 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
return;
}
-#ifdef HAVE_MPLS_KERNEL
- if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !ra->nh.next)
- ra->nh.labels = rta_get_mpls(a[RTA_NEWDST], ra->nh.label);
-
- if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !ra->nh.next)
- {
- switch (rta_get_u16(a[RTA_ENCAP_TYPE]))
- {
- case LWTUNNEL_ENCAP_MPLS:
- {
- struct rtattr *enca[BIRD_RTA_MAX];
- nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
- nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
- ra->nh.labels = rta_get_mpls(enca[RTA_DST], ra->nh.label);
- break;
- }
- default:
- SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE]));
- break;
- }
- }
-#endif
-
if (i->rtm_scope != def_scope)
- {
- ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
- ea->next = ra->eattrs;
- ra->eattrs = ea;
- ea->flags = EALF_SORTED;
- ea->count = 1;
- ea->attrs[0].id = EA_KRT_SCOPE;
- ea->attrs[0].flags = 0;
- ea->attrs[0].type = EAF_TYPE_INT;
- ea->attrs[0].u.data = i->rtm_scope;
- }
+ ea_set_attr(&ra,
+ EA_LITERAL_EMBEDDED(&ea_krt_scope, 0, i->rtm_scope));
if (a[RTA_PREFSRC])
- {
- ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]);
-
- ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
- ea->next = ra->eattrs;
- ra->eattrs = ea;
- ea->flags = EALF_SORTED;
- ea->count = 1;
- ea->attrs[0].id = EA_KRT_PREFSRC;
- ea->attrs[0].flags = 0;
- ea->attrs[0].type = EAF_TYPE_IP_ADDRESS;
-
- struct adata *ad = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps));
- ad->length = sizeof(ps);
- memcpy(ad->data, &ps, sizeof(ps));
-
- ea->attrs[0].u.ptr = ad;
- }
+ {
+ ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]);
+
+ ea_set_attr(&ra,
+ EA_LITERAL_STORE_ADATA(&ea_krt_prefsrc, 0, &ps, sizeof(ps)));
+ }
/* Can be set per-route or per-nexthop */
if (s->rta_flow)
- {
- ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
- ea->next = ra->eattrs;
- ra->eattrs = ea;
- ea->flags = EALF_SORTED;
- ea->count = 1;
- ea->attrs[0].id = EA_KRT_REALM;
- ea->attrs[0].flags = 0;
- ea->attrs[0].type = EAF_TYPE_INT;
- ea->attrs[0].u.data = s->rta_flow;
- }
+ ea_set_attr(&ra,
+ EA_LITERAL_EMBEDDED(&ea_krt_realm, 0, s->rta_flow));
if (a[RTA_METRICS])
{
u32 metrics[KRT_METRICS_MAX];
- ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr));
- int t, n = 0;
-
if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0)
{
- log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net->n.addr);
+ log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net);
return;
}
- for (t = 1; t < KRT_METRICS_MAX; t++)
+ for (uint t = 1; t < KRT_METRICS_MAX; t++)
if (metrics[0] & (1 << t))
- {
- ea->attrs[n].id = EA_CODE(PROTOCOL_KERNEL, KRT_METRICS_OFFSET + t);
- ea->attrs[n].flags = 0;
- ea->attrs[n].type = EAF_TYPE_INT; /* FIXME: Some are EAF_TYPE_BITFIELD */
- ea->attrs[n].u.data = metrics[t];
- n++;
- }
-
- if (n > 0)
- {
- ea->next = ra->eattrs;
- ea->flags = EALF_SORTED;
- ea->count = n;
- ra->eattrs = ea;
- }
+ ea_set_attr(&ra,
+ EA_LITERAL_EMBEDDED(&ea_krt_metrics[t], 0, metrics[t]));
}
/*
@@ -1962,7 +2055,12 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
if (!s->net)
{
/* Store the new route */
- s->net = net;
+ s->net = lp_alloc(s->pool, net->length);
+ net_copy(s->net, net);
+
+ ea_set_attr_data(&ra, &ea_gen_nexthop, 0,
+ nhad.ad.data, nhad.ad.length);
+
s->attrs = ra;
s->proto = p;
s->new = new;
@@ -1974,20 +2072,18 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
else
{
/* Merge next hops with the stored route */
- rta *oa = s->attrs;
-
- struct nexthop *nhs = &oa->nh;
- nexthop_insert(&nhs, &ra->nh);
-
- /* Perhaps new nexthop is inserted at the first position */
- if (nhs == &ra->nh)
- {
- /* Swap rtas */
- s->attrs = ra;
-
- /* Keep old eattrs */
- ra->eattrs = oa->eattrs;
- }
+ eattr *nhea = ea_find(s->attrs, &ea_gen_nexthop);
+ struct nexthop_adata *nhad_old = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL;
+
+ if (nhad_old)
+ ea_set_attr(&s->attrs,
+ EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0,
+ &(nexthop_merge(nhad_old, &nhad.nhad,
+ KRT_CF->merge_paths, s->pool)->ad)
+ ));
+ else
+ ea_set_attr_data(&s->attrs, &ea_gen_nexthop, 0,
+ nhad.ad.data, nhad.ad.length);
}
}
@@ -2195,6 +2291,8 @@ krt_sys_io_init(void)
{
nl_linpool = lp_new_default(krt_pool);
HASH_INIT(nl_table_map, krt_pool, 6);
+
+ nl_ea_register();
}
int
@@ -2248,56 +2346,6 @@ krt_sys_copy_config(struct krt_config *d, struct krt_config *s)
d->sys.metric = s->sys.metric;
}
-static const char *krt_metrics_names[KRT_METRICS_MAX] = {
- NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss",
- "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack"
-};
-
-static const char *krt_features_names[KRT_FEATURES_MAX] = {
- "ecn", NULL, NULL, "allfrag"
-};
-
-int
-krt_sys_get_attr(const eattr *a, byte *buf, int buflen UNUSED)
-{
- switch (a->id)
- {
- case EA_KRT_PREFSRC:
- bsprintf(buf, "prefsrc");
- return GA_NAME;
-
- case EA_KRT_REALM:
- bsprintf(buf, "realm");
- return GA_NAME;
-
- case EA_KRT_SCOPE:
- bsprintf(buf, "scope");
- return GA_NAME;
-
- case EA_KRT_LOCK:
- buf += bsprintf(buf, "lock:");
- ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX);
- return GA_FULL;
-
- case EA_KRT_FEATURES:
- buf += bsprintf(buf, "features:");
- ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX);
- return GA_FULL;
-
- default:;
- int id = (int)EA_ID(a->id) - KRT_METRICS_OFFSET;
- if (id > 0 && id < KRT_METRICS_MAX)
- {
- bsprintf(buf, "%s", krt_metrics_names[id]);
- return GA_NAME;
- }
-
- return GA_UNKNOWN;
- }
-}
-
-
-
void
kif_sys_start(struct kif_proto *p UNUSED)
{
diff --git a/sysdep/unix/Makefile b/sysdep/unix/Makefile
index d0d36b5f..3f1a8b3a 100644
--- a/sysdep/unix/Makefile
+++ b/sysdep/unix/Makefile
@@ -1,7 +1,9 @@
-src := alloc.c io.c krt.c log.c main.c random.c
+src := alloc.c io.c io-loop.c krt.c log.c main.c random.c domain.c
obj := $(src-o-files)
$(all-daemon)
$(cf-local)
+$(call proto-build,kif_build)
+$(call proto-build,krt_build)
$(conf-y-targets): $(s)krt.Y
src := $(filter-out main.c, $(src))
diff --git a/sysdep/unix/alloc.c b/sysdep/unix/alloc.c
index 90453f7b..a2384ca8 100644
--- a/sysdep/unix/alloc.c
+++ b/sysdep/unix/alloc.c
@@ -19,111 +19,169 @@
#include <sys/mman.h>
#endif
-#ifdef HAVE_MMAP
-#define KEEP_PAGES 512
+long page_size = 0;
-static u64 page_size = 0;
-static _Bool use_fake = 0;
+#ifdef HAVE_MMAP
+#define KEEP_PAGES_MAIN_MAX 256
+#define KEEP_PAGES_MAIN_MIN 8
+#define CLEANUP_PAGES_BULK 256
-uint pages_kept = 0;
-static list pages_list;
+STATIC_ASSERT(KEEP_PAGES_MAIN_MIN * 4 < KEEP_PAGES_MAIN_MAX);
-static void cleanup_pages(void *data);
-static event page_cleanup_event = { .hook = cleanup_pages };
+static _Bool use_fake = 0;
+#if DEBUGGING
+struct free_page {
+ node unused[42];
+ node n;
+};
#else
-static const u64 page_size = 4096; /* Fake page size */
+struct free_page {
+ node n;
+};
#endif
-u64 get_page_size(void)
+struct free_pages {
+ list pages;
+ u16 min, max; /* Minimal and maximal number of free pages kept */
+ uint cnt; /* Number of empty pages */
+ event cleanup;
+};
+
+static void global_free_pages_cleanup_event(void *);
+
+static struct free_pages global_free_pages = {
+ .min = KEEP_PAGES_MAIN_MIN,
+ .max = KEEP_PAGES_MAIN_MAX,
+ .cleanup = { .hook = global_free_pages_cleanup_event },
+};
+
+uint *pages_kept = &global_free_pages.cnt;
+
+static void *
+alloc_sys_page(void)
{
- if (page_size)
- return page_size;
+ void *ptr = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-#ifdef HAVE_MMAP
- if (page_size = sysconf(_SC_PAGESIZE))
- {
- if ((u64_popcount(page_size) > 1) || (page_size > 16384))
- {
- /* Too big or strange page, use the aligned allocator instead */
- page_size = 4096;
- use_fake = 1;
- }
- return page_size;
- }
+ if (ptr == MAP_FAILED)
+ bug("mmap(%lu) failed: %m", page_size);
- bug("Page size must be non-zero");
-#endif
+ return ptr;
}
+extern int shutting_down; /* Shutdown requested. */
+
+#else // ! HAVE_MMAP
+#define use_fake 1
+#endif
+
void *
alloc_page(void)
{
-#ifdef HAVE_MMAP
- if (pages_kept)
- {
- node *page = TAIL(pages_list);
- rem_node(page);
- pages_kept--;
- memset(page, 0, get_page_size());
- return page;
- }
-
- if (!use_fake)
- {
- void *ret = mmap(NULL, get_page_size(), PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (ret == MAP_FAILED)
- bug("mmap(%lu) failed: %m", (long unsigned int) page_size);
- return ret;
- }
- else
-#endif
+ if (use_fake)
{
void *ptr = NULL;
int err = posix_memalign(&ptr, page_size, page_size);
+
if (err || !ptr)
bug("posix_memalign(%lu) failed", (long unsigned int) page_size);
+
return ptr;
}
+
+#ifdef HAVE_MMAP
+ struct free_pages *fps = &global_free_pages;
+
+ if (fps->cnt)
+ {
+ struct free_page *fp = SKIP_BACK(struct free_page, n, HEAD(fps->pages));
+ rem_node(&fp->n);
+ if ((--fps->cnt < fps->min) && !shutting_down)
+ ev_send(&global_work_list, &fps->cleanup);
+
+ bzero(fp, page_size);
+ return fp;
+ }
+
+ return alloc_sys_page();
+#endif
}
void
free_page(void *ptr)
{
-#ifdef HAVE_MMAP
- if (!use_fake)
+ if (use_fake)
{
- if (!pages_kept)
- init_list(&pages_list);
+ free(ptr);
+ return;
+ }
+
+#ifdef HAVE_MMAP
+ struct free_pages *fps = &global_free_pages;
+ struct free_page *fp = ptr;
- memset(ptr, 0, sizeof(node));
- add_tail(&pages_list, ptr);
+ fp->n = (node) {};
+ add_tail(&fps->pages, &fp->n);
- if (++pages_kept > KEEP_PAGES)
- ev_schedule(&page_cleanup_event);
- }
- else
+ if ((++fps->cnt > fps->max) && !shutting_down)
+ ev_send(&global_work_list, &fps->cleanup);
#endif
- free(ptr);
}
#ifdef HAVE_MMAP
static void
-cleanup_pages(void *data UNUSED)
+global_free_pages_cleanup_event(void *data UNUSED)
{
- for (uint seen = 0; (pages_kept > KEEP_PAGES) && (seen < KEEP_PAGES); seen++)
+ if (shutting_down)
+ return;
+
+ struct free_pages *fps = &global_free_pages;
+
+ while (fps->cnt / 2 < fps->min)
{
- void *ptr = HEAD(pages_list);
- rem_node(ptr);
- if (munmap(ptr, get_page_size()) == 0)
- pages_kept--;
+ struct free_page *fp = alloc_sys_page();
+ fp->n = (node) {};
+ add_tail(&fps->pages, &fp->n);
+ fps->cnt++;
+ }
+
+ for (uint seen = 0; (seen < CLEANUP_PAGES_BULK) && (fps->cnt > fps->max / 2); seen++)
+ {
+ struct free_page *fp = SKIP_BACK(struct free_page, n, TAIL(fps->pages));
+ rem_node(&fp->n);
+
+ if (munmap(fp, page_size) == 0)
+ fps->cnt--;
else if (errno == ENOMEM)
- add_tail(&pages_list, ptr);
+ add_head(&fps->pages, &fp->n);
else
- bug("munmap(%p) failed: %m", ptr);
+ bug("munmap(%p) failed: %m", fp);
}
-
- if (pages_kept > KEEP_PAGES)
- ev_schedule(&page_cleanup_event);
}
#endif
+
+void
+resource_sys_init(void)
+{
+#ifdef HAVE_MMAP
+ ASSERT_DIE(global_free_pages.cnt == 0);
+
+ if (!(page_size = sysconf(_SC_PAGESIZE)))
+ die("System page size must be non-zero");
+
+ if (u64_popcount(page_size) == 1)
+ {
+ struct free_pages *fps = &global_free_pages;
+
+ init_list(&fps->pages);
+ global_free_pages_cleanup_event(NULL);
+ return;
+ }
+
+ /* Too big or strange page, use the aligned allocator instead */
+ log(L_WARN "Got strange memory page size (%lu), using the aligned allocator instead", page_size);
+ use_fake = 1;
+#endif
+
+ page_size = 4096;
+}
diff --git a/sysdep/unix/domain.c b/sysdep/unix/domain.c
new file mode 100644
index 00000000..0a5858a6
--- /dev/null
+++ b/sysdep/unix/domain.c
@@ -0,0 +1,116 @@
+/*
+ * BIRD Locking
+ *
+ * (c) 2020 Maria Matejka <mq@jmq.cz>
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#undef LOCAL_DEBUG
+
+#undef DEBUG_LOCKING
+
+#include "lib/birdlib.h"
+#include "lib/locking.h"
+#include "lib/resource.h"
+#include "lib/timer.h"
+
+#include "conf/conf.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <stdatomic.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/*
+ * Locking subsystem
+ */
+
+_Thread_local struct lock_order locking_stack = {};
+_Thread_local struct domain_generic **last_locked = NULL;
+
+#define ASSERT_NO_LOCK ASSERT_DIE(last_locked == NULL)
+
+struct domain_generic {
+ pthread_mutex_t mutex;
+ uint order;
+ struct domain_generic **prev;
+ struct lock_order *locked_by;
+ const char *name;
+};
+
+#define DOMAIN_INIT(_name, _order) { .mutex = PTHREAD_MUTEX_INITIALIZER, .name = _name, .order = _order }
+
+static struct domain_generic the_bird_domain_gen = DOMAIN_INIT("The BIRD", OFFSETOF(struct lock_order, the_bird));
+
+DOMAIN(the_bird) the_bird_domain = { .the_bird = &the_bird_domain_gen };
+
+struct domain_generic *
+domain_new(const char *name, uint order)
+{
+ ASSERT_DIE(order < sizeof(struct lock_order));
+ struct domain_generic *dg = xmalloc(sizeof(struct domain_generic));
+ *dg = (struct domain_generic) DOMAIN_INIT(name, order);
+ return dg;
+}
+
+void
+domain_free(struct domain_generic *dg)
+{
+ pthread_mutex_destroy(&dg->mutex);
+ xfree(dg);
+}
+
+uint dg_order(struct domain_generic *dg)
+{
+ return dg->order;
+}
+
+void do_lock(struct domain_generic *dg, struct domain_generic **lsp)
+{
+ if ((char *) lsp - (char *) &locking_stack != dg->order)
+ bug("Trying to lock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack);
+
+ if (lsp <= last_locked)
+ bug("Trying to lock in a bad order");
+ if (*lsp)
+ bug("Inconsistent locking stack state on lock");
+
+ btime lock_begin = current_time();
+ pthread_mutex_lock(&dg->mutex);
+ btime duration = current_time() - lock_begin;
+ if (config && (duration > config->watchdog_warning))
+ log(L_WARN "Locking of %s took %d ms", dg->name, (int) (duration TO_MS));
+
+ if (dg->prev || dg->locked_by)
+ bug("Previous unlock not finished correctly");
+ dg->prev = last_locked;
+ *lsp = dg;
+ last_locked = lsp;
+ dg->locked_by = &locking_stack;
+}
+
+void do_unlock(struct domain_generic *dg, struct domain_generic **lsp)
+{
+ if ((char *) lsp - (char *) &locking_stack != dg->order)
+ bug("Trying to unlock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack);
+
+ if (dg->locked_by != &locking_stack)
+ bug("Inconsistent domain state on unlock");
+ if ((last_locked != lsp) || (*lsp != dg))
+ bug("Inconsistent locking stack state on unlock");
+ dg->locked_by = NULL;
+ last_locked = dg->prev;
+ *lsp = NULL;
+ dg->prev = NULL;
+ pthread_mutex_unlock(&dg->mutex);
+}
diff --git a/sysdep/unix/io-loop.c b/sysdep/unix/io-loop.c
new file mode 100644
index 00000000..3e3fc31a
--- /dev/null
+++ b/sysdep/unix/io-loop.c
@@ -0,0 +1,578 @@
+/*
+ * BIRD -- I/O and event loop
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <pthread.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include "nest/bird.h"
+
+#include "lib/buffer.h"
+#include "lib/lists.h"
+#include "lib/resource.h"
+#include "lib/event.h"
+#include "lib/timer.h"
+#include "lib/socket.h"
+
+#include "lib/io-loop.h"
+#include "sysdep/unix/io-loop.h"
+#include "conf/conf.h"
+
+#define THREAD_STACK_SIZE 65536 /* To be lowered in near future */
+
+/*
+ * Current thread context
+ */
+
+_Thread_local struct birdloop *birdloop_current;
+static _Thread_local struct birdloop *birdloop_wakeup_masked;
+static _Thread_local uint birdloop_wakeup_masked_count;
+
+event_list *
+birdloop_event_list(struct birdloop *loop)
+{
+ return &loop->event_list;
+}
+
+struct timeloop *
+birdloop_time_loop(struct birdloop *loop)
+{
+ return &loop->time;
+}
+
+_Bool
+birdloop_inside(struct birdloop *loop)
+{
+ for (struct birdloop *c = birdloop_current; c; c = c->prev_loop)
+ if (loop == c)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Wakeup code for birdloop
+ */
+
+static void
+pipe_new(int *pfds)
+{
+ int rv = pipe(pfds);
+ if (rv < 0)
+ die("pipe: %m");
+
+ if (fcntl(pfds[0], F_SETFL, O_NONBLOCK) < 0)
+ die("fcntl(O_NONBLOCK): %m");
+
+ if (fcntl(pfds[1], F_SETFL, O_NONBLOCK) < 0)
+ die("fcntl(O_NONBLOCK): %m");
+}
+
+void
+pipe_drain(int fd)
+{
+ char buf[64];
+ int rv;
+
+ try:
+ rv = read(fd, buf, 64);
+ if (rv < 0)
+ {
+ if (errno == EINTR)
+ goto try;
+ if (errno == EAGAIN)
+ return;
+ die("wakeup read: %m");
+ }
+ if (rv == 64)
+ goto try;
+}
+
+void
+pipe_kick(int fd)
+{
+ u64 v = 1;
+ int rv;
+
+ try:
+ rv = write(fd, &v, sizeof(u64));
+ if (rv < 0)
+ {
+ if (errno == EINTR)
+ goto try;
+ if (errno == EAGAIN)
+ return;
+ die("wakeup write: %m");
+ }
+}
+
+static inline void
+wakeup_init(struct birdloop *loop)
+{
+ pipe_new(loop->wakeup_fds);
+}
+
+static inline void
+wakeup_drain(struct birdloop *loop)
+{
+ pipe_drain(loop->wakeup_fds[0]);
+}
+
+static inline void
+wakeup_do_kick(struct birdloop *loop)
+{
+ pipe_kick(loop->wakeup_fds[1]);
+}
+
+static inline void
+birdloop_do_ping(struct birdloop *loop)
+{
+ if (atomic_fetch_add_explicit(&loop->ping_sent, 1, memory_order_acq_rel))
+ return;
+
+ if (loop == birdloop_wakeup_masked)
+ birdloop_wakeup_masked_count++;
+ else
+ wakeup_do_kick(loop);
+}
+
+void
+birdloop_ping(struct birdloop *loop)
+{
+ if (birdloop_inside(loop) && !loop->ping_pending)
+ loop->ping_pending++;
+ else
+ birdloop_do_ping(loop);
+}
+
+
+/*
+ * Sockets
+ */
+
+static void
+sockets_init(struct birdloop *loop)
+{
+ init_list(&loop->sock_list);
+ loop->sock_num = 0;
+
+ BUFFER_INIT(loop->poll_sk, loop->pool, 4);
+ BUFFER_INIT(loop->poll_fd, loop->pool, 4);
+ loop->poll_changed = 1; /* add wakeup fd */
+}
+
+static void
+sockets_add(struct birdloop *loop, sock *s)
+{
+ add_tail(&loop->sock_list, &s->n);
+ loop->sock_num++;
+
+ s->index = -1;
+ loop->poll_changed = 1;
+
+ birdloop_ping(loop);
+}
+
+void
+sk_start(sock *s)
+{
+ ASSERT_DIE(birdloop_current != &main_birdloop);
+ sockets_add(birdloop_current, s);
+}
+
+static void
+sockets_remove(struct birdloop *loop, sock *s)
+{
+ rem_node(&s->n);
+ loop->sock_num--;
+
+ if (s->index >= 0)
+ {
+ loop->poll_sk.data[s->index] = NULL;
+ s->index = -1;
+ loop->poll_changed = 1;
+ loop->close_scheduled = 1;
+ birdloop_ping(loop);
+ }
+ else
+ close(s->fd);
+}
+
+void
+sk_stop(sock *s)
+{
+ sockets_remove(birdloop_current, s);
+}
+
+static inline uint sk_want_events(sock *s)
+{ return (s->rx_hook ? POLLIN : 0) | ((s->ttx != s->tpos) ? POLLOUT : 0); }
+
+/*
+FIXME: this should be called from sock code
+
+static void
+sockets_update(struct birdloop *loop, sock *s)
+{
+ if (s->index >= 0)
+ loop->poll_fd.data[s->index].events = sk_want_events(s);
+}
+*/
+
+static void
+sockets_prepare(struct birdloop *loop)
+{
+ BUFFER_SET(loop->poll_sk, loop->sock_num + 1);
+ BUFFER_SET(loop->poll_fd, loop->sock_num + 1);
+
+ struct pollfd *pfd = loop->poll_fd.data;
+ sock **psk = loop->poll_sk.data;
+ uint i = 0;
+ node *n;
+
+ WALK_LIST(n, loop->sock_list)
+ {
+ sock *s = SKIP_BACK(sock, n, n);
+
+ ASSERT(i < loop->sock_num);
+
+ s->index = i;
+ *psk = s;
+ pfd->fd = s->fd;
+ pfd->events = sk_want_events(s);
+ pfd->revents = 0;
+
+ pfd++;
+ psk++;
+ i++;
+ }
+
+ ASSERT(i == loop->sock_num);
+
+ /* Add internal wakeup fd */
+ *psk = NULL;
+ pfd->fd = loop->wakeup_fds[0];
+ pfd->events = POLLIN;
+ pfd->revents = 0;
+
+ loop->poll_changed = 0;
+}
+
+static void
+sockets_close_fds(struct birdloop *loop)
+{
+ struct pollfd *pfd = loop->poll_fd.data;
+ sock **psk = loop->poll_sk.data;
+ int poll_num = loop->poll_fd.used - 1;
+
+ int i;
+ for (i = 0; i < poll_num; i++)
+ if (psk[i] == NULL)
+ close(pfd[i].fd);
+
+ loop->close_scheduled = 0;
+}
+
+int sk_read(sock *s, int revents);
+int sk_write(sock *s);
+
+static void
+sockets_fire(struct birdloop *loop)
+{
+ struct pollfd *pfd = loop->poll_fd.data;
+ sock **psk = loop->poll_sk.data;
+ int poll_num = loop->poll_fd.used - 1;
+
+ times_update();
+
+ /* Last fd is internal wakeup fd */
+ if (pfd[poll_num].revents & POLLIN)
+ wakeup_drain(loop);
+
+ int i;
+ for (i = 0; i < poll_num; pfd++, psk++, i++)
+ {
+ int e = 1;
+
+ if (! pfd->revents)
+ continue;
+
+ if (pfd->revents & POLLNVAL)
+ die("poll: invalid fd %d", pfd->fd);
+
+ if (pfd->revents & POLLIN)
+ while (e && *psk && (*psk)->rx_hook)
+ e = sk_read(*psk, pfd->revents);
+
+ e = 1;
+ if (pfd->revents & POLLOUT)
+ {
+ loop->poll_changed = 1;
+ while (e && *psk)
+ e = sk_write(*psk);
+ }
+ }
+}
+
+
+/*
+ * Birdloop
+ */
+
+struct birdloop main_birdloop;
+
+static void birdloop_enter_locked(struct birdloop *loop);
+
+void
+birdloop_init(void)
+{
+ wakeup_init(&main_birdloop);
+
+ main_birdloop.time.domain = the_bird_domain.the_bird;
+ main_birdloop.time.loop = &main_birdloop;
+
+ times_update();
+ timers_init(&main_birdloop.time, &root_pool);
+
+ birdloop_enter_locked(&main_birdloop);
+}
+
+static void *birdloop_main(void *arg);
+
+struct birdloop *
+birdloop_new(pool *pp, uint order, const char *name)
+{
+ struct domain_generic *dg = domain_new(name, order);
+
+ pool *p = rp_new(pp, name);
+ struct birdloop *loop = mb_allocz(p, sizeof(struct birdloop));
+ loop->pool = p;
+
+ loop->time.domain = dg;
+ loop->time.loop = loop;
+
+ birdloop_enter(loop);
+
+ wakeup_init(loop);
+ ev_init_list(&loop->event_list, loop, name);
+ timers_init(&loop->time, p);
+ sockets_init(loop);
+
+ int e = 0;
+
+ if (e = pthread_attr_init(&loop->thread_attr))
+ die("pthread_attr_init() failed: %M", e);
+
+ if (e = pthread_attr_setstacksize(&loop->thread_attr, THREAD_STACK_SIZE))
+ die("pthread_attr_setstacksize(%u) failed: %M", THREAD_STACK_SIZE, e);
+
+ if (e = pthread_attr_setdetachstate(&loop->thread_attr, PTHREAD_CREATE_DETACHED))
+ die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e);
+
+ if (e = pthread_create(&loop->thread_id, &loop->thread_attr, birdloop_main, loop))
+ die("pthread_create() failed: %M", e);
+
+ birdloop_leave(loop);
+
+ return loop;
+}
+
+static void
+birdloop_do_stop(struct birdloop *loop, void (*stopped)(void *data), void *data)
+{
+ loop->stopped = stopped;
+ loop->stop_data = data;
+ wakeup_do_kick(loop);
+}
+
+void
+birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data)
+{
+ DG_LOCK(loop->time.domain);
+ birdloop_do_stop(loop, stopped, data);
+ DG_UNLOCK(loop->time.domain);
+}
+
+void
+birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data)
+{
+ ASSERT_DIE(loop == birdloop_current);
+ ASSERT_DIE(DG_IS_LOCKED(loop->time.domain));
+
+ birdloop_do_stop(loop, stopped, data);
+}
+
+void
+birdloop_free(struct birdloop *loop)
+{
+ ASSERT_DIE(loop->links == 0);
+ ASSERT_DIE(pthread_equal(pthread_self(), loop->thread_id));
+
+ rcu_birdloop_stop(&loop->rcu);
+ pthread_attr_destroy(&loop->thread_attr);
+
+ domain_free(loop->time.domain);
+ rfree(loop->pool);
+}
+
+static void
+birdloop_enter_locked(struct birdloop *loop)
+{
+ ASSERT_DIE(DG_IS_LOCKED(loop->time.domain));
+ ASSERT_DIE(!birdloop_inside(loop));
+
+ /* Store the old context */
+ loop->prev_loop = birdloop_current;
+
+ /* Put the new context */
+ birdloop_current = loop;
+}
+
+void
+birdloop_enter(struct birdloop *loop)
+{
+ DG_LOCK(loop->time.domain);
+ return birdloop_enter_locked(loop);
+}
+
+static void
+birdloop_leave_locked(struct birdloop *loop)
+{
+ /* Check the current context */
+ ASSERT_DIE(birdloop_current == loop);
+
+ /* Send pending pings */
+ if (loop->ping_pending)
+ {
+ loop->ping_pending = 0;
+ birdloop_do_ping(loop);
+ }
+
+ /* Restore the old context */
+ birdloop_current = loop->prev_loop;
+}
+
+void
+birdloop_leave(struct birdloop *loop)
+{
+ birdloop_leave_locked(loop);
+ DG_UNLOCK(loop->time.domain);
+}
+
+void
+birdloop_mask_wakeups(struct birdloop *loop)
+{
+ ASSERT_DIE(birdloop_wakeup_masked == NULL);
+ birdloop_wakeup_masked = loop;
+}
+
+void
+birdloop_unmask_wakeups(struct birdloop *loop)
+{
+ ASSERT_DIE(birdloop_wakeup_masked == loop);
+ birdloop_wakeup_masked = NULL;
+ if (birdloop_wakeup_masked_count)
+ wakeup_do_kick(loop);
+
+ birdloop_wakeup_masked_count = 0;
+}
+
+void
+birdloop_link(struct birdloop *loop)
+{
+ ASSERT_DIE(birdloop_inside(loop));
+ loop->links++;
+}
+
+void
+birdloop_unlink(struct birdloop *loop)
+{
+ ASSERT_DIE(birdloop_inside(loop));
+ loop->links--;
+}
+
+static void *
+birdloop_main(void *arg)
+{
+ struct birdloop *loop = arg;
+ timer *t;
+ int rv, timeout;
+
+ rcu_birdloop_start(&loop->rcu);
+
+ btime loop_begin = current_time();
+
+ tmp_init(loop->pool);
+
+ birdloop_enter(loop);
+ while (1)
+ {
+ timers_fire(&loop->time, 0);
+ if (ev_run_list(&loop->event_list))
+ timeout = 0;
+ else if (t = timers_first(&loop->time))
+ timeout = (tm_remains(t) TO_MS) + 1;
+ else
+ timeout = -1;
+
+ if (loop->poll_changed)
+ sockets_prepare(loop);
+
+ btime duration = current_time() - loop_begin;
+ if (duration > config->watchdog_warning)
+ log(L_WARN "I/O loop cycle took %d ms", (int) (duration TO_MS));
+
+ birdloop_leave(loop);
+
+ try:
+ rv = poll(loop->poll_fd.data, loop->poll_fd.used, timeout);
+ if (rv < 0)
+ {
+ if (errno == EINTR || errno == EAGAIN)
+ goto try;
+ die("poll: %m");
+ }
+
+ birdloop_enter(loop);
+
+ if (loop->close_scheduled)
+ sockets_close_fds(loop);
+
+ if (loop->stopped)
+ break;
+
+ loop_begin = current_time();
+
+ if (rv)
+ sockets_fire(loop);
+
+ atomic_exchange_explicit(&loop->ping_sent, 0, memory_order_acq_rel);
+ }
+
+ /* Flush remaining events */
+ ASSERT_DIE(!ev_run_list(&loop->event_list));
+
+ /* No timers allowed */
+ ASSERT_DIE(timers_count(&loop->time) == 0);
+ ASSERT_DIE(EMPTY_LIST(loop->sock_list));
+ ASSERT_DIE(loop->sock_num == 0);
+
+ birdloop_leave(loop);
+ loop->stopped(loop->stop_data);
+
+ return NULL;
+}
+
+void
+birdloop_yield(void)
+{
+ usleep(100);
+}
diff --git a/sysdep/unix/io-loop.h b/sysdep/unix/io-loop.h
new file mode 100644
index 00000000..31c40459
--- /dev/null
+++ b/sysdep/unix/io-loop.h
@@ -0,0 +1,43 @@
+/*
+ * BIRD -- I/O and event loop
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#ifndef _BIRD_SYSDEP_UNIX_IO_LOOP_H_
+#define _BIRD_SYSDEP_UNIX_IO_LOOP_H_
+
+#include "lib/rcu.h"
+
+struct birdloop
+{
+ pool *pool;
+
+ struct timeloop time;
+ event_list event_list;
+ list sock_list;
+ uint sock_num;
+
+ BUFFER(sock *) poll_sk;
+ BUFFER(struct pollfd) poll_fd;
+ u8 poll_changed;
+ u8 close_scheduled;
+
+ uint ping_pending;
+ _Atomic u32 ping_sent;
+ int wakeup_fds[2];
+
+ pthread_t thread_id;
+ pthread_attr_t thread_attr;
+
+ struct rcu_birdloop rcu;
+
+ uint links;
+
+ void (*stopped)(void *data);
+ void *stop_data;
+
+ struct birdloop *prev_loop;
+};
+
+#endif
diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c
index 17dc05a3..23baffb2 100644
--- a/sysdep/unix/io.c
+++ b/sysdep/unix/io.c
@@ -36,12 +36,14 @@
#include "lib/resource.h"
#include "lib/socket.h"
#include "lib/event.h"
+#include "lib/locking.h"
#include "lib/timer.h"
#include "lib/string.h"
#include "nest/iface.h"
#include "conf/conf.h"
#include "sysdep/unix/unix.h"
+#include "sysdep/unix/io-loop.h"
#include CONFIG_INCLUDE_SYSIO_H
/* Maximum number of calls of tx handler for one socket in one
@@ -122,55 +124,50 @@ rf_fileno(struct rfile *f)
btime boot_time;
+
void
-times_init(struct timeloop *loop)
+times_update(void)
{
struct timespec ts;
int rv;
+ btime old_time = current_time();
+ btime old_real_time = current_real_time();
+
rv = clock_gettime(CLOCK_MONOTONIC, &ts);
if (rv < 0)
die("Monotonic clock is missing");
if ((ts.tv_sec < 0) || (((u64) ts.tv_sec) > ((u64) 1 << 40)))
log(L_WARN "Monotonic clock is crazy");
-
- loop->last_time = ts.tv_sec S + ts.tv_nsec NS;
- loop->real_time = 0;
-}
-
-void
-times_update(struct timeloop *loop)
-{
- struct timespec ts;
- int rv;
-
- rv = clock_gettime(CLOCK_MONOTONIC, &ts);
- if (rv < 0)
- die("clock_gettime: %m");
-
+
btime new_time = ts.tv_sec S + ts.tv_nsec NS;
- if (new_time < loop->last_time)
+ if (new_time < old_time)
log(L_ERR "Monotonic clock is broken");
- loop->last_time = new_time;
- loop->real_time = 0;
-}
-
-void
-times_update_real_time(struct timeloop *loop)
-{
- struct timespec ts;
- int rv;
-
rv = clock_gettime(CLOCK_REALTIME, &ts);
if (rv < 0)
die("clock_gettime: %m");
- loop->real_time = ts.tv_sec S + ts.tv_nsec NS;
-}
+ btime new_real_time = ts.tv_sec S + ts.tv_nsec NS;
+ if (!atomic_compare_exchange_strong_explicit(
+ &last_time,
+ &old_time,
+ new_time,
+ memory_order_acq_rel,
+ memory_order_relaxed))
+ DBG("Time update collision: last_time");
+
+ if (!atomic_compare_exchange_strong_explicit(
+ &real_time,
+ &old_real_time,
+ new_real_time,
+ memory_order_acq_rel,
+ memory_order_relaxed))
+ DBG("Time update collision: real_time");
+}
/**
* DOC: Sockets
@@ -804,18 +801,16 @@ sk_free(resource *r)
sk_ssh_free(s);
#endif
- if (s->fd < 0)
+ if ((s->fd < 0) || (s->flags & SKF_THREAD))
return;
- /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
- if (!(s->flags & SKF_THREAD))
- {
- if (s == current_sock)
- current_sock = sk_next(s);
- if (s == stored_sock)
- stored_sock = sk_next(s);
+ if (s == current_sock)
+ current_sock = sk_next(s);
+ if (s == stored_sock)
+ stored_sock = sk_next(s);
+
+ if (enlisted(&s->n))
rem_node(&s->n);
- }
if (s->type != SK_SSH && s->type != SK_SSH_ACTIVE)
close(s->fd);
@@ -1108,7 +1103,11 @@ sk_passive_connected(sock *s, int type)
return 1;
}
- sk_insert(t);
+ if (s->flags & SKF_PASSIVE_THREAD)
+ t->flags |= SKF_THREAD;
+ else
+ sk_insert(t);
+
sk_alloc_bufs(t);
s->rx_hook(t, 0);
return 1;
@@ -1516,6 +1515,36 @@ sk_open_unix(sock *s, char *name)
return 0;
}
+static void
+sk_reloop_hook(void *_vs)
+{
+ sock *s = _vs;
+ if (birdloop_inside(&main_birdloop))
+ {
+ s->flags &= ~SKF_THREAD;
+ sk_insert(s);
+ }
+ else
+ {
+ s->flags |= SKF_THREAD;
+ sk_start(s);
+ }
+}
+
+void
+sk_reloop(sock *s, struct birdloop *loop)
+{
+ if (enlisted(&s->n))
+ rem_node(&s->n);
+
+ s->reloop = (event) {
+ .hook = sk_reloop_hook,
+ .data = s,
+ };
+
+ ev_send_loop(loop, &s->reloop);
+}
+
#define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \
CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)
@@ -1854,8 +1883,8 @@ sk_read_ssh(sock *s)
/* sk_read() and sk_write() are called from BFD's event loop */
-int
-sk_read(sock *s, int revents)
+static inline int
+sk_read_noflush(sock *s, int revents)
{
switch (s->type)
{
@@ -1918,7 +1947,15 @@ sk_read(sock *s, int revents)
}
int
-sk_write(sock *s)
+sk_read(sock *s, int revents)
+{
+ int e = sk_read_noflush(s, revents);
+ tmp_flush();
+ return e;
+}
+
+static inline int
+sk_write_noflush(sock *s)
{
switch (s->type)
{
@@ -1966,6 +2003,14 @@ sk_write(sock *s)
}
}
+int
+sk_write(sock *s)
+{
+ int e = sk_write_noflush(s);
+ tmp_flush();
+ return e;
+}
+
int sk_is_ipv4(sock *s)
{ return s->af == AF_INET; }
@@ -1984,6 +2029,7 @@ sk_err(sock *s, int revents)
}
s->err_hook(s, se);
+ tmp_flush();
}
void
@@ -2020,30 +2066,17 @@ struct event_log_entry
static struct event_log_entry event_log[EVENT_LOG_LENGTH];
static struct event_log_entry *event_open;
static int event_log_pos, event_log_num, watchdog_active;
-static btime last_time;
+static btime last_io_time;
static btime loop_time;
static void
io_update_time(void)
{
- struct timespec ts;
- int rv;
-
- /*
- * This is third time-tracking procedure (after update_times() above and
- * times_update() in BFD), dedicated to internal event log and latency
- * tracking. Hopefully, we consolidate these sometimes.
- */
-
- rv = clock_gettime(CLOCK_MONOTONIC, &ts);
- if (rv < 0)
- die("clock_gettime: %m");
-
- last_time = ts.tv_sec S + ts.tv_nsec NS;
+ last_io_time = current_time();
if (event_open)
{
- event_open->duration = last_time - event_open->timestamp;
+ event_open->duration = last_io_time - event_open->timestamp;
if (event_open->duration > config->latency_limit)
log(L_WARN "Event 0x%p 0x%p took %u.%03u ms",
@@ -2072,7 +2105,7 @@ io_log_event(void *hook, void *data)
en->hook = hook;
en->data = data;
- en->timestamp = last_time;
+ en->timestamp = last_io_time;
en->duration = 0;
event_log_num++;
@@ -2100,14 +2133,14 @@ io_log_dump(void)
struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
if (en->hook)
log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
- (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
+ (int) ((last_io_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
}
}
void
watchdog_sigalrm(int sig UNUSED)
{
- /* Update last_time and duration, but skip latency check */
+ /* Update last_io_time and duration, but skip latency check */
config->latency_limit = 0xffffffff;
io_update_time();
@@ -2120,7 +2153,7 @@ watchdog_start1(void)
{
io_update_time();
- loop_time = last_time;
+ loop_time = last_io_time;
}
static inline void
@@ -2128,7 +2161,7 @@ watchdog_start(void)
{
io_update_time();
- loop_time = last_time;
+ loop_time = last_io_time;
event_log_num = 0;
if (config->watchdog_timeout)
@@ -2149,7 +2182,7 @@ watchdog_stop(void)
watchdog_active = 0;
}
- btime duration = last_time - loop_time;
+ btime duration = last_io_time - loop_time;
if (duration > config->watchdog_warning)
log(L_WARN "I/O loop cycle took %u.%03u ms for %d events",
(uint) (duration TO_MS), (uint) (duration % 1000), event_log_num);
@@ -2164,8 +2197,9 @@ void
io_init(void)
{
init_list(&sock_list);
- init_list(&global_event_list);
- init_list(&global_work_list);
+ ev_init_list(&global_event_list, &main_birdloop, "Global event list");
+ ev_init_list(&global_work_list, &main_birdloop, "Global work list");
+ ev_init_list(&main_birdloop.event_list, &main_birdloop, "Global fast event list");
krt_io_init();
// XXX init_times();
// XXX update_times();
@@ -2179,6 +2213,8 @@ static int short_loops = 0;
#define SHORT_LOOP_MAX 10
#define WORK_EVENTS_MAX 10
+void pipe_drain(int fd);
+
void
io_loop(void)
{
@@ -2193,22 +2229,28 @@ io_loop(void)
watchdog_start1();
for(;;)
{
- times_update(&main_timeloop);
+ times_update();
events = ev_run_list(&global_event_list);
events = ev_run_list_limited(&global_work_list, WORK_EVENTS_MAX) || events;
- timers_fire(&main_timeloop);
+ events = ev_run_list(&main_birdloop.event_list) || events;
+ timers_fire(&main_birdloop.time, 1);
io_close_event();
// FIXME
poll_tout = (events ? 0 : 3000); /* Time in milliseconds */
- if (t = timers_first(&main_timeloop))
+ if (t = timers_first(&main_birdloop.time))
{
- times_update(&main_timeloop);
+ times_update();
timeout = (tm_remains(t) TO_MS) + 1;
poll_tout = MIN(poll_tout, timeout);
}
- nfds = 0;
+ /* A hack to reload main io_loop() when something has changed asynchronously. */
+ pfd[0].fd = main_birdloop.wakeup_fds[0];
+ pfd[0].events = POLLIN;
+
+ nfds = 1;
+
WALK_LIST(n, sock_list)
{
pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */
@@ -2267,7 +2309,9 @@ io_loop(void)
/* And finally enter poll() to find active sockets */
watchdog_stop();
+ birdloop_leave(&main_birdloop);
pout = poll(pfd, nfds, poll_tout);
+ birdloop_enter(&main_birdloop);
watchdog_start();
if (pout < 0)
@@ -2278,7 +2322,15 @@ io_loop(void)
}
if (pout)
{
- times_update(&main_timeloop);
+ if (pfd[0].revents & POLLIN)
+ {
+ /* IO loop reload requested */
+ pipe_drain(main_birdloop.wakeup_fds[0]);
+ atomic_exchange_explicit(&main_birdloop.ping_sent, 0, memory_order_acq_rel);
+ continue;
+ }
+
+ times_update();
/* guaranteed to be non-empty */
current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
diff --git a/sysdep/unix/krt.Y b/sysdep/unix/krt.Y
index 95b54d65..4ce9a328 100644
--- a/sysdep/unix/krt.Y
+++ b/sysdep/unix/krt.Y
@@ -29,7 +29,7 @@ kif_set_preferred(ip_addr ip)
CF_DECLS
-CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, KRT_SOURCE, KRT_METRIC, MERGE, PATHS)
+CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, MERGE, PATHS)
CF_KEYWORDS(INTERFACE, PREFERRED)
%type <i> kern_mp_limit
@@ -122,9 +122,6 @@ kif_iface:
kif_iface_start iface_patt_list_nopx kif_iface_opt_list;
-dynamic_attr: KRT_SOURCE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SOURCE); } ;
-dynamic_attr: KRT_METRIC { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_METRIC); } ;
-
CF_CODE
CF_END
diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c
index 342adee5..cfa9b1ee 100644
--- a/sysdep/unix/krt.c
+++ b/sysdep/unix/krt.c
@@ -53,7 +53,7 @@
#include "nest/bird.h"
#include "nest/iface.h"
-#include "nest/route.h"
+#include "nest/rt.h"
#include "nest/protocol.h"
#include "filter/filter.h"
#include "conf/conf.h"
@@ -232,7 +232,6 @@ kif_copy_config(struct proto_config *dest, struct proto_config *src)
struct protocol proto_unix_iface = {
.name = "Device",
.template = "device%d",
- .class = PROTOCOL_DEVICE,
.proto_size = sizeof(struct kif_proto),
.config_size = sizeof(struct kif_config),
.preconfig = kif_preconfig,
@@ -243,6 +242,13 @@ struct protocol proto_unix_iface = {
.copy_config = kif_copy_config
};
+void
+kif_build(void)
+{
+ proto_build(&proto_unix_iface);
+}
+
+
/*
* Tracing of routes
*/
@@ -251,14 +257,14 @@ static inline void
krt_trace_in(struct krt_proto *p, rte *e, char *msg)
{
if (p->p.debug & D_PACKETS)
- log(L_TRACE "%s: %N: %s", p->p.name, e->net->n.addr, msg);
+ log(L_TRACE "%s: %N: %s", p->p.name, e->net, msg);
}
static inline void
krt_trace_in_rl(struct tbf *f, struct krt_proto *p, rte *e, char *msg)
{
if (p->p.debug & D_PACKETS)
- log_rl(f, L_TRACE "%s: %N: %s", p->p.name, e->net->n.addr, msg);
+ log_rl(f, L_TRACE "%s: %N: %s", p->p.name, e->net, msg);
}
/*
@@ -277,261 +283,49 @@ static struct tbf rl_alien = TBF_DEFAULT_LOG_LIMITS;
* the same key.
*/
-static inline int
-krt_same_key(rte *a, rte *b)
+static inline u32
+krt_metric(rte *a)
{
- return a->u.krt.metric == b->u.krt.metric;
+ eattr *ea = ea_find(a->attrs, &ea_krt_metric);
+ return ea ? ea->u.data : 0;
}
static inline int
-krt_uptodate(rte *a, rte *b)
-{
- if (a->attrs != b->attrs)
- return 0;
-
- if (a->u.krt.proto != b->u.krt.proto)
- return 0;
-
- return 1;
-}
-
-static void
-krt_learn_announce_update(struct krt_proto *p, rte *e)
+krt_same_key(rte *a, rte *b)
{
- net *n = e->net;
- rta *aa = rta_clone(e->attrs);
- rte *ee = rte_get_temp(aa);
- ee->pflags = EA_ID_FLAG(EA_KRT_SOURCE) | EA_ID_FLAG(EA_KRT_METRIC);
- ee->u.krt = e->u.krt;
- rte_update(&p->p, n->n.addr, ee);
+ return (krt_metric(a) == krt_metric(b));
}
-static void
-krt_learn_announce_delete(struct krt_proto *p, net *n)
+static inline int
+krt_uptodate(rte *a, rte *b)
{
- rte_update(&p->p, n->n.addr, NULL);
+ return (a->attrs == b->attrs);
}
/* Called when alien route is discovered during scan */
static void
krt_learn_scan(struct krt_proto *p, rte *e)
{
- net *n0 = e->net;
- net *n = net_get(p->krt_table, n0->n.addr);
- rte *m, **mm;
-
- e->attrs = rta_lookup(e->attrs);
-
- for(mm=&n->routes; m = *mm; mm=&m->next)
- if (krt_same_key(m, e))
- break;
- if (m)
- {
- if (krt_uptodate(m, e))
- {
- krt_trace_in_rl(&rl_alien, p, e, "[alien] seen");
- rte_free(e);
- m->u.krt.seen = 1;
- }
- else
- {
- krt_trace_in(p, e, "[alien] updated");
- *mm = m->next;
- rte_free(m);
- m = NULL;
- }
- }
- else
- krt_trace_in(p, e, "[alien] created");
- if (!m)
- {
- e->next = n->routes;
- n->routes = e;
- e->u.krt.seen = 1;
- }
-}
-
-static void
-krt_learn_prune(struct krt_proto *p)
-{
- struct fib *fib = &p->krt_table->fib;
- struct fib_iterator fit;
-
- KRT_TRACE(p, D_EVENTS, "Pruning inherited routes");
-
- FIB_ITERATE_INIT(&fit, fib);
-again:
- FIB_ITERATE_START(fib, &fit, net, n)
- {
- rte *e, **ee, *best, **pbest, *old_best;
-
- /*
- * Note that old_best may be NULL even if there was an old best route in
- * the previous step, because it might be replaced in krt_learn_scan().
- * But in that case there is a new valid best route.
- */
-
- old_best = NULL;
- best = NULL;
- pbest = NULL;
- ee = &n->routes;
- while (e = *ee)
- {
- if (e->u.krt.best)
- old_best = e;
-
- if (!e->u.krt.seen)
- {
- *ee = e->next;
- rte_free(e);
- continue;
- }
-
- if (!best || best->u.krt.metric > e->u.krt.metric)
- {
- best = e;
- pbest = ee;
- }
-
- e->u.krt.seen = 0;
- e->u.krt.best = 0;
- ee = &e->next;
- }
- if (!n->routes)
- {
- DBG("%I/%d: deleting\n", n->n.prefix, n->n.pxlen);
- if (old_best)
- krt_learn_announce_delete(p, n);
+ rte e0 = {
+ .attrs = e->attrs,
+ .src = rt_get_source(&p->p, krt_metric(e)),
+ };
- FIB_ITERATE_PUT(&fit);
- fib_delete(fib, n);
- goto again;
- }
+ ea_set_attr_u32(&e0.attrs, &ea_gen_preference, 0, p->p.main_channel->preference);
- best->u.krt.best = 1;
- *pbest = best->next;
- best->next = n->routes;
- n->routes = best;
-
- if ((best != old_best) || p->reload)
- {
- DBG("%I/%d: announcing (metric=%d)\n", n->n.prefix, n->n.pxlen, best->u.krt.metric);
- krt_learn_announce_update(p, best);
- }
- else
- DBG("%I/%d: uptodate (metric=%d)\n", n->n.prefix, n->n.pxlen, best->u.krt.metric);
- }
- FIB_ITERATE_END;
-
- p->reload = 0;
+ rte_update(p->p.main_channel, e->net, &e0, e0.src);
+ rt_unlock_source(e0.src);
}
static void
krt_learn_async(struct krt_proto *p, rte *e, int new)
{
- net *n0 = e->net;
- net *n = net_get(p->krt_table, n0->n.addr);
- rte *g, **gg, *best, **bestp, *old_best;
-
- e->attrs = rta_lookup(e->attrs);
-
- old_best = n->routes;
- for(gg=&n->routes; g = *gg; gg = &g->next)
- if (krt_same_key(g, e))
- break;
if (new)
- {
- if (g)
- {
- if (krt_uptodate(g, e))
- {
- krt_trace_in(p, e, "[alien async] same");
- rte_free(e);
- return;
- }
- krt_trace_in(p, e, "[alien async] updated");
- *gg = g->next;
- rte_free(g);
- }
- else
- krt_trace_in(p, e, "[alien async] created");
-
- e->next = n->routes;
- n->routes = e;
- }
- else if (!g)
- {
- krt_trace_in(p, e, "[alien async] delete failed");
- rte_free(e);
- return;
- }
- else
- {
- krt_trace_in(p, e, "[alien async] removed");
- *gg = g->next;
- rte_free(e);
- rte_free(g);
- }
- best = n->routes;
- bestp = &n->routes;
- for(gg=&n->routes; g=*gg; gg=&g->next)
- {
- if (best->u.krt.metric > g->u.krt.metric)
- {
- best = g;
- bestp = gg;
- }
-
- g->u.krt.best = 0;
- }
-
- if (best)
- {
- best->u.krt.best = 1;
- *bestp = best->next;
- best->next = n->routes;
- n->routes = best;
- }
-
- if (best != old_best)
- {
- DBG("krt_learn_async: distributing change\n");
- if (best)
- krt_learn_announce_update(p, best);
- else
- krt_learn_announce_delete(p, n);
- }
-}
-
-static void
-krt_learn_init(struct krt_proto *p)
-{
- if (KRT_CF->learn)
- {
- struct rtable_config *cf = mb_allocz(p->p.pool, sizeof(struct rtable_config));
- cf->name = "Inherited";
- cf->addr_type = p->p.net_type;
- cf->internal = 1;
-
- p->krt_table = rt_setup(p->p.pool, cf);
- }
-}
+ return krt_learn_scan(p, e);
-static void
-krt_dump(struct proto *P)
-{
- struct krt_proto *p = (struct krt_proto *) P;
-
- if (!KRT_CF->learn)
- return;
- debug("KRT: Table of inheritable routes\n");
- rt_dump(p->krt_table);
-}
-
-static void
-krt_dump_attrs(rte *e)
-{
- debug(" [m=%d,p=%d]", e->u.krt.metric, e->u.krt.proto);
+ struct rte_src *src = rt_get_source(&p->p, krt_metric(e));
+ rte_update(p->p.main_channel, e->net, NULL, src);
+ rt_unlock_source(src);
}
#endif
@@ -543,79 +337,83 @@ krt_dump_attrs(rte *e)
static inline int
krt_is_installed(struct krt_proto *p, net *n)
{
- return n->routes && bmap_test(&p->p.main_channel->export_map, n->routes->id);
+ return n->routes && bmap_test(&p->p.main_channel->export_map, n->routes->rte.id);
}
-static void
-krt_flush_routes(struct krt_proto *p)
+static uint
+rte_feed_count(net *n)
{
- struct rtable *t = p->p.main_channel->table;
+ uint count = 0;
+ for (struct rte_storage *e = n->routes; e; e = e->next)
+ if (rte_is_valid(RTE_OR_NULL(e)))
+ count++;
+ return count;
+}
- KRT_TRACE(p, D_EVENTS, "Flushing kernel routes");
- FIB_WALK(&t->fib, net, n)
+static void
+rte_feed_obtain(net *n, rte **feed, uint count)
+{
+ uint i = 0;
+ for (struct rte_storage *e = n->routes; e; e = e->next)
+ if (rte_is_valid(RTE_OR_NULL(e)))
{
- if (krt_is_installed(p, n))
- {
- /* FIXME: this does not work if gw is changed in export filter */
- krt_replace_rte(p, n, NULL, n->routes);
- }
+ ASSERT_DIE(i < count);
+ feed[i++] = &e->rte;
}
- FIB_WALK_END;
+ ASSERT_DIE(i == count);
}
static struct rte *
-krt_export_net(struct krt_proto *p, net *net, rte **rt_free)
+krt_export_net(struct krt_proto *p, net *net)
{
struct channel *c = p->p.main_channel;
const struct filter *filter = c->out_filter;
- rte *rt;
if (c->ra_mode == RA_MERGED)
- return rt_export_merged(c, net, rt_free, krt_filter_lp, 1);
+ {
+ uint count = rte_feed_count(net);
+ if (!count)
+ return NULL;
- rt = net->routes;
- *rt_free = NULL;
+ rte **feed = alloca(count * sizeof(rte *));
+ rte_feed_obtain(net, feed, count);
+ return rt_export_merged(c, feed, count, krt_filter_lp, 1);
+ }
+
+ static _Thread_local rte rt;
+ rt = net->routes->rte;
- if (!rte_is_valid(rt))
+ if (!rte_is_valid(&rt))
return NULL;
if (filter == FILTER_REJECT)
return NULL;
- rte_make_tmp_attrs(&rt, krt_filter_lp, NULL);
-
/* We could run krt_preexport() here, but it is already handled by krt_is_installed() */
if (filter == FILTER_ACCEPT)
goto accept;
- if (f_run(filter, &rt, krt_filter_lp, FF_SILENT) > F_ACCEPT)
+ if (f_run(filter, &rt, FF_SILENT) > F_ACCEPT)
goto reject;
accept:
- if (rt != net->routes)
- *rt_free = rt;
- return rt;
+ return &rt;
reject:
- if (rt != net->routes)
- rte_free(rt);
return NULL;
}
static int
krt_same_dest(rte *k, rte *e)
{
- rta *ka = k->attrs, *ea = e->attrs;
-
- if (ka->dest != ea->dest)
- return 0;
+ ea_list *ka = k->attrs, *ea = e->attrs;
- if (ka->dest == RTD_UNICAST)
- return nexthop_same(&(ka->nh), &(ea->nh));
+ eattr *nhea_k = ea_find(ka, &ea_gen_nexthop);
+ eattr *nhea_e = ea_find(ea, &ea_gen_nexthop);
- return 1;
+ return (!nhea_k == !nhea_e) && adata_same(nhea_k->u.ptr, nhea_e->u.ptr);
}
/*
@@ -624,13 +422,13 @@ krt_same_dest(rte *k, rte *e)
*/
void
-krt_got_route(struct krt_proto *p, rte *e)
+krt_got_route(struct krt_proto *p, rte *e, s8 src)
{
- rte *new = NULL, *rt_free = NULL;
- net *n = e->net;
+ rte *new = NULL;
+ e->pflags = 0;
#ifdef KRT_ALLOW_LEARN
- switch (e->u.krt.src)
+ switch (src)
{
case KRT_SRC_KERNEL:
goto ignore;
@@ -642,24 +440,25 @@ krt_got_route(struct krt_proto *p, rte *e)
if (KRT_CF->learn)
krt_learn_scan(p, e);
else
- {
- krt_trace_in_rl(&rl_alien, p, e, "[alien] ignored");
- rte_free(e);
- }
+ krt_trace_in_rl(&rl_alien, p, e, "[alien] ignored");
return;
}
#endif
/* The rest is for KRT_SRC_BIRD (or KRT_SRC_UNKNOWN) */
+ /* Deleting all routes if flush is requested */
+ if (p->flush_routes)
+ goto delete;
/* We wait for the initial feed to have correct installed state */
if (!p->ready)
goto ignore;
- if (!krt_is_installed(p, n))
+ net *net = net_find(p->p.main_channel->table, e->net);
+ if (!net || !krt_is_installed(p, net))
goto delete;
- new = krt_export_net(p, n, &rt_free);
+ new = krt_export_net(p, net);
/* Rejected by filters */
if (!new)
@@ -692,20 +491,15 @@ ignore:
update:
krt_trace_in(p, new, "updating");
- krt_replace_rte(p, n, new, e);
+ krt_replace_rte(p, e->net, new, e);
goto done;
delete:
krt_trace_in(p, e, "deleting");
- krt_replace_rte(p, n, NULL, e);
+ krt_replace_rte(p, e->net, NULL, e);
goto done;
done:
- rte_free(e);
-
- if (rt_free)
- rte_free(rt_free);
-
lp_flush(krt_filter_lp);
}
@@ -723,40 +517,42 @@ krt_prune(struct krt_proto *p)
KRT_TRACE(p, D_EVENTS, "Pruning table %s", t->name);
FIB_WALK(&t->fib, net, n)
{
- if (p->ready && krt_is_installed(p, n) && !bmap_test(&p->seen_map, n->routes->id))
+ if (p->ready && krt_is_installed(p, n) && !bmap_test(&p->seen_map, n->routes->rte.id))
{
- rte *rt_free = NULL;
- rte *new = krt_export_net(p, n, &rt_free);
+ rte *new = krt_export_net(p, n);
if (new)
{
krt_trace_in(p, new, "installing");
- krt_replace_rte(p, n, new, NULL);
+ krt_replace_rte(p, n->n.addr, new, NULL);
}
- if (rt_free)
- rte_free(rt_free);
-
lp_flush(krt_filter_lp);
}
}
FIB_WALK_END;
-#ifdef KRT_ALLOW_LEARN
- if (KRT_CF->learn)
- krt_learn_prune(p);
-#endif
-
if (p->ready)
p->initialized = 1;
}
+static void
+krt_flush_routes(struct krt_proto *p)
+{
+ KRT_TRACE(p, D_EVENTS, "Flushing kernel routes");
+ p->flush_routes = 1;
+ krt_init_scan(p);
+ krt_do_scan(p);
+ /* No prune! */
+ p->flush_routes = 0;
+}
+
void
-krt_got_route_async(struct krt_proto *p, rte *e, int new)
+krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src)
{
- net *net = e->net;
+ e->pflags = 0;
- switch (e->u.krt.src)
+ switch (src)
{
case KRT_SRC_BIRD:
/* Should be filtered by the back end */
@@ -766,7 +562,7 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new)
if (new)
{
krt_trace_in(p, e, "[redirect] deleting");
- krt_replace_rte(p, net, NULL, e);
+ krt_replace_rte(p, e->net, NULL, e);
}
/* If !new, it is probably echo of our deletion */
break;
@@ -780,7 +576,6 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new)
}
#endif
}
- rte_free(e);
}
@@ -899,29 +694,10 @@ krt_scan_timer_kick(struct krt_proto *p)
* Updates
*/
-static void
-krt_make_tmp_attrs(struct rte *rt, struct linpool *pool)
-{
- rte_init_tmp_attrs(rt, pool, 2);
- rte_make_tmp_attr(rt, EA_KRT_SOURCE, EAF_TYPE_INT, rt->u.krt.proto);
- rte_make_tmp_attr(rt, EA_KRT_METRIC, EAF_TYPE_INT, rt->u.krt.metric);
-}
-
-static void
-krt_store_tmp_attrs(struct rte *rt, struct linpool *pool)
-{
- rte_init_tmp_attrs(rt, pool, 2);
- rt->u.krt.proto = rte_store_tmp_attr(rt, EA_KRT_SOURCE);
- rt->u.krt.metric = rte_store_tmp_attr(rt, EA_KRT_METRIC);
-}
-
static int
-krt_preexport(struct channel *C, rte **new, struct linpool *pool UNUSED)
+krt_preexport(struct channel *C, rte *e)
{
- // struct krt_proto *p = (struct krt_proto *) P;
- rte *e = *new;
-
- if (e->attrs->src->proto == C->proto)
+ if (e->src->owner == &C->proto->sources)
return -1;
if (!krt_capable(e))
@@ -931,8 +707,8 @@ krt_preexport(struct channel *C, rte **new, struct linpool *pool UNUSED)
}
static void
-krt_rt_notify(struct proto *P, struct channel *ch UNUSED, net *net,
- rte *new, rte *old)
+krt_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net,
+ rte *new, const rte *old)
{
struct krt_proto *p = (struct krt_proto *) P;
@@ -996,14 +772,6 @@ krt_feed_end(struct channel *C)
}
-static int
-krt_rte_same(rte *a, rte *b)
-{
- /* src is always KRT_SRC_ALIEN and type is irrelevant */
- return (a->u.krt.proto == b->u.krt.proto) && (a->u.krt.metric == b->u.krt.metric);
-}
-
-
/*
* Protocol glue
*/
@@ -1057,9 +825,6 @@ krt_init(struct proto_config *CF)
p->p.if_notify = krt_if_notify;
p->p.reload_routes = krt_reload_routes;
p->p.feed_end = krt_feed_end;
- p->p.make_tmp_attrs = krt_make_tmp_attrs;
- p->p.store_tmp_attrs = krt_store_tmp_attrs;
- p->p.rte_same = krt_rte_same;
krt_sys_init(p);
return &p->p;
@@ -1085,10 +850,6 @@ krt_start(struct proto *P)
bmap_init(&p->seen_map, p->p.pool, 1024);
add_tail(&krt_proto_list, &p->krt_node);
-#ifdef KRT_ALLOW_LEARN
- krt_learn_init(p);
-#endif
-
if (!krt_sys_start(p))
{
rem_node(&p->krt_node);
@@ -1168,24 +929,15 @@ krt_copy_config(struct proto_config *dest, struct proto_config *src)
krt_sys_copy_config(d, s);
}
-static int
-krt_get_attr(const eattr *a, byte *buf, int buflen)
-{
- switch (a->id)
- {
- case EA_KRT_SOURCE:
- bsprintf(buf, "source");
- return GA_NAME;
-
- case EA_KRT_METRIC:
- bsprintf(buf, "metric");
- return GA_NAME;
-
- default:
- return krt_sys_get_attr(a, buf, buflen);
- }
-}
+struct ea_class ea_krt_source = {
+ .name = "krt_source",
+ .type = T_INT,
+};
+struct ea_class ea_krt_metric = {
+ .name = "krt_metric",
+ .type = T_INT,
+};
#ifdef CONFIG_IP6_SADR_KERNEL
#define MAYBE_IP6_SADR NB_IP6_SADR
@@ -1202,7 +954,6 @@ krt_get_attr(const eattr *a, byte *buf, int buflen)
struct protocol proto_unix_kernel = {
.name = "Kernel",
.template = "kernel%d",
- .class = PROTOCOL_KERNEL,
.preference = DEF_PREF_INHERITED,
.channel_mask = NB_IP | MAYBE_IP6_SADR | MAYBE_MPLS,
.proto_size = sizeof(struct krt_proto),
@@ -1214,9 +965,15 @@ struct protocol proto_unix_kernel = {
.shutdown = krt_shutdown,
.reconfigure = krt_reconfigure,
.copy_config = krt_copy_config,
- .get_attr = krt_get_attr,
-#ifdef KRT_ALLOW_LEARN
- .dump = krt_dump,
- .dump_attrs = krt_dump_attrs,
-#endif
};
+
+void
+krt_build(void)
+{
+ proto_build(&proto_unix_kernel);
+
+ EA_REGISTER_ALL(
+ &ea_krt_source,
+ &ea_krt_metric,
+ );
+}
diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h
index 1536e502..9f7ebb4f 100644
--- a/sysdep/unix/krt.h
+++ b/sysdep/unix/krt.h
@@ -21,8 +21,10 @@ struct kif_proto;
#define KRT_DEFAULT_ECMP_LIMIT 16
-#define EA_KRT_SOURCE EA_CODE(PROTOCOL_KERNEL, 0)
-#define EA_KRT_METRIC EA_CODE(PROTOCOL_KERNEL, 1)
+extern struct ea_class ea_krt_source, ea_krt_metric;
+
+#define KRT_REF_SEEN 0x1 /* Seen in table */
+#define KRT_REF_BEST 0x2 /* Best in table */
/* Whenever we recognize our own routes, we allow learing of foreign routes */
@@ -48,10 +50,6 @@ struct krt_proto {
struct proto p;
struct krt_state sys; /* Sysdep state */
-#ifdef KRT_ALLOW_LEARN
- struct rtable *krt_table; /* Internal table of inherited routes */
-#endif
-
timer *scan_timer;
struct bmap sync_map; /* Keeps track which exported routes were successfully written to kernel */
struct bmap seen_map; /* Routes seen during last periodic scan */
@@ -60,6 +58,7 @@ struct krt_proto {
byte ready; /* Initial feed has been finished */
byte initialized; /* First scan has been finished */
byte reload; /* Next scan is doing reload */
+ byte flush_routes; /* Scanning to flush */
};
extern pool *krt_pool;
@@ -74,8 +73,8 @@ extern pool *krt_pool;
struct proto_config * kif_init_config(int class);
void kif_request_scan(void);
void krt_use_shared_scan(void);
-void krt_got_route(struct krt_proto *p, struct rte *e);
-void krt_got_route_async(struct krt_proto *p, struct rte *e, int new);
+void krt_got_route(struct krt_proto *p, struct rte *e, s8 src);
+void krt_got_route_async(struct krt_proto *p, struct rte *e, int new, s8 src);
static inline int
krt_get_sync_error(struct krt_proto *p, struct rte *e)
@@ -138,7 +137,7 @@ void krt_sys_copy_config(struct krt_config *, struct krt_config *);
int krt_capable(rte *e);
void krt_do_scan(struct krt_proto *);
-void krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old);
+void krt_replace_rte(struct krt_proto *p, const net_addr *n, rte *new, const rte *old);
int krt_sys_get_attr(const eattr *a, byte *buf, int buflen);
diff --git a/sysdep/unix/log.c b/sysdep/unix/log.c
index 14d18c01..185231e8 100644
--- a/sysdep/unix/log.c
+++ b/sysdep/unix/log.c
@@ -15,6 +15,7 @@
* user's manual.
*/
+#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
@@ -35,8 +36,10 @@ static FILE *dbgf;
static list *current_log_list;
static char *current_syslog_name; /* NULL -> syslog closed */
+static _Atomic uint max_thread_id = ATOMIC_VAR_INIT(1);
+static _Thread_local uint this_thread_id;
-#ifdef USE_PTHREADS
+#define THIS_THREAD_ID (this_thread_id ?: (this_thread_id = atomic_fetch_add_explicit(&max_thread_id, 1, memory_order_acq_rel)))
#include <pthread.h>
@@ -48,15 +51,6 @@ static pthread_t main_thread;
void main_thread_init(void) { main_thread = pthread_self(); }
static int main_thread_self(void) { return pthread_equal(pthread_self(), main_thread); }
-#else
-
-static inline void log_lock(void) { }
-static inline void log_unlock(void) { }
-void main_thread_init(void) { }
-static int main_thread_self(void) { return 1; }
-
-#endif
-
#ifdef HAVE_SYSLOG_H
#include <sys/syslog.h>
@@ -189,7 +183,7 @@ log_commit(int class, buffer *buf)
l->pos += msg_len;
}
- fprintf(l->fh, "%s <%s> ", tbuf, class_names[class]);
+ fprintf(l->fh, "%s [%04x] <%s> ", tbuf, THIS_THREAD_ID, class_names[class]);
}
fputs(buf->start, l->fh);
fputc('\n', l->fh);
@@ -299,6 +293,8 @@ die(const char *msg, ...)
exit(1);
}
+static struct timespec dbg_time_start;
+
/**
* debug - write to debug output
* @msg: a printf-like message
@@ -309,22 +305,36 @@ die(const char *msg, ...)
void
debug(const char *msg, ...)
{
-#define MAX_DEBUG_BUFSIZE 65536
+#define MAX_DEBUG_BUFSIZE 16384
va_list args;
- static uint bufsize = 4096;
- static char *buf = NULL;
-
- if (!buf)
- buf = mb_alloc(&root_pool, bufsize);
+ char buf[MAX_DEBUG_BUFSIZE], *pos = buf;
+ int max = MAX_DEBUG_BUFSIZE;
va_start(args, msg);
if (dbgf)
{
- while (bvsnprintf(buf, bufsize, msg, args) < 0)
- if (bufsize >= MAX_DEBUG_BUFSIZE)
- bug("Extremely long debug output, split it.");
- else
- buf = mb_realloc(buf, (bufsize *= 2));
+ struct timespec dbg_time;
+ clock_gettime(CLOCK_MONOTONIC, &dbg_time);
+ uint nsec;
+ uint sec;
+
+ if (dbg_time.tv_nsec > dbg_time_start.tv_nsec)
+ {
+ nsec = dbg_time.tv_nsec - dbg_time_start.tv_nsec;
+ sec = dbg_time.tv_sec - dbg_time_start.tv_sec;
+ }
+ else
+ {
+ nsec = 1000000000 + dbg_time.tv_nsec - dbg_time_start.tv_nsec;
+ sec = dbg_time.tv_sec - dbg_time_start.tv_sec - 1;
+ }
+
+ int n = bsnprintf(pos, max, "%u.%09u: [%04x] ", sec, nsec, THIS_THREAD_ID);
+ pos += n;
+ max -= n;
+
+ if (bvsnprintf(pos, max, msg, args) < 0)
+ bug("Extremely long debug output, split it.");
fputs(buf, dbgf);
}
@@ -429,6 +439,8 @@ done:
void
log_init_debug(char *f)
{
+ clock_gettime(CLOCK_MONOTONIC, &dbg_time_start);
+
if (dbgf && dbgf != stderr)
fclose(dbgf);
if (!f)
diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c
index f87d0c43..bf9f2be0 100644
--- a/sysdep/unix/main.c
+++ b/sysdep/unix/main.c
@@ -28,9 +28,10 @@
#include "lib/resource.h"
#include "lib/socket.h"
#include "lib/event.h"
+#include "lib/locking.h"
#include "lib/timer.h"
#include "lib/string.h"
-#include "nest/route.h"
+#include "nest/rt.h"
#include "nest/protocol.h"
#include "nest/iface.h"
#include "nest/cli.h"
@@ -56,7 +57,7 @@ async_dump(void)
// XXXX tm_dump_all();
if_dump_all();
neigh_dump_all();
- rta_dump_all();
+ ea_dump_all();
rt_dump_all();
protos_dump_all();
@@ -479,6 +480,14 @@ cli_err(sock *s, int err)
cli_free(s->data);
}
+static void
+cli_connect_err(sock *s UNUSED, int err)
+{
+ ASSERT_DIE(err);
+ if (config->cli_debug)
+ log(L_INFO "Failed to accept CLI connection: %s", strerror(err));
+}
+
static int
cli_connect(sock *s, uint size UNUSED)
{
@@ -507,6 +516,7 @@ cli_init_unix(uid_t use_uid, gid_t use_gid)
s = cli_sk = sk_new(cli_pool);
s->type = SK_UNIX_PASSIVE;
s->rx_hook = cli_connect;
+ s->err_hook = cli_connect_err;
s->rbsize = 1024;
s->fast_rx = 1;
@@ -864,16 +874,19 @@ main(int argc, char **argv)
dmalloc_debug(0x2f03d00);
#endif
+ times_update();
parse_args(argc, argv);
log_switch(1, NULL, NULL);
+ the_bird_lock();
+
random_init();
net_init();
resource_init();
- timer_init();
+ birdloop_init();
olock_init();
- io_init();
rt_init();
+ io_init();
if_init();
// roa_init();
config_init();
@@ -897,8 +910,6 @@ main(int argc, char **argv)
open_pid_file();
protos_build();
- proto_build(&proto_unix_kernel);
- proto_build(&proto_unix_iface);
struct config *conf = read_config();
@@ -920,6 +931,7 @@ main(int argc, char **argv)
dup2(0, 2);
}
+
main_thread_init();
write_pid_file();