From e37d2e3e708db78f9b8f15eb34bb266cfae7bb42 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Wed, 20 Jul 2016 15:06:57 +0200 Subject: Netlink: Ignore tentative addresses Ignore tentative IPv6 addresses and wait until finish of Duplicate Address Detection (We got notification when an address is no longer tentative) to avoid problems when protocols try to use interfaces with tentative link-local addresses. Based on patch from Jan Moskyto Matejka --- sysdep/linux/netlink.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'sysdep/linux') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index b42e7b6f..f6cedbd8 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -247,18 +247,20 @@ static struct nl_want_attrs ifla_attr_want[BIRD_IFLA_MAX] = { }; -#define BIRD_IFA_MAX (IFA_ANYCAST+1) +#define BIRD_IFA_MAX (IFA_FLAGS+1) #ifndef IPV6 static struct nl_want_attrs ifa_attr_want4[BIRD_IFA_MAX] = { [IFA_ADDRESS] = { 1, 1, sizeof(ip4_addr) }, [IFA_LOCAL] = { 1, 1, sizeof(ip4_addr) }, [IFA_BROADCAST] = { 1, 1, sizeof(ip4_addr) }, + [IFA_FLAGS] = { 1, 1, sizeof(u32) }, }; #else static struct nl_want_attrs ifa_attr_want6[BIRD_IFA_MAX] = { [IFA_ADDRESS] = { 1, 1, sizeof(ip6_addr) }, [IFA_LOCAL] = { 1, 1, sizeof(ip6_addr) }, + [IFA_FLAGS] = { 1, 1, sizeof(u32) }, }; #endif @@ -611,6 +613,7 @@ nl_parse_addr(struct nlmsghdr *h, int scan) struct ifa ifa; struct iface *ifi; int scope; + u32 ifa_flags; if (!(i = nl_checkin(h, sizeof(*i)))) return; @@ -643,6 +646,11 @@ nl_parse_addr(struct nlmsghdr *h, int scan) return; } + if (a[IFA_FLAGS]) + ifa_flags = rta_get_u32(a[IFA_FLAGS]); + else + ifa_flags = i->ifa_flags; + ifi = if_find_by_index(i->ifa_index); if (!ifi) { @@ -652,9 +660,15 @@ nl_parse_addr(struct nlmsghdr *h, int scan) bzero(&ifa, sizeof(ifa)); ifa.iface = ifi; - if (i->ifa_flags & IFA_F_SECONDARY) + if (ifa_flags & IFA_F_SECONDARY) ifa.flags |= IA_SECONDARY; +#ifdef IPV6 + /* Ignore tentative addresses silently */ + if (ifa_flags & IFA_F_TENTATIVE) + return; +#endif + /* IFA_LOCAL can be unset for IPv6 interfaces */ memcpy(&ifa.ip, RTA_DATA(a[IFA_LOCAL] ? : a[IFA_ADDRESS]), sizeof(ifa.ip)); ipa_ntoh(ifa.ip); -- cgit v1.2.3 From a08a81c6b40dcf07e786b67e5015fc91a44333ca Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Wed, 20 Jul 2016 15:31:25 +0200 Subject: Netlink: Fix build with older headers missing IFA_FLAGS --- sysdep/linux/netlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'sysdep/linux') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index f6cedbd8..fa9013bd 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -38,6 +38,10 @@ #define MSG_TRUNC 0x20 #endif +#ifndef IFA_FLAGS +#define IFA_FLAGS 8 +#endif + #ifndef IFF_LOWER_UP #define IFF_LOWER_UP 0x10000 #endif -- cgit v1.2.3 From f9f2e280ea4fa1af19d8ce9e54abeb495694c0d5 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Tue, 30 Aug 2016 12:43:46 +0200 Subject: KRT: Forbid path merging on BSD We support ECMP routes only on Linux. Exported routes are checked in krt_capable(), but a route generated during path merging avoids this check. --- sysdep/linux/krt-sys.h | 2 ++ sysdep/unix/krt.Y | 18 +++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'sysdep/linux') diff --git a/sysdep/linux/krt-sys.h b/sysdep/linux/krt-sys.h index 7fd5f139..076870f5 100644 --- a/sysdep/linux/krt-sys.h +++ b/sysdep/linux/krt-sys.h @@ -32,6 +32,8 @@ static inline struct ifa * kif_get_primary_ip(struct iface *i) { return NULL; } /* Kernel routes */ +#define KRT_ALLOW_MERGE_PATHS 1 + #define EA_KRT_PREFSRC EA_CODE(EAP_KRT, 0x10) #define EA_KRT_REALM EA_CODE(EAP_KRT, 0x11) diff --git a/sysdep/unix/krt.Y b/sysdep/unix/krt.Y index e036081d..2ef6e3c4 100644 --- a/sysdep/unix/krt.Y +++ b/sysdep/unix/krt.Y @@ -19,6 +19,8 @@ CF_DECLS CF_KEYWORDS(KERNEL, PERSIST, SCAN, TIME, LEARN, DEVICE, ROUTES, GRACEFUL, RESTART, KRT_SOURCE, KRT_METRIC, MERGE, PATHS) +%type kern_mp_limit + CF_GRAMMAR /* Kernel syncer protocol */ @@ -32,6 +34,11 @@ CF_ADDTO(kern_proto, kern_proto_start proto_name '{') CF_ADDTO(kern_proto, kern_proto proto_item ';') CF_ADDTO(kern_proto, kern_proto kern_item ';') +kern_mp_limit: + /* empty */ { $$ = KRT_DEFAULT_ECMP_LIMIT; } + | LIMIT expr { $$ = $2; if (($2 <= 0) || ($2 > 255)) cf_error("Merge paths limit must be in range 1-255"); } + ; + kern_item: PERSIST bool { THIS_KRT->persist = $2; } | SCAN TIME expr { @@ -42,13 +49,18 @@ kern_item: THIS_KRT->learn = $2; #ifndef KRT_ALLOW_LEARN if ($2) - cf_error("Learning of kernel routes not supported in this configuration"); + cf_error("Learning of kernel routes not supported on this platform"); #endif } | DEVICE ROUTES bool { THIS_KRT->devroutes = $3; } | GRACEFUL RESTART bool { THIS_KRT->graceful_restart = $3; } - | MERGE PATHS bool { THIS_KRT->merge_paths = $3 ? KRT_DEFAULT_ECMP_LIMIT : 0; } - | MERGE PATHS bool LIMIT expr { THIS_KRT->merge_paths = $3 ? $5 : 0; if (($5 <= 0) || ($5 > 255)) cf_error("Merge paths limit must be in range 1-255"); } + | MERGE PATHS bool kern_mp_limit { + THIS_KRT->merge_paths = $3 ? $4 : 0; +#ifndef KRT_ALLOW_LEARN + if ($3) + cf_error("Path merging not supported on this platform"); +#endif + } ; /* Kernel interface protocol */ -- cgit v1.2.3 From 2feaa6931bfe39eba696b33b0c8aac13d313b223 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Wed, 14 Sep 2016 11:40:15 +0200 Subject: KRT: Support for IPv6 ECMP Linux represents IPv6 ECMP routes as a sequence of unipath routes with the same prefix. We have to translate between our representation (one route with multipath next hop) and the Linux representation in both directions. Proper learning of alien IPv6 ECMP routes still not supported. Thanks to Mikhail Sennikovskii for the original patch. --- sysdep/linux/netlink.c | 354 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 277 insertions(+), 77 deletions(-) (limited to 'sysdep/linux') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index fa9013bd..1ab1cda1 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -19,7 +19,6 @@ #include "nest/route.h" #include "nest/protocol.h" #include "nest/iface.h" -#include "lib/alloca.h" #include "lib/timer.h" #include "lib/unix.h" #include "lib/krt.h" @@ -51,6 +50,49 @@ #endif +#ifdef IPV6 +#define krt_ecmp6(X) 1 +#else +#define krt_ecmp6(X) 0 +#endif + +/* + * Structure nl_parse_state keeps state of received route processing. Ideally, + * we could just independently parse received Netlink messages and immediately + * propagate received routes to the rest of BIRD, but Linux kernel represents + * and announces IPv6 ECMP routes not as one route with multiple next hops (like + * RTA_MULTIPATH in IPv4 ECMP), but as a set of routes with the same prefix. + * + * Therefore, BIRD keeps currently processed route in nl_parse_state structure + * and postpones its propagation until we expect it to be final; i.e., when + * non-matching route is received or when the scan ends. When another matching + * route is received, it is merged with the already processed route to form an + * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the + * postponing is done in both cases (for simplicity). All IPv4 routes are just + * considered non-matching. + * + * This is ignored for asynchronous notifications (every notification is handled + * as a separate route). It is not an issue for our routes, as we ignore such + * notifications anyways. But importing alien IPv6 ECMP routes does not work + * properly. + */ + +struct nl_parse_state +{ + struct linpool *pool; + int scan; + int merge; + + net *net; + rta *attrs; + struct krt_proto *proto; + s8 new; + s8 krt_src; + u8 krt_type; + u8 krt_proto; + u32 krt_metric; +}; + /* * Synchronous Netlink interface */ @@ -66,6 +108,13 @@ struct nl_sock #define NL_RX_SIZE 8192 +#define NL_OP_DELETE 0 +#define NL_OP_ADD (NLM_F_CREATE|NLM_F_EXCL) +#define NL_OP_REPLACE (NLM_F_CREATE|NLM_F_REPLACE) +#define NL_OP_APPEND (NLM_F_CREATE|NLM_F_APPEND) + +static linpool *nl_linpool; + static struct nl_sock nl_scan = {.fd = -1}; /* Netlink socket for synchronous scan */ static struct nl_sock nl_req = {.fd = -1}; /* Netlink socket for requests */ @@ -169,7 +218,7 @@ nl_get_reply(struct nl_sock *nl) static struct tbf rl_netlink_err = TBF_DEFAULT_LOG_LIMITS; static int -nl_error(struct nlmsghdr *h) +nl_error(struct nlmsghdr *h, int ignore_esrch) { struct nlmsgerr *e; int ec; @@ -181,7 +230,7 @@ nl_error(struct nlmsghdr *h) } e = (struct nlmsgerr *) NLMSG_DATA(h); ec = -e->error; - if (ec) + if (ec && !(ignore_esrch && (ec == ESRCH))) log_rl(&rl_netlink_err, L_WARN "Netlink: %s", strerror(ec)); return ec; } @@ -195,14 +244,14 @@ nl_get_scan(void) return NULL; if (h->nlmsg_type == NLMSG_ERROR) { - nl_error(h); + nl_error(h, 0); return NULL; } return h; } static int -nl_exchange(struct nlmsghdr *pkt) +nl_exchange(struct nlmsghdr *pkt, int ignore_esrch) { struct nlmsghdr *h; @@ -214,7 +263,7 @@ nl_exchange(struct nlmsghdr *pkt) break; log(L_WARN "nl_exchange: Unexpected reply received"); } - return nl_error(h) ? -1 : 0; + return nl_error(h, ignore_esrch) ? -1 : 0; } /* @@ -826,7 +875,7 @@ nh_bufsize(struct mpnh *nh) } static int -nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new) +nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int op, int dest, ip_addr gw, struct iface *iface) { eattr *ea; net *net = e->net; @@ -837,13 +886,13 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new) char buf[128 + KRT_METRICS_MAX*8 + nh_bufsize(a->nexthops)]; } r; - DBG("nl_send_route(%I/%d,new=%d)\n", net->n.prefix, net->n.pxlen, new); + DBG("nl_send_route(%I/%d,op=%x)\n", net->n.prefix, net->n.pxlen, op); bzero(&r.h, sizeof(r.h)); bzero(&r.r, sizeof(r.r)); - r.h.nlmsg_type = new ? RTM_NEWROUTE : RTM_DELROUTE; + r.h.nlmsg_type = op ? RTM_NEWROUTE : RTM_DELROUTE; r.h.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); - r.h.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | (new ? NLM_F_CREATE|NLM_F_EXCL : 0); + r.h.nlmsg_flags = op | NLM_F_REQUEST | NLM_F_ACK; r.r.rtm_family = BIRD_AF; r.r.rtm_dst_len = net->n.pxlen; @@ -851,15 +900,21 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new) r.r.rtm_scope = RT_SCOPE_UNIVERSE; nl_add_attr_ipa(&r.h, sizeof(r), RTA_DST, net->n.prefix); + /* + * Strange behavior for RTM_DELROUTE: + * 1) rtm_family is ignored in IPv6, works for IPv4 + * 2) not setting RTA_PRIORITY is different from setting default value (on IPv6) + * 3) not setting RTA_PRIORITY is equivalent to setting 0, which is wildcard + */ + if (krt_table_id(p) < 256) r.r.rtm_table = krt_table_id(p); else nl_add_attr_u32(&r.h, sizeof(r), RTA_TABLE, krt_table_id(p)); - /* For route delete, we do not specify route attributes */ - if (!new) - return nl_exchange(&r.h); - + /* For route delete, we do not specify remaining route attributes */ + if (op == NL_OP_DELETE) + goto dest; if (ea = ea_find(eattrs, EA_KRT_METRIC)) nl_add_attr_u32(&r.h, sizeof(r), RTA_PRIORITY, ea->u.data); @@ -886,18 +941,18 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new) nl_add_metrics(&r.h, sizeof(r), metrics, KRT_METRICS_MAX); +dest: /* a->iface != NULL checked in krt_capable() for router and device routes */ - - switch (a->dest) + switch (dest) { case RTD_ROUTER: r.r.rtm_type = RTN_UNICAST; - nl_add_attr_u32(&r.h, sizeof(r), RTA_OIF, a->iface->index); - nl_add_attr_ipa(&r.h, sizeof(r), RTA_GATEWAY, a->gw); + nl_add_attr_u32(&r.h, sizeof(r), RTA_OIF, iface->index); + nl_add_attr_ipa(&r.h, sizeof(r), RTA_GATEWAY, gw); break; case RTD_DEVICE: r.r.rtm_type = RTN_UNICAST; - nl_add_attr_u32(&r.h, sizeof(r), RTA_OIF, a->iface->index); + nl_add_attr_u32(&r.h, sizeof(r), RTA_OIF, iface->index); break; case RTD_BLACKHOLE: r.r.rtm_type = RTN_BLACKHOLE; @@ -912,11 +967,50 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int new) r.r.rtm_type = RTN_UNICAST; nl_add_multipath(&r.h, sizeof(r), a->nexthops); break; + case RTD_NONE: + break; default: bug("krt_capable inconsistent with nl_send_route"); } - return nl_exchange(&r.h); + /* Ignore missing for DELETE */ + return nl_exchange(&r.h, (op == NL_OP_DELETE)); +} + +static inline int +nl_add_rte(struct krt_proto *p, rte *e, struct ea_list *eattrs) +{ + rta *a = e->attrs; + int err = 0; + + if (krt_ecmp6(p) && (a->dest == RTD_MULTIPATH)) + { + struct mpnh *nh = a->nexthops; + + err = nl_send_route(p, e, eattrs, NL_OP_ADD, RTD_ROUTER, nh->gw, nh->iface); + if (err < 0) + return err; + + for (nh = nh->next; nh; nh = nh->next) + err += nl_send_route(p, e, eattrs, NL_OP_APPEND, RTD_ROUTER, nh->gw, nh->iface); + + return err; + } + + return nl_send_route(p, e, eattrs, NL_OP_ADD, a->dest, a->gw, a->iface); +} + +static inline int +nl_delete_rte(struct krt_proto *p, rte *e, struct ea_list *eattrs) +{ + int err = 0; + + /* For IPv6, we just repeatedly request DELETE until we get error */ + do + err = nl_send_route(p, e, eattrs, NL_OP_DELETE, RTD_NONE, IPA_NONE, NULL); + while (krt_ecmp6(p) && !err); + + return err; } void @@ -925,17 +1019,21 @@ krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list int err = 0; /* - * NULL for eattr of the old route is a little hack, but we don't - * get proper eattrs for old in rt_notify() anyway. NULL means no - * extended route attributes and therefore matches if the kernel - * route has any of them. + * We could use NL_OP_REPLACE, but route replace on Linux has some problems: + * + * 1) Does not check for matching rtm_protocol + * 2) Has broken semantics for IPv6 ECMP + * 3) Crashes some kernel version when used for IPv6 ECMP + * + * So we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the old + * route value, so we do not try to optimize IPv6 ECMP reconfigurations. */ if (old) - nl_send_route(p, old, NULL, 0); + nl_delete_rte(p, old, eattrs); if (new) - err = nl_send_route(p, new, eattrs, 1); + err = nl_add_rte(p, new, eattrs); if (err < 0) n->n.flags |= KRF_SYNC_ERROR; @@ -944,10 +1042,80 @@ krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list } +static inline struct mpnh * +nl_alloc_mpnh(struct nl_parse_state *s, ip_addr gw, struct iface *iface, byte weight) +{ + struct mpnh *nh = lp_alloc(s->pool, sizeof(struct mpnh)); + + nh->gw = gw; + nh->iface = iface; + nh->next = NULL; + nh->weight = weight; + + return nh; +} + +static int +nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type) +{ + /* Route merging must be active */ + if (!s->merge) + return 0; + + /* Saved and new route must have same network, proto/table, and priority */ + if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority)) + return 0; + + /* Both must be regular unicast routes */ + if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST)) + return 0; + + return 1; +} + +static void +nl_announce_route(struct nl_parse_state *s) +{ + rte *e = rte_get_temp(s->attrs); + e->net = s->net; + e->u.krt.src = s->krt_src; + e->u.krt.proto = s->krt_proto; + e->u.krt.seen = 0; + e->u.krt.best = 0; + e->u.krt.metric = s->krt_metric; + + if (s->scan) + krt_got_route(s->proto, e); + else + krt_got_route_async(s->proto, e, s->new); + + s->net = NULL; + s->attrs = NULL; + s->proto = NULL; + lp_flush(s->pool); +} + +static inline void +nl_parse_begin(struct nl_parse_state *s, int scan, int merge) +{ + memset(s, 0, sizeof (struct nl_parse_state)); + s->pool = nl_linpool; + s->scan = scan; + s->merge = merge; +} + +static inline void +nl_parse_end(struct nl_parse_state *s) +{ + if (s->net) + nl_announce_route(s); +} + + #define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0) static void -nl_parse_route(struct nlmsghdr *h, int scan) +nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) { struct krt_proto *p; struct rtmsg *i; @@ -957,6 +1125,7 @@ nl_parse_route(struct nlmsghdr *h, int scan) ip_addr dst = IPA_NONE; u32 oif = ~0; u32 table; + u32 priority = 0; int src; if (!(i = nl_checkin(h, sizeof(*i)))) @@ -999,7 +1168,6 @@ nl_parse_route(struct nlmsghdr *h, int scan) if (!p) SKIP("unknown table %d\n", table); - #ifdef IPV6 if (a[RTA_IIF]) SKIP("IIF set\n"); @@ -1008,9 +1176,12 @@ nl_parse_route(struct nlmsghdr *h, int scan) SKIP("TOS %02x\n", i->rtm_tos); #endif - if (scan && !new) + if (s->scan && !new) SKIP("RTM_DELROUTE in scan\n"); + if (a[RTA_PRIORITY]) + priority = rta_get_u32(a[RTA_PRIORITY]); + int c = ipa_classify_net(dst); if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK)) SKIP("strange class/scope\n"); @@ -1019,6 +1190,7 @@ nl_parse_route(struct nlmsghdr *h, int scan) // if (i->rtm_scope != RT_SCOPE_UNIVERSE) // SKIP("scope %u\n", i->rtm_scope); + switch (i->rtm_protocol) { case RTPROT_UNSPEC: @@ -1033,7 +1205,7 @@ nl_parse_route(struct nlmsghdr *h, int scan) return; case RTPROT_BIRD: - if (!scan) + if (!s->scan) SKIP("echo\n"); src = KRT_SRC_BIRD; break; @@ -1045,12 +1217,14 @@ nl_parse_route(struct nlmsghdr *h, int scan) net *net = net_get(p->p.table, dst, i->rtm_dst_len); - rta ra = { - .src= p->p.main_source, - .source = RTS_INHERIT, - .scope = SCOPE_UNIVERSE, - .cast = RTC_UNICAST - }; + if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type)) + nl_announce_route(s); + + rta *ra = lp_allocz(s->pool, sizeof(rta)); + ra->src = p->p.main_source; + ra->source = RTS_INHERIT; + ra->scope = SCOPE_UNIVERSE; + ra->cast = RTC_UNICAST; switch (i->rtm_type) { @@ -1058,9 +1232,9 @@ nl_parse_route(struct nlmsghdr *h, int scan) if (a[RTA_MULTIPATH] && (i->rtm_family == AF_INET)) { - ra.dest = RTD_MULTIPATH; - ra.nexthops = nl_parse_multipath(p, a[RTA_MULTIPATH]); - if (!ra.nexthops) + ra->dest = RTD_MULTIPATH; + ra->nexthops = nl_parse_multipath(p, a[RTA_MULTIPATH]); + if (!ra->nexthops) { log(L_ERR "KRT: Received strange multipath route %I/%d", net->n.prefix, net->n.pxlen); @@ -1070,8 +1244,8 @@ nl_parse_route(struct nlmsghdr *h, int scan) break; } - ra.iface = if_find_by_index(oif); - if (!ra.iface) + ra->iface = if_find_by_index(oif); + if (!ra->iface) { log(L_ERR "KRT: Received route %I/%d with unknown ifindex %u", net->n.prefix, net->n.pxlen, oif); @@ -1081,39 +1255,39 @@ nl_parse_route(struct nlmsghdr *h, int scan) if (a[RTA_GATEWAY]) { neighbor *ng; - ra.dest = RTD_ROUTER; - memcpy(&ra.gw, RTA_DATA(a[RTA_GATEWAY]), sizeof(ra.gw)); - ipa_ntoh(ra.gw); + ra->dest = RTD_ROUTER; + memcpy(&ra->gw, RTA_DATA(a[RTA_GATEWAY]), sizeof(ra->gw)); + ipa_ntoh(ra->gw); #ifdef IPV6 /* Silently skip strange 6to4 routes */ - if (ipa_in_net(ra.gw, IPA_NONE, 96)) + if (ipa_in_net(ra->gw, IPA_NONE, 96)) return; #endif - ng = neigh_find2(&p->p, &ra.gw, ra.iface, + ng = neigh_find2(&p->p, &ra->gw, ra->iface, (i->rtm_flags & RTNH_F_ONLINK) ? NEF_ONLINK : 0); if (!ng || (ng->scope == SCOPE_HOST)) { log(L_ERR "KRT: Received route %I/%d with strange next-hop %I", - net->n.prefix, net->n.pxlen, ra.gw); + net->n.prefix, net->n.pxlen, ra->gw); return; } } else { - ra.dest = RTD_DEVICE; + ra->dest = RTD_DEVICE; } break; case RTN_BLACKHOLE: - ra.dest = RTD_BLACKHOLE; + ra->dest = RTD_BLACKHOLE; break; case RTN_UNREACHABLE: - ra.dest = RTD_UNREACHABLE; + ra->dest = RTD_UNREACHABLE; break; case RTN_PROHIBIT: - ra.dest = RTD_PROHIBIT; + ra->dest = RTD_PROHIBIT; break; /* FIXME: What about RTN_THROW? */ default: @@ -1121,41 +1295,30 @@ nl_parse_route(struct nlmsghdr *h, int scan) return; } - rte *e = rte_get_temp(&ra); - e->net = net; - e->u.krt.src = src; - e->u.krt.proto = i->rtm_protocol; - e->u.krt.seen = 0; - e->u.krt.best = 0; - e->u.krt.metric = 0; - - if (a[RTA_PRIORITY]) - e->u.krt.metric = rta_get_u32(a[RTA_PRIORITY]); - if (a[RTA_PREFSRC]) { ip_addr ps; memcpy(&ps, RTA_DATA(a[RTA_PREFSRC]), sizeof(ps)); ipa_ntoh(ps); - ea_list *ea = alloca(sizeof(ea_list) + sizeof(eattr)); - ea->next = ra.eattrs; - ra.eattrs = ea; + ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); + ea->next = ra->eattrs; + ra->eattrs = ea; ea->flags = EALF_SORTED; ea->count = 1; ea->attrs[0].id = EA_KRT_PREFSRC; ea->attrs[0].flags = 0; ea->attrs[0].type = EAF_TYPE_IP_ADDRESS; - ea->attrs[0].u.ptr = alloca(sizeof(struct adata) + sizeof(ps)); + ea->attrs[0].u.ptr = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps)); ea->attrs[0].u.ptr->length = sizeof(ps); memcpy(ea->attrs[0].u.ptr->data, &ps, sizeof(ps)); } if (a[RTA_FLOW]) { - ea_list *ea = alloca(sizeof(ea_list) + sizeof(eattr)); - ea->next = ra.eattrs; - ra.eattrs = ea; + ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); + ea->next = ra->eattrs; + ra->eattrs = ea; ea->flags = EALF_SORTED; ea->count = 1; ea->attrs[0].id = EA_KRT_REALM; @@ -1167,7 +1330,7 @@ nl_parse_route(struct nlmsghdr *h, int scan) if (a[RTA_METRICS]) { u32 metrics[KRT_METRICS_MAX]; - ea_list *ea = alloca(sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr)); + ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr)); int t, n = 0; if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0) @@ -1189,30 +1352,62 @@ nl_parse_route(struct nlmsghdr *h, int scan) if (n > 0) { - ea->next = ra.eattrs; + ea->next = ra->eattrs; ea->flags = EALF_SORTED; ea->count = n; - ra.eattrs = ea; + ra->eattrs = ea; } } - if (scan) - krt_got_route(p, e); + /* + * Ideally, now we would send the received route to the rest of kernel code. + * But IPv6 ECMP routes are sent as a sequence of routes, so we postpone it + * and merge next hops until the end of the sequence. + */ + + if (!s->net) + { + /* Store the new route */ + s->net = net; + s->attrs = ra; + s->proto = p; + s->new = new; + s->krt_src = src; + s->krt_type = i->rtm_type; + s->krt_proto = i->rtm_protocol; + s->krt_metric = priority; + } else - krt_got_route_async(p, e, new); + { + /* Merge next hops with the stored route */ + rta *a = s->attrs; + + if (a->dest != RTD_MULTIPATH) + { + a->dest = RTD_MULTIPATH; + a->nexthops = nl_alloc_mpnh(s, a->gw, a->iface, 0); + } + + mpnh_insert(&a->nexthops, nl_alloc_mpnh(s, ra->gw, ra->iface, 0)); + } } void krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */ { struct nlmsghdr *h; + struct nl_parse_state s; + + nl_parse_begin(&s, 1, krt_ecmp6(p)); nl_request_dump(BIRD_AF, RTM_GETROUTE); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) - nl_parse_route(h, 1); + nl_parse_route(&s, h); else log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type); + + nl_parse_end(&s); } /* @@ -1225,12 +1420,16 @@ static byte *nl_async_rx_buffer; /* Receive buffer */ static void nl_async_msg(struct nlmsghdr *h) { + struct nl_parse_state s; + switch (h->nlmsg_type) { case RTM_NEWROUTE: case RTM_DELROUTE: DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type); - nl_parse_route(h, 0); + nl_parse_begin(&s, 0, 0); + nl_parse_route(&s, h); + nl_parse_end(&s); break; case RTM_NEWLINK: case RTM_DELLINK: @@ -1353,6 +1552,7 @@ nl_open_async(void) void krt_sys_io_init(void) { + nl_linpool = lp_new(krt_pool, 4080); HASH_INIT(nl_table_map, krt_pool, 6); } -- cgit v1.2.3 From 4adcb9df1bf551cc5fd1145c09af1843fdc4fe85 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Thu, 15 Sep 2016 14:59:06 +0200 Subject: KRT: Add kernel metric protocol option Kernel routes with different metrics do not clash with each other, therefore using dedicated metric value is a reliable way to avoid overwriting routes from other sources (e.g. kernel device routes). Although kernel route metric could already be set as a route attribute by filters, that is not consistent with the way how Linux kernel handles route metric - not just a route attribute, but a part of a route key. --- doc/bird.sgml | 15 ++++++++++++++- sysdep/linux/krt-sys.h | 1 + sysdep/linux/netlink.Y | 9 ++++----- sysdep/linux/netlink.c | 19 +++++++++++++++---- 4 files changed, 34 insertions(+), 10 deletions(-) (limited to 'sysdep/linux') diff --git a/doc/bird.sgml b/doc/bird.sgml index 1af7f2c1..9bd7df86 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -2388,6 +2388,17 @@ limitations can be overcome using another routing table and the pipe protocol. protocol work with. Available only on systems supporting multiple routing tables. + metric (Linux) + Use specified value as a kernel metric (priority) for all routes sent to + the kernel. When multiple routes for the same network are in the kernel + routing table, the Linux kernel chooses one with lower metric. Also, + routes with different metrics do not clash with each other, therefore + using dedicated metric value is a reliable way to avoid overwriting + routes from other sources (e.g. kernel device routes). Metric 0 has a + special meaning of undefined metric, in which either OS default is used, + or per-route metric can be set using graceful restart Participate in graceful restart recovery. If this option is enabled and a graceful restart recovery is active, the Kernel protocol will defer @@ -2420,9 +2431,11 @@ these attributes: route. See /etc/iproute2/rt_protos for common values. On BSD, it is based on STATIC and PROTOx flags. The attribute is read-only. - int + int (Linux) The kernel metric of the route. When multiple same routes are in a kernel routing table, the Linux kernel chooses one with lower metric. + Note that preferred way to set kernel metric is to use protocol option + ip (Linux) The preferred source address. Used in source address selection for diff --git a/sysdep/linux/krt-sys.h b/sysdep/linux/krt-sys.h index 076870f5..96688e34 100644 --- a/sysdep/linux/krt-sys.h +++ b/sysdep/linux/krt-sys.h @@ -88,6 +88,7 @@ static inline struct ifa * kif_get_primary_ip(struct iface *i) { return NULL; } struct krt_params { u32 table_id; /* Kernel table ID we sync with */ + u32 metric; /* Kernel metric used for all routes */ }; struct krt_state { diff --git a/sysdep/linux/netlink.Y b/sysdep/linux/netlink.Y index e9c225a2..a1c22f3e 100644 --- a/sysdep/linux/netlink.Y +++ b/sysdep/linux/netlink.Y @@ -10,8 +10,8 @@ CF_HDR CF_DECLS -CF_KEYWORDS(KERNEL, TABLE, KRT_PREFSRC, KRT_REALM, KRT_MTU, KRT_WINDOW, KRT_RTT, - KRT_RTTVAR, KRT_SSTRESH, KRT_CWND, KRT_ADVMSS, KRT_REORDERING, +CF_KEYWORDS(KERNEL, TABLE, METRIC, KRT_PREFSRC, KRT_REALM, KRT_MTU, KRT_WINDOW, + KRT_RTT, KRT_RTTVAR, KRT_SSTRESH, KRT_CWND, KRT_ADVMSS, KRT_REORDERING, KRT_HOPLIMIT, KRT_INITCWND, KRT_RTO_MIN, KRT_INITRWND, KRT_QUICKACK, KRT_LOCK_MTU, KRT_LOCK_WINDOW, KRT_LOCK_RTT, KRT_LOCK_RTTVAR, KRT_LOCK_SSTRESH, KRT_LOCK_CWND, KRT_LOCK_ADVMSS, KRT_LOCK_REORDERING, @@ -22,9 +22,8 @@ CF_GRAMMAR CF_ADDTO(kern_proto, kern_proto kern_sys_item ';') kern_sys_item: - KERNEL TABLE expr { - THIS_KRT->sys.table_id = $3; - } + KERNEL TABLE expr { THIS_KRT->sys.table_id = $3; } + | METRIC expr { THIS_KRT->sys.metric = $2; } ; CF_ADDTO(dynamic_attr, KRT_PREFSRC { $$ = f_new_dynamic_attr(EAF_TYPE_IP_ADDRESS, T_IP, EA_KRT_PREFSRC); }) diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 1ab1cda1..9bdcc0d2 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -880,6 +880,8 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int op, int d eattr *ea; net *net = e->net; rta *a = e->attrs; + u32 priority = 0; + struct { struct nlmsghdr h; struct rtmsg r; @@ -912,13 +914,20 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int op, int d else nl_add_attr_u32(&r.h, sizeof(r), RTA_TABLE, krt_table_id(p)); + if (a->source == RTS_DUMMY) + priority = e->u.krt.metric; + else if (KRT_CF->sys.metric) + priority = KRT_CF->sys.metric; + else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC))) + priority = ea->u.data; + + if (priority) + nl_add_attr_u32(&r.h, sizeof(r), RTA_PRIORITY, priority); + /* For route delete, we do not specify remaining route attributes */ if (op == NL_OP_DELETE) goto dest; - if (ea = ea_find(eattrs, EA_KRT_METRIC)) - nl_add_attr_u32(&r.h, sizeof(r), RTA_PRIORITY, ea->u.data); - if (ea = ea_find(eattrs, EA_KRT_PREFSRC)) nl_add_attr_ipa(&r.h, sizeof(r), RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data); @@ -1585,19 +1594,21 @@ krt_sys_shutdown(struct krt_proto *p) int krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt_config *o) { - return n->sys.table_id == o->sys.table_id; + return (n->sys.table_id == o->sys.table_id) && (n->sys.metric == o->sys.metric); } void krt_sys_init_config(struct krt_config *cf) { cf->sys.table_id = RT_TABLE_MAIN; + cf->sys.metric = 0; } void krt_sys_copy_config(struct krt_config *d, struct krt_config *s) { d->sys.table_id = s->sys.table_id; + d->sys.metric = s->sys.metric; } static const char *krt_metrics_names[KRT_METRICS_MAX] = { -- cgit v1.2.3 From 6e75d0d27fe85f12a22928e5729465823704281e Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Mon, 19 Sep 2016 12:29:56 +0200 Subject: KRT: Add krt_scope attribute Add a new route attribute, krt_scope, to expose the Linux kernel route scope. Constants from /etc/iproute2/rt_scopes (prefixed by "ips_") are expected to be used with the attribute. Both import and export are supported. Also, the patch fixes device route export to the kernel, by setting link scope automatically. --- doc/bird.sgml | 9 +++++++++ sysdep/linux/krt-sys.h | 1 + sysdep/linux/netlink.Y | 3 ++- sysdep/linux/netlink.c | 33 ++++++++++++++++++++++++++------- 4 files changed, 38 insertions(+), 8 deletions(-) (limited to 'sysdep/linux') diff --git a/doc/bird.sgml b/doc/bird.sgml index 9bd7df86..af5b7980 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -2443,6 +2443,15 @@ these attributes: int (Linux) The realm of the route. Can be used for traffic classification. + + int (Linux IPv4) + The scope of the route. Valid values are 0-254, although Linux kernel + may reject some values depending on route type and nexthop. It is + supposed to represent `indirectness' of the route, where nexthops of + routes are resolved through routes with a higher scope, but in current + kernels anything below

In Linux, there is also a plenty of obscure route attributes mostly focused diff --git a/sysdep/linux/krt-sys.h b/sysdep/linux/krt-sys.h index 96688e34..6d6586d1 100644 --- a/sysdep/linux/krt-sys.h +++ b/sysdep/linux/krt-sys.h @@ -36,6 +36,7 @@ static inline struct ifa * kif_get_primary_ip(struct iface *i) { return NULL; } #define EA_KRT_PREFSRC EA_CODE(EAP_KRT, 0x10) #define EA_KRT_REALM EA_CODE(EAP_KRT, 0x11) +#define EA_KRT_SCOPE EA_CODE(EAP_KRT, 0x12) #define KRT_METRICS_MAX 0x10 /* RTAX_QUICKACK+1 */ diff --git a/sysdep/linux/netlink.Y b/sysdep/linux/netlink.Y index a1c22f3e..f577244d 100644 --- a/sysdep/linux/netlink.Y +++ b/sysdep/linux/netlink.Y @@ -10,7 +10,7 @@ CF_HDR CF_DECLS -CF_KEYWORDS(KERNEL, TABLE, METRIC, KRT_PREFSRC, KRT_REALM, KRT_MTU, KRT_WINDOW, +CF_KEYWORDS(KERNEL, TABLE, METRIC, KRT_PREFSRC, KRT_REALM, KRT_SCOPE, KRT_MTU, KRT_WINDOW, KRT_RTT, KRT_RTTVAR, KRT_SSTRESH, KRT_CWND, KRT_ADVMSS, KRT_REORDERING, KRT_HOPLIMIT, KRT_INITCWND, KRT_RTO_MIN, KRT_INITRWND, KRT_QUICKACK, KRT_LOCK_MTU, KRT_LOCK_WINDOW, KRT_LOCK_RTT, KRT_LOCK_RTTVAR, @@ -28,6 +28,7 @@ kern_sys_item: CF_ADDTO(dynamic_attr, KRT_PREFSRC { $$ = f_new_dynamic_attr(EAF_TYPE_IP_ADDRESS, T_IP, EA_KRT_PREFSRC); }) CF_ADDTO(dynamic_attr, KRT_REALM { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_REALM); }) +CF_ADDTO(dynamic_attr, KRT_SCOPE { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_SCOPE); }) CF_ADDTO(dynamic_attr, KRT_MTU { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_MTU); }) CF_ADDTO(dynamic_attr, KRT_WINDOW { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_INT, EA_KRT_WINDOW); }) diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 9bdcc0d2..1490213e 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -899,7 +899,7 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int op, int d r.r.rtm_family = BIRD_AF; r.r.rtm_dst_len = net->n.pxlen; r.r.rtm_protocol = RTPROT_BIRD; - r.r.rtm_scope = RT_SCOPE_UNIVERSE; + r.r.rtm_scope = RT_SCOPE_NOWHERE; nl_add_attr_ipa(&r.h, sizeof(r), RTA_DST, net->n.prefix); /* @@ -928,6 +928,12 @@ nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int op, int d if (op == NL_OP_DELETE) goto dest; + /* Default scope is LINK for device routes, UNIVERSE otherwise */ + if (ea = ea_find(eattrs, EA_KRT_SCOPE)) + r.r.rtm_scope = ea->u.data; + else + r.r.rtm_scope = (dest == RTD_DEVICE) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; + if (ea = ea_find(eattrs, EA_KRT_PREFSRC)) nl_add_attr_ipa(&r.h, sizeof(r), RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data); @@ -1135,6 +1141,7 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) u32 oif = ~0; u32 table; u32 priority = 0; + u32 def_scope = RT_SCOPE_UNIVERSE; int src; if (!(i = nl_checkin(h, sizeof(*i)))) @@ -1157,7 +1164,6 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) return; } - if (a[RTA_DST]) { memcpy(&dst, RTA_DATA(a[RTA_DST]), sizeof(dst)); @@ -1195,11 +1201,6 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK)) SKIP("strange class/scope\n"); - // ignore rtm_scope, it is not a real scope - // if (i->rtm_scope != RT_SCOPE_UNIVERSE) - // SKIP("scope %u\n", i->rtm_scope); - - switch (i->rtm_protocol) { case RTPROT_UNSPEC: @@ -1286,6 +1287,7 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) else { ra->dest = RTD_DEVICE; + def_scope = RT_SCOPE_LINK; } break; @@ -1304,6 +1306,19 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) return; } + if (i->rtm_scope != def_scope) + { + ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr)); + ea->next = ra->eattrs; + ra->eattrs = ea; + ea->flags = EALF_SORTED; + ea->count = 1; + ea->attrs[0].id = EA_KRT_SCOPE; + ea->attrs[0].flags = 0; + ea->attrs[0].type = EAF_TYPE_INT; + ea->attrs[0].u.data = i->rtm_scope; + } + if (a[RTA_PREFSRC]) { ip_addr ps; @@ -1633,6 +1648,10 @@ krt_sys_get_attr(eattr *a, byte *buf, int buflen UNUSED) bsprintf(buf, "realm"); return GA_NAME; + case EA_KRT_SCOPE: + bsprintf(buf, "scope"); + return GA_NAME; + case EA_KRT_LOCK: buf += bsprintf(buf, "lock:"); ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX); -- cgit v1.2.3 From ccd2a3eda24230df550e9880f4340fc6341c8f52 Mon Sep 17 00:00:00 2001 From: Jan Moskyto Matejka Date: Thu, 29 Sep 2016 12:00:53 +0200 Subject: Kernel socket missing err_hook fix Thanks to Tim Weippert for bugreport. --- sysdep/bsd/krt-sock.c | 7 +++++++ sysdep/linux/netlink.c | 7 +++++++ sysdep/unix/io.c | 2 +- 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'sysdep/linux') diff --git a/sysdep/bsd/krt-sock.c b/sysdep/bsd/krt-sock.c index 6ff3b2b7..f2ae81c3 100644 --- a/sysdep/bsd/krt-sock.c +++ b/sysdep/bsd/krt-sock.c @@ -911,6 +911,12 @@ krt_sock_hook(sock *sk, int size UNUSED) return 0; } +static void +krt_sock_err_hook(sock *sk, int e UNUSED) +{ + krt_sock_hook(sk, 0); +} + static sock * krt_sock_open(pool *pool, void *data, int table_id) { @@ -932,6 +938,7 @@ krt_sock_open(pool *pool, void *data, int table_id) sk = sk_new(pool); sk->type = SK_MAGIC; sk->rx_hook = krt_sock_hook; + sk->err_hook = krt_sock_err_hook; sk->fd = fd; sk->data = data; diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 1490213e..79dd1405 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -1525,6 +1525,12 @@ nl_async_hook(sock *sk, int size UNUSED) return 1; } +static void +nl_async_err_hook(sock *sk, int e UNUSED) +{ + nl_async_hook(sk, 0); +} + static void nl_open_async(void) { @@ -1563,6 +1569,7 @@ nl_open_async(void) sk = nl_async_sk = sk_new(krt_pool); sk->type = SK_MAGIC; sk->rx_hook = nl_async_hook; + sk->err_hook = nl_async_err_hook; sk->fd = fd; if (sk_open(sk) < 0) bug("Netlink: sk_open failed"); diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index 120eb906..3ceadd98 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -1857,7 +1857,7 @@ void sk_err(sock *s, int revents) { int se = 0, sse = sizeof(se); - if (revents & POLLERR) + if ((s->type != SK_MAGIC) && (revents & POLLERR)) if (getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &se, &sse) < 0) { log(L_ERR "IO: Socket error: SO_ERROR: %m"); -- cgit v1.2.3