From 77d032c71f62e44293a10ccc22f8c157442df179 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Wed, 5 Jan 2022 18:46:41 +0100 Subject: Netlink: Improve multipath parsing errors Function nl_parse_multipath() should handle errors internally. --- sysdep/linux/netlink.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) (limited to 'sysdep/linux') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index fdf3f2db..1293df4d 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -681,7 +681,7 @@ nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, e } static struct nexthop * -nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr *ra, int af) +nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr *n, struct rtattr *ra, int af) { struct rtattr *a[BIRD_RTA_MAX]; struct rtnexthop *nh = RTA_DATA(ra); @@ -695,7 +695,7 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr { /* Use RTNH_OK(nh,len) ?? */ if ((len < sizeof(*nh)) || (len < nh->rtnh_len)) - return NULL; + goto err; if (nh->rtnh_flags & RTNH_F_DEAD) goto next; @@ -706,7 +706,10 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr rv->weight = nh->rtnh_hops; rv->iface = if_find_by_index(nh->rtnh_ifindex); if (!rv->iface) - return NULL; + { + log(L_ERR "KRT: Received route %N with unknown ifindex %u", n, nh->rtnh_ifindex); + return NULL; + } /* Nonexistent RTNH_PAYLOAD ?? 
*/ nl_attr_len = nh->rtnh_len - RTNH_LENGTH(0); @@ -714,18 +717,18 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr { case AF_INET: if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want4, a, sizeof(a))) - return NULL; + goto err; break; case AF_INET6: if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want6, a, sizeof(a))) - return NULL; + goto err; break; #ifdef HAVE_MPLS_KERNEL case AF_MPLS: if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want_mpls, a, sizeof(a))) - return NULL; + goto err; if (a[RTA_NEWDST]) rv->labels = rta_get_mpls(a[RTA_NEWDST], rv->label); @@ -734,7 +737,7 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr #endif default: - return NULL; + goto err; } if (a[RTA_GATEWAY]) @@ -757,14 +760,19 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr nbr = neigh_find(&p->p, rv->gw, rv->iface, (rv->flags & RNF_ONLINK) ? NEF_ONLINK : 0); if (!nbr || (nbr->scope == SCOPE_HOST)) - return NULL; + { + log(L_ERR "KRT: Received route %N with strange next-hop %I", n, rv->gw); + return NULL; + } } #ifdef HAVE_MPLS_KERNEL if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE]) { - if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS) { - log(L_WARN "KRT: Unknown encapsulation method %d in multipath", rta_get_u16(a[RTA_ENCAP_TYPE])); + if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS) + { + log(L_WARN "KRT: Received route %N with unknown encapsulation method %d", + n, rta_get_u16(a[RTA_ENCAP_TYPE])); return NULL; } @@ -785,6 +793,10 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr first = nexthop_sort(first); return first; + +err: + log(L_ERR "KRT: Received strange multipath route %N", n); + return NULL; } static void @@ -1675,19 +1687,16 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) if (a[RTA_MULTIPATH]) { - struct nexthop *nh = nl_parse_multipath(s, p, a[RTA_MULTIPATH], i->rtm_family); + struct nexthop *nh = 
nl_parse_multipath(s, p, n, a[RTA_MULTIPATH], i->rtm_family); if (!nh) - { - log(L_ERR "KRT: Received strange multipath route %N", net->n.addr); - return; - } + SKIP("strange RTA_MULTIPATH\n"); nexthop_link(ra, nh); break; } if (i->rtm_flags & RTNH_F_DEAD) - return; + SKIP("ignore RTNH_F_DEAD\n"); ra->nh.iface = if_find_by_index(oif); if (!ra->nh.iface) -- cgit v1.2.3 From f5c8fb5fba959d356ce1ea0fb5879223f76137f7 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Wed, 5 Jan 2022 19:25:42 +0100 Subject: Netlink: Do not ignore dead routes from BIRD Currently, BIRD ignores dead routes to consider them absent. But it also ignores its own routes and thus it can not correctly manage such routes in some cases. This patch makes an exception for routes with proto bird when ignoring dead routes, so they can be properly updated or removed. Thanks to Alexander Zubkov for the original patch. --- sysdep/linux/netlink.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'sysdep/linux') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 1293df4d..e127052a 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -681,7 +681,7 @@ nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, e } static struct nexthop * -nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr *n, struct rtattr *ra, int af) +nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr *n, struct rtattr *ra, int af, int krt_src) { struct rtattr *a[BIRD_RTA_MAX]; struct rtnexthop *nh = RTA_DATA(ra); @@ -697,7 +697,7 @@ nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr if ((len < sizeof(*nh)) || (len < nh->rtnh_len)) goto err; - if (nh->rtnh_flags & RTNH_F_DEAD) + if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) goto next; *last = rv = lp_allocz(s->pool, NEXTHOP_MAX_SIZE); @@ -1687,7 +1687,7 @@ nl_parse_route(struct nl_parse_state *s, 
struct nlmsghdr *h) if (a[RTA_MULTIPATH]) { - struct nexthop *nh = nl_parse_multipath(s, p, n, a[RTA_MULTIPATH], i->rtm_family); + struct nexthop *nh = nl_parse_multipath(s, p, n, a[RTA_MULTIPATH], i->rtm_family, krt_src); if (!nh) SKIP("strange RTA_MULTIPATH\n"); @@ -1695,7 +1695,7 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) break; } - if (i->rtm_flags & RTNH_F_DEAD) + if ((i->rtm_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD)) SKIP("ignore RTNH_F_DEAD\n"); ra->nh.iface = if_find_by_index(oif); -- cgit v1.2.3 From 87a02489f3880689a4e2ad72b0b981649dad2154 Mon Sep 17 00:00:00 2001 From: Alexander Zubkov Date: Sat, 8 Jan 2022 18:31:56 +0100 Subject: IO: Support nonlocal bind in socket interface Add option to socket interface for nonlocal binding, i.e. binding to an IP address that is not present on interfaces. This behaviour is enabled when SKF_FREEBIND socket flag is set. For Linux systems, it is implemented by IP_FREEBIND socket flag. Minor changes done by the committer. 
--- lib/socket.h | 1 + sysdep/bsd/sysio.h | 6 ++++++ sysdep/linux/sysio.h | 19 +++++++++++++++++++ sysdep/unix/io.c | 4 ++++ 4 files changed, 30 insertions(+) (limited to 'sysdep/linux') diff --git a/lib/socket.h b/lib/socket.h index 96fedeeb..0b6ac589 100644 --- a/lib/socket.h +++ b/lib/socket.h @@ -123,6 +123,7 @@ extern int sk_priority_control; /* Suggested priority for control traffic, shou #define SKF_TTL_RX 0x08 /* Report TTL / Hop Limit for RX packets */ #define SKF_BIND 0x10 /* Bind datagram socket to given source address */ #define SKF_HIGH_PORT 0x20 /* Choose port from high range if possible */ +#define SKF_FREEBIND 0x40 /* Allow socket to bind to a nonlocal address */ #define SKF_THREAD 0x100 /* Socked used in thread, Do not add to main loop */ #define SKF_TRUNCATED 0x200 /* Received packet was truncated, set by IO layer */ diff --git a/sysdep/bsd/sysio.h b/sysdep/bsd/sysio.h index c757960a..f1887fb4 100644 --- a/sysdep/bsd/sysio.h +++ b/sysdep/bsd/sysio.h @@ -271,3 +271,9 @@ sk_set_priority(sock *s, int prio UNUSED) { ERR_MSG("Socket priority not supported"); } + +static inline int +sk_set_freebind(sock *s) +{ + ERR_MSG("Freebind is not supported"); +} diff --git a/sysdep/linux/sysio.h b/sysdep/linux/sysio.h index e21ff487..f13eda7c 100644 --- a/sysdep/linux/sysio.h +++ b/sysdep/linux/sysio.h @@ -10,6 +10,10 @@ #define IPV6_MINHOPCOUNT 73 #endif +#ifndef IPV6_FREEBIND +#define IPV6_FREEBIND 78 +#endif + #ifndef TCP_MD5SIG_EXT #define TCP_MD5SIG_EXT 32 #endif @@ -266,3 +270,18 @@ sk_set_priority(sock *s, int prio) return 0; } +static inline int +sk_set_freebind(sock *s) +{ + int y = 1; + + if (sk_is_ipv4(s)) + if (setsockopt(s->fd, SOL_IP, IP_FREEBIND, &y, sizeof(y)) < 0) + ERR("IP_FREEBIND"); + + if (sk_is_ipv6(s)) + if (setsockopt(s->fd, SOL_IPV6, IPV6_FREEBIND, &y, sizeof(y)) < 0) + ERR("IPV6_FREEBIND"); + + return 0; +} diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index 3d67d0a7..4fd77453 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ 
-1436,6 +1436,10 @@ sk_open(sock *s) if (sk_set_high_port(s) < 0) log(L_WARN "Socket error: %s%#m", s->err); + if (s->flags & SKF_FREEBIND) + if (sk_set_freebind(s) < 0) + log(L_WARN "Socket error: %s%#m", s->err); + sockaddr_fill(&sa, s->af, bind_addr, s->iface, bind_port); if (bind(fd, &sa.sa, SA_LEN(sa)) < 0) ERR2("bind"); -- cgit v1.2.3 From d0dd1d20cd40e75e417d58569fac3ff0bf1db41a Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Fri, 14 Jan 2022 19:07:57 +0100 Subject: Netlink: Explicitly skip received cloned routes Kernel uses cloned routes to keep route cache entries, but reports them together with regular routes. They were skipped implicitly as they do not have rtm_protocol filled. Add explicit check for cloned flag and skip such routes explicitly. Also, improve debug logs of skipped routes. --- sysdep/linux/netlink.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'sysdep/linux') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index e127052a..7cea5322 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -1535,7 +1535,8 @@ nl_parse_end(struct nl_parse_state *s) } -#define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0) +#define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0) +#define SKIP(ARG, ...) 
do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0) static void nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) @@ -1588,10 +1589,10 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) return; if (!a[RTA_DST]) - SKIP("MPLS route without RTA_DST"); + SKIP0("MPLS route without RTA_DST\n"); if (rta_get_mpls(a[RTA_DST], rta_mpls_stack) != 1) - SKIP("MPLS route with multi-label RTA_DST"); + SKIP0("MPLS route with multi-label RTA_DST\n"); net_fill_mpls(&dst, rta_mpls_stack[0]); break; @@ -1609,6 +1610,9 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) else table_id = i->rtm_table; + if (i->rtm_flags & RTM_F_CLONED) + SKIP("cloned\n"); + /* Do we know this table? */ p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id); if (!p) -- cgit v1.2.3 From e818f16448e918ed07633480291283f3449dd9e4 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Fri, 14 Jan 2022 21:53:40 +0100 Subject: Netlink: Enable strict checking for KRT dumps Add strict checking for netlink KRT dumps to avoid receiving PMTU cache records from the FNHE table dump along with KRT. Linux Kernel added FNHE table dump to the netlink API in patch: https://patchwork.ozlabs.org/project/netdev/patch/8d3b68cd37fb5fddc470904cdd6793fcf480c6c1.1561131177.git.sbrivio@redhat.com/ Therefore, since Linux 5.3 these route cache entries are dumped together with regular routes during periodic KRT scans, which in some cases may be a huge amount of useless data. This can be avoided by using strict checking for netlink dumps: https://lore.kernel.org/netdev/20181008031644.15989-1-dsahern@kernel.org/ The patch mitigates the risk of receiving an unknown and potentially large number of FNHE records that would block BIRD I/O in each sync. There is a known issue caused by the GRE tunnels on Linux that seem to be creating one FNHE record for each destination IP address that is routed through the tunnel, even when the PMTU is equal to the GRE interface MTU. 
Thanks to Tomas Hlavacek for the original patch. --- sysdep/linux/netlink.c | 70 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 11 deletions(-) (limited to 'sysdep/linux') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 7cea5322..71b290fd 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -130,7 +130,7 @@ struct nl_sock uint last_size; }; -#define NL_RX_SIZE 8192 +#define NL_RX_SIZE 32768 #define NL_OP_DELETE 0 #define NL_OP_ADD (NLM_F_CREATE|NLM_F_EXCL) @@ -157,11 +157,19 @@ nl_open_sock(struct nl_sock *nl) } } +static void +nl_set_strict_dump(struct nl_sock *nl, int strict) +{ + setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +} + static void nl_open(void) { nl_open_sock(&nl_scan); nl_open_sock(&nl_req); + + nl_set_strict_dump(&nl_scan, 1); } static void @@ -180,20 +188,60 @@ nl_send(struct nl_sock *nl, struct nlmsghdr *nh) } static void -nl_request_dump(int af, int cmd) +nl_request_dump_link(void) { struct { struct nlmsghdr nh; - struct rtgenmsg g; + struct ifinfomsg ifi; } req = { - .nh.nlmsg_type = cmd, - .nh.nlmsg_len = sizeof(req), + .nh.nlmsg_type = RTM_GETLINK, + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, - .g.rtgen_family = af + .nh.nlmsg_seq = ++(nl_scan.seq), + .ifi.ifi_family = AF_UNSPEC, }; - nl_send(&nl_scan, &req.nh); + + send(nl_scan.fd, &req, sizeof(req), 0); + nl_scan.last_hdr = NULL; } +static void +nl_request_dump_addr(int af) +{ + struct { + struct nlmsghdr nh; + struct ifaddrmsg ifa; + } req = { + .nh.nlmsg_type = RTM_GETADDR, + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nh.nlmsg_seq = ++(nl_scan.seq), + .ifa.ifa_family = af, + }; + + send(nl_scan.fd, &req, sizeof(req), 0); + nl_scan.last_hdr = NULL; +} + +static void +nl_request_dump_route(int af) +{ + struct { + struct nlmsghdr nh; + struct rtmsg rtm; + } req = { 
+ .nh.nlmsg_type = RTM_GETROUTE, + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), + .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nh.nlmsg_seq = ++(nl_scan.seq), + .rtm.rtm_family = af, + }; + + send(nl_scan.fd, &req, sizeof(req), 0); + nl_scan.last_hdr = NULL; +} + + static struct nlmsghdr * nl_get_reply(struct nl_sock *nl) { @@ -1151,7 +1199,7 @@ kif_do_scan(struct kif_proto *p UNUSED) if_start_update(); - nl_request_dump(AF_UNSPEC, RTM_GETLINK); + nl_request_dump_link(); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWLINK || h->nlmsg_type == RTM_DELLINK) nl_parse_link(h, 1); @@ -1178,14 +1226,14 @@ kif_do_scan(struct kif_proto *p UNUSED) } } - nl_request_dump(AF_INET, RTM_GETADDR); + nl_request_dump_addr(AF_INET); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR) nl_parse_addr(h, 1); else log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type); - nl_request_dump(AF_INET6, RTM_GETADDR); + nl_request_dump_addr(AF_INET6); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR) nl_parse_addr(h, 1); @@ -1902,7 +1950,7 @@ krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NUL struct nl_parse_state s; nl_parse_begin(&s, 1); - nl_request_dump(AF_UNSPEC, RTM_GETROUTE); + nl_request_dump_route(AF_UNSPEC); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) nl_parse_route(&s, h); -- cgit v1.2.3 From 8988264a64dc9985303332568832b108dba3acd3 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Fri, 14 Jan 2022 23:15:05 +0100 Subject: Netlink: Add workaround for older kernel headers --- sysdep/linux/netlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'sysdep/linux') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 71b290fd..27b1a617 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -69,6 +69,10 @@ #define RTA_ENCAP 22 #endif +#ifndef 
NETLINK_GET_STRICT_CHK +#define NETLINK_GET_STRICT_CHK 12 +#endif + #define krt_ipv4(p) ((p)->af == AF_INET) #define krt_ecmp6(p) ((p)->af == AF_INET6) -- cgit v1.2.3 From bbc33f6ec310d98b9100fb883a2b8908ede1b5a8 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Sat, 15 Jan 2022 22:39:40 +0100 Subject: Netlink: Add another workaround for older kernel headers Unfortunately, SOL_NETLINK is both recently added and arch-dependent, so we cannot just define it. --- sysdep/linux/netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'sysdep/linux') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 27b1a617..ccd62f26 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -164,7 +164,14 @@ nl_open_sock(struct nl_sock *nl) static void nl_set_strict_dump(struct nl_sock *nl, int strict) { + /* + * Strict checking is not necessary, it improves behavior on newer kernels. + * If it is not available (missing SOL_NETLINK compile-time, or ENOPROTOOPT + * run-time), we can just ignore it. + */ +#ifdef SOL_NETLINK setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +#endif } static void -- cgit v1.2.3 From 81ee6cda2e60bbd3d97ab63da30657a54b09feda Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Mon, 17 Jan 2022 05:11:29 +0100 Subject: Netlink: Add option to specify netlink socket receive buffer size Add option 'netlink rx buffer' to specify netlink socket receive buffer size. Uses SO_RCVBUFFORCE, so it can override rmem_max limit. Thanks to Trisha Biswas and Michal for the original patches. 
--- doc/bird.sgml | 6 + sysdep/linux/krt-sys.h | 1 + sysdep/linux/netlink.Y | 4 +- sysdep/linux/netlink.c | 54 ++ sysdep/linux/netlink.c.orig | 2179 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 2243 insertions(+), 1 deletion(-) create mode 100644 sysdep/linux/netlink.c.orig (limited to 'sysdep/linux') diff --git a/doc/bird.sgml b/doc/bird.sgml index 0112622e..f10b15e2 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -3248,6 +3248,12 @@ channels. allows to specify a limit on maximal number of nexthops in one route. By default, multipath merging is disabled. If enabled, default value of the limit is 16. + +