From d0dd1d20cd40e75e417d58569fac3ff0bf1db41a Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Fri, 14 Jan 2022 19:07:57 +0100 Subject: Netlink: Explicitly skip received cloned routes Kernel uses cloned routes to keep route cache entries, but reports them together with regular routes. They were skipped implicitly as they do not have rtm_protocol filled. Add explicit check for cloned flag and skip such routes explicitly. Also, improve debug logs of skipped routes. --- sysdep/linux/netlink.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'sysdep/linux/netlink.c') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index e127052a..7cea5322 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -1535,7 +1535,8 @@ nl_parse_end(struct nl_parse_state *s) } -#define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0) +#define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0) +#define SKIP(ARG, ...) do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0) static void nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) @@ -1588,10 +1589,10 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) return; if (!a[RTA_DST]) - SKIP("MPLS route without RTA_DST"); + SKIP0("MPLS route without RTA_DST\n"); if (rta_get_mpls(a[RTA_DST], rta_mpls_stack) != 1) - SKIP("MPLS route with multi-label RTA_DST"); + SKIP0("MPLS route with multi-label RTA_DST\n"); net_fill_mpls(&dst, rta_mpls_stack[0]); break; @@ -1609,6 +1610,9 @@ nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h) else table_id = i->rtm_table; + if (i->rtm_flags & RTM_F_CLONED) + SKIP("cloned\n"); + /* Do we know this table? 
*/ p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id); if (!p) -- cgit v1.2.3 From e818f16448e918ed07633480291283f3449dd9e4 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Fri, 14 Jan 2022 21:53:40 +0100 Subject: Netlink: Enable strict checking for KRT dumps Add strict checking for netlink KRT dumps to avoid receiving PMTU cache records from the FNHE table dump along with the KRT. The Linux kernel added the FNHE table dump to the netlink API in patch: https://patchwork.ozlabs.org/project/netdev/patch/8d3b68cd37fb5fddc470904cdd6793fcf480c6c1.1561131177.git.sbrivio@redhat.com/ Therefore, since Linux 5.3 these route cache entries are dumped together with regular routes during periodic KRT scans, which in some cases may be a huge amount of useless data. This can be avoided by using strict checking for netlink dumps: https://lore.kernel.org/netdev/20181008031644.15989-1-dsahern@kernel.org/ The patch mitigates the risk of receiving an unknown and potentially large number of FNHE records that would block BIRD I/O in each sync. There is a known issue caused by GRE tunnels on Linux, which seem to create one FNHE record for each destination IP address that is routed through the tunnel, even when the PMTU equals the GRE interface MTU. Thanks to Tomas Hlavacek for the original patch. 
--- sysdep/linux/netlink.c | 70 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 11 deletions(-) (limited to 'sysdep/linux/netlink.c') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 7cea5322..71b290fd 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -130,7 +130,7 @@ struct nl_sock uint last_size; }; -#define NL_RX_SIZE 8192 +#define NL_RX_SIZE 32768 #define NL_OP_DELETE 0 #define NL_OP_ADD (NLM_F_CREATE|NLM_F_EXCL) @@ -157,11 +157,19 @@ nl_open_sock(struct nl_sock *nl) } } +static void +nl_set_strict_dump(struct nl_sock *nl, int strict) +{ + setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +} + static void nl_open(void) { nl_open_sock(&nl_scan); nl_open_sock(&nl_req); + + nl_set_strict_dump(&nl_scan, 1); } static void @@ -180,20 +188,60 @@ nl_send(struct nl_sock *nl, struct nlmsghdr *nh) } static void -nl_request_dump(int af, int cmd) +nl_request_dump_link(void) { struct { struct nlmsghdr nh; - struct rtgenmsg g; + struct ifinfomsg ifi; } req = { - .nh.nlmsg_type = cmd, - .nh.nlmsg_len = sizeof(req), + .nh.nlmsg_type = RTM_GETLINK, + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)), .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, - .g.rtgen_family = af + .nh.nlmsg_seq = ++(nl_scan.seq), + .ifi.ifi_family = AF_UNSPEC, }; - nl_send(&nl_scan, &req.nh); + + send(nl_scan.fd, &req, sizeof(req), 0); + nl_scan.last_hdr = NULL; } +static void +nl_request_dump_addr(int af) +{ + struct { + struct nlmsghdr nh; + struct ifaddrmsg ifa; + } req = { + .nh.nlmsg_type = RTM_GETADDR, + .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nh.nlmsg_seq = ++(nl_scan.seq), + .ifa.ifa_family = af, + }; + + send(nl_scan.fd, &req, sizeof(req), 0); + nl_scan.last_hdr = NULL; +} + +static void +nl_request_dump_route(int af) +{ + struct { + struct nlmsghdr nh; + struct rtmsg rtm; + } req = { + .nh.nlmsg_type = RTM_GETROUTE, + 
.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)), + .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP, + .nh.nlmsg_seq = ++(nl_scan.seq), + .rtm.rtm_family = af, + }; + + send(nl_scan.fd, &req, sizeof(req), 0); + nl_scan.last_hdr = NULL; +} + + static struct nlmsghdr * nl_get_reply(struct nl_sock *nl) { @@ -1151,7 +1199,7 @@ kif_do_scan(struct kif_proto *p UNUSED) if_start_update(); - nl_request_dump(AF_UNSPEC, RTM_GETLINK); + nl_request_dump_link(); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWLINK || h->nlmsg_type == RTM_DELLINK) nl_parse_link(h, 1); @@ -1178,14 +1226,14 @@ kif_do_scan(struct kif_proto *p UNUSED) } } - nl_request_dump(AF_INET, RTM_GETADDR); + nl_request_dump_addr(AF_INET); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR) nl_parse_addr(h, 1); else log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type); - nl_request_dump(AF_INET6, RTM_GETADDR); + nl_request_dump_addr(AF_INET6); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR) nl_parse_addr(h, 1); @@ -1902,7 +1950,7 @@ krt_do_scan(struct krt_proto *p UNUSED) /* CONFIG_ALL_TABLES_AT_ONCE => p is NUL struct nl_parse_state s; nl_parse_begin(&s, 1); - nl_request_dump(AF_UNSPEC, RTM_GETROUTE); + nl_request_dump_route(AF_UNSPEC); while (h = nl_get_scan()) if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE) nl_parse_route(&s, h); -- cgit v1.2.3 From 8988264a64dc9985303332568832b108dba3acd3 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Fri, 14 Jan 2022 23:15:05 +0100 Subject: Netlink: Add workaround for older kernel headers --- sysdep/linux/netlink.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'sysdep/linux/netlink.c') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 71b290fd..27b1a617 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -69,6 +69,10 @@ #define RTA_ENCAP 22 #endif +#ifndef NETLINK_GET_STRICT_CHK +#define 
NETLINK_GET_STRICT_CHK 12 +#endif + #define krt_ipv4(p) ((p)->af == AF_INET) #define krt_ecmp6(p) ((p)->af == AF_INET6) -- cgit v1.2.3 From bbc33f6ec310d98b9100fb883a2b8908ede1b5a8 Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Sat, 15 Jan 2022 22:39:40 +0100 Subject: Netlink: Add another workaround for older kernel headers Unfortunately, SOL_NETLINK is both recently added and arch-dependent, so we cannot just define it. --- sysdep/linux/netlink.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'sysdep/linux/netlink.c') diff --git a/sysdep/linux/netlink.c b/sysdep/linux/netlink.c index 27b1a617..ccd62f26 100644 --- a/sysdep/linux/netlink.c +++ b/sysdep/linux/netlink.c @@ -164,7 +164,14 @@ nl_open_sock(struct nl_sock *nl) static void nl_set_strict_dump(struct nl_sock *nl, int strict) { + /* + * Strict checking is not necessary, it improves behavior on newer kernels. + * If it is not available (missing SOL_NETLINK compile-time, or ENOPROTOOPT + * run-time), we can just ignore it. + */ +#ifdef SOL_NETLINK setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict)); +#endif } static void -- cgit v1.2.3 From 81ee6cda2e60bbd3d97ab63da30657a54b09feda Mon Sep 17 00:00:00 2001 From: "Ondrej Zajicek (work)" Date: Mon, 17 Jan 2022 05:11:29 +0100 Subject: Netlink: Add option to specify netlink socket receive buffer size Add option 'netlink rx buffer' to specify netlink socket receive buffer size. Uses SO_RCVBUFFORCE, so it can override rmem_max limit. Thanks to Trisha Biswas and Michal for the original patches. 
--- doc/bird.sgml | 6 + sysdep/linux/krt-sys.h | 1 + sysdep/linux/netlink.Y | 4 +- sysdep/linux/netlink.c | 54 ++ sysdep/linux/netlink.c.orig | 2179 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 2243 insertions(+), 1 deletion(-) create mode 100644 sysdep/linux/netlink.c.orig (limited to 'sysdep/linux/netlink.c') diff --git a/doc/bird.sgml b/doc/bird.sgml index 0112622e..f10b15e2 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -3248,6 +3248,12 @@ channels. allows to specify a limit on maximal number of nexthops in one route. By default, multipath merging is disabled. If enabled, default value of the limit is 16. + +