diff options
Diffstat (limited to 'proto/bgp/packets.c')
-rw-r--r-- | proto/bgp/packets.c | 2920 |
1 files changed, 1774 insertions, 1146 deletions
diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index 3e816839..385d5a36 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -2,12 +2,16 @@ * BIRD -- BGP Packet Processing * * (c) 2000 Martin Mares <mj@ucw.cz> + * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2008--2016 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ #undef LOCAL_DEBUG +#include <stdlib.h> + #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" @@ -16,6 +20,7 @@ #include "nest/mrtdump.h" #include "conf/conf.h" #include "lib/unaligned.h" +#include "lib/flowspec.h" #include "lib/socket.h" #include "nest/cli.h" @@ -38,6 +43,46 @@ static byte fsm_err_subcode[BS_MAX] = { [BS_ESTABLISHED] = 3 }; + +static struct bgp_channel * +bgp_get_channel(struct bgp_proto *p, u32 afi) +{ + uint i; + + for (i = 0; i < p->channel_count; i++) + if (p->afi_map[i] == afi) + return p->channel_map[i]; + + return NULL; +} + +static inline void +put_af3(byte *buf, u32 id) +{ + put_u16(buf, id >> 16); + buf[2] = id & 0xff; +} + +static inline void +put_af4(byte *buf, u32 id) +{ + put_u16(buf, id >> 16); + buf[2] = 0; + buf[3] = id & 0xff; +} + +static inline u32 +get_af3(byte *buf) +{ + return (get_u16(buf) << 16) | buf[2]; +} + +static inline u32 +get_af4(byte *buf) +{ + return (get_u16(buf) << 16) | buf[3]; +} + /* * MRT Dump format is not semantically specified. * We will use these values in appropriate fields: @@ -58,31 +103,41 @@ static byte * mrt_put_bgp4_hdr(byte *buf, struct bgp_conn *conn, int as4) { struct bgp_proto *p = conn->bgp; + uint v4 = ipa_is_ip4(p->cf->remote_ip); if (as4) - { - put_u32(buf+0, p->remote_as); - put_u32(buf+4, p->local_as); - buf+=8; - } + { + put_u32(buf+0, p->remote_as); + put_u32(buf+4, p->public_as); + buf+=8; + } else - { - put_u16(buf+0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS); - put_u16(buf+2, (p->local_as <= 0xFFFF) ? p->local_as : AS_TRANS); - buf+=4; - } + { + put_u16(buf+0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS); + put_u16(buf+2, (p->public_as <= 0xFFFF) ? p->public_as : AS_TRANS); + buf+=4; + } put_u16(buf+0, (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0); - put_u16(buf+2, BGP_AF); + put_u16(buf+2, v4 ? BGP_AFI_IPV4 : BGP_AFI_IPV6); buf+=4; - buf = put_ipa(buf, conn->sk ? conn->sk->daddr : IPA_NONE); - buf = put_ipa(buf, conn->sk ? conn->sk->saddr : IPA_NONE); + + if (v4) + { + buf = put_ip4(buf, conn->sk ? ipa_to_ip4(conn->sk->daddr) : IP4_NONE); + buf = put_ip4(buf, conn->sk ? ipa_to_ip4(conn->sk->saddr) : IP4_NONE); + } + else + { + buf = put_ip6(buf, conn->sk ? ipa_to_ip6(conn->sk->daddr) : IP6_NONE); + buf = put_ip6(buf, conn->sk ? ipa_to_ip6(conn->sk->saddr) : IP6_NONE); + } return buf; } static void -mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len) +mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, uint len) { byte *buf = alloca(128+len); /* 128 is enough for MRT headers */ byte *bp = buf + MRTDUMP_HDR_LENGTH; @@ -96,14 +151,14 @@ mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len) } static inline u16 -convert_state(unsigned state) +convert_state(uint state) { /* Convert state from our BS_* values to values used in MRTDump */ return (state == BS_CLOSE) ? 1 : state + 1; } void -mrt_dump_bgp_state_change(struct bgp_conn *conn, unsigned old, unsigned new) +mrt_dump_bgp_state_change(struct bgp_conn *conn, uint old, uint new) { byte buf[128]; byte *bp = buf + MRTDUMP_HDR_LENGTH; @@ -127,1298 +182,1947 @@ bgp_create_notification(struct bgp_conn *conn, byte *buf) return buf + 2 + conn->notify_size; } -#ifdef IPV6 -static byte * -bgp_put_cap_ipv6(struct bgp_proto *p UNUSED, byte *buf) -{ - *buf++ = 1; /* Capability 1: Multiprotocol extensions */ - *buf++ = 4; /* Capability data length */ - *buf++ = 0; /* We support AF IPv6 */ - *buf++ = BGP_AF_IPV6; - *buf++ = 0; /* RFU */ - *buf++ = 1; /* and SAFI 1 */ - return buf; -} -#else +/* Capability negotiation as per RFC 5492 */ -static byte * -bgp_put_cap_ipv4(struct bgp_proto *p UNUSED, byte *buf) -{ - *buf++ = 1; /* Capability 1: Multiprotocol extensions */ - *buf++ = 4; /* Capability data length */ - *buf++ = 0; /* We support AF IPv4 */ - *buf++ = BGP_AF_IPV4; - *buf++ = 0; /* RFU */ - *buf++ = 1; /* and SAFI 1 */ - return buf; +const struct bgp_af_caps * +bgp_find_af_caps(struct bgp_caps *caps, u32 afi) +{ + struct bgp_af_caps *ac; + + WALK_AF_CAPS(caps, ac) + if (ac->afi == afi) + return ac; + + return NULL; } -#endif -static byte * -bgp_put_cap_rr(struct bgp_proto *p UNUSED, byte *buf) +static struct bgp_af_caps * +bgp_get_af_caps(struct bgp_caps *caps, u32 afi) { - *buf++ = 2; /* Capability 2: Support for route refresh */ - *buf++ = 0; /* Capability data length */ - return buf; + struct bgp_af_caps *ac; + + WALK_AF_CAPS(caps, ac) + if (ac->afi == afi) + return ac; + + ac = &caps->af_data[caps->af_count++]; + memset(ac, 0, sizeof(struct bgp_af_caps)); + ac->afi = afi; + + return ac; } -static byte * -bgp_put_cap_ext_msg(struct bgp_proto *p UNUSED, byte *buf) +static int +bgp_af_caps_cmp(const void *X, const void *Y) { - *buf++ = 6; /* Capability 6: Support for extended messages */ - *buf++ = 0; /* Capability data length */ - return buf; + const struct bgp_af_caps *x = X, *y = Y; + return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 1 : 0; } + static byte * -bgp_put_cap_gr1(struct bgp_proto *p, byte *buf) +bgp_write_capabilities(struct bgp_conn *conn, byte *buf) { - *buf++ = 64; /* Capability 64: Support for graceful restart */ - *buf++ = 6; /* Capability data length */ + struct bgp_proto *p = conn->bgp; + struct bgp_channel *c; + struct bgp_caps *caps; + struct bgp_af_caps *ac; + uint any_ext_next_hop = 0; + uint any_add_path = 0; + byte *data; - put_u16(buf, p->cf->gr_time); - if (p->p.gr_recovery) - buf[0] |= BGP_GRF_RESTART; - buf += 2; + /* Prepare bgp_caps structure */ + + int n = list_length(&p->p.channels); + caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + n * sizeof(struct bgp_af_caps)); + conn->local_caps = caps; + + caps->as4_support = p->cf->enable_as4; + caps->ext_messages = p->cf->enable_extended_messages; + caps->route_refresh = p->cf->enable_refresh; + caps->enhanced_refresh = p->cf->enable_refresh; + + if (caps->as4_support) + caps->as4_number = p->public_as; + + if (p->cf->gr_mode) + { + caps->gr_aware = 1; + caps->gr_time = p->cf->gr_time; + caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0; + } + + /* Allocate and fill per-AF fields */ + WALK_LIST(c, p->p.channels) + { + ac = &caps->af_data[caps->af_count++]; + ac->afi = c->afi; + ac->ready = 1; - *buf++ = 0; /* Appropriate AF */ - *buf++ = BGP_AF; - *buf++ = 1; /* and SAFI 1 */ - *buf++ = p->p.gr_recovery ? BGP_GRF_FORWARDING : 0; + ac->ext_next_hop = bgp_channel_is_ipv4(c) && c->cf->ext_next_hop; + any_ext_next_hop |= ac->ext_next_hop; + + ac->add_path = c->cf->add_path; + any_add_path |= ac->add_path; + + if (c->cf->gr_able) + { + ac->gr_able = 1; + + if (p->p.gr_recovery) + ac->gr_af_flags |= BGP_GRF_FORWARDING; + } + } + + /* Sort capability fields by AFI/SAFI */ + qsort(caps->af_data, caps->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp); + + + /* Create capability list in buffer */ + + /* + * Note that max length is ~ 20+14*af_count. With max 6 channels that is + * 104. Option limit is 253 and buffer size is 4096, so we cannot overflow + * unless we add new capabilities or more AFs. + */ + + WALK_AF_CAPS(caps, ac) + if (ac->ready) + { + *buf++ = 1; /* Capability 1: Multiprotocol extensions */ + *buf++ = 4; /* Capability data length */ + put_af4(buf, ac->afi); + buf += 4; + } + + if (caps->route_refresh) + { + *buf++ = 2; /* Capability 2: Support for route refresh */ + *buf++ = 0; /* Capability data length */ + } + + if (any_ext_next_hop) + { + *buf++ = 5; /* Capability 5: Support for extended next hop */ + *buf++ = 0; /* Capability data length, will be fixed later */ + data = buf; + + WALK_AF_CAPS(caps, ac) + if (ac->ext_next_hop) + { + put_af4(buf, ac->afi); + put_u16(buf+4, BGP_AFI_IPV6); + buf += 6; + } + + data[-1] = buf - data; + } + + if (caps->ext_messages) + { + *buf++ = 6; /* Capability 6: Support for extended messages */ + *buf++ = 0; /* Capability data length */ + } + + if (caps->gr_aware) + { + *buf++ = 64; /* Capability 64: Support for graceful restart */ + *buf++ = 0; /* Capability data length, will be fixed later */ + data = buf; + + put_u16(buf, caps->gr_time); + buf[0] |= caps->gr_flags; + buf += 2; + + WALK_AF_CAPS(caps, ac) + if (ac->gr_able) + { + put_af3(buf, ac->afi); + buf[3] = ac->gr_af_flags; + buf += 4; + } + + data[-1] = buf - data; + } + + if (caps->as4_support) + { + *buf++ = 65; /* Capability 65: Support for 4-octet AS number */ + *buf++ = 4; /* Capability data length */ + put_u32(buf, p->public_as); + buf += 4; + } + + if (any_add_path) + { + *buf++ = 69; /* Capability 69: Support for ADD-PATH */ + *buf++ = 0; /* Capability data length, will be fixed later */ + data = buf; + + WALK_AF_CAPS(caps, ac) + if (ac->add_path) + { + put_af3(buf, ac->afi); + buf[3] = ac->add_path; + buf += 4; + } + + data[-1] = buf - data; + } + + if (caps->enhanced_refresh) + { + *buf++ = 70; /* Capability 70: Support for enhanced route refresh */ + *buf++ = 0; /* Capability data length */ + } return buf; } -static byte * -bgp_put_cap_gr2(struct bgp_proto *p UNUSED, byte *buf) +static void +bgp_read_capabilities(struct bgp_conn *conn, struct bgp_caps *caps, byte *pos, int len) { - *buf++ = 64; /* Capability 64: Support for graceful restart */ - *buf++ = 2; /* Capability data length */ - put_u16(buf, 0); - return buf + 2; -} + struct bgp_proto *p = conn->bgp; + struct bgp_af_caps *ac; + int i, cl; + u32 af; -static byte * -bgp_put_cap_as4(struct bgp_proto *p, byte *buf) -{ - *buf++ = 65; /* Capability 65: Support for 4-octet AS number */ - *buf++ = 4; /* Capability data length */ - put_u32(buf, p->local_as); - return buf + 4; -} + while (len > 0) + { + if (len < 2 || len < (2 + pos[1])) + goto err; -static byte * -bgp_put_cap_add_path(struct bgp_proto *p, byte *buf) -{ - *buf++ = 69; /* Capability 69: Support for ADD-PATH */ - *buf++ = 4; /* Capability data length */ + /* Capability length */ + cl = pos[1]; - *buf++ = 0; /* Appropriate AF */ - *buf++ = BGP_AF; - *buf++ = 1; /* SAFI 1 */ + /* Capability type */ + switch (pos[0]) + { + case 1: /* Multiprotocol capability, RFC 4760 */ + if (cl != 4) + goto err; - *buf++ = p->cf->add_path; + af = get_af4(pos+2); + ac = bgp_get_af_caps(caps, af); + ac->ready = 1; + break; - return buf; + case 2: /* Route refresh capability, RFC 2918 */ + if (cl != 0) + goto err; + + caps->route_refresh = 1; + break; + + case 5: /* Extended next hop encoding capability, RFC 5549 */ + if (cl % 6) + goto err; + + for (i = 0; i < cl; i += 6) + { + /* Specified only for IPv4 prefixes with IPv6 next hops */ + if ((get_u16(pos+2+i+0) != BGP_AFI_IPV4) || + (get_u16(pos+2+i+4) != BGP_AFI_IPV6)) + continue; + + af = get_af4(pos+2+i); + ac = bgp_get_af_caps(caps, af); + ac->ext_next_hop = 1; + } + break; + + case 6: /* Extended message length capability, RFC draft */ + if (cl != 0) + goto err; + + caps->ext_messages = 1; + break; + + case 64: /* Graceful restart capability, RFC 4724 */ + if (cl % 4 != 2) + goto err; + + /* Only the last instance is valid */ + WALK_AF_CAPS(caps, ac) + { + ac->gr_able = 0; + ac->gr_af_flags = 0; + } + + caps->gr_aware = 1; + caps->gr_flags = pos[2] & 0xf0; + caps->gr_time = get_u16(pos + 2) & 0x0fff; + + for (i = 2; i < cl; i += 4) + { + af = get_af3(pos+2+i); + ac = bgp_get_af_caps(caps, af); + ac->gr_able = 1; + ac->gr_af_flags = pos[2+i+3]; + } + break; + + case 65: /* AS4 capability, RFC 4893 */ + if (cl != 4) + goto err; + + caps->as4_support = 1; + caps->as4_number = get_u32(pos + 2); + break; + + case 69: /* ADD-PATH capability, RFC 7911 */ + if (cl % 4) + goto err; + + for (i = 0; i < cl; i += 4) + { + byte val = pos[2+i+3]; + if (!val || (val > BGP_ADD_PATH_FULL)) + { + log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring", + p->p.name, val); + break; + } + } + + for (i = 0; i < cl; i += 4) + { + af = get_af3(pos+2+i); + ac = bgp_get_af_caps(caps, af); + ac->add_path = pos[2+i+3]; + } + break; + + case 70: /* Enhanced route refresh capability, RFC 7313 */ + if (cl != 0) + goto err; + + caps->enhanced_refresh = 1; + break; + + /* We can safely ignore all other capabilities */ + } + + ADVANCE(pos, len, 2 + cl); + } + return; + +err: + bgp_error(conn, 2, 0, NULL, 0); + return; } -static byte * -bgp_put_cap_err(struct bgp_proto *p UNUSED, byte *buf) +static int +bgp_read_options(struct bgp_conn *conn, byte *pos, int len) { - *buf++ = 70; /* Capability 70: Support for enhanced route refresh */ - *buf++ = 0; /* Capability data length */ - return buf; -} + struct bgp_proto *p = conn->bgp; + struct bgp_caps *caps; + int ol; + + /* Max number of announced AFIs is limited by max option length (255) */ + caps = alloca(sizeof(struct bgp_caps) + 64 * sizeof(struct bgp_af_caps)); + memset(caps, 0, sizeof(struct bgp_caps)); + + while (len > 0) + { + if ((len < 2) || (len < (2 + pos[1]))) + { bgp_error(conn, 2, 0, NULL, 0); return -1; } + + ol = pos[1]; + if (pos[0] == 2) + { + /* BGP capabilities, RFC 5492 */ + if (p->cf->capabilities) + bgp_read_capabilities(conn, caps, pos + 2, ol); + } + else + { + /* Unknown option */ + bgp_error(conn, 2, 4, pos, ol); /* FIXME: ol or ol+2 ? */ + return -1; + } + + ADVANCE(pos, len, 2 + ol); + } + + uint n = sizeof(struct bgp_caps) + caps->af_count * sizeof(struct bgp_af_caps); + conn->remote_caps = mb_allocz(p->p.pool, n); + memcpy(conn->remote_caps, caps, n); + return 0; +} static byte * bgp_create_open(struct bgp_conn *conn, byte *buf) { struct bgp_proto *p = conn->bgp; - byte *cap; - int cap_len; BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)", - BGP_VERSION, p->local_as, p->cf->hold_time, p->local_id); + BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id); + buf[0] = BGP_VERSION; - put_u16(buf+1, (p->local_as < 0xFFFF) ? p->local_as : AS_TRANS); + put_u16(buf+1, (p->public_as < 0xFFFF) ? p->public_as : AS_TRANS); put_u16(buf+3, p->cf->hold_time); put_u32(buf+5, p->local_id); - if (conn->start_state == BSS_CONNECT_NOCAP) - { - BGP_TRACE(D_PACKETS, "Skipping capabilities"); - buf[9] = 0; - return buf + 10; - } + if (p->cf->capabilities) + { + /* Prepare local_caps and write capabilities to buffer */ + byte *end = bgp_write_capabilities(conn, buf+12); + uint len = end - (buf+12); - /* Skipped 3 B for length field and Capabilities parameter header */ - cap = buf + 12; + buf[9] = len + 2; /* Optional parameters length */ + buf[10] = 2; /* Option 2: Capability list */ + buf[11] = len; /* Option data length */ -#ifndef IPV6 - if (p->cf->advertise_ipv4) - cap = bgp_put_cap_ipv4(p, cap); -#endif + return end; + } + else + { + /* Prepare empty local_caps */ + conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps)); -#ifdef IPV6 - cap = bgp_put_cap_ipv6(p, cap); -#endif + buf[9] = 0; /* No optional parameters */ + return buf + 10; + } - if (p->cf->enable_refresh) - cap = bgp_put_cap_rr(p, cap); + return buf; +} - if (p->cf->gr_mode == BGP_GR_ABLE) - cap = bgp_put_cap_gr1(p, cap); - else if (p->cf->gr_mode == BGP_GR_AWARE) - cap = bgp_put_cap_gr2(p, cap); +static void +bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) +{ + struct bgp_proto *p = conn->bgp; + struct bgp_conn *other; + u32 asn, hold, id; - if (p->cf->enable_as4) - cap = bgp_put_cap_as4(p, cap); + /* Check state */ + if (conn->state != BS_OPENSENT) + { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - if (p->cf->add_path) - cap = bgp_put_cap_add_path(p, cap); + /* Check message contents */ + if (len < 29 || len != 29 + (uint) pkt[28]) + { bgp_error(conn, 1, 2, pkt+16, 2); return; } - if (p->cf->enable_refresh) - cap = bgp_put_cap_err(p, cap); + if (pkt[19] != BGP_VERSION) + { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; } - if (p->cf->enable_extended_messages) - cap = bgp_put_cap_ext_msg(p, cap); + asn = get_u16(pkt+20); + hold = get_u16(pkt+22); + id = get_u32(pkt+24); + BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id); - cap_len = cap - buf - 12; - if (cap_len > 0) - { - buf[9] = cap_len + 2; /* Optional params len */ - buf[10] = 2; /* Option: Capability list */ - buf[11] = cap_len; /* Option length */ - return cap; - } + if (bgp_read_options(conn, pkt+29, pkt[28]) < 0) + return; + + if (hold > 0 && hold < 3) + { bgp_error(conn, 2, 6, pkt+22, 2); return; } + + /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */ + if (!id || (p->is_internal && id == p->local_id)) + { bgp_error(conn, 2, 3, pkt+24, -4); return; } + + struct bgp_caps *caps = conn->remote_caps; + + if (caps->as4_support) + { + u32 as4 = caps->as4_number; + + if ((as4 != asn) && (asn != AS_TRANS)) + log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name); + + if (as4 != p->remote_as) + { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; } + } else + { + if (asn != p->remote_as) + { bgp_error(conn, 2, 2, pkt+20, 2); return; } + } + + /* Check the other connection */ + other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; + switch (other->state) + { + case BS_CONNECT: + case BS_ACTIVE: + /* Stop outgoing connection attempts */ + bgp_conn_enter_idle_state(other); + break; + + case BS_IDLE: + case BS_OPENSENT: + case BS_CLOSE: + break; + + case BS_OPENCONFIRM: + /* + * Description of collision detection rules in RFC 4271 is confusing and + * contradictory, but it is essentially: + * + * 1. Router with higher ID is dominant + * 2. If both have the same ID, router with higher ASN is dominant [RFC6286] + * 3. When both connections are in OpenConfirm state, one initiated by + * the dominant router is kept. + * + * The first line in the expression below evaluates whether the neighbor + * is dominant, the second line whether the new connection was initiated + * by the neighbor. If both are true (or both are false), we keep the new + * connection, otherwise we keep the old one. + */ + if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as))) + == (conn == &p->incoming_conn)) { - buf[9] = 0; /* No optional parameters */ - return buf + 10; + /* Should close the other connection */ + BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection"); + bgp_error(other, 6, 7, NULL, 0); + break; } -} + /* Fall thru */ + case BS_ESTABLISHED: + /* Should close this connection */ + BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection"); + bgp_error(conn, 6, 7, NULL, 0); + return; -static uint -bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, uint remains) -{ - byte *start = w; - ip_addr a; - int bytes; + default: + bug("bgp_rx_open: Unknown state"); + } - while (!EMPTY_LIST(buck->prefixes) && (remains >= (5+sizeof(ip_addr)))) - { - struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes)); - DBG("\tDequeued route %I/%d\n", px->n.prefix, px->n.pxlen); + /* Update our local variables */ + conn->hold_time = MIN(hold, p->cf->hold_time); + conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3; + conn->as4_session = conn->local_caps->as4_support && caps->as4_support; + conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages; + p->remote_id = id; - if (p->add_path_tx) - { - put_u32(w, px->path_id); - w += 4; - remains -= 4; - } + DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", + conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session); - *w++ = px->n.pxlen; - bytes = (px->n.pxlen + 7) / 8; - a = px->n.prefix; - ipa_hton(a); - memcpy(w, &a, bytes); - w += bytes; - remains -= bytes + 1; - rem_node(&px->bucket_node); - bgp_free_prefix(p, px); - // fib_delete(&p->prefix_fib, px); - } - return w - start; + bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE); + bgp_start_timer(conn->hold_timer, conn->hold_time); + bgp_conn_enter_openconfirm_state(conn); } + +/* + * Next hop handling + */ + +#define REPORT(msg, args...) \ + ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); }) + +#define DISCARD(msg, args...) \ + ({ REPORT(msg, ## args); return; }) + +#define WITHDRAW(msg, args...) \ + ({ REPORT(msg, ## args); s->err_withdraw = 1; return; }) + +#define BAD_AFI "Unexpected AF <%u/%u> in UPDATE" +#define BAD_NEXT_HOP "Invalid NEXT_HOP attribute" +#define NO_NEXT_HOP "Missing NEXT_HOP attribute" + + static void -bgp_flush_prefixes(struct bgp_proto *p, struct bgp_bucket *buck) +bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll) { - while (!EMPTY_LIST(buck->prefixes)) - { - struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes)); - log(L_ERR "%s: - route %I/%d skipped", p->p.name, px->n.prefix, px->n.pxlen); - rem_node(&px->bucket_node); - bgp_free_prefix(p, px); - // fib_delete(&p->prefix_fib, px); - } + struct bgp_proto *p = s->proto; + struct bgp_channel *c = s->channel; + + if (c->cf->gw_mode == GW_DIRECT) + { + neighbor *nbr = NULL; + + /* GW_DIRECT -> single_hop -> p->neigh != NULL */ + if (ipa_nonzero(gw)) + nbr = neigh_find2(&p->p, &gw, NULL, 0); + else if (ipa_nonzero(ll)) + nbr = neigh_find2(&p->p, &ll, p->neigh->iface, 0); + + if (!nbr || (nbr->scope == SCOPE_HOST)) + WITHDRAW(BAD_NEXT_HOP); + + a->dest = RTD_ROUTER; + a->gw = nbr->addr; + a->iface = nbr->iface; + a->hostentry = NULL; + a->igp_metric = 0; + } + else /* GW_RECURSIVE */ + { + if (ipa_zero(gw)) + WITHDRAW(BAD_NEXT_HOP); + + rta_set_recursive_next_hop(c->c.table, a, c->igp_table, gw, ll); + } } -#ifndef IPV6 /* IPv4 version */ +static inline int +bgp_use_next_hop(struct bgp_export_state *s, eattr *a) +{ + struct bgp_proto *p = s->proto; + ip_addr *nh = (void *) a->u.ptr->data; -static byte * -bgp_create_update(struct bgp_conn *conn, byte *buf) + if (s->channel->cf->next_hop_self) + return 0; + + if (s->channel->cf->next_hop_keep) + return 1; + + /* Keep it when explicitly set in export filter */ + if (a->type & EAF_FRESH) + return 1; + + /* Keep it when exported to internal peers */ + if (p->is_interior && ipa_nonzero(*nh)) + return 1; + + /* Keep it when forwarded between single-hop BGPs on the same iface */ + struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL; + return p->neigh && (p->neigh->iface == ifa); +} + +static inline int +bgp_use_gateway(struct bgp_export_state *s) { - struct bgp_proto *p = conn->bgp; - struct bgp_bucket *buck; - int remains = bgp_max_packet_length(p) - BGP_HEADER_LENGTH - 4; - byte *w; - int wd_size = 0; - int r_size = 0; - int a_size = 0; - - w = buf+2; - if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) - { - DBG("Withdrawn routes:\n"); - wd_size = bgp_encode_prefixes(p, w, buck, remains); - w += wd_size; - remains -= wd_size; - } - put_u16(buf, wd_size); + struct bgp_proto *p = s->proto; + rta *ra = s->route->attrs; - if (!wd_size) - { - while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next) - { - if (EMPTY_LIST(buck->prefixes)) - { - DBG("Deleting empty bucket %p\n", buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - } - - DBG("Processing bucket %p\n", buck); - a_size = bgp_encode_attrs(p, w+2, buck->eattrs, remains - 1024); - - if (a_size < 0) - { - log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name); - bgp_flush_prefixes(p, buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - } - - put_u16(w, a_size); - w += a_size + 2; - r_size = bgp_encode_prefixes(p, w, buck, remains - a_size); - w += r_size; - break; - } - } - if (!a_size) /* Attributes not already encoded */ + if (s->channel->cf->next_hop_self) + return 0; + + /* We need valid global gateway */ + if ((ra->dest != RTD_ROUTER) || ipa_zero(ra->gw) || ipa_is_link_local(ra->gw)) + return 0; + + /* Use it when exported to internal peers */ + if (p->is_interior) + return 1; + + /* Use it when forwarded to single-hop BGP peer on on the same iface */ + return p->neigh && (p->neigh->iface == ra->iface); +} + +static void +bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to) +{ + if (!a || !bgp_use_next_hop(s, a)) + { + if (bgp_use_gateway(s)) { - put_u16(w, 0); - w += 2; + ip_addr nh[1] = { s->route->attrs->gw }; + bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16); } - if (wd_size || r_size) + else { - BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE"); - return w; + ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr }; + bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16); } - else - return NULL; + } + + /* Check if next hop is valid */ + a = bgp_find_attr(*to, BA_NEXT_HOP); + if (!a) + WITHDRAW(NO_NEXT_HOP); + + ip_addr *nh = (void *) a->u.ptr->data; + ip_addr peer = s->proto->cf->remote_ip; + uint len = a->u.ptr->length; + + if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1]))) + WITHDRAW(BAD_NEXT_HOP); + + if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1]))) + WITHDRAW(BAD_NEXT_HOP); } -static byte * -bgp_create_end_mark(struct bgp_conn *conn, byte *buf) +static uint +bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte *buf UNUSED, uint size UNUSED) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending END-OF-RIB"); + return 0; +} - put_u32(buf, 0); - return buf+4; +static void +bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED) +{ + /* + * Although we expect no next hop and RFC 7606 7.11 states that attribute + * MP_REACH_NLRI with unexpected next hop length is considered malformed, + * FlowSpec RFC 5575 4 states that next hop shall be ignored on receipt. + */ + + return; } -#else /* IPv6 version */ +static void +bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to) +{ + /* NEXT_HOP shall not pass */ + if (a) + bgp_unset_attr(to, s->pool, BA_NEXT_HOP); +} -static inline int -same_iface(struct bgp_proto *p, ip_addr *ip) + +/* + * UPDATE + */ + +static void +bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0) { - neighbor *n = neigh_find(&p->p, ip, 0); - return n && p->neigh && n->iface == p->neigh->iface; + if (path_id != s->last_id) + { + s->last_src = rt_get_source(&s->proto->p, path_id); + s->last_id = path_id; + + rta_free(s->cached_rta); + s->cached_rta = NULL; + } + + if (!a0) + { + /* Route withdraw */ + rte_update2(&s->channel->c, n, NULL, s->last_src); + return; + } + + /* Prepare cached route attributes */ + if (s->cached_rta == NULL) + { + a0->src = s->last_src; + + /* Workaround for rta_lookup() breaking eattrs */ + ea_list *ea = a0->eattrs; + s->cached_rta = rta_lookup(a0); + a0->eattrs = ea; + } + + rta *a = rta_clone(s->cached_rta); + rte *e = rte_get_temp(a); + + e->pflags = 0; + e->u.bgp.suppressed = 0; + rte_update2(&s->channel->c, n, e, s->last_src); } -static byte * -bgp_create_update(struct bgp_conn *conn, byte *buf) + + +static uint +bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) { - struct bgp_proto *p = conn->bgp; - struct bgp_bucket *buck; - int size, second, rem_stored; - int remains = bgp_max_packet_length(p) - BGP_HEADER_LENGTH - 4; - byte *w, *w_stored, *tmp, *tstart; - ip_addr *ipp, ip, ip_ll; - ea_list *ea; - eattr *nh; + byte *pos = buf; - put_u16(buf, 0); - w = buf+4; + while (!EMPTY_LIST(buck->prefixes) && (size >= (5 + sizeof(ip4_addr)))) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_ip4 *net = (void *) px->net; - if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) + /* Encode path ID */ + if (s->add_path) { - DBG("Withdrawn routes:\n"); - tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_UNREACH_NLRI, remains-8); - *tmp++ = 0; - *tmp++ = BGP_AF_IPV6; - *tmp++ = 1; - ea->attrs[0].u.ptr->length = 3 + bgp_encode_prefixes(p, tmp, buck, remains-11); - size = bgp_encode_attrs(p, w, ea, remains); - ASSERT(size >= 0); - w += size; - remains -= size; - } - else - { - while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next) - { - if (EMPTY_LIST(buck->prefixes)) - { - DBG("Deleting empty bucket %p\n", buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - } - - DBG("Processing bucket %p\n", buck); - rem_stored = remains; - w_stored = w; - - size = bgp_encode_attrs(p, w, buck->eattrs, remains - 1024); - if (size < 0) - { - log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name); - bgp_flush_prefixes(p, buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - } - w += size; - remains -= size; - - /* We have two addresses here in NEXT_HOP eattr. Really. - Unless NEXT_HOP was modified by filter */ - nh = ea_find(buck->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP)); - ASSERT(nh); - second = (nh->u.ptr->length == NEXT_HOP_LENGTH); - ipp = (ip_addr *) nh->u.ptr->data; - ip = ipp[0]; - ip_ll = IPA_NONE; - - if (ipa_equal(ip, p->source_addr)) - ip_ll = p->local_link; - else - { - /* If we send a route with 'third party' next hop destinated - * in the same interface, we should also send a link local - * next hop address. We use the received one (stored in the - * other part of BA_NEXT_HOP eattr). If we didn't received - * it (for example it is a static route), we can't use - * 'third party' next hop and we have to use local IP address - * as next hop. Sending original next hop address without - * link local address seems to be a natural way to solve that - * problem, but it is contrary to RFC 2545 and Quagga does not - * accept such routes. - * - * There are two cases, either we have global IP, or - * IPA_NONE if the neighbor is link-local. For IPA_NONE, - * we suppose it is on the same iface, see bgp_update_attrs(). - */ - - if (ipa_zero(ip) || same_iface(p, &ip)) - { - if (second && ipa_nonzero(ipp[1])) - ip_ll = ipp[1]; - else - { - switch (p->cf->missing_lladdr) - { - case MLL_SELF: - ip = p->source_addr; - ip_ll = p->local_link; - break; - case MLL_DROP: - log(L_ERR "%s: Missing link-local next hop address, skipping corresponding routes", p->p.name); - w = w_stored; - remains = rem_stored; - bgp_flush_prefixes(p, buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - case MLL_IGNORE: - break; - } - } - } - } - - tstart = tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_REACH_NLRI, remains-8); - *tmp++ = 0; - *tmp++ = BGP_AF_IPV6; - *tmp++ = 1; - - if (ipa_is_link_local(ip)) - ip = IPA_NONE; - - if (ipa_nonzero(ip_ll)) - { - *tmp++ = 32; - ipa_hton(ip); - memcpy(tmp, &ip, 16); - ipa_hton(ip_ll); - memcpy(tmp+16, &ip_ll, 16); - tmp += 32; - } - else - { - *tmp++ = 16; - ipa_hton(ip); - memcpy(tmp, &ip, 16); - tmp += 16; - } - - *tmp++ = 0; /* No SNPA information */ - tmp += bgp_encode_prefixes(p, tmp, buck, remains - (8+3+32+1)); - ea->attrs[0].u.ptr->length = tmp - tstart; - size = bgp_encode_attrs(p, w, ea, remains); - ASSERT(size >= 0); - w += size; - break; - } + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); } - size = w - (buf+4); - put_u16(buf+2, size); - lp_flush(bgp_linpool); - if (size) + ip4_addr a = ip4_hton(net->prefix); + uint b = (net->pxlen + 7) / 8; + + /* Encode prefix length */ + *pos = net->pxlen; + ADVANCE(pos, size, 1); + + /* Encode prefix body */ + memcpy(pos, &a, b); + ADVANCE(pos, size, b); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; +} + +static void +bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +{ + while (len) + { + net_addr_ip4 net; + u32 path_id = 0; + + /* Decode path ID */ + if (s->add_path) { - BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE"); - return w; + if (len < 5) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); } - else - return NULL; + + /* Decode prefix length */ + uint l = *pos; + uint b = (l + 7) / 8; + ADVANCE(pos, len, 1); + + if (l > IP4_MAX_PREFIX_LENGTH) + bgp_parse_error(s, 10); + + if (len < b) + bgp_parse_error(s, 1); + + /* Decode prefix body */ + ip4_addr addr = IP4_NONE; + memcpy(&addr, pos, b); + ADVANCE(pos, len, b); + + net = NET_ADDR_IP4(ip4_ntoh(addr), l); + net_normalize_ip4(&net); + + // XXXX validate prefix + + bgp_rte_update(s, (net_addr *) &net, path_id, a); + } } -static byte * -bgp_create_end_mark(struct bgp_conn *conn, byte *buf) +static uint +bgp_encode_next_hop_ip4(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size UNUSED) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending END-OF-RIB"); + /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */ - put_u16(buf+0, 0); - put_u16(buf+2, 6); /* length 4-9 */ - buf += 4; + ASSERT(a->u.ptr->length == sizeof(ip_addr)); - /* Empty MP_UNREACH_NLRI atribute */ - *buf++ = BAF_OPTIONAL; - *buf++ = BA_MP_UNREACH_NLRI; - *buf++ = 3; /* Length 7-9 */ - *buf++ = 0; /* AFI */ - *buf++ = BGP_AF_IPV6; - *buf++ = 1; /* SAFI */ - return buf; -} + put_ip4(buf, ipa_to_ip4( *(ip_addr *) a->u.ptr->data )); -#endif + return 4; +} -static inline byte * -bgp_create_route_refresh(struct bgp_conn *conn, byte *buf) +static void +bgp_decode_next_hop_ip4(struct bgp_parse_state *s, byte *data, uint len, rta *a) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH"); + if (len != 4) + bgp_parse_error(s, 9); - /* Original original route refresh request, RFC 2918 */ - *buf++ = 0; - *buf++ = BGP_AF; - *buf++ = BGP_RR_REQUEST; - *buf++ = 1; /* SAFI */ - return buf; + ip_addr nh = ipa_from_ip4(get_ip4(data)); + + // XXXX validate next hop + + bgp_set_attr_data(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, &nh, sizeof(nh)); + bgp_apply_next_hop(s, a, nh, IPA_NONE); } -static inline byte * -bgp_create_begin_refresh(struct bgp_conn *conn, byte *buf) + +static uint +bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR"); + byte *pos = buf; - /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */ - *buf++ = 0; - *buf++ = BGP_AF; - *buf++ = BGP_RR_BEGIN; - *buf++ = 1; /* SAFI */ - return buf; + while (!EMPTY_LIST(buck->prefixes) && (size >= (5 + sizeof(ip6_addr)))) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_ip6 *net = (void *) px->net; + + /* Encode path ID */ + if (s->add_path) + { + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); + } + + ip6_addr a = ip6_hton(net->prefix); + uint b = (net->pxlen + 7) / 8; + + /* Encode prefix length */ + *pos = net->pxlen; + ADVANCE(pos, size, 1); + + /* Encode prefix body */ + memcpy(pos, &a, b); + ADVANCE(pos, size, b); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; } -static inline byte * -bgp_create_end_refresh(struct bgp_conn *conn, byte *buf) +static void +bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending END-OF-RR"); + while (len) + { + net_addr_ip6 net; + u32 path_id = 0; - /* Demarcation of ending of route refresh (EoRR), RFC 7313 */ - *buf++ = 0; - *buf++ = BGP_AF; - *buf++ = BGP_RR_END; - *buf++ = 1; /* SAFI */ - return buf; + /* Decode path ID */ + if (s->add_path) + { + if (len < 5) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); + } + + /* Decode prefix length */ + uint l = *pos; + uint b = (l + 7) / 8; + ADVANCE(pos, len, 1); + + if (l > IP6_MAX_PREFIX_LENGTH) + bgp_parse_error(s, 10); + + if (len < b) + bgp_parse_error(s, 1); + + /* Decode prefix body */ + ip6_addr addr = IP6_NONE; + memcpy(&addr, pos, b); + ADVANCE(pos, len, b); + + net = NET_ADDR_IP6(ip6_ntoh(addr), l); + net_normalize_ip6(&net); + + // XXXX validate prefix + + bgp_rte_update(s, (net_addr *) &net, path_id, a); + } } +static uint +bgp_encode_next_hop_ip6(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size UNUSED) +{ + ip_addr *nh = (void *) a->u.ptr->data; + uint len = a->u.ptr->length; + + ASSERT((len == 16) || (len == 32)); + + put_ip6(buf, ipa_to_ip6(nh[0])); + + if (len == 32) + put_ip6(buf+16, ipa_to_ip6(nh[1])); + + return len; +} static void -bgp_create_header(byte *buf, uint len, uint type) +bgp_decode_next_hop_ip6(struct bgp_parse_state *s, byte *data, uint len, rta *a) { - memset(buf, 0xff, 16); /* Marker */ - put_u16(buf+16, len); - buf[18] = type; + struct adata *ad = lp_alloc_adata(s->pool, 32); + ip_addr *nh = (void *) ad->data; + + if ((len != 16) && (len != 32)) + bgp_parse_error(s, 9); + + nh[0] = ipa_from_ip6(get_ip6(data)); + nh[1] = (len == 32) ? ipa_from_ip6(get_ip6(data+16)) : IPA_NONE; + + if (ip6_is_link_local(nh[0])) + { + nh[1] = nh[0]; + nh[0] = IPA_NONE; + } + + if (!ip6_is_link_local(nh[1])) + nh[1] = IPA_NONE; + + if (ipa_zero(nh[1])) + ad->length = 16; + + // XXXX validate next hop + + bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad); + bgp_apply_next_hop(s, a, nh[0], nh[1]); } -/** - * bgp_fire_tx - transmit packets - * @conn: connection - * - * Whenever the transmit buffers of the underlying TCP connection - * are free and we have any packets queued for sending, the socket functions - * call bgp_fire_tx() which takes care of selecting the highest priority packet - * queued (Notification > Keepalive > Open > Update), assembling its header - * and body and sending it to the connection. - */ -static int -bgp_fire_tx(struct bgp_conn *conn) + +static uint +bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) { - struct bgp_proto *p = conn->bgp; - uint s = conn->packets_to_send; - sock *sk = conn->sk; - byte *buf, *pkt, *end; - int type; + byte *pos = buf; + + while (!EMPTY_LIST(buck->prefixes) && (size >= 4)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_flow4 *net = (void *) px->net; + uint flen = net->length - sizeof(net_addr_flow4); - if (!sk) + /* Encode path ID */ + if (s->add_path) { - conn->packets_to_send = 0; - return 0; + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); } - buf = sk->tbuf; - pkt = buf + BGP_HEADER_LENGTH; - if (s & (1 << PKT_SCHEDULE_CLOSE)) + if (flen > size) + break; + + /* Copy whole flow data including length */ + memcpy(pos, net->data, flen); + ADVANCE(pos, size, flen); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; +} + +static void +bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +{ + while (len) + { + u32 path_id = 0; + + /* Decode path ID */ + if (s->add_path) { - /* We can finally close connection and enter idle state */ - bgp_conn_enter_idle_state(conn); - return 0; + if (len < 4) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); } - if (s & (1 << PKT_NOTIFICATION)) + + if (len < 2) + bgp_parse_error(s, 1); + + /* Decode flow length */ + uint hlen = flow_hdr_length(pos); + uint dlen = flow_read_length(pos); + uint flen = hlen + dlen; + byte *data = pos + hlen; + + if (len < flen) + bgp_parse_error(s, 1); + + /* Validate flow data */ + enum flow_validated_state r = flow4_validate(data, dlen); + if (r != FLOW_ST_VALID) { - s = 1 << PKT_SCHEDULE_CLOSE; - type = PKT_NOTIFICATION; - end = bgp_create_notification(conn, pkt); + log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r)); + bgp_parse_error(s, 1); } - else if (s & (1 << PKT_KEEPALIVE)) + + if (data[0] != FLOW_TYPE_DST_PREFIX) { - s &= ~(1 << PKT_KEEPALIVE); - type = PKT_KEEPALIVE; - end = pkt; /* Keepalives carry no data */ - BGP_TRACE(D_PACKETS, "Sending KEEPALIVE"); - bgp_start_timer(conn->keepalive_timer, conn->keepalive_time); + log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name); + bgp_parse_error(s, 1); } - else if (s & (1 << PKT_OPEN)) + + /* Decode dst prefix */ + ip4_addr px = IP4_NONE; + uint pxlen = data[1]; + + // FIXME: Use some generic function + memcpy(&px, data, BYTES(pxlen)); + px = ip4_and(px, ip4_mkmask(pxlen)); + + /* Prepare the flow */ + net_addr *n = alloca(sizeof(struct net_addr_flow4) + flen); + net_fill_flow4(n, px, pxlen, pos, flen); + ADVANCE(pos, len, flen); + + bgp_rte_update(s, n, path_id, a); + } +} + + +static uint +bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) +{ + byte *pos = buf; + + while (!EMPTY_LIST(buck->prefixes) && (size >= 4)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_flow6 *net = (void *) px->net; + uint flen = net->length - sizeof(net_addr_flow6); + + /* Encode path ID */ + if (s->add_path) { - s &= ~(1 << PKT_OPEN); - type = PKT_OPEN; - end = bgp_create_open(conn, pkt); + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); } - else if (s & (1 << PKT_ROUTE_REFRESH)) + + if (flen > size) + break; + + /* Copy whole flow data including length */ + memcpy(pos, net->data, flen); + ADVANCE(pos, size, flen); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; +} + +static void +bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +{ + while (len) + { + u32 path_id = 0; + + /* Decode path ID */ + if (s->add_path) { - s &= ~(1 << PKT_ROUTE_REFRESH); - type = PKT_ROUTE_REFRESH; - end = bgp_create_route_refresh(conn, pkt); + if (len < 4) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); } - else if (s & (1 << PKT_BEGIN_REFRESH)) + + if (len < 2) + bgp_parse_error(s, 1); + + /* Decode flow length */ + uint hlen = flow_hdr_length(pos); + uint dlen = flow_read_length(pos); + uint flen = hlen + dlen; + byte *data = pos + hlen; + + if (len < flen) + bgp_parse_error(s, 1); + + /* Validate flow data */ + enum flow_validated_state r = flow6_validate(data, dlen); + if (r != FLOW_ST_VALID) { - s &= ~(1 << PKT_BEGIN_REFRESH); - type = PKT_ROUTE_REFRESH; /* BoRR is a subtype of RR */ - end = bgp_create_begin_refresh(conn, pkt); + log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r)); + bgp_parse_error(s, 1); } - else if (s & (1 << PKT_UPDATE)) + + if (data[0] != FLOW_TYPE_DST_PREFIX) { - type = PKT_UPDATE; - end = bgp_create_update(conn, pkt); + log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name); + bgp_parse_error(s, 1); + } - if (!end) - { - /* No update to send, perhaps we need to send End-of-RIB or EoRR */ + /* Decode dst prefix */ + ip6_addr px = IP6_NONE; + uint pxlen = data[1]; - conn->packets_to_send = 0; + // FIXME: Use some generic function + memcpy(&px, data, BYTES(pxlen)); + px = ip6_and(px, ip6_mkmask(pxlen)); - if (p->feed_state == BFS_LOADED) - { - type = PKT_UPDATE; - end = bgp_create_end_mark(conn, pkt); - } + /* Prepare the flow */ + net_addr *n = alloca(sizeof(struct net_addr_flow6) + flen); + net_fill_flow6(n, px, pxlen, pos, flen); + ADVANCE(pos, len, flen); - else if (p->feed_state == BFS_REFRESHED) - { - type = PKT_ROUTE_REFRESH; - end = bgp_create_end_refresh(conn, pkt); - } + bgp_rte_update(s, n, path_id, a); + } +} - else /* Really nothing to send */ - return 0; - p->feed_state = BFS_NONE; - } - } - else - return 0; +static const struct bgp_af_desc bgp_af_table[] = { + { + .afi = BGP_AF_IPV4, + .net = NET_IP4, + .name = "ipv4", + .encode_nlri = bgp_encode_nlri_ip4, + .decode_nlri = bgp_decode_nlri_ip4, + .encode_next_hop = bgp_encode_next_hop_ip4, + .decode_next_hop = bgp_decode_next_hop_ip4, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_IPV4_MC, + .net = NET_IP4, + .name = "ipv4-mc", + .encode_nlri = bgp_encode_nlri_ip4, + .decode_nlri = bgp_decode_nlri_ip4, + .encode_next_hop = bgp_encode_next_hop_ip4, + .decode_next_hop = bgp_decode_next_hop_ip4, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_FLOW4, + .net = NET_FLOW4, + .name = "flow4", + .encode_nlri = bgp_encode_nlri_flow4, + .decode_nlri = bgp_decode_nlri_flow4, + .encode_next_hop = bgp_encode_next_hop_none, + .decode_next_hop = bgp_decode_next_hop_none, + .update_next_hop = bgp_update_next_hop_none, + }, + { + .afi = BGP_AF_IPV6, + .net = NET_IP6, + .name = "ipv6", + .encode_nlri = bgp_encode_nlri_ip6, + .decode_nlri = bgp_decode_nlri_ip6, + .encode_next_hop = bgp_encode_next_hop_ip6, + .decode_next_hop = bgp_decode_next_hop_ip6, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_IPV6_MC, + .net = NET_IP6, + .name = "ipv6-mc", + .encode_nlri = bgp_encode_nlri_ip6, + .decode_nlri = bgp_decode_nlri_ip6, + .encode_next_hop = bgp_encode_next_hop_ip6, + .decode_next_hop = bgp_decode_next_hop_ip6, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_FLOW6, + .net = NET_FLOW6, + .name = "flow6", + .encode_nlri = bgp_encode_nlri_flow6, + .decode_nlri = bgp_decode_nlri_flow6, + .encode_next_hop = bgp_encode_next_hop_none, + .decode_next_hop = bgp_decode_next_hop_none, + .update_next_hop = bgp_update_next_hop_none, + }, +}; + +const struct bgp_af_desc * +bgp_get_af_desc(u32 afi) +{ + uint i; + for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++) + if (bgp_af_table[i].afi == afi) + return &bgp_af_table[i]; - conn->packets_to_send = s; - bgp_create_header(buf, end - buf, type); - return sk_send(sk, end - buf); + return NULL; } -/** - * bgp_schedule_packet - schedule a packet for transmission - * @conn: connection - * @type: packet type - * - * Schedule a packet of type @type to be sent as soon as possible. - */ -void -bgp_schedule_packet(struct bgp_conn *conn, int type) +static inline uint +bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - DBG("BGP: Scheduling packet type %d\n", type); - conn->packets_to_send |= 1 << type; - if (conn->sk && conn->sk->tpos == conn->sk->tbuf && !ev_active(conn->tx_ev)) - ev_schedule(conn->tx_ev); + return s->channel->desc->encode_nlri(s, buck, buf, end - buf); } -void -bgp_kick_tx(void *vconn) +static inline uint +bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf) { - struct bgp_conn *conn = vconn; - - DBG("BGP: kicking TX\n"); - while (bgp_fire_tx(conn) > 0) - ; + return s->channel->desc->encode_next_hop(s, nh, buf, 255); } void -bgp_tx(sock *sk) +bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to) { - struct bgp_conn *conn = sk->data; - - DBG("BGP: TX hook\n"); - while (bgp_fire_tx(conn) > 0) - ; + s->channel->desc->update_next_hop(s, a, to); } -/* Capatibility negotiation as per RFC 2842 */ +#define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024) -void -bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) +static byte * +bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - // struct bgp_proto *p = conn->bgp; - int i, cl; + /* + * 2 B Withdrawn Routes Length (zero) + * --- IPv4 Withdrawn Routes NLRI (unused) + * 2 B Total Path Attribute Length + * var Path Attributes + * var IPv4 Network Layer Reachability Information + */ + + int lr, la; + + la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH); + if (la < 0) + { + /* Attribute list too long */ + bgp_withdraw_bucket(s->channel, buck); + return NULL; + } - while (len > 0) - { - if (len < 2 || len < 2 + opt[1]) - goto err; + put_u16(buf+0, 0); + put_u16(buf+2, la); - cl = opt[1]; + lr = bgp_encode_nlri(s, buck, buf+4+la, end); - switch (opt[0]) - { - case 2: /* Route refresh capability, RFC 2918 */ - if (cl != 0) - goto err; - conn->peer_refresh_support = 1; - break; + return buf+4+la+lr; +} - case 6: /* Extended message length capability, draft */ - if (cl != 0) - goto err; - conn->peer_ext_messages_support = 1; - break; +static byte * +bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) +{ + /* + * 2 B IPv4 Withdrawn Routes Length (zero) + * --- IPv4 Withdrawn Routes NLRI (unused) + * 2 B Total Path Attribute Length + * 1 B MP_REACH_NLRI hdr - Attribute Flags + * 1 B MP_REACH_NLRI hdr - Attribute Type Code + * 2 B MP_REACH_NLRI hdr - Length of Attribute Data + * 2 B MP_REACH_NLRI data - Address Family Identifier + * 1 B MP_REACH_NLRI data - Subsequent Address Family Identifier + * 1 B MP_REACH_NLRI data - Length of Next Hop Network Address + * var MP_REACH_NLRI data - Network Address of Next Hop + * 1 B MP_REACH_NLRI data - Reserved (zero) + * var MP_REACH_NLRI data - Network Layer Reachability Information + * var Rest of Path Attributes + * --- IPv4 Network Layer Reachability Information (unused) + */ + + int lh, lr, la; /* Lengths of next hop, NLRI and attributes */ + + /* Begin of MP_REACH_NLRI atribute */ + buf[4] = BAF_OPTIONAL | BAF_EXT_LEN; + buf[5] = BA_MP_REACH_NLRI; + put_u16(buf+6, 0); /* Will be fixed later */ + put_af3(buf+8, s->channel->afi); + byte *pos = buf+11; + + /* Encode attributes to temporary buffer */ + byte *abuf = alloca(MAX_ATTRS_LENGTH); + la = bgp_encode_attrs(s, buck->eattrs, abuf, abuf + MAX_ATTRS_LENGTH); + if (la < 0) + { + /* Attribute list too long */ + bgp_withdraw_bucket(s->channel, buck); + return NULL; + } - case 64: /* Graceful restart capability, RFC 4724 */ - if (cl % 4 != 2) - goto err; - conn->peer_gr_aware = 1; - conn->peer_gr_able = 0; - conn->peer_gr_time = get_u16(opt + 2) & 0x0fff; - conn->peer_gr_flags = opt[2] & 0xf0; - conn->peer_gr_aflags = 0; - for (i = 2; i < cl; i += 4) - if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */ - { - conn->peer_gr_able = 1; - conn->peer_gr_aflags = opt[2+i+3]; - } - break; + /* Encode the next hop */ + lh = bgp_encode_next_hop(s, s->mp_next_hop, pos+1); + *pos = lh; + pos += 1+lh; - case 65: /* AS4 capability, RFC 4893 */ - if (cl != 4) - goto err; - conn->peer_as4_support = 1; - if (conn->bgp->cf->enable_as4) - conn->advertised_as = get_u32(opt + 2); - break; + /* Reserved field */ + *pos++ = 0; - case 69: /* ADD-PATH capability, draft */ - if (cl % 4) - goto err; - for (i = 0; i < cl; i += 4) - if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */ - conn->peer_add_path = opt[2+i+3]; - if (conn->peer_add_path > ADD_PATH_FULL) - goto err; - break; + /* Encode the NLRI */ + lr = bgp_encode_nlri(s, buck, pos, end - la); + pos += lr; - case 70: /* Enhanced route refresh capability, RFC 7313 */ - if (cl != 0) - goto err; - conn->peer_enhanced_refresh_support = 1; - break; + /* End of MP_REACH_NLRI atribute, update data length */ + put_u16(buf+6, pos-buf-8); - /* We can safely ignore all other capabilities */ - } - len -= 2 + cl; - opt += 2 + cl; - } - return; + /* Copy remaining attributes */ + memcpy(pos, abuf, la); + pos += la; - err: - bgp_error(conn, 2, 0, NULL, 0); - return; + /* Initial UPDATE fields */ + put_u16(buf+0, 0); + put_u16(buf+2, pos-buf-4); + + return pos; } -static int -bgp_parse_options(struct bgp_conn *conn, byte *opt, int len) +#undef MAX_ATTRS_LENGTH + +static byte * +bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - struct bgp_proto *p = conn->bgp; - int ol; + /* + * 2 B Withdrawn Routes Length + * var IPv4 Withdrawn Routes NLRI + * 2 B Total Path Attribute Length (zero) + * --- Path Attributes (unused) + * --- IPv4 Network Layer Reachability Information (unused) + */ - while (len > 0) - { - if (len < 2 || len < 2 + opt[1]) - { bgp_error(conn, 2, 0, NULL, 0); return 0; } -#ifdef LOCAL_DEBUG - { - int i; - DBG("\tOption %02x:", opt[0]); - for(i=0; i<opt[1]; i++) - DBG(" %02x", opt[2+i]); - DBG("\n"); - } -#endif + uint len = bgp_encode_nlri(s, buck, buf+2, end); - ol = opt[1]; - switch (opt[0]) - { - case 2: - if (conn->start_state == BSS_CONNECT_NOCAP) - BGP_TRACE(D_PACKETS, "Ignoring received capabilities"); - else - bgp_parse_capabilities(conn, opt + 2, ol); - break; + put_u16(buf+0, len); + put_u16(buf+2+len, 0); - default: - /* - * BGP specs don't tell us to send which option - * we didn't recognize, but it's common practice - * to do so. Also, capability negotiation with - * Cisco routers doesn't work without that. - */ - bgp_error(conn, 2, 4, opt, ol); - return 0; - } - len -= 2 + ol; - opt += 2 + ol; - } - return 0; + return buf+4+len; } -static void -bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) +static byte * +bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - struct bgp_conn *other; - struct bgp_proto *p = conn->bgp; - unsigned hold; - u16 base_as; - u32 id; + /* + * 2 B Withdrawn Routes Length (zero) + * --- IPv4 Withdrawn Routes NLRI (unused) + * 2 B Total Path Attribute Length + * 1 B MP_UNREACH_NLRI hdr - Attribute Flags + * 1 B MP_UNREACH_NLRI hdr - Attribute Type Code + * 2 B MP_UNREACH_NLRI hdr - Length of Attribute Data + * 2 B MP_UNREACH_NLRI data - Address Family Identifier + * 1 B MP_UNREACH_NLRI data - Subsequent Address Family Identifier + * var MP_UNREACH_NLRI data - Network Layer Reachability Information + * --- IPv4 Network Layer Reachability Information (unused) + */ + + uint len = bgp_encode_nlri(s, buck, buf+11, end); - /* Check state */ - if (conn->state != BS_OPENSENT) - { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } + put_u16(buf+0, 0); + put_u16(buf+2, 7+len); - /* Check message contents */ - if (len < 29 || len != 29U + pkt[28]) - { bgp_error(conn, 1, 2, pkt+16, 2); return; } - if (pkt[19] != BGP_VERSION) - { bgp_error(conn, 2, 1, pkt+19, 1); return; } /* RFC 1771 says 16 bits, draft-09 tells to use 8 */ - conn->advertised_as = base_as = get_u16(pkt+20); - hold = get_u16(pkt+22); - id = get_u32(pkt+24); - BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%08x)", conn->advertised_as, hold, id); + /* Begin of MP_UNREACH_NLRI atribute */ + buf[4] = BAF_OPTIONAL | BAF_EXT_LEN; + buf[5] = BA_MP_UNREACH_NLRI; + put_u16(buf+6, 3+len); + put_af3(buf+8, s->channel->afi); - if (bgp_parse_options(conn, pkt+29, pkt[28])) - return; + return buf+11+len; +} - if (hold > 0 && hold < 3) - { bgp_error(conn, 2, 6, pkt+22, 2); return; } +static byte * +bgp_create_update(struct bgp_channel *c, byte *buf) +{ + struct bgp_proto *p = (void *) c->c.proto; + struct bgp_bucket *buck; + byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH); + byte *res = NULL; + + /* Initialize write state */ + struct bgp_write_state s = { + .proto = p, + .channel = c, + .pool = bgp_linpool, + .as4_session = p->as4_session, + .add_path = c->add_path_tx, + }; + +again: + + /* Try unreachable bucket */ + if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) + { + res = (c->afi == BGP_AF_IPV4) ? + bgp_create_ip_unreach(&s, buck, buf, end): + bgp_create_mp_unreach(&s, buck, buf, end); - /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */ - if (!id || (p->is_internal && id == p->local_id)) - { bgp_error(conn, 2, 3, pkt+24, -4); return; } + goto done; + } - if ((conn->advertised_as != base_as) && (base_as != AS_TRANS)) - log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name); + /* Try reachable buckets */ + if (!EMPTY_LIST(c->bucket_queue)) + { + buck = HEAD(c->bucket_queue); - if (conn->advertised_as != p->remote_as) + /* Cleanup empty buckets */ + if (EMPTY_LIST(buck->prefixes)) { - if (conn->peer_as4_support) - { - u32 val = htonl(conn->advertised_as); - bgp_error(conn, 2, 2, (byte *) &val, 4); - } - else - bgp_error(conn, 2, 2, pkt+20, 2); - - return; + bgp_free_bucket(c, buck); + goto again; } - /* Check the other connection */ - other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; - switch (other->state) - { - case BS_CONNECT: - case BS_ACTIVE: - /* Stop outgoing connection attempts */ - bgp_conn_enter_idle_state(other); - break; + res = (c->afi == BGP_AF_IPV4) ? + bgp_create_ip_reach(&s, buck, buf, end): + bgp_create_mp_reach(&s, buck, buf, end); - case BS_IDLE: - case BS_OPENSENT: - case BS_CLOSE: - break; + if (EMPTY_LIST(buck->prefixes)) + bgp_free_bucket(c, buck); + else + bgp_defer_bucket(c, buck); - case BS_OPENCONFIRM: - /* - * Description of collision detection rules in RFC 4271 is confusing and - * contradictory, but it is essentially: - * - * 1. Router with higher ID is dominant - * 2. If both have the same ID, router with higher ASN is dominant [RFC6286] - * 3. When both connections are in OpenConfirm state, one initiated by - * the dominant router is kept. - * - * The first line in the expression below evaluates whether the neighbor - * is dominant, the second line whether the new connection was initiated - * by the neighbor. If both are true (or both are false), we keep the new - * connection, otherwise we keep the old one. - */ - if (((p->local_id < id) || ((p->local_id == id) && (p->local_as < p->remote_as))) - == (conn == &p->incoming_conn)) - { - /* Should close the other connection */ - BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection"); - bgp_error(other, 6, 7, NULL, 0); - break; - } - /* Fall thru */ - case BS_ESTABLISHED: - /* Should close this connection */ - BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection"); - bgp_error(conn, 6, 7, NULL, 0); - return; - default: - bug("bgp_rx_open: Unknown state"); - } + if (!res) + goto again; - /* Update our local variables */ - conn->hold_time = MIN(hold, p->cf->hold_time); - conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3; - p->remote_id = id; - p->as4_session = p->cf->enable_as4 && conn->peer_as4_support; - p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX); - p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX); - p->gr_ready = p->cf->gr_mode && conn->peer_gr_able; - p->ext_messages = p->cf->enable_extended_messages && conn->peer_ext_messages_support; + goto done; + } - if (p->add_path_tx) - p->p.accept_ra_types = RA_ANY; + /* No more prefixes to send */ + return NULL; - DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, p->as4_session); +done: + BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE"); + lp_flush(s.pool); - bgp_schedule_packet(conn, PKT_KEEPALIVE); - bgp_start_timer(conn->hold_timer, conn->hold_time); - bgp_conn_enter_openconfirm_state(conn); + return res; } +static byte * +bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf) +{ + /* Empty update packet */ + put_u32(buf, 0); -static inline void -bgp_rx_end_mark(struct bgp_proto *p) + return buf+4; +} + +static byte * +bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf) { - BGP_TRACE(D_PACKETS, "Got END-OF-RIB"); + put_u16(buf+0, 0); + put_u16(buf+2, 6); /* length 4--9 */ - if (p->load_state == BFS_LOADING) - p->load_state = BFS_NONE; + /* Empty MP_UNREACH_NLRI atribute */ + buf[4] = BAF_OPTIONAL; + buf[5] = BA_MP_UNREACH_NLRI; + buf[6] = 3; /* Length 7--9 */ + put_af3(buf+7, c->afi); - if (p->p.gr_recovery) - proto_graceful_restart_unlock(&p->p); - - if (p->gr_active) - bgp_graceful_restart_done(p); -} - - -#define DECODE_PREFIX(pp, ll) do { \ - if (p->add_path_rx) \ - { \ - if (ll < 5) { err=1; goto done; } \ - path_id = get_u32(pp); \ - pp += 4; \ - ll -= 4; \ - } \ - int b = *pp++; \ - int q; \ - ll--; \ - if (b > BITS_PER_IP_ADDRESS) { err=10; goto done; } \ - q = (b+7) / 8; \ - if (ll < q) { err=1; goto done; } \ - memcpy(&prefix, pp, q); \ - pp += q; \ - ll -= q; \ - ipa_ntoh(prefix); \ - prefix = ipa_and(prefix, ipa_mkmask(b)); \ - pxlen = b; \ -} while (0) + return buf+10; +} +static byte * +bgp_create_end_mark(struct bgp_channel *c, byte *buf) +{ + struct bgp_proto *p = (void *) c->c.proto; + + BGP_TRACE(D_PACKETS, "Sending END-OF-RIB"); + + return (c->afi == BGP_AF_IPV4) ? + bgp_create_ip_end_mark(c, buf): + bgp_create_mp_end_mark(c, buf); +} static inline void -bgp_rte_update(struct bgp_proto *p, ip_addr prefix, int pxlen, - u32 path_id, u32 *last_id, struct rte_src **src, - rta *a0, rta **a) +bgp_rx_end_mark(struct bgp_parse_state *s, u32 afi) { - if (path_id != *last_id) - { - *src = rt_get_source(&p->p, path_id); - *last_id = path_id; + struct bgp_proto *p = s->proto; + struct bgp_channel *c = bgp_get_channel(p, afi); - if (*a) - { - rta_free(*a); - *a = NULL; - } - } + BGP_TRACE(D_PACKETS, "Got END-OF-RIB"); - /* Prepare cached route attributes */ - if (!*a) - { - a0->src = *src; + if (!c) + DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi)); - /* Workaround for rta_lookup() breaking eattrs */ - ea_list *ea = a0->eattrs; - *a = rta_lookup(a0); - a0->eattrs = ea; - } + if (c->load_state == BFS_LOADING) + c->load_state = BFS_NONE; - net *n = net_get(p->p.table, prefix, pxlen); - rte *e = rte_get_temp(rta_clone(*a)); - e->net = n; - e->pflags = 0; - e->u.bgp.suppressed = 0; - rte_update2(p->p.main_ahook, n, e, *src); + if (p->p.gr_recovery) + channel_graceful_restart_unlock(&c->c); + + if (c->gr_active) + bgp_graceful_restart_done(c); } static inline void -bgp_rte_withdraw(struct bgp_proto *p, ip_addr prefix, int pxlen, - u32 path_id, u32 *last_id, struct rte_src **src) +bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len) { - if (path_id != *last_id) - { - *src = rt_find_source(&p->p, path_id); - *last_id = path_id; - } + struct bgp_channel *c = bgp_get_channel(s->proto, afi); + rta *a = NULL; - net *n = net_find(p->p.table, prefix, pxlen); - rte_update2( p->p.main_ahook, n, NULL, *src); -} + if (!c) + DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi)); -static inline int -bgp_set_next_hop(struct bgp_proto *p, rta *a) -{ - struct eattr *nh = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP)); - ip_addr *nexthop = (ip_addr *) nh->u.ptr->data; + s->channel = c; + s->add_path = c->add_path_rx; -#ifdef IPV6 - int second = (nh->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(nexthop[1]); + s->last_id = 0; + s->last_src = s->proto->p.main_source; - /* First address should not be link-local, but may be zero in direct mode */ - if (ipa_is_link_local(*nexthop)) - *nexthop = IPA_NONE; -#else - int second = 0; -#endif + /* + * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not + * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for + * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by + * decode_next_hop hooks) by restoring a->eattrs afterwards. + */ - if (p->cf->gw_mode == GW_DIRECT) - { - neighbor *ng = NULL; + if (ea) + { + a = alloca(sizeof(struct rta)); + memset(a, 0, sizeof(struct rta)); - if (ipa_nonzero(*nexthop)) - ng = neigh_find(&p->p, nexthop, 0); - else if (second) /* GW_DIRECT -> single_hop -> p->neigh != NULL */ - ng = neigh_find2(&p->p, nexthop + 1, p->neigh->iface, 0); + a->source = RTS_BGP; + a->scope = SCOPE_UNIVERSE; + a->cast = RTC_UNICAST; + a->dest = RTD_UNREACHABLE; + a->from = s->proto->cf->remote_ip; + a->eattrs = ea; - /* Fallback */ - if (!ng) - ng = p->neigh; + c->desc->decode_next_hop(s, nh, nh_len, a); - if (ng->scope == SCOPE_HOST) - return 0; + /* Handle withdraw during next hop decoding */ + if (s->err_withdraw) + a = NULL; + } - a->dest = RTD_ROUTER; - a->gw = ng->addr; - a->iface = ng->iface; - a->hostentry = NULL; - a->igp_metric = 0; - } - else /* GW_RECURSIVE */ - { - if (ipa_zero(*nexthop)) - return 0; + c->desc->decode_nlri(s, nlri, len, a); - rta_set_recursive_next_hop(p->p.table, a, p->igp_table, nexthop, nexthop + second); - } - - return 1; + rta_free(s->cached_rta); + s->cached_rta = NULL; } -#ifndef IPV6 /* IPv4 version */ - static void -bgp_do_rx_update(struct bgp_conn *conn, - byte *withdrawn, int withdrawn_len, - byte *nlri, int nlri_len, - byte *attrs, int attr_len) +bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) { struct bgp_proto *p = conn->bgp; - struct rte_src *src = p->p.main_source; - rta *a0, *a = NULL; - ip_addr prefix; - int pxlen, err = 0; - u32 path_id = 0; - u32 last_id = 0; + ea_list *ea = NULL; - /* Check for End-of-RIB marker */ - if (!withdrawn_len && !attr_len && !nlri_len) - { - bgp_rx_end_mark(p); - return; - } + BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE"); - /* Withdraw routes */ - while (withdrawn_len) - { - DECODE_PREFIX(withdrawn, withdrawn_len); - DBG("Withdraw %I/%d\n", prefix, pxlen); + /* Workaround for some BGP implementations that skip initial KEEPALIVE */ + if (conn->state == BS_OPENCONFIRM) + bgp_conn_enter_established_state(conn); - bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); - } + if (conn->state != BS_ESTABLISHED) + { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - if (!attr_len && !nlri_len) /* shortcut */ - return; + bgp_start_timer(conn->hold_timer, conn->hold_time); - a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, nlri_len); + /* Initialize parse state */ + struct bgp_parse_state s = { + .proto = p, + .pool = bgp_linpool, + .as4_session = p->as4_session, + }; - if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */ - return; + /* Parse error handler */ + if (setjmp(s.err_jmpbuf)) + { + bgp_error(conn, 3, s.err_subcode, NULL, 0); + goto done; + } - if (a0 && nlri_len && !bgp_set_next_hop(p, a0)) - a0 = NULL; + /* Check minimal length */ + if (len < 23) + { bgp_error(conn, 1, 2, pkt+16, 2); return; } - last_id = 0; - src = p->p.main_source; + /* Skip fixed header */ + uint pos = 19; - while (nlri_len) - { - DECODE_PREFIX(nlri, nlri_len); - DBG("Add %I/%d\n", prefix, pxlen); + /* + * UPDATE message format + * + * 2 B IPv4 Withdrawn Routes Length + * var IPv4 Withdrawn Routes NLRI + * 2 B Total Path Attribute Length + * var Path Attributes + * var IPv4 Reachable Routes NLRI + */ - if (a0) - bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a); - else /* Forced withdraw as a result of soft error */ - bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); - } + s.ip_unreach_len = get_u16(pkt + pos); + s.ip_unreach_nlri = pkt + pos + 2; + pos += 2 + s.ip_unreach_len; - done: - if (a) - rta_free(a); + if (pos + 2 > len) + bgp_parse_error(&s, 1); - if (err) - bgp_error(conn, 3, err, NULL, 0); + s.attr_len = get_u16(pkt + pos); + s.attrs = pkt + pos + 2; + pos += 2 + s.attr_len; + + if (pos > len) + bgp_parse_error(&s, 1); + + s.ip_reach_len = len - pos; + s.ip_reach_nlri = pkt + pos; + + + if (s.attr_len) + ea = bgp_decode_attrs(&s, s.attrs, s.attr_len); + + /* Check for End-of-RIB marker */ + if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len) + { bgp_rx_end_mark(&s, BGP_AF_IPV4); goto done; } + /* Check for MP End-of-RIB marker */ + if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len && + !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af) + { bgp_rx_end_mark(&s, s.mp_unreach_af); goto done; } + + if (s.ip_unreach_len) + bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0); + + if (s.mp_unreach_len) + bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0); + + if (s.ip_reach_len) + bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len, + ea, s.ip_next_hop_data, s.ip_next_hop_len); + + if (s.mp_reach_len) + bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len, + ea, s.mp_next_hop_data, s.mp_next_hop_len); + +done: + rta_free(s.cached_rta); + lp_flush(s.pool); return; } -#else /* IPv6 version */ -#define DO_NLRI(name) \ - x = p->name##_start; \ - len = len0 = p->name##_len; \ - if (len) \ - { \ - if (len < 3) { err=9; goto done; } \ - af = get_u16(x); \ - x += 3; \ - len -= 3; \ - DBG("\tNLRI AF=%d sub=%d len=%d\n", af, x[-1], len);\ - } \ - else \ - af = 0; \ - if (af == BGP_AF_IPV6) +/* + * ROUTE-REFRESH + */ -static void -bgp_attach_next_hop(rta *a0, byte *x) +static inline byte * +bgp_create_route_refresh(struct bgp_channel *c, byte *buf) { - ip_addr *nh = (ip_addr *) bgp_attach_attr_wa(&a0->eattrs, bgp_linpool, BA_NEXT_HOP, NEXT_HOP_LENGTH); - memcpy(nh, x+1, 16); - ipa_ntoh(nh[0]); + struct bgp_proto *p = (void *) c->c.proto; - /* We store received link local address in the other part of BA_NEXT_HOP eattr. */ - if (*x == 32) - { - memcpy(nh+1, x+17, 16); - ipa_ntoh(nh[1]); - } - else - nh[1] = IPA_NONE; + BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH"); + + /* Original route refresh request, RFC 2918 */ + put_af4(buf, c->afi); + buf[2] = BGP_RR_REQUEST; + + return buf+4; } +static inline byte * +bgp_create_begin_refresh(struct bgp_channel *c, byte *buf) +{ + struct bgp_proto *p = (void *) c->c.proto; + + BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR"); + + /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */ + put_af4(buf, c->afi); + buf[2] = BGP_RR_BEGIN; + + return buf+4; +} + +static inline byte * +bgp_create_end_refresh(struct bgp_channel *c, byte *buf) +{ + struct bgp_proto *p = (void *) c->c.proto; + + BGP_TRACE(D_PACKETS, "Sending END-OF-RR"); + + /* Demarcation of ending of route refresh (EoRR), RFC 7313 */ + put_af4(buf, c->afi); + buf[2] = BGP_RR_END; + + return buf+4; +} static void -bgp_do_rx_update(struct bgp_conn *conn, - byte *withdrawn UNUSED, int withdrawn_len, - byte *nlri UNUSED, int nlri_len, - byte *attrs, int attr_len) +bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len) { struct bgp_proto *p = conn->bgp; - struct rte_src *src = p->p.main_source; - byte *x; - int len, len0; - unsigned af; - rta *a0, *a = NULL; - ip_addr prefix; - int pxlen, err = 0; - u32 path_id = 0; - u32 last_id = 0; - - p->mp_reach_len = 0; - p->mp_unreach_len = 0; - a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, 0); - - if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */ - return; - /* Check for End-of-RIB marker */ - if ((attr_len < 8) && !withdrawn_len && !nlri_len && !p->mp_reach_len && - (p->mp_unreach_len == 3) && (get_u16(p->mp_unreach_start) == BGP_AF_IPV6)) - { - bgp_rx_end_mark(p); - return; - } + if (conn->state != BS_ESTABLISHED) + { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - DO_NLRI(mp_unreach) - { - while (len) - { - DECODE_PREFIX(x, len); - DBG("Withdraw %I/%d\n", prefix, pxlen); - bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); - } - } + if (!conn->local_caps->route_refresh) + { bgp_error(conn, 1, 3, pkt+18, 1); return; } - DO_NLRI(mp_reach) - { - /* Create fake NEXT_HOP attribute */ - if (len < 1 || (*x != 16 && *x != 32) || len < *x + 2) - { err = 9; goto done; } + if (len < (BGP_HEADER_LENGTH + 4)) + { bgp_error(conn, 1, 2, pkt+16, 2); return; } - if (a0) - bgp_attach_next_hop(a0, x); + if (len > (BGP_HEADER_LENGTH + 4)) + { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; } - /* Also ignore one reserved byte */ - len -= *x + 2; - x += *x + 2; + struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19)); + if (!c) + { + log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring", + p->p.name, pkt[21], get_u16(pkt+19), pkt[22]); + return; + } - if (a0 && ! bgp_set_next_hop(p, a0)) - a0 = NULL; + /* RFC 7313 redefined reserved field as RR message subtype */ + uint subtype = p->enhanced_refresh ? pkt[21] : BGP_RR_REQUEST; - last_id = 0; - src = p->p.main_source; + switch (subtype) + { + case BGP_RR_REQUEST: + BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH"); + channel_request_feeding(&c->c); + break; - while (len) - { - DECODE_PREFIX(x, len); - DBG("Add %I/%d\n", prefix, pxlen); + case BGP_RR_BEGIN: + BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR"); + bgp_refresh_begin(c); + break; - if (a0) - bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a); - else /* Forced withdraw as a result of soft error */ - bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); - } - } + case BGP_RR_END: + BGP_TRACE(D_PACKETS, "Got END-OF-RR"); + bgp_refresh_end(c); + break; - done: - if (a) - rta_free(a); + default: + log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring", + p->p.name, subtype); + break; + } +} - if (err) /* Use subcode 9, not err */ - bgp_error(conn, 3, 9, NULL, 0); +static inline struct bgp_channel * +bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn) +{ + uint i = conn->last_channel; - return; + /* Try the last channel, but at most several times */ + if ((conn->channels_to_send & (1 << i)) && + (conn->last_channel_count < 16)) + goto found; + + /* Find channel with non-zero channels_to_send */ + do + { + i++; + if (i >= p->channel_count) + i = 0; + } + while (! (conn->channels_to_send & (1 << i))); + + /* Use that channel */ + conn->last_channel = i; + conn->last_channel_count = 0; + +found: + conn->last_channel_count++; + return p->channel_map[i]; } -#endif +static inline int +bgp_send(struct bgp_conn *conn, uint type, uint len) +{ + sock *sk = conn->sk; + byte *buf = sk->tbuf; -static void -bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) + memset(buf, 0xff, 16); /* Marker */ + put_u16(buf+16, len); + buf[18] = type; + + return sk_send(sk, len); +} + +/** + * bgp_fire_tx - transmit packets + * @conn: connection + * + * Whenever the transmit buffers of the underlying TCP connection + * are free and we have any packets queued for sending, the socket functions + * call bgp_fire_tx() which takes care of selecting the highest priority packet + * queued (Notification > Keepalive > Open > Update), assembling its header + * and body and sending it to the connection. + */ +static int +bgp_fire_tx(struct bgp_conn *conn) { struct bgp_proto *p = conn->bgp; - byte *withdrawn, *attrs, *nlri; - uint withdrawn_len, attr_len, nlri_len; + struct bgp_channel *c; + byte *buf, *pkt, *end; + uint s; - BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE"); + if (!conn->sk) + return 0; - /* Workaround for some BGP implementations that skip initial KEEPALIVE */ - if (conn->state == BS_OPENCONFIRM) - bgp_conn_enter_established_state(conn); + buf = conn->sk->tbuf; + pkt = buf + BGP_HEADER_LENGTH; + s = conn->packets_to_send; - if (conn->state != BS_ESTABLISHED) - { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - bgp_start_timer(conn->hold_timer, conn->hold_time); + if (s & (1 << PKT_SCHEDULE_CLOSE)) + { + /* We can finally close connection and enter idle state */ + bgp_conn_enter_idle_state(conn); + return 0; + } + if (s & (1 << PKT_NOTIFICATION)) + { + conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE; + end = bgp_create_notification(conn, pkt); + return bgp_send(conn, PKT_NOTIFICATION, end - buf); + } + else if (s & (1 << PKT_KEEPALIVE)) + { + conn->packets_to_send &= ~(1 << PKT_KEEPALIVE); + BGP_TRACE(D_PACKETS, "Sending KEEPALIVE"); + bgp_start_timer(conn->keepalive_timer, conn->keepalive_time); + return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH); + } + else if (s & (1 << PKT_OPEN)) + { + conn->packets_to_send &= ~(1 << PKT_OPEN); + end = bgp_create_open(conn, pkt); + return bgp_send(conn, PKT_OPEN, end - buf); + } + else while (conn->channels_to_send) + { + c = bgp_get_channel_to_send(p, conn); + s = c->packets_to_send; - /* Find parts of the packet and check sizes */ - if (len < 23) + if (s & (1 << PKT_ROUTE_REFRESH)) { - bgp_error(conn, 1, 2, pkt+16, 2); - return; + c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH); + end = bgp_create_route_refresh(c, pkt); + return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf); } - withdrawn = pkt + 21; - withdrawn_len = get_u16(pkt + 19); - if (withdrawn_len + 23 > len) - goto malformed; - attrs = withdrawn + withdrawn_len + 2; - attr_len = get_u16(attrs - 2); - if (withdrawn_len + attr_len + 23 > len) - goto malformed; - nlri = attrs + attr_len; - nlri_len = len - withdrawn_len - attr_len - 23; - if (!attr_len && nlri_len) - goto malformed; - DBG("Sizes: withdrawn=%d, attrs=%d, NLRI=%d\n", withdrawn_len, attr_len, nlri_len); - - lp_flush(bgp_linpool); - - bgp_do_rx_update(conn, withdrawn, withdrawn_len, nlri, nlri_len, attrs, attr_len); - return; + else if (s & (1 << PKT_BEGIN_REFRESH)) + { + /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */ + c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH); + end = bgp_create_begin_refresh(c, pkt); + return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf); + } + else if (s & (1 << PKT_UPDATE)) + { + end = bgp_create_update(c, pkt); + if (end) + return bgp_send(conn, PKT_UPDATE, end - buf); + + /* No update to send, perhaps we need to send End-of-RIB or EoRR */ + c->packets_to_send = 0; + conn->channels_to_send &= ~(1 << c->index); + + if (c->feed_state == BFS_LOADED) + { + c->feed_state = BFS_NONE; + end = bgp_create_end_mark(c, pkt); + return bgp_send(conn, PKT_UPDATE, end - buf); + } + + else if (c->feed_state == BFS_REFRESHED) + { + c->feed_state = BFS_NONE; + end = bgp_create_end_refresh(c, pkt); + return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf); + } + } + else if (s) + bug("Channel packets_to_send: %x", s); + + c->packets_to_send = 0; + conn->channels_to_send &= ~(1 << c->index); + } + + return 0; +} + +/** + * bgp_schedule_packet - schedule a packet for transmission + * @conn: connection + * @c: channel + * @type: packet type + * + * Schedule a packet of type @type to be sent as soon as possible. + */ +void +bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) +{ + ASSERT(conn->sk); + + DBG("BGP: Scheduling packet type %d\n", type); + + if (c) + { + if (! conn->channels_to_send) + { + conn->last_channel = c->index; + conn->last_channel_count = 0; + } + + c->packets_to_send |= 1 << type; + conn->channels_to_send |= 1 << c->index; + } + else + conn->packets_to_send |= 1 << type; + + if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev)) + ev_schedule(conn->tx_ev); +} + +void +bgp_kick_tx(void *vconn) +{ + struct bgp_conn *conn = vconn; + + DBG("BGP: kicking TX\n"); + while (bgp_fire_tx(conn) > 0) + ; +} + +void +bgp_tx(sock *sk) +{ + struct bgp_conn *conn = sk->data; -malformed: - bgp_error(conn, 3, 1, NULL, 0); + DBG("BGP: TX hook\n"); + while (bgp_fire_tx(conn) > 0) + ; } + static struct { byte major, minor; byte *msg; @@ -1475,26 +2179,25 @@ static struct { * which might be static string or given temporary buffer. */ const char * -bgp_error_dsc(unsigned code, unsigned subcode) +bgp_error_dsc(uint code, uint subcode) { static char buff[32]; - unsigned i; + uint i; + for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++) if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode) - { - return bgp_msg_table[i].msg; - } + return bgp_msg_table[i].msg; - bsprintf(buff, "Unknown error %d.%d", code, subcode); + bsprintf(buff, "Unknown error %u.%u", code, subcode); return buff; } void -bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len) +bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len) { const byte *name; byte *t, argbuf[36]; - unsigned i; + uint i; /* Don't report Cease messages generated by myself */ if (code == 6 && class == BE_BGP_TX) @@ -1510,7 +2213,7 @@ bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4))) { /* Bad peer AS - we would like to print the AS */ - t += bsprintf(t, "%d", (len == 2) ? get_u16(data) : get_u32(data)); + t += bsprintf(t, "%u", (len == 2) ? get_u16(data) : get_u32(data)); goto done; } if (len > 16) @@ -1527,47 +2230,25 @@ static void bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len) { struct bgp_proto *p = conn->bgp; + if (len < 21) - { - bgp_error(conn, 1, 2, pkt+16, 2); - return; - } + { bgp_error(conn, 1, 2, pkt+16, 2); return; } - unsigned code = pkt[19]; - unsigned subcode = pkt[20]; + uint code = pkt[19]; + uint subcode = pkt[20]; int err = (code != 6); bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21); bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode); -#ifndef IPV6 - if ((code == 2) && ((subcode == 4) || (subcode == 7)) - /* Error related to capability: - * 4 - Peer does not support capabilities at all. - * 7 - Peer request some capability. Strange unless it is IPv6 only peer. - */ - && (p->cf->capabilities == 2) - /* Capabilities are not explicitly enabled or disabled, therefore heuristic is used */ - && (conn->start_state == BSS_CONNECT) - /* Failed connection attempt have used capabilities */ - && (p->cf->remote_as <= 0xFFFF)) - /* Not possible with disabled capabilities */ - { - /* We try connect without capabilities */ - log(L_WARN "%s: Capability related error received, retry with capabilities disabled", p->p.name); - p->start_state = BSS_CONNECT_NOCAP; - err = 0; - } -#endif - bgp_conn_enter_close_state(conn); - bgp_schedule_packet(conn, PKT_SCHEDULE_CLOSE); + bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE); - if (err) - { - bgp_update_startup_delay(p); - bgp_stop(p, 0); - } + if (err) + { + bgp_update_startup_delay(p); + bgp_stop(p, 0); + } } static void @@ -1577,64 +2258,12 @@ bgp_rx_keepalive(struct bgp_conn *conn) BGP_TRACE(D_PACKETS, "Got KEEPALIVE"); bgp_start_timer(conn->hold_timer, conn->hold_time); - switch (conn->state) - { - case BS_OPENCONFIRM: - bgp_conn_enter_established_state(conn); - break; - case BS_ESTABLISHED: - break; - default: - bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); - } -} -static void -bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len) -{ - struct bgp_proto *p = conn->bgp; + if (conn->state == BS_OPENCONFIRM) + { bgp_conn_enter_established_state(conn); return; } if (conn->state != BS_ESTABLISHED) - { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - - if (!p->cf->enable_refresh) - { bgp_error(conn, 1, 3, pkt+18, 1); return; } - - if (len < (BGP_HEADER_LENGTH + 4)) - { bgp_error(conn, 1, 2, pkt+16, 2); return; } - - if (len > (BGP_HEADER_LENGTH + 4)) - { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; } - - /* FIXME - we ignore AFI/SAFI values, as we support - just one value and even an error code for an invalid - request is not defined */ - - /* RFC 7313 redefined reserved field as RR message subtype */ - uint subtype = conn->peer_enhanced_refresh_support ? pkt[21] : BGP_RR_REQUEST; - - switch (subtype) - { - case BGP_RR_REQUEST: - BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH"); - proto_request_feeding(&p->p); - break; - - case BGP_RR_BEGIN: - BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR"); - bgp_refresh_begin(p); - break; - - case BGP_RR_END: - BGP_TRACE(D_PACKETS, "Got END-OF-RR"); - bgp_refresh_end(p); - break; - - default: - log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring", - p->p.name, subtype); - break; - } + bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); } @@ -1648,7 +2277,7 @@ bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len) * packet handler according to the packet type. */ static void -bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len) +bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len) { byte type = pkt[18]; @@ -1658,14 +2287,14 @@ bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len) mrt_dump_bgp_packet(conn, pkt, len); switch (type) - { - case PKT_OPEN: return bgp_rx_open(conn, pkt, len); - case PKT_UPDATE: return bgp_rx_update(conn, pkt, len); - case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len); - case PKT_KEEPALIVE: return bgp_rx_keepalive(conn); - case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len); - default: bgp_error(conn, 1, 3, pkt+18, 1); - } + { + case PKT_OPEN: return bgp_rx_open(conn, pkt, len); + case PKT_UPDATE: return bgp_rx_update(conn, pkt, len); + case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len); + case PKT_KEEPALIVE: return bgp_rx_keepalive(conn); + case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len); + default: bgp_error(conn, 1, 3, pkt+18, 1); + } } /** @@ -1682,10 +2311,9 @@ int bgp_rx(sock *sk, uint size) { struct bgp_conn *conn = sk->data; - struct bgp_proto *p = conn->bgp; byte *pkt_start = sk->rbuf; byte *end = pkt_start + size; - unsigned i, len; + uint i, len; DBG("BGP: RX hook: Got %d bytes\n", size); while (end >= pkt_start + BGP_HEADER_LENGTH) @@ -1699,7 +2327,7 @@ bgp_rx(sock *sk, uint size) break; } len = get_u16(pkt_start+16); - if (len < BGP_HEADER_LENGTH || len > bgp_max_packet_length(p)) + if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn))) { bgp_error(conn, 1, 2, pkt_start+16, 2); break; |