diff options
Diffstat (limited to 'proto/bgp')
-rw-r--r-- | proto/bgp/Makefile | 9 | ||||
-rw-r--r-- | proto/bgp/attrs.c | 2765 | ||||
-rw-r--r-- | proto/bgp/bgp.c | 1700 | ||||
-rw-r--r-- | proto/bgp/bgp.h | 507 | ||||
-rw-r--r-- | proto/bgp/config.Y | 160 | ||||
-rw-r--r-- | proto/bgp/packets.c | 3406 |
6 files changed, 5266 insertions, 3281 deletions
diff --git a/proto/bgp/Makefile b/proto/bgp/Makefile index a634cf0d..00aaef5e 100644 --- a/proto/bgp/Makefile +++ b/proto/bgp/Makefile @@ -1,5 +1,6 @@ -source=bgp.c attrs.c packets.c -root-rel=../../ -dir-name=proto/bgp +src := attrs.c bgp.c packets.c +obj := $(src-o-files) +$(all-daemon) +$(cf-local) -include ../../Rules +tests_objs := $(tests_objs) $(src-o-files)
\ No newline at end of file diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c index 73eb4040..0f41f818 100644 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@ -2,6 +2,8 @@ * BIRD -- BGP Attributes * * (c) 2000 Martin Mares <mj@ucw.cz> + * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2008--2016 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -39,888 +41,1266 @@ * specifies that such updates should be ignored, but that is generally * a bad idea. * - * Error checking of optional transitive attributes is done according to - * draft-ietf-idr-optional-transitive-03, but errors are handled always - * as withdraws. + * BGP attribute table has several hooks: * - * Unexpected AS_CONFED_* segments in AS_PATH are logged and removed, - * but unknown segments cause a session drop with Malformed AS_PATH - * error (see validate_path()). The behavior in such case is not - * explicitly specified by RFC 4271. RFC 5065 specifies that - * inconsistent AS_CONFED_* segments should cause a session drop, but - * implementations that pass invalid AS_CONFED_* segments are - * widespread. + * export - Hook that validates and normalizes attribute during export phase. + * Receives eattr, may modify it (e.g., sort community lists for canonical + * representation), UNSET() it (e.g., skip empty lists), or WITHDRAW() it if + * necessary. May assume that eattr has value valid w.r.t. its type, but may be + * invalid w.r.t. BGP constraints. Optional. * - * Error handling of AS4_* attributes is done as specified by RFC 6793. There - * are several possible inconsistencies between AGGREGATOR and AS4_AGGREGATOR - * that are not handled by that RFC, these are logged and ignored (see - * bgp_reconstruct_4b_attrs()). + * encode - Hook that converts internal representation to external one during + * packet writing. Receives eattr and puts it in the buffer (including attribute + * header). Returns number of bytes, or -1 if not enough space. May assume that + * eattr has value valid w.r.t. its type and validated by export hook. Mandatory + * for all known attributes that exist internally after export phase (i.e., all + * except pseudoattributes MP_(UN)REACH_NLRI). + * + * decode - Hook that converts external representation to internal one during + * packet parsing. Receives attribute data in buffer, validates it and adds + * attribute to ea_list. If data are invalid, steps DISCARD(), WITHDRAW() or + * bgp_parse_error() may be used to escape. Mandatory for all known attributes. + * + * format - Optional hook that converts eattr to textual representation. */ -static byte bgp_mandatory_attrs[] = { BA_ORIGIN, BA_AS_PATH -#ifndef IPV6 -,BA_NEXT_HOP -#endif +struct bgp_attr_desc { + const char *name; + uint type; + uint flags; + void (*export)(struct bgp_export_state *s, eattr *a); + int (*encode)(struct bgp_write_state *s, eattr *a, byte *buf, uint size); + void (*decode)(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to); + void (*format)(eattr *ea, byte *buf, uint size); }; -struct attr_desc { - char *name; - int expected_length; - int expected_flags; - int type; - int allow_in_ebgp; - int (*validate)(struct bgp_proto *p, byte *attr, int len); - void (*format)(eattr *ea, byte *buf, int buflen); -}; +static const struct bgp_attr_desc bgp_attr_table[]; + +static inline int bgp_attr_known(uint code); + +eattr * +bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintptr_t val) +{ + ASSERT(bgp_attr_known(code)); + + ea_list *a = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr)); + eattr *e = &a->attrs[0]; + + a->flags = EALF_SORTED; + a->count = 1; + a->next = *attrs; + *attrs = a; + + e->id = EA_CODE(EAP_BGP, code); + e->type = bgp_attr_table[code].type; + e->flags = flags; + + if (e->type & EAF_EMBEDDED) + e->u.data = (u32) val; + else + e->u.ptr = (struct adata *) val; + + return e; +} + + + +#define REPORT(msg, args...) \ + ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); }) + +#define DISCARD(msg, args...) \ + ({ REPORT(msg, ## args); return; }) -#define IGNORE -1 -#define WITHDRAW -2 +#define WITHDRAW(msg, args...) \ + ({ REPORT(msg, ## args); s->err_withdraw = 1; return; }) + +#define UNSET(a) \ + ({ a->type = EAF_TYPE_UNDEF; return; }) + +#define NEW_BGP "Discarding %s attribute received from AS4-aware neighbor" +#define BAD_EBGP "Discarding %s attribute received from EBGP neighbor" +#define BAD_LENGTH "Malformed %s attribute - invalid length (%u)" +#define BAD_VALUE "Malformed %s attribute - invalid value (%u)" +#define NO_MANDATORY "Missing mandatory %s attribute" + + +static inline int +bgp_put_attr_hdr3(byte *buf, uint code, uint flags, uint len) +{ + *buf++ = flags; + *buf++ = code; + *buf++ = len; + return 3; +} + +static inline int +bgp_put_attr_hdr4(byte *buf, uint code, uint flags, uint len) +{ + *buf++ = flags | BAF_EXT_LEN; + *buf++ = code; + put_u16(buf, len); + return 4; +} + +static inline int +bgp_put_attr_hdr(byte *buf, uint code, uint flags, uint len) +{ + if (len < 256) + return bgp_put_attr_hdr3(buf, code, flags, len); + else + return bgp_put_attr_hdr4(buf, code, flags, len); +} static int -bgp_check_origin(struct bgp_proto *p UNUSED, byte *a, int len UNUSED) +bgp_encode_u8(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) { - if (*a > 2) - return 6; - return 0; + if (size < (3+1)) + return -1; + + bgp_put_attr_hdr3(buf, EA_ID(a->id), a->flags, 1); + buf[3] = a->u.data; + + return 3+1; } -static void -bgp_format_origin(eattr *a, byte *buf, int buflen UNUSED) +static int +bgp_encode_u32(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) { - static char *bgp_origin_names[] = { "IGP", "EGP", "Incomplete" }; + if (size < (3+4)) + return -1; - bsprintf(buf, bgp_origin_names[a->u.data]); + bgp_put_attr_hdr3(buf, EA_ID(a->id), a->flags, 4); + put_u32(buf+3, a->u.data); + + return 3+4; } static int -path_segment_contains(byte *p, int bs, u32 asn) +bgp_encode_u32s(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) { - int i; - int len = p[1]; - p += 2; + uint len = a->u.ptr->length; - for(i=0; i<len; i++) - { - u32 asn2 = (bs == 4) ? get_u32(p) : get_u16(p); - if (asn2 == asn) - return 1; - p += bs; - } + if (size < (4+len)) + return -1; - return 0; + uint hdr = bgp_put_attr_hdr(buf, EA_ID(a->id), a->flags, len); + put_u32s(buf + hdr, (u32 *) a->u.ptr->data, len / 4); + + return hdr + len; } -/* Validates path attribute, removes AS_CONFED_* segments, and also returns path length */ static int -validate_path(struct bgp_proto *p, int as_path, int bs, byte *idata, uint *ilength) +bgp_put_attr(byte *buf, uint size, uint code, uint flags, byte *data, uint len) { - int res = 0; - u8 *a, *dst; - int len, plen; + if (size < (4+len)) + return -1; - dst = a = idata; - len = *ilength; + uint hdr = bgp_put_attr_hdr(buf, code, flags, len); + memcpy(buf + hdr, data, len); - while (len) - { - if (len < 2) - return -1; - - plen = 2 + bs * a[1]; - if (len < plen) - return -1; - - if (a[1] == 0) - { - log(L_WARN "%s: %s_PATH attribute contains empty segment, skipping it", - p->p.name, as_path ? "AS" : "AS4"); - goto skip; - } - - switch (a[0]) - { - case AS_PATH_SET: - res++; - break; - - case AS_PATH_SEQUENCE: - res += a[1]; - break; - - case AS_PATH_CONFED_SEQUENCE: - case AS_PATH_CONFED_SET: - if (as_path && path_segment_contains(a, bs, p->remote_as)) - { - log(L_WARN "%s: AS_CONFED_* segment with peer ASN found, misconfigured confederation?", p->p.name); - return -1; - } - - log(L_WARN "%s: %s_PATH attribute contains AS_CONFED_* segment, skipping segment", - p->p.name, as_path ? "AS" : "AS4"); - goto skip; - - default: - return -1; - } - - if (dst != a) - memmove(dst, a, plen); - dst += plen; - - skip: - len -= plen; - a += plen; - } + return hdr + len; +} - *ilength = dst - idata; - return res; +static int +bgp_encode_raw(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size) +{ + return bgp_put_attr(buf, size, EA_ID(a->id), a->flags, a->u.ptr->data, a->u.ptr->length); } -static inline int -validate_as_path(struct bgp_proto *p, byte *a, int *len) + +/* + * Attribute hooks + */ + +static void +bgp_export_origin(struct bgp_export_state *s, eattr *a) { - return validate_path(p, 1, p->as4_session ? 4 : 2, a, len); + if (a->u.data > 2) + WITHDRAW(BAD_VALUE, "ORIGIN", a->u.data); } -static inline int -validate_as4_path(struct bgp_proto *p, struct adata *path) +static void +bgp_decode_origin(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) +{ + if (len != 1) + WITHDRAW(BAD_LENGTH, "ORIGIN", len); + + if (data[0] > 2) + WITHDRAW(BAD_VALUE, "ORIGIN", data[0]); + + bgp_set_attr_u32(to, s->pool, BA_ORIGIN, flags, data[0]); +} + +static void +bgp_format_origin(eattr *a, byte *buf, uint size UNUSED) +{ + static const char *bgp_origin_names[] = { "IGP", "EGP", "Incomplete" }; + + bsprintf(buf, (a->u.data <= 2) ? bgp_origin_names[a->u.data] : "?"); +} + + +static int +bgp_encode_as_path(struct bgp_write_state *s, eattr *a, byte *buf, uint size) +{ + byte *data = a->u.ptr->data; + uint len = a->u.ptr->length; + + if (!s->as4_session) + { + /* Prepare 16-bit AS_PATH (from 32-bit one) in a temporary buffer */ + byte *src = data; + data = alloca(len); + len = as_path_32to16(data, src, len); + } + + return bgp_put_attr(buf, size, BA_AS_PATH, a->flags, data, len); +} + +static void +bgp_decode_as_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - return validate_path(p, 0, 4, path->data, &path->length); + struct bgp_proto *p = s->proto; + int as_length = s->as4_session ? 4 : 2; + int as_confed = p->cf->confederation && p->is_interior; + char err[128]; + + if (!as_path_valid(data, len, as_length, as_confed, err, sizeof(err))) + WITHDRAW("Malformed AS_PATH attribute - %s", err); + + /* In some circumstances check for initial AS_CONFED_SEQUENCE; RFC 5065 5.0 */ + if (p->is_interior && !p->is_internal && + ((len < 2) || (data[0] != AS_PATH_CONFED_SEQUENCE))) + WITHDRAW("Malformed AS_PATH attribute - %s", "missing initial AS_CONFED_SEQUENCE"); + + if (!s->as4_session) + { + /* Prepare 32-bit AS_PATH (from 16-bit one) in a temporary buffer */ + byte *src = data; + data = alloca(2*len); + len = as_path_16to32(data, src, len); + } + + bgp_set_attr_data(to, s->pool, BA_AS_PATH, flags, data, len); } + static int -bgp_check_next_hop(struct bgp_proto *p UNUSED, byte *a UNUSED6, int len UNUSED6) +bgp_encode_next_hop(struct bgp_write_state *s, eattr *a, byte *buf, uint size) { -#ifdef IPV6 - return IGNORE; -#else - ip_addr addr; + /* + * The NEXT_HOP attribute is used only in traditional (IPv4) BGP. In MP-BGP, + * the next hop is encoded as a part of the MP_REACH_NLRI attribute, so we + * store it and encode it later by AFI-specific hooks. + */ + + if (s->channel->afi == BGP_AF_IPV4) + { + ASSERT(a->u.ptr->length == sizeof(ip_addr)); - memcpy(&addr, a, len); - ipa_ntoh(addr); - if (ipa_classify(addr) & IADDR_HOST) + if (size < (3+4)) + return -1; + + bgp_put_attr_hdr3(buf, BA_NEXT_HOP, a->flags, 4); + put_ip4(buf+3, ipa_to_ip4( *(ip_addr *) a->u.ptr->data )); + + return 3+4; + } + else + { + s->mp_next_hop = a; return 0; + } +} + +static void +bgp_decode_next_hop(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data, uint len, ea_list **to UNUSED) +{ + if (len != 4) + WITHDRAW(BAD_LENGTH, "NEXT_HOP", len); + + /* Semantic checks are done later */ + s->ip_next_hop_len = len; + s->ip_next_hop_data = data; +} + +/* TODO: This function should use AF-specific hook */ +static void +bgp_format_next_hop(eattr *a, byte *buf, uint size UNUSED) +{ + ip_addr *nh = (void *) a->u.ptr->data; + uint len = a->u.ptr->length; + + ASSERT((len == 16) || (len == 32)); + + /* in IPv6, we may have two addresses in NEXT HOP */ + if ((len == 16) || ipa_zero(nh[1])) + bsprintf(buf, "%I", nh[0]); else - return 8; -#endif + bsprintf(buf, "%I %I", nh[0], nh[1]); } + static void -bgp_format_next_hop(eattr *a, byte *buf, int buflen UNUSED) +bgp_decode_med(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - ip_addr *ipp = (ip_addr *) a->u.ptr->data; -#ifdef IPV6 - /* in IPv6, we might have two addresses in NEXT HOP */ - if ((a->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(ipp[1])) - { - bsprintf(buf, "%I %I", ipp[0], ipp[1]); - return; - } -#endif + if (len != 4) + WITHDRAW(BAD_LENGTH, "MULTI_EXIT_DISC", len); - bsprintf(buf, "%I", ipp[0]); + u32 val = get_u32(data); + bgp_set_attr_u32(to, s->pool, BA_MULTI_EXIT_DISC, flags, val); } -static int -bgp_check_aggregator(struct bgp_proto *p, byte *a UNUSED, int len) + +static void +bgp_export_local_pref(struct bgp_export_state *s, eattr *a) { - int exp_len = p->as4_session ? 8 : 6; - - return (len == exp_len) ? 0 : WITHDRAW; + if (!s->proto->is_interior && !s->proto->cf->allow_local_pref) + UNSET(a); } static void -bgp_format_aggregator(eattr *a, byte *buf, int buflen UNUSED) +bgp_decode_local_pref(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - struct adata *ad = a->u.ptr; - byte *data = ad->data; - u32 as; + if (!s->proto->is_interior && !s->proto->cf->allow_local_pref) + DISCARD(BAD_EBGP, "LOCAL_PREF"); - as = get_u32(data); - data += 4; + if (len != 4) + WITHDRAW(BAD_LENGTH, "LOCAL_PREF", len); - bsprintf(buf, "%d.%d.%d.%d AS%u", data[0], data[1], data[2], data[3], as); + u32 val = get_u32(data); + bgp_set_attr_u32(to, s->pool, BA_LOCAL_PREF, flags, val); } -static int -bgp_check_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len) + +static void +bgp_decode_atomic_aggr(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to) { - return ((len % 4) == 0) ? 0 : WITHDRAW; + if (len != 0) + DISCARD(BAD_LENGTH, "ATOMIC_AGGR", len); + + bgp_set_attr_data(to, s->pool, BA_ATOMIC_AGGR, flags, NULL, 0); } static int -bgp_check_cluster_list(struct bgp_proto *p UNUSED, byte *a UNUSED, int len) +bgp_encode_aggregator(struct bgp_write_state *s, eattr *a, byte *buf, uint size) { - return ((len % 4) == 0) ? 0 : 5; + byte *data = a->u.ptr->data; + uint len = a->u.ptr->length; + + if (!s->as4_session) + { + /* Prepare 16-bit AGGREGATOR (from 32-bit one) in a temporary buffer */ + byte *src = data; + data = alloca(6); + len = aggregator_32to16(data, src); + } + + return bgp_put_attr(buf, size, BA_AGGREGATOR, a->flags, data, len); } static void -bgp_format_cluster_list(eattr *a, byte *buf, int buflen) +bgp_decode_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - /* Truncates cluster lists larger than buflen, probably not a problem */ - int_set_format(a->u.ptr, 0, -1, buf, buflen); + if (len != (s->as4_session ? 8 : 6)) + DISCARD(BAD_LENGTH, "AGGREGATOR", len); + + if (!s->as4_session) + { + /* Prepare 32-bit AGGREGATOR (from 16-bit one) in a temporary buffer */ + byte *src = data; + data = alloca(8); + len = aggregator_16to32(data, src); + } + + bgp_set_attr_data(to, s->pool, BA_AGGREGATOR, flags, data, len); } -static int -bgp_check_reach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED) +static void +bgp_format_aggregator(eattr *a, byte *buf, uint size UNUSED) { -#ifdef IPV6 - p->mp_reach_start = a; - p->mp_reach_len = len; -#endif - return IGNORE; + byte *data = a->u.ptr->data; + + bsprintf(buf, "%I4 AS%u", get_ip4(data+4), get_u32(data+0)); } -static int -bgp_check_unreach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED) + +static void +bgp_export_community(struct bgp_export_state *s, eattr *a) { -#ifdef IPV6 - p->mp_unreach_start = a; - p->mp_unreach_len = len; -#endif - return IGNORE; + if (a->u.ptr->length == 0) + UNSET(a); + + a->u.ptr = int_set_sort(s->pool, a->u.ptr); } -static int -bgp_check_ext_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len) +static void +bgp_decode_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - return ((len % 8) == 0) ? 0 : WITHDRAW; + if (!len || (len % 4)) + WITHDRAW(BAD_LENGTH, "COMMUNITY", len); + + struct adata *ad = lp_alloc_adata(s->pool, len); + get_u32s(data, (u32 *) ad->data, len / 4); + bgp_set_attr_ptr(to, s->pool, BA_COMMUNITY, flags, ad); } -static int -bgp_check_large_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len) -{ - return ((len % 12) == 0) ? 0 : WITHDRAW; -} - - -static struct attr_desc bgp_attr_table[] = { - { NULL, -1, 0, 0, 0, /* Undefined */ - NULL, NULL }, - { "origin", 1, BAF_TRANSITIVE, EAF_TYPE_INT, 1, /* BA_ORIGIN */ - bgp_check_origin, bgp_format_origin }, - { "as_path", -1, BAF_TRANSITIVE, EAF_TYPE_AS_PATH, 1, /* BA_AS_PATH */ - NULL, NULL }, /* is checked by validate_as_path() as a special case */ - { "next_hop", 4, BAF_TRANSITIVE, EAF_TYPE_IP_ADDRESS, 1, /* BA_NEXT_HOP */ - bgp_check_next_hop, bgp_format_next_hop }, - { "med", 4, BAF_OPTIONAL, EAF_TYPE_INT, 1, /* BA_MULTI_EXIT_DISC */ - NULL, NULL }, - { "local_pref", 4, BAF_TRANSITIVE, EAF_TYPE_INT, 1, /* BA_LOCAL_PREF */ - NULL, NULL }, - { "atomic_aggr", 0, BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_ATOMIC_AGGR */ - NULL, NULL }, - { "aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_AGGREGATOR */ - bgp_check_aggregator, bgp_format_aggregator }, - { "community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_INT_SET, 1, /* BA_COMMUNITY */ - bgp_check_community, NULL }, - { "originator_id", 4, BAF_OPTIONAL, EAF_TYPE_ROUTER_ID, 0, /* BA_ORIGINATOR_ID */ - NULL, NULL }, - { "cluster_list", -1, BAF_OPTIONAL, EAF_TYPE_INT_SET, 0, /* BA_CLUSTER_LIST */ - bgp_check_cluster_list, bgp_format_cluster_list }, - { .name = NULL }, /* BA_DPA */ - { .name = NULL }, /* BA_ADVERTISER */ - { .name = NULL }, /* BA_RCID_PATH */ - { "mp_reach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1, /* BA_MP_REACH_NLRI */ - bgp_check_reach_nlri, NULL }, - { "mp_unreach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1, /* BA_MP_UNREACH_NLRI */ - bgp_check_unreach_nlri, NULL }, - { "ext_community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_EC_SET, 1, /* BA_EXT_COMMUNITY */ - bgp_check_ext_community, NULL }, - { "as4_path", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_AS4_PATH */ - NULL, NULL }, - { "as4_aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_AS4_PATH */ - NULL, NULL }, - [BA_LARGE_COMMUNITY] = - { "large_community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_LC_SET, 1, - bgp_check_large_community, NULL } -}; -/* BA_AS4_PATH is type EAF_TYPE_OPAQUE and not type EAF_TYPE_AS_PATH. - * It does not matter as this attribute does not appear on routes in the routing table. - */ +static void +bgp_export_originator_id(struct bgp_export_state *s, eattr *a) +{ + if (!s->proto->is_internal) + UNSET(a); +} -#define ATTR_KNOWN(code) ((code) < ARRAY_SIZE(bgp_attr_table) && bgp_attr_table[code].name) +static void +bgp_decode_originator_id(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) +{ + if (!s->proto->is_internal) + DISCARD(BAD_EBGP, "ORIGINATOR_ID"); -static inline struct adata * -bgp_alloc_adata(struct linpool *pool, unsigned len) + if (len != 4) + WITHDRAW(BAD_LENGTH, "ORIGINATOR_ID", len); + + u32 val = get_u32(data); + bgp_set_attr_u32(to, s->pool, BA_ORIGINATOR_ID, flags, val); +} + + +static void +bgp_export_cluster_list(struct bgp_export_state *s UNUSED, eattr *a) { - struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len); - ad->length = len; - return ad; + if (!s->proto->is_internal) + UNSET(a); + + if (a->u.ptr->length == 0) + UNSET(a); } static void -bgp_set_attr(eattr *e, unsigned attr, uintptr_t val) +bgp_decode_cluster_list(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - ASSERT(ATTR_KNOWN(attr)); - e->id = EA_CODE(EAP_BGP, attr); - e->type = bgp_attr_table[attr].type; - e->flags = bgp_attr_table[attr].expected_flags; - if (e->type & EAF_EMBEDDED) - e->u.data = val; - else - e->u.ptr = (struct adata *) val; + if (!s->proto->is_internal) + DISCARD(BAD_EBGP, "CLUSTER_LIST"); + + if (!len || (len % 4)) + WITHDRAW(BAD_LENGTH, "CLUSTER_LIST", len); + + struct adata *ad = lp_alloc_adata(s->pool, len); + get_u32s(data, (u32 *) ad->data, len / 4); + bgp_set_attr_ptr(to, s->pool, BA_CLUSTER_LIST, flags, ad); } -static byte * -bgp_set_attr_wa(eattr *e, struct linpool *pool, unsigned attr, unsigned len) +static void +bgp_format_cluster_list(eattr *a, byte *buf, uint size) { - struct adata *ad = bgp_alloc_adata(pool, len); - bgp_set_attr(e, attr, (uintptr_t) ad); - return ad->data; + /* Truncates cluster lists larger than buflen, probably not a problem */ + int_set_format(a->u.ptr, 0, -1, buf, size); } -void -bgp_attach_attr(ea_list **to, struct linpool *pool, unsigned attr, uintptr_t val) + +static inline u32 +get_af3(byte *buf) { - ea_list *a = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr)); - a->next = *to; - *to = a; - a->flags = EALF_SORTED; - a->count = 1; - bgp_set_attr(a->attrs, attr, val); + return (get_u16(buf) << 16) | buf[2]; } -byte * -bgp_attach_attr_wa(ea_list **to, struct linpool *pool, unsigned attr, unsigned len) +static void +bgp_decode_mp_reach_nlri(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data, uint len, ea_list **to UNUSED) { - struct adata *ad = bgp_alloc_adata(pool, len); - bgp_attach_attr(to, pool, attr, (uintptr_t) ad); - return ad->data; + /* + * 2 B MP_REACH_NLRI data - Address Family Identifier + * 1 B MP_REACH_NLRI data - Subsequent Address Family Identifier + * 1 B MP_REACH_NLRI data - Length of Next Hop Network Address + * var MP_REACH_NLRI data - Network Address of Next Hop + * 1 B MP_REACH_NLRI data - Reserved (zero) + * var MP_REACH_NLRI data - Network Layer Reachability Information + */ + + if ((len < 5) || (len < (5 + (uint) data[3]))) + bgp_parse_error(s, 9); + + s->mp_reach_af = get_af3(data); + s->mp_next_hop_len = data[3]; + s->mp_next_hop_data = data + 4; + s->mp_reach_len = len - 5 - s->mp_next_hop_len; + s->mp_reach_nlri = data + 5 + s->mp_next_hop_len; } -static int -bgp_encode_attr_hdr(byte *dst, uint flags, unsigned code, int len) + +static void +bgp_decode_mp_unreach_nlri(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data, uint len, ea_list **to UNUSED) { - int wlen; + /* + * 2 B MP_UNREACH_NLRI data - Address Family Identifier + * 1 B MP_UNREACH_NLRI data - Subsequent Address Family Identifier + * var MP_UNREACH_NLRI data - Network Layer Reachability Information + */ - DBG("\tAttribute %02x (%d bytes, flags %02x)\n", code, len, flags); + if (len < 3) + bgp_parse_error(s, 9); + + s->mp_unreach_af = get_af3(data); + s->mp_unreach_len = len - 3; + s->mp_unreach_nlri = data + 3; +} - if (len < 256) - { - *dst++ = flags; - *dst++ = code; - *dst++ = len; - wlen = 3; - } - else - { - *dst++ = flags | BAF_EXT_LEN; - *dst++ = code; - put_u16(dst, len); - wlen = 4; - } - return wlen; +static void +bgp_export_ext_community(struct bgp_export_state *s, eattr *a) +{ + a->u.ptr = ec_set_del_nontrans(s->pool, a->u.ptr); + + if (a->u.ptr->length == 0) + UNSET(a); + + ec_set_sort_x(a->u.ptr); } static void -aggregator_convert_to_old(struct adata *aggr, byte *dst, int *new_used) +bgp_decode_ext_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - byte *src = aggr->data; - *new_used = 0; + if (!len || (len % 8)) + WITHDRAW(BAD_LENGTH, "EXT_COMMUNITY", len); - u32 as = get_u32(src); - if (as > 0xFFFF) - { - as = AS_TRANS; - *new_used = 1; - } - put_u16(dst, as); + struct adata *ad = lp_alloc_adata(s->pool, len); + get_u32s(data, (u32 *) ad->data, len / 4); + bgp_set_attr_ptr(to, s->pool, BA_EXT_COMMUNITY, flags, ad); +} + + +static void +bgp_decode_as4_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) +{ + if (s->as4_session) + DISCARD(NEW_BGP, "AS4_AGGREGATOR"); - /* Copy IPv4 address */ - memcpy(dst + 2, src + 4, 4); + if (len != 8) + DISCARD(BAD_LENGTH, "AS4_AGGREGATOR", len); + + bgp_set_attr_data(to, s->pool, BA_AS4_AGGREGATOR, flags, data, len); } static void -aggregator_convert_to_new(struct adata *aggr, byte *dst) +bgp_decode_as4_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) { - byte *src = aggr->data; + char err[128]; + + if (s->as4_session) + DISCARD(NEW_BGP, "AS4_PATH"); + + if (len < 6) + DISCARD(BAD_LENGTH, "AS4_PATH", len); + + if (!as_path_valid(data, len, 4, 1, err, sizeof(err))) + DISCARD("Malformed AS4_PATH attribute - %s", err); + + struct adata *a = lp_alloc_adata(s->pool, len); + memcpy(a->data, data, len); - u32 as = get_u16(src); - put_u32(dst, as); + /* AS_CONFED* segments are invalid in AS4_PATH; RFC 6793 6 */ + if (as_path_contains_confed(a)) + { + REPORT("Discarding AS_CONFED* segment from AS4_PATH attribute"); + a = as_path_strip_confed(s->pool, a); + } - /* Copy IPv4 address */ - memcpy(dst + 4, src + 2, 4); + bgp_set_attr_ptr(to, s->pool, BA_AS4_PATH, flags, a); +} + +static void +bgp_export_large_community(struct bgp_export_state *s, eattr *a) +{ + if (a->u.ptr->length == 0) + UNSET(a); + + a->u.ptr = lc_set_sort(s->pool, a->u.ptr); +} + +static void +bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to) +{ + if (!len || (len % 12)) + WITHDRAW(BAD_LENGTH, "LARGE_COMMUNITY", len); + + struct adata *ad = lp_alloc_adata(s->pool, len); + get_u32s(data, (u32 *) ad->data, len / 4); + bgp_set_attr_ptr(to, s->pool, BA_LARGE_COMMUNITY, flags, ad); +} + +static void +bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a) +{ + net_addr *n = s->route->net->n.addr; + u32 *labels = (u32 *) a->u.ptr->data; + uint lnum = a->u.ptr->length / 4; + + /* Perhaps we should just ignore it? */ + if (!s->mpls) + WITHDRAW("Unexpected MPLS stack"); + + /* Empty MPLS stack is not allowed */ + if (!lnum) + WITHDRAW("Malformed MPLS stack - empty"); + + /* This is ugly, but we must ensure that labels fit into NLRI field */ + if ((24*lnum + (net_is_vpn(n) ? 64 : 0) + net_pxlen(n)) > 255) + WITHDRAW("Malformed MPLS stack - too many labels (%u)", lnum); + + for (uint i = 0; i < lnum; i++) + { + if (labels[i] > 0xfffff) + WITHDRAW("Malformed MPLS stack - invalid label (%u)", labels[i]); + + /* TODO: Check for special-purpose label values? */ + } } static int -bgp_get_attr_len(eattr *a) +bgp_encode_mpls_label_stack(struct bgp_write_state *s, eattr *a, byte *buf UNUSED, uint size UNUSED) { - int len; - if (ATTR_KNOWN(EA_ID(a->id))) + /* + * MPLS labels are encoded as a part of the NLRI in MP_REACH_NLRI attribute, + * so we store MPLS_LABEL_STACK and encode it later by AFI-specific hooks. + */ + + s->mpls_labels = a->u.ptr; + return 0; +} + +static void +bgp_decode_mpls_label_stack(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data UNUSED, uint len UNUSED, ea_list **to UNUSED) +{ + DISCARD("Discarding received attribute #0"); +} + +static void +bgp_format_mpls_label_stack(eattr *a, byte *buf, uint size) +{ + u32 *labels = (u32 *) a->u.ptr->data; + uint lnum = a->u.ptr->length / 4; + char *pos = buf; + + for (uint i = 0; i < lnum; i++) + { + if (size < 20) { - int code = EA_ID(a->id); - struct attr_desc *desc = &bgp_attr_table[code]; - len = desc->expected_length; - if (len < 0) - { - ASSERT(!(a->type & EAF_EMBEDDED)); - len = a->u.ptr->length; - } + bsprintf(pos, "..."); + return; } + + uint l = bsprintf(pos, "%d/", labels[i]); + ADVANCE(pos, size, l); + } + + /* Clear last slash or terminate empty string */ + pos[lnum ? -1 : 0] = 0; +} + +static inline void +bgp_decode_unknown(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to) +{ + /* Cannot use bgp_set_attr_data() as it works on known attributes only */ + ea_set_attr_data(to, s->pool, EA_CODE(EAP_BGP, code), flags, EAF_TYPE_OPAQUE, data, len); +} + + +/* + * Attribute table + */ + +static const struct bgp_attr_desc bgp_attr_table[] = { + [BA_ORIGIN] = { + .name = "origin", + .type = EAF_TYPE_INT, + .flags = BAF_TRANSITIVE, + .export = bgp_export_origin, + .encode = bgp_encode_u8, + .decode = bgp_decode_origin, + .format = bgp_format_origin, + }, + [BA_AS_PATH] = { + .name = "as_path", + .type = EAF_TYPE_AS_PATH, + .flags = BAF_TRANSITIVE, + .encode = bgp_encode_as_path, + .decode = bgp_decode_as_path, + }, + [BA_NEXT_HOP] = { + .name = "next_hop", + .type = EAF_TYPE_IP_ADDRESS, + .flags = BAF_TRANSITIVE, + .encode = bgp_encode_next_hop, + .decode = bgp_decode_next_hop, + .format = bgp_format_next_hop, + }, + [BA_MULTI_EXIT_DISC] = { + .name = "med", + .type = EAF_TYPE_INT, + .flags = BAF_OPTIONAL, + .encode = bgp_encode_u32, + .decode = bgp_decode_med, + }, + [BA_LOCAL_PREF] = { + .name = "local_pref", + .type = EAF_TYPE_INT, + .flags = BAF_TRANSITIVE, + .export = bgp_export_local_pref, + .encode = bgp_encode_u32, + .decode = bgp_decode_local_pref, + }, + [BA_ATOMIC_AGGR] = { + .name = "atomic_aggr", + .type = EAF_TYPE_OPAQUE, + .flags = BAF_TRANSITIVE, + .encode = bgp_encode_raw, + .decode = bgp_decode_atomic_aggr, + }, + [BA_AGGREGATOR] = { + .name = "aggregator", + .type = EAF_TYPE_OPAQUE, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_aggregator, + .decode = bgp_decode_aggregator, + .format = bgp_format_aggregator, + }, + [BA_COMMUNITY] = { + .name = "community", + .type = EAF_TYPE_INT_SET, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .export = bgp_export_community, + .encode = bgp_encode_u32s, + .decode = bgp_decode_community, + }, + [BA_ORIGINATOR_ID] = { + .name = "originator_id", + .type = EAF_TYPE_ROUTER_ID, + .flags = BAF_OPTIONAL, + .export = bgp_export_originator_id, + .encode = bgp_encode_u32, + .decode = bgp_decode_originator_id, + }, + [BA_CLUSTER_LIST] = { + .name = "cluster_list", + .type = EAF_TYPE_INT_SET, + .flags = BAF_OPTIONAL, + .export = bgp_export_cluster_list, + .encode = bgp_encode_u32s, + .decode = bgp_decode_cluster_list, + .format = bgp_format_cluster_list, + }, + [BA_MP_REACH_NLRI] = { + .name = "mp_reach_nlri", + .type = EAF_TYPE_OPAQUE, + .flags = BAF_OPTIONAL, + .decode = bgp_decode_mp_reach_nlri, + }, + [BA_MP_UNREACH_NLRI] = { + .name = "mp_unreach_nlri", + .type = EAF_TYPE_OPAQUE, + .flags = BAF_OPTIONAL, + .decode = bgp_decode_mp_unreach_nlri, + }, + [BA_EXT_COMMUNITY] = { + .name = "ext_community", + .type = EAF_TYPE_EC_SET, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .export = bgp_export_ext_community, + .encode = bgp_encode_u32s, + .decode = bgp_decode_ext_community, + }, + [BA_AS4_PATH] = { + .name = "as4_path", + .type = EAF_TYPE_AS_PATH, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_raw, + .decode = bgp_decode_as4_path, + }, + [BA_AS4_AGGREGATOR] = { + .name = "as4_aggregator", + .type = EAF_TYPE_OPAQUE, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .encode = bgp_encode_raw, + .decode = bgp_decode_as4_aggregator, + .format = bgp_format_aggregator, + }, + [BA_LARGE_COMMUNITY] = { + .name = "large_community", + .type = EAF_TYPE_LC_SET, + .flags = BAF_OPTIONAL | BAF_TRANSITIVE, + .export = bgp_export_large_community, + .encode = bgp_encode_u32s, + .decode = bgp_decode_large_community, + }, + [BA_MPLS_LABEL_STACK] = { + .name = "mpls_label_stack", + .type = EAF_TYPE_INT_SET, + .export = bgp_export_mpls_label_stack, + .encode = bgp_encode_mpls_label_stack, + .decode = bgp_decode_mpls_label_stack, + .format = bgp_format_mpls_label_stack, + }, +}; + +static inline int +bgp_attr_known(uint code) +{ + return (code < ARRAY_SIZE(bgp_attr_table)) && bgp_attr_table[code].name; +} + + +/* + * Attribute export + */ + +static inline void +bgp_export_attr(struct bgp_export_state *s, eattr *a, ea_list *to) +{ + if (EA_PROTO(a->id) != EAP_BGP) + return; + + uint code = EA_ID(a->id); + + if (bgp_attr_known(code)) + { + const struct bgp_attr_desc *desc = &bgp_attr_table[code]; + + /* The flags might have been zero if the attr was added by filters */ + a->flags = (a->flags & BAF_PARTIAL) | desc->flags; + + /* Set partial bit if new opt-trans attribute is attached to non-local route */ + if ((s->src != NULL) && (a->type & EAF_ORIGINATED) && + (a->flags & BAF_OPTIONAL) && (a->flags & BAF_TRANSITIVE)) + a->flags |= BAF_PARTIAL; + + /* Call specific hook */ + CALL(desc->export, s, a); + + /* Attribute might become undefined in hook */ + if ((a->type & EAF_TYPE_MASK) == EAF_TYPE_UNDEF) + return; + } else - { - ASSERT((a->type & EAF_TYPE_MASK) == EAF_TYPE_OPAQUE); - len = a->u.ptr->length; - } - - return len; + { + /* Don't re-export unknown non-transitive attributes */ + if (!(a->flags & BAF_TRANSITIVE)) + return; + + a->flags |= BAF_PARTIAL; + } + + /* Append updated attribute */ + to->attrs[to->count++] = *a; +} + +/** + * bgp_export_attrs - export BGP attributes + * @s: BGP export state + * @attrs: a list of extended attributes + * + * The bgp_export_attrs() function takes a list of attributes and merges it to + * one newly allocated and sorted segment. Attributes are validated and + * normalized by type-specific export hooks and attribute flags are updated. + * Some attributes may be eliminated (e.g. unknown non-tranitive attributes, or + * empty community sets). + * + * Result: one sorted attribute list segment, or NULL if attributes are unsuitable. + */ +static inline ea_list * +bgp_export_attrs(struct bgp_export_state *s, ea_list *attrs) +{ + /* Merge the attribute list */ + ea_list *new = lp_alloc(s->pool, ea_scan(attrs)); + ea_merge(attrs, new); + ea_sort(new); + + uint i, count; + count = new->count; + new->count = 0; + + /* Export each attribute */ + for (i = 0; i < count; i++) + bgp_export_attr(s, &new->attrs[i], new); + + if (s->err_withdraw) + return NULL; + + return new; +} + + +/* + * Attribute encoding + */ + +static inline int +bgp_encode_attr(struct bgp_write_state *s, eattr *a, byte *buf, uint size) +{ + ASSERT(EA_PROTO(a->id) == EAP_BGP); + + uint code = EA_ID(a->id); + + if (bgp_attr_known(code)) + return bgp_attr_table[code].encode(s, a, buf, size); + else + return bgp_encode_raw(s, a, buf, size); } /** * bgp_encode_attrs - encode BGP attributes - * @p: BGP instance - * @w: buffer + * @s: BGP write state * @attrs: a list of extended attributes - * @remains: remaining space in the buffer + * @buf: buffer + * @end: buffer end * * The bgp_encode_attrs() function takes a list of extended attributes * and converts it to its BGP representation (a part of an Update message). * * Result: Length of the attribute block generated or -1 if not enough space. */ -uint -bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains) +int +bgp_encode_attrs(struct bgp_write_state *s, ea_list *attrs, byte *buf, byte *end) { - uint i, code, type, flags; - byte *start = w; - int len, rv; + byte *pos = buf; + int i, len; - for(i=0; i<attrs->count; i++) - { - eattr *a = &attrs->attrs[i]; - ASSERT(EA_PROTO(a->id) == EAP_BGP); - code = EA_ID(a->id); - -#ifdef IPV6 - /* When talking multiprotocol BGP, the NEXT_HOP attributes are used only temporarily. */ - if (code == BA_NEXT_HOP) - continue; -#endif - - /* When AS4-aware BGP speaker is talking to non-AS4-aware BGP speaker, - * we have to convert our 4B AS_PATH to 2B AS_PATH and send our AS_PATH - * as optional AS4_PATH attribute. - */ - if ((code == BA_AS_PATH) && (! p->as4_session)) - { - len = a->u.ptr->length; - - if (remains < (len + 4)) - goto err_no_buffer; - - /* Using temporary buffer because don't know a length of created attr - * and therefore a length of a header. Perhaps i should better always - * use BAF_EXT_LEN. */ - - byte buf[len]; - int new_used; - int nl = as_path_convert_to_old(a->u.ptr, buf, &new_used); - - DBG("BGP: Encoding old AS_PATH\n"); - rv = bgp_encode_attr_hdr(w, BAF_TRANSITIVE, BA_AS_PATH, nl); - ADVANCE(w, remains, rv); - memcpy(w, buf, nl); - ADVANCE(w, remains, nl); - - if (! new_used) - continue; - - if (remains < (len + 4)) - goto err_no_buffer; - - /* We should discard AS_CONFED_SEQUENCE or AS_CONFED_SET path segments - * here but we don't support confederations and such paths we already - * discarded in bgp_check_as_path(). - */ - - DBG("BGP: Encoding AS4_PATH\n"); - rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_PATH, len); - ADVANCE(w, remains, rv); - memcpy(w, a->u.ptr->data, len); - ADVANCE(w, remains, len); - - continue; - } - - /* The same issue with AGGREGATOR attribute */ - if ((code == BA_AGGREGATOR) && (! p->as4_session)) - { - int new_used; - - len = 6; - if (remains < (len + 3)) - goto err_no_buffer; - - rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AGGREGATOR, len); - ADVANCE(w, remains, rv); - aggregator_convert_to_old(a->u.ptr, w, &new_used); - ADVANCE(w, remains, len); - - if (! new_used) - continue; - - len = 8; - if (remains < (len + 3)) - goto err_no_buffer; - - rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_AGGREGATOR, len); - ADVANCE(w, remains, rv); - memcpy(w, a->u.ptr->data, len); - ADVANCE(w, remains, len); - - continue; - } - - /* Standard path continues here ... */ - - type = a->type & EAF_TYPE_MASK; - flags = a->flags & (BAF_OPTIONAL | BAF_TRANSITIVE | BAF_PARTIAL); - len = bgp_get_attr_len(a); - - /* Skip empty sets */ - if (((type == EAF_TYPE_INT_SET) || (type == EAF_TYPE_EC_SET) || (type == EAF_TYPE_LC_SET)) && (len == 0)) - continue; - - if (remains < len + 4) - goto err_no_buffer; - - rv = bgp_encode_attr_hdr(w, flags, code, len); - ADVANCE(w, remains, rv); - - switch (type) - { - case EAF_TYPE_INT: - case EAF_TYPE_ROUTER_ID: - if (len == 4) - put_u32(w, a->u.data); - else - *w = a->u.data; - break; - case EAF_TYPE_IP_ADDRESS: - { - ip_addr ip = *(ip_addr *)a->u.ptr->data; - ipa_hton(ip); - memcpy(w, &ip, len); - break; - } - case EAF_TYPE_INT_SET: - case EAF_TYPE_LC_SET: - case EAF_TYPE_EC_SET: - { - u32 *z = int_set_get_data(a->u.ptr); - int i; - for(i=0; i<len; i+=4) - put_u32(w+i, *z++); - break; - } - case EAF_TYPE_OPAQUE: - case EAF_TYPE_AS_PATH: - memcpy(w, a->u.ptr->data, len); - break; - default: - bug("bgp_encode_attrs: unknown attribute type %02x", a->type); - } - ADVANCE(w, remains, len); - } - return w - start; + for (i = 0; i < attrs->count; i++) + { + len = bgp_encode_attr(s, &attrs->attrs[i], pos, end - pos); + + if (len < 0) + return -1; + + pos += len; + } - err_no_buffer: - return -1; + return pos - buf; } + /* -static void -bgp_init_prefix(struct fib_node *N) + * Attribute decoding + */ + +static void bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool); + +static inline int +bgp_as_path_loopy(struct bgp_proto *p, ea_list *attrs, u32 asn) { - struct bgp_prefix *p = (struct bgp_prefix *) N; - p->bucket_node.next = NULL; + eattr *e = bgp_find_attr(attrs, BA_AS_PATH); + int num = p->cf->allow_local_as + 1; + return (e && (num > 0) && as_path_contains(e->u.ptr, asn, num)); } -*/ -static int -bgp_compare_u32(const u32 *x, const u32 *y) +static inline int +bgp_originator_id_loopy(struct bgp_proto *p, ea_list *attrs) { - return (*x < *y) ? -1 : (*x > *y) ? 1 : 0; + eattr *e = bgp_find_attr(attrs, BA_ORIGINATOR_ID); + return (e && (e->u.data == p->local_id)); } -static inline void -bgp_normalize_int_set(u32 *dest, u32 *src, unsigned cnt) +static inline int +bgp_cluster_list_loopy(struct bgp_proto *p, ea_list *attrs) { - memcpy(dest, src, sizeof(u32) * cnt); - qsort(dest, cnt, sizeof(u32), (int(*)(const void *, const void *)) bgp_compare_u32); + eattr *e = bgp_find_attr(attrs, BA_CLUSTER_LIST); + return (e && int_set_contains(e->u.ptr, p->rr_cluster_id)); } -static int -bgp_compare_ec(const u32 *xp, const u32 *yp) +static inline void +bgp_decode_attr(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to) { - u64 x = ec_get(xp, 0); - u64 y = ec_get(yp, 0); - return (x < y) ? -1 : (x > y) ? 1 : 0; + /* Handle duplicate attributes; RFC 7606 3 (g) */ + if (BIT32_TEST(s->attrs_seen, code)) + { + if ((code == BA_MP_REACH_NLRI) || (code == BA_MP_UNREACH_NLRI)) + bgp_parse_error(s, 1); + else + DISCARD("Discarding duplicate attribute (code %u)", code); + } + BIT32_SET(s->attrs_seen, code); + + if (bgp_attr_known(code)) + { + const struct bgp_attr_desc *desc = &bgp_attr_table[code]; + + /* Handle conflicting flags; RFC 7606 3 (c) */ + if ((flags ^ desc->flags) & (BAF_OPTIONAL | BAF_TRANSITIVE)) + WITHDRAW("Malformed %s attribute - conflicting flags (%02x)", desc->name, flags); + + desc->decode(s, code, flags, data, len, to); + } + else /* Unknown attribute */ + { + if (!(flags & BAF_OPTIONAL)) + WITHDRAW("Unknown attribute (code %u) - conflicting flags (%02x)", code, flags); + + bgp_decode_unknown(s, code, flags, data, len, to); + } } -static inline void -bgp_normalize_ec_set(struct adata *ad, u32 *src, int internal) +/** + * bgp_decode_attrs - check and decode BGP attributes + * @s: BGP parse state + * @data: start of attribute block + * @len: length of attribute block + * + * This function takes a BGP attribute block (a part of an Update message), checks + * its consistency and converts it to a list of BIRD route attributes represented + * by an (uncached) &rta. + */ +ea_list * +bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len) { - u32 *dst = int_set_get_data(ad); + struct bgp_proto *p = s->proto; + ea_list *attrs = NULL; + uint code, flags, alen; + byte *pos = data; - /* Remove non-transitive communities (EC_TBIT active) on external sessions */ - if (! internal) + /* Parse the attributes */ + while (len) + { + alen = 0; + + /* Read attribute type */ + if (len < 2) + goto framing_error; + flags = pos[0]; + code = pos[1]; + ADVANCE(pos, len, 2); + + /* Read attribute length */ + if (flags & BAF_EXT_LEN) { - int len = int_set_get_size(ad); - u32 *t = dst; - int i; - - for (i=0; i < len; i += 2) - { - if (src[i] & EC_TBIT) - continue; - - *t++ = src[i]; - *t++ = src[i+1]; - } - - ad->length = (t - dst) * 4; + if (len < 2) + goto framing_error; + alen = get_u16(pos); + ADVANCE(pos, len, 2); + } + else + { + if (len < 1) + goto framing_error; + alen = *pos; + ADVANCE(pos, len, 1); } - else - memcpy(dst, src, ad->length); - qsort(dst, ad->length / 8, 8, (int(*)(const void *, const void *)) bgp_compare_ec); -} + if (alen > len) + goto framing_error; -static int -bgp_compare_lc(const u32 *x, const u32 *y) -{ - if (x[0] != y[0]) - return (x[0] > y[0]) ? 1 : -1; - if (x[1] != y[1]) - return (x[1] > y[1]) ? 1 : -1; - if (x[2] != y[2]) - return (x[2] > y[2]) ? 1 : -1; - return 0; + DBG("Attr %02x %02x %u\n", code, flags, alen); + + bgp_decode_attr(s, code, flags, pos, alen, &attrs); + ADVANCE(pos, len, alen); + } + + if (s->err_withdraw) + goto withdraw; + + /* If there is no reachability NLRI, we are finished */ + if (!s->ip_reach_len && !s->mp_reach_len) + return NULL; + + + /* Handle missing mandatory attributes; RFC 7606 3 (d) */ + if (!BIT32_TEST(s->attrs_seen, BA_ORIGIN)) + { REPORT(NO_MANDATORY, "ORIGIN"); goto withdraw; } + + if (!BIT32_TEST(s->attrs_seen, BA_AS_PATH)) + { REPORT(NO_MANDATORY, "AS_PATH"); goto withdraw; } + + /* When receiving attributes from non-AS4-aware BGP speaker, we have to + reconstruct AS_PATH and AGGREGATOR attributes; RFC 6793 4.2.3 */ + if (!p->as4_session) + bgp_process_as4_attrs(&attrs, s->pool); + + /* Reject routes with our ASN in AS_PATH attribute */ + if (bgp_as_path_loopy(p, attrs, p->local_as)) + goto withdraw; + + /* Reject routes with our Confederation ID in AS_PATH attribute; RFC 5065 4.0 */ + if ((p->public_as != p->local_as) && bgp_as_path_loopy(p, attrs, p->public_as)) + goto withdraw; + + /* Reject routes with our Router ID in ORIGINATOR_ID attribute; RFC 4456 8 */ + if (p->is_internal && bgp_originator_id_loopy(p, attrs)) + goto withdraw; + + /* Reject routes with our Cluster ID in CLUSTER_LIST attribute; RFC 4456 8 */ + if (p->rr_client && bgp_cluster_list_loopy(p, attrs)) + goto withdraw; + + /* If there is no local preference, define one */ + if (!BIT32_TEST(s->attrs_seen, BA_LOCAL_PREF)) + bgp_set_attr_u32(&attrs, s->pool, BA_LOCAL_PREF, 0, p->cf->default_local_pref); + + return attrs; + + +framing_error: + /* RFC 7606 4 - handle attribute framing errors */ + REPORT("Malformed attribute list - framing error (%u/%u) at %d", + alen, len, (int) (pos - s->attrs)); + +withdraw: + /* RFC 7606 5.2 - handle missing NLRI during errors */ + if (!s->ip_reach_len && !s->mp_reach_len) + bgp_parse_error(s, 1); + + s->err_withdraw = 1; + return NULL; } -static inline void -bgp_normalize_lc_set(u32 *dest, u32 *src, unsigned cnt) + +/* + * Route bucket hash table + */ + +#define RBH_KEY(b) b->eattrs, b->hash +#define RBH_NEXT(b) b->next +#define RBH_EQ(a1,h1,a2,h2) h1 == h2 && ea_same(a1, a2) +#define RBH_FN(a,h) h + +#define RBH_REHASH bgp_rbh_rehash +#define RBH_PARAMS /8, *2, 2, 2, 8, 20 + + +HASH_DEFINE_REHASH_FN(RBH, struct bgp_bucket) + +void +bgp_init_bucket_table(struct bgp_channel *c) { - memcpy(dest, src, LCOMM_LENGTH * cnt); - qsort(dest, cnt, LCOMM_LENGTH, (int(*)(const void *, const void *)) bgp_compare_lc); + HASH_INIT(c->bucket_hash, c->pool, 8); + + init_list(&c->bucket_queue); + c->withdraw_bucket = NULL; } -static void -bgp_rehash_buckets(struct bgp_proto *p) +void +bgp_free_bucket_table(struct bgp_channel *c) { - struct bgp_bucket **old = p->bucket_hash; - struct bgp_bucket **new; - unsigned oldn = p->hash_size; - unsigned i, e, mask; + HASH_FREE(c->bucket_hash); + struct bgp_bucket *b; + WALK_LIST_FIRST(b, c->bucket_queue) + { + rem_node(&b->send_node); + mb_free(b); + } - p->hash_size = p->hash_limit; - DBG("BGP: Rehashing bucket table from %d to %d\n", oldn, p->hash_size); - p->hash_limit *= 4; - if (p->hash_limit >= 65536) - p->hash_limit = ~0; - new = p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *)); - mask = p->hash_size - 1; - for (i=0; i<oldn; i++) - while (b = old[i]) - { - old[i] = b->hash_next; - e = b->hash & mask; - b->hash_next = new[e]; - if (b->hash_next) - b->hash_next->hash_prev = b; - b->hash_prev = NULL; - new[e] = b; - } - mb_free(old); + mb_free(c->withdraw_bucket); + c->withdraw_bucket = NULL; } static struct bgp_bucket * -bgp_new_bucket(struct bgp_proto *p, ea_list *new, unsigned hash) +bgp_get_bucket(struct bgp_channel *c, ea_list *new) { - struct bgp_bucket *b; - unsigned ea_size = sizeof(ea_list) + new->count * sizeof(eattr); - unsigned ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN); - unsigned size = sizeof(struct bgp_bucket) + ea_size_aligned; - unsigned i; + /* Hash and lookup */ + u32 hash = ea_hash(new); + struct bgp_bucket *b = HASH_FIND(c->bucket_hash, RBH, new, hash); + + if (b) + return b; + + uint ea_size = sizeof(ea_list) + new->count * sizeof(eattr); + uint ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN); + uint size = sizeof(struct bgp_bucket) + ea_size_aligned; + uint i; byte *dest; - unsigned index = hash & (p->hash_size - 1); /* Gather total size of non-inline attributes */ - for (i=0; i<new->count; i++) - { - eattr *a = &new->attrs[i]; - if (!(a->type & EAF_EMBEDDED)) - size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN); - } + for (i = 0; i < new->count; i++) + { + eattr *a = &new->attrs[i]; - /* Create the bucket and hash it */ - b = mb_alloc(p->p.pool, size); - b->hash_next = p->bucket_hash[index]; - if (b->hash_next) - b->hash_next->hash_prev = b; - p->bucket_hash[index] = b; - b->hash_prev = NULL; - b->hash = hash; - add_tail(&p->bucket_queue, &b->send_node); + if (!(a->type & EAF_EMBEDDED)) + size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN); + } + + /* Create the bucket */ + b = mb_alloc(c->pool, size); init_list(&b->prefixes); + b->hash = hash; + + /* Copy list of extended attributes */ memcpy(b->eattrs, new, ea_size); - dest = ((byte *)b->eattrs) + ea_size_aligned; + dest = ((byte *) b->eattrs) + ea_size_aligned; /* Copy values of non-inline attributes */ - for (i=0; i<new->count; i++) + for (i = 0; i < new->count; i++) + { + eattr *a = &b->eattrs->attrs[i]; + + if (!(a->type & EAF_EMBEDDED)) { - eattr *a = &b->eattrs->attrs[i]; - if (!(a->type & EAF_EMBEDDED)) - { - struct adata *oa = a->u.ptr; - struct adata *na = (struct adata *) dest; - memcpy(na, oa, sizeof(struct adata) + oa->length); - a->u.ptr = na; - dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN); - } + struct adata *oa = a->u.ptr; + struct adata *na = (struct adata *) dest; + memcpy(na, oa, sizeof(struct adata) + oa->length); + a->u.ptr = na; + dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN); } + } - /* If needed, rehash */ - p->hash_count++; - if (p->hash_count > p->hash_limit) - bgp_rehash_buckets(p); + /* Insert the bucket to send queue and bucket hash */ + add_tail(&c->bucket_queue, &b->send_node); + HASH_INSERT2(c->bucket_hash, RBH, c->pool, b); return b; } static struct bgp_bucket * -bgp_get_bucket(struct bgp_proto *p, net *n, ea_list *attrs, int originate) +bgp_get_withdraw_bucket(struct bgp_channel *c) { - ea_list *new; - unsigned i, cnt, hash, code; - eattr *a, *d; - u32 seen = 0; - struct bgp_bucket *b; - - /* Merge the attribute list */ - new = alloca(ea_scan(attrs)); - ea_merge(attrs, new); - ea_sort(new); + if (!c->withdraw_bucket) + { + c->withdraw_bucket = mb_allocz(c->pool, sizeof(struct bgp_bucket)); + init_list(&c->withdraw_bucket->prefixes); + } - /* Normalize attributes */ - d = new->attrs; - cnt = new->count; - new->count = 0; - for(i=0; i<cnt; i++) - { - a = &new->attrs[i]; - if (EA_PROTO(a->id) != EAP_BGP) - continue; - code = EA_ID(a->id); - if (ATTR_KNOWN(code)) - { - if (!p->is_internal) - { - if (!bgp_attr_table[code].allow_in_ebgp) - continue; - if ((code == BA_LOCAL_PREF) && !p->cf->allow_local_pref) - continue; - } - /* The flags might have been zero if the attr was added by filters */ - a->flags = (a->flags & BAF_PARTIAL) | bgp_attr_table[code].expected_flags; - if (code < 32) - seen |= 1 << code; - } - else - { - /* Don't re-export unknown non-transitive attributes */ - if (!(a->flags & BAF_TRANSITIVE)) - continue; - } - *d = *a; - if ((d->type & EAF_ORIGINATED) && !originate && (d->flags & BAF_TRANSITIVE) && (d->flags & BAF_OPTIONAL)) - d->flags |= BAF_PARTIAL; - switch (d->type & EAF_TYPE_MASK) - { - case EAF_TYPE_INT_SET: - { - struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length); - z->length = d->u.ptr->length; - bgp_normalize_int_set((u32 *) z->data, (u32 *) d->u.ptr->data, z->length / 4); - d->u.ptr = z; - break; - } - case EAF_TYPE_EC_SET: - { - struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length); - z->length = d->u.ptr->length; - bgp_normalize_ec_set(z, (u32 *) d->u.ptr->data, p->is_internal); - d->u.ptr = z; - break; - } - case EAF_TYPE_LC_SET: - { - struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length); - z->length = d->u.ptr->length; - bgp_normalize_lc_set((u32 *) z->data, (u32 *) d->u.ptr->data, z->length / LCOMM_LENGTH); - d->u.ptr = z; - break; - } - default: ; - } - d++; - new->count++; - } + return c->withdraw_bucket; +} - /* Hash */ - hash = ea_hash(new); - for(b=p->bucket_hash[hash & (p->hash_size - 1)]; b; b=b->hash_next) - if (b->hash == hash && ea_same(b->eattrs, new)) - { - DBG("Found bucket.\n"); - return b; - } - - /* Ensure that there are all mandatory attributes */ - for(i=0; i<ARRAY_SIZE(bgp_mandatory_attrs); i++) - if (!(seen & (1 << bgp_mandatory_attrs[i]))) - { - log(L_ERR "%s: Mandatory attribute %s missing in route %I/%d", p->p.name, bgp_attr_table[bgp_mandatory_attrs[i]].name, n->n.prefix, n->n.pxlen); - return NULL; - } - - /* Check if next hop is valid */ - a = ea_find(new, EA_CODE(EAP_BGP, BA_NEXT_HOP)); - if (!a || ipa_equal(p->cf->remote_ip, *(ip_addr *)a->u.ptr->data)) - { - log(L_ERR "%s: Invalid NEXT_HOP attribute in route %I/%d", p->p.name, n->n.prefix, n->n.pxlen); - return NULL; - } +void +bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b) +{ + rem_node(&b->send_node); + HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b); + mb_free(b); +} - /* Create new bucket */ - DBG("Creating bucket.\n"); - return bgp_new_bucket(p, new, hash); +void +bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b) +{ + rem_node(&b->send_node); + add_tail(&c->bucket_queue, &b->send_node); } void -bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck) +bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b) { - if (buck->hash_next) - buck->hash_next->hash_prev = buck->hash_prev; - if (buck->hash_prev) - buck->hash_prev->hash_next = buck->hash_next; - else - p->bucket_hash[buck->hash & (p->hash_size-1)] = buck->hash_next; - mb_free(buck); + struct bgp_proto *p = (void *) c->c.proto; + struct bgp_bucket *wb = bgp_get_withdraw_bucket(c); + + log(L_ERR "%s: Attribute list too long", p->p.name); + while (!EMPTY_LIST(b->prefixes)) + { + struct bgp_prefix *px = HEAD(b->prefixes); + + log(L_ERR "%s: - withdrawing %N", p->p.name, &px->net); + rem_node(&px->buck_node); + add_tail(&wb->prefixes, &px->buck_node); + } } -/* Prefix hash table */ +/* + * Prefix hash table + */ -#define PXH_KEY(n1) n1->n.prefix, n1->n.pxlen, n1->path_id -#define PXH_NEXT(n) n->next -#define PXH_EQ(p1,l1,i1,p2,l2,i2) ipa_equal(p1, p2) && l1 == l2 && i1 == i2 -#define PXH_FN(p,l,i) ipa_hash32(p) ^ u32_hash((l << 16) ^ i) +#define PXH_KEY(px) px->net, px->path_id, px->hash +#define PXH_NEXT(px) px->next +#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && i1 == i2 && net_equal(n1, n2) +#define PXH_FN(n,i,h) h #define PXH_REHASH bgp_pxh_rehash #define PXH_PARAMS /8, *2, 2, 2, 8, 20 @@ -929,308 +1309,282 @@ bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck) HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix) void -bgp_init_prefix_table(struct bgp_proto *p, u32 order) +bgp_init_prefix_table(struct bgp_channel *c) { - HASH_INIT(p->prefix_hash, p->p.pool, order); + HASH_INIT(c->prefix_hash, c->pool, 8); - p->prefix_slab = sl_new(p->p.pool, sizeof(struct bgp_prefix)); + uint alen = net_addr_length[c->c.net_type]; + c->prefix_slab = alen ? sl_new(c->pool, sizeof(struct bgp_prefix) + alen) : NULL; } void -bgp_free_prefix_table(struct bgp_proto *p) +bgp_free_prefix_table(struct bgp_channel *c) { - HASH_FREE(p->prefix_hash); + HASH_FREE(c->prefix_hash); - rfree(p->prefix_slab); - p->prefix_slab = NULL; + rfree(c->prefix_slab); + c->prefix_slab = NULL; } static struct bgp_prefix * -bgp_get_prefix(struct bgp_proto *p, ip_addr prefix, int pxlen, u32 path_id) +bgp_get_prefix(struct bgp_channel *c, net_addr *net, u32 path_id) { - struct bgp_prefix *bp = HASH_FIND(p->prefix_hash, PXH, prefix, pxlen, path_id); + u32 hash = net_hash(net) ^ u32_hash(path_id); + struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id, hash); - if (bp) - return bp; + if (px) + { + rem_node(&px->buck_node); + return px; + } - bp = sl_alloc(p->prefix_slab); - bp->n.prefix = prefix; - bp->n.pxlen = pxlen; - bp->path_id = path_id; - bp->bucket_node.next = NULL; + if (c->prefix_slab) + px = sl_alloc(c->prefix_slab); + else + px = mb_alloc(c->pool, sizeof(struct bgp_prefix) + net->length); - HASH_INSERT2(p->prefix_hash, PXH, p->p.pool, bp); + px->buck_node.next = NULL; + px->buck_node.prev = NULL; + px->hash = hash; + px->path_id = path_id; + net_copy(px->net, net); - return bp; + HASH_INSERT2(c->prefix_hash, PXH, c->pool, px); + + return px; } void -bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp) +bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px) { - HASH_REMOVE2(p->prefix_hash, PXH, p->p.pool, bp); - sl_free(p->prefix_slab, bp); + rem_node(&px->buck_node); + HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px); + + if (c->prefix_slab) + sl_free(c->prefix_slab, px); + else + mb_free(px); } -void -bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs) +/* + * BGP protocol glue + */ + +int +bgp_import_control(struct proto *P, rte **new, ea_list **attrs UNUSED, struct linpool *pool UNUSED) { + rte *e = *new; + struct proto *SRC = e->attrs->src->proto; struct bgp_proto *p = (struct bgp_proto *) P; - struct bgp_bucket *buck; - struct bgp_prefix *px; - rte *key; - u32 path_id; + struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (struct bgp_proto *) SRC : NULL; - DBG("BGP: Got route %I/%d %s\n", n->n.prefix, n->n.pxlen, new ? "up" : "down"); + /* Reject our routes */ + if (src == p) + return -1; - if (new) - { - key = new; - buck = bgp_get_bucket(p, n, attrs, new->attrs->source != RTS_BGP); - if (!buck) /* Inconsistent attribute list */ - return; - } - else - { - key = old; - if (!(buck = p->withdraw_bucket)) - { - buck = p->withdraw_bucket = mb_alloc(P->pool, sizeof(struct bgp_bucket)); - init_list(&buck->prefixes); - } - } - path_id = p->add_path_tx ? key->attrs->src->global_id : 0; - px = bgp_get_prefix(p, n->n.prefix, n->n.pxlen, path_id); - if (px->bucket_node.next) - { - DBG("\tRemoving old entry.\n"); - rem_node(&px->bucket_node); - } - add_tail(&buck->prefixes, &px->bucket_node); - bgp_schedule_packet(p->conn, PKT_UPDATE); -} + /* Accept non-BGP routes */ + if (src == NULL) + return 0; -static int -bgp_create_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool) -{ - ea_list *ea = lp_alloc(pool, sizeof(ea_list) + 4*sizeof(eattr)); - rta *rta = e->attrs; - byte *z; + // XXXX: Check next hop AF - ea->next = *attrs; - *attrs = ea; - ea->flags = EALF_SORTED; - ea->count = 4; + /* IBGP route reflection, RFC 4456 */ + if (p->is_internal && src->is_internal && (p->local_as == src->local_as)) + { + /* Rejected unless configured as route reflector */ + if (!p->rr_client && !src->rr_client) + return -1; + + /* Generally, this should be handled when path is received, but we check it + also here as rr_cluster_id may be undefined or different in src. */ + if (p->rr_cluster_id && bgp_cluster_list_loopy(p, e->attrs->eattrs)) + return -1; + } - bgp_set_attr(ea->attrs, BA_ORIGIN, - ((rta->source == RTS_OSPF_EXT1) || (rta->source == RTS_OSPF_EXT2)) ? ORIGIN_INCOMPLETE : ORIGIN_IGP); + /* Handle well-known communities, RFC 1997 */ + struct eattr *c; + if (p->cf->interpret_communities && + (c = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_COMMUNITY)))) + { + struct adata *d = c->u.ptr; - if (p->is_internal) - bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 0); - else - { - z = bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 6); - z[0] = AS_PATH_SEQUENCE; - z[1] = 1; /* 1 AS */ - put_u32(z+2, p->local_as); - } + /* Do not export anywhere */ + if (int_set_contains(d, BGP_COMM_NO_ADVERTISE)) + return -1; - /* iBGP -> use gw, eBGP multi-hop -> use source_addr, - eBGP single-hop -> use gw if on the same iface */ - z = bgp_set_attr_wa(ea->attrs+2, pool, BA_NEXT_HOP, NEXT_HOP_LENGTH); - if (p->cf->next_hop_self || - rta->dest != RTD_ROUTER || - ipa_equal(rta->gw, IPA_NONE) || - ipa_is_link_local(rta->gw) || - (!p->is_internal && !p->cf->next_hop_keep && - (!p->neigh || (rta->iface != p->neigh->iface)))) - set_next_hop(z, p->source_addr); - else - set_next_hop(z, rta->gw); + /* Do not export outside of AS (or member-AS) */ + if (!p->is_internal && int_set_contains(d, BGP_COMM_NO_EXPORT_SUBCONFED)) + return -1; - bgp_set_attr(ea->attrs+3, BA_LOCAL_PREF, p->cf->default_local_pref); + /* Do not export outside of AS (or confederation) */ + if (!p->is_interior && int_set_contains(d, BGP_COMM_NO_EXPORT)) + return -1; + } - return 0; /* Leave decision to the filters */ + return 0; } -static inline int -bgp_as_path_loopy(struct bgp_proto *p, rta *a) -{ - int num = p->cf->allow_local_as + 1; - eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - return (e && (num > 0) && as_path_contains(e->u.ptr, p->local_as, num)); -} - -static inline int -bgp_originator_id_loopy(struct bgp_proto *p, rta *a) -{ - eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID)); - return (e && (e->u.data == p->local_id)); -} +static adata null_adata; /* adata of length 0 */ -static inline int -bgp_cluster_list_loopy(struct bgp_proto *p, rta *a) +static ea_list * +bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *attrs0, struct linpool *pool) { - eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST)); - return (e && p->rr_client && int_set_contains(e->u.ptr, p->rr_cluster_id)); -} + struct proto *SRC = e->attrs->src->proto; + struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (void *) SRC : NULL; + struct bgp_export_state s = { .proto = p, .channel = c, .pool = pool, .src = src, .route = e, .mpls = c->desc->mpls }; + ea_list *attrs = attrs0; + eattr *a; + adata *ad; + /* ORIGIN attribute - mandatory, attach if missing */ + if (! bgp_find_attr(attrs0, BA_ORIGIN)) + bgp_set_attr_u32(&attrs, pool, BA_ORIGIN, 0, src ? ORIGIN_INCOMPLETE : ORIGIN_IGP); -static inline void -bgp_path_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 as) -{ - eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - bgp_attach_attr(attrs, pool, BA_AS_PATH, (uintptr_t) as_path_prepend(pool, a->u.ptr, as)); -} + /* AS_PATH attribute - mandatory */ + a = bgp_find_attr(attrs0, BA_AS_PATH); + ad = a ? a->u.ptr : &null_adata; -static inline void -bgp_cluster_list_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 cid) -{ - eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST)); - bgp_attach_attr(attrs, pool, BA_CLUSTER_LIST, (uintptr_t) int_set_prepend(pool, a ? a->u.ptr : NULL, cid)); -} + /* AS_PATH attribute - strip AS_CONFED* segments outside confederation */ + if ((!p->cf->confederation || !p->is_interior) && as_path_contains_confed(ad)) + ad = as_path_strip_confed(pool, ad); -static int -bgp_update_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool, int rr) -{ - eattr *a; + /* AS_PATH attribute - keep or prepend ASN */ + if (p->is_internal || + (p->rs_client && src && src->rs_client)) + { + /* IBGP or route server -> just ensure there is one */ + if (!a) + bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, &null_adata); + } + else if (p->is_interior) + { + /* Confederation -> prepend ASN as AS_CONFED_SEQUENCE */ + ad = as_path_prepend2(pool, ad, AS_PATH_CONFED_SEQUENCE, p->public_as); + bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, ad); + } + else /* Regular EBGP (no RS, no confederation) */ + { + /* Regular EBGP -> prepend ASN as regular sequence */ + ad = as_path_prepend2(pool, ad, AS_PATH_SEQUENCE, p->public_as); + bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, ad); + + /* MULTI_EXIT_DESC attribute - accept only if set in export filter */ + a = bgp_find_attr(attrs0, BA_MULTI_EXIT_DISC); + if (a && !(a->type & EAF_FRESH)) + bgp_unset_attr(&attrs, pool, BA_MULTI_EXIT_DISC); + } - if (!p->is_internal && !p->rs_client) - { - bgp_path_prepend(e, attrs, pool, p->local_as); - - /* The MULTI_EXIT_DISC attribute received from a neighboring AS MUST NOT be - * propagated to other neighboring ASes. - * Perhaps it would be better to undefine it. - */ - a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); - if (a) - bgp_attach_attr(attrs, pool, BA_MULTI_EXIT_DISC, 0); - } + /* NEXT_HOP attribute - delegated to AF-specific hook */ + a = bgp_find_attr(attrs0, BA_NEXT_HOP); + bgp_update_next_hop(&s, a, &attrs); - /* iBGP -> keep next_hop, eBGP multi-hop -> use source_addr, - * eBGP single-hop -> keep next_hop if on the same iface. - * If the next_hop is zero (i.e. link-local), keep only if on the same iface. - * - * Note that same-iface-check uses iface from route, which is based on gw. - */ - a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP)); - if (a && !p->cf->next_hop_self && - (p->cf->next_hop_keep || - (p->is_internal && ipa_nonzero(*((ip_addr *) a->u.ptr->data))) || - (p->neigh && (e->attrs->iface == p->neigh->iface)))) - { - /* Leave the original next hop attribute, will check later where does it point */ - } - else - { - /* Need to create new one */ - byte *b = bgp_attach_attr_wa(attrs, pool, BA_NEXT_HOP, NEXT_HOP_LENGTH); - set_next_hop(b, p->source_addr); - } + /* LOCAL_PREF attribute - required for IBGP, attach if missing */ + if (p->is_interior && ! bgp_find_attr(attrs0, BA_LOCAL_PREF)) + bgp_set_attr_u32(&attrs, pool, BA_LOCAL_PREF, 0, p->cf->default_local_pref); - if (rr) - { - /* Handling route reflection, RFC 4456 */ - struct bgp_proto *src = (struct bgp_proto *) e->attrs->src->proto; + /* IBGP route reflection, RFC 4456 */ + if (src && src->is_internal && p->is_internal && (src->local_as == p->local_as)) + { + /* ORIGINATOR_ID attribute - attach if not already set */ + if (! bgp_find_attr(attrs0, BA_ORIGINATOR_ID)) + bgp_set_attr_u32(&attrs, pool, BA_ORIGINATOR_ID, 0, src->remote_id); - a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID)); - if (!a) - bgp_attach_attr(attrs, pool, BA_ORIGINATOR_ID, src->remote_id); + /* CLUSTER_LIST attribute - prepend cluster ID */ + a = bgp_find_attr(attrs0, BA_CLUSTER_LIST); + ad = a ? a->u.ptr : NULL; - /* We attach proper cluster ID according to whether the route is entering or leaving the cluster */ - bgp_cluster_list_prepend(e, attrs, pool, src->rr_client ? src->rr_cluster_id : p->rr_cluster_id); + /* Prepend src cluster ID */ + if (src->rr_cluster_id) + ad = int_set_prepend(pool, ad, src->rr_cluster_id); - /* Two RR clients with different cluster ID, hmmm */ - if (src->rr_client && p->rr_client && (src->rr_cluster_id != p->rr_cluster_id)) - bgp_cluster_list_prepend(e, attrs, pool, p->rr_cluster_id); - } + /* Prepend dst cluster ID if src and dst clusters are different */ + if (p->rr_cluster_id && (src->rr_cluster_id != p->rr_cluster_id)) + ad = int_set_prepend(pool, ad, p->rr_cluster_id); - return 0; /* Leave decision to the filters */ -} + /* Should be at least one prepended cluster ID */ + bgp_set_attr_ptr(&attrs, pool, BA_CLUSTER_LIST, 0, ad); + } -static int -bgp_community_filter(struct bgp_proto *p, rte *e) -{ - eattr *a; - struct adata *d; + /* AS4_* transition attributes, RFC 6793 4.2.2 */ + if (! p->as4_session) + { + a = bgp_find_attr(attrs, BA_AS_PATH); + if (a && as_path_contains_as4(a->u.ptr)) + { + bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, as_path_to_old(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, pool, BA_AS4_PATH, 0, as_path_strip_confed(pool, a->u.ptr)); + } - /* Check if we aren't forbidden to export the route by communities */ - a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_COMMUNITY)); - if (a) + a = bgp_find_attr(attrs, BA_AGGREGATOR); + if (a && aggregator_contains_as4(a->u.ptr)) { - d = a->u.ptr; - if (int_set_contains(d, BGP_COMM_NO_ADVERTISE)) - { - DBG("\tNO_ADVERTISE\n"); - return 1; - } - if (!p->is_internal && - (int_set_contains(d, BGP_COMM_NO_EXPORT) || - int_set_contains(d, BGP_COMM_NO_EXPORT_SUBCONFED))) - { - DBG("\tNO_EXPORT\n"); - return 1; - } + bgp_set_attr_ptr(&attrs, pool, BA_AGGREGATOR, 0, aggregator_to_old(pool, a->u.ptr)); + bgp_set_attr_ptr(&attrs, pool, BA_AS4_AGGREGATOR, 0, a->u.ptr); } + } - return 0; + /* + * Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above + * conditions. Presence and validity of quasi-mandatory NEXT_HOP attribute + * should be checked in AF-specific hooks. + */ + + /* Apply per-attribute export hooks for validatation and normalization */ + return bgp_export_attrs(&s, attrs); } -int -bgp_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool *pool) +void +bgp_rt_notify(struct proto *P, struct channel *C, net *n, rte *new, rte *old, ea_list *attrs) { - rte *e = *new; - struct bgp_proto *p = (struct bgp_proto *) P; - struct bgp_proto *new_bgp = (e->attrs->src->proto->proto == &proto_bgp) ? - (struct bgp_proto *) e->attrs->src->proto : NULL; + struct bgp_proto *p = (void *) P; + struct bgp_channel *c = (void *) C; + struct bgp_bucket *buck; + struct bgp_prefix *px; + u32 path; - if (p == new_bgp) /* Poison reverse updates */ - return -1; - if (new_bgp) - { - /* We should check here for cluster list loop, because the receiving BGP instance - might have different cluster ID */ - if (bgp_cluster_list_loopy(p, e->attrs)) - return -1; - - if (p->cf->interpret_communities && bgp_community_filter(p, e)) - return -1; - - if (p->local_as == new_bgp->local_as && p->is_internal && new_bgp->is_internal) - { - /* Redistribution of internal routes with IBGP */ - if (p->rr_client || new_bgp->rr_client) - /* Route reflection, RFC 4456 */ - return bgp_update_attrs(p, e, attrs, pool, 1); - else - return -1; - } - else - return bgp_update_attrs(p, e, attrs, pool, 0); - } + if (new) + { + attrs = bgp_update_attrs(p, c, new, attrs, bgp_linpool2); + + /* If attributes are invalid, we fail back to withdraw */ + buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c); + path = new->attrs->src->global_id; + + lp_flush(bgp_linpool2); + } else - return bgp_create_attrs(p, e, attrs, pool); + { + buck = bgp_get_withdraw_bucket(c); + path = old->attrs->src->global_id; + } + + px = bgp_get_prefix(c, n->n.addr, c->add_path_tx ? path : 0); + add_tail(&buck->prefixes, &px->buck_node); + + bgp_schedule_packet(p->conn, c, PKT_UPDATE); } + static inline u32 bgp_get_neighbor(rte *r) { eattr *e = ea_find(r->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); u32 as; - if (e && as_path_get_first(e->u.ptr, &as)) + if (e && as_path_get_first_regular(e->u.ptr, &as)) return as; - else - return ((struct bgp_proto *) r->attrs->src->proto)->remote_as; + + /* If AS_PATH is not defined, we treat rte as locally originated */ + struct bgp_proto *p = (void *) r->attrs->src->proto; + return p->cf->confederation ?: p->local_as; } static inline int rte_resolvable(rte *rt) { - int rd = rt->attrs->dest; - return (rd == RTD_ROUTER) || (rd == RTD_DEVICE) || (rd == RTD_MULTIPATH); + return rt->attrs->dest == RTD_UNICAST; } int @@ -1269,16 +1623,16 @@ bgp_rte_better(rte *new, rte *old) /* RFC 4271 9.1.2.2. a) Use AS path lengths */ if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths) - { - x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; - o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; - if (n < o) - return 1; - if (n > o) - return 0; - } + { + x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); + y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); + n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; + o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; + if (n < o) + return 1; + if (n > o) + return 0; + } /* RFC 4271 9.1.2.2. b) Use origins */ x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN)); @@ -1303,21 +1657,21 @@ bgp_rte_better(rte *new, rte *old) */ if (new_bgp->cf->med_metric || old_bgp->cf->med_metric || (bgp_get_neighbor(new) == bgp_get_neighbor(old))) - { - x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); - y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); - n = x ? x->u.data : new_bgp->cf->default_med; - o = y ? y->u.data : old_bgp->cf->default_med; - if (n < o) - return 1; - if (n > o) - return 0; - } + { + x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); + y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); + n = x ? x->u.data : new_bgp->cf->default_med; + o = y ? y->u.data : old_bgp->cf->default_med; + if (n < o) + return 1; + if (n > o) + return 0; + } /* RFC 4271 9.1.2.2. d) Prefer external peers */ - if (new_bgp->is_internal > old_bgp->is_internal) + if (new_bgp->is_interior > old_bgp->is_interior) return 0; - if (new_bgp->is_internal < old_bgp->is_internal) + if (new_bgp->is_interior < old_bgp->is_interior) return 1; /* RFC 4271 9.1.2.2. e) Compare IGP metrics */ @@ -1329,7 +1683,7 @@ bgp_rte_better(rte *new, rte *old) return 0; /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */ - /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighor ID */ + /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighbor ID */ x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID)); y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID)); n = x ? x->u.data : new_bgp->remote_id; @@ -1388,18 +1742,18 @@ bgp_rte_mergable(rte *pri, rte *sec) /* RFC 4271 9.1.2.2. a) Use AS path lengths */ if (pri_bgp->cf->compare_path_lengths || sec_bgp->cf->compare_path_lengths) - { - x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - p = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; - s = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; + { + x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); + y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); + p = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN; + s = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN; - if (p != s) - return 0; + if (p != s) + return 0; -// if (DELTA(p, s) > pri_bgp->cf->relax_multipath) -// return 0; - } +// if (DELTA(p, s) > pri_bgp->cf->relax_multipath) +// return 0; + } /* RFC 4271 9.1.2.2. b) Use origins */ x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN)); @@ -1412,17 +1766,17 @@ bgp_rte_mergable(rte *pri, rte *sec) /* RFC 4271 9.1.2.2. c) Compare MED's */ if (pri_bgp->cf->med_metric || sec_bgp->cf->med_metric || (bgp_get_neighbor(pri) == bgp_get_neighbor(sec))) - { - x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); - y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); - p = x ? x->u.data : pri_bgp->cf->default_med; - s = y ? y->u.data : sec_bgp->cf->default_med; - if (p != s) - return 0; - } + { + x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); + y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC)); + p = x ? x->u.data : pri_bgp->cf->default_med; + s = y ? y->u.data : sec_bgp->cf->default_med; + if (p != s) + return 0; + } /* RFC 4271 9.1.2.2. d) Prefer external peers */ - if (pri_bgp->is_internal != sec_bgp->is_internal) + if (pri_bgp->is_interior != sec_bgp->is_interior) return 0; /* RFC 4271 9.1.2.2. e) Compare IGP metrics */ @@ -1437,7 +1791,6 @@ bgp_rte_mergable(rte *pri, rte *sec) } - static inline int same_group(rte *r, u32 lpref, u32 lasn) { @@ -1482,7 +1835,7 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) * that this fn is not called for them. * * The idea is simple, the implementation is more problematic, - * mostly because of optimizations in rte_recalculate() that + * mostly because of optimizations in rte_recalculate() that * avoids full recalculation in most cases. * * We can assume that at least one of new, old is non-NULL and both @@ -1494,14 +1847,14 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) /* If new and old are from different groups, we just process that as two independent events */ if (new && old && !same_group(old, lpref, lasn)) - { - int i1, i2; - i1 = bgp_rte_recalculate(table, net, NULL, old, old_best); - i2 = bgp_rte_recalculate(table, net, new, NULL, old_best); - return i1 || i2; - } + { + int i1, i2; + i1 = bgp_rte_recalculate(table, net, NULL, old, old_best); + i2 = bgp_rte_recalculate(table, net, new, NULL, old_best); + return i1 || i2; + } - /* + /* * We could find the best-in-group and then make some shortcuts like * in rte_recalculate, but as we would have to walk through all * net->routes just to find it, it is probably not worth. So we @@ -1513,35 +1866,35 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) new->u.bgp.suppressed = 1; if (old) + { + old_is_group_best = !old->u.bgp.suppressed; + old->u.bgp.suppressed = 1; + int new_is_better = new && bgp_rte_better(new, old); + + /* The first case - replace not best with worse (or remove not best) */ + if (!old_is_group_best && !new_is_better) + return 0; + + /* The second case - replace the best with better */ + if (old_is_group_best && new_is_better) { - old_is_group_best = !old->u.bgp.suppressed; - old->u.bgp.suppressed = 1; - int new_is_better = new && bgp_rte_better(new, old); - - /* The first case - replace not best with worse (or remove not best) */ - if (!old_is_group_best && !new_is_better) - return 0; - - /* The second case - replace the best with better */ - if (old_is_group_best && new_is_better) - { - /* new is best-in-group, the see discussion below - this is - a special variant of NBG && OBG. From OBG we can deduce - that same_group(old_best) iff (old == old_best) */ - new->u.bgp.suppressed = 0; - return (old == old_best); - } + /* new is best-in-group, the see discussion below - this is + a special variant of NBG && OBG. From OBG we can deduce + that same_group(old_best) iff (old == old_best) */ + new->u.bgp.suppressed = 0; + return (old == old_best); } + } /* The default case - find a new best-in-group route */ r = new; /* new may not be in the list */ for (s=net->routes; rte_is_valid(s); s=s->next) if (use_deterministic_med(s) && same_group(s, lpref, lasn)) - { - s->u.bgp.suppressed = 1; - if (!r || bgp_rte_better(s, r)) - r = s; - } + { + s->u.bgp.suppressed = 1; + if (!r || bgp_rte_better(s, r)) + r = s; + } /* Simple case - the last route in group disappears */ if (!r) @@ -1580,397 +1933,77 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best) return old_is_group_best; } -static struct adata * -bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool) -{ - struct adata *newa = lp_alloc(pool, sizeof(struct adata) + 8); - newa->length = 8; - aggregator_convert_to_new(old, newa->data); - return newa; -} - -/* Take last req_as ASNs from path old2 (in 2B format), convert to 4B format - * and append path old4 (in 4B format). +/* + * Reconstruct AS_PATH and AGGREGATOR according to RFC 6793 4.2.3 */ -static struct adata * -bgp_merge_as_paths(struct adata *old2, struct adata *old4, int req_as, struct linpool *pool) -{ - byte buf[old2->length * 2]; - - int ol = as_path_convert_to_new(old2, buf, req_as); - int nl = ol + (old4 ? old4->length : 0); - - struct adata *newa = lp_alloc(pool, sizeof(struct adata) + nl); - newa->length = nl; - memcpy(newa->data, buf, ol); - if (old4) memcpy(newa->data + ol, old4->data, old4->length); - - return newa; -} - -static int -as4_aggregator_valid(struct adata *aggr) -{ - return aggr->length == 8; -} - - -/* Reconstruct 4B AS_PATH and AGGREGATOR according to RFC 4893 4.2.3 */ -static void -bgp_reconstruct_4b_atts(struct bgp_proto *p, rta *a, struct linpool *pool) -{ - eattr *p2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH)); - eattr *p4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_PATH)); - eattr *a2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AGGREGATOR)); - eattr *a4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR)); - int a4_removed = 0; - - if (a4 && !as4_aggregator_valid(a4->u.ptr)) - { - log(L_WARN "%s: AS4_AGGREGATOR attribute is invalid, skipping attribute", p->p.name); - a4 = NULL; - a4_removed = 1; - } - - if (a2) - { - u32 a2_as = get_u16(a2->u.ptr->data); - - if (a4) - { - if (a2_as != AS_TRANS) - { - /* Routes were aggregated by old router and therefore AS4_PATH - * and AS4_AGGREGATOR is invalid - * - * Convert AS_PATH and AGGREGATOR to 4B format and finish. - */ - - a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool); - p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool); - - return; - } - else - { - /* Common case, use AS4_AGGREGATOR attribute */ - a2->u.ptr = a4->u.ptr; - } - } - else - { - /* Common case, use old AGGREGATOR attribute */ - a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool); - - if ((a2_as == AS_TRANS) && !a4_removed) - log(L_WARN "%s: AGGREGATOR attribute contain AS_TRANS, but AS4_AGGREGATOR is missing", p->p.name); - } - } - else - if (a4) - log(L_WARN "%s: AS4_AGGREGATOR attribute received, but AGGREGATOR attribute is missing", p->p.name); - - int p2_len = as_path_getlen_int(p2->u.ptr, 2); - int p4_len = p4 ? validate_as4_path(p, p4->u.ptr) : -1; - - if (p4 && (p4_len < 0)) - log(L_WARN "%s: AS4_PATH attribute is malformed, skipping attribute", p->p.name); - - if ((p4_len <= 0) || (p2_len < p4_len)) - p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool); - else - p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, p4->u.ptr, p2_len - p4_len, pool); -} - static void -bgp_remove_as4_attrs(struct bgp_proto *p, rta *a) +bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool) { - unsigned id1 = EA_CODE(EAP_BGP, BA_AS4_PATH); - unsigned id2 = EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR); - ea_list **el = &(a->eattrs); - - /* We know that ea_lists constructed in bgp_decode attrs have one attribute per ea_list struct */ - while (*el != NULL) - { - unsigned fid = (*el)->attrs[0].id; - - if ((fid == id1) || (fid == id2)) - { - *el = (*el)->next; - if (p->as4_session) - log(L_WARN "%s: Unexpected AS4_* attributes received", p->p.name); - } - else - el = &((*el)->next); - } -} - -/** - * bgp_decode_attrs - check and decode BGP attributes - * @conn: connection - * @attr: start of attribute block - * @len: length of attribute block - * @pool: linear pool to make all the allocations in - * @mandatory: 1 iff presence of mandatory attributes has to be checked - * - * This function takes a BGP attribute block (a part of an Update message), checks - * its consistency and converts it to a list of BIRD route attributes represented - * by a &rta. - */ -struct rta * -bgp_decode_attrs(struct bgp_conn *conn, byte *attr, uint len, struct linpool *pool, int mandatory) -{ - struct bgp_proto *bgp = conn->bgp; - rta *a = lp_alloc(pool, sizeof(struct rta)); - uint flags, code, l, i, type; - int errcode; - byte *z, *attr_start; - byte seen[256/8]; - ea_list *ea; - struct adata *ad; - int withdraw = 0; - - bzero(a, sizeof(rta)); - a->source = RTS_BGP; - a->scope = SCOPE_UNIVERSE; - a->cast = RTC_UNICAST; - /* a->dest = RTD_ROUTER; -- set in bgp_set_next_hop() */ - a->from = bgp->cf->remote_ip; - - /* Parse the attributes */ - bzero(seen, sizeof(seen)); - DBG("BGP: Parsing attributes\n"); - while (len) - { - if (len < 2) - goto malformed; - attr_start = attr; - flags = *attr++; - code = *attr++; - len -= 2; - if (flags & BAF_EXT_LEN) - { - if (len < 2) - goto malformed; - l = get_u16(attr); - attr += 2; - len -= 2; - } - else - { - if (len < 1) - goto malformed; - l = *attr++; - len--; - } - if (l > len) - goto malformed; - len -= l; - z = attr; - attr += l; - DBG("Attr %02x %02x %d\n", code, flags, l); - if (seen[code/8] & (1 << (code%8))) - goto malformed; - if (ATTR_KNOWN(code)) - { - struct attr_desc *desc = &bgp_attr_table[code]; - if (desc->expected_length >= 0 && desc->expected_length != (int) l) - { errcode = 5; goto err; } - if ((desc->expected_flags ^ flags) & (BAF_OPTIONAL | BAF_TRANSITIVE)) - { errcode = 4; goto err; } - if (!bgp->is_internal) - { - if (!desc->allow_in_ebgp) - continue; - if ((code == BA_LOCAL_PREF) && !bgp->cf->allow_local_pref) - continue; - } - if (desc->validate) - { - errcode = desc->validate(bgp, z, l); - if (errcode > 0) - goto err; - if (errcode == IGNORE) - continue; - if (errcode <= WITHDRAW) - { - log(L_WARN "%s: Attribute %s is malformed, withdrawing update", - bgp->p.name, desc->name); - withdraw = 1; - } - } - else if (code == BA_AS_PATH) - { - /* Special case as it might also trim the attribute */ - if (validate_as_path(bgp, z, &l) < 0) - { errcode = 11; goto err; } - } - type = desc->type; - } - else /* Unknown attribute */ - { - if (!(flags & BAF_OPTIONAL)) - { errcode = 2; goto err; } - type = EAF_TYPE_OPAQUE; - } - - // Only OPTIONAL and TRANSITIVE attributes may have non-zero PARTIAL flag - // if (!((flags & BAF_OPTIONAL) && (flags & BAF_TRANSITIVE)) && (flags & BAF_PARTIAL)) - // { errcode = 4; goto err; } - - seen[code/8] |= (1 << (code%8)); - ea = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr)); - ea->next = a->eattrs; - a->eattrs = ea; - ea->flags = 0; - ea->count = 1; - ea->attrs[0].id = EA_CODE(EAP_BGP, code); - ea->attrs[0].flags = flags; - ea->attrs[0].type = type; - if (type & EAF_EMBEDDED) - ad = NULL; - else - { - ad = lp_alloc(pool, sizeof(struct adata) + l); - ea->attrs[0].u.ptr = ad; - ad->length = l; - memcpy(ad->data, z, l); - } - switch (type) - { - case EAF_TYPE_ROUTER_ID: - case EAF_TYPE_INT: - if (l == 1) - ea->attrs[0].u.data = *z; - else - ea->attrs[0].u.data = get_u32(z); - break; - case EAF_TYPE_IP_ADDRESS: - ipa_ntoh(*(ip_addr *)ad->data); - break; - case EAF_TYPE_INT_SET: - case EAF_TYPE_LC_SET: - case EAF_TYPE_EC_SET: - { - u32 *z = (u32 *) ad->data; - for(i=0; i<ad->length/4; i++) - z[i] = ntohl(z[i]); - break; - } - } - } - - if (withdraw) - goto withdraw; + eattr *p2 = bgp_find_attr(*attrs, BA_AS_PATH); + eattr *p4 = bgp_find_attr(*attrs, BA_AS4_PATH); + eattr *a2 = bgp_find_attr(*attrs, BA_AGGREGATOR); + eattr *a4 = bgp_find_attr(*attrs, BA_AS4_AGGREGATOR); -#ifdef IPV6 - /* If we received MP_REACH_NLRI we should check mandatory attributes */ - if (bgp->mp_reach_len != 0) - mandatory = 1; -#endif + /* First, unset AS4_* attributes */ + if (p4) bgp_unset_attr(attrs, pool, BA_AS4_PATH); + if (a4) bgp_unset_attr(attrs, pool, BA_AS4_AGGREGATOR); - /* If there is no (reachability) NLRI, we should exit now */ - if (! mandatory) - return a; - - /* Check if all mandatory attributes are present */ - for(i=0; i < ARRAY_SIZE(bgp_mandatory_attrs); i++) - { - code = bgp_mandatory_attrs[i]; - if (!(seen[code/8] & (1 << (code%8)))) - { - bgp_error(conn, 3, 3, &bgp_mandatory_attrs[i], 1); - return NULL; - } - } - - /* When receiving attributes from non-AS4-aware BGP speaker, - * we have to reconstruct 4B AS_PATH and AGGREGATOR attributes - */ - if (! bgp->as4_session) - bgp_reconstruct_4b_atts(bgp, a, pool); - - bgp_remove_as4_attrs(bgp, a); - - /* If the AS path attribute contains our AS, reject the routes */ - if (bgp_as_path_loopy(bgp, a)) - goto withdraw; - - /* Two checks for IBGP loops caused by route reflection, RFC 4456 */ - if (bgp_originator_id_loopy(bgp, a) || - bgp_cluster_list_loopy(bgp, a)) - goto withdraw; + /* Handle AGGREGATOR attribute */ + if (a2 && a4) + { + u32 a2_asn = get_u32(a2->u.ptr->data); - /* If there's no local preference, define one */ - if (!(seen[0] & (1 << BA_LOCAL_PREF))) - bgp_attach_attr(&a->eattrs, pool, BA_LOCAL_PREF, bgp->cf->default_local_pref); + /* If routes were aggregated by an old router, then AS4_PATH and + AS4_AGGREGATOR are invalid. In that case we give up. */ + if (a2_asn != AS_TRANS) + return; - return a; + /* Use AS4_AGGREGATOR instead of AGGREGATOR */ + a2->u.ptr = a4->u.ptr; + } -withdraw: - return NULL; + /* Handle AS_PATH attribute */ + if (p2 && p4) + { + /* Both as_path_getlen() and as_path_cut() take AS_CONFED* as zero length */ + int p2_len = as_path_getlen(p2->u.ptr); + int p4_len = as_path_getlen(p4->u.ptr); -malformed: - bgp_error(conn, 3, 1, NULL, 0); - return NULL; + /* AS_PATH is too short, give up */ + if (p2_len < p4_len) + return; -err: - bgp_error(conn, 3, errcode, attr_start, z+l-attr_start); - return NULL; + /* Merge AS_PATH and AS4_PATH */ + as_path_cut(p2->u.ptr, p2_len - p4_len); + p2->u.ptr = as_path_merge(pool, p2->u.ptr, p4->u.ptr); + } } int bgp_get_attr(eattr *a, byte *buf, int buflen) { uint i = EA_ID(a->id); - struct attr_desc *d; + const struct bgp_attr_desc *d; int len; - if (ATTR_KNOWN(i)) + if (bgp_attr_known(i)) + { + d = &bgp_attr_table[i]; + len = bsprintf(buf, "%s", d->name); + buf += len; + if (d->format) { - d = &bgp_attr_table[i]; - len = bsprintf(buf, "%s", d->name); - buf += len; - if (d->format) - { - *buf++ = ':'; - *buf++ = ' '; - d->format(a, buf, buflen - len - 2); - return GA_FULL; - } - return GA_NAME; + *buf++ = ':'; + *buf++ = ' '; + d->format(a, buf, buflen - len - 2); + return GA_FULL; } - bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? " [t]" : ""); - return GA_NAME; -} - -void -bgp_init_bucket_table(struct bgp_proto *p) -{ - p->hash_size = 256; - p->hash_limit = p->hash_size * 4; - p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *)); - init_list(&p->bucket_queue); - p->withdraw_bucket = NULL; - // fib_init(&p->prefix_fib, p->p.pool, sizeof(struct bgp_prefix), 0, bgp_init_prefix); -} - -void -bgp_free_bucket_table(struct bgp_proto *p) -{ - mb_free(p->bucket_hash); - p->bucket_hash = NULL; - - struct bgp_bucket *b; - WALK_LIST_FIRST(b, p->bucket_queue) - { - rem_node(&b->send_node); - mb_free(b); + return GA_NAME; } - mb_free(p->withdraw_bucket); - p->withdraw_bucket = NULL; + bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? " [t]" : ""); + return GA_NAME; } void @@ -1986,14 +2019,14 @@ bgp_get_route_info(rte *e, byte *buf, ea_list *attrs) buf += bsprintf(buf, "-"); if (e->attrs->hostentry) - { - if (!rte_resolvable(e)) - buf += bsprintf(buf, "/-"); - else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN) - buf += bsprintf(buf, "/?"); - else - buf += bsprintf(buf, "/%d", e->attrs->igp_metric); - } + { + if (!rte_resolvable(e)) + buf += bsprintf(buf, "/-"); + else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN) + buf += bsprintf(buf, "/?"); + else + buf += bsprintf(buf, "/%d", e->attrs->igp_metric); + } buf += bsprintf(buf, ") ["); if (p && as_path_get_last(p->u.ptr, &origas)) diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index b99672f5..8da5b0aa 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -2,6 +2,8 @@ * BIRD -- The Border Gateway Protocol * * (c) 2000 Martin Mares <mj@ucw.cz> + * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2008--2016 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -9,48 +11,52 @@ /** * DOC: Border Gateway Protocol * - * The BGP protocol is implemented in three parts: |bgp.c| which takes care of the - * connection and most of the interface with BIRD core, |packets.c| handling + * The BGP protocol is implemented in three parts: |bgp.c| which takes care of + * the connection and most of the interface with BIRD core, |packets.c| handling * both incoming and outgoing BGP packets and |attrs.c| containing functions for * manipulation with BGP attribute lists. * - * As opposed to the other existing routing daemons, BIRD has a sophisticated core - * architecture which is able to keep all the information needed by BGP in the - * primary routing table, therefore no complex data structures like a central - * BGP table are needed. This increases memory footprint of a BGP router with - * many connections, but not too much and, which is more important, it makes - * BGP much easier to implement. + * As opposed to the other existing routing daemons, BIRD has a sophisticated + * core architecture which is able to keep all the information needed by BGP in + * the primary routing table, therefore no complex data structures like a + * central BGP table are needed. This increases memory footprint of a BGP router + * with many connections, but not too much and, which is more important, it + * makes BGP much easier to implement. * - * Each instance of BGP (corresponding to a single BGP peer) is described by a &bgp_proto - * structure to which are attached individual connections represented by &bgp_connection - * (usually, there exists only one connection, but during BGP session setup, there - * can be more of them). The connections are handled according to the BGP state machine - * defined in the RFC with all the timers and all the parameters configurable. + * Each instance of BGP (corresponding to a single BGP peer) is described by a + * &bgp_proto structure to which are attached individual connections represented + * by &bgp_connection (usually, there exists only one connection, but during BGP + * session setup, there can be more of them). The connections are handled + * according to the BGP state machine defined in the RFC with all the timers and + * all the parameters configurable. * - * In incoming direction, we listen on the connection's socket and each time we receive - * some input, we pass it to bgp_rx(). It decodes packet headers and the markers and - * passes complete packets to bgp_rx_packet() which distributes the packet according - * to its type. + * In incoming direction, we listen on the connection's socket and each time we + * receive some input, we pass it to bgp_rx(). It decodes packet headers and the + * markers and passes complete packets to bgp_rx_packet() which distributes the + * packet according to its type. * - * In outgoing direction, we gather all the routing updates and sort them to buckets - * (&bgp_bucket) according to their attributes (we keep a hash table for fast comparison - * of &rta's and a &fib which helps us to find if we already have another route for - * the same destination queued for sending, so that we can replace it with the new one - * immediately instead of sending both updates). There also exists a special bucket holding - * all the route withdrawals which cannot be queued anywhere else as they don't have any - * attributes. If we have any packet to send (due to either new routes or the connection - * tracking code wanting to send a Open, Keepalive or Notification message), we call - * bgp_schedule_packet() which sets the corresponding bit in a @packet_to_send - * bit field in &bgp_conn and as soon as the transmit socket buffer becomes empty, - * we call bgp_fire_tx(). It inspects state of all the packet type bits and calls - * the corresponding bgp_create_xx() functions, eventually rescheduling the same packet - * type if we have more data of the same type to send. + * In outgoing direction, we gather all the routing updates and sort them to + * buckets (&bgp_bucket) according to their attributes (we keep a hash table for + * fast comparison of &rta's and a &fib which helps us to find if we already + * have another route for the same destination queued for sending, so that we + * can replace it with the new one immediately instead of sending both + * updates). There also exists a special bucket holding all the route + * withdrawals which cannot be queued anywhere else as they don't have any + * attributes. If we have any packet to send (due to either new routes or the + * connection tracking code wanting to send a Open, Keepalive or Notification + * message), we call bgp_schedule_packet() which sets the corresponding bit in a + * @packet_to_send bit field in &bgp_conn and as soon as the transmit socket + * buffer becomes empty, we call bgp_fire_tx(). It inspects state of all the + * packet type bits and calls the corresponding bgp_create_xx() functions, + * eventually rescheduling the same packet type if we have more data of the same + * type to send. * - * The processing of attributes consists of two functions: bgp_decode_attrs() for checking - * of the attribute blocks and translating them to the language of BIRD's extended attributes - * and bgp_encode_attrs() which does the converse. Both functions are built around a - * @bgp_attr_table array describing all important characteristics of all known attributes. - * Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams. + * The processing of attributes consists of two functions: bgp_decode_attrs() + * for checking of the attribute blocks and translating them to the language of + * BIRD's extended attributes and bgp_encode_attrs() which does the + * converse. Both functions are built around a @bgp_attr_table array describing + * all important characteristics of all known attributes. Unknown transitive + * attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams. * * BGP protocol implements graceful restart in both restarting (local restart) * and receiving (neighbor restart) roles. The first is handled mostly by the @@ -61,10 +67,45 @@ * point of view and therefore maintaining received routes. Routing table * refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing * stale routes after reestablishment of BGP session during graceful restart. - */ + * + * Supported standards: + * <itemize> + * <item> <rfc id="4271"> - Border Gateway Protocol 4 (BGP) + * <item> <rfc id="1997"> - BGP Communities Attribute + * <item> <rfc id="2385"> - Protection of BGP Sessions via TCP MD5 Signature + * <item> <rfc id="2545"> - Use of BGP Multiprotocol Extensions for IPv6 + * <item> <rfc id="2918"> - Route Refresh Capability + * <item> <rfc id="3107"> - Carrying Label Information in BGP + * <item> <rfc id="4360"> - BGP Extended Communities Attribute + * <item> <rfc id="4364"> - BGP/MPLS IPv4 Virtual Private Networks + * <item> <rfc id="4456"> - BGP Route Reflection + * <item> <rfc id="4486"> - Subcodes for BGP Cease Notification Message + * <item> <rfc id="4659"> - BGP/MPLS IPv6 Virtual Private Networks + * <item> <rfc id="4724"> - Graceful Restart Mechanism for BGP + * <item> <rfc id="4760"> - Multiprotocol extensions for BGP + * <item> <rfc id="4798"> - Connecting IPv6 Islands over IPv4 MPLS + * <item> <rfc id="5065"> - AS confederations for BGP + * <item> <rfc id="5082"> - Generalized TTL Security Mechanism + * <item> <rfc id="5492"> - Capabilities Advertisement with BGP + * <item> <rfc id="5549"> - Advertising IPv4 NLRI with an IPv6 Next Hop + * <item> <rfc id="5575"> - Dissemination of Flow Specification Rules + * <item> <rfc id="5668"> - 4-Octet AS Specific BGP Extended Community + * <item> <rfc id="6286"> - AS-Wide Unique BGP Identifier + * <item> <rfc id="6608"> - Subcodes for BGP Finite State Machine Error + * <item> <rfc id="6793"> - BGP Support for 4-Octet AS Numbers + * <item> <rfc id="7313"> - Enhanced Route Refresh Capability for BGP + * <item> <rfc id="7606"> - Revised Error Handling for BGP UPDATE Messages + * <item> <rfc id="7911"> - Advertisement of Multiple Paths in BGP + * <item> <rfc id="7947"> - Internet Exchange BGP Route Server + * <item> <rfc id="8092"> - BGP Large Communities Attribute + * <item> <rfc id="8203"> - BGP Administrative Shutdown Communication + * </itemize> +*/ #undef LOCAL_DEBUG +#include <stdlib.h> + #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" @@ -80,70 +121,150 @@ struct linpool *bgp_linpool; /* Global temporary pool */ -static sock *bgp_listen_sk; /* Global listening socket */ -static int bgp_counter; /* Number of protocol instances using the listening socket */ +struct linpool *bgp_linpool2; /* Global temporary pool for bgp_rt_notify() */ +static list bgp_sockets; /* Global list of listening sockets */ + -static void bgp_close(struct bgp_proto *p, int apply_md5); static void bgp_connect(struct bgp_proto *p); static void bgp_active(struct bgp_proto *p); -static sock *bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags); static void bgp_update_bfd(struct bgp_proto *p, int use_bfd); +static int bgp_incoming_connection(sock *sk, uint dummy UNUSED); +static void bgp_listen_sock_err(sock *sk UNUSED, int err); /** * bgp_open - open a BGP instance * @p: BGP instance * - * This function allocates and configures shared BGP resources. - * Should be called as the last step during initialization - * (when lock is acquired and neighbor is ready). - * When error, state changed to PS_DOWN, -1 is returned and caller - * should return immediately. + * This function allocates and configures shared BGP resources, mainly listening + * sockets. Should be called as the last step during initialization (when lock + * is acquired and neighbor is ready). When error, caller should change state to + * PS_DOWN and return immediately. */ static int bgp_open(struct bgp_proto *p) { - struct config *cfg = p->cf->c.global; - int errcode; + struct bgp_socket *bs = NULL; + struct iface *ifa = p->cf->strict_bind ? p->cf->iface : NULL; + ip_addr addr = p->cf->strict_bind ? p->cf->local_ip : + (ipa_is_ip4(p->cf->remote_ip) ? IPA_NONE4 : IPA_NONE6); + uint port = p->cf->local_port; - if (!bgp_listen_sk) - bgp_listen_sk = bgp_setup_listen_sk(cfg->listen_bgp_addr, cfg->listen_bgp_port, cfg->listen_bgp_flags); + /* FIXME: Add some global init? */ + if (!bgp_linpool) + init_list(&bgp_sockets); + + /* We assume that cf->iface is defined iff cf->local_ip is link-local */ - if (!bgp_listen_sk) + WALK_LIST(bs, bgp_sockets) + if (ipa_equal(bs->sk->saddr, addr) && (bs->sk->iface == ifa) && (bs->sk->sport == port)) { - errcode = BEM_NO_SOCKET; - goto err; + bs->uc++; + p->sock = bs; + return 0; } - if (!bgp_linpool) - bgp_linpool = lp_new(&root_pool, 4080); + sock *sk = sk_new(proto_pool); + sk->type = SK_TCP_PASSIVE; + sk->ttl = 255; + sk->saddr = addr; + sk->sport = port; + sk->flags = 0; + sk->tos = IP_PREC_INTERNET_CONTROL; + sk->rbsize = BGP_RX_BUFFER_SIZE; + sk->tbsize = BGP_TX_BUFFER_SIZE; + sk->rx_hook = bgp_incoming_connection; + sk->err_hook = bgp_listen_sock_err; + + if (sk_open(sk) < 0) + goto err; - bgp_counter++; + bs = mb_allocz(proto_pool, sizeof(struct bgp_socket)); + bs->sk = sk; + bs->uc = 1; + p->sock = bs; - if (p->cf->password) - if (sk_set_md5_auth(bgp_listen_sk, p->cf->source_addr, p->cf->remote_ip, - p->cf->iface, p->cf->password, p->cf->setkey) < 0) - { - sk_log_error(bgp_listen_sk, p->p.name); - bgp_close(p, 0); - errcode = BEM_INVALID_MD5; - goto err; - } + add_tail(&bgp_sockets, &bs->n); + + if (!bgp_linpool) + { + bgp_linpool = lp_new_default(proto_pool); + bgp_linpool2 = lp_new_default(proto_pool); + } return 0; err: - p->p.disabled = 1; - bgp_store_error(p, NULL, BE_MISC, errcode); - proto_notify_state(&p->p, PS_DOWN); + sk_log_error(sk, p->p.name); + log(L_ERR "%s: Cannot open listening socket", p->p.name); + rfree(sk); return -1; } +/** + * bgp_close - close a BGP instance + * @p: BGP instance + * + * This function frees and deconfigures shared BGP resources. + */ +static void +bgp_close(struct bgp_proto *p) +{ + struct bgp_socket *bs = p->sock; + + ASSERT(bs && bs->uc); + + if (--bs->uc) + return; + + rfree(bs->sk); + rem_node(&bs->n); + mb_free(bs); + + if (!EMPTY_LIST(bgp_sockets)) + return; + + rfree(bgp_linpool); + bgp_linpool = NULL; + + rfree(bgp_linpool2); + bgp_linpool2 = NULL; +} + +static inline int +bgp_setup_auth(struct bgp_proto *p, int enable) +{ + if (p->cf->password) + { + int rv = sk_set_md5_auth(p->sock->sk, + p->cf->local_ip, p->cf->remote_ip, p->cf->iface, + enable ? p->cf->password : NULL, p->cf->setkey); + + if (rv < 0) + sk_log_error(p->sock->sk, p->p.name); + + return rv; + } + else + return 0; +} + +static inline struct bgp_channel * +bgp_find_channel(struct bgp_proto *p, u32 afi) +{ + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + if (c->afi == afi) + return c; + + return NULL; +} + static void bgp_startup(struct bgp_proto *p) { BGP_TRACE(D_EVENTS, "Started"); - p->start_state = p->cf->capabilities ? BSS_CONNECT : BSS_CONNECT_NOCAP; + p->start_state = BSS_CONNECT; if (!p->cf->passive) bgp_active(p); @@ -159,70 +280,57 @@ bgp_startup_timeout(timer *t) static void bgp_initiate(struct bgp_proto *p) { - int rv = bgp_open(p); - if (rv < 0) - return; + int err_val; + + if (bgp_open(p) < 0) + { err_val = BEM_NO_SOCKET; goto err1; } + + if (bgp_setup_auth(p, 1) < 0) + { err_val = BEM_INVALID_MD5; goto err2; } if (p->cf->bfd) bgp_update_bfd(p, p->cf->bfd); if (p->startup_delay) - { - p->start_state = BSS_DELAY; - BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay); - bgp_start_timer(p->startup_timer, p->startup_delay); - } + { + p->start_state = BSS_DELAY; + BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay); + bgp_start_timer(p->startup_timer, p->startup_delay); + } else bgp_startup(p); -} -/** - * bgp_close - close a BGP instance - * @p: BGP instance - * @apply_md5: 0 to disable unsetting MD5 auth - * - * This function frees and deconfigures shared BGP resources. - * @apply_md5 is set to 0 when bgp_close is called as a cleanup - * from failed bgp_open(). - */ -static void -bgp_close(struct bgp_proto *p, int apply_md5) -{ - ASSERT(bgp_counter); - bgp_counter--; + return; - if (p->cf->password && apply_md5) - if (sk_set_md5_auth(bgp_listen_sk, p->cf->source_addr, p->cf->remote_ip, - p->cf->iface, NULL, p->cf->setkey) < 0) - sk_log_error(bgp_listen_sk, p->p.name); +err2: + bgp_close(p); +err1: + p->p.disabled = 1; + bgp_store_error(p, NULL, BE_MISC, err_val); + proto_notify_state(&p->p, PS_DOWN); - if (!bgp_counter) - { - rfree(bgp_listen_sk); - bgp_listen_sk = NULL; - rfree(bgp_linpool); - bgp_linpool = NULL; - } + return; } /** * bgp_start_timer - start a BGP timer * @t: timer - * @value: time to fire (0 to disable the timer) + * @value: time (in seconds) to fire (0 to disable the timer) * - * This functions calls tm_start() on @t with time @value and the - * amount of randomization suggested by the BGP standard. Please use - * it for all BGP timers. + * This functions calls tm_start() on @t with time @value and the amount of + * randomization suggested by the BGP standard. Please use it for all BGP + * timers. */ void -bgp_start_timer(timer *t, int value) +bgp_start_timer(timer *t, uint value) { if (value) - { - /* The randomization procedure is specified in RFC 1771: 9.2.3.3 */ - t->randomize = value / 4; - tm_start(t, value - t->randomize); - } + { + /* The randomization procedure is specified in RFC 4271 section 10 */ + btime time = value S; + btime randomize = random() % ((time / 4) + 1); + tm_start(t, time - randomize); + } else tm_stop(t); } @@ -231,8 +339,8 @@ bgp_start_timer(timer *t, int value) * bgp_close_conn - close a BGP connection * @conn: connection to close * - * This function takes a connection described by the &bgp_conn structure, - * closes its socket and frees all resources associated with it. + * This function takes a connection described by the &bgp_conn structure, closes + * its socket and frees all resources associated with it. */ void bgp_close_conn(struct bgp_conn *conn) @@ -241,16 +349,22 @@ bgp_close_conn(struct bgp_conn *conn) DBG("BGP: Closing connection\n"); conn->packets_to_send = 0; - rfree(conn->connect_retry_timer); - conn->connect_retry_timer = NULL; + conn->channels_to_send = 0; + rfree(conn->connect_timer); + conn->connect_timer = NULL; rfree(conn->keepalive_timer); conn->keepalive_timer = NULL; rfree(conn->hold_timer); conn->hold_timer = NULL; - rfree(conn->sk); - conn->sk = NULL; rfree(conn->tx_ev); conn->tx_ev = NULL; + rfree(conn->sk); + conn->sk = NULL; + + mb_free(conn->local_caps); + conn->local_caps = NULL; + mb_free(conn->remote_caps); + conn->remote_caps = NULL; } @@ -258,9 +372,9 @@ bgp_close_conn(struct bgp_conn *conn) * bgp_update_startup_delay - update a startup delay * @p: BGP instance * - * This function updates a startup delay that is used to postpone next BGP connect. - * It also handles disable_after_error and might stop BGP instance when error - * happened and disable_after_error is on. + * This function updates a startup delay that is used to postpone next BGP + * connect. It also handles disable_after_error and might stop BGP instance + * when error happened and disable_after_error is on. * * It should be called when BGP protocol error happened. */ @@ -271,17 +385,17 @@ bgp_update_startup_delay(struct bgp_proto *p) DBG("BGP: Updating startup delay\n"); - if (p->last_proto_error && ((now - p->last_proto_error) >= (int) cf->error_amnesia_time)) + if (p->last_proto_error && ((current_time() - p->last_proto_error) >= cf->error_amnesia_time S)) p->startup_delay = 0; - p->last_proto_error = now; + p->last_proto_error = current_time(); if (cf->disable_after_error) - { - p->startup_delay = 0; - p->p.disabled = 1; - return; - } + { + p->startup_delay = 0; + p->p.disabled = 1; + return; + } if (!p->startup_delay) p->startup_delay = cf->error_delay_time_min; @@ -293,29 +407,35 @@ static void bgp_graceful_close_conn(struct bgp_conn *conn, uint subcode, byte *data, uint len) { switch (conn->state) - { - case BS_IDLE: - case BS_CLOSE: - return; - case BS_CONNECT: - case BS_ACTIVE: - bgp_conn_enter_idle_state(conn); - return; - case BS_OPENSENT: - case BS_OPENCONFIRM: - case BS_ESTABLISHED: - bgp_error(conn, 6, subcode, data, len); - return; - default: - bug("bgp_graceful_close_conn: Unknown state %d", conn->state); - } + { + case BS_IDLE: + case BS_CLOSE: + return; + + case BS_CONNECT: + case BS_ACTIVE: + bgp_conn_enter_idle_state(conn); + return; + + case BS_OPENSENT: + case BS_OPENCONFIRM: + case BS_ESTABLISHED: + bgp_error(conn, 6, subcode, data, len); + return; + + default: + bug("bgp_graceful_close_conn: Unknown state %d", conn->state); + } } static void bgp_down(struct bgp_proto *p) { if (p->start_state > BSS_PREPARE) - bgp_close(p, 1); + { + bgp_setup_auth(p, 0); + bgp_close(p); + } BGP_TRACE(D_EVENTS, "Down"); proto_notify_state(&p->p, PS_DOWN); @@ -327,15 +447,15 @@ bgp_decision(void *vp) struct bgp_proto *p = vp; DBG("BGP: Decision start\n"); - if ((p->p.proto_state == PS_START) - && (p->outgoing_conn.state == BS_IDLE) - && (p->incoming_conn.state != BS_OPENCONFIRM) - && (!p->cf->passive)) + if ((p->p.proto_state == PS_START) && + (p->outgoing_conn.state == BS_IDLE) && + (p->incoming_conn.state != BS_OPENCONFIRM) && + !p->cf->passive) bgp_active(p); - if ((p->p.proto_state == PS_STOP) - && (p->outgoing_conn.state == BS_IDLE) - && (p->incoming_conn.state == BS_IDLE)) + if ((p->p.proto_state == PS_STOP) && + (p->outgoing_conn.state == BS_IDLE) && + (p->incoming_conn.state == BS_IDLE)) bgp_down(p); } @@ -349,7 +469,7 @@ bgp_stop(struct bgp_proto *p, uint subcode, byte *data, uint len) } static inline void -bgp_conn_set_state(struct bgp_conn *conn, unsigned new_state) +bgp_conn_set_state(struct bgp_conn *conn, uint new_state) { if (conn->bgp->p.mrtdump & MD_STATES) mrt_dump_bgp_state_change(conn, conn->state, new_state); @@ -364,13 +484,17 @@ bgp_conn_enter_openconfirm_state(struct bgp_conn *conn) bgp_conn_set_state(conn, BS_OPENCONFIRM); } +static const struct bgp_af_caps dummy_af_caps = { }; + void bgp_conn_enter_established_state(struct bgp_conn *conn) { struct bgp_proto *p = conn->bgp; + struct bgp_caps *local = conn->local_caps; + struct bgp_caps *peer = conn->remote_caps; + struct bgp_channel *c; BGP_TRACE(D_EVENTS, "BGP session established"); - DBG("BGP: UP!!!\n"); /* For multi-hop BGP sessions */ if (ipa_zero(p->source_addr)) @@ -381,30 +505,92 @@ bgp_conn_enter_established_state(struct bgp_conn *conn) p->conn = conn; p->last_error_class = 0; p->last_error_code = 0; - p->feed_state = BFS_NONE; - p->load_state = BFS_NONE; - bgp_init_bucket_table(p); - bgp_init_prefix_table(p, 8); - int peer_gr_ready = conn->peer_gr_aware && !(conn->peer_gr_flags & BGP_GRF_RESTART); + p->as4_session = conn->as4_session; + + p->route_refresh = peer->route_refresh; + p->enhanced_refresh = local->enhanced_refresh && peer->enhanced_refresh; - if (p->p.gr_recovery && !peer_gr_ready) - proto_graceful_restart_unlock(&p->p); + /* Whether we may handle possible GR of peer (it has some AF GR-able) */ + p->gr_ready = 0; /* Updated later */ - if (p->p.gr_recovery && (p->cf->gr_mode == BGP_GR_ABLE) && peer_gr_ready) - p->p.gr_wait = 1; + /* Whether peer is ready to handle our GR recovery */ + int peer_gr_ready = peer->gr_aware && !(peer->gr_flags & BGP_GRF_RESTART); - if (p->gr_active) + if (p->gr_active_num) tm_stop(p->gr_timer); - if (p->gr_active && (!conn->peer_gr_able || !(conn->peer_gr_aflags & BGP_GRF_FORWARDING))) - bgp_graceful_restart_done(p); + /* Number of active channels */ + int num = 0; - /* GR capability implies that neighbor will send End-of-RIB */ - if (conn->peer_gr_aware) - p->load_state = BFS_LOADING; + WALK_LIST(c, p->p.channels) + { + const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi); + const struct bgp_af_caps *rem = bgp_find_af_caps(peer, c->afi); - /* proto_notify_state() will likely call bgp_feed_begin(), setting p->feed_state */ + /* Ignore AFIs that were not announced in multiprotocol capability */ + if (!loc || !loc->ready) + loc = &dummy_af_caps; + + if (!rem || !rem->ready) + rem = &dummy_af_caps; + + int active = loc->ready && rem->ready; + c->c.disabled = !active; + c->c.reloadable = p->route_refresh; + + c->index = active ? num++ : 0; + + c->feed_state = BFS_NONE; + c->load_state = BFS_NONE; + + /* Channels where peer may do GR */ + c->gr_ready = active && local->gr_aware && rem->gr_able; + p->gr_ready = p->gr_ready || c->gr_ready; + + /* Channels not able to recover gracefully */ + if (p->p.gr_recovery && (!active || !peer_gr_ready)) + channel_graceful_restart_unlock(&c->c); + + /* Channels waiting for local convergence */ + if (p->p.gr_recovery && loc->gr_able && peer_gr_ready) + c->c.gr_wait = 1; + + /* Channels where peer is not able to recover gracefully */ + if (c->gr_active && ! (c->gr_ready && (rem->gr_af_flags & BGP_GRF_FORWARDING))) + bgp_graceful_restart_done(c); + + /* GR capability implies that neighbor will send End-of-RIB */ + if (peer->gr_aware) + c->load_state = BFS_LOADING; + + c->ext_next_hop = c->cf->ext_next_hop && (bgp_channel_is_ipv6(c) || rem->ext_next_hop); + c->add_path_rx = (loc->add_path & BGP_ADD_PATH_RX) && (rem->add_path & BGP_ADD_PATH_TX); + c->add_path_tx = (loc->add_path & BGP_ADD_PATH_TX) && (rem->add_path & BGP_ADD_PATH_RX); + + /* Update RA mode */ + if (c->add_path_tx) + c->c.ra_mode = RA_ANY; + else if (c->cf->secondary) + c->c.ra_mode = RA_ACCEPTED; + else + c->c.ra_mode = RA_OPTIMAL; + } + + p->afi_map = mb_alloc(p->p.pool, num * sizeof(u32)); + p->channel_map = mb_alloc(p->p.pool, num * sizeof(void *)); + p->channel_count = num; + + WALK_LIST(c, p->p.channels) + { + if (c->c.disabled) + continue; + + p->afi_map[c->index] = c->afi; + p->channel_map[c->index] = c; + } + + /* proto_notify_state() will likely call bgp_feed_begin(), setting c->feed_state */ bgp_conn_set_state(conn, BS_ESTABLISHED); proto_notify_state(&p->p, PS_UP); @@ -416,9 +602,6 @@ bgp_conn_leave_established_state(struct bgp_proto *p) BGP_TRACE(D_EVENTS, "BGP session closed"); p->conn = NULL; - bgp_free_prefix_table(p); - bgp_free_bucket_table(p); - if (p->p.proto_state == PS_UP) bgp_stop(p, 0, NULL, 0); } @@ -471,34 +654,71 @@ bgp_handle_graceful_restart(struct bgp_proto *p) ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready); BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s", - p->gr_active ? " - already pending" : ""); - proto_notify_state(&p->p, PS_START); + p->gr_active_num ? " - already pending" : ""); + + p->gr_active_num = 0; + + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + { + /* FIXME: perhaps check for channel state instead of disabled flag? */ + if (c->c.disabled) + continue; + + if (c->gr_ready) + { + if (c->gr_active) + rt_refresh_end(c->c.table, &c->c); + + c->gr_active = 1; + p->gr_active_num++; + rt_refresh_begin(c->c.table, &c->c); + } + else + { + /* Just flush the routes */ + rt_refresh_begin(c->c.table, &c->c); + rt_refresh_end(c->c.table, &c->c); + } + + /* Reset bucket and prefix tables */ + bgp_free_bucket_table(c); + bgp_free_prefix_table(c); + bgp_init_bucket_table(c); + bgp_init_prefix_table(c); + c->packets_to_send = 0; + } - if (p->gr_active) - rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); + /* p->gr_ready -> at least one active channel is c->gr_ready */ + ASSERT(p->gr_active_num > 0); - p->gr_active = 1; - bgp_start_timer(p->gr_timer, p->conn->peer_gr_time); - rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook); + proto_notify_state(&p->p, PS_START); + bgp_start_timer(p->gr_timer, p->conn->remote_caps->gr_time); } /** * bgp_graceful_restart_done - finish active BGP graceful restart - * @p: BGP instance + * @c: BGP channel * * This function is called when the active BGP graceful restart of the neighbor - * should be finished - either successfully (the neighbor sends all paths and - * reports end-of-RIB on the new session) or unsuccessfully (the neighbor does - * not support BGP graceful restart on the new session). The function ends - * routing table refresh cycle and stops BGP restart timer. + * should be finished for channel @c - either successfully (the neighbor sends + * all paths and reports end-of-RIB for given AFI/SAFI on the new session) or + * unsuccessfully (the neighbor does not support BGP graceful restart on the new + * session). The function ends the routing table refresh cycle. */ void -bgp_graceful_restart_done(struct bgp_proto *p) +bgp_graceful_restart_done(struct bgp_channel *c) { - BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); - p->gr_active = 0; - tm_stop(p->gr_timer); - rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); + struct bgp_proto *p = (void *) c->c.proto; + + ASSERT(c->gr_active); + c->gr_active = 0; + p->gr_active_num--; + + if (!p->gr_active_num) + BGP_TRACE(D_EVENTS, "Neighbor graceful restart done"); + + rt_refresh_end(c->c.table, &c->c); } /** @@ -522,7 +742,7 @@ bgp_graceful_restart_timeout(timer *t) /** * bgp_refresh_begin - start incoming enhanced route refresh sequence - * @p: BGP instance + * @c: BGP channel * * This function is called when an incoming enhanced route refresh sequence is * started by the neighbor, demarcated by the BoRR packet. The function updates @@ -531,18 +751,20 @@ bgp_graceful_restart_timeout(timer *t) * ensure that these two sequences do not overlap. */ void -bgp_refresh_begin(struct bgp_proto *p) +bgp_refresh_begin(struct bgp_channel *c) { - if (p->load_state == BFS_LOADING) - { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } + struct bgp_proto *p = (void *) c->c.proto; - p->load_state = BFS_REFRESHING; - rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook); + if (c->load_state == BFS_LOADING) + { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; } + + c->load_state = BFS_REFRESHING; + rt_refresh_begin(c->c.table, &c->c); } /** * bgp_refresh_end - finish incoming enhanced route refresh sequence - * @p: BGP instance + * @c: BGP channel * * This function is called when an incoming enhanced route refresh sequence is * finished by the neighbor, demarcated by the EoRR packet. The function updates @@ -550,39 +772,26 @@ bgp_refresh_begin(struct bgp_proto *p) * during the sequence are removed by the nest. */ void -bgp_refresh_end(struct bgp_proto *p) +bgp_refresh_end(struct bgp_channel *c) { - if (p->load_state != BFS_REFRESHING) - { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } + struct bgp_proto *p = (void *) c->c.proto; - p->load_state = BFS_NONE; - rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); + if (c->load_state != BFS_REFRESHING) + { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; } + + c->load_state = BFS_NONE; + rt_refresh_end(c->c.table, &c->c); } static void bgp_send_open(struct bgp_conn *conn) { - conn->start_state = conn->bgp->start_state; - - // Default values, possibly changed by receiving capabilities. - conn->advertised_as = 0; - conn->peer_refresh_support = 0; - conn->peer_as4_support = 0; - conn->peer_add_path = 0; - conn->peer_enhanced_refresh_support = 0; - conn->peer_gr_aware = 0; - conn->peer_gr_able = 0; - conn->peer_gr_time = 0; - conn->peer_gr_flags = 0; - conn->peer_gr_aflags = 0; - conn->peer_ext_messages_support = 0; - DBG("BGP: Sending open\n"); conn->sk->rx_hook = bgp_rx; conn->sk->tx_hook = bgp_tx; - tm_stop(conn->connect_retry_timer); - bgp_schedule_packet(conn, PKT_OPEN); + tm_stop(conn->connect_timer); + bgp_schedule_packet(conn, NULL, PKT_OPEN); bgp_conn_set_state(conn, BS_OPENSENT); bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time); } @@ -605,10 +814,10 @@ bgp_connect_timeout(timer *t) DBG("BGP: connect_timeout\n"); if (p->p.proto_state == PS_START) - { - bgp_close_conn(conn); - bgp_connect(p); - } + { + bgp_close_conn(conn); + bgp_connect(p); + } else bgp_conn_enter_idle_state(conn); } @@ -672,7 +881,7 @@ bgp_keepalive_timeout(timer *t) struct bgp_conn *conn = t->data; DBG("BGP: Keepalive timer\n"); - bgp_schedule_packet(conn, PKT_KEEPALIVE); + bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE); /* Kick TX a bit faster */ if (ev_active(conn->tx_ev)) @@ -682,21 +891,18 @@ bgp_keepalive_timeout(timer *t) static void bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn) { - timer *t; - conn->sk = NULL; conn->bgp = p; + conn->packets_to_send = 0; + conn->channels_to_send = 0; + conn->last_channel = 0; + conn->last_channel_count = 0; + + conn->connect_timer = tm_new_init(p->p.pool, bgp_connect_timeout, conn, 0, 0); + conn->hold_timer = tm_new_init(p->p.pool, bgp_hold_timeout, conn, 0, 0); + conn->keepalive_timer = tm_new_init(p->p.pool, bgp_keepalive_timeout, conn, 0, 0); - t = conn->connect_retry_timer = tm_new(p->p.pool); - t->hook = bgp_connect_timeout; - t->data = conn; - t = conn->hold_timer = tm_new(p->p.pool); - t->hook = bgp_hold_timeout; - t->data = conn; - t = conn->keepalive_timer = tm_new(p->p.pool); - t->hook = bgp_keepalive_timeout; - t->data = conn; conn->tx_ev = ev_new(p->p.pool); conn->tx_ev->hook = bgp_kick_tx; conn->tx_ev->data = conn; @@ -720,7 +926,7 @@ bgp_active(struct bgp_proto *p) BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay); bgp_setup_conn(p, conn); bgp_conn_set_state(conn, BS_ACTIVE); - bgp_start_timer(conn->connect_retry_timer, delay); + bgp_start_timer(conn->connect_timer, delay); } /** @@ -734,12 +940,11 @@ bgp_active(struct bgp_proto *p) static void bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing connection */ { - sock *s; struct bgp_conn *conn = &p->outgoing_conn; int hops = p->cf->multihop ? : 1; DBG("BGP: Connecting\n"); - s = sk_new(p->p.pool); + sock *s = sk_new(p->p.pool); s->type = SK_TCP_ACTIVE; s->saddr = p->source_addr; s->daddr = p->cf->remote_ip; @@ -767,10 +972,10 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c goto err; DBG("BGP: Waiting for connect success\n"); - bgp_start_timer(conn->connect_retry_timer, p->cf->connect_retry_time); + bgp_start_timer(conn->connect_timer, p->cf->connect_retry_time); return; - err: +err: sk_log_error(s, p->p.name); bgp_sock_err(s, 0); return; @@ -784,16 +989,15 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c static struct bgp_proto * bgp_find_proto(sock *sk) { - struct proto_config *pc; + struct bgp_proto *p; - WALK_LIST(pc, config->protos) - if ((pc->protocol == &proto_bgp) && pc->proto) - { - struct bgp_proto *p = (struct bgp_proto *) pc->proto; - if (ipa_equal(p->cf->remote_ip, sk->daddr) && - (!p->cf->iface || (p->cf->iface == sk->iface))) - return p; - } + WALK_LIST(p, proto_list) + if ((p->p.proto == &proto_bgp) && + ipa_equal(p->cf->remote_ip, sk->daddr) && + (!p->cf->iface || (p->cf->iface == sk->iface)) && + (ipa_zero(p->cf->local_ip) || ipa_equal(p->cf->local_ip, sk->saddr)) && + (p->cf->local_port == sk->sport)) + return p; return NULL; } @@ -819,12 +1023,12 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport); p = bgp_find_proto(sk); if (!p) - { - log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)", - sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport); - rfree(sk); - return 0; - } + { + log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)", + sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport); + rfree(sk); + return 0; + } /* * BIRD should keep multiple incoming connections in OpenSent state (for @@ -837,26 +1041,26 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk); if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready) - { - bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART); - bgp_handle_graceful_restart(p); - bgp_conn_enter_idle_state(p->conn); - acc = 1; - - /* There might be separate incoming connection in OpenSent state */ - if (p->incoming_conn.state > BS_ACTIVE) - bgp_close_conn(&p->incoming_conn); - } + { + bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART); + bgp_handle_graceful_restart(p); + bgp_conn_enter_idle_state(p->conn); + acc = 1; + + /* There might be separate incoming connection in OpenSent state */ + if (p->incoming_conn.state > BS_ACTIVE) + bgp_close_conn(&p->incoming_conn); + } BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s", sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport, acc ? "accepted" : "rejected"); if (!acc) - { - rfree(sk); - return 0; - } + { + rfree(sk); + return 0; + } hops = p->cf->multihop ? : 1; @@ -868,11 +1072,11 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED) goto err; if (p->cf->enable_extended_messages) - { - sk->rbsize = BGP_RX_BUFFER_EXT_SIZE; - sk->tbsize = BGP_TX_BUFFER_EXT_SIZE; - sk_reallocate(sk); - } + { + sk->rbsize = BGP_RX_BUFFER_EXT_SIZE; + sk->tbsize = BGP_TX_BUFFER_EXT_SIZE; + sk_reallocate(sk); + } bgp_setup_conn(p, &p->incoming_conn); bgp_setup_sk(&p->incoming_conn, sk); @@ -895,34 +1099,6 @@ bgp_listen_sock_err(sock *sk UNUSED, int err) log(L_ERR "BGP: Error on listening socket: %M", err); } -static sock * -bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags) -{ - sock *s = sk_new(&root_pool); - DBG("BGP: Creating listening socket\n"); - s->type = SK_TCP_PASSIVE; - s->ttl = 255; - s->saddr = addr; - s->sport = port ? port : BGP_PORT; - s->flags = flags ? 0 : SKF_V6ONLY; - s->tos = IP_PREC_INTERNET_CONTROL; - s->rbsize = BGP_RX_BUFFER_SIZE; - s->tbsize = BGP_TX_BUFFER_SIZE; - s->rx_hook = bgp_incoming_connection; - s->err_hook = bgp_listen_sock_err; - - if (sk_open(s) < 0) - goto err; - - return s; - - err: - sk_log_error(s, "BGP"); - log(L_ERR "BGP: Cannot open listening socket"); - rfree(s); - return NULL; -} - static void bgp_start_neighbor(struct bgp_proto *p) { @@ -931,23 +1107,10 @@ bgp_start_neighbor(struct bgp_proto *p) if (ipa_zero(p->source_addr)) p->source_addr = p->neigh->ifa->ip; -#ifdef IPV6 - { - struct ifa *a; - p->local_link = IPA_NONE; - WALK_LIST(a, p->neigh->iface->addrs) - if (a->scope == SCOPE_LINK) - { - p->local_link = a->ip; - break; - } - - if (! ipa_nonzero(p->local_link)) - log(L_WARN "%s: Missing link local address on interface %s", p->p.name, p->neigh->iface->name); - - DBG("BGP: Selected link-level address %I\n", p->local_link); - } -#endif + if (ipa_is_link_local(p->source_addr)) + p->link_addr = p->source_addr; + else if (p->neigh->iface->llv6) + p->link_addr = p->neigh->iface->llv6->ip; bgp_initiate(p); } @@ -967,34 +1130,34 @@ bgp_neigh_notify(neighbor *n) int prepare = (ps == PS_START) && (p->start_state == BSS_PREPARE); if (n->scope <= 0) + { + if (!prepare) { - if (!prepare) - { - BGP_TRACE(D_EVENTS, "Neighbor lost"); - bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST); - /* Perhaps also run bgp_update_startup_delay(p)? */ - bgp_stop(p, 0, NULL, 0); - } + BGP_TRACE(D_EVENTS, "Neighbor lost"); + bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST); + /* Perhaps also run bgp_update_startup_delay(p)? */ + bgp_stop(p, 0, NULL, 0); } + } else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP)) + { + if (!prepare) { - if (!prepare) - { - BGP_TRACE(D_EVENTS, "Link down"); - bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN); - if (ps == PS_UP) - bgp_update_startup_delay(p); - bgp_stop(p, 0, NULL, 0); - } + BGP_TRACE(D_EVENTS, "Link down"); + bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN); + if (ps == PS_UP) + bgp_update_startup_delay(p); + bgp_stop(p, 0, NULL, 0); } + } else + { + if (prepare) { - if (prepare) - { - BGP_TRACE(D_EVENTS, "Neighbor ready"); - bgp_start_neighbor(p); - } + BGP_TRACE(D_EVENTS, "Neighbor ready"); + bgp_start_neighbor(p); } + } } static void @@ -1004,13 +1167,13 @@ bgp_bfd_notify(struct bfd_request *req) int ps = p->p.proto_state; if (req->down && ((ps == PS_START) || (ps == PS_UP))) - { - BGP_TRACE(D_EVENTS, "BFD session down"); - bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN); - if (ps == PS_UP) - bgp_update_startup_delay(p); - bgp_stop(p, 0, NULL, 0); - } + { + BGP_TRACE(D_EVENTS, "BFD session down"); + bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN); + if (ps == PS_UP) + bgp_update_startup_delay(p); + bgp_stop(p, 0, NULL, 0); + } } static void @@ -1022,71 +1185,72 @@ bgp_update_bfd(struct bgp_proto *p, int use_bfd) bgp_bfd_notify, p); if (!use_bfd && p->bfd_req) - { - rfree(p->bfd_req); - p->bfd_req = NULL; - } + { + rfree(p->bfd_req); + p->bfd_req = NULL; + } } -static int -bgp_reload_routes(struct proto *P) +static void +bgp_reload_routes(struct channel *C) { - struct bgp_proto *p = (struct bgp_proto *) P; - if (!p->conn || !p->conn->peer_refresh_support) - return 0; + struct bgp_proto *p = (void *) C->proto; + struct bgp_channel *c = (void *) C; - bgp_schedule_packet(p->conn, PKT_ROUTE_REFRESH); - return 1; + ASSERT(p->conn && p->route_refresh); + + bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH); } static void -bgp_feed_begin(struct proto *P, int initial) +bgp_feed_begin(struct channel *C, int initial) { - struct bgp_proto *p = (struct bgp_proto *) P; + struct bgp_proto *p = (void *) C->proto; + struct bgp_channel *c = (void *) C; /* This should not happen */ if (!p->conn) return; if (initial && p->cf->gr_mode) - p->feed_state = BFS_LOADING; + c->feed_state = BFS_LOADING; /* It is refeed and both sides support enhanced route refresh */ - if (!initial && p->cf->enable_refresh && - p->conn->peer_enhanced_refresh_support) - { - /* BoRR must not be sent before End-of-RIB */ - if (p->feed_state == BFS_LOADING || p->feed_state == BFS_LOADED) - return; + if (!initial && p->enhanced_refresh) + { + /* BoRR must not be sent before End-of-RIB */ + if (c->feed_state == BFS_LOADING || c->feed_state == BFS_LOADED) + return; - p->feed_state = BFS_REFRESHING; - bgp_schedule_packet(p->conn, PKT_BEGIN_REFRESH); - } + c->feed_state = BFS_REFRESHING; + bgp_schedule_packet(p->conn, c, PKT_BEGIN_REFRESH); + } } static void -bgp_feed_end(struct proto *P) +bgp_feed_end(struct channel *C) { - struct bgp_proto *p = (struct bgp_proto *) P; + struct bgp_proto *p = (void *) C->proto; + struct bgp_channel *c = (void *) C; /* This should not happen */ if (!p->conn) return; /* Non-demarcated feed ended, nothing to do */ - if (p->feed_state == BFS_NONE) + if (c->feed_state == BFS_NONE) return; /* Schedule End-of-RIB packet */ - if (p->feed_state == BFS_LOADING) - p->feed_state = BFS_LOADED; + if (c->feed_state == BFS_LOADING) + c->feed_state = BFS_LOADED; /* Schedule EoRR packet */ - if (p->feed_state == BFS_REFRESHING) - p->feed_state = BFS_REFRESHED; + if (c->feed_state == BFS_REFRESHING) + c->feed_state = BFS_REFRESHED; /* Kick TX hook */ - bgp_schedule_packet(p->conn, PKT_UPDATE); + bgp_schedule_packet(p->conn, c, PKT_UPDATE); } @@ -1097,30 +1261,30 @@ bgp_start_locked(struct object_lock *lock) struct bgp_config *cf = p->cf; if (p->p.proto_state != PS_START) - { - DBG("BGP: Got lock in different state %d\n", p->p.proto_state); - return; - } + { + DBG("BGP: Got lock in different state %d\n", p->p.proto_state); + return; + } DBG("BGP: Got lock\n"); if (cf->multihop) - { - /* Multi-hop sessions do not use neighbor entries */ - bgp_initiate(p); - return; - } + { + /* Multi-hop sessions do not use neighbor entries */ + bgp_initiate(p); + return; + } neighbor *n = neigh_find2(&p->p, &cf->remote_ip, cf->iface, NEF_STICKY); if (!n) - { - log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface); - /* As we do not start yet, we can just disable protocol */ - p->p.disabled = 1; - bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP); - proto_notify_state(&p->p, PS_DOWN); - return; - } + { + log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface); + /* As we do not start yet, we can just disable protocol */ + p->p.disabled = 1; + bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP); + proto_notify_state(&p->p, PS_DOWN); + return; + } p->neigh = n; @@ -1145,36 +1309,34 @@ bgp_start(struct proto *P) p->neigh = NULL; p->bfd_req = NULL; p->gr_ready = 0; - p->gr_active = 0; - - rt_lock_table(p->igp_table); + p->gr_active_num = 0; p->event = ev_new(p->p.pool); p->event->hook = bgp_decision; p->event->data = p; - p->startup_timer = tm_new(p->p.pool); - p->startup_timer->hook = bgp_startup_timeout; - p->startup_timer->data = p; - - p->gr_timer = tm_new(p->p.pool); - p->gr_timer->hook = bgp_graceful_restart_timeout; - p->gr_timer->data = p; + p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0); + p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0); p->local_id = proto_get_router_id(P->cf); if (p->rr_client) p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id; p->remote_id = 0; - p->source_addr = p->cf->source_addr; + p->source_addr = p->cf->local_ip; + p->link_addr = IPA_NONE; + /* Lock all channels when in GR recovery mode */ if (p->p.gr_recovery && p->cf->gr_mode) - proto_graceful_restart_lock(P); + { + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) + channel_graceful_restart_lock(&c->c); + } /* - * Before attempting to create the connection, we need to lock the - * port, so that are sure we're the only instance attempting to talk - * with that neighbor. + * Before attempting to create the connection, we need to lock the port, + * so that we are the only instance attempting to talk with that neighbor. */ lock = p->lock = olock_new(P->pool); @@ -1205,45 +1367,45 @@ bgp_shutdown(struct proto *P) BGP_TRACE(D_EVENTS, "Shutdown requested"); switch (P->down_code) - { - case PDC_CF_REMOVE: - case PDC_CF_DISABLE: - subcode = 3; // Errcode 6, 3 - peer de-configured - break; - - case PDC_CF_RESTART: - subcode = 6; // Errcode 6, 6 - other configuration change - break; - - case PDC_CMD_DISABLE: - case PDC_CMD_SHUTDOWN: - subcode = 2; // Errcode 6, 2 - administrative shutdown - message = P->message; - break; - - case PDC_CMD_RESTART: - subcode = 4; // Errcode 6, 4 - administrative reset - message = P->message; - break; - - case PDC_RX_LIMIT_HIT: - case PDC_IN_LIMIT_HIT: - subcode = 1; // Errcode 6, 1 - max number of prefixes reached - /* log message for compatibility */ - log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name); - goto limit; - - case PDC_OUT_LIMIT_HIT: - subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown - - limit: - bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED); - if (proto_restart) - bgp_update_startup_delay(p); - else - p->startup_delay = 0; - goto done; - } + { + case PDC_CF_REMOVE: + case PDC_CF_DISABLE: + subcode = 3; // Errcode 6, 3 - peer de-configured + break; + + case PDC_CF_RESTART: + subcode = 6; // Errcode 6, 6 - other configuration change + break; + + case PDC_CMD_DISABLE: + case PDC_CMD_SHUTDOWN: + subcode = 2; // Errcode 6, 2 - administrative shutdown + message = P->message; + break; + + case PDC_CMD_RESTART: + subcode = 4; // Errcode 6, 4 - administrative reset + message = P->message; + break; + + case PDC_RX_LIMIT_HIT: + case PDC_IN_LIMIT_HIT: + subcode = 1; // Errcode 6, 1 - max number of prefixes reached + /* log message for compatibility */ + log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name); + goto limit; + + case PDC_OUT_LIMIT_HIT: + subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown + + limit: + bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED); + if (proto_restart) + bgp_update_startup_delay(p); + else + p->startup_delay = 0; + goto done; + } bgp_store_error(p, NULL, BE_MAN_DOWN, 0); p->startup_delay = 0; @@ -1267,27 +1429,13 @@ done: return p->p.proto_state; } -static void -bgp_cleanup(struct proto *P) -{ - struct bgp_proto *p = (struct bgp_proto *) P; - rt_unlock_table(p->igp_table); -} - -static rtable * -get_igp_table(struct bgp_config *cf) -{ - return cf->igp_table ? cf->igp_table->table : cf->c.table->table; -} - static struct proto * -bgp_init(struct proto_config *C) +bgp_init(struct proto_config *CF) { - struct proto *P = proto_new(C, sizeof(struct bgp_proto)); - struct bgp_config *c = (struct bgp_config *) C; + struct proto *P = proto_new(CF); struct bgp_proto *p = (struct bgp_proto *) P; + struct bgp_config *cf = (struct bgp_config *) CF; - P->accept_ra_types = c->secondary ? RA_ACCEPTED : RA_OPTIMAL; P->rt_notify = bgp_rt_notify; P->import_control = bgp_import_control; P->neigh_notify = bgp_neigh_notify; @@ -1296,102 +1444,293 @@ bgp_init(struct proto_config *C) P->feed_end = bgp_feed_end; P->rte_better = bgp_rte_better; P->rte_mergable = bgp_rte_mergable; - P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL; - - p->cf = c; - p->local_as = c->local_as; - p->remote_as = c->remote_as; - p->is_internal = (c->local_as == c->remote_as); - p->rs_client = c->rs_client; - p->rr_client = c->rr_client; - p->igp_table = get_igp_table(c); + P->rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL; + + p->cf = cf; + p->local_as = cf->local_as; + p->remote_as = cf->remote_as; + p->public_as = cf->local_as; + p->is_internal = (cf->local_as == cf->remote_as); + p->is_interior = p->is_internal || cf->confederation_member; + p->rs_client = cf->rs_client; + p->rr_client = cf->rr_client; + + /* Confederation ID is used for truly external peers */ + if (cf->confederation && !p->is_interior) + p->public_as = cf->confederation; + + /* Add all channels */ + struct bgp_channel_config *cc; + WALK_LIST(cc, CF->channels) + proto_add_channel(P, &cc->c); return P; } +static void +bgp_channel_init(struct channel *C, struct channel_config *CF) +{ + struct bgp_channel *c = (void *) C; + struct bgp_channel_config *cf = (void *) CF; + + c->cf = cf; + c->afi = cf->afi; + c->desc = cf->desc; + + if (cf->igp_table_ip4) + c->igp_table_ip4 = cf->igp_table_ip4->table; + + if (cf->igp_table_ip6) + c->igp_table_ip6 = cf->igp_table_ip6->table; +} + +static int +bgp_channel_start(struct channel *C) +{ + struct bgp_proto *p = (void *) C->proto; + struct bgp_channel *c = (void *) C; + ip_addr src = p->source_addr; + + if (c->igp_table_ip4) + rt_lock_table(c->igp_table_ip4); + + if (c->igp_table_ip6) + rt_lock_table(c->igp_table_ip6); + + c->pool = p->p.pool; // XXXX + bgp_init_bucket_table(c); + bgp_init_prefix_table(c); + + c->next_hop_addr = c->cf->next_hop_addr; + c->link_addr = IPA_NONE; + c->packets_to_send = 0; + + /* Try to use source address as next hop address */ + if (ipa_zero(c->next_hop_addr)) + { + if (bgp_channel_is_ipv4(c) && (ipa_is_ip4(src) || c->ext_next_hop)) + c->next_hop_addr = src; + + if (bgp_channel_is_ipv6(c) && (ipa_is_ip6(src) || c->ext_next_hop)) + c->next_hop_addr = src; + } + + /* Use preferred addresses associated with interface / source address */ + if (ipa_zero(c->next_hop_addr)) + { + /* We know the iface for single-hop, we make lookup for multihop */ + struct neighbor *nbr = p->neigh ?: neigh_find2(&p->p, &src, NULL, 0); + struct iface *iface = nbr ? nbr->iface : NULL; + + if (bgp_channel_is_ipv4(c) && iface && iface->addr4) + c->next_hop_addr = iface->addr4->ip; + + if (bgp_channel_is_ipv6(c) && iface && iface->addr6) + c->next_hop_addr = iface->addr6->ip; + } + + /* Exit if no feasible next hop address is found */ + if (ipa_zero(c->next_hop_addr)) + { + log(L_WARN "%s: Missing next hop address", p->p.name); + return 0; + } + + /* Set link-local address for IPv6 single-hop BGP */ + if (ipa_is_ip6(c->next_hop_addr) && p->neigh) + { + c->link_addr = p->link_addr; + + if (ipa_zero(c->link_addr)) + log(L_WARN "%s: Missing link-local address", p->p.name); + } + + /* Link local address is already in c->link_addr */ + if (ipa_is_link_local(c->next_hop_addr)) + c->next_hop_addr = IPA_NONE; + + return 0; /* XXXX: Currently undefined */ +} + +static void +bgp_channel_shutdown(struct channel *C) +{ + struct bgp_channel *c = (void *) C; + + c->next_hop_addr = IPA_NONE; + c->link_addr = IPA_NONE; + c->packets_to_send = 0; +} + +static void +bgp_channel_cleanup(struct channel *C) +{ + struct bgp_channel *c = (void *) C; + + if (c->igp_table_ip4) + rt_unlock_table(c->igp_table_ip4); + + if (c->igp_table_ip6) + rt_unlock_table(c->igp_table_ip6); +} + +static inline struct bgp_channel_config * +bgp_find_channel_config(struct bgp_config *cf, u32 afi) +{ + struct bgp_channel_config *cc; + + WALK_LIST(cc, cf->c.channels) + if (cc->afi == afi) + return cc; + + return NULL; +} + +struct rtable_config * +bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 type) +{ + struct bgp_channel_config *cc2; + struct rtable_config *tab; + + /* First, try table connected by the channel */ + if (cc->c.table->addr_type == type) + return cc->c.table; + + /* Find paired channel with the same SAFI but the other AFI */ + u32 afi2 = cc->afi ^ 0x30000; + cc2 = bgp_find_channel_config(cf, afi2); + + /* Second, try IGP table configured in the paired channel */ + if (cc2 && (tab = (type == NET_IP4) ? cc2->igp_table_ip4 : cc2->igp_table_ip6)) + return tab; + + /* Third, try table connected by the paired channel */ + if (cc2 && (cc2->c.table->addr_type == type)) + return cc2->c.table; + + /* Last, try default table of given type */ + if (tab = cf->c.global->def_tables[type]) + return tab; + + cf_error("Undefined IGP table"); +} + void -bgp_check_config(struct bgp_config *c) +bgp_postconfig(struct proto_config *CF) { - int internal = (c->local_as == c->remote_as); + struct bgp_config *cf = (void *) CF; + int internal = (cf->local_as == cf->remote_as); /* Do not check templates at all */ - if (c->c.class == SYM_TEMPLATE) + if (cf->c.class == SYM_TEMPLATE) return; /* EBGP direct by default, IBGP multihop by default */ - if (c->multihop < 0) - c->multihop = internal ? 64 : 0; - - /* Different default for gw_mode */ - if (!c->gw_mode) - c->gw_mode = c->multihop ? GW_RECURSIVE : GW_DIRECT; - - /* Different default based on rs_client */ - if (!c->missing_lladdr) - c->missing_lladdr = c->rs_client ? MLL_IGNORE : MLL_SELF; + if (cf->multihop < 0) + cf->multihop = internal ? 64 : 0; - /* Disable after error incompatible with restart limit action */ - if (c->c.in_limit && (c->c.in_limit->action == PLA_RESTART) && c->disable_after_error) - c->c.in_limit->action = PLA_DISABLE; + /* Link check for single-hop BGP by default */ + if (cf->check_link < 0) + cf->check_link = !cf->multihop; - if (!c->local_as) + if (!cf->local_as) cf_error("Local AS number must be set"); - if (ipa_zero(c->remote_ip)) + if (ipa_zero(cf->remote_ip)) cf_error("Neighbor must be configured"); - if (!c->remote_as) + if (!cf->remote_as) cf_error("Remote AS number must be set"); - if (ipa_is_link_local(c->remote_ip) && !c->iface) + if (ipa_is_link_local(cf->remote_ip) && !cf->iface) cf_error("Link-local neighbor address requires specified interface"); - if (!(c->capabilities && c->enable_as4) && (c->remote_as > 0xFFFF)) + if (!(cf->capabilities && cf->enable_as4) && (cf->remote_as > 0xFFFF)) cf_error("Neighbor AS number out of range (AS4 not available)"); - if (!internal && c->rr_client) + if (!internal && cf->rr_client) cf_error("Only internal neighbor can be RR client"); - if (internal && c->rs_client) + if (internal && cf->rs_client) cf_error("Only external neighbor can be RS client"); - if (c->multihop && (c->gw_mode == GW_DIRECT)) - cf_error("Multihop BGP cannot use direct gateway mode"); + if (!cf->confederation && cf->confederation_member) + cf_error("Confederation ID must be set for member sessions"); - if (c->multihop && (ipa_is_link_local(c->remote_ip) || - ipa_is_link_local(c->source_addr))) + if (cf->multihop && (ipa_is_link_local(cf->local_ip) || + ipa_is_link_local(cf->remote_ip))) cf_error("Multihop BGP cannot be used with link-local addresses"); - if (c->multihop && c->iface) + if (cf->multihop && cf->iface) cf_error("Multihop BGP cannot be bound to interface"); - if (c->multihop && c->check_link) + if (cf->multihop && cf->check_link) cf_error("Multihop BGP cannot depend on link state"); - if (c->multihop && c->bfd && ipa_zero(c->source_addr)) - cf_error("Multihop BGP with BFD requires specified source address"); + if (cf->multihop && cf->bfd && ipa_zero(cf->local_ip)) + cf_error("Multihop BGP with BFD requires specified local address"); - if ((c->gw_mode == GW_RECURSIVE) && c->c.table->sorted) - cf_error("BGP in recursive mode prohibits sorted table"); - if (c->deterministic_med && c->c.table->sorted) - cf_error("BGP with deterministic MED prohibits sorted table"); + struct bgp_channel_config *cc; + WALK_LIST(cc, CF->channels) + { + /* Disable after error incompatible with restart limit action */ + if ((cc->c.in_limit.action == PLA_RESTART) && cf->disable_after_error) + cc->c.in_limit.action = PLA_DISABLE; + + /* Different default based on rs_client */ + if (!cc->missing_lladdr) + cc->missing_lladdr = cf->rs_client ? MLL_IGNORE : MLL_SELF; + + /* Different default for gw_mode */ + if (!cc->gw_mode) + cc->gw_mode = cf->multihop ? GW_RECURSIVE : GW_DIRECT; + + /* Default based on proto config */ + if (cc->gr_able == 0xff) + cc->gr_able = (cf->gr_mode == BGP_GR_ABLE); + + /* Default values of IGP tables */ + if ((cc->gw_mode == GW_RECURSIVE) && !cc->desc->no_igp) + { + if (!cc->igp_table_ip4 && (bgp_cc_is_ipv4(cc) || cc->ext_next_hop)) + cc->igp_table_ip4 = bgp_default_igp_table(cf, cc, NET_IP4); + + if (!cc->igp_table_ip6 && (bgp_cc_is_ipv6(cc) || cc->ext_next_hop)) + cc->igp_table_ip6 = bgp_default_igp_table(cf, cc, NET_IP6); - if (c->secondary && !c->c.table->sorted) - cf_error("BGP with secondary option requires sorted table"); + if (cc->igp_table_ip4 && bgp_cc_is_ipv6(cc) && !cc->ext_next_hop) + cf_error("Mismatched IGP table type"); + + if (cc->igp_table_ip6 && bgp_cc_is_ipv4(cc) && !cc->ext_next_hop) + cf_error("Mismatched IGP table type"); + } + + if (cf->multihop && (cc->gw_mode == GW_DIRECT)) + cf_error("Multihop BGP cannot use direct gateway mode"); + + if ((cc->gw_mode == GW_RECURSIVE) && cc->c.table->sorted) + cf_error("BGP in recursive mode prohibits sorted table"); + + if (cf->deterministic_med && cc->c.table->sorted) + cf_error("BGP with deterministic MED prohibits sorted table"); + + if (cc->secondary && !cc->c.table->sorted) + cf_error("BGP with secondary option requires sorted table"); + } } static int -bgp_reconfigure(struct proto *P, struct proto_config *C) +bgp_reconfigure(struct proto *P, struct proto_config *CF) { - struct bgp_config *new = (struct bgp_config *) C; - struct bgp_proto *p = (struct bgp_proto *) P; + struct bgp_proto *p = (void *) P; + struct bgp_config *new = (void *) CF; struct bgp_config *old = p->cf; - if (proto_get_router_id(C) != p->local_id) + if (proto_get_router_id(CF) != p->local_id) return 0; int same = !memcmp(((byte *) old) + sizeof(struct proto_config), @@ -1399,8 +1738,26 @@ bgp_reconfigure(struct proto *P, struct proto_config *C) // password item is last and must be checked separately OFFSETOF(struct bgp_config, password) - sizeof(struct proto_config)) && ((!old->password && !new->password) - || (old->password && new->password && !strcmp(old->password, new->password))) - && (get_igp_table(old) == get_igp_table(new)); + || (old->password && new->password && !strcmp(old->password, new->password))); + + /* FIXME: Move channel reconfiguration to generic protocol code ? */ + struct channel *C, *C2; + struct bgp_channel_config *cc; + + WALK_LIST(C, p->p.channels) + C->stale = 1; + + WALK_LIST(cc, new->c.channels) + { + C = (struct channel *) bgp_find_channel(p, cc->afi); + same = proto_configure_channel(P, &C, &cc->c) && same; + C->stale = 0; + } + + WALK_LIST_DELSAFE(C, C2, p->p.channels) + if (C->stale) + same = proto_configure_channel(P, &C, NULL) && same; + if (same && (p->start_state > BSS_PREPARE)) bgp_update_bfd(p, new->bfd); @@ -1412,11 +1769,34 @@ bgp_reconfigure(struct proto *P, struct proto_config *C) return same; } +#define IGP_TABLE(cf, sym) ((cf)->igp_table_##sym ? (cf)->igp_table_##sym ->table : NULL ) + +static int +bgp_channel_reconfigure(struct channel *C, struct channel_config *CC) +{ + struct bgp_channel *c = (void *) C; + struct bgp_channel_config *new = (void *) CC; + struct bgp_channel_config *old = c->cf; + + if (memcmp(((byte *) old) + sizeof(struct channel_config), + ((byte *) new) + sizeof(struct channel_config), + /* Remaining items must be checked separately */ + OFFSETOF(struct bgp_channel_config, rest) - sizeof(struct channel_config))) + return 0; + + /* Check change in IGP tables */ + if ((IGP_TABLE(old, ip4) != IGP_TABLE(new, ip4)) || + (IGP_TABLE(old, ip6) != IGP_TABLE(new, ip6))) + return 0; + + c->cf = new; + return 1; +} + static void -bgp_copy_config(struct proto_config *dest, struct proto_config *src) +bgp_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUSED) { /* Just a shallow copy */ - proto_copy_rest(dest, src, sizeof(struct bgp_config)); } @@ -1433,14 +1813,14 @@ bgp_copy_config(struct proto_config *dest, struct proto_config *src) * closes the connection. */ void -bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int len) +bgp_error(struct bgp_conn *c, uint code, uint subcode, byte *data, int len) { struct bgp_proto *p = c->bgp; if (c->state == BS_CLOSE) return; - bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, (len > 0) ? len : -len); + bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, ABS(len)); bgp_store_error(p, c, BE_BGP_TX, (code << 16) | subcode); bgp_conn_enter_close_state(c); @@ -1448,13 +1828,13 @@ bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int l c->notify_subcode = subcode; c->notify_data = data; c->notify_size = (len > 0) ? len : 0; - bgp_schedule_packet(c, PKT_NOTIFICATION); + bgp_schedule_packet(c, NULL, PKT_NOTIFICATION); if (code != 6) - { - bgp_update_startup_delay(p); - bgp_stop(p, 0, NULL, 0); - } + { + bgp_update_startup_delay(p); + bgp_stop(p, 0, NULL, 0); + } } /** @@ -1493,19 +1873,19 @@ static const char * bgp_last_errmsg(struct bgp_proto *p) { switch (p->last_error_class) - { - case BE_MISC: - return bgp_misc_errors[p->last_error_code]; - case BE_SOCKET: - return (p->last_error_code == 0) ? "Connection closed" : strerror(p->last_error_code); - case BE_BGP_RX: - case BE_BGP_TX: - return bgp_error_dsc(p->last_error_code >> 16, p->last_error_code & 0xFF); - case BE_AUTO_DOWN: - return bgp_auto_errors[p->last_error_code]; - default: - return ""; - } + { + case BE_MISC: + return bgp_misc_errors[p->last_error_code]; + case BE_SOCKET: + return (p->last_error_code == 0) ? "Connection closed" : strerror(p->last_error_code); + case BE_BGP_RX: + case BE_BGP_TX: + return bgp_error_dsc(p->last_error_code >> 16, p->last_error_code & 0xFF); + case BE_AUTO_DOWN: + return bgp_auto_errors[p->last_error_code]; + default: + return ""; + } } static const char * @@ -1536,86 +1916,230 @@ bgp_get_status(struct proto *P, byte *buf) } static void +bgp_show_afis(int code, char *s, u32 *afis, uint count) +{ + buffer b; + LOG_BUFFER_INIT(b); + + buffer_puts(&b, s); + + for (u32 *af = afis; af < (afis + count); af++) + { + const struct bgp_af_desc *desc = bgp_get_af_desc(*af); + if (desc) + buffer_print(&b, " %s", desc->name); + else + buffer_print(&b, " <%u/%u>", BGP_AFI(*af), BGP_SAFI(*af)); + } + + if (b.pos == b.end) + strcpy(b.end - 32, " ... <too long>"); + + cli_msg(code, b.start); +} + +static void +bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps) +{ + struct bgp_af_caps *ac; + uint any_mp_bgp = 0; + uint any_gr_able = 0; + uint any_add_path = 0; + uint any_ext_next_hop = 0; + u32 *afl1 = alloca(caps->af_count * sizeof(u32)); + u32 *afl2 = alloca(caps->af_count * sizeof(u32)); + uint afn1, afn2; + + WALK_AF_CAPS(caps, ac) + { + any_mp_bgp |= ac->ready; + any_gr_able |= ac->gr_able; + any_add_path |= ac->add_path; + any_ext_next_hop |= ac->ext_next_hop; + } + + if (any_mp_bgp) + { + cli_msg(-1006, " Multiprotocol"); + + afn1 = 0; + WALK_AF_CAPS(caps, ac) + if (ac->ready) + afl1[afn1++] = ac->afi; + + bgp_show_afis(-1006, " AF announced:", afl1, afn1); + } + + if (caps->route_refresh) + cli_msg(-1006, " Route refresh"); + + if (any_ext_next_hop) + { + cli_msg(-1006, " Extended next hop"); + + afn1 = 0; + WALK_AF_CAPS(caps, ac) + if (ac->ext_next_hop) + afl1[afn1++] = ac->afi; + + bgp_show_afis(-1006, " IPv6 nexthop:", afl1, afn1); + } + + if (caps->ext_messages) + cli_msg(-1006, " Extended message"); + + if (caps->gr_aware) + cli_msg(-1006, " Graceful restart"); + + if (any_gr_able) + { + /* Continues from gr_aware */ + cli_msg(-1006, " Restart time: %u", caps->gr_time); + if (caps->gr_flags & BGP_GRF_RESTART) + cli_msg(-1006, " Restart recovery"); + + afn1 = afn2 = 0; + WALK_AF_CAPS(caps, ac) + { + if (ac->gr_able) + afl1[afn1++] = ac->afi; + + if (ac->gr_af_flags & BGP_GRF_FORWARDING) + afl2[afn2++] = ac->afi; + } + + bgp_show_afis(-1006, " AF supported:", afl1, afn1); + bgp_show_afis(-1006, " AF preserved:", afl2, afn2); + } + + if (caps->as4_support) + cli_msg(-1006, " 4-octet AS numbers"); + + if (any_add_path) + { + cli_msg(-1006, " ADD-PATH"); + + afn1 = afn2 = 0; + WALK_AF_CAPS(caps, ac) + { + if (ac->add_path & BGP_ADD_PATH_RX) + afl1[afn1++] = ac->afi; + + if (ac->add_path & BGP_ADD_PATH_TX) + afl2[afn2++] = ac->afi; + } + + bgp_show_afis(-1006, " RX:", afl1, afn1); + bgp_show_afis(-1006, " TX:", afl2, afn2); + } + + if (caps->enhanced_refresh) + cli_msg(-1006, " Enhanced refresh"); +} + +static void bgp_show_proto_info(struct proto *P) { struct bgp_proto *p = (struct bgp_proto *) P; - struct bgp_conn *c = p->conn; - - proto_show_basic_info(P); cli_msg(-1006, " BGP state: %s", bgp_state_dsc(p)); cli_msg(-1006, " Neighbor address: %I%J", p->cf->remote_ip, p->cf->iface); cli_msg(-1006, " Neighbor AS: %u", p->remote_as); - if (p->gr_active) + if (p->gr_active_num) cli_msg(-1006, " Neighbor graceful restart active"); if (P->proto_state == PS_START) - { - struct bgp_conn *oc = &p->outgoing_conn; + { + struct bgp_conn *oc = &p->outgoing_conn; - if ((p->start_state < BSS_CONNECT) && - (p->startup_timer->expires)) - cli_msg(-1006, " Error wait: %d/%d", - p->startup_timer->expires - now, p->startup_delay); + if ((p->start_state < BSS_CONNECT) && + (tm_active(p->startup_timer))) + cli_msg(-1006, " Error wait: %t/%u", + tm_remains(p->startup_timer), p->startup_delay); - if ((oc->state == BS_ACTIVE) && - (oc->connect_retry_timer->expires)) - cli_msg(-1006, " Connect delay: %d/%d", - oc->connect_retry_timer->expires - now, p->cf->connect_delay_time); + if ((oc->state == BS_ACTIVE) && + (tm_active(oc->connect_timer))) + cli_msg(-1006, " Connect delay: %t/%u", + tm_remains(oc->connect_timer), p->cf->connect_delay_time); - if (p->gr_active && p->gr_timer->expires) - cli_msg(-1006, " Restart timer: %d/-", p->gr_timer->expires - now); - } + if (p->gr_active_num && tm_active(p->gr_timer)) + cli_msg(-1006, " Restart timer: %t/-", + tm_remains(p->gr_timer)); + } else if (P->proto_state == PS_UP) - { - cli_msg(-1006, " Neighbor ID: %R", p->remote_id); - cli_msg(-1006, " Neighbor caps: %s%s%s%s%s%s%s", - c->peer_refresh_support ? " refresh" : "", - c->peer_enhanced_refresh_support ? " enhanced-refresh" : "", - c->peer_gr_able ? " restart-able" : (c->peer_gr_aware ? " restart-aware" : ""), - c->peer_as4_support ? " AS4" : "", - (c->peer_add_path & ADD_PATH_RX) ? " add-path-rx" : "", - (c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : "", - c->peer_ext_messages_support ? " ext-messages" : ""); - cli_msg(-1006, " Session: %s%s%s%s%s%s%s%s", - p->is_internal ? "internal" : "external", - p->cf->multihop ? " multihop" : "", - p->rr_client ? " route-reflector" : "", - p->rs_client ? " route-server" : "", - p->as4_session ? " AS4" : "", - p->add_path_rx ? " add-path-rx" : "", - p->add_path_tx ? " add-path-tx" : "", - p->ext_messages ? " ext-messages" : ""); - cli_msg(-1006, " Source address: %I", p->source_addr); - if (P->cf->in_limit) - cli_msg(-1006, " Route limit: %d/%d", - p->p.stats.imp_routes + p->p.stats.filt_routes, P->cf->in_limit->limit); - cli_msg(-1006, " Hold timer: %d/%d", - tm_remains(c->hold_timer), c->hold_time); - cli_msg(-1006, " Keepalive timer: %d/%d", - tm_remains(c->keepalive_timer), c->keepalive_time); - } + { + cli_msg(-1006, " Neighbor ID: %R", p->remote_id); + cli_msg(-1006, " Local capabilities"); + bgp_show_capabilities(p, p->conn->local_caps); + cli_msg(-1006, " Neighbor capabilities"); + bgp_show_capabilities(p, p->conn->remote_caps); + cli_msg(-1006, " Session: %s%s%s%s%s", + p->is_internal ? "internal" : "external", + p->cf->multihop ? " multihop" : "", + p->rr_client ? " route-reflector" : "", + p->rs_client ? " route-server" : "", + p->as4_session ? " AS4" : ""); + cli_msg(-1006, " Source address: %I", p->source_addr); + cli_msg(-1006, " Hold timer: %t/%u", + tm_remains(p->conn->hold_timer), p->conn->hold_time); + cli_msg(-1006, " Keepalive timer: %t/%u", + tm_remains(p->conn->keepalive_timer), p->conn->keepalive_time); + } if ((p->last_error_class != BE_NONE) && (p->last_error_class != BE_MAN_DOWN)) + { + const char *err1 = bgp_err_classes[p->last_error_class]; + const char *err2 = bgp_last_errmsg(p); + cli_msg(-1006, " Last error: %s%s", err1, err2); + } + + { + struct bgp_channel *c; + WALK_LIST(c, p->p.channels) { - const char *err1 = bgp_err_classes[p->last_error_class]; - const char *err2 = bgp_last_errmsg(p); - cli_msg(-1006, " Last error: %s%s", err1, err2); + channel_show_info(&c->c); + + if (c->c.channel_state == CS_UP) + { + if (ipa_zero(c->link_addr)) + cli_msg(-1006, " BGP Next hop: %I", c->next_hop_addr); + else + cli_msg(-1006, " BGP Next hop: %I %I", c->next_hop_addr, c->link_addr); + } + + if (c->igp_table_ip4) + cli_msg(-1006, " IGP IPv4 table: %s", c->igp_table_ip4->name); + + if (c->igp_table_ip6) + cli_msg(-1006, " IGP IPv6 table: %s", c->igp_table_ip6->name); } + } } +struct channel_class channel_bgp = { + .channel_size = sizeof(struct bgp_channel), + .config_size = sizeof(struct bgp_channel_config), + .init = bgp_channel_init, + .start = bgp_channel_start, + .shutdown = bgp_channel_shutdown, + .cleanup = bgp_channel_cleanup, + .reconfigure = bgp_channel_reconfigure, +}; + struct protocol proto_bgp = { .name = "BGP", .template = "bgp%d", .attr_class = EAP_BGP, .preference = DEF_PREF_BGP, + .channel_mask = NB_IP | NB_VPN | NB_FLOW, + .proto_size = sizeof(struct bgp_proto), .config_size = sizeof(struct bgp_config), + .postconfig = bgp_postconfig, .init = bgp_init, .start = bgp_start, .shutdown = bgp_shutdown, - .cleanup = bgp_cleanup, .reconfigure = bgp_reconfigure, .copy_config = bgp_copy_config, .get_status = bgp_get_status, diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h index 22a150ab..1310582b 100644 --- a/proto/bgp/bgp.h +++ b/proto/bgp/bgp.h @@ -2,6 +2,8 @@ * BIRD -- The Border Gateway Protocol * * (c) 2000 Martin Mares <mj@ucw.cz> + * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2008--2016 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ @@ -10,26 +12,80 @@ #define _BIRD_BGP_H_ #include <stdint.h> +#include <setjmp.h> +#include "nest/bird.h" #include "nest/route.h" #include "nest/bfd.h" +//#include "lib/lists.h" #include "lib/hash.h" +#include "lib/socket.h" struct linpool; struct eattr; + +/* Address families */ + +#define BGP_AFI_IPV4 1 +#define BGP_AFI_IPV6 2 + +#define BGP_SAFI_UNICAST 1 +#define BGP_SAFI_MULTICAST 2 +#define BGP_SAFI_MPLS 4 +#define BGP_SAFI_MPLS_VPN 128 +#define BGP_SAFI_VPN_MULTICAST 129 +#define BGP_SAFI_FLOW 133 + +/* Internal AF codes */ + +#define BGP_AF(A, B) (((u32)(A) << 16) | (u32)(B)) +#define BGP_AFI(A) ((u32)(A) >> 16) +#define BGP_SAFI(A) ((u32)(A) & 0xFFFF) + +#define BGP_AF_IPV4 BGP_AF( BGP_AFI_IPV4, BGP_SAFI_UNICAST ) +#define BGP_AF_IPV6 BGP_AF( BGP_AFI_IPV6, BGP_SAFI_UNICAST ) +#define BGP_AF_IPV4_MC BGP_AF( BGP_AFI_IPV4, BGP_SAFI_MULTICAST ) +#define BGP_AF_IPV6_MC BGP_AF( BGP_AFI_IPV6, BGP_SAFI_MULTICAST ) +#define BGP_AF_IPV4_MPLS BGP_AF( BGP_AFI_IPV4, BGP_SAFI_MPLS ) +#define BGP_AF_IPV6_MPLS BGP_AF( BGP_AFI_IPV6, BGP_SAFI_MPLS ) +#define BGP_AF_VPN4_MPLS BGP_AF( BGP_AFI_IPV4, BGP_SAFI_MPLS_VPN ) +#define BGP_AF_VPN6_MPLS BGP_AF( BGP_AFI_IPV6, BGP_SAFI_MPLS_VPN ) +#define BGP_AF_VPN4_MC BGP_AF( BGP_AFI_IPV4, BGP_SAFI_VPN_MULTICAST ) +#define BGP_AF_VPN6_MC BGP_AF( BGP_AFI_IPV6, BGP_SAFI_VPN_MULTICAST ) +#define BGP_AF_FLOW4 BGP_AF( BGP_AFI_IPV4, BGP_SAFI_FLOW ) +#define BGP_AF_FLOW6 BGP_AF( BGP_AFI_IPV6, BGP_SAFI_FLOW ) + + +struct bgp_write_state; +struct bgp_parse_state; +struct bgp_export_state; +struct bgp_bucket; + +struct bgp_af_desc { + u32 afi; + u32 net; + u8 mpls; + u8 no_igp; + const char *name; + uint (*encode_nlri)(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size); + void (*decode_nlri)(struct bgp_parse_state *s, byte *pos, uint len, rta *a); + void (*update_next_hop)(struct bgp_export_state *s, eattr *nh, ea_list **to); + uint (*encode_next_hop)(struct bgp_write_state *s, eattr *nh, byte *buf, uint size); + void (*decode_next_hop)(struct bgp_parse_state *s, byte *pos, uint len, rta *a); +}; + + struct bgp_config { struct proto_config c; u32 local_as, remote_as; + ip_addr local_ip; /* Source address to use */ ip_addr remote_ip; - ip_addr source_addr; /* Source address to use */ struct iface *iface; /* Interface for link-local addresses */ + u16 local_port; /* Local listening port */ u16 remote_port; /* Neighbor destination port */ int multihop; /* Number of hops if multihop */ - int ttl_security; /* Enable TTL security [RFC5082] */ - int next_hop_self; /* Always set next hop to local IP address */ - int next_hop_keep; /* Do not touch next hop attribute */ - int missing_lladdr; /* What we will do when we don' know link-local addr, see MLL_* */ - int gw_mode; /* How we compute route gateway from next_hop attr, see GW_* */ + int strict_bind; /* Bind listening socket to local address */ + int ttl_security; /* Enable TTL security [RFC 5082] */ int compare_path_lengths; /* Use path lengths when selecting best route */ int med_metric; /* Compare MULTI_EXIT_DISC even between routes from differen ASes */ int igp_metric; /* Use IGP metrics when selecting best route */ @@ -37,22 +93,22 @@ struct bgp_config { int deterministic_med; /* Use more complicated algo to have strict RFC 4271 MED comparison */ u32 default_local_pref; /* Default value for LOCAL_PREF attribute */ u32 default_med; /* Default value for MULTI_EXIT_DISC attribute */ - int capabilities; /* Enable capability handshake [RFC3392] */ - int enable_refresh; /* Enable local support for route refresh [RFC2918] */ - int enable_as4; /* Enable local support for 4B AS numbers [RFC4893] */ + int capabilities; /* Enable capability handshake [RFC 5492] */ + int enable_refresh; /* Enable local support for route refresh [RFC 2918] */ + int enable_as4; /* Enable local support for 4B AS numbers [RFC 6793] */ int enable_extended_messages; /* Enable local support for extended messages [draft] */ u32 rr_cluster_id; /* Route reflector cluster ID, if different from local ID */ int rr_client; /* Whether neighbor is RR client of me */ int rs_client; /* Whether neighbor is RS client of me */ - int advertise_ipv4; /* Whether we should add IPv4 capability advertisement to OPEN message */ + u32 confederation; /* Confederation ID, or zero if confeds not active */ + int confederation_member; /* Whether neighbor AS is member of our confederation */ int passive; /* Do not initiate outgoing connection */ int interpret_communities; /* Hardwired handling of well-known communities */ - int secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */ - int add_path; /* Use ADD-PATH extension [RFC7911] */ int allow_local_as; /* Allow that number of local ASNs in incoming AS_PATHs */ int allow_local_pref; /* Allow LOCAL_PREF in EBGP sessions */ int gr_mode; /* Graceful restart mode (BGP_GR_*) */ int setkey; /* Set MD5 password to system SA/SP database */ + /* Times below are in seconds */ unsigned gr_time; /* Graceful restart timeout */ unsigned connect_delay_time; /* Minimum delay between connect attempts */ unsigned connect_retry_time; /* Timeout for connect attempts */ @@ -64,11 +120,31 @@ struct bgp_config { unsigned disable_after_error; /* Disable the protocol when error is detected */ char *password; /* Password used for MD5 authentication */ - struct rtable_config *igp_table; /* Table used for recursive next hop lookups */ int check_link; /* Use iface link state for liveness detection */ int bfd; /* Use BFD for liveness detection */ }; +struct bgp_channel_config { + struct channel_config c; + + u32 afi; + const struct bgp_af_desc *desc; + + ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */ + u8 next_hop_self; /* Always set next hop to local IP address */ + u8 next_hop_keep; /* Do not touch next hop attribute */ + u8 missing_lladdr; /* What we will do when we don' know link-local addr, see MLL_* */ + u8 gw_mode; /* How we compute route gateway from next_hop attr, see GW_* */ + u8 secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */ + u8 gr_able; /* Allow full graceful restart for the channel */ + u8 ext_next_hop; /* Allow both IPv4 and IPv6 next hops */ + u8 add_path; /* Use ADD-PATH extension [RFC 7911] */ + + uint rest[0]; /* Remaining items are reconfigured separately */ + struct rtable_config *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ + struct rtable_config *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ +}; + #define MLL_SELF 1 #define MLL_DROP 2 #define MLL_IGNORE 3 @@ -76,112 +152,241 @@ struct bgp_config { #define GW_DIRECT 1 #define GW_RECURSIVE 2 -#define ADD_PATH_RX 1 -#define ADD_PATH_TX 2 -#define ADD_PATH_FULL 3 +#define BGP_ADD_PATH_RX 1 +#define BGP_ADD_PATH_TX 2 +#define BGP_ADD_PATH_FULL 3 -#define BGP_GR_ABLE 1 -#define BGP_GR_AWARE 2 +#define BGP_GR_ABLE 1 +#define BGP_GR_AWARE 2 -/* For peer_gr_flags */ +/* For GR capability common flags */ #define BGP_GRF_RESTART 0x80 -/* For peer_gr_aflags */ +/* For GR capability per-AF flags */ #define BGP_GRF_FORWARDING 0x80 +struct bgp_af_caps { + u32 afi; + u8 ready; /* Multiprotocol capability, RFC 4760 */ + u8 gr_able; /* Graceful restart support, RFC 4724 */ + u8 gr_af_flags; /* Graceful restart per-AF flags */ + u8 ext_next_hop; /* Extended IPv6 next hop, RFC 5549 */ + u8 add_path; /* Multiple paths support, RFC 7911 */ +}; + +struct bgp_caps { + u32 as4_number; /* Announced ASN */ + + u8 as4_support; /* Four-octet AS capability, RFC 6793 */ + u8 ext_messages; /* Extended message length, RFC draft */ + u8 route_refresh; /* Route refresh capability, RFC 2918 */ + u8 enhanced_refresh; /* Enhanced route refresh, RFC 7313 */ + + u8 gr_aware; /* Graceful restart capability, RFC 4724 */ + u8 gr_flags; /* Graceful restart flags */ + u16 gr_time; /* Graceful restart time in seconds */ + + u16 af_count; /* Number of af_data items */ + + struct bgp_af_caps af_data[0]; /* Per-AF capability data */ +}; + +#define WALK_AF_CAPS(caps,ac) \ + for (ac = caps->af_data; ac < &caps->af_data[caps->af_count]; ac++) + + +struct bgp_socket { + node n; /* Node in global bgp_sockets */ + sock *sk; /* Real listening socket */ + u32 uc; /* Use count */ +}; + struct bgp_conn { struct bgp_proto *bgp; struct birdsock *sk; - uint state; /* State of connection state machine */ - struct timer *connect_retry_timer; - struct timer *hold_timer; - struct timer *keepalive_timer; - struct event *tx_ev; - int packets_to_send; /* Bitmap of packet types to be sent */ + u8 state; /* State of connection state machine */ + u8 as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */ + u8 ext_messages; /* Session uses extended message length */ + + struct bgp_caps *local_caps; + struct bgp_caps *remote_caps; + timer *connect_timer; + timer *hold_timer; + timer *keepalive_timer; + event *tx_ev; + u32 packets_to_send; /* Bitmap of packet types to be sent */ + u32 channels_to_send; /* Bitmap of channels with packets to be sent */ + u8 last_channel; /* Channel used last time for TX */ + u8 last_channel_count; /* Number of times the last channel was used in succession */ int notify_code, notify_subcode, notify_size; byte *notify_data; - u32 advertised_as; /* Temporary value for AS number received */ - int start_state; /* protocol start_state snapshot when connection established */ - u8 peer_refresh_support; /* Peer supports route refresh [RFC2918] */ - u8 peer_as4_support; /* Peer supports 4B AS numbers [RFC4893] */ - u8 peer_add_path; /* Peer supports ADD-PATH [RFC7911] */ - u8 peer_enhanced_refresh_support; /* Peer supports enhanced refresh [RFC7313] */ - u8 peer_gr_aware; - u8 peer_gr_able; - u16 peer_gr_time; - u8 peer_gr_flags; - u8 peer_gr_aflags; - u8 peer_ext_messages_support; /* Peer supports extended message length [draft] */ - unsigned hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */ + + uint hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */ }; struct bgp_proto { struct proto p; struct bgp_config *cf; /* Shortcut to BGP configuration */ u32 local_as, remote_as; - int start_state; /* Substates that partitions BS_START */ - u8 is_internal; /* Internal BGP connection (local_as == remote_as) */ - u8 as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */ - u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */ - u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */ - u8 ext_messages; /* Session allows to use extended messages (both sides support it) */ + u32 public_as; /* Externally visible ASN (local_as or confederation id) */ u32 local_id; /* BGP identifier of this router */ u32 remote_id; /* BGP identifier of the neighbor */ u32 rr_cluster_id; /* Route reflector cluster ID */ - int rr_client; /* Whether neighbor is RR client of me */ - int rs_client; /* Whether neighbor is RS client of me */ + int start_state; /* Substates that partitions BS_START */ + u8 is_internal; /* Internal BGP session (local_as == remote_as) */ + u8 is_interior; /* Internal or intra-confederation BGP session */ + u8 as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */ + u8 rr_client; /* Whether neighbor is RR client of me */ + u8 rs_client; /* Whether neighbor is RS client of me */ + u8 route_refresh; /* Route refresh allowed to send [RFC 2918] */ + u8 enhanced_refresh; /* Enhanced refresh is negotiated [RFC 7313] */ u8 gr_ready; /* Neighbor could do graceful restart */ - u8 gr_active; /* Neighbor is doing graceful restart */ - u8 feed_state; /* Feed state (TX) for EoR, RR packets, see BFS_* */ - u8 load_state; /* Load state (RX) for EoR, RR packets, see BFS_* */ + u8 gr_active_num; /* Neighbor is doing GR, number of active channels */ + u8 channel_count; /* Number of active channels */ + u32 *afi_map; /* Map channel index -> AFI */ + struct bgp_channel **channel_map; /* Map channel index -> channel */ struct bgp_conn *conn; /* Connection we have established */ struct bgp_conn outgoing_conn; /* Outgoing connection we're working with */ struct bgp_conn incoming_conn; /* Incoming connection we have neither accepted nor rejected yet */ struct object_lock *lock; /* Lock for neighbor connection */ struct neighbor *neigh; /* Neighbor entry corresponding to remote ip, NULL if multihop */ + struct bgp_socket *sock; /* Shared listening socket */ struct bfd_request *bfd_req; /* BFD request, if BFD is used */ ip_addr source_addr; /* Local address used as an advertised next hop */ - rtable *igp_table; /* Table used for recursive next hop lookups */ - struct event *event; /* Event for respawning and shutting process */ - struct timer *startup_timer; /* Timer used to delay protocol startup due to previous errors (startup_delay) */ - struct timer *gr_timer; /* Timer waiting for reestablishment after graceful restart */ - struct bgp_bucket **bucket_hash; /* Hash table of attribute buckets */ - uint hash_size, hash_count, hash_limit; - HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ - slab *prefix_slab; /* Slab holding prefix nodes */ - list bucket_queue; /* Queue of buckets to send */ - struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ - unsigned startup_delay; /* Time to delay protocol startup by due to errors */ - bird_clock_t last_proto_error; /* Time of last error that leads to protocol stop */ + ip_addr link_addr; /* Link-local version of source_addr */ + event *event; /* Event for respawning and shutting process */ + timer *startup_timer; /* Timer used to delay protocol startup due to previous errors (startup_delay) */ + timer *gr_timer; /* Timer waiting for reestablishment after graceful restart */ + uint startup_delay; /* Delay (in seconds) of protocol startup due to previous errors */ + btime last_proto_error; /* Time of last error that leads to protocol stop */ u8 last_error_class; /* Error class of last error */ u32 last_error_code; /* Error code of last error. BGP protocol errors are encoded as (bgp_err_code << 16 | bgp_err_subcode) */ -#ifdef IPV6 - byte *mp_reach_start, *mp_unreach_start; /* Multiprotocol BGP attribute notes */ - unsigned mp_reach_len, mp_unreach_len; - ip_addr local_link; /* Link-level version of source_addr */ -#endif +}; + +struct bgp_channel { + struct channel c; + + /* Rest are BGP specific data */ + struct bgp_channel_config *cf; + pool *pool; /* XXXX */ + + u32 afi; + u32 index; + const struct bgp_af_desc *desc; + + HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */ + struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */ + list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */ + + HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */ + slab *prefix_slab; /* Slab holding prefix nodes */ + + rtable *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */ + rtable *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */ + ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */ + ip_addr link_addr; /* Link-local version of next_hop_addr */ + + u32 packets_to_send; /* Bitmap of packet types to be sent */ + + u8 gr_ready; /* Neighbor could do GR on this AF */ + u8 gr_active; /* Neighbor is doing GR and keeping fwd state */ + + u8 ext_next_hop; /* Session allows both IPv4 and IPv6 next hops */ + + u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */ + u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */ + + u8 feed_state; /* Feed state (TX) for EoR, RR packets, see BFS_* */ + u8 load_state; /* Load state (RX) for EoR, RR packets, see BFS_* */ }; struct bgp_prefix { - struct { - ip_addr prefix; - int pxlen; - } n; + node buck_node; /* Node in per-bucket list */ + struct bgp_prefix *next; /* Node in prefix hash table */ + u32 hash; u32 path_id; - struct bgp_prefix *next; - node bucket_node; /* Node in per-bucket list */ + net_addr net[0]; }; struct bgp_bucket { node send_node; /* Node in send queue */ - struct bgp_bucket *hash_next, *hash_prev; /* Node in bucket hash table */ - unsigned hash; /* Hash over extended attributes */ - list prefixes; /* Prefixes in this buckets */ + struct bgp_bucket *next; /* Node in bucket hash table */ + list prefixes; /* Prefixes in this bucket (struct bgp_prefix) */ + u32 hash; /* Hash over extended attributes */ ea_list eattrs[0]; /* Per-bucket extended attributes */ }; +struct bgp_export_state { + struct bgp_proto *proto; + struct bgp_channel *channel; + struct linpool *pool; + + struct bgp_proto *src; + rte *route; + int mpls; + + u32 attrs_seen[1]; + uint err_withdraw; +}; + +struct bgp_write_state { + struct bgp_proto *proto; + struct bgp_channel *channel; + struct linpool *pool; + + int as4_session; + int add_path; + int mpls; + + eattr *mp_next_hop; + adata *mpls_labels; +}; + +struct bgp_parse_state { + struct bgp_proto *proto; + struct bgp_channel *channel; + struct linpool *pool; + + int as4_session; + int add_path; + int mpls; + + u32 attrs_seen[256/32]; + + u32 mp_reach_af; + u32 mp_unreach_af; + + uint attr_len; + uint ip_reach_len; + uint ip_unreach_len; + uint ip_next_hop_len; + uint mp_reach_len; + uint mp_unreach_len; + uint mp_next_hop_len; + + byte *attrs; + byte *ip_reach_nlri; + byte *ip_unreach_nlri; + byte *ip_next_hop_data; + byte *mp_reach_nlri; + byte *mp_unreach_nlri; + byte *mp_next_hop_data; + + uint err_withdraw; + uint err_subcode; + jmp_buf err_jmpbuf; + + struct hostentry *hostentry; + adata *mpls_labels; + + /* Cached state for bgp_rte_update() */ + u32 last_id; + struct rte_src *last_src; + rta *cached_rta; +}; + #define BGP_PORT 179 #define BGP_VERSION 4 #define BGP_HEADER_LENGTH 19 @@ -192,13 +397,33 @@ struct bgp_bucket { #define BGP_RX_BUFFER_EXT_SIZE 65535 #define BGP_TX_BUFFER_EXT_SIZE 65535 -static inline uint bgp_max_packet_length(struct bgp_proto *p) -{ return p->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; } +static inline int bgp_channel_is_ipv4(struct bgp_channel *c) +{ return BGP_AFI(c->afi) == BGP_AFI_IPV4; } + +static inline int bgp_channel_is_ipv6(struct bgp_channel *c) +{ return BGP_AFI(c->afi) == BGP_AFI_IPV6; } + +static inline int bgp_cc_is_ipv4(struct bgp_channel_config *c) +{ return BGP_AFI(c->afi) == BGP_AFI_IPV4; } + +static inline int bgp_cc_is_ipv6(struct bgp_channel_config *c) +{ return BGP_AFI(c->afi) == BGP_AFI_IPV6; } + +static inline uint bgp_max_packet_length(struct bgp_conn *conn) +{ return conn->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; } + +static inline void +bgp_parse_error(struct bgp_parse_state *s, uint subcode) +{ + s->err_subcode = subcode; + longjmp(s->err_jmpbuf, 1); +} extern struct linpool *bgp_linpool; +extern struct linpool *bgp_linpool2; -void bgp_start_timer(struct timer *t, int value); +void bgp_start_timer(timer *t, uint value); void bgp_check_config(struct bgp_config *c); void bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int len); void bgp_close_conn(struct bgp_conn *c); @@ -208,9 +433,9 @@ void bgp_conn_enter_established_state(struct bgp_conn *conn); void bgp_conn_enter_close_state(struct bgp_conn *conn); void bgp_conn_enter_idle_state(struct bgp_conn *conn); void bgp_handle_graceful_restart(struct bgp_proto *p); -void bgp_graceful_restart_done(struct bgp_proto *p); -void bgp_refresh_begin(struct bgp_proto *p); -void bgp_refresh_end(struct bgp_proto *p); +void bgp_graceful_restart_done(struct bgp_channel *c); +void bgp_refresh_begin(struct bgp_channel *c); +void bgp_refresh_end(struct bgp_channel *c); void bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code); void bgp_stop(struct bgp_proto *p, uint subcode, byte *data, uint len); @@ -233,48 +458,73 @@ struct rte_source *bgp_get_source(struct bgp_proto *p, u32 path_id); /* attrs.c */ -/* Hack: although BA_NEXT_HOP attribute has type EAF_TYPE_IP_ADDRESS, in IPv6 - * we store two addesses in it - a global address and a link local address. - */ -#ifdef IPV6 -#define NEXT_HOP_LENGTH (2*sizeof(ip_addr)) -static inline void set_next_hop(byte *b, ip_addr addr) { ((ip_addr *) b)[0] = addr; ((ip_addr *) b)[1] = IPA_NONE; } -#else -#define NEXT_HOP_LENGTH sizeof(ip_addr) -static inline void set_next_hop(byte *b, ip_addr addr) { ((ip_addr *) b)[0] = addr; } -#endif +static inline eattr * +bgp_find_attr(ea_list *attrs, uint code) +{ + return ea_find(attrs, EA_CODE(EAP_BGP, code)); +} + +eattr * +bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintptr_t val); + +static inline void +bgp_set_attr_u32(ea_list **to, struct linpool *pool, uint code, uint flags, u32 val) +{ bgp_set_attr(to, pool, code, flags, (uintptr_t) val); } + +static inline void +bgp_set_attr_ptr(ea_list **to, struct linpool *pool, uint code, uint flags, struct adata *val) +{ bgp_set_attr(to, pool, code, flags, (uintptr_t) val); } + +static inline void +bgp_set_attr_data(ea_list **to, struct linpool *pool, uint code, uint flags, void *data, uint len) +{ + struct adata *a = lp_alloc_adata(pool, len); + memcpy(a->data, data, len); + bgp_set_attr(to, pool, code, flags, (uintptr_t) a); +} + +static inline void +bgp_unset_attr(ea_list **to, struct linpool *pool, uint code) +{ eattr *e = bgp_set_attr(to, pool, code, 0, 0); e->type = EAF_TYPE_UNDEF; } + + +int bgp_encode_attrs(struct bgp_write_state *s, ea_list *attrs, byte *buf, byte *end); +ea_list * bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len); + +void bgp_init_bucket_table(struct bgp_channel *c); +void bgp_free_bucket_table(struct bgp_channel *c); +void bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b); +void bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b); +void bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b); + +void bgp_init_prefix_table(struct bgp_channel *c); +void bgp_free_prefix_table(struct bgp_channel *c); +void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *bp); -void bgp_attach_attr(struct ea_list **to, struct linpool *pool, unsigned attr, uintptr_t val); -byte *bgp_attach_attr_wa(struct ea_list **to, struct linpool *pool, unsigned attr, unsigned len); -struct rta *bgp_decode_attrs(struct bgp_conn *conn, byte *a, uint len, struct linpool *pool, int mandatory); -int bgp_get_attr(struct eattr *e, byte *buf, int buflen); int bgp_rte_better(struct rte *, struct rte *); int bgp_rte_mergable(rte *pri, rte *sec); int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best); -void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs); +void bgp_rt_notify(struct proto *P, struct channel *C, net *n, rte *new, rte *old, ea_list *attrs); int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *); -void bgp_init_bucket_table(struct bgp_proto *); -void bgp_free_bucket_table(struct bgp_proto *p); -void bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck); -void bgp_init_prefix_table(struct bgp_proto *p, u32 order); -void bgp_free_prefix_table(struct bgp_proto *p); -void bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp); -uint bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains); +int bgp_get_attr(struct eattr *e, byte *buf, int buflen); void bgp_get_route_info(struct rte *, byte *buf, struct ea_list *attrs); -inline static void bgp_attach_attr_ip(struct ea_list **to, struct linpool *pool, unsigned attr, ip_addr a) -{ *(ip_addr *) bgp_attach_attr_wa(to, pool, attr, sizeof(ip_addr)) = a; } /* packets.c */ void mrt_dump_bgp_state_change(struct bgp_conn *conn, unsigned old, unsigned new); -void bgp_schedule_packet(struct bgp_conn *conn, int type); +const struct bgp_af_desc *bgp_get_af_desc(u32 afi); +const struct bgp_af_caps *bgp_find_af_caps(struct bgp_caps *caps, u32 afi); +void bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type); void bgp_kick_tx(void *vconn); void bgp_tx(struct birdsock *sk); int bgp_rx(struct birdsock *sk, uint size); const char * bgp_error_dsc(unsigned code, unsigned subcode); void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len); +void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to); + + /* Packet types */ #define PKT_OPEN 0x01 @@ -292,26 +542,25 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi #define BAF_PARTIAL 0x20 #define BAF_EXT_LEN 0x10 -#define BA_ORIGIN 0x01 /* [RFC1771] */ /* WM */ +#define BA_ORIGIN 0x01 /* RFC 4271 */ /* WM */ #define BA_AS_PATH 0x02 /* WM */ #define BA_NEXT_HOP 0x03 /* WM */ #define BA_MULTI_EXIT_DISC 0x04 /* ON */ #define BA_LOCAL_PREF 0x05 /* WD */ #define BA_ATOMIC_AGGR 0x06 /* WD */ #define BA_AGGREGATOR 0x07 /* OT */ -#define BA_COMMUNITY 0x08 /* [RFC1997] */ /* OT */ -#define BA_ORIGINATOR_ID 0x09 /* [RFC1966] */ /* ON */ -#define BA_CLUSTER_LIST 0x0a /* ON */ -/* We don't support these: */ -#define BA_DPA 0x0b /* ??? */ -#define BA_ADVERTISER 0x0c /* [RFC1863] */ -#define BA_RCID_PATH 0x0d -#define BA_MP_REACH_NLRI 0x0e /* [RFC2283] */ -#define BA_MP_UNREACH_NLRI 0x0f -#define BA_EXT_COMMUNITY 0x10 /* [RFC4360] */ -#define BA_AS4_PATH 0x11 /* [RFC4893] */ -#define BA_AS4_AGGREGATOR 0x12 -#define BA_LARGE_COMMUNITY 0x20 /* [RFC8092] */ +#define BA_COMMUNITY 0x08 /* RFC 1997 */ /* OT */ +#define BA_ORIGINATOR_ID 0x09 /* RFC 4456 */ /* ON */ +#define BA_CLUSTER_LIST 0x0a /* RFC 4456 */ /* ON */ +#define BA_MP_REACH_NLRI 0x0e /* RFC 4760 */ +#define BA_MP_UNREACH_NLRI 0x0f /* RFC 4760 */ +#define BA_EXT_COMMUNITY 0x10 /* RFC 4360 */ +#define BA_AS4_PATH 0x11 /* RFC 6793 */ +#define BA_AS4_AGGREGATOR 0x12 /* RFC 6793 */ +#define BA_LARGE_COMMUNITY 0x20 /* RFC 8092 */ + +/* Bird's private internal BGP attributes */ +#define BA_MPLS_LABEL_STACK 0xfe /* MPLS label stack transfer attribute */ /* BGP connection states */ @@ -331,14 +580,12 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi * * When BGP protocol is started by core, it goes to BSS_PREPARE. When BGP * protocol done what is neccessary to start itself (like acquiring the lock), - * it goes to BSS_CONNECT. When some connection attempt failed because of - * option or capability error, it goes to BSS_CONNECT_NOCAP. + * it goes to BSS_CONNECT. */ #define BSS_PREPARE 0 /* Used before ordinary BGP started, i. e. waiting for lock */ #define BSS_DELAY 1 /* Startup delay due to previous errors */ #define BSS_CONNECT 2 /* Ordinary BGP connecting */ -#define BSS_CONNECT_NOCAP 3 /* Legacy BGP connecting (without capabilities) */ /* BGP feed states (TX) @@ -347,7 +594,7 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi * * RFC 7313 specifies that a route refresh should be demarcated by BoRR and EoRR packets. * - * These states (stored in p->feed_state) are used to keep track of these + * These states (stored in c->feed_state) are used to keep track of these * requirements. When such feed is started, BFS_LOADING / BFS_REFRESHING is * set. When it ended, BFS_LOADED / BFS_REFRESHED is set to schedule End-of-RIB * or EoRR packet. When the packet is sent, the state returned to BFS_NONE. @@ -403,15 +650,5 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi #define ORIGIN_EGP 1 #define ORIGIN_INCOMPLETE 2 -/* Address families */ - -#define BGP_AF_IPV4 1 -#define BGP_AF_IPV6 2 - -#ifdef IPV6 -#define BGP_AF BGP_AF_IPV6 -#else -#define BGP_AF BGP_AF_IPV4 -#endif #endif diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y index 55c602f1..6ce0f1aa 100644 --- a/proto/bgp/config.Y +++ b/proto/bgp/config.Y @@ -13,28 +13,32 @@ CF_HDR CF_DEFINES #define BGP_CFG ((struct bgp_config *) this_proto) +#define BGP_CC ((struct bgp_channel_config *) this_channel) CF_DECLS -CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, - KEEPALIVE, MULTIHOP, STARTUP, VIA, NEXT, HOP, SELF, DEFAULT, - PATH, METRIC, ERROR, START, DELAY, FORGET, WAIT, ENABLE, - DISABLE, AFTER, BGP_PATH, BGP_LOCAL_PREF, BGP_MED, BGP_ORIGIN, - BGP_NEXT_HOP, BGP_ATOMIC_AGGR, BGP_AGGREGATOR, BGP_COMMUNITY, - BGP_EXT_COMMUNITY, SOURCE, ADDRESS, PASSWORD, RR, RS, CLIENT, - CLUSTER, ID, AS4, ADVERTISE, IPV4, CAPABILITIES, LIMIT, PASSIVE, - PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH, - INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP, - TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC, - SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX, GRACEFUL, RESTART, AWARE, - CHECK, LINK, PORT, EXTENDED, MESSAGES, SETKEY, BGP_LARGE_COMMUNITY) +CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE, + MULTIHOP, STARTUP, VIA, NEXT, HOP, SELF, DEFAULT, PATH, METRIC, ERROR, + START, DELAY, FORGET, WAIT, ENABLE, DISABLE, AFTER, BGP_PATH, + BGP_LOCAL_PREF, BGP_MED, BGP_ORIGIN, BGP_NEXT_HOP, BGP_ATOMIC_AGGR, + BGP_AGGREGATOR, BGP_COMMUNITY, BGP_EXT_COMMUNITY, BGP_LARGE_COMMUNITY, + SOURCE, ADDRESS, PASSWORD, RR, RS, CLIENT, CLUSTER, ID, AS4, ADVERTISE, + IPV4, CAPABILITIES, LIMIT, PASSIVE, PREFER, OLDER, MISSING, LLADDR, + DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, + BGP_CLUSTER_LIST, IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, + SECURITY, DETERMINISTIC, SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX, + GRACEFUL, RESTART, AWARE, CHECK, LINK, PORT, EXTENDED, MESSAGES, SETKEY, + STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6) + +%type <i32> bgp_afi CF_GRAMMAR -CF_ADDTO(proto, bgp_proto '}' { bgp_check_config(BGP_CFG); } ) +CF_ADDTO(proto, bgp_proto '}' ) bgp_proto_start: proto_start BGP { this_proto = proto_config_new(&proto_bgp, $1); + BGP_CFG->local_port = BGP_PORT; BGP_CFG->remote_port = BGP_PORT; BGP_CFG->multihop = -1; /* undefined */ BGP_CFG->hold_time = 240; @@ -49,26 +53,36 @@ bgp_proto_start: proto_start BGP { BGP_CFG->enable_refresh = 1; BGP_CFG->enable_as4 = 1; BGP_CFG->capabilities = 2; - BGP_CFG->advertise_ipv4 = 1; BGP_CFG->interpret_communities = 1; BGP_CFG->default_local_pref = 100; BGP_CFG->gr_mode = BGP_GR_AWARE; BGP_CFG->gr_time = 120; BGP_CFG->setkey = 1; - } + BGP_CFG->check_link = -1; + } + ; + +bgp_loc_opts: + /* empty */ + | bgp_loc_opts PORT expr { BGP_CFG->local_port = $3; if (($3<1) || ($3>65535)) cf_error("Invalid port number"); } + | bgp_loc_opts AS expr { BGP_CFG->local_as = $3; } ; bgp_nbr_opts: /* empty */ - | bgp_nbr_opts PORT expr { BGP_CFG->remote_port = $3; if (($3<1) || ($3>65535)) cf_error("Invalid port number"); } + | bgp_nbr_opts PORT expr { BGP_CFG->remote_port = $3; if (($3<1) || ($3>65535)) cf_error("Invalid port number"); } | bgp_nbr_opts AS expr { BGP_CFG->remote_as = $3; } ; bgp_proto: bgp_proto_start proto_name '{' | bgp_proto proto_item ';' - | bgp_proto LOCAL AS expr ';' { BGP_CFG->local_as = $4; } - | bgp_proto LOCAL ipa AS expr ';' { BGP_CFG->source_addr = $3; BGP_CFG->local_as = $5; } + | bgp_proto bgp_proto_channel ';' + | bgp_proto LOCAL bgp_loc_opts ';' + | bgp_proto LOCAL ipa ipa_scope bgp_loc_opts ';' { + BGP_CFG->local_ip = $3; + if ($4) BGP_CFG->iface = $4; + } | bgp_proto NEIGHBOR bgp_nbr_opts ';' | bgp_proto NEIGHBOR ipa ipa_scope bgp_nbr_opts ';' { if (ipa_nonzero(BGP_CFG->remote_ip)) @@ -78,20 +92,16 @@ bgp_proto: } | bgp_proto INTERFACE TEXT ';' { BGP_CFG->iface = if_get_by_name($3); } | bgp_proto RR CLUSTER ID idval ';' { BGP_CFG->rr_cluster_id = $5; } - | bgp_proto RR CLIENT ';' { BGP_CFG->rr_client = 1; } - | bgp_proto RS CLIENT ';' { BGP_CFG->rs_client = 1; } + | bgp_proto RR CLIENT bool ';' { BGP_CFG->rr_client = $4; } + | bgp_proto RS CLIENT bool ';' { BGP_CFG->rs_client = $4; } + | bgp_proto CONFEDERATION expr ';' { BGP_CFG->confederation = $3; } + | bgp_proto CONFEDERATION MEMBER bool ';' { BGP_CFG->confederation_member = $4; } | bgp_proto HOLD TIME expr ';' { BGP_CFG->hold_time = $4; } | bgp_proto STARTUP HOLD TIME expr ';' { BGP_CFG->initial_hold_time = $5; } | bgp_proto DIRECT ';' { BGP_CFG->multihop = 0; } | bgp_proto MULTIHOP ';' { BGP_CFG->multihop = 64; } | bgp_proto MULTIHOP expr ';' { BGP_CFG->multihop = $3; if (($3<1) || ($3>255)) cf_error("Multihop must be in range 1-255"); } - | bgp_proto NEXT HOP SELF ';' { BGP_CFG->next_hop_self = 1; BGP_CFG->next_hop_keep = 0; } - | bgp_proto NEXT HOP KEEP ';' { BGP_CFG->next_hop_keep = 1; BGP_CFG->next_hop_self = 0; } - | bgp_proto MISSING LLADDR SELF ';' { BGP_CFG->missing_lladdr = MLL_SELF; } - | bgp_proto MISSING LLADDR DROP ';' { BGP_CFG->missing_lladdr = MLL_DROP; } - | bgp_proto MISSING LLADDR IGNORE ';' { BGP_CFG->missing_lladdr = MLL_IGNORE; } - | bgp_proto GATEWAY DIRECT ';' { BGP_CFG->gw_mode = GW_DIRECT; } - | bgp_proto GATEWAY RECURSIVE ';' { BGP_CFG->gw_mode = GW_RECURSIVE; } + | bgp_proto STRICT BIND bool ';' { BGP_CFG->strict_bind = $4; } | bgp_proto PATH METRIC bool ';' { BGP_CFG->compare_path_lengths = $4; } | bgp_proto MED METRIC bool ';' { BGP_CFG->med_metric = $4; } | bgp_proto IGP METRIC bool ';' { BGP_CFG->igp_metric = $4; } @@ -99,7 +109,7 @@ bgp_proto: | bgp_proto DETERMINISTIC MED bool ';' { BGP_CFG->deterministic_med = $4; } | bgp_proto DEFAULT BGP_MED expr ';' { BGP_CFG->default_med = $4; } | bgp_proto DEFAULT BGP_LOCAL_PREF expr ';' { BGP_CFG->default_local_pref = $4; } - | bgp_proto SOURCE ADDRESS ipa ';' { BGP_CFG->source_addr = $4; } + | bgp_proto SOURCE ADDRESS ipa ';' { BGP_CFG->local_ip = $4; } | bgp_proto START DELAY TIME expr ';' { BGP_CFG->connect_delay_time = $5; log(L_WARN "%s: Start delay time option is deprecated, use connect delay time", this_proto->name); } | bgp_proto CONNECT DELAY TIME expr ';' { BGP_CFG->connect_delay_time = $5; } | bgp_proto CONNECT RETRY TIME expr ';' { BGP_CFG->connect_retry_time = $5; } @@ -111,33 +121,101 @@ bgp_proto: | bgp_proto ENABLE AS4 bool ';' { BGP_CFG->enable_as4 = $4; } | bgp_proto ENABLE EXTENDED MESSAGES bool ';' { BGP_CFG->enable_extended_messages = $5; } | bgp_proto CAPABILITIES bool ';' { BGP_CFG->capabilities = $3; } - | bgp_proto ADVERTISE IPV4 bool ';' { BGP_CFG->advertise_ipv4 = $4; } | bgp_proto PASSWORD text ';' { BGP_CFG->password = $3; } | bgp_proto SETKEY bool ';' { BGP_CFG->setkey = $3; } - | bgp_proto ROUTE LIMIT expr ';' { - this_proto->in_limit = cfg_allocz(sizeof(struct proto_limit)); - this_proto->in_limit->limit = $4; - this_proto->in_limit->action = PLA_RESTART; - log(L_WARN "%s: Route limit option is deprecated, use import limit", this_proto->name); - } | bgp_proto PASSIVE bool ';' { BGP_CFG->passive = $3; } | bgp_proto INTERPRET COMMUNITIES bool ';' { BGP_CFG->interpret_communities = $4; } - | bgp_proto SECONDARY bool ';' { BGP_CFG->secondary = $3; } - | bgp_proto ADD PATHS RX ';' { BGP_CFG->add_path = ADD_PATH_RX; } - | bgp_proto ADD PATHS TX ';' { BGP_CFG->add_path = ADD_PATH_TX; } - | bgp_proto ADD PATHS bool ';' { BGP_CFG->add_path = $4 ? ADD_PATH_FULL : 0; } - | bgp_proto ALLOW BGP_LOCAL_PREF bool ';' { BGP_CFG->allow_local_pref = $4; } | bgp_proto ALLOW LOCAL AS ';' { BGP_CFG->allow_local_as = -1; } | bgp_proto ALLOW LOCAL AS expr ';' { BGP_CFG->allow_local_as = $5; } + | bgp_proto ALLOW BGP_LOCAL_PREF bool ';' { BGP_CFG->allow_local_pref = $4; } | bgp_proto GRACEFUL RESTART bool ';' { BGP_CFG->gr_mode = $4; } | bgp_proto GRACEFUL RESTART AWARE ';' { BGP_CFG->gr_mode = BGP_GR_AWARE; } | bgp_proto GRACEFUL RESTART TIME expr ';' { BGP_CFG->gr_time = $5; } - | bgp_proto IGP TABLE rtable ';' { BGP_CFG->igp_table = $4; } | bgp_proto TTL SECURITY bool ';' { BGP_CFG->ttl_security = $4; } | bgp_proto CHECK LINK bool ';' { BGP_CFG->check_link = $4; } | bgp_proto BFD bool ';' { BGP_CFG->bfd = $3; cf_check_bfd($3); } ; +bgp_afi: + IPV4 { $$ = BGP_AF_IPV4; } + | IPV6 { $$ = BGP_AF_IPV6; } + | IPV4 MULTICAST { $$ = BGP_AF_IPV4_MC; } + | IPV6 MULTICAST { $$ = BGP_AF_IPV6_MC; } + | IPV4 MPLS { $$ = BGP_AF_IPV4_MPLS; } + | IPV6 MPLS { $$ = BGP_AF_IPV6_MPLS; } + | VPN4 MPLS { $$ = BGP_AF_VPN4_MPLS; } + | VPN6 MPLS { $$ = BGP_AF_VPN6_MPLS; } + | VPN4 MULTICAST { $$ = BGP_AF_VPN4_MC; } + | VPN6 MULTICAST { $$ = BGP_AF_VPN6_MC; } + | FLOW4 { $$ = BGP_AF_FLOW4; } + | FLOW6 { $$ = BGP_AF_FLOW6; } + ; + +bgp_channel_start: bgp_afi +{ + const struct bgp_af_desc *desc = bgp_get_af_desc($1); + + if (!desc) + cf_error("Unknown AFI/SAFI"); + + this_channel = channel_config_new(&channel_bgp, desc->net, this_proto); + BGP_CC->c.name = desc->name; + BGP_CC->c.ra_mode = RA_UNDEF; + BGP_CC->afi = $1; + BGP_CC->desc = desc; + BGP_CC->gr_able = 0xff; /* undefined */ +}; + +bgp_channel_item: + channel_item + | NEXT HOP ADDRESS ipa { BGP_CC->next_hop_addr = $4; } + | NEXT HOP SELF { BGP_CC->next_hop_self = 1; BGP_CC->next_hop_keep = 0; } + | NEXT HOP KEEP { BGP_CC->next_hop_keep = 1; BGP_CC->next_hop_self = 0; } + | MISSING LLADDR SELF { BGP_CC->missing_lladdr = MLL_SELF; } + | MISSING LLADDR DROP { BGP_CC->missing_lladdr = MLL_DROP; } + | MISSING LLADDR IGNORE { BGP_CC->missing_lladdr = MLL_IGNORE; } + | GATEWAY DIRECT { BGP_CC->gw_mode = GW_DIRECT; } + | GATEWAY RECURSIVE { BGP_CC->gw_mode = GW_RECURSIVE; } + | SECONDARY bool { BGP_CC->secondary = $2; } + | GRACEFUL RESTART bool { BGP_CC->gr_able = $3; } + | EXTENDED NEXT HOP bool { BGP_CC->ext_next_hop = $4; } + | ADD PATHS RX { BGP_CC->add_path = BGP_ADD_PATH_RX; } + | ADD PATHS TX { BGP_CC->add_path = BGP_ADD_PATH_TX; } + | ADD PATHS bool { BGP_CC->add_path = $3 ? BGP_ADD_PATH_FULL : 0; } + | IGP TABLE rtable { + if (BGP_CC->desc->no_igp) + cf_error("IGP table not allowed here"); + + if ($3->addr_type == NET_IP4) + BGP_CC->igp_table_ip4 = $3; + else if ($3->addr_type == NET_IP6) + BGP_CC->igp_table_ip6 = $3; + else + cf_error("Mismatched IGP table type"); + } + ; + +bgp_channel_opts: + /* empty */ + | bgp_channel_opts bgp_channel_item ';' + ; + +bgp_channel_opt_list: + /* empty */ + | '{' bgp_channel_opts '}' + ; + +bgp_channel_end: +{ + if (!this_channel->table) + cf_error("Routing table not specified"); + + this_channel = NULL; +}; + +bgp_proto_channel: bgp_channel_start bgp_channel_opt_list bgp_channel_end; + + CF_ADDTO(dynamic_attr, BGP_ORIGIN { $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_ENUM_BGP_ORIGIN, EA_CODE(EAP_BGP, BA_ORIGIN)); }) CF_ADDTO(dynamic_attr, BGP_PATH diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c index af3b15b5..95a974eb 100644 --- a/proto/bgp/packets.c +++ b/proto/bgp/packets.c @@ -2,12 +2,16 @@ * BIRD -- BGP Packet Processing * * (c) 2000 Martin Mares <mj@ucw.cz> + * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org> + * (c) 2008--2016 CZ.NIC z.s.p.o. * * Can be freely distributed and used under the terms of the GNU GPL. */ #undef LOCAL_DEBUG +#include <stdlib.h> + #include "nest/bird.h" #include "nest/iface.h" #include "nest/protocol.h" @@ -16,6 +20,7 @@ #include "nest/mrtdump.h" #include "conf/conf.h" #include "lib/unaligned.h" +#include "lib/flowspec.h" #include "lib/socket.h" #include "nest/cli.h" @@ -27,6 +32,13 @@ #define BGP_RR_BEGIN 1 #define BGP_RR_END 2 +#define BGP_NLRI_MAX (4 + 1 + 32) + +#define BGP_MPLS_BOS 1 /* Bottom-of-stack bit */ +#define BGP_MPLS_MAX 10 /* Max number of labels that 24*n <= 255 */ +#define BGP_MPLS_NULL 3 /* Implicit NULL label */ +#define BGP_MPLS_MAGIC 0x800000 /* Magic withdraw label value, RFC 3107 3 */ + static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS; static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS; @@ -38,6 +50,46 @@ static byte fsm_err_subcode[BS_MAX] = { [BS_ESTABLISHED] = 3 }; + +static struct bgp_channel * +bgp_get_channel(struct bgp_proto *p, u32 afi) +{ + uint i; + + for (i = 0; i < p->channel_count; i++) + if (p->afi_map[i] == afi) + return p->channel_map[i]; + + return NULL; +} + +static inline void +put_af3(byte *buf, u32 id) +{ + put_u16(buf, id >> 16); + buf[2] = id & 0xff; +} + +static inline void +put_af4(byte *buf, u32 id) +{ + put_u16(buf, id >> 16); + buf[2] = 0; + buf[3] = id & 0xff; +} + +static inline u32 +get_af3(byte *buf) +{ + return (get_u16(buf) << 16) | buf[2]; +} + +static inline u32 +get_af4(byte *buf) +{ + return (get_u16(buf) << 16) | buf[3]; +} + /* * MRT Dump format is not semantically specified. * We will use these values in appropriate fields: @@ -58,31 +110,41 @@ static byte * mrt_put_bgp4_hdr(byte *buf, struct bgp_conn *conn, int as4) { struct bgp_proto *p = conn->bgp; + uint v4 = ipa_is_ip4(p->cf->remote_ip); if (as4) - { - put_u32(buf+0, p->remote_as); - put_u32(buf+4, p->local_as); - buf+=8; - } + { + put_u32(buf+0, p->remote_as); + put_u32(buf+4, p->public_as); + buf+=8; + } else - { - put_u16(buf+0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS); - put_u16(buf+2, (p->local_as <= 0xFFFF) ? p->local_as : AS_TRANS); - buf+=4; - } + { + put_u16(buf+0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS); + put_u16(buf+2, (p->public_as <= 0xFFFF) ? p->public_as : AS_TRANS); + buf+=4; + } put_u16(buf+0, (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0); - put_u16(buf+2, BGP_AF); + put_u16(buf+2, v4 ? BGP_AFI_IPV4 : BGP_AFI_IPV6); buf+=4; - buf = put_ipa(buf, conn->sk ? conn->sk->daddr : IPA_NONE); - buf = put_ipa(buf, conn->sk ? conn->sk->saddr : IPA_NONE); + + if (v4) + { + buf = put_ip4(buf, conn->sk ? ipa_to_ip4(conn->sk->daddr) : IP4_NONE); + buf = put_ip4(buf, conn->sk ? ipa_to_ip4(conn->sk->saddr) : IP4_NONE); + } + else + { + buf = put_ip6(buf, conn->sk ? ipa_to_ip6(conn->sk->daddr) : IP6_NONE); + buf = put_ip6(buf, conn->sk ? ipa_to_ip6(conn->sk->saddr) : IP6_NONE); + } return buf; } static void -mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len) +mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, uint len) { byte *buf = alloca(128+len); /* 128 is enough for MRT headers */ byte *bp = buf + MRTDUMP_HDR_LENGTH; @@ -96,14 +158,14 @@ mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len) } static inline u16 -convert_state(unsigned state) +convert_state(uint state) { /* Convert state from our BS_* values to values used in MRTDump */ return (state == BS_CLOSE) ? 1 : state + 1; } void -mrt_dump_bgp_state_change(struct bgp_conn *conn, unsigned old, unsigned new) +mrt_dump_bgp_state_change(struct bgp_conn *conn, uint old, uint new) { byte buf[128]; byte *bp = buf + MRTDUMP_HDR_LENGTH; @@ -127,1303 +189,2429 @@ bgp_create_notification(struct bgp_conn *conn, byte *buf) return buf + 2 + conn->notify_size; } -#ifdef IPV6 -static byte * -bgp_put_cap_ipv6(struct bgp_proto *p UNUSED, byte *buf) -{ - *buf++ = 1; /* Capability 1: Multiprotocol extensions */ - *buf++ = 4; /* Capability data length */ - *buf++ = 0; /* We support AF IPv6 */ - *buf++ = BGP_AF_IPV6; - *buf++ = 0; /* RFU */ - *buf++ = 1; /* and SAFI 1 */ - return buf; -} -#else +/* Capability negotiation as per RFC 5492 */ -static byte * -bgp_put_cap_ipv4(struct bgp_proto *p UNUSED, byte *buf) -{ - *buf++ = 1; /* Capability 1: Multiprotocol extensions */ - *buf++ = 4; /* Capability data length */ - *buf++ = 0; /* We support AF IPv4 */ - *buf++ = BGP_AF_IPV4; - *buf++ = 0; /* RFU */ - *buf++ = 1; /* and SAFI 1 */ - return buf; +const struct bgp_af_caps * +bgp_find_af_caps(struct bgp_caps *caps, u32 afi) +{ + struct bgp_af_caps *ac; + + WALK_AF_CAPS(caps, ac) + if (ac->afi == afi) + return ac; + + return NULL; } -#endif -static byte * -bgp_put_cap_rr(struct bgp_proto *p UNUSED, byte *buf) +static struct bgp_af_caps * +bgp_get_af_caps(struct bgp_caps *caps, u32 afi) { - *buf++ = 2; /* Capability 2: Support for route refresh */ - *buf++ = 0; /* Capability data length */ - return buf; + struct bgp_af_caps *ac; + + WALK_AF_CAPS(caps, ac) + if (ac->afi == afi) + return ac; + + ac = &caps->af_data[caps->af_count++]; + memset(ac, 0, sizeof(struct bgp_af_caps)); + ac->afi = afi; + + return ac; } -static byte * -bgp_put_cap_ext_msg(struct bgp_proto *p UNUSED, byte *buf) +static int +bgp_af_caps_cmp(const void *X, const void *Y) { - *buf++ = 6; /* Capability 6: Support for extended messages */ - *buf++ = 0; /* Capability data length */ - return buf; + const struct bgp_af_caps *x = X, *y = Y; + return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 1 : 0; } + static byte * -bgp_put_cap_gr1(struct bgp_proto *p, byte *buf) +bgp_write_capabilities(struct bgp_conn *conn, byte *buf) { - *buf++ = 64; /* Capability 64: Support for graceful restart */ - *buf++ = 6; /* Capability data length */ + struct bgp_proto *p = conn->bgp; + struct bgp_channel *c; + struct bgp_caps *caps; + struct bgp_af_caps *ac; + uint any_ext_next_hop = 0; + uint any_add_path = 0; + byte *data; - put_u16(buf, p->cf->gr_time); - if (p->p.gr_recovery) - buf[0] |= BGP_GRF_RESTART; - buf += 2; + /* Prepare bgp_caps structure */ + + int n = list_length(&p->p.channels); + caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + n * sizeof(struct bgp_af_caps)); + conn->local_caps = caps; + + caps->as4_support = p->cf->enable_as4; + caps->ext_messages = p->cf->enable_extended_messages; + caps->route_refresh = p->cf->enable_refresh; + caps->enhanced_refresh = p->cf->enable_refresh; + + if (caps->as4_support) + caps->as4_number = p->public_as; + + if (p->cf->gr_mode) + { + caps->gr_aware = 1; + caps->gr_time = p->cf->gr_time; + caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0; + } + + /* Allocate and fill per-AF fields */ + WALK_LIST(c, p->p.channels) + { + ac = &caps->af_data[caps->af_count++]; + ac->afi = c->afi; + ac->ready = 1; + + ac->ext_next_hop = bgp_channel_is_ipv4(c) && c->cf->ext_next_hop; + any_ext_next_hop |= ac->ext_next_hop; + + ac->add_path = c->cf->add_path; + any_add_path |= ac->add_path; + + if (c->cf->gr_able) + { + ac->gr_able = 1; + + if (p->p.gr_recovery) + ac->gr_af_flags |= BGP_GRF_FORWARDING; + } + } - *buf++ = 0; /* Appropriate AF */ - *buf++ = BGP_AF; - *buf++ = 1; /* and SAFI 1 */ - *buf++ = p->p.gr_recovery ? BGP_GRF_FORWARDING : 0; + /* Sort capability fields by AFI/SAFI */ + qsort(caps->af_data, caps->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp); + + + /* Create capability list in buffer */ + + /* + * Note that max length is ~ 20+14*af_count. With max 12 channels that is + * 188. Option limit is 253 and buffer size is 4096, so we cannot overflow + * unless we add new capabilities or more AFs. + */ + + WALK_AF_CAPS(caps, ac) + if (ac->ready) + { + *buf++ = 1; /* Capability 1: Multiprotocol extensions */ + *buf++ = 4; /* Capability data length */ + put_af4(buf, ac->afi); + buf += 4; + } + + if (caps->route_refresh) + { + *buf++ = 2; /* Capability 2: Support for route refresh */ + *buf++ = 0; /* Capability data length */ + } + + if (any_ext_next_hop) + { + *buf++ = 5; /* Capability 5: Support for extended next hop */ + *buf++ = 0; /* Capability data length, will be fixed later */ + data = buf; + + WALK_AF_CAPS(caps, ac) + if (ac->ext_next_hop) + { + put_af4(buf, ac->afi); + put_u16(buf+4, BGP_AFI_IPV6); + buf += 6; + } + + data[-1] = buf - data; + } + + if (caps->ext_messages) + { + *buf++ = 6; /* Capability 6: Support for extended messages */ + *buf++ = 0; /* Capability data length */ + } + + if (caps->gr_aware) + { + *buf++ = 64; /* Capability 64: Support for graceful restart */ + *buf++ = 0; /* Capability data length, will be fixed later */ + data = buf; + + put_u16(buf, caps->gr_time); + buf[0] |= caps->gr_flags; + buf += 2; + + WALK_AF_CAPS(caps, ac) + if (ac->gr_able) + { + put_af3(buf, ac->afi); + buf[3] = ac->gr_af_flags; + buf += 4; + } + + data[-1] = buf - data; + } + + if (caps->as4_support) + { + *buf++ = 65; /* Capability 65: Support for 4-octet AS number */ + *buf++ = 4; /* Capability data length */ + put_u32(buf, p->public_as); + buf += 4; + } + + if (any_add_path) + { + *buf++ = 69; /* Capability 69: Support for ADD-PATH */ + *buf++ = 0; /* Capability data length, will be fixed later */ + data = buf; + + WALK_AF_CAPS(caps, ac) + if (ac->add_path) + { + put_af3(buf, ac->afi); + buf[3] = ac->add_path; + buf += 4; + } + + data[-1] = buf - data; + } + + if (caps->enhanced_refresh) + { + *buf++ = 70; /* Capability 70: Support for enhanced route refresh */ + *buf++ = 0; /* Capability data length */ + } return buf; } -static byte * -bgp_put_cap_gr2(struct bgp_proto *p UNUSED, byte *buf) +static void +bgp_read_capabilities(struct bgp_conn *conn, struct bgp_caps *caps, byte *pos, int len) { - *buf++ = 64; /* Capability 64: Support for graceful restart */ - *buf++ = 2; /* Capability data length */ - put_u16(buf, 0); - return buf + 2; -} + struct bgp_proto *p = conn->bgp; + struct bgp_af_caps *ac; + int i, cl; + u32 af; -static byte * -bgp_put_cap_as4(struct bgp_proto *p, byte *buf) -{ - *buf++ = 65; /* Capability 65: Support for 4-octet AS number */ - *buf++ = 4; /* Capability data length */ - put_u32(buf, p->local_as); - return buf + 4; -} + while (len > 0) + { + if (len < 2 || len < (2 + pos[1])) + goto err; -static byte * -bgp_put_cap_add_path(struct bgp_proto *p, byte *buf) -{ - *buf++ = 69; /* Capability 69: Support for ADD-PATH */ - *buf++ = 4; /* Capability data length */ + /* Capability length */ + cl = pos[1]; + + /* Capability type */ + switch (pos[0]) + { + case 1: /* Multiprotocol capability, RFC 4760 */ + if (cl != 4) + goto err; - *buf++ = 0; /* Appropriate AF */ - *buf++ = BGP_AF; - *buf++ = 1; /* SAFI 1 */ + af = get_af4(pos+2); + ac = bgp_get_af_caps(caps, af); + ac->ready = 1; + break; - *buf++ = p->cf->add_path; + case 2: /* Route refresh capability, RFC 2918 */ + if (cl != 0) + goto err; - return buf; + caps->route_refresh = 1; + break; + + case 5: /* Extended next hop encoding capability, RFC 5549 */ + if (cl % 6) + goto err; + + for (i = 0; i < cl; i += 6) + { + /* Specified only for IPv4 prefixes with IPv6 next hops */ + if ((get_u16(pos+2+i+0) != BGP_AFI_IPV4) || + (get_u16(pos+2+i+4) != BGP_AFI_IPV6)) + continue; + + af = get_af4(pos+2+i); + ac = bgp_get_af_caps(caps, af); + ac->ext_next_hop = 1; + } + break; + + case 6: /* Extended message length capability, RFC draft */ + if (cl != 0) + goto err; + + caps->ext_messages = 1; + break; + + case 64: /* Graceful restart capability, RFC 4724 */ + if (cl % 4 != 2) + goto err; + + /* Only the last instance is valid */ + WALK_AF_CAPS(caps, ac) + { + ac->gr_able = 0; + ac->gr_af_flags = 0; + } + + caps->gr_aware = 1; + caps->gr_flags = pos[2] & 0xf0; + caps->gr_time = get_u16(pos + 2) & 0x0fff; + + for (i = 2; i < cl; i += 4) + { + af = get_af3(pos+2+i); + ac = bgp_get_af_caps(caps, af); + ac->gr_able = 1; + ac->gr_af_flags = pos[2+i+3]; + } + break; + + case 65: /* AS4 capability, RFC 6793 */ + if (cl != 4) + goto err; + + caps->as4_support = 1; + caps->as4_number = get_u32(pos + 2); + break; + + case 69: /* ADD-PATH capability, RFC 7911 */ + if (cl % 4) + goto err; + + for (i = 0; i < cl; i += 4) + { + byte val = pos[2+i+3]; + if (!val || (val > BGP_ADD_PATH_FULL)) + { + log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring", + p->p.name, val); + break; + } + } + + for (i = 0; i < cl; i += 4) + { + af = get_af3(pos+2+i); + ac = bgp_get_af_caps(caps, af); + ac->add_path = pos[2+i+3]; + } + break; + + case 70: /* Enhanced route refresh capability, RFC 7313 */ + if (cl != 0) + goto err; + + caps->enhanced_refresh = 1; + break; + + /* We can safely ignore all other capabilities */ + } + + ADVANCE(pos, len, 2 + cl); + } + return; + +err: + bgp_error(conn, 2, 0, NULL, 0); + return; } -static byte * -bgp_put_cap_err(struct bgp_proto *p UNUSED, byte *buf) +static int +bgp_read_options(struct bgp_conn *conn, byte *pos, int len) { - *buf++ = 70; /* Capability 70: Support for enhanced route refresh */ - *buf++ = 0; /* Capability data length */ - return buf; -} + struct bgp_proto *p = conn->bgp; + struct bgp_caps *caps; + int ol; + + /* Max number of announced AFIs is limited by max option length (255) */ + caps = alloca(sizeof(struct bgp_caps) + 64 * sizeof(struct bgp_af_caps)); + memset(caps, 0, sizeof(struct bgp_caps)); + + while (len > 0) + { + if ((len < 2) || (len < (2 + pos[1]))) + { bgp_error(conn, 2, 0, NULL, 0); return -1; } + ol = pos[1]; + if (pos[0] == 2) + { + /* BGP capabilities, RFC 5492 */ + if (p->cf->capabilities) + bgp_read_capabilities(conn, caps, pos + 2, ol); + } + else + { + /* Unknown option */ + bgp_error(conn, 2, 4, pos, ol); /* FIXME: ol or ol+2 ? */ + return -1; + } + + ADVANCE(pos, len, 2 + ol); + } + + uint n = sizeof(struct bgp_caps) + caps->af_count * sizeof(struct bgp_af_caps); + conn->remote_caps = mb_allocz(p->p.pool, n); + memcpy(conn->remote_caps, caps, n); + + return 0; +} static byte * bgp_create_open(struct bgp_conn *conn, byte *buf) { struct bgp_proto *p = conn->bgp; - byte *cap; - int cap_len; BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)", - BGP_VERSION, p->local_as, p->cf->hold_time, p->local_id); + BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id); + buf[0] = BGP_VERSION; - put_u16(buf+1, (p->local_as < 0xFFFF) ? p->local_as : AS_TRANS); + put_u16(buf+1, (p->public_as < 0xFFFF) ? p->public_as : AS_TRANS); put_u16(buf+3, p->cf->hold_time); put_u32(buf+5, p->local_id); - if (conn->start_state == BSS_CONNECT_NOCAP) - { - BGP_TRACE(D_PACKETS, "Skipping capabilities"); - buf[9] = 0; - return buf + 10; - } + if (p->cf->capabilities) + { + /* Prepare local_caps and write capabilities to buffer */ + byte *end = bgp_write_capabilities(conn, buf+12); + uint len = end - (buf+12); + + buf[9] = len + 2; /* Optional parameters length */ + buf[10] = 2; /* Option 2: Capability list */ + buf[11] = len; /* Option data length */ + + return end; + } + else + { + /* Prepare empty local_caps */ + conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps)); + + buf[9] = 0; /* No optional parameters */ + return buf + 10; + } + + return buf; +} + +static void +bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) +{ + struct bgp_proto *p = conn->bgp; + struct bgp_conn *other; + u32 asn, hold, id; - /* Skipped 3 B for length field and Capabilities parameter header */ - cap = buf + 12; + /* Check state */ + if (conn->state != BS_OPENSENT) + { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } -#ifndef IPV6 - if (p->cf->advertise_ipv4) - cap = bgp_put_cap_ipv4(p, cap); -#endif + /* Check message contents */ + if (len < 29 || len != 29 + (uint) pkt[28]) + { bgp_error(conn, 1, 2, pkt+16, 2); return; } + + if (pkt[19] != BGP_VERSION) + { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; } -#ifdef IPV6 - cap = bgp_put_cap_ipv6(p, cap); -#endif + asn = get_u16(pkt+20); + hold = get_u16(pkt+22); + id = get_u32(pkt+24); + BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id); - if (p->cf->enable_refresh) - cap = bgp_put_cap_rr(p, cap); + if (bgp_read_options(conn, pkt+29, pkt[28]) < 0) + return; - if (p->cf->gr_mode == BGP_GR_ABLE) - cap = bgp_put_cap_gr1(p, cap); - else if (p->cf->gr_mode == BGP_GR_AWARE) - cap = bgp_put_cap_gr2(p, cap); + if (hold > 0 && hold < 3) + { bgp_error(conn, 2, 6, pkt+22, 2); return; } - if (p->cf->enable_as4) - cap = bgp_put_cap_as4(p, cap); + /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */ + if (!id || (p->is_internal && id == p->local_id)) + { bgp_error(conn, 2, 3, pkt+24, -4); return; } - if (p->cf->add_path) - cap = bgp_put_cap_add_path(p, cap); + struct bgp_caps *caps = conn->remote_caps; - if (p->cf->enable_refresh) - cap = bgp_put_cap_err(p, cap); + if (caps->as4_support) + { + u32 as4 = caps->as4_number; - if (p->cf->enable_extended_messages) - cap = bgp_put_cap_ext_msg(p, cap); + if ((as4 != asn) && (asn != AS_TRANS)) + log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name); - cap_len = cap - buf - 12; - if (cap_len > 0) - { - buf[9] = cap_len + 2; /* Optional params len */ - buf[10] = 2; /* Option: Capability list */ - buf[11] = cap_len; /* Option length */ - return cap; - } + if (as4 != p->remote_as) + { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; } + } else + { + if (asn != p->remote_as) + { bgp_error(conn, 2, 2, pkt+20, 2); return; } + } + + /* Check the other connection */ + other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; + switch (other->state) + { + case BS_CONNECT: + case BS_ACTIVE: + /* Stop outgoing connection attempts */ + bgp_conn_enter_idle_state(other); + break; + + case BS_IDLE: + case BS_OPENSENT: + case BS_CLOSE: + break; + + case BS_OPENCONFIRM: + /* + * Description of collision detection rules in RFC 4271 is confusing and + * contradictory, but it is essentially: + * + * 1. Router with higher ID is dominant + * 2. If both have the same ID, router with higher ASN is dominant [RFC6286] + * 3. When both connections are in OpenConfirm state, one initiated by + * the dominant router is kept. + * + * The first line in the expression below evaluates whether the neighbor + * is dominant, the second line whether the new connection was initiated + * by the neighbor. If both are true (or both are false), we keep the new + * connection, otherwise we keep the old one. + */ + if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as))) + == (conn == &p->incoming_conn)) { - buf[9] = 0; /* No optional parameters */ - return buf + 10; + /* Should close the other connection */ + BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection"); + bgp_error(other, 6, 7, NULL, 0); + break; } + /* Fall thru */ + case BS_ESTABLISHED: + /* Should close this connection */ + BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection"); + bgp_error(conn, 6, 7, NULL, 0); + return; + + default: + bug("bgp_rx_open: Unknown state"); + } + + /* Update our local variables */ + conn->hold_time = MIN(hold, p->cf->hold_time); + conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3; + conn->as4_session = conn->local_caps->as4_support && caps->as4_support; + conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages; + p->remote_id = id; + + DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", + conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session); + + bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE); + bgp_start_timer(conn->hold_timer, conn->hold_time); + bgp_conn_enter_openconfirm_state(conn); } -static uint -bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, uint remains) + +/* + * Next hop handling + */ + +#define REPORT(msg, args...) \ + ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); }) + +#define DISCARD(msg, args...) \ + ({ REPORT(msg, ## args); return; }) + +#define WITHDRAW(msg, args...) \ + ({ REPORT(msg, ## args); s->err_withdraw = 1; return; }) + +#define BAD_AFI "Unexpected AF <%u/%u> in UPDATE" +#define BAD_NEXT_HOP "Invalid NEXT_HOP attribute" +#define NO_NEXT_HOP "Missing NEXT_HOP attribute" +#define NO_LABEL_STACK "Missing MPLS stack" + + +static void +bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll) { - byte *start = w; - ip_addr a; - int bytes; + struct bgp_proto *p = s->proto; + struct bgp_channel *c = s->channel; - while (!EMPTY_LIST(buck->prefixes) && (remains >= (5+sizeof(ip_addr)))) - { - struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes)); - DBG("\tDequeued route %I/%d\n", px->n.prefix, px->n.pxlen); + if (c->cf->gw_mode == GW_DIRECT) + { + neighbor *nbr = NULL; - if (p->add_path_tx) - { - put_u32(w, px->path_id); - w += 4; - remains -= 4; - } + /* GW_DIRECT -> single_hop -> p->neigh != NULL */ + if (ipa_nonzero(gw)) + nbr = neigh_find2(&p->p, &gw, NULL, 0); + else if (ipa_nonzero(ll)) + nbr = neigh_find2(&p->p, &ll, p->neigh->iface, 0); - *w++ = px->n.pxlen; - bytes = (px->n.pxlen + 7) / 8; - a = px->n.prefix; - ipa_hton(a); - memcpy(w, &a, bytes); - w += bytes; - remains -= bytes + 1; - rem_node(&px->bucket_node); - bgp_free_prefix(p, px); - // fib_delete(&p->prefix_fib, px); - } - return w - start; + if (!nbr || (nbr->scope == SCOPE_HOST)) + WITHDRAW(BAD_NEXT_HOP); + + a->dest = RTD_UNICAST; + a->nh.gw = nbr->addr; + a->nh.iface = nbr->iface; + } + else /* GW_RECURSIVE */ + { + if (ipa_zero(gw)) + WITHDRAW(BAD_NEXT_HOP); + + rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6; + s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table); + + if (!s->mpls) + rta_apply_hostentry(a, s->hostentry, NULL); + + /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */ + } } static void -bgp_flush_prefixes(struct bgp_proto *p, struct bgp_bucket *buck) +bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum) { - while (!EMPTY_LIST(buck->prefixes)) - { - struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes)); - log(L_ERR "%s: - route %I/%d skipped", p->p.name, px->n.prefix, px->n.pxlen); - rem_node(&px->bucket_node); - bgp_free_prefix(p, px); - // fib_delete(&p->prefix_fib, px); - } + if (lnum > MPLS_MAX_LABEL_STACK) + { + REPORT("Too many MPLS labels ($u)", lnum); + + a->dest = RTD_UNREACHABLE; + a->hostentry = NULL; + a->nh = (struct nexthop) { }; + return; + } + + /* Handle implicit NULL as empty MPLS stack */ + if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL)) + lnum = 0; + + if (s->channel->cf->gw_mode == GW_DIRECT) + { + a->nh.labels = lnum; + memcpy(a->nh.label, labels, 4*lnum); + } + else /* GW_RECURSIVE */ + { + mpls_label_stack ms; + + ms.len = lnum; + memcpy(ms.stack, labels, 4*lnum); + rta_apply_hostentry(a, s->hostentry, &ms); + } } -#ifndef IPV6 /* IPv4 version */ -static byte * -bgp_create_update(struct bgp_conn *conn, byte *buf) +static inline int +bgp_use_next_hop(struct bgp_export_state *s, eattr *a) { - struct bgp_proto *p = conn->bgp; - struct bgp_bucket *buck; - int remains = bgp_max_packet_length(p) - BGP_HEADER_LENGTH - 4; - byte *w; - int wd_size = 0; - int r_size = 0; - int a_size = 0; - - w = buf+2; - if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) - { - DBG("Withdrawn routes:\n"); - wd_size = bgp_encode_prefixes(p, w, buck, remains); - w += wd_size; - remains -= wd_size; - } - put_u16(buf, wd_size); + struct bgp_proto *p = s->proto; + ip_addr *nh = (void *) a->u.ptr->data; - if (!wd_size) - { - while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next) - { - if (EMPTY_LIST(buck->prefixes)) - { - DBG("Deleting empty bucket %p\n", buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - } - - DBG("Processing bucket %p\n", buck); - a_size = bgp_encode_attrs(p, w+2, buck->eattrs, remains - 1024); - - if (a_size < 0) - { - log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name); - bgp_flush_prefixes(p, buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - } - - put_u16(w, a_size); - w += a_size + 2; - r_size = bgp_encode_prefixes(p, w, buck, remains - a_size); - w += r_size; - break; - } - } - if (!a_size) /* Attributes not already encoded */ + if (s->channel->cf->next_hop_self) + return 0; + + if (s->channel->cf->next_hop_keep) + return 1; + + /* Keep it when explicitly set in export filter */ + if (a->type & EAF_FRESH) + return 1; + + /* Keep it when exported to internal peers */ + if (p->is_interior && ipa_nonzero(*nh)) + return 1; + + /* Keep it when forwarded between single-hop BGPs on the same iface */ + struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL; + return p->neigh && (p->neigh->iface == ifa); +} + +static inline int +bgp_use_gateway(struct bgp_export_state *s) +{ + struct bgp_proto *p = s->proto; + rta *ra = s->route->attrs; + + if (s->channel->cf->next_hop_self) + return 0; + + /* We need one valid global gateway */ + if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw)) + return 0; + + /* Use it when exported to internal peers */ + if (p->is_interior) + return 1; + + /* Use it when forwarded to single-hop BGP peer on on the same iface */ + return p->neigh && (p->neigh->iface == ra->nh.iface); +} + +static void +bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to) +{ + if (!a || !bgp_use_next_hop(s, a)) + { + if (bgp_use_gateway(s)) { - put_u16(w, 0); - w += 2; + rta *ra = s->route->attrs; + ip_addr nh[1] = { ra->nh.gw }; + bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16); + + if (s->mpls) + { + u32 implicit_null = BGP_MPLS_NULL; + u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null; + uint lnum = ra->nh.labels ? ra->nh.labels : 1; + bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4); + } } - if (wd_size || r_size) + else { - BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE"); - return w; + ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr }; + bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16); + + /* TODO: Use local MPLS assigned label */ + if (s->mpls) + bgp_unset_attr(to, s->pool, BA_MPLS_LABEL_STACK); } + } + + /* Check if next hop is valid */ + a = bgp_find_attr(*to, BA_NEXT_HOP); + if (!a) + WITHDRAW(NO_NEXT_HOP); + + ip_addr *nh = (void *) a->u.ptr->data; + ip_addr peer = s->proto->cf->remote_ip; + uint len = a->u.ptr->length; + + /* Forbid zero next hop */ + if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1]))) + WITHDRAW(BAD_NEXT_HOP); + + /* Forbid next hop equal to neighbor IP */ + if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1]))) + WITHDRAW(BAD_NEXT_HOP); + + /* Forbid next hop with non-matching AF */ + if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) && + !s->channel->ext_next_hop) + WITHDRAW(BAD_NEXT_HOP); + + /* Just check if MPLS stack */ + if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK)) + WITHDRAW(NO_LABEL_STACK); +} + +static uint +bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED) +{ + /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */ + ip_addr *nh = (void *) a->u.ptr->data; + uint len = a->u.ptr->length; + + ASSERT((len == 16) || (len == 32)); + + /* + * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This + * is specified in RFC 5549 for IPv4 and in RFC 4798 for IPv6. The difference + * is that IPv4 address is directly encoded with IPv4 NLRI, but as IPv4-mapped + * IPv6 address with IPv6 NLRI. + */ + + if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0])) + { + put_ip4(buf, ipa_to_ip4(nh[0])); + return 4; + } + + put_ip6(buf, ipa_to_ip6(nh[0])); + + if (len == 32) + put_ip6(buf+16, ipa_to_ip6(nh[1])); + + return len; +} + +static void +bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a) +{ + struct bgp_channel *c = s->channel; + struct adata *ad = lp_alloc_adata(s->pool, 32); + ip_addr *nh = (void *) ad->data; + + if (len == 4) + { + nh[0] = ipa_from_ip4(get_ip4(data)); + nh[1] = IPA_NONE; + } + else if (len == 16) + { + nh[0] = ipa_from_ip6(get_ip6(data)); + nh[1] = IPA_NONE; + + if (ipa_is_link_local(nh[0])) + { nh[1] = nh[0]; nh[0] = IPA_NONE; } + } + else if (len == 32) + { + nh[0] = ipa_from_ip6(get_ip6(data)); + nh[1] = ipa_from_ip6(get_ip6(data+16)); + + if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1])) + nh[1] = IPA_NONE; + } else - return NULL; + bgp_parse_error(s, 9); + + if (ipa_zero(nh[1])) + ad->length = 16; + + if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop) + WITHDRAW(BAD_NEXT_HOP); + + // XXXX validate next hop + + bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad); + bgp_apply_next_hop(s, a, nh[0], nh[1]); } -static byte * -bgp_create_end_mark(struct bgp_conn *conn, byte *buf) +static uint +bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending END-OF-RIB"); + ip_addr *nh = (void *) a->u.ptr->data; + uint len = a->u.ptr->length; - put_u32(buf, 0); - return buf+4; + ASSERT((len == 16) || (len == 32)); + + /* + * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This + * is specified in RFC 5549 for VPNv4 and in RFC 4659 for VPNv6. The difference + * is that IPv4 address is directly encoded with VPNv4 NLRI, but as IPv4-mapped + * IPv6 address with VPNv6 NLRI. + */ + + if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0])) + { + put_u64(buf, 0); /* VPN RD is 0 */ + put_ip4(buf+8, ipa_to_ip4(nh[0])); + return 12; + } + + put_u64(buf, 0); /* VPN RD is 0 */ + put_ip6(buf+8, ipa_to_ip6(nh[0])); + + if (len == 16) + return 24; + + put_u64(buf+24, 0); /* VPN RD is 0 */ + put_ip6(buf+32, ipa_to_ip6(nh[1])); + + return 48; } -#else /* IPv6 version */ +static void +bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a) +{ + struct bgp_channel *c = s->channel; + struct adata *ad = lp_alloc_adata(s->pool, 32); + ip_addr *nh = (void *) ad->data; -static inline int -same_iface(struct bgp_proto *p, ip_addr *ip) + if (len == 12) + { + nh[0] = ipa_from_ip4(get_ip4(data+8)); + nh[1] = IPA_NONE; + } + else if (len == 24) + { + nh[0] = ipa_from_ip6(get_ip6(data+8)); + nh[1] = IPA_NONE; + + if (ipa_is_link_local(nh[0])) + { nh[1] = nh[0]; nh[0] = IPA_NONE; } + } + else if (len == 48) + { + nh[0] = ipa_from_ip6(get_ip6(data+8)); + nh[1] = ipa_from_ip6(get_ip6(data+32)); + + if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1])) + nh[1] = IPA_NONE; + } + else + bgp_parse_error(s, 9); + + if (ipa_zero(nh[1])) + ad->length = 16; + + /* XXXX which error */ + if ((get_u64(data) != 0) || ((len == 48) && (get_u64(data+24) != 0))) + bgp_parse_error(s, 9); + + if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop) + WITHDRAW(BAD_NEXT_HOP); + + // XXXX validate next hop + + bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad); + bgp_apply_next_hop(s, a, nh[0], nh[1]); +} + + + +static uint +bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte *buf UNUSED, uint size UNUSED) +{ + return 0; +} + +static void +bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED) { - neighbor *n = neigh_find(&p->p, ip, 0); - return n && p->neigh && n->iface == p->neigh->iface; + /* + * Although we expect no next hop and RFC 7606 7.11 states that attribute + * MP_REACH_NLRI with unexpected next hop length is considered malformed, + * FlowSpec RFC 5575 4 states that next hop shall be ignored on receipt. + */ + + return; } -static byte * -bgp_create_update(struct bgp_conn *conn, byte *buf) +static void +bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to) { - struct bgp_proto *p = conn->bgp; - struct bgp_bucket *buck; - int size, second, rem_stored; - int remains = bgp_max_packet_length(p) - BGP_HEADER_LENGTH - 4; - byte *w, *w_stored, *tmp, *tstart; - ip_addr *ipp, ip, ip_ll; - ea_list *ea; - eattr *nh; + /* NEXT_HOP shall not pass */ + if (a) + bgp_unset_attr(to, s->pool, BA_NEXT_HOP); +} - put_u16(buf, 0); - w = buf+4; - if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) +/* + * UPDATE + */ + +static void +bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0) +{ + if (path_id != s->last_id) + { + s->last_src = rt_get_source(&s->proto->p, path_id); + s->last_id = path_id; + + rta_free(s->cached_rta); + s->cached_rta = NULL; + } + + if (!a0) + { + /* Route withdraw */ + rte_update2(&s->channel->c, n, NULL, s->last_src); + return; + } + + /* Prepare cached route attributes */ + if (s->cached_rta == NULL) + { + a0->src = s->last_src; + + /* Workaround for rta_lookup() breaking eattrs */ + ea_list *ea = a0->eattrs; + s->cached_rta = rta_lookup(a0); + a0->eattrs = ea; + } + + rta *a = rta_clone(s->cached_rta); + rte *e = rte_get_temp(a); + + e->pflags = 0; + e->u.bgp.suppressed = 0; + rte_update2(&s->channel->c, n, e, s->last_src); +} + +static void +bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, adata *mpls, byte **pos, uint *size, byte *pxlen) +{ + u32 dummy = 0; + u32 *labels = mpls ? (u32 *) mpls->data : &dummy; + uint lnum = mpls ? (mpls->length / 4) : 1; + + for (uint i = 0; i < lnum; i++) + { + put_u24(*pos, labels[i] << 4); + ADVANCE(*pos, *size, 3); + } + + /* Add bottom-of-stack flag */ + (*pos)[-1] |= BGP_MPLS_BOS; + + *pxlen += 24 * lnum; +} + +static void +bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a) +{ + u32 labels[BGP_MPLS_MAX], label; + uint lnum = 0; + + do { + if (*pxlen < 24) + bgp_parse_error(s, 1); + + label = get_u24(*pos); + labels[lnum++] = label >> 4; + ADVANCE(*pos, *len, 3); + *pxlen -= 24; + + /* Withdraw: Magic label stack value 0x800000 according to RFC 3107, section 3, last paragraph */ + if (!a && !s->err_withdraw && (lnum == 1) && (label == BGP_MPLS_MAGIC)) + break; + } + while (!(label & BGP_MPLS_BOS)); + + if (!a) + return; + + /* Attach MPLS attribute unless we already have one */ + if (!s->mpls_labels) + { + s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX); + bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels); + } + + /* Overwrite data in the attribute */ + s->mpls_labels->length = 4*lnum; + memcpy(s->mpls_labels->data, labels, 4*lnum); + + /* Update next hop entry in rta */ + bgp_apply_mpls_labels(s, a, labels, lnum); + + /* Attributes were changed, invalidate cached entry */ + rta_free(s->cached_rta); + s->cached_rta = NULL; + + return; +} + +static uint +bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) +{ + byte *pos = buf; + + while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_ip4 *net = (void *) px->net; + + /* Encode path ID */ + if (s->add_path) { - DBG("Withdrawn routes:\n"); - tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_UNREACH_NLRI, remains-8); - *tmp++ = 0; - *tmp++ = BGP_AF_IPV6; - *tmp++ = 1; - ea->attrs[0].u.ptr->length = 3 + bgp_encode_prefixes(p, tmp, buck, remains-11); - size = bgp_encode_attrs(p, w, ea, remains); - ASSERT(size >= 0); - w += size; - remains -= size; + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); } - else + + /* Encode prefix length */ + *pos = net->pxlen; + ADVANCE(pos, size, 1); + + /* Encode MPLS labels */ + if (s->mpls) + bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1); + + /* Encode prefix body */ + ip4_addr a = ip4_hton(net->prefix); + uint b = (net->pxlen + 7) / 8; + memcpy(pos, &a, b); + ADVANCE(pos, size, b); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; +} + +static void +bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +{ + while (len) + { + net_addr_ip4 net; + u32 path_id = 0; + + /* Decode path ID */ + if (s->add_path) { - while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next) - { - if (EMPTY_LIST(buck->prefixes)) - { - DBG("Deleting empty bucket %p\n", buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - } - - DBG("Processing bucket %p\n", buck); - rem_stored = remains; - w_stored = w; - - size = bgp_encode_attrs(p, w, buck->eattrs, remains - 1024); - if (size < 0) - { - log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name); - bgp_flush_prefixes(p, buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - } - w += size; - remains -= size; - - /* We have two addresses here in NEXT_HOP eattr. Really. - Unless NEXT_HOP was modified by filter */ - nh = ea_find(buck->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP)); - ASSERT(nh); - second = (nh->u.ptr->length == NEXT_HOP_LENGTH); - ipp = (ip_addr *) nh->u.ptr->data; - ip = ipp[0]; - ip_ll = IPA_NONE; - - if (ipa_equal(ip, p->source_addr)) - ip_ll = p->local_link; - else - { - /* If we send a route with 'third party' next hop destinated - * in the same interface, we should also send a link local - * next hop address. We use the received one (stored in the - * other part of BA_NEXT_HOP eattr). If we didn't received - * it (for example it is a static route), we can't use - * 'third party' next hop and we have to use local IP address - * as next hop. Sending original next hop address without - * link local address seems to be a natural way to solve that - * problem, but it is contrary to RFC 2545 and Quagga does not - * accept such routes. - * - * There are two cases, either we have global IP, or - * IPA_NONE if the neighbor is link-local. For IPA_NONE, - * we suppose it is on the same iface, see bgp_update_attrs(). - */ - - if (ipa_zero(ip) || same_iface(p, &ip)) - { - if (second && ipa_nonzero(ipp[1])) - ip_ll = ipp[1]; - else - { - switch (p->cf->missing_lladdr) - { - case MLL_SELF: - ip = p->source_addr; - ip_ll = p->local_link; - break; - case MLL_DROP: - log(L_ERR "%s: Missing link-local next hop address, skipping corresponding routes", p->p.name); - w = w_stored; - remains = rem_stored; - bgp_flush_prefixes(p, buck); - rem_node(&buck->send_node); - bgp_free_bucket(p, buck); - continue; - case MLL_IGNORE: - break; - } - } - } - } - - tstart = tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_REACH_NLRI, remains-8); - *tmp++ = 0; - *tmp++ = BGP_AF_IPV6; - *tmp++ = 1; - - if (ipa_is_link_local(ip)) - ip = IPA_NONE; - - if (ipa_nonzero(ip_ll)) - { - *tmp++ = 32; - ipa_hton(ip); - memcpy(tmp, &ip, 16); - ipa_hton(ip_ll); - memcpy(tmp+16, &ip_ll, 16); - tmp += 32; - } - else - { - *tmp++ = 16; - ipa_hton(ip); - memcpy(tmp, &ip, 16); - tmp += 16; - } - - *tmp++ = 0; /* No SNPA information */ - tmp += bgp_encode_prefixes(p, tmp, buck, remains - (8+3+32+1)); - ea->attrs[0].u.ptr->length = tmp - tstart; - size = bgp_encode_attrs(p, w, ea, remains); - ASSERT(size >= 0); - w += size; - break; - } + if (len < 5) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); } - size = w - (buf+4); - put_u16(buf+2, size); - lp_flush(bgp_linpool); - if (size) + /* Decode prefix length */ + uint l = *pos; + ADVANCE(pos, len, 1); + + if (len < ((l + 7) / 8)) + bgp_parse_error(s, 1); + + /* Decode MPLS labels */ + if (s->mpls) + bgp_decode_mpls_labels(s, &pos, &len, &l, a); + + if (l > IP4_MAX_PREFIX_LENGTH) + bgp_parse_error(s, 10); + + /* Decode prefix body */ + ip4_addr addr = IP4_NONE; + uint b = (l + 7) / 8; + memcpy(&addr, pos, b); + ADVANCE(pos, len, b); + + net = NET_ADDR_IP4(ip4_ntoh(addr), l); + net_normalize_ip4(&net); + + // XXXX validate prefix + + bgp_rte_update(s, (net_addr *) &net, path_id, a); + } +} + + +static uint +bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) +{ + byte *pos = buf; + + while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_ip6 *net = (void *) px->net; + + /* Encode path ID */ + if (s->add_path) { - BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE"); - return w; + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); } - else - return NULL; + + /* Encode prefix length */ + *pos = net->pxlen; + ADVANCE(pos, size, 1); + + /* Encode MPLS labels */ + if (s->mpls) + bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1); + + /* Encode prefix body */ + ip6_addr a = ip6_hton(net->prefix); + uint b = (net->pxlen + 7) / 8; + memcpy(pos, &a, b); + ADVANCE(pos, size, b); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; } -static byte * -bgp_create_end_mark(struct bgp_conn *conn, byte *buf) +static void +bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending END-OF-RIB"); + while (len) + { + net_addr_ip6 net; + u32 path_id = 0; - put_u16(buf+0, 0); - put_u16(buf+2, 6); /* length 4-9 */ - buf += 4; + /* Decode path ID */ + if (s->add_path) + { + if (len < 5) + bgp_parse_error(s, 1); - /* Empty MP_UNREACH_NLRI atribute */ - *buf++ = BAF_OPTIONAL; - *buf++ = BA_MP_UNREACH_NLRI; - *buf++ = 3; /* Length 7-9 */ - *buf++ = 0; /* AFI */ - *buf++ = BGP_AF_IPV6; - *buf++ = 1; /* SAFI */ - return buf; -} + path_id = get_u32(pos); + ADVANCE(pos, len, 4); + } -#endif + /* Decode prefix length */ + uint l = *pos; + ADVANCE(pos, len, 1); -static inline byte * -bgp_create_route_refresh(struct bgp_conn *conn, byte *buf) -{ - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH"); + if (len < ((l + 7) / 8)) + bgp_parse_error(s, 1); - /* Original original route refresh request, RFC 2918 */ - *buf++ = 0; - *buf++ = BGP_AF; - *buf++ = BGP_RR_REQUEST; - *buf++ = 1; /* SAFI */ - return buf; + /* Decode MPLS labels */ + if (s->mpls) + bgp_decode_mpls_labels(s, &pos, &len, &l, a); + + if (l > IP6_MAX_PREFIX_LENGTH) + bgp_parse_error(s, 10); + + /* Decode prefix body */ + ip6_addr addr = IP6_NONE; + uint b = (l + 7) / 8; + memcpy(&addr, pos, b); + ADVANCE(pos, len, b); + + net = NET_ADDR_IP6(ip6_ntoh(addr), l); + net_normalize_ip6(&net); + + // XXXX validate prefix + + bgp_rte_update(s, (net_addr *) &net, path_id, a); + } } -static inline byte * -bgp_create_begin_refresh(struct bgp_conn *conn, byte *buf) +static uint +bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR"); + byte *pos = buf; - /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */ - *buf++ = 0; - *buf++ = BGP_AF; - *buf++ = BGP_RR_BEGIN; - *buf++ = 1; /* SAFI */ - return buf; + while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_vpn4 *net = (void *) px->net; + + /* Encode path ID */ + if (s->add_path) + { + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); + } + + /* Encode prefix length */ + *pos = 64 + net->pxlen; + ADVANCE(pos, size, 1); + + /* Encode MPLS labels */ + if (s->mpls) + bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1); + + /* Encode route distinguisher */ + put_u64(pos, net->rd); + ADVANCE(pos, size, 8); + + /* Encode prefix body */ + ip4_addr a = ip4_hton(net->prefix); + uint b = (net->pxlen + 7) / 8; + memcpy(pos, &a, b); + ADVANCE(pos, size, b); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; } -static inline byte * -bgp_create_end_refresh(struct bgp_conn *conn, byte *buf) +static void +bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) { - struct bgp_proto *p = conn->bgp; - BGP_TRACE(D_PACKETS, "Sending END-OF-RR"); + while (len) + { + net_addr_vpn4 net; + u32 path_id = 0; - /* Demarcation of ending of route refresh (EoRR), RFC 7313 */ - *buf++ = 0; - *buf++ = BGP_AF; - *buf++ = BGP_RR_END; - *buf++ = 1; /* SAFI */ - return buf; + /* Decode path ID */ + if (s->add_path) + { + if (len < 5) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); + } + + /* Decode prefix length */ + uint l = *pos; + ADVANCE(pos, len, 1); + + if (len < ((l + 7) / 8)) + bgp_parse_error(s, 1); + + /* Decode MPLS labels */ + if (s->mpls) + bgp_decode_mpls_labels(s, &pos, &len, &l, a); + + /* Decode route distinguisher */ + if (l < 64) + bgp_parse_error(s, 1); + + u64 rd = get_u64(pos); + ADVANCE(pos, len, 8); + l -= 64; + + if (l > IP4_MAX_PREFIX_LENGTH) + bgp_parse_error(s, 10); + + /* Decode prefix body */ + ip4_addr addr = IP4_NONE; + uint b = (l + 7) / 8; + memcpy(&addr, pos, b); + ADVANCE(pos, len, b); + + net = NET_ADDR_VPN4(ip4_ntoh(addr), l, rd); + net_normalize_vpn4(&net); + + // XXXX validate prefix + + bgp_rte_update(s, (net_addr *) &net, path_id, a); + } } +static uint +bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) +{ + byte *pos = buf; + + while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_vpn6 *net = (void *) px->net; + + /* Encode path ID */ + if (s->add_path) + { + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); + } + + /* Encode prefix length */ + *pos = 64 + net->pxlen; + ADVANCE(pos, size, 1); + + /* Encode MPLS labels */ + if (s->mpls) + bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1); + + /* Encode route distinguisher */ + put_u64(pos, net->rd); + ADVANCE(pos, size, 8); + + /* Encode prefix body */ + ip6_addr a = ip6_hton(net->prefix); + uint b = (net->pxlen + 7) / 8; + memcpy(pos, &a, b); + ADVANCE(pos, size, b); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; +} + static void -bgp_create_header(byte *buf, uint len, uint type) +bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) { - memset(buf, 0xff, 16); /* Marker */ - put_u16(buf+16, len); - buf[18] = type; + while (len) + { + net_addr_vpn6 net; + u32 path_id = 0; + + /* Decode path ID */ + if (s->add_path) + { + if (len < 5) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); + } + + /* Decode prefix length */ + uint l = *pos; + ADVANCE(pos, len, 1); + + if (len < ((l + 7) / 8)) + bgp_parse_error(s, 1); + + /* Decode MPLS labels */ + if (s->mpls) + bgp_decode_mpls_labels(s, &pos, &len, &l, a); + + /* Decode route distinguisher */ + if (l < 64) + bgp_parse_error(s, 1); + + u64 rd = get_u64(pos); + ADVANCE(pos, len, 8); + l -= 64; + + if (l > IP6_MAX_PREFIX_LENGTH) + bgp_parse_error(s, 10); + + /* Decode prefix body */ + ip6_addr addr = IP6_NONE; + uint b = (l + 7) / 8; + memcpy(&addr, pos, b); + ADVANCE(pos, len, b); + + net = NET_ADDR_VPN6(ip6_ntoh(addr), l, rd); + net_normalize_vpn6(&net); + + // XXXX validate prefix + + bgp_rte_update(s, (net_addr *) &net, path_id, a); + } } -/** - * bgp_fire_tx - transmit packets - * @conn: connection - * - * Whenever the transmit buffers of the underlying TCP connection - * are free and we have any packets queued for sending, the socket functions - * call bgp_fire_tx() which takes care of selecting the highest priority packet - * queued (Notification > Keepalive > Open > Update), assembling its header - * and body and sending it to the connection. - */ -static int -bgp_fire_tx(struct bgp_conn *conn) + +static uint +bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) { - struct bgp_proto *p = conn->bgp; - uint s = conn->packets_to_send; - sock *sk = conn->sk; - byte *buf, *pkt, *end; - int type; + byte *pos = buf; + + while (!EMPTY_LIST(buck->prefixes) && (size >= 4)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_flow4 *net = (void *) px->net; + uint flen = net->length - sizeof(net_addr_flow4); - if (!sk) + /* Encode path ID */ + if (s->add_path) { - conn->packets_to_send = 0; - return 0; + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); } - buf = sk->tbuf; - pkt = buf + BGP_HEADER_LENGTH; - if (s & (1 << PKT_SCHEDULE_CLOSE)) + if (flen > size) + break; + + /* Copy whole flow data including length */ + memcpy(pos, net->data, flen); + ADVANCE(pos, size, flen); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; +} + +static void +bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +{ + while (len) + { + u32 path_id = 0; + + /* Decode path ID */ + if (s->add_path) { - /* We can finally close connection and enter idle state */ - bgp_conn_enter_idle_state(conn); - return 0; + if (len < 4) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); } - if (s & (1 << PKT_NOTIFICATION)) + + if (len < 2) + bgp_parse_error(s, 1); + + /* Decode flow length */ + uint hlen = flow_hdr_length(pos); + uint dlen = flow_read_length(pos); + uint flen = hlen + dlen; + byte *data = pos + hlen; + + if (len < flen) + bgp_parse_error(s, 1); + + /* Validate flow data */ + enum flow_validated_state r = flow4_validate(data, dlen); + if (r != FLOW_ST_VALID) { - s = 1 << PKT_SCHEDULE_CLOSE; - type = PKT_NOTIFICATION; - end = bgp_create_notification(conn, pkt); + log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r)); + bgp_parse_error(s, 1); } - else if (s & (1 << PKT_KEEPALIVE)) + + if (data[0] != FLOW_TYPE_DST_PREFIX) { - s &= ~(1 << PKT_KEEPALIVE); - type = PKT_KEEPALIVE; - end = pkt; /* Keepalives carry no data */ - BGP_TRACE(D_PACKETS, "Sending KEEPALIVE"); - bgp_start_timer(conn->keepalive_timer, conn->keepalive_time); + log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name); + bgp_parse_error(s, 1); } - else if (s & (1 << PKT_OPEN)) + + /* Decode dst prefix */ + ip4_addr px = IP4_NONE; + uint pxlen = data[1]; + + // FIXME: Use some generic function + memcpy(&px, data+2, BYTES(pxlen)); + px = ip4_and(ip4_ntoh(px), ip4_mkmask(pxlen)); + + /* Prepare the flow */ + net_addr *n = alloca(sizeof(struct net_addr_flow4) + flen); + net_fill_flow4(n, px, pxlen, pos, flen); + ADVANCE(pos, len, flen); + + bgp_rte_update(s, n, path_id, a); + } +} + + +static uint +bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size) +{ + byte *pos = buf; + + while (!EMPTY_LIST(buck->prefixes) && (size >= 4)) + { + struct bgp_prefix *px = HEAD(buck->prefixes); + struct net_addr_flow6 *net = (void *) px->net; + uint flen = net->length - sizeof(net_addr_flow6); + + /* Encode path ID */ + if (s->add_path) { - s &= ~(1 << PKT_OPEN); - type = PKT_OPEN; - end = bgp_create_open(conn, pkt); + put_u32(pos, px->path_id); + ADVANCE(pos, size, 4); } - else if (s & (1 << PKT_ROUTE_REFRESH)) + + if (flen > size) + break; + + /* Copy whole flow data including length */ + memcpy(pos, net->data, flen); + ADVANCE(pos, size, flen); + + bgp_free_prefix(s->channel, px); + } + + return pos - buf; +} + +static void +bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a) +{ + while (len) + { + u32 path_id = 0; + + /* Decode path ID */ + if (s->add_path) { - s &= ~(1 << PKT_ROUTE_REFRESH); - type = PKT_ROUTE_REFRESH; - end = bgp_create_route_refresh(conn, pkt); + if (len < 4) + bgp_parse_error(s, 1); + + path_id = get_u32(pos); + ADVANCE(pos, len, 4); } - else if (s & (1 << PKT_BEGIN_REFRESH)) + + if (len < 2) + bgp_parse_error(s, 1); + + /* Decode flow length */ + uint hlen = flow_hdr_length(pos); + uint dlen = flow_read_length(pos); + uint flen = hlen + dlen; + byte *data = pos + hlen; + + if (len < flen) + bgp_parse_error(s, 1); + + /* Validate flow data */ + enum flow_validated_state r = flow6_validate(data, dlen); + if (r != FLOW_ST_VALID) { - s &= ~(1 << PKT_BEGIN_REFRESH); - type = PKT_ROUTE_REFRESH; /* BoRR is a subtype of RR */ - end = bgp_create_begin_refresh(conn, pkt); + log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r)); + bgp_parse_error(s, 1); } - else if (s & (1 << PKT_UPDATE)) + + if (data[0] != FLOW_TYPE_DST_PREFIX) { - type = PKT_UPDATE; - end = bgp_create_update(conn, pkt); + log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name); + bgp_parse_error(s, 1); + } - if (!end) - { - /* No update to send, perhaps we need to send End-of-RIB or EoRR */ + /* Decode dst prefix */ + ip6_addr px = IP6_NONE; + uint pxlen = data[1]; - conn->packets_to_send = 0; + // FIXME: Use some generic function + memcpy(&px, data+2, BYTES(pxlen)); + px = ip6_and(ip6_ntoh(px), ip6_mkmask(pxlen)); - if (p->feed_state == BFS_LOADED) - { - type = PKT_UPDATE; - end = bgp_create_end_mark(conn, pkt); - } + /* Prepare the flow */ + net_addr *n = alloca(sizeof(struct net_addr_flow6) + flen); + net_fill_flow6(n, px, pxlen, pos, flen); + ADVANCE(pos, len, flen); - else if (p->feed_state == BFS_REFRESHED) - { - type = PKT_ROUTE_REFRESH; - end = bgp_create_end_refresh(conn, pkt); - } + bgp_rte_update(s, n, path_id, a); + } +} - else /* Really nothing to send */ - return 0; - p->feed_state = BFS_NONE; - } - } - else - return 0; +static const struct bgp_af_desc bgp_af_table[] = { + { + .afi = BGP_AF_IPV4, + .net = NET_IP4, + .name = "ipv4", + .encode_nlri = bgp_encode_nlri_ip4, + .decode_nlri = bgp_decode_nlri_ip4, + .encode_next_hop = bgp_encode_next_hop_ip, + .decode_next_hop = bgp_decode_next_hop_ip, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_IPV4_MC, + .net = NET_IP4, + .name = "ipv4-mc", + .encode_nlri = bgp_encode_nlri_ip4, + .decode_nlri = bgp_decode_nlri_ip4, + .encode_next_hop = bgp_encode_next_hop_ip, + .decode_next_hop = bgp_decode_next_hop_ip, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_IPV4_MPLS, + .net = NET_IP4, + .mpls = 1, + .name = "ipv4-mpls", + .encode_nlri = bgp_encode_nlri_ip4, + .decode_nlri = bgp_decode_nlri_ip4, + .encode_next_hop = bgp_encode_next_hop_ip, + .decode_next_hop = bgp_decode_next_hop_ip, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_IPV6, + .net = NET_IP6, + .name = "ipv6", + .encode_nlri = bgp_encode_nlri_ip6, + .decode_nlri = bgp_decode_nlri_ip6, + .encode_next_hop = bgp_encode_next_hop_ip, + .decode_next_hop = bgp_decode_next_hop_ip, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_IPV6_MC, + .net = NET_IP6, + .name = "ipv6-mc", + .encode_nlri = bgp_encode_nlri_ip6, + .decode_nlri = bgp_decode_nlri_ip6, + .encode_next_hop = bgp_encode_next_hop_ip, + .decode_next_hop = bgp_decode_next_hop_ip, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_IPV6_MPLS, + .net = NET_IP6, + .mpls = 1, + .name = "ipv6-mpls", + .encode_nlri = bgp_encode_nlri_ip6, + .decode_nlri = bgp_decode_nlri_ip6, + .encode_next_hop = bgp_encode_next_hop_ip, + .decode_next_hop = bgp_decode_next_hop_ip, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_VPN4_MPLS, + .net = NET_VPN4, + .mpls = 1, + .name = "vpn4-mpls", + .encode_nlri = bgp_encode_nlri_vpn4, + .decode_nlri = bgp_decode_nlri_vpn4, + .encode_next_hop = bgp_encode_next_hop_vpn, + .decode_next_hop = bgp_decode_next_hop_vpn, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_VPN6_MPLS, + .net = NET_VPN6, + .mpls = 1, + .name = "vpn6-mpls", + .encode_nlri = bgp_encode_nlri_vpn6, + .decode_nlri = bgp_decode_nlri_vpn6, + .encode_next_hop = bgp_encode_next_hop_vpn, + .decode_next_hop = bgp_decode_next_hop_vpn, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_VPN4_MC, + .net = NET_VPN4, + .name = "vpn4-mc", + .encode_nlri = bgp_encode_nlri_vpn4, + .decode_nlri = bgp_decode_nlri_vpn4, + .encode_next_hop = bgp_encode_next_hop_vpn, + .decode_next_hop = bgp_decode_next_hop_vpn, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_VPN6_MC, + .net = NET_VPN6, + .name = "vpn6-mc", + .encode_nlri = bgp_encode_nlri_vpn6, + .decode_nlri = bgp_decode_nlri_vpn6, + .encode_next_hop = bgp_encode_next_hop_vpn, + .decode_next_hop = bgp_decode_next_hop_vpn, + .update_next_hop = bgp_update_next_hop_ip, + }, + { + .afi = BGP_AF_FLOW4, + .net = NET_FLOW4, + .no_igp = 1, + .name = "flow4", + .encode_nlri = bgp_encode_nlri_flow4, + .decode_nlri = bgp_decode_nlri_flow4, + .encode_next_hop = bgp_encode_next_hop_none, + .decode_next_hop = bgp_decode_next_hop_none, + .update_next_hop = bgp_update_next_hop_none, + }, + { + .afi = BGP_AF_FLOW6, + .net = NET_FLOW6, + .no_igp = 1, + .name = "flow6", + .encode_nlri = bgp_encode_nlri_flow6, + .decode_nlri = bgp_decode_nlri_flow6, + .encode_next_hop = bgp_encode_next_hop_none, + .decode_next_hop = bgp_decode_next_hop_none, + .update_next_hop = bgp_update_next_hop_none, + }, +}; - conn->packets_to_send = s; - bgp_create_header(buf, end - buf, type); - return sk_send(sk, end - buf); +const struct bgp_af_desc * +bgp_get_af_desc(u32 afi) +{ + uint i; + for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++) + if (bgp_af_table[i].afi == afi) + return &bgp_af_table[i]; + + return NULL; } -/** - * bgp_schedule_packet - schedule a packet for transmission - * @conn: connection - * @type: packet type - * - * Schedule a packet of type @type to be sent as soon as possible. - */ -void -bgp_schedule_packet(struct bgp_conn *conn, int type) +static inline uint +bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - DBG("BGP: Scheduling packet type %d\n", type); - conn->packets_to_send |= 1 << type; - if (conn->sk && conn->sk->tpos == conn->sk->tbuf && !ev_active(conn->tx_ev)) - ev_schedule(conn->tx_ev); + return s->channel->desc->encode_nlri(s, buck, buf, end - buf); } -void -bgp_kick_tx(void *vconn) +static inline uint +bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf) { - struct bgp_conn *conn = vconn; - - DBG("BGP: kicking TX\n"); - while (bgp_fire_tx(conn) > 0) - ; + return s->channel->desc->encode_next_hop(s, nh, buf, 255); } void -bgp_tx(sock *sk) +bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to) { - struct bgp_conn *conn = sk->data; - - DBG("BGP: TX hook\n"); - while (bgp_fire_tx(conn) > 0) - ; + s->channel->desc->update_next_hop(s, a, to); } -/* Capatibility negotiation as per RFC 2842 */ +#define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024) -void -bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len) +static byte * +bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - // struct bgp_proto *p = conn->bgp; - int i, cl; + /* + * 2 B Withdrawn Routes Length (zero) + * --- IPv4 Withdrawn Routes NLRI (unused) + * 2 B Total Path Attribute Length + * var Path Attributes + * var IPv4 Network Layer Reachability Information + */ + + int lr, la; + + la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH); + if (la < 0) + { + /* Attribute list too long */ + bgp_withdraw_bucket(s->channel, buck); + return NULL; + } - while (len > 0) - { - if (len < 2 || len < 2 + opt[1]) - goto err; + put_u16(buf+0, 0); + put_u16(buf+2, la); - cl = opt[1]; + lr = bgp_encode_nlri(s, buck, buf+4+la, end); - switch (opt[0]) - { - case 2: /* Route refresh capability, RFC 2918 */ - if (cl != 0) - goto err; - conn->peer_refresh_support = 1; - break; + return buf+4+la+lr; +} - case 6: /* Extended message length capability, draft */ - if (cl != 0) - goto err; - conn->peer_ext_messages_support = 1; - break; +static byte * +bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) +{ + /* + * 2 B IPv4 Withdrawn Routes Length (zero) + * --- IPv4 Withdrawn Routes NLRI (unused) + * 2 B Total Path Attribute Length + * 1 B MP_REACH_NLRI hdr - Attribute Flags + * 1 B MP_REACH_NLRI hdr - Attribute Type Code + * 2 B MP_REACH_NLRI hdr - Length of Attribute Data + * 2 B MP_REACH_NLRI data - Address Family Identifier + * 1 B MP_REACH_NLRI data - Subsequent Address Family Identifier + * 1 B MP_REACH_NLRI data - Length of Next Hop Network Address + * var MP_REACH_NLRI data - Network Address of Next Hop + * 1 B MP_REACH_NLRI data - Reserved (zero) + * var MP_REACH_NLRI data - Network Layer Reachability Information + * var Rest of Path Attributes + * --- IPv4 Network Layer Reachability Information (unused) + */ + + int lh, lr, la; /* Lengths of next hop, NLRI and attributes */ + + /* Begin of MP_REACH_NLRI atribute */ + buf[4] = BAF_OPTIONAL | BAF_EXT_LEN; + buf[5] = BA_MP_REACH_NLRI; + put_u16(buf+6, 0); /* Will be fixed later */ + put_af3(buf+8, s->channel->afi); + byte *pos = buf+11; + + /* Encode attributes to temporary buffer */ + byte *abuf = alloca(MAX_ATTRS_LENGTH); + la = bgp_encode_attrs(s, buck->eattrs, abuf, abuf + MAX_ATTRS_LENGTH); + if (la < 0) + { + /* Attribute list too long */ + bgp_withdraw_bucket(s->channel, buck); + return NULL; + } - case 64: /* Graceful restart capability, RFC 4724 */ - if (cl % 4 != 2) - goto err; - conn->peer_gr_aware = 1; - conn->peer_gr_able = 0; - conn->peer_gr_time = get_u16(opt + 2) & 0x0fff; - conn->peer_gr_flags = opt[2] & 0xf0; - conn->peer_gr_aflags = 0; - for (i = 2; i < cl; i += 4) - if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */ - { - conn->peer_gr_able = 1; - conn->peer_gr_aflags = opt[2+i+3]; - } - break; + /* Encode the next hop */ + lh = bgp_encode_next_hop(s, s->mp_next_hop, pos+1); + *pos = lh; + pos += 1+lh; - case 65: /* AS4 capability, RFC 4893 */ - if (cl != 4) - goto err; - conn->peer_as4_support = 1; - if (conn->bgp->cf->enable_as4) - conn->advertised_as = get_u32(opt + 2); - break; + /* Reserved field */ + *pos++ = 0; - case 69: /* ADD-PATH capability, RFC 7911 */ - if (cl % 4) - goto err; - for (i = 0; i < cl; i += 4) - if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */ - conn->peer_add_path = opt[2+i+3]; - if (conn->peer_add_path > ADD_PATH_FULL) - goto err; - break; + /* Encode the NLRI */ + lr = bgp_encode_nlri(s, buck, pos, end - la); + pos += lr; - case 70: /* Enhanced route refresh capability, RFC 7313 */ - if (cl != 0) - goto err; - conn->peer_enhanced_refresh_support = 1; - break; + /* End of MP_REACH_NLRI atribute, update data length */ + put_u16(buf+6, pos-buf-8); - /* We can safely ignore all other capabilities */ - } - len -= 2 + cl; - opt += 2 + cl; - } - return; + /* Copy remaining attributes */ + memcpy(pos, abuf, la); + pos += la; - err: - bgp_error(conn, 2, 0, NULL, 0); - return; + /* Initial UPDATE fields */ + put_u16(buf+0, 0); + put_u16(buf+2, pos-buf-4); + + return pos; } -static int -bgp_parse_options(struct bgp_conn *conn, byte *opt, int len) +#undef MAX_ATTRS_LENGTH + +static byte * +bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - struct bgp_proto *p = conn->bgp; - int ol; + /* + * 2 B Withdrawn Routes Length + * var IPv4 Withdrawn Routes NLRI + * 2 B Total Path Attribute Length (zero) + * --- Path Attributes (unused) + * --- IPv4 Network Layer Reachability Information (unused) + */ - while (len > 0) - { - if (len < 2 || len < 2 + opt[1]) - { bgp_error(conn, 2, 0, NULL, 0); return 0; } -#ifdef LOCAL_DEBUG - { - int i; - DBG("\tOption %02x:", opt[0]); - for(i=0; i<opt[1]; i++) - DBG(" %02x", opt[2+i]); - DBG("\n"); - } -#endif + uint len = bgp_encode_nlri(s, buck, buf+2, end); - ol = opt[1]; - switch (opt[0]) - { - case 2: - if (conn->start_state == BSS_CONNECT_NOCAP) - BGP_TRACE(D_PACKETS, "Ignoring received capabilities"); - else - bgp_parse_capabilities(conn, opt + 2, ol); - break; + put_u16(buf+0, len); + put_u16(buf+2+len, 0); - default: - /* - * BGP specs don't tell us to send which option - * we didn't recognize, but it's common practice - * to do so. Also, capability negotiation with - * Cisco routers doesn't work without that. - */ - bgp_error(conn, 2, 4, opt, ol); - return 0; - } - len -= 2 + ol; - opt += 2 + ol; - } - return 0; + return buf+4+len; } -static void -bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len) +static byte * +bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end) { - struct bgp_conn *other; - struct bgp_proto *p = conn->bgp; - unsigned hold; - u16 base_as; - u32 id; + /* + * 2 B Withdrawn Routes Length (zero) + * --- IPv4 Withdrawn Routes NLRI (unused) + * 2 B Total Path Attribute Length + * 1 B MP_UNREACH_NLRI hdr - Attribute Flags + * 1 B MP_UNREACH_NLRI hdr - Attribute Type Code + * 2 B MP_UNREACH_NLRI hdr - Length of Attribute Data + * 2 B MP_UNREACH_NLRI data - Address Family Identifier + * 1 B MP_UNREACH_NLRI data - Subsequent Address Family Identifier + * var MP_UNREACH_NLRI data - Network Layer Reachability Information + * --- IPv4 Network Layer Reachability Information (unused) + */ + + uint len = bgp_encode_nlri(s, buck, buf+11, end); - /* Check state */ - if (conn->state != BS_OPENSENT) - { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } + put_u16(buf+0, 0); + put_u16(buf+2, 7+len); - /* Check message contents */ - if (len < 29 || len != 29U + pkt[28]) - { bgp_error(conn, 1, 2, pkt+16, 2); return; } - if (pkt[19] != BGP_VERSION) - { bgp_error(conn, 2, 1, pkt+19, 1); return; } /* RFC 1771 says 16 bits, draft-09 tells to use 8 */ - conn->advertised_as = base_as = get_u16(pkt+20); - hold = get_u16(pkt+22); - id = get_u32(pkt+24); - BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%08x)", conn->advertised_as, hold, id); + /* Begin of MP_UNREACH_NLRI atribute */ + buf[4] = BAF_OPTIONAL | BAF_EXT_LEN; + buf[5] = BA_MP_UNREACH_NLRI; + put_u16(buf+6, 3+len); + put_af3(buf+8, s->channel->afi); - if (bgp_parse_options(conn, pkt+29, pkt[28])) - return; + return buf+11+len; +} - if (hold > 0 && hold < 3) - { bgp_error(conn, 2, 6, pkt+22, 2); return; } +static byte * +bgp_create_update(struct bgp_channel *c, byte *buf) +{ + struct bgp_proto *p = (void *) c->c.proto; + struct bgp_bucket *buck; + byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH); + byte *res = NULL; + +again: ; + + /* Initialize write state */ + struct bgp_write_state s = { + .proto = p, + .channel = c, + .pool = bgp_linpool, + .as4_session = p->as4_session, + .add_path = c->add_path_tx, + .mpls = c->desc->mpls, + }; + + /* Try unreachable bucket */ + if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes)) + { + res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ? + bgp_create_ip_unreach(&s, buck, buf, end): + bgp_create_mp_unreach(&s, buck, buf, end); - /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */ - if (!id || (p->is_internal && id == p->local_id)) - { bgp_error(conn, 2, 3, pkt+24, -4); return; } + goto done; + } - if ((conn->advertised_as != base_as) && (base_as != AS_TRANS)) - log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name); + /* Try reachable buckets */ + if (!EMPTY_LIST(c->bucket_queue)) + { + buck = HEAD(c->bucket_queue); - if (conn->advertised_as != p->remote_as) + /* Cleanup empty buckets */ + if (EMPTY_LIST(buck->prefixes)) { - if (conn->peer_as4_support) - { - u32 val = htonl(conn->advertised_as); - bgp_error(conn, 2, 2, (byte *) &val, 4); - } - else - bgp_error(conn, 2, 2, pkt+20, 2); - - return; + bgp_free_bucket(c, buck); + goto again; } - /* Check the other connection */ - other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn; - switch (other->state) - { - case BS_CONNECT: - case BS_ACTIVE: - /* Stop outgoing connection attempts */ - bgp_conn_enter_idle_state(other); - break; + res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ? + bgp_create_ip_reach(&s, buck, buf, end): + bgp_create_mp_reach(&s, buck, buf, end); - case BS_IDLE: - case BS_OPENSENT: - case BS_CLOSE: - break; + if (EMPTY_LIST(buck->prefixes)) + bgp_free_bucket(c, buck); + else + bgp_defer_bucket(c, buck); - case BS_OPENCONFIRM: - /* - * Description of collision detection rules in RFC 4271 is confusing and - * contradictory, but it is essentially: - * - * 1. Router with higher ID is dominant - * 2. If both have the same ID, router with higher ASN is dominant [RFC6286] - * 3. When both connections are in OpenConfirm state, one initiated by - * the dominant router is kept. - * - * The first line in the expression below evaluates whether the neighbor - * is dominant, the second line whether the new connection was initiated - * by the neighbor. If both are true (or both are false), we keep the new - * connection, otherwise we keep the old one. - */ - if (((p->local_id < id) || ((p->local_id == id) && (p->local_as < p->remote_as))) - == (conn == &p->incoming_conn)) - { - /* Should close the other connection */ - BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection"); - bgp_error(other, 6, 7, NULL, 0); - break; - } - /* Fall thru */ - case BS_ESTABLISHED: - /* Should close this connection */ - BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection"); - bgp_error(conn, 6, 7, NULL, 0); - return; - default: - bug("bgp_rx_open: Unknown state"); - } + if (!res) + goto again; - /* Update our local variables */ - conn->hold_time = MIN(hold, p->cf->hold_time); - conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3; - p->remote_id = id; - p->as4_session = p->cf->enable_as4 && conn->peer_as4_support; - p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX); - p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX); - p->gr_ready = p->cf->gr_mode && conn->peer_gr_able; - p->ext_messages = p->cf->enable_extended_messages && conn->peer_ext_messages_support; - - /* Update RA mode */ - if (p->add_path_tx) - p->p.accept_ra_types = RA_ANY; - else if (p->cf->secondary) - p->p.accept_ra_types = RA_ACCEPTED; - else - p->p.accept_ra_types = RA_OPTIMAL; + goto done; + } - DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, p->as4_session); + /* No more prefixes to send */ + return NULL; - bgp_schedule_packet(conn, PKT_KEEPALIVE); - bgp_start_timer(conn->hold_timer, conn->hold_time); - bgp_conn_enter_openconfirm_state(conn); +done: + BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE"); + lp_flush(s.pool); + + return res; } +static byte * +bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf) +{ + /* Empty update packet */ + put_u32(buf, 0); -static inline void -bgp_rx_end_mark(struct bgp_proto *p) + return buf+4; +} + +static byte * +bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf) { - BGP_TRACE(D_PACKETS, "Got END-OF-RIB"); + put_u16(buf+0, 0); + put_u16(buf+2, 6); /* length 4--9 */ - if (p->load_state == BFS_LOADING) - p->load_state = BFS_NONE; + /* Empty MP_UNREACH_NLRI atribute */ + buf[4] = BAF_OPTIONAL; + buf[5] = BA_MP_UNREACH_NLRI; + buf[6] = 3; /* Length 7--9 */ + put_af3(buf+7, c->afi); - if (p->p.gr_recovery) - proto_graceful_restart_unlock(&p->p); - - if (p->gr_active) - bgp_graceful_restart_done(p); -} - - -#define DECODE_PREFIX(pp, ll) do { \ - if (p->add_path_rx) \ - { \ - if (ll < 5) { err=1; goto done; } \ - path_id = get_u32(pp); \ - pp += 4; \ - ll -= 4; \ - } \ - int b = *pp++; \ - int q; \ - ll--; \ - if (b > BITS_PER_IP_ADDRESS) { err=10; goto done; } \ - q = (b+7) / 8; \ - if (ll < q) { err=1; goto done; } \ - memcpy(&prefix, pp, q); \ - pp += q; \ - ll -= q; \ - ipa_ntoh(prefix); \ - prefix = ipa_and(prefix, ipa_mkmask(b)); \ - pxlen = b; \ -} while (0) + return buf+10; +} + +static byte * +bgp_create_end_mark(struct bgp_channel *c, byte *buf) +{ + struct bgp_proto *p = (void *) c->c.proto; + BGP_TRACE(D_PACKETS, "Sending END-OF-RIB"); + + return (c->afi == BGP_AF_IPV4) ? + bgp_create_ip_end_mark(c, buf): + bgp_create_mp_end_mark(c, buf); +} static inline void -bgp_rte_update(struct bgp_proto *p, ip_addr prefix, int pxlen, - u32 path_id, u32 *last_id, struct rte_src **src, - rta *a0, rta **a) +bgp_rx_end_mark(struct bgp_parse_state *s, u32 afi) { - if (path_id != *last_id) - { - *src = rt_get_source(&p->p, path_id); - *last_id = path_id; + struct bgp_proto *p = s->proto; + struct bgp_channel *c = bgp_get_channel(p, afi); - if (*a) - { - rta_free(*a); - *a = NULL; - } - } + BGP_TRACE(D_PACKETS, "Got END-OF-RIB"); - /* Prepare cached route attributes */ - if (!*a) - { - a0->src = *src; + if (!c) + DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi)); - /* Workaround for rta_lookup() breaking eattrs */ - ea_list *ea = a0->eattrs; - *a = rta_lookup(a0); - a0->eattrs = ea; - } + if (c->load_state == BFS_LOADING) + c->load_state = BFS_NONE; - net *n = net_get(p->p.table, prefix, pxlen); - rte *e = rte_get_temp(rta_clone(*a)); - e->net = n; - e->pflags = 0; - e->u.bgp.suppressed = 0; - rte_update2(p->p.main_ahook, n, e, *src); + if (p->p.gr_recovery) + channel_graceful_restart_unlock(&c->c); + + if (c->gr_active) + bgp_graceful_restart_done(c); } static inline void -bgp_rte_withdraw(struct bgp_proto *p, ip_addr prefix, int pxlen, - u32 path_id, u32 *last_id, struct rte_src **src) +bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len) { - if (path_id != *last_id) - { - *src = rt_find_source(&p->p, path_id); - *last_id = path_id; - } + struct bgp_channel *c = bgp_get_channel(s->proto, afi); + rta *a = NULL; - net *n = net_find(p->p.table, prefix, pxlen); - rte_update2( p->p.main_ahook, n, NULL, *src); -} + if (!c) + DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi)); -static inline int -bgp_set_next_hop(struct bgp_proto *p, rta *a) -{ - struct eattr *nh = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP)); - ip_addr *nexthop = (ip_addr *) nh->u.ptr->data; + s->channel = c; + s->add_path = c->add_path_rx; + s->mpls = c->desc->mpls; -#ifdef IPV6 - int second = (nh->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(nexthop[1]); + s->last_id = 0; + s->last_src = s->proto->p.main_source; - /* First address should not be link-local, but may be zero in direct mode */ - if (ipa_is_link_local(*nexthop)) - *nexthop = IPA_NONE; -#else - int second = 0; -#endif + /* + * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not + * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for + * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by + * decode_next_hop hooks) by restoring a->eattrs afterwards. + */ - if (p->cf->gw_mode == GW_DIRECT) - { - neighbor *ng = NULL; + if (ea) + { + a = allocz(RTA_MAX_SIZE); - if (ipa_nonzero(*nexthop)) - ng = neigh_find(&p->p, nexthop, 0); - else if (second) /* GW_DIRECT -> single_hop -> p->neigh != NULL */ - ng = neigh_find2(&p->p, nexthop + 1, p->neigh->iface, 0); + a->source = RTS_BGP; + a->scope = SCOPE_UNIVERSE; + a->from = s->proto->cf->remote_ip; + a->eattrs = ea; - /* Fallback */ - if (!ng) - ng = p->neigh; + c->desc->decode_next_hop(s, nh, nh_len, a); - if (ng->scope == SCOPE_HOST) - return 0; - - a->dest = RTD_ROUTER; - a->gw = ng->addr; - a->iface = ng->iface; - a->hostentry = NULL; - a->igp_metric = 0; - } - else /* GW_RECURSIVE */ - { - if (ipa_zero(*nexthop)) - return 0; + /* Handle withdraw during next hop decoding */ + if (s->err_withdraw) + a = NULL; + } - rta_set_recursive_next_hop(p->p.table, a, p->igp_table, nexthop, nexthop + second); - } + c->desc->decode_nlri(s, nlri, len, a); - return 1; + rta_free(s->cached_rta); + s->cached_rta = NULL; } -#ifndef IPV6 /* IPv4 version */ - static void -bgp_do_rx_update(struct bgp_conn *conn, - byte *withdrawn, int withdrawn_len, - byte *nlri, int nlri_len, - byte *attrs, int attr_len) +bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) { struct bgp_proto *p = conn->bgp; - struct rte_src *src = p->p.main_source; - rta *a0, *a = NULL; - ip_addr prefix; - int pxlen, err = 0; - u32 path_id = 0; - u32 last_id = 0; + ea_list *ea = NULL; - /* Check for End-of-RIB marker */ - if (!withdrawn_len && !attr_len && !nlri_len) - { - bgp_rx_end_mark(p); - return; - } + BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE"); - /* Withdraw routes */ - while (withdrawn_len) - { - DECODE_PREFIX(withdrawn, withdrawn_len); - DBG("Withdraw %I/%d\n", prefix, pxlen); + /* Workaround for some BGP implementations that skip initial KEEPALIVE */ + if (conn->state == BS_OPENCONFIRM) + bgp_conn_enter_established_state(conn); - bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); - } + if (conn->state != BS_ESTABLISHED) + { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - if (!attr_len && !nlri_len) /* shortcut */ - return; + bgp_start_timer(conn->hold_timer, conn->hold_time); - a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, nlri_len); + /* Initialize parse state */ + struct bgp_parse_state s = { + .proto = p, + .pool = bgp_linpool, + .as4_session = p->as4_session, + }; - if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */ - return; + /* Parse error handler */ + if (setjmp(s.err_jmpbuf)) + { + bgp_error(conn, 3, s.err_subcode, NULL, 0); + goto done; + } - if (a0 && nlri_len && !bgp_set_next_hop(p, a0)) - a0 = NULL; + /* Check minimal length */ + if (len < 23) + { bgp_error(conn, 1, 2, pkt+16, 2); return; } - last_id = 0; - src = p->p.main_source; + /* Skip fixed header */ + uint pos = 19; - while (nlri_len) - { - DECODE_PREFIX(nlri, nlri_len); - DBG("Add %I/%d\n", prefix, pxlen); + /* + * UPDATE message format + * + * 2 B IPv4 Withdrawn Routes Length + * var IPv4 Withdrawn Routes NLRI + * 2 B Total Path Attribute Length + * var Path Attributes + * var IPv4 Reachable Routes NLRI + */ - if (a0) - bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a); - else /* Forced withdraw as a result of soft error */ - bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); - } + s.ip_unreach_len = get_u16(pkt + pos); + s.ip_unreach_nlri = pkt + pos + 2; + pos += 2 + s.ip_unreach_len; - done: - if (a) - rta_free(a); + if (pos + 2 > len) + bgp_parse_error(&s, 1); - if (err) - bgp_error(conn, 3, err, NULL, 0); + s.attr_len = get_u16(pkt + pos); + s.attrs = pkt + pos + 2; + pos += 2 + s.attr_len; + + if (pos > len) + bgp_parse_error(&s, 1); + + s.ip_reach_len = len - pos; + s.ip_reach_nlri = pkt + pos; + + if (s.attr_len) + ea = bgp_decode_attrs(&s, s.attrs, s.attr_len); + else + ea = NULL; + + /* Check for End-of-RIB marker */ + if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len) + { bgp_rx_end_mark(&s, BGP_AF_IPV4); goto done; } + + /* Check for MP End-of-RIB marker */ + if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len && + !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af) + { bgp_rx_end_mark(&s, s.mp_unreach_af); goto done; } + + if (s.ip_unreach_len) + bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0); + + if (s.mp_unreach_len) + bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0); + + if (s.ip_reach_len) + bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len, + ea, s.ip_next_hop_data, s.ip_next_hop_len); + + if (s.mp_reach_len) + bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len, + ea, s.mp_next_hop_data, s.mp_next_hop_len); + +done: + rta_free(s.cached_rta); + lp_flush(s.pool); return; } -#else /* IPv6 version */ -#define DO_NLRI(name) \ - x = p->name##_start; \ - len = len0 = p->name##_len; \ - if (len) \ - { \ - if (len < 3) { err=9; goto done; } \ - af = get_u16(x); \ - x += 3; \ - len -= 3; \ - DBG("\tNLRI AF=%d sub=%d len=%d\n", af, x[-1], len);\ - } \ - else \ - af = 0; \ - if (af == BGP_AF_IPV6) +/* + * ROUTE-REFRESH + */ -static void -bgp_attach_next_hop(rta *a0, byte *x) +static inline byte * +bgp_create_route_refresh(struct bgp_channel *c, byte *buf) { - ip_addr *nh = (ip_addr *) bgp_attach_attr_wa(&a0->eattrs, bgp_linpool, BA_NEXT_HOP, NEXT_HOP_LENGTH); - memcpy(nh, x+1, 16); - ipa_ntoh(nh[0]); + struct bgp_proto *p = (void *) c->c.proto; - /* We store received link local address in the other part of BA_NEXT_HOP eattr. */ - if (*x == 32) - { - memcpy(nh+1, x+17, 16); - ipa_ntoh(nh[1]); - } - else - nh[1] = IPA_NONE; + BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH"); + + /* Original route refresh request, RFC 2918 */ + put_af4(buf, c->afi); + buf[2] = BGP_RR_REQUEST; + + return buf+4; +} + +static inline byte * +bgp_create_begin_refresh(struct bgp_channel *c, byte *buf) +{ + struct bgp_proto *p = (void *) c->c.proto; + + BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR"); + + /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */ + put_af4(buf, c->afi); + buf[2] = BGP_RR_BEGIN; + + return buf+4; } +static inline byte * +bgp_create_end_refresh(struct bgp_channel *c, byte *buf) +{ + struct bgp_proto *p = (void *) c->c.proto; + + BGP_TRACE(D_PACKETS, "Sending END-OF-RR"); + + /* Demarcation of ending of route refresh (EoRR), RFC 7313 */ + put_af4(buf, c->afi); + buf[2] = BGP_RR_END; + + return buf+4; +} static void -bgp_do_rx_update(struct bgp_conn *conn, - byte *withdrawn UNUSED, int withdrawn_len, - byte *nlri UNUSED, int nlri_len, - byte *attrs, int attr_len) +bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len) { struct bgp_proto *p = conn->bgp; - struct rte_src *src = p->p.main_source; - byte *x; - int len, len0; - unsigned af; - rta *a0, *a = NULL; - ip_addr prefix; - int pxlen, err = 0; - u32 path_id = 0; - u32 last_id = 0; - - p->mp_reach_len = 0; - p->mp_unreach_len = 0; - a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, 0); - - if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */ - return; - /* Check for End-of-RIB marker */ - if ((attr_len < 8) && !withdrawn_len && !nlri_len && !p->mp_reach_len && - (p->mp_unreach_len == 3) && (get_u16(p->mp_unreach_start) == BGP_AF_IPV6)) - { - bgp_rx_end_mark(p); - return; - } + if (conn->state != BS_ESTABLISHED) + { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - DO_NLRI(mp_unreach) - { - while (len) - { - DECODE_PREFIX(x, len); - DBG("Withdraw %I/%d\n", prefix, pxlen); - bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); - } - } + if (!conn->local_caps->route_refresh) + { bgp_error(conn, 1, 3, pkt+18, 1); return; } - DO_NLRI(mp_reach) - { - /* Create fake NEXT_HOP attribute */ - if (len < 1 || (*x != 16 && *x != 32) || len < *x + 2) - { err = 9; goto done; } + if (len < (BGP_HEADER_LENGTH + 4)) + { bgp_error(conn, 1, 2, pkt+16, 2); return; } - if (a0) - bgp_attach_next_hop(a0, x); + if (len > (BGP_HEADER_LENGTH + 4)) + { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; } - /* Also ignore one reserved byte */ - len -= *x + 2; - x += *x + 2; + struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19)); + if (!c) + { + log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring", + p->p.name, pkt[21], get_u16(pkt+19), pkt[22]); + return; + } - if (a0 && ! bgp_set_next_hop(p, a0)) - a0 = NULL; + /* RFC 7313 redefined reserved field as RR message subtype */ + uint subtype = p->enhanced_refresh ? pkt[21] : BGP_RR_REQUEST; - last_id = 0; - src = p->p.main_source; + switch (subtype) + { + case BGP_RR_REQUEST: + BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH"); + channel_request_feeding(&c->c); + break; - while (len) - { - DECODE_PREFIX(x, len); - DBG("Add %I/%d\n", prefix, pxlen); + case BGP_RR_BEGIN: + BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR"); + bgp_refresh_begin(c); + break; - if (a0) - bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a); - else /* Forced withdraw as a result of soft error */ - bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src); - } - } + case BGP_RR_END: + BGP_TRACE(D_PACKETS, "Got END-OF-RR"); + bgp_refresh_end(c); + break; - done: - if (a) - rta_free(a); + default: + log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring", + p->p.name, subtype); + break; + } +} + +static inline struct bgp_channel * +bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn) +{ + uint i = conn->last_channel; - if (err) /* Use subcode 9, not err */ - bgp_error(conn, 3, 9, NULL, 0); + /* Try the last channel, but at most several times */ + if ((conn->channels_to_send & (1 << i)) && + (conn->last_channel_count < 16)) + goto found; - return; + /* Find channel with non-zero channels_to_send */ + do + { + i++; + if (i >= p->channel_count) + i = 0; + } + while (! (conn->channels_to_send & (1 << i))); + + /* Use that channel */ + conn->last_channel = i; + conn->last_channel_count = 0; + +found: + conn->last_channel_count++; + return p->channel_map[i]; } -#endif +static inline int +bgp_send(struct bgp_conn *conn, uint type, uint len) +{ + sock *sk = conn->sk; + byte *buf = sk->tbuf; + + memset(buf, 0xff, 16); /* Marker */ + put_u16(buf+16, len); + buf[18] = type; -static void -bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len) + return sk_send(sk, len); +} + +/** + * bgp_fire_tx - transmit packets + * @conn: connection + * + * Whenever the transmit buffers of the underlying TCP connection + * are free and we have any packets queued for sending, the socket functions + * call bgp_fire_tx() which takes care of selecting the highest priority packet + * queued (Notification > Keepalive > Open > Update), assembling its header + * and body and sending it to the connection. + */ +static int +bgp_fire_tx(struct bgp_conn *conn) { struct bgp_proto *p = conn->bgp; - byte *withdrawn, *attrs, *nlri; - uint withdrawn_len, attr_len, nlri_len; + struct bgp_channel *c; + byte *buf, *pkt, *end; + uint s; - BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE"); + if (!conn->sk) + return 0; - /* Workaround for some BGP implementations that skip initial KEEPALIVE */ - if (conn->state == BS_OPENCONFIRM) - bgp_conn_enter_established_state(conn); + buf = conn->sk->tbuf; + pkt = buf + BGP_HEADER_LENGTH; + s = conn->packets_to_send; - if (conn->state != BS_ESTABLISHED) - { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - bgp_start_timer(conn->hold_timer, conn->hold_time); + if (s & (1 << PKT_SCHEDULE_CLOSE)) + { + /* We can finally close connection and enter idle state */ + bgp_conn_enter_idle_state(conn); + return 0; + } + if (s & (1 << PKT_NOTIFICATION)) + { + conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE; + end = bgp_create_notification(conn, pkt); + return bgp_send(conn, PKT_NOTIFICATION, end - buf); + } + else if (s & (1 << PKT_KEEPALIVE)) + { + conn->packets_to_send &= ~(1 << PKT_KEEPALIVE); + BGP_TRACE(D_PACKETS, "Sending KEEPALIVE"); + bgp_start_timer(conn->keepalive_timer, conn->keepalive_time); + return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH); + } + else if (s & (1 << PKT_OPEN)) + { + conn->packets_to_send &= ~(1 << PKT_OPEN); + end = bgp_create_open(conn, pkt); + return bgp_send(conn, PKT_OPEN, end - buf); + } + else while (conn->channels_to_send) + { + c = bgp_get_channel_to_send(p, conn); + s = c->packets_to_send; - /* Find parts of the packet and check sizes */ - if (len < 23) + if (s & (1 << PKT_ROUTE_REFRESH)) { - bgp_error(conn, 1, 2, pkt+16, 2); - return; + c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH); + end = bgp_create_route_refresh(c, pkt); + return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf); + } + else if (s & (1 << PKT_BEGIN_REFRESH)) + { + /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */ + c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH); + end = bgp_create_begin_refresh(c, pkt); + return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf); + } + else if (s & (1 << PKT_UPDATE)) + { + end = bgp_create_update(c, pkt); + if (end) + return bgp_send(conn, PKT_UPDATE, end - buf); + + /* No update to send, perhaps we need to send End-of-RIB or EoRR */ + c->packets_to_send = 0; + conn->channels_to_send &= ~(1 << c->index); + + if (c->feed_state == BFS_LOADED) + { + c->feed_state = BFS_NONE; + end = bgp_create_end_mark(c, pkt); + return bgp_send(conn, PKT_UPDATE, end - buf); + } + + else if (c->feed_state == BFS_REFRESHED) + { + c->feed_state = BFS_NONE; + end = bgp_create_end_refresh(c, pkt); + return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf); + } + } + else if (s) + bug("Channel packets_to_send: %x", s); + + c->packets_to_send = 0; + conn->channels_to_send &= ~(1 << c->index); + } + + return 0; +} + +/** + * bgp_schedule_packet - schedule a packet for transmission + * @conn: connection + * @c: channel + * @type: packet type + * + * Schedule a packet of type @type to be sent as soon as possible. + */ +void +bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type) +{ + ASSERT(conn->sk); + + DBG("BGP: Scheduling packet type %d\n", type); + + if (c) + { + if (! conn->channels_to_send) + { + conn->last_channel = c->index; + conn->last_channel_count = 0; } - withdrawn = pkt + 21; - withdrawn_len = get_u16(pkt + 19); - if (withdrawn_len + 23 > len) - goto malformed; - attrs = withdrawn + withdrawn_len + 2; - attr_len = get_u16(attrs - 2); - if (withdrawn_len + attr_len + 23 > len) - goto malformed; - nlri = attrs + attr_len; - nlri_len = len - withdrawn_len - attr_len - 23; - if (!attr_len && nlri_len) - goto malformed; - DBG("Sizes: withdrawn=%d, attrs=%d, NLRI=%d\n", withdrawn_len, attr_len, nlri_len); - - lp_flush(bgp_linpool); - - bgp_do_rx_update(conn, withdrawn, withdrawn_len, nlri, nlri_len, attrs, attr_len); - return; -malformed: - bgp_error(conn, 3, 1, NULL, 0); + c->packets_to_send |= 1 << type; + conn->channels_to_send |= 1 << c->index; + } + else + conn->packets_to_send |= 1 << type; + + if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev)) + ev_schedule(conn->tx_ev); } +void +bgp_kick_tx(void *vconn) +{ + struct bgp_conn *conn = vconn; + + DBG("BGP: kicking TX\n"); + while (bgp_fire_tx(conn) > 0) + ; +} + +void +bgp_tx(sock *sk) +{ + struct bgp_conn *conn = sk->data; + + DBG("BGP: TX hook\n"); + while (bgp_fire_tx(conn) > 0) + ; +} + + static struct { byte major, minor; byte *msg; @@ -1480,17 +2668,16 @@ static struct { * which might be static string or given temporary buffer. */ const char * -bgp_error_dsc(unsigned code, unsigned subcode) +bgp_error_dsc(uint code, uint subcode) { static char buff[32]; - unsigned i; + uint i; + for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++) if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode) - { - return bgp_msg_table[i].msg; - } + return bgp_msg_table[i].msg; - bsprintf(buff, "Unknown error %d.%d", code, subcode); + bsprintf(buff, "Unknown error %u.%u", code, subcode); return buff; } @@ -1521,10 +2708,10 @@ bgp_handle_message(struct bgp_proto *p, byte *data, uint len, byte **bp) } void -bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len) +bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len) { byte argbuf[256], *t = argbuf; - unsigned i; + uint i; /* Don't report Cease messages generated by myself */ if (code == 6 && class == BE_BGP_TX) @@ -1566,47 +2753,25 @@ static void bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len) { struct bgp_proto *p = conn->bgp; + if (len < 21) - { - bgp_error(conn, 1, 2, pkt+16, 2); - return; - } + { bgp_error(conn, 1, 2, pkt+16, 2); return; } - unsigned code = pkt[19]; - unsigned subcode = pkt[20]; + uint code = pkt[19]; + uint subcode = pkt[20]; int err = (code != 6); bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21); bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode); -#ifndef IPV6 - if ((code == 2) && ((subcode == 4) || (subcode == 7)) - /* Error related to capability: - * 4 - Peer does not support capabilities at all. - * 7 - Peer request some capability. Strange unless it is IPv6 only peer. - */ - && (p->cf->capabilities == 2) - /* Capabilities are not explicitly enabled or disabled, therefore heuristic is used */ - && (conn->start_state == BSS_CONNECT) - /* Failed connection attempt have used capabilities */ - && (p->cf->remote_as <= 0xFFFF)) - /* Not possible with disabled capabilities */ - { - /* We try connect without capabilities */ - log(L_WARN "%s: Capability related error received, retry with capabilities disabled", p->p.name); - p->start_state = BSS_CONNECT_NOCAP; - err = 0; - } -#endif - bgp_conn_enter_close_state(conn); - bgp_schedule_packet(conn, PKT_SCHEDULE_CLOSE); + bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE); - if (err) - { - bgp_update_startup_delay(p); - bgp_stop(p, 0, NULL, 0); - } + if (err) + { + bgp_update_startup_delay(p); + bgp_stop(p, 0, NULL, 0); + } } static void @@ -1616,64 +2781,12 @@ bgp_rx_keepalive(struct bgp_conn *conn) BGP_TRACE(D_PACKETS, "Got KEEPALIVE"); bgp_start_timer(conn->hold_timer, conn->hold_time); - switch (conn->state) - { - case BS_OPENCONFIRM: - bgp_conn_enter_established_state(conn); - break; - case BS_ESTABLISHED: - break; - default: - bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); - } -} -static void -bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len) -{ - struct bgp_proto *p = conn->bgp; + if (conn->state == BS_OPENCONFIRM) + { bgp_conn_enter_established_state(conn); return; } if (conn->state != BS_ESTABLISHED) - { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; } - - if (!p->cf->enable_refresh) - { bgp_error(conn, 1, 3, pkt+18, 1); return; } - - if (len < (BGP_HEADER_LENGTH + 4)) - { bgp_error(conn, 1, 2, pkt+16, 2); return; } - - if (len > (BGP_HEADER_LENGTH + 4)) - { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; } - - /* FIXME - we ignore AFI/SAFI values, as we support - just one value and even an error code for an invalid - request is not defined */ - - /* RFC 7313 redefined reserved field as RR message subtype */ - uint subtype = conn->peer_enhanced_refresh_support ? pkt[21] : BGP_RR_REQUEST; - - switch (subtype) - { - case BGP_RR_REQUEST: - BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH"); - proto_request_feeding(&p->p); - break; - - case BGP_RR_BEGIN: - BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR"); - bgp_refresh_begin(p); - break; - - case BGP_RR_END: - BGP_TRACE(D_PACKETS, "Got END-OF-RR"); - bgp_refresh_end(p); - break; - - default: - log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring", - p->p.name, subtype); - break; - } + bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); } @@ -1687,7 +2800,7 @@ bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len) * packet handler according to the packet type. */ static void -bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len) +bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len) { byte type = pkt[18]; @@ -1697,14 +2810,14 @@ bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len) mrt_dump_bgp_packet(conn, pkt, len); switch (type) - { - case PKT_OPEN: return bgp_rx_open(conn, pkt, len); - case PKT_UPDATE: return bgp_rx_update(conn, pkt, len); - case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len); - case PKT_KEEPALIVE: return bgp_rx_keepalive(conn); - case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len); - default: bgp_error(conn, 1, 3, pkt+18, 1); - } + { + case PKT_OPEN: return bgp_rx_open(conn, pkt, len); + case PKT_UPDATE: return bgp_rx_update(conn, pkt, len); + case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len); + case PKT_KEEPALIVE: return bgp_rx_keepalive(conn); + case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len); + default: bgp_error(conn, 1, 3, pkt+18, 1); + } } /** @@ -1721,10 +2834,9 @@ int bgp_rx(sock *sk, uint size) { struct bgp_conn *conn = sk->data; - struct bgp_proto *p = conn->bgp; byte *pkt_start = sk->rbuf; byte *end = pkt_start + size; - unsigned i, len; + uint i, len; DBG("BGP: RX hook: Got %d bytes\n", size); while (end >= pkt_start + BGP_HEADER_LENGTH) @@ -1738,7 +2850,7 @@ bgp_rx(sock *sk, uint size) break; } len = get_u16(pkt_start+16); - if (len < BGP_HEADER_LENGTH || len > bgp_max_packet_length(p)) + if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn))) { bgp_error(conn, 1, 2, pkt_start+16, 2); break; |