summaryrefslogtreecommitdiff
path: root/proto/bgp
diff options
context:
space:
mode:
Diffstat (limited to 'proto/bgp')
-rw-r--r--proto/bgp/Makefile9
-rw-r--r--proto/bgp/attrs.c2756
-rw-r--r--proto/bgp/bgp.c1634
-rw-r--r--proto/bgp/bgp.h486
-rw-r--r--proto/bgp/config.Y159
-rw-r--r--proto/bgp/packets.c3405
6 files changed, 5195 insertions, 3254 deletions
diff --git a/proto/bgp/Makefile b/proto/bgp/Makefile
index a634cf0d..00aaef5e 100644
--- a/proto/bgp/Makefile
+++ b/proto/bgp/Makefile
@@ -1,5 +1,6 @@
-source=bgp.c attrs.c packets.c
-root-rel=../../
-dir-name=proto/bgp
+src := attrs.c bgp.c packets.c
+obj := $(src-o-files)
+$(all-daemon)
+$(cf-local)
-include ../../Rules
+tests_objs := $(tests_objs) $(src-o-files) \ No newline at end of file
diff --git a/proto/bgp/attrs.c b/proto/bgp/attrs.c
index b9e2490d..882ba44e 100644
--- a/proto/bgp/attrs.c
+++ b/proto/bgp/attrs.c
@@ -2,6 +2,8 @@
* BIRD -- BGP Attributes
*
* (c) 2000 Martin Mares <mj@ucw.cz>
+ * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
+ * (c) 2008--2016 CZ.NIC z.s.p.o.
*
* Can be freely distributed and used under the terms of the GNU GPL.
*/
@@ -39,890 +41,1249 @@
* specifies that such updates should be ignored, but that is generally
* a bad idea.
*
- * Error checking of optional transitive attributes is done according to
- * draft-ietf-idr-optional-transitive-03, but errors are handled always
- * as withdraws.
+ * BGP attribute table has several hooks:
*
- * Unexpected AS_CONFED_* segments in AS_PATH are logged and removed,
- * but unknown segments cause a session drop with Malformed AS_PATH
- * error (see validate_path()). The behavior in such case is not
- * explicitly specified by RFC 4271. RFC 5065 specifies that
- * inconsistent AS_CONFED_* segments should cause a session drop, but
- * implementations that pass invalid AS_CONFED_* segments are
- * widespread.
+ * export - Hook that validates and normalizes attribute during export phase.
+ * Receives eattr, may modify it (e.g., sort community lists for canonical
+ * representation), UNSET() it (e.g., skip empty lists), or WITHDRAW() it if
+ * necessary. May assume that eattr has value valid w.r.t. its type, but may be
+ * invalid w.r.t. BGP constraints. Optional.
*
- * Error handling of AS4_* attributes is done as specified by RFC 6793. There
- * are several possible inconsistencies between AGGREGATOR and AS4_AGGREGATOR
- * that are not handled by that RFC, these are logged and ignored (see
- * bgp_reconstruct_4b_attrs()).
+ * encode - Hook that converts internal representation to external one during
+ * packet writing. Receives eattr and puts it in the buffer (including attribute
+ * header). Returns number of bytes, or -1 if not enough space. May assume that
+ * eattr has value valid w.r.t. its type and validated by export hook. Mandatory
+ * for all known attributes that exist internally after export phase (i.e., all
+ * except pseudoattributes MP_(UN)REACH_NLRI).
+ *
+ * decode - Hook that converts external representation to internal one during
+ * packet parsing. Receives attribute data in buffer, validates it and adds
+ * attribute to ea_list. If data are invalid, steps DISCARD(), WITHDRAW() or
+ * bgp_parse_error() may be used to escape. Mandatory for all known attributes.
+ *
+ * format - Optional hook that converts eattr to textual representation.
*/
+// XXXX review pool usage : c->c.proto->pool
-static byte bgp_mandatory_attrs[] = { BA_ORIGIN, BA_AS_PATH
-#ifndef IPV6
-,BA_NEXT_HOP
-#endif
-};
-struct attr_desc {
- char *name;
- int expected_length;
- int expected_flags;
- int type;
- int allow_in_ebgp;
- int (*validate)(struct bgp_proto *p, byte *attr, int len);
- void (*format)(eattr *ea, byte *buf, int buflen);
+struct bgp_attr_desc {
+ const char *name;
+ uint type;
+ uint flags;
+ void (*export)(struct bgp_export_state *s, eattr *a);
+ int (*encode)(struct bgp_write_state *s, eattr *a, byte *buf, uint size);
+ void (*decode)(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to);
+ void (*format)(eattr *ea, byte *buf, uint size);
};
-#define IGNORE -1
-#define WITHDRAW -2
+static const struct bgp_attr_desc bgp_attr_table[];
+
+static inline int bgp_attr_known(uint code);
+
+eattr *
+bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintptr_t val)
+{
+ ASSERT(bgp_attr_known(code));
+
+ ea_list *a = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr));
+ eattr *e = &a->attrs[0];
+
+ a->flags = EALF_SORTED;
+ a->count = 1;
+ a->next = *attrs;
+ *attrs = a;
+
+ e->id = EA_CODE(EAP_BGP, code);
+ e->type = bgp_attr_table[code].type;
+ e->flags = flags;
+
+ if (e->type & EAF_EMBEDDED)
+ e->u.data = (u32) val;
+ else
+ e->u.ptr = (struct adata *) val;
+
+ return e;
+}
+
+
+
+#define REPORT(msg, args...) \
+ ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); })
+
+#define DISCARD(msg, args...) \
+ ({ REPORT(msg, ## args); return; })
+
+#define WITHDRAW(msg, args...) \
+ ({ REPORT(msg, ## args); s->err_withdraw = 1; return; })
+
+#define UNSET(a) \
+ ({ a->type = EAF_TYPE_UNDEF; return; })
+
+#define NEW_BGP "Discarding %s attribute received from AS4-aware neighbor"
+#define BAD_EBGP "Discarding %s attribute received from EBGP neighbor"
+#define BAD_LENGTH "Malformed %s attribute - invalid length (%u)"
+#define BAD_VALUE "Malformed %s attribute - invalid value (%u)"
+#define NO_MANDATORY "Missing mandatory %s attribute"
+
+
+static inline int
+bgp_put_attr_hdr3(byte *buf, uint code, uint flags, uint len)
+{
+ *buf++ = flags;
+ *buf++ = code;
+ *buf++ = len;
+ return 3;
+}
+
+static inline int
+bgp_put_attr_hdr4(byte *buf, uint code, uint flags, uint len)
+{
+ *buf++ = flags | BAF_EXT_LEN;
+ *buf++ = code;
+ put_u16(buf, len);
+ return 4;
+}
+
+static inline int
+bgp_put_attr_hdr(byte *buf, uint code, uint flags, uint len)
+{
+ if (len < 256)
+ return bgp_put_attr_hdr3(buf, code, flags, len);
+ else
+ return bgp_put_attr_hdr4(buf, code, flags, len);
+}
static int
-bgp_check_origin(struct bgp_proto *p UNUSED, byte *a, int len UNUSED)
+bgp_encode_u8(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size)
{
- if (*a > 2)
- return 6;
- return 0;
+ if (size < (3+1))
+ return -1;
+
+ bgp_put_attr_hdr3(buf, EA_ID(a->id), a->flags, 1);
+ buf[3] = a->u.data;
+
+ return 3+1;
}
-static void
-bgp_format_origin(eattr *a, byte *buf, int buflen UNUSED)
+static int
+bgp_encode_u32(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size)
{
- static char *bgp_origin_names[] = { "IGP", "EGP", "Incomplete" };
+ if (size < (3+4))
+ return -1;
+
+ bgp_put_attr_hdr3(buf, EA_ID(a->id), a->flags, 4);
+ put_u32(buf+3, a->u.data);
- bsprintf(buf, bgp_origin_names[a->u.data]);
+ return 3+4;
}
static int
-path_segment_contains(byte *p, int bs, u32 asn)
+bgp_encode_u32s(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size)
{
- int i;
- int len = p[1];
- p += 2;
+ uint len = a->u.ptr->length;
- for(i=0; i<len; i++)
- {
- u32 asn2 = (bs == 4) ? get_u32(p) : get_u16(p);
- if (asn2 == asn)
- return 1;
- p += bs;
- }
+ if (size < (4+len))
+ return -1;
- return 0;
+ uint hdr = bgp_put_attr_hdr(buf, EA_ID(a->id), a->flags, len);
+ put_u32s(buf + hdr, (u32 *) a->u.ptr->data, len / 4);
+
+ return hdr + len;
}
-/* Validates path attribute, removes AS_CONFED_* segments, and also returns path length */
static int
-validate_path(struct bgp_proto *p, int as_path, int bs, byte *idata, uint *ilength)
+bgp_put_attr(byte *buf, uint size, uint code, uint flags, byte *data, uint len)
{
- int res = 0;
- u8 *a, *dst;
- int len, plen;
+ if (size < (4+len))
+ return -1;
- dst = a = idata;
- len = *ilength;
+ uint hdr = bgp_put_attr_hdr(buf, code, flags, len);
+ memcpy(buf + hdr, data, len);
- while (len)
- {
- if (len < 2)
- return -1;
-
- plen = 2 + bs * a[1];
- if (len < plen)
- return -1;
-
- if (a[1] == 0)
- {
- log(L_WARN "%s: %s_PATH attribute contains empty segment, skipping it",
- p->p.name, as_path ? "AS" : "AS4");
- goto skip;
- }
-
- switch (a[0])
- {
- case AS_PATH_SET:
- res++;
- break;
-
- case AS_PATH_SEQUENCE:
- res += a[1];
- break;
-
- case AS_PATH_CONFED_SEQUENCE:
- case AS_PATH_CONFED_SET:
- if (as_path && path_segment_contains(a, bs, p->remote_as))
- {
- log(L_WARN "%s: AS_CONFED_* segment with peer ASN found, misconfigured confederation?", p->p.name);
- return -1;
- }
-
- log(L_WARN "%s: %s_PATH attribute contains AS_CONFED_* segment, skipping segment",
- p->p.name, as_path ? "AS" : "AS4");
- goto skip;
-
- default:
- return -1;
- }
-
- if (dst != a)
- memmove(dst, a, plen);
- dst += plen;
-
- skip:
- len -= plen;
- a += plen;
- }
+ return hdr + len;
+}
- *ilength = dst - idata;
- return res;
+static int
+bgp_encode_raw(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size)
+{
+ return bgp_put_attr(buf, size, EA_ID(a->id), a->flags, a->u.ptr->data, a->u.ptr->length);
}
-static inline int
-validate_as_path(struct bgp_proto *p, byte *a, int *len)
+
+/*
+ * Attribute hooks
+ */
+
+static void
+bgp_export_origin(struct bgp_export_state *s, eattr *a)
{
- return validate_path(p, 1, p->as4_session ? 4 : 2, a, len);
+ if (a->u.data > 2)
+ WITHDRAW(BAD_VALUE, "ORIGIN", a->u.data);
}
-static inline int
-validate_as4_path(struct bgp_proto *p, struct adata *path)
+static void
+bgp_decode_origin(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
- return validate_path(p, 0, 4, path->data, &path->length);
+ if (len != 1)
+ WITHDRAW(BAD_LENGTH, "ORIGIN", len);
+
+ if (data[0] > 2)
+ WITHDRAW(BAD_VALUE, "ORIGIN", data[0]);
+
+ bgp_set_attr_u32(to, s->pool, BA_ORIGIN, flags, data[0]);
}
+static void
+bgp_format_origin(eattr *a, byte *buf, uint size UNUSED)
+{
+ static const char *bgp_origin_names[] = { "IGP", "EGP", "Incomplete" };
+
+ bsprintf(buf, (a->u.data <= 2) ? bgp_origin_names[a->u.data] : "?");
+}
+
+
static int
-bgp_check_next_hop(struct bgp_proto *p UNUSED, byte *a UNUSED6, int len UNUSED6)
+bgp_encode_as_path(struct bgp_write_state *s, eattr *a, byte *buf, uint size)
{
-#ifdef IPV6
- return IGNORE;
-#else
- ip_addr addr;
+ byte *data = a->u.ptr->data;
+ uint len = a->u.ptr->length;
+
+ if (!s->as4_session)
+ {
+ /* Prepare 16-bit AS_PATH (from 32-bit one) in a temporary buffer */
+ byte *src = data;
+ data = alloca(len);
+ len = as_path_32to16(data, src, len);
+ }
+
+ return bgp_put_attr(buf, size, BA_AS_PATH, a->flags, data, len);
+}
- memcpy(&addr, a, len);
- ipa_ntoh(addr);
- if (ipa_classify(addr) & IADDR_HOST)
+static void
+bgp_decode_as_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
+{
+ struct bgp_proto *p = s->proto;
+ int as_length = s->as4_session ? 4 : 2;
+ int as_confed = p->cf->confederation && p->is_interior;
+ char err[128];
+
+ if (!as_path_valid(data, len, as_length, as_confed, err, sizeof(err)))
+ WITHDRAW("Malformed AS_PATH attribute - %s", err);
+
+ /* In some circumstances check for initial AS_CONFED_SEQUENCE; RFC 5065 5.0 */
+ if (p->is_interior && !p->is_internal &&
+ ((len < 2) || (data[0] != AS_PATH_CONFED_SEQUENCE)))
+ WITHDRAW("Malformed AS_PATH attribute - %s", "missing initial AS_CONFED_SEQUENCE");
+
+ if (!s->as4_session)
+ {
+ /* Prepare 32-bit AS_PATH (from 16-bit one) in a temporary buffer */
+ byte *src = data;
+ data = alloca(2*len);
+ len = as_path_16to32(data, src, len);
+ }
+
+ bgp_set_attr_data(to, s->pool, BA_AS_PATH, flags, data, len);
+}
+
+
+static int
+bgp_encode_next_hop(struct bgp_write_state *s, eattr *a, byte *buf, uint size)
+{
+ /*
+ * The NEXT_HOP attribute is used only in traditional (IPv4) BGP. In MP-BGP,
+ * the next hop is encoded as a part of the MP_REACH_NLRI attribute, so we
+ * store it and encode it later by AFI-specific hooks.
+ */
+
+ if (s->channel->afi == BGP_AF_IPV4)
+ {
+ ASSERT(a->u.ptr->length == sizeof(ip_addr));
+
+ if (size < (3+4))
+ return -1;
+
+ bgp_put_attr_hdr3(buf, BA_NEXT_HOP, a->flags, 4);
+ put_ip4(buf+3, ipa_to_ip4( *(ip_addr *) a->u.ptr->data ));
+
+ return 3+4;
+ }
+ else
+ {
+ s->mp_next_hop = a;
return 0;
+ }
+}
+
+static void
+bgp_decode_next_hop(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data, uint len, ea_list **to UNUSED)
+{
+ if (len != 4)
+ WITHDRAW(BAD_LENGTH, "NEXT_HOP", len);
+
+ /* Semantic checks are done later */
+ s->ip_next_hop_len = len;
+ s->ip_next_hop_data = data;
+}
+
+/* TODO: This function should use AF-specific hook */
+static void
+bgp_format_next_hop(eattr *a, byte *buf, uint size UNUSED)
+{
+ ip_addr *nh = (void *) a->u.ptr->data;
+ uint len = a->u.ptr->length;
+
+ ASSERT((len == 16) || (len == 32));
+
+ /* in IPv6, we may have two addresses in NEXT HOP */
+ if ((len == 16) || ipa_zero(nh[1]))
+ bsprintf(buf, "%I", nh[0]);
else
- return 8;
-#endif
+ bsprintf(buf, "%I %I", nh[0], nh[1]);
}
+
static void
-bgp_format_next_hop(eattr *a, byte *buf, int buflen UNUSED)
+bgp_decode_med(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
- ip_addr *ipp = (ip_addr *) a->u.ptr->data;
-#ifdef IPV6
- /* in IPv6, we might have two addresses in NEXT HOP */
- if ((a->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(ipp[1]))
- {
- bsprintf(buf, "%I %I", ipp[0], ipp[1]);
- return;
- }
-#endif
+ if (len != 4)
+ WITHDRAW(BAD_LENGTH, "MULTI_EXIT_DISC", len);
- bsprintf(buf, "%I", ipp[0]);
+ u32 val = get_u32(data);
+ bgp_set_attr_u32(to, s->pool, BA_MULTI_EXIT_DISC, flags, val);
}
-static int
-bgp_check_aggregator(struct bgp_proto *p, byte *a UNUSED, int len)
+
+static void
+bgp_export_local_pref(struct bgp_export_state *s, eattr *a)
{
- int exp_len = p->as4_session ? 8 : 6;
-
- return (len == exp_len) ? 0 : WITHDRAW;
+ if (!s->proto->is_interior && !s->proto->cf->allow_local_pref)
+ UNSET(a);
}
static void
-bgp_format_aggregator(eattr *a, byte *buf, int buflen UNUSED)
+bgp_decode_local_pref(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
- struct adata *ad = a->u.ptr;
- byte *data = ad->data;
- u32 as;
+ if (!s->proto->is_interior && !s->proto->cf->allow_local_pref)
+ DISCARD(BAD_EBGP, "LOCAL_PREF");
- as = get_u32(data);
- data += 4;
+ if (len != 4)
+ WITHDRAW(BAD_LENGTH, "LOCAL_PREF", len);
- bsprintf(buf, "%d.%d.%d.%d AS%u", data[0], data[1], data[2], data[3], as);
+ u32 val = get_u32(data);
+ bgp_set_attr_u32(to, s->pool, BA_LOCAL_PREF, flags, val);
}
-static int
-bgp_check_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
+
+static void
+bgp_decode_atomic_aggr(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to)
{
- return ((len % 4) == 0) ? 0 : WITHDRAW;
+ if (len != 0)
+ DISCARD(BAD_LENGTH, "ATOMIC_AGGR", len);
+
+ bgp_set_attr_data(to, s->pool, BA_ATOMIC_AGGR, flags, NULL, 0);
}
static int
-bgp_check_cluster_list(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
+bgp_encode_aggregator(struct bgp_write_state *s, eattr *a, byte *buf, uint size)
{
- return ((len % 4) == 0) ? 0 : 5;
+ byte *data = a->u.ptr->data;
+ uint len = a->u.ptr->length;
+
+ if (!s->as4_session)
+ {
+ /* Prepare 16-bit AGGREGATOR (from 32-bit one) in a temporary buffer */
+ byte *src = data;
+ data = alloca(6);
+ len = aggregator_32to16(data, src);
+ }
+
+ return bgp_put_attr(buf, size, BA_AGGREGATOR, a->flags, data, len);
}
static void
-bgp_format_cluster_list(eattr *a, byte *buf, int buflen)
+bgp_decode_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
- /* Truncates cluster lists larger than buflen, probably not a problem */
- int_set_format(a->u.ptr, 0, -1, buf, buflen);
+ if (len != (s->as4_session ? 8 : 6))
+ DISCARD(BAD_LENGTH, "AGGREGATOR", len);
+
+ if (!s->as4_session)
+ {
+ /* Prepare 32-bit AGGREGATOR (from 16-bit one) in a temporary buffer */
+ byte *src = data;
+ data = alloca(8);
+ len = aggregator_16to32(data, src);
+ }
+
+ bgp_set_attr_data(to, s->pool, BA_AGGREGATOR, flags, data, len);
}
-static int
-bgp_check_reach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED)
+static void
+bgp_format_aggregator(eattr *a, byte *buf, uint size UNUSED)
{
-#ifdef IPV6
- p->mp_reach_start = a;
- p->mp_reach_len = len;
-#endif
- return IGNORE;
+ byte *data = a->u.ptr->data;
+
+ bsprintf(buf, "%I4 AS%u", get_ip4(data+4), get_u32(data+0));
}
-static int
-bgp_check_unreach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED)
+
+static void
+bgp_export_community(struct bgp_export_state *s, eattr *a)
{
-#ifdef IPV6
- p->mp_unreach_start = a;
- p->mp_unreach_len = len;
-#endif
- return IGNORE;
+ if (a->u.ptr->length == 0)
+ UNSET(a);
+
+ a->u.ptr = int_set_sort(s->pool, a->u.ptr);
}
-static int
-bgp_check_ext_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
+static void
+bgp_decode_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
- return ((len % 8) == 0) ? 0 : WITHDRAW;
+ if (!len || (len % 4))
+ WITHDRAW(BAD_LENGTH, "COMMUNITY", len);
+
+ struct adata *ad = lp_alloc_adata(s->pool, len);
+ get_u32s(data, (u32 *) ad->data, len / 4);
+ bgp_set_attr_ptr(to, s->pool, BA_COMMUNITY, flags, ad);
}
-static int
-bgp_check_large_community(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
-{
- return ((len % 12) == 0) ? 0 : WITHDRAW;
-}
-
-
-static struct attr_desc bgp_attr_table[] = {
- { NULL, -1, 0, 0, 0, /* Undefined */
- NULL, NULL },
- { "origin", 1, BAF_TRANSITIVE, EAF_TYPE_INT, 1, /* BA_ORIGIN */
- bgp_check_origin, bgp_format_origin },
- { "as_path", -1, BAF_TRANSITIVE, EAF_TYPE_AS_PATH, 1, /* BA_AS_PATH */
- NULL, NULL }, /* is checked by validate_as_path() as a special case */
- { "next_hop", 4, BAF_TRANSITIVE, EAF_TYPE_IP_ADDRESS, 1, /* BA_NEXT_HOP */
- bgp_check_next_hop, bgp_format_next_hop },
- { "med", 4, BAF_OPTIONAL, EAF_TYPE_INT, 1, /* BA_MULTI_EXIT_DISC */
- NULL, NULL },
- { "local_pref", 4, BAF_TRANSITIVE, EAF_TYPE_INT, 1, /* BA_LOCAL_PREF */
- NULL, NULL },
- { "atomic_aggr", 0, BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_ATOMIC_AGGR */
- NULL, NULL },
- { "aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_AGGREGATOR */
- bgp_check_aggregator, bgp_format_aggregator },
- { "community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_INT_SET, 1, /* BA_COMMUNITY */
- bgp_check_community, NULL },
- { "originator_id", 4, BAF_OPTIONAL, EAF_TYPE_ROUTER_ID, 0, /* BA_ORIGINATOR_ID */
- NULL, NULL },
- { "cluster_list", -1, BAF_OPTIONAL, EAF_TYPE_INT_SET, 0, /* BA_CLUSTER_LIST */
- bgp_check_cluster_list, bgp_format_cluster_list },
- { .name = NULL }, /* BA_DPA */
- { .name = NULL }, /* BA_ADVERTISER */
- { .name = NULL }, /* BA_RCID_PATH */
- { "mp_reach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1, /* BA_MP_REACH_NLRI */
- bgp_check_reach_nlri, NULL },
- { "mp_unreach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1, /* BA_MP_UNREACH_NLRI */
- bgp_check_unreach_nlri, NULL },
- { "ext_community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_EC_SET, 1, /* BA_EXT_COMMUNITY */
- bgp_check_ext_community, NULL },
- { "as4_path", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_AS4_PATH */
- NULL, NULL },
- { "as4_aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1, /* BA_AS4_PATH */
- NULL, NULL },
- [BA_LARGE_COMMUNITY] =
- { "large_community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_LC_SET, 1,
- bgp_check_large_community, NULL }
-};
-/* BA_AS4_PATH is type EAF_TYPE_OPAQUE and not type EAF_TYPE_AS_PATH.
- * It does not matter as this attribute does not appear on routes in the routing table.
- */
+static void
+bgp_export_originator_id(struct bgp_export_state *s, eattr *a)
+{
+ if (!s->proto->is_internal)
+ UNSET(a);
+}
+
+static void
+bgp_decode_originator_id(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
+{
+ if (!s->proto->is_internal)
+ DISCARD(BAD_EBGP, "ORIGINATOR_ID");
-#define ATTR_KNOWN(code) ((code) < ARRAY_SIZE(bgp_attr_table) && bgp_attr_table[code].name)
+ if (len != 4)
+ WITHDRAW(BAD_LENGTH, "ORIGINATOR_ID", len);
-static inline struct adata *
-bgp_alloc_adata(struct linpool *pool, unsigned len)
+ u32 val = get_u32(data);
+ bgp_set_attr_u32(to, s->pool, BA_ORIGINATOR_ID, flags, val);
+}
+
+
+static void
+bgp_export_cluster_list(struct bgp_export_state *s UNUSED, eattr *a)
{
- struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len);
- ad->length = len;
- return ad;
+ if (!s->proto->is_internal)
+ UNSET(a);
+
+ if (a->u.ptr->length == 0)
+ UNSET(a);
}
static void
-bgp_set_attr(eattr *e, unsigned attr, uintptr_t val)
+bgp_decode_cluster_list(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
- ASSERT(ATTR_KNOWN(attr));
- e->id = EA_CODE(EAP_BGP, attr);
- e->type = bgp_attr_table[attr].type;
- e->flags = bgp_attr_table[attr].expected_flags;
- if (e->type & EAF_EMBEDDED)
- e->u.data = val;
- else
- e->u.ptr = (struct adata *) val;
+ if (!s->proto->is_internal)
+ DISCARD(BAD_EBGP, "CLUSTER_LIST");
+
+ if (!len || (len % 4))
+ WITHDRAW(BAD_LENGTH, "CLUSTER_LIST", len);
+
+ struct adata *ad = lp_alloc_adata(s->pool, len);
+ get_u32s(data, (u32 *) ad->data, len / 4);
+ bgp_set_attr_ptr(to, s->pool, BA_CLUSTER_LIST, flags, ad);
}
-static byte *
-bgp_set_attr_wa(eattr *e, struct linpool *pool, unsigned attr, unsigned len)
+static void
+bgp_format_cluster_list(eattr *a, byte *buf, uint size)
{
- struct adata *ad = bgp_alloc_adata(pool, len);
- bgp_set_attr(e, attr, (uintptr_t) ad);
- return ad->data;
+ /* Truncates cluster lists larger than buflen, probably not a problem */
+ int_set_format(a->u.ptr, 0, -1, buf, size);
}
-void
-bgp_attach_attr(ea_list **to, struct linpool *pool, unsigned attr, uintptr_t val)
+
+static inline u32
+get_af3(byte *buf)
{
- ea_list *a = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr));
- a->next = *to;
- *to = a;
- a->flags = EALF_SORTED;
- a->count = 1;
- bgp_set_attr(a->attrs, attr, val);
+ return (get_u16(buf) << 16) | buf[2];
}
-byte *
-bgp_attach_attr_wa(ea_list **to, struct linpool *pool, unsigned attr, unsigned len)
+static void
+bgp_decode_mp_reach_nlri(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data, uint len, ea_list **to UNUSED)
{
- struct adata *ad = bgp_alloc_adata(pool, len);
- bgp_attach_attr(to, pool, attr, (uintptr_t) ad);
- return ad->data;
+ /*
+ * 2 B MP_REACH_NLRI data - Address Family Identifier
+ * 1 B MP_REACH_NLRI data - Subsequent Address Family Identifier
+ * 1 B MP_REACH_NLRI data - Length of Next Hop Network Address
+ * var MP_REACH_NLRI data - Network Address of Next Hop
+ * 1 B MP_REACH_NLRI data - Reserved (zero)
+ * var MP_REACH_NLRI data - Network Layer Reachability Information
+ */
+
+ if ((len < 5) || (len < (5 + (uint) data[3])))
+ bgp_parse_error(s, 9);
+
+ s->mp_reach_af = get_af3(data);
+ s->mp_next_hop_len = data[3];
+ s->mp_next_hop_data = data + 4;
+ s->mp_reach_len = len - 5 - s->mp_next_hop_len;
+ s->mp_reach_nlri = data + 5 + s->mp_next_hop_len;
}
-static int
-bgp_encode_attr_hdr(byte *dst, uint flags, unsigned code, int len)
+
+static void
+bgp_decode_mp_unreach_nlri(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data, uint len, ea_list **to UNUSED)
{
- int wlen;
+ /*
+ * 2 B MP_UNREACH_NLRI data - Address Family Identifier
+ * 1 B MP_UNREACH_NLRI data - Subsequent Address Family Identifier
+ * var MP_UNREACH_NLRI data - Network Layer Reachability Information
+ */
- DBG("\tAttribute %02x (%d bytes, flags %02x)\n", code, len, flags);
+ if (len < 3)
+ bgp_parse_error(s, 9);
+
+ s->mp_unreach_af = get_af3(data);
+ s->mp_unreach_len = len - 3;
+ s->mp_unreach_nlri = data + 3;
+}
- if (len < 256)
- {
- *dst++ = flags;
- *dst++ = code;
- *dst++ = len;
- wlen = 3;
- }
- else
- {
- *dst++ = flags | BAF_EXT_LEN;
- *dst++ = code;
- put_u16(dst, len);
- wlen = 4;
- }
- return wlen;
+static void
+bgp_export_ext_community(struct bgp_export_state *s, eattr *a)
+{
+ if (a->u.ptr->length == 0)
+ UNSET(a);
+
+ a->u.ptr = ec_set_sort(s->pool, a->u.ptr);
}
static void
-aggregator_convert_to_old(struct adata *aggr, byte *dst, int *new_used)
+bgp_decode_ext_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
- byte *src = aggr->data;
- *new_used = 0;
+ if (!len || (len % 8))
+ WITHDRAW(BAD_LENGTH, "EXT_COMMUNITY", len);
- u32 as = get_u32(src);
- if (as > 0xFFFF)
- {
- as = AS_TRANS;
- *new_used = 1;
- }
- put_u16(dst, as);
+ struct adata *ad = lp_alloc_adata(s->pool, len);
+ get_u32s(data, (u32 *) ad->data, len / 4);
+ bgp_set_attr_ptr(to, s->pool, BA_EXT_COMMUNITY, flags, ad);
+}
- /* Copy IPv4 address */
- memcpy(dst + 2, src + 4, 4);
+
+static void
+bgp_decode_as4_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
+{
+ if (s->as4_session)
+ DISCARD(NEW_BGP, "AS4_AGGREGATOR");
+
+ if (len != 8)
+ DISCARD(BAD_LENGTH, "AS4_AGGREGATOR", len);
+
+ bgp_set_attr_data(to, s->pool, BA_AS4_AGGREGATOR, flags, data, len);
}
static void
-aggregator_convert_to_new(struct adata *aggr, byte *dst)
+bgp_decode_as4_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
- byte *src = aggr->data;
+ char err[128];
- u32 as = get_u16(src);
- put_u32(dst, as);
+ if (s->as4_session)
+ DISCARD(NEW_BGP, "AS4_PATH");
- /* Copy IPv4 address */
- memcpy(dst + 4, src + 2, 4);
+ if (len < 6)
+ DISCARD(BAD_LENGTH, "AS4_PATH", len);
+
+ if (!as_path_valid(data, len, 4, 1, err, sizeof(err)))
+ DISCARD("Malformed AS4_PATH attribute - %s", err);
+
+ struct adata *a = lp_alloc_adata(s->pool, len);
+ memcpy(a->data, data, len);
+
+ /* AS_CONFED* segments are invalid in AS4_PATH; RFC 6793 6 */
+ if (as_path_contains_confed(a))
+ {
+ REPORT("Discarding AS_CONFED* segment from AS4_PATH attribute");
+ a = as_path_strip_confed(s->pool, a);
+ }
+
+ bgp_set_attr_ptr(to, s->pool, BA_AS4_PATH, flags, a);
+}
+
+static void
+bgp_export_large_community(struct bgp_export_state *s, eattr *a)
+{
+ if (a->u.ptr->length == 0)
+ UNSET(a);
+
+ a->u.ptr = lc_set_sort(s->pool, a->u.ptr);
+}
+
+static void
+bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
+{
+ if (!len || (len % 12))
+ WITHDRAW(BAD_LENGTH, "LARGE_COMMUNITY", len);
+
+ struct adata *ad = lp_alloc_adata(s->pool, len);
+ get_u32s(data, (u32 *) ad->data, len / 4);
+ bgp_set_attr_ptr(to, s->pool, BA_LARGE_COMMUNITY, flags, ad);
+}
+
+static void
+bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a)
+{
+ net_addr *n = s->route->net->n.addr;
+ u32 *labels = (u32 *) a->u.ptr->data;
+ uint lnum = a->u.ptr->length / 4;
+
+ /* Perhaps we should just ignore it? */
+ if (!s->mpls)
+ WITHDRAW("Unexpected MPLS stack");
+
+ /* Empty MPLS stack is not allowed */
+ if (!lnum)
+ WITHDRAW("Malformed MPLS stack - empty");
+
+ /* This is ugly, but we must ensure that labels fit into NLRI field */
+ if ((24*lnum + (net_is_vpn(n) ? 64 : 0) + net_pxlen(n)) > 255)
+ WITHDRAW("Malformed MPLS stack - too many labels (%u)", lnum);
+
+ for (uint i = 0; i < lnum; i++)
+ {
+ if (labels[i] > 0xfffff)
+ WITHDRAW("Malformed MPLS stack - invalid label (%u)", labels[i]);
+
+ /* TODO: Check for special-purpose label values? */
+ }
}
static int
-bgp_get_attr_len(eattr *a)
+bgp_encode_mpls_label_stack(struct bgp_write_state *s, eattr *a, byte *buf UNUSED, uint size UNUSED)
{
- int len;
- if (ATTR_KNOWN(EA_ID(a->id)))
+ /*
+ * MPLS labels are encoded as a part of the NLRI in MP_REACH_NLRI attribute,
+ * so we store MPLS_LABEL_STACK and encode it later by AFI-specific hooks.
+ */
+
+ s->mpls_labels = a->u.ptr;
+ return 0;
+}
+
+static void
+bgp_decode_mpls_label_stack(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data UNUSED, uint len UNUSED, ea_list **to UNUSED)
+{
+ DISCARD("Discarding received attribute #0");
+}
+
+static void
+bgp_format_mpls_label_stack(eattr *a, byte *buf, uint size)
+{
+ u32 *labels = (u32 *) a->u.ptr->data;
+ uint lnum = a->u.ptr->length / 4;
+ char *pos = buf;
+
+ for (uint i = 0; i < lnum; i++)
+ {
+ if (size < 20)
{
- int code = EA_ID(a->id);
- struct attr_desc *desc = &bgp_attr_table[code];
- len = desc->expected_length;
- if (len < 0)
- {
- ASSERT(!(a->type & EAF_EMBEDDED));
- len = a->u.ptr->length;
- }
+ bsprintf(pos, "...");
+ return;
}
+
+ uint l = bsprintf(pos, "%d/", labels[i]);
+ ADVANCE(pos, size, l);
+ }
+
+ /* Clear last slash or terminate empty string */
+ pos[lnum ? -1 : 0] = 0;
+}
+
+static inline void
+bgp_decode_unknown(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to)
+{
+ bgp_set_attr_data(to, s->pool, code, flags, data, len);
+}
+
+
+/*
+ * Attribute table
+ */
+
+static const struct bgp_attr_desc bgp_attr_table[] = {
+ [BA_ORIGIN] = {
+ .name = "origin",
+ .type = EAF_TYPE_INT,
+ .flags = BAF_TRANSITIVE,
+ .export = bgp_export_origin,
+ .encode = bgp_encode_u8,
+ .decode = bgp_decode_origin,
+ .format = bgp_format_origin,
+ },
+ [BA_AS_PATH] = {
+ .name = "as_path",
+ .type = EAF_TYPE_AS_PATH,
+ .flags = BAF_TRANSITIVE,
+ .encode = bgp_encode_as_path,
+ .decode = bgp_decode_as_path,
+ },
+ [BA_NEXT_HOP] = {
+ .name = "next_hop",
+ .type = EAF_TYPE_IP_ADDRESS,
+ .flags = BAF_TRANSITIVE,
+ .encode = bgp_encode_next_hop,
+ .decode = bgp_decode_next_hop,
+ .format = bgp_format_next_hop,
+ },
+ [BA_MULTI_EXIT_DISC] = {
+ .name = "med",
+ .type = EAF_TYPE_INT,
+ .flags = BAF_OPTIONAL,
+ .encode = bgp_encode_u32,
+ .decode = bgp_decode_med,
+ },
+ [BA_LOCAL_PREF] = {
+ .name = "local_pref",
+ .type = EAF_TYPE_INT,
+ .flags = BAF_TRANSITIVE,
+ .export = bgp_export_local_pref,
+ .encode = bgp_encode_u32,
+ .decode = bgp_decode_local_pref,
+ },
+ [BA_ATOMIC_AGGR] = {
+ .name = "atomic_aggr",
+ .type = EAF_TYPE_OPAQUE,
+ .flags = BAF_TRANSITIVE,
+ .encode = bgp_encode_raw,
+ .decode = bgp_decode_atomic_aggr,
+ },
+ [BA_AGGREGATOR] = {
+ .name = "aggregator",
+ .type = EAF_TYPE_OPAQUE,
+ .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
+ .encode = bgp_encode_aggregator,
+ .decode = bgp_decode_aggregator,
+ .format = bgp_format_aggregator,
+ },
+ [BA_COMMUNITY] = {
+ .name = "community",
+ .type = EAF_TYPE_INT_SET,
+ .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
+ .export = bgp_export_community,
+ .encode = bgp_encode_u32s,
+ .decode = bgp_decode_community,
+ },
+ [BA_ORIGINATOR_ID] = {
+ .name = "originator_id",
+ .type = EAF_TYPE_ROUTER_ID,
+ .flags = BAF_OPTIONAL,
+ .export = bgp_export_originator_id,
+ .encode = bgp_encode_u32,
+ .decode = bgp_decode_originator_id,
+ },
+ [BA_CLUSTER_LIST] = {
+ .name = "cluster_list",
+ .type = EAF_TYPE_INT_SET,
+ .flags = BAF_OPTIONAL,
+ .export = bgp_export_cluster_list,
+ .encode = bgp_encode_u32s,
+ .decode = bgp_decode_cluster_list,
+ .format = bgp_format_cluster_list,
+ },
+ [BA_MP_REACH_NLRI] = {
+ .name = "mp_reach_nlri",
+ .type = EAF_TYPE_OPAQUE,
+ .flags = BAF_OPTIONAL,
+ .decode = bgp_decode_mp_reach_nlri,
+ },
+ [BA_MP_UNREACH_NLRI] = {
+ .name = "mp_unreach_nlri",
+ .type = EAF_TYPE_OPAQUE,
+ .flags = BAF_OPTIONAL,
+ .decode = bgp_decode_mp_unreach_nlri,
+ },
+ [BA_EXT_COMMUNITY] = {
+ .name = "ext_community",
+ .type = EAF_TYPE_EC_SET,
+ .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
+ .export = bgp_export_ext_community,
+ .encode = bgp_encode_u32s,
+ .decode = bgp_decode_ext_community,
+ },
+ [BA_AS4_PATH] = {
+ .name = "as4_path",
+ .type = EAF_TYPE_AS_PATH,
+ .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
+ .encode = bgp_encode_raw,
+ .decode = bgp_decode_as4_path,
+ },
+ [BA_AS4_AGGREGATOR] = {
+ .name = "as4_aggregator",
+ .type = EAF_TYPE_OPAQUE,
+ .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
+ .encode = bgp_encode_raw,
+ .decode = bgp_decode_as4_aggregator,
+ .format = bgp_format_aggregator,
+ },
+ [BA_LARGE_COMMUNITY] = {
+ .name = "large_community",
+ .type = EAF_TYPE_LC_SET,
+ .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
+ .export = bgp_export_large_community,
+ .encode = bgp_encode_u32s,
+ .decode = bgp_decode_large_community,
+ },
+ [BA_MPLS_LABEL_STACK] = {
+ .name = "mpls_label_stack",
+ .type = EAF_TYPE_INT_SET,
+ .export = bgp_export_mpls_label_stack,
+ .encode = bgp_encode_mpls_label_stack,
+ .decode = bgp_decode_mpls_label_stack,
+ .format = bgp_format_mpls_label_stack,
+ },
+};
+
+static inline int
+bgp_attr_known(uint code)
+{
+ return (code < ARRAY_SIZE(bgp_attr_table)) && bgp_attr_table[code].name;
+}
+
+
+/*
+ * Attribute export
+ */
+
+static inline void
+bgp_export_attr(struct bgp_export_state *s, eattr *a, ea_list *to)
+{
+ if (EA_PROTO(a->id) != EAP_BGP)
+ return;
+
+ uint code = EA_ID(a->id);
+
+ if (bgp_attr_known(code))
+ {
+ const struct bgp_attr_desc *desc = &bgp_attr_table[code];
+
+ /* The flags might have been zero if the attr was added by filters */
+ a->flags = (a->flags & BAF_PARTIAL) | desc->flags;
+
+ /* Set partial bit if new opt-trans attribute is attached to non-local route */
+ if ((s->src != NULL) && (a->type & EAF_ORIGINATED) &&
+ (a->flags & BAF_OPTIONAL) && (a->flags & BAF_TRANSITIVE))
+ a->flags |= BAF_PARTIAL;
+
+ /* Call specific hook */
+ CALL(desc->export, s, a);
+
+ /* Attribute might become undefined in hook */
+ if ((a->type & EAF_TYPE_MASK) == EAF_TYPE_UNDEF)
+ return;
+ }
else
- {
- ASSERT((a->type & EAF_TYPE_MASK) == EAF_TYPE_OPAQUE);
- len = a->u.ptr->length;
- }
-
- return len;
+ {
+ /* Don't re-export unknown non-transitive attributes */
+ if (!(a->flags & BAF_TRANSITIVE))
+ return;
+
+ a->flags |= BAF_PARTIAL;
+ }
+
+ /* Append updated attribute */
+ to->attrs[to->count++] = *a;
}
-#define ADVANCE(w, r, l) do { r -= l; w += l; } while (0)
+/**
+ * bgp_export_attrs - export BGP attributes
+ * @s: BGP export state
+ * @attrs: a list of extended attributes
+ *
+ * The bgp_export_attrs() function takes a list of attributes and merges it to
+ * one newly allocated and sorted segment. Attributes are validated and
+ * normalized by type-specific export hooks and attribute flags are updated.
+ * Some attributes may be eliminated (e.g. unknown non-tranitive attributes, or
+ * empty community sets).
+ *
+ * Result: one sorted attribute list segment, or NULL if attributes are unsuitable.
+ */
+static inline ea_list *
+bgp_export_attrs(struct bgp_export_state *s, ea_list *attrs)
+{
+ /* Merge the attribute list */
+ ea_list *new = lp_alloc(s->pool, ea_scan(attrs));
+ ea_merge(attrs, new);
+ ea_sort(new);
+
+ uint i, count;
+ count = new->count;
+ new->count = 0;
+
+ /* Export each attribute */
+ for (i = 0; i < count; i++)
+ bgp_export_attr(s, &new->attrs[i], new);
+
+ if (s->err_withdraw)
+ return NULL;
+
+ return new;
+}
+
+
+/*
+ * Attribute encoding
+ */
+
+static inline int
+bgp_encode_attr(struct bgp_write_state *s, eattr *a, byte *buf, uint size)
+{
+ ASSERT(EA_PROTO(a->id) == EAP_BGP);
+
+ uint code = EA_ID(a->id);
+
+ if (bgp_attr_known(code))
+ return bgp_attr_table[code].encode(s, a, buf, size);
+ else
+ return bgp_encode_raw(s, a, buf, size);
+}
/**
* bgp_encode_attrs - encode BGP attributes
- * @p: BGP instance
- * @w: buffer
+ * @s: BGP write state
* @attrs: a list of extended attributes
- * @remains: remaining space in the buffer
+ * @buf: buffer
+ * @end: buffer end
*
* The bgp_encode_attrs() function takes a list of extended attributes
* and converts it to its BGP representation (a part of an Update message).
*
* Result: Length of the attribute block generated or -1 if not enough space.
*/
-uint
-bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains)
+int
+bgp_encode_attrs(struct bgp_write_state *s, ea_list *attrs, byte *buf, byte *end)
{
- uint i, code, type, flags;
- byte *start = w;
- int len, rv;
+ byte *pos = buf;
+ int i, len;
- for(i=0; i<attrs->count; i++)
- {
- eattr *a = &attrs->attrs[i];
- ASSERT(EA_PROTO(a->id) == EAP_BGP);
- code = EA_ID(a->id);
-
-#ifdef IPV6
- /* When talking multiprotocol BGP, the NEXT_HOP attributes are used only temporarily. */
- if (code == BA_NEXT_HOP)
- continue;
-#endif
-
- /* When AS4-aware BGP speaker is talking to non-AS4-aware BGP speaker,
- * we have to convert our 4B AS_PATH to 2B AS_PATH and send our AS_PATH
- * as optional AS4_PATH attribute.
- */
- if ((code == BA_AS_PATH) && (! p->as4_session))
- {
- len = a->u.ptr->length;
-
- if (remains < (len + 4))
- goto err_no_buffer;
-
- /* Using temporary buffer because don't know a length of created attr
- * and therefore a length of a header. Perhaps i should better always
- * use BAF_EXT_LEN. */
-
- byte buf[len];
- int new_used;
- int nl = as_path_convert_to_old(a->u.ptr, buf, &new_used);
-
- DBG("BGP: Encoding old AS_PATH\n");
- rv = bgp_encode_attr_hdr(w, BAF_TRANSITIVE, BA_AS_PATH, nl);
- ADVANCE(w, remains, rv);
- memcpy(w, buf, nl);
- ADVANCE(w, remains, nl);
-
- if (! new_used)
- continue;
-
- if (remains < (len + 4))
- goto err_no_buffer;
-
- /* We should discard AS_CONFED_SEQUENCE or AS_CONFED_SET path segments
- * here but we don't support confederations and such paths we already
- * discarded in bgp_check_as_path().
- */
-
- DBG("BGP: Encoding AS4_PATH\n");
- rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_PATH, len);
- ADVANCE(w, remains, rv);
- memcpy(w, a->u.ptr->data, len);
- ADVANCE(w, remains, len);
-
- continue;
- }
-
- /* The same issue with AGGREGATOR attribute */
- if ((code == BA_AGGREGATOR) && (! p->as4_session))
- {
- int new_used;
-
- len = 6;
- if (remains < (len + 3))
- goto err_no_buffer;
-
- rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AGGREGATOR, len);
- ADVANCE(w, remains, rv);
- aggregator_convert_to_old(a->u.ptr, w, &new_used);
- ADVANCE(w, remains, len);
-
- if (! new_used)
- continue;
-
- len = 8;
- if (remains < (len + 3))
- goto err_no_buffer;
-
- rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_AGGREGATOR, len);
- ADVANCE(w, remains, rv);
- memcpy(w, a->u.ptr->data, len);
- ADVANCE(w, remains, len);
-
- continue;
- }
-
- /* Standard path continues here ... */
-
- type = a->type & EAF_TYPE_MASK;
- flags = a->flags & (BAF_OPTIONAL | BAF_TRANSITIVE | BAF_PARTIAL);
- len = bgp_get_attr_len(a);
-
- /* Skip empty sets */
- if (((type == EAF_TYPE_INT_SET) || (type == EAF_TYPE_EC_SET) || (type == EAF_TYPE_LC_SET)) && (len == 0))
- continue;
-
- if (remains < len + 4)
- goto err_no_buffer;
-
- rv = bgp_encode_attr_hdr(w, flags, code, len);
- ADVANCE(w, remains, rv);
-
- switch (type)
- {
- case EAF_TYPE_INT:
- case EAF_TYPE_ROUTER_ID:
- if (len == 4)
- put_u32(w, a->u.data);
- else
- *w = a->u.data;
- break;
- case EAF_TYPE_IP_ADDRESS:
- {
- ip_addr ip = *(ip_addr *)a->u.ptr->data;
- ipa_hton(ip);
- memcpy(w, &ip, len);
- break;
- }
- case EAF_TYPE_INT_SET:
- case EAF_TYPE_LC_SET:
- case EAF_TYPE_EC_SET:
- {
- u32 *z = int_set_get_data(a->u.ptr);
- int i;
- for(i=0; i<len; i+=4)
- put_u32(w+i, *z++);
- break;
- }
- case EAF_TYPE_OPAQUE:
- case EAF_TYPE_AS_PATH:
- memcpy(w, a->u.ptr->data, len);
- break;
- default:
- bug("bgp_encode_attrs: unknown attribute type %02x", a->type);
- }
- ADVANCE(w, remains, len);
- }
- return w - start;
+ for (i = 0; i < attrs->count; i++)
+ {
+ len = bgp_encode_attr(s, &attrs->attrs[i], pos, end - pos);
- err_no_buffer:
- return -1;
+ if (len < 0)
+ return -1;
+
+ pos += len;
+ }
+
+ return pos - buf;
}
+
/*
-static void
-bgp_init_prefix(struct fib_node *N)
+ * Attribute decoding
+ */
+
+static void bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool);
+
+static inline int
+bgp_as_path_loopy(struct bgp_proto *p, ea_list *attrs, u32 asn)
{
- struct bgp_prefix *p = (struct bgp_prefix *) N;
- p->bucket_node.next = NULL;
+ eattr *e = bgp_find_attr(attrs, BA_AS_PATH);
+ int num = p->cf->allow_local_as + 1;
+ return (e && (num > 0) && as_path_contains(e->u.ptr, asn, num));
}
-*/
-static int
-bgp_compare_u32(const u32 *x, const u32 *y)
+static inline int
+bgp_originator_id_loopy(struct bgp_proto *p, ea_list *attrs)
{
- return (*x < *y) ? -1 : (*x > *y) ? 1 : 0;
+ eattr *e = bgp_find_attr(attrs, BA_ORIGINATOR_ID);
+ return (e && (e->u.data == p->local_id));
}
-static inline void
-bgp_normalize_int_set(u32 *dest, u32 *src, unsigned cnt)
+static inline int
+bgp_cluster_list_loopy(struct bgp_proto *p, ea_list *attrs)
{
- memcpy(dest, src, sizeof(u32) * cnt);
- qsort(dest, cnt, sizeof(u32), (int(*)(const void *, const void *)) bgp_compare_u32);
+ eattr *e = bgp_find_attr(attrs, BA_CLUSTER_LIST);
+ return (e && int_set_contains(e->u.ptr, p->rr_cluster_id));
}
-static int
-bgp_compare_ec(const u32 *xp, const u32 *yp)
+static inline void
+bgp_decode_attr(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to)
{
- u64 x = ec_get(xp, 0);
- u64 y = ec_get(yp, 0);
- return (x < y) ? -1 : (x > y) ? 1 : 0;
+ /* Handle duplicate attributes; RFC 7606 3 (g) */
+ if (BIT32_TEST(s->attrs_seen, code))
+ {
+ if ((code == BA_MP_REACH_NLRI) || (code == BA_MP_UNREACH_NLRI))
+ bgp_parse_error(s, 1);
+ else
+ DISCARD("Discarding duplicate attribute (code %u)", code);
+ }
+ BIT32_SET(s->attrs_seen, code);
+
+ if (bgp_attr_known(code))
+ {
+ const struct bgp_attr_desc *desc = &bgp_attr_table[code];
+
+ /* Handle conflicting flags; RFC 7606 3 (c) */
+ if ((flags ^ desc->flags) & (BAF_OPTIONAL | BAF_TRANSITIVE))
+ WITHDRAW("Malformed %s attribute - conflicting flags (%02x)", desc->name, flags);
+
+ desc->decode(s, code, flags, data, len, to);
+ }
+ else /* Unknown attribute */
+ {
+ if (!(flags & BAF_OPTIONAL))
+ WITHDRAW("Unknown attribute (code %u) - conflicting flags (%02x)", code, flags);
+
+ bgp_decode_unknown(s, code, flags, data, len, to);
+ }
}
-static inline void
-bgp_normalize_ec_set(struct adata *ad, u32 *src, int internal)
+/**
+ * bgp_decode_attrs - check and decode BGP attributes
+ * @s: BGP parse state
+ * @data: start of attribute block
+ * @len: length of attribute block
+ *
+ * This function takes a BGP attribute block (a part of an Update message), checks
+ * its consistency and converts it to a list of BIRD route attributes represented
+ * by an (uncached) &rta.
+ */
+ea_list *
+bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len)
{
- u32 *dst = int_set_get_data(ad);
+ struct bgp_proto *p = s->proto;
+ ea_list *attrs = NULL;
+ uint code, flags, alen;
+ byte *pos = data;
+
+ /* Parse the attributes */
+ while (len)
+ {
+ alen = 0;
- /* Remove non-transitive communities (EC_TBIT active) on external sessions */
- if (! internal)
+ /* Read attribute type */
+ if (len < 2)
+ goto framing_error;
+ flags = pos[0];
+ code = pos[1];
+ ADVANCE(pos, len, 2);
+
+ /* Read attribute length */
+ if (flags & BAF_EXT_LEN)
{
- int len = int_set_get_size(ad);
- u32 *t = dst;
- int i;
-
- for (i=0; i < len; i += 2)
- {
- if (src[i] & EC_TBIT)
- continue;
-
- *t++ = src[i];
- *t++ = src[i+1];
- }
-
- ad->length = (t - dst) * 4;
+ if (len < 2)
+ goto framing_error;
+ alen = get_u16(pos);
+ ADVANCE(pos, len, 2);
+ }
+ else
+ {
+ if (len < 1)
+ goto framing_error;
+ alen = *pos;
+ ADVANCE(pos, len, 1);
}
- else
- memcpy(dst, src, ad->length);
- qsort(dst, ad->length / 8, 8, (int(*)(const void *, const void *)) bgp_compare_ec);
-}
+ if (alen > len)
+ goto framing_error;
-static int
-bgp_compare_lc(const u32 *x, const u32 *y)
-{
- if (x[0] != y[0])
- return (x[0] > y[0]) ? 1 : -1;
- if (x[1] != y[1])
- return (x[1] > y[1]) ? 1 : -1;
- if (x[2] != y[2])
- return (x[2] > y[2]) ? 1 : -1;
- return 0;
+ DBG("Attr %02x %02x %u\n", code, flags, alen);
+
+ bgp_decode_attr(s, code, flags, pos, alen, &attrs);
+ ADVANCE(pos, len, alen);
+ }
+
+ if (s->err_withdraw)
+ goto withdraw;
+
+ /* If there is no reachability NLRI, we are finished */
+ if (!s->ip_reach_len && !s->mp_reach_len)
+ return NULL;
+
+
+ /* Handle missing mandatory attributes; RFC 7606 3 (d) */
+ if (!BIT32_TEST(s->attrs_seen, BA_ORIGIN))
+ { REPORT(NO_MANDATORY, "ORIGIN"); goto withdraw; }
+
+ if (!BIT32_TEST(s->attrs_seen, BA_AS_PATH))
+ { REPORT(NO_MANDATORY, "AS_PATH"); goto withdraw; }
+
+ /* When receiving attributes from non-AS4-aware BGP speaker, we have to
+ reconstruct AS_PATH and AGGREGATOR attributes; RFC 6793 4.2.3 */
+ if (!p->as4_session)
+ bgp_process_as4_attrs(&attrs, s->pool);
+
+ /* Reject routes with our ASN in AS_PATH attribute */
+ if (bgp_as_path_loopy(p, attrs, p->local_as))
+ goto withdraw;
+
+ /* Reject routes with our Confederation ID in AS_PATH attribute; RFC 5065 4.0 */
+ if ((p->public_as != p->local_as) && bgp_as_path_loopy(p, attrs, p->public_as))
+ goto withdraw;
+
+ /* Reject routes with our Router ID in ORIGINATOR_ID attribute; RFC 4456 8 */
+ if (p->is_internal && bgp_originator_id_loopy(p, attrs))
+ goto withdraw;
+
+ /* Reject routes with our Cluster ID in CLUSTER_LIST attribute; RFC 4456 8 */
+ if (p->rr_client && bgp_cluster_list_loopy(p, attrs))
+ goto withdraw;
+
+ /* If there is no local preference, define one */
+ if (!BIT32_TEST(s->attrs_seen, BA_LOCAL_PREF))
+ bgp_set_attr_u32(&attrs, s->pool, BA_LOCAL_PREF, 0, p->cf->default_local_pref);
+
+ return attrs;
+
+
+framing_error:
+ /* RFC 7606 4 - handle attribute framing errors */
+ REPORT("Malformed attribute list - framing error (%u/%u) at %d",
+ alen, len, (int) (pos - s->attrs));
+
+withdraw:
+ /* RFC 7606 5.2 - handle missing NLRI during errors */
+ if (!s->ip_reach_len && !s->mp_reach_len)
+ bgp_parse_error(s, 1);
+
+ s->err_withdraw = 1;
+ return NULL;
}
-static inline void
-bgp_normalize_lc_set(u32 *dest, u32 *src, unsigned cnt)
+
+/*
+ * Route bucket hash table
+ */
+
+#define RBH_KEY(b) b->eattrs, b->hash
+#define RBH_NEXT(b) b->next
+#define RBH_EQ(a1,h1,a2,h2) h1 == h2 && ea_same(a1, a2)
+#define RBH_FN(a,h) h
+
+#define RBH_REHASH bgp_rbh_rehash
+#define RBH_PARAMS /8, *2, 2, 2, 8, 20
+
+
+HASH_DEFINE_REHASH_FN(RBH, struct bgp_bucket)
+
+void
+bgp_init_bucket_table(struct bgp_channel *c)
{
- memcpy(dest, src, LCOMM_LENGTH * cnt);
- qsort(dest, cnt, LCOMM_LENGTH, (int(*)(const void *, const void *)) bgp_compare_lc);
-}
+ HASH_INIT(c->bucket_hash, c->pool, 8);
-static void
-bgp_rehash_buckets(struct bgp_proto *p)
-{
- struct bgp_bucket **old = p->bucket_hash;
- struct bgp_bucket **new;
- unsigned oldn = p->hash_size;
- unsigned i, e, mask;
- struct bgp_bucket *b;
-
- p->hash_size = p->hash_limit;
- DBG("BGP: Rehashing bucket table from %d to %d\n", oldn, p->hash_size);
- p->hash_limit *= 4;
- if (p->hash_limit >= 65536)
- p->hash_limit = ~0;
- new = p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *));
- mask = p->hash_size - 1;
- for (i=0; i<oldn; i++)
- while (b = old[i])
- {
- old[i] = b->hash_next;
- e = b->hash & mask;
- b->hash_next = new[e];
- if (b->hash_next)
- b->hash_next->hash_prev = b;
- b->hash_prev = NULL;
- new[e] = b;
- }
- mb_free(old);
+ init_list(&c->bucket_queue);
+ c->withdraw_bucket = NULL;
}
static struct bgp_bucket *
-bgp_new_bucket(struct bgp_proto *p, ea_list *new, unsigned hash)
+bgp_get_bucket(struct bgp_channel *c, ea_list *new)
{
- struct bgp_bucket *b;
- unsigned ea_size = sizeof(ea_list) + new->count * sizeof(eattr);
- unsigned ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN);
- unsigned size = sizeof(struct bgp_bucket) + ea_size_aligned;
- unsigned i;
+ /* Hash and lookup */
+ u32 hash = ea_hash(new);
+ struct bgp_bucket *b = HASH_FIND(c->bucket_hash, RBH, new, hash);
+
+ if (b)
+ return b;
+
+ uint ea_size = sizeof(ea_list) + new->count * sizeof(eattr);
+ uint ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN);
+ uint size = sizeof(struct bgp_bucket) + ea_size_aligned;
+ uint i;
byte *dest;
- unsigned index = hash & (p->hash_size - 1);
/* Gather total size of non-inline attributes */
- for (i=0; i<new->count; i++)
- {
- eattr *a = &new->attrs[i];
- if (!(a->type & EAF_EMBEDDED))
- size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN);
- }
+ for (i = 0; i < new->count; i++)
+ {
+ eattr *a = &new->attrs[i];
- /* Create the bucket and hash it */
- b = mb_alloc(p->p.pool, size);
- b->hash_next = p->bucket_hash[index];
- if (b->hash_next)
- b->hash_next->hash_prev = b;
- p->bucket_hash[index] = b;
- b->hash_prev = NULL;
- b->hash = hash;
- add_tail(&p->bucket_queue, &b->send_node);
+ if (!(a->type & EAF_EMBEDDED))
+ size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN);
+ }
+
+ /* Create the bucket */
+ b = mb_alloc(c->pool, size);
init_list(&b->prefixes);
+ b->hash = hash;
+
+ /* Copy list of extended attributes */
memcpy(b->eattrs, new, ea_size);
- dest = ((byte *)b->eattrs) + ea_size_aligned;
+ dest = ((byte *) b->eattrs) + ea_size_aligned;
/* Copy values of non-inline attributes */
- for (i=0; i<new->count; i++)
+ for (i = 0; i < new->count; i++)
+ {
+ eattr *a = &b->eattrs->attrs[i];
+
+ if (!(a->type & EAF_EMBEDDED))
{
- eattr *a = &b->eattrs->attrs[i];
- if (!(a->type & EAF_EMBEDDED))
- {
- struct adata *oa = a->u.ptr;
- struct adata *na = (struct adata *) dest;
- memcpy(na, oa, sizeof(struct adata) + oa->length);
- a->u.ptr = na;
- dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN);
- }
+ struct adata *oa = a->u.ptr;
+ struct adata *na = (struct adata *) dest;
+ memcpy(na, oa, sizeof(struct adata) + oa->length);
+ a->u.ptr = na;
+ dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN);
}
+ }
- /* If needed, rehash */
- p->hash_count++;
- if (p->hash_count > p->hash_limit)
- bgp_rehash_buckets(p);
+ /* Insert the bucket to send queue and bucket hash */
+ add_tail(&c->bucket_queue, &b->send_node);
+ HASH_INSERT2(c->bucket_hash, RBH, c->pool, b);
return b;
}
static struct bgp_bucket *
-bgp_get_bucket(struct bgp_proto *p, net *n, ea_list *attrs, int originate)
+bgp_get_withdraw_bucket(struct bgp_channel *c)
{
- ea_list *new;
- unsigned i, cnt, hash, code;
- eattr *a, *d;
- u32 seen = 0;
- struct bgp_bucket *b;
-
- /* Merge the attribute list */
- new = alloca(ea_scan(attrs));
- ea_merge(attrs, new);
- ea_sort(new);
+ if (!c->withdraw_bucket)
+ {
+ c->withdraw_bucket = mb_allocz(c->pool, sizeof(struct bgp_bucket));
+ init_list(&c->withdraw_bucket->prefixes);
+ }
- /* Normalize attributes */
- d = new->attrs;
- cnt = new->count;
- new->count = 0;
- for(i=0; i<cnt; i++)
- {
- a = &new->attrs[i];
- if (EA_PROTO(a->id) != EAP_BGP)
- continue;
- code = EA_ID(a->id);
- if (ATTR_KNOWN(code))
- {
- if (!p->is_internal)
- {
- if (!bgp_attr_table[code].allow_in_ebgp)
- continue;
- if ((code == BA_LOCAL_PREF) && !p->cf->allow_local_pref)
- continue;
- }
- /* The flags might have been zero if the attr was added by filters */
- a->flags = (a->flags & BAF_PARTIAL) | bgp_attr_table[code].expected_flags;
- if (code < 32)
- seen |= 1 << code;
- }
- else
- {
- /* Don't re-export unknown non-transitive attributes */
- if (!(a->flags & BAF_TRANSITIVE))
- continue;
- }
- *d = *a;
- if ((d->type & EAF_ORIGINATED) && !originate && (d->flags & BAF_TRANSITIVE) && (d->flags & BAF_OPTIONAL))
- d->flags |= BAF_PARTIAL;
- switch (d->type & EAF_TYPE_MASK)
- {
- case EAF_TYPE_INT_SET:
- {
- struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length);
- z->length = d->u.ptr->length;
- bgp_normalize_int_set((u32 *) z->data, (u32 *) d->u.ptr->data, z->length / 4);
- d->u.ptr = z;
- break;
- }
- case EAF_TYPE_EC_SET:
- {
- struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length);
- z->length = d->u.ptr->length;
- bgp_normalize_ec_set(z, (u32 *) d->u.ptr->data, p->is_internal);
- d->u.ptr = z;
- break;
- }
- case EAF_TYPE_LC_SET:
- {
- struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length);
- z->length = d->u.ptr->length;
- bgp_normalize_lc_set((u32 *) z->data, (u32 *) d->u.ptr->data, z->length / LCOMM_LENGTH);
- d->u.ptr = z;
- break;
- }
- default: ;
- }
- d++;
- new->count++;
- }
+ return c->withdraw_bucket;
+}
- /* Hash */
- hash = ea_hash(new);
- for(b=p->bucket_hash[hash & (p->hash_size - 1)]; b; b=b->hash_next)
- if (b->hash == hash && ea_same(b->eattrs, new))
- {
- DBG("Found bucket.\n");
- return b;
- }
-
- /* Ensure that there are all mandatory attributes */
- for(i=0; i<ARRAY_SIZE(bgp_mandatory_attrs); i++)
- if (!(seen & (1 << bgp_mandatory_attrs[i])))
- {
- log(L_ERR "%s: Mandatory attribute %s missing in route %I/%d", p->p.name, bgp_attr_table[bgp_mandatory_attrs[i]].name, n->n.prefix, n->n.pxlen);
- return NULL;
- }
-
- /* Check if next hop is valid */
- a = ea_find(new, EA_CODE(EAP_BGP, BA_NEXT_HOP));
- if (!a || ipa_equal(p->cf->remote_ip, *(ip_addr *)a->u.ptr->data))
- {
- log(L_ERR "%s: Invalid NEXT_HOP attribute in route %I/%d", p->p.name, n->n.prefix, n->n.pxlen);
- return NULL;
- }
+void
+bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b)
+{
+ rem_node(&b->send_node);
+ HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b);
+ mb_free(b);
+}
- /* Create new bucket */
- DBG("Creating bucket.\n");
- return bgp_new_bucket(p, new, hash);
+void
+bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b)
+{
+ rem_node(&b->send_node);
+ add_tail(&c->bucket_queue, &b->send_node);
}
void
-bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck)
+bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b)
{
- if (buck->hash_next)
- buck->hash_next->hash_prev = buck->hash_prev;
- if (buck->hash_prev)
- buck->hash_prev->hash_next = buck->hash_next;
- else
- p->bucket_hash[buck->hash & (p->hash_size-1)] = buck->hash_next;
- mb_free(buck);
+ struct bgp_proto *p = (void *) c->c.proto;
+ struct bgp_bucket *wb = bgp_get_withdraw_bucket(c);
+
+ log(L_ERR "%s: Attribute list too long", p->p.name);
+ while (!EMPTY_LIST(b->prefixes))
+ {
+ struct bgp_prefix *px = HEAD(b->prefixes);
+
+ log(L_ERR "%s: - withdrawing %N", p->p.name, &px->net);
+ rem_node(&px->buck_node);
+ add_tail(&wb->prefixes, &px->buck_node);
+ }
}
-/* Prefix hash table */
+/*
+ * Prefix hash table
+ */
-#define PXH_KEY(n1) n1->n.prefix, n1->n.pxlen, n1->path_id
-#define PXH_NEXT(n) n->next
-#define PXH_EQ(p1,l1,i1,p2,l2,i2) ipa_equal(p1, p2) && l1 == l2 && i1 == i2
-#define PXH_FN(p,l,i) ipa_hash32(p) ^ u32_hash((l << 16) ^ i)
+#define PXH_KEY(px) px->net, px->path_id, px->hash
+#define PXH_NEXT(px) px->next
+#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && i1 == i2 && net_equal(n1, n2)
+#define PXH_FN(n,i,h) h
#define PXH_REHASH bgp_pxh_rehash
#define PXH_PARAMS /8, *2, 2, 2, 8, 20
@@ -931,308 +1292,282 @@ bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck)
HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix)
void
-bgp_init_prefix_table(struct bgp_proto *p, u32 order)
+bgp_init_prefix_table(struct bgp_channel *c)
{
- HASH_INIT(p->prefix_hash, p->p.pool, order);
+ HASH_INIT(c->prefix_hash, c->pool, 8);
- p->prefix_slab = sl_new(p->p.pool, sizeof(struct bgp_prefix));
+ uint alen = net_addr_length[c->c.net_type];
+ c->prefix_slab = alen ? sl_new(c->pool, sizeof(struct bgp_prefix) + alen) : NULL;
}
void
-bgp_free_prefix_table(struct bgp_proto *p)
+bgp_free_prefix_table(struct bgp_channel *c)
{
- HASH_FREE(p->prefix_hash);
+ HASH_FREE(c->prefix_hash);
- rfree(p->prefix_slab);
- p->prefix_slab = NULL;
+ rfree(c->prefix_slab);
+ c->prefix_slab = NULL;
}
static struct bgp_prefix *
-bgp_get_prefix(struct bgp_proto *p, ip_addr prefix, int pxlen, u32 path_id)
+bgp_get_prefix(struct bgp_channel *c, net_addr *net, u32 path_id)
{
- struct bgp_prefix *bp = HASH_FIND(p->prefix_hash, PXH, prefix, pxlen, path_id);
+ u32 hash = net_hash(net) ^ u32_hash(path_id);
+ struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id, hash);
+
+ if (px)
+ {
+ rem_node(&px->buck_node);
+ return px;
+ }
- if (bp)
- return bp;
+ if (c->prefix_slab)
+ px = sl_alloc(c->prefix_slab);
+ else
+ px = mb_alloc(c->pool, sizeof(struct bgp_prefix) + net->length);
- bp = sl_alloc(p->prefix_slab);
- bp->n.prefix = prefix;
- bp->n.pxlen = pxlen;
- bp->path_id = path_id;
- bp->bucket_node.next = NULL;
+ px->buck_node.next = NULL;
+ px->buck_node.prev = NULL;
+ px->hash = hash;
+ px->path_id = path_id;
+ net_copy(px->net, net);
- HASH_INSERT2(p->prefix_hash, PXH, p->p.pool, bp);
+ HASH_INSERT2(c->prefix_hash, PXH, c->pool, px);
- return bp;
+ return px;
}
void
-bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp)
+bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px)
{
- HASH_REMOVE2(p->prefix_hash, PXH, p->p.pool, bp);
- sl_free(p->prefix_slab, bp);
+ rem_node(&px->buck_node);
+ HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px);
+
+ if (c->prefix_slab)
+ sl_free(c->prefix_slab, px);
+ else
+ mb_free(px);
}
-void
-bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs)
+/*
+ * BGP protocol glue
+ */
+
+int
+bgp_import_control(struct proto *P, rte **new, ea_list **attrs UNUSED, struct linpool *pool UNUSED)
{
+ rte *e = *new;
+ struct proto *SRC = e->attrs->src->proto;
struct bgp_proto *p = (struct bgp_proto *) P;
- struct bgp_bucket *buck;
- struct bgp_prefix *px;
- rte *key;
- u32 path_id;
+ struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (struct bgp_proto *) SRC : NULL;
- DBG("BGP: Got route %I/%d %s\n", n->n.prefix, n->n.pxlen, new ? "up" : "down");
+ /* Reject our routes */
+ if (src == p)
+ return -1;
- if (new)
- {
- key = new;
- buck = bgp_get_bucket(p, n, attrs, new->attrs->source != RTS_BGP);
- if (!buck) /* Inconsistent attribute list */
- return;
- }
- else
- {
- key = old;
- if (!(buck = p->withdraw_bucket))
- {
- buck = p->withdraw_bucket = mb_alloc(P->pool, sizeof(struct bgp_bucket));
- init_list(&buck->prefixes);
- }
- }
- path_id = p->add_path_tx ? key->attrs->src->global_id : 0;
- px = bgp_get_prefix(p, n->n.prefix, n->n.pxlen, path_id);
- if (px->bucket_node.next)
- {
- DBG("\tRemoving old entry.\n");
- rem_node(&px->bucket_node);
- }
- add_tail(&buck->prefixes, &px->bucket_node);
- bgp_schedule_packet(p->conn, PKT_UPDATE);
-}
+ /* Accept non-BGP routes */
+ if (src == NULL)
+ return 0;
-static int
-bgp_create_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool)
-{
- ea_list *ea = lp_alloc(pool, sizeof(ea_list) + 4*sizeof(eattr));
- rta *rta = e->attrs;
- byte *z;
+ // XXXX: Check next hop AF
- ea->next = *attrs;
- *attrs = ea;
- ea->flags = EALF_SORTED;
- ea->count = 4;
+ /* IBGP route reflection, RFC 4456 */
+ if (p->is_internal && src->is_internal && (p->local_as == src->local_as))
+ {
+ /* Rejected unless configured as route reflector */
+ if (!p->rr_client && !src->rr_client)
+ return -1;
+
+ /* Generally, this should be handled when path is received, but we check it
+ also here as rr_cluster_id may be undefined or different in src. */
+ if (p->rr_cluster_id && bgp_cluster_list_loopy(p, e->attrs->eattrs))
+ return -1;
+ }
- bgp_set_attr(ea->attrs, BA_ORIGIN,
- ((rta->source == RTS_OSPF_EXT1) || (rta->source == RTS_OSPF_EXT2)) ? ORIGIN_INCOMPLETE : ORIGIN_IGP);
+ /* Handle well-known communities, RFC 1997 */
+ struct eattr *c;
+ if (p->cf->interpret_communities &&
+ (c = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_COMMUNITY))))
+ {
+ struct adata *d = c->u.ptr;
- if (p->is_internal)
- bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 0);
- else
- {
- z = bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 6);
- z[0] = AS_PATH_SEQUENCE;
- z[1] = 1; /* 1 AS */
- put_u32(z+2, p->local_as);
- }
+ /* Do not export anywhere */
+ if (int_set_contains(d, BGP_COMM_NO_ADVERTISE))
+ return -1;
- /* iBGP -> use gw, eBGP multi-hop -> use source_addr,
- eBGP single-hop -> use gw if on the same iface */
- z = bgp_set_attr_wa(ea->attrs+2, pool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
- if (p->cf->next_hop_self ||
- rta->dest != RTD_ROUTER ||
- ipa_equal(rta->gw, IPA_NONE) ||
- ipa_is_link_local(rta->gw) ||
- (!p->is_internal && !p->cf->next_hop_keep &&
- (!p->neigh || (rta->iface != p->neigh->iface))))
- set_next_hop(z, p->source_addr);
- else
- set_next_hop(z, rta->gw);
+ /* Do not export outside of AS (or member-AS) */
+ if (!p->is_internal && int_set_contains(d, BGP_COMM_NO_EXPORT_SUBCONFED))
+ return -1;
- bgp_set_attr(ea->attrs+3, BA_LOCAL_PREF, p->cf->default_local_pref);
+ /* Do not export outside of AS (or confederation) */
+ if (!p->is_interior && int_set_contains(d, BGP_COMM_NO_EXPORT))
+ return -1;
+ }
- return 0; /* Leave decision to the filters */
+ return 0;
}
-static inline int
-bgp_as_path_loopy(struct bgp_proto *p, rta *a)
-{
- int num = p->cf->allow_local_as + 1;
- eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
- return (e && (num > 0) && as_path_contains(e->u.ptr, p->local_as, num));
-}
+static adata null_adata; /* adata of length 0 */
-static inline int
-bgp_originator_id_loopy(struct bgp_proto *p, rta *a)
-{
- eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
- return (e && (e->u.data == p->local_id));
-}
-
-static inline int
-bgp_cluster_list_loopy(struct bgp_proto *p, rta *a)
+static ea_list *
+bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *attrs0, struct linpool *pool)
{
- eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
- return (e && p->rr_client && int_set_contains(e->u.ptr, p->rr_cluster_id));
-}
+ struct proto *SRC = e->attrs->src->proto;
+ struct bgp_proto *src = (SRC->proto == &proto_bgp) ? (void *) SRC : NULL;
+ struct bgp_export_state s = { .proto = p, .channel = c, .pool = pool, .src = src, .route = e, .mpls = c->desc->mpls };
+ ea_list *attrs = attrs0;
+ eattr *a;
+ adata *ad;
+ /* ORIGIN attribute - mandatory, attach if missing */
+ if (! bgp_find_attr(attrs0, BA_ORIGIN))
+ bgp_set_attr_u32(&attrs, pool, BA_ORIGIN, 0, src ? ORIGIN_INCOMPLETE : ORIGIN_IGP);
-static inline void
-bgp_path_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 as)
-{
- eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
- bgp_attach_attr(attrs, pool, BA_AS_PATH, (uintptr_t) as_path_prepend(pool, a->u.ptr, as));
-}
+ /* AS_PATH attribute - mandatory */
+ a = bgp_find_attr(attrs0, BA_AS_PATH);
+ ad = a ? a->u.ptr : &null_adata;
-static inline void
-bgp_cluster_list_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 cid)
-{
- eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
- bgp_attach_attr(attrs, pool, BA_CLUSTER_LIST, (uintptr_t) int_set_prepend(pool, a ? a->u.ptr : NULL, cid));
-}
+ /* AS_PATH attribute - strip AS_CONFED* segments outside confederation */
+ if ((!p->cf->confederation || !p->is_interior) && as_path_contains_confed(ad))
+ ad = as_path_strip_confed(pool, ad);
-static int
-bgp_update_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool, int rr)
-{
- eattr *a;
+ /* AS_PATH attribute - keep or prepend ASN */
+ if (p->is_internal ||
+ (p->rs_client && src && src->rs_client))
+ {
+ /* IBGP or route server -> just ensure there is one */
+ if (!a)
+ bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, &null_adata);
+ }
+ else if (p->is_interior)
+ {
+ /* Confederation -> prepend ASN as AS_CONFED_SEQUENCE */
+ ad = as_path_prepend2(pool, ad, AS_PATH_CONFED_SEQUENCE, p->public_as);
+ bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, ad);
+ }
+ else /* Regular EBGP (no RS, no confederation) */
+ {
+ /* Regular EBGP -> prepend ASN as regular sequence */
+ ad = as_path_prepend2(pool, ad, AS_PATH_SEQUENCE, p->public_as);
+ bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, ad);
+
+ /* MULTI_EXIT_DESC attribute - accept only if set in export filter */
+ a = bgp_find_attr(attrs0, BA_MULTI_EXIT_DISC);
+ if (a && !(a->type & EAF_FRESH))
+ bgp_unset_attr(&attrs, pool, BA_MULTI_EXIT_DISC);
+ }
- if (!p->is_internal && !p->rs_client)
- {
- bgp_path_prepend(e, attrs, pool, p->local_as);
-
- /* The MULTI_EXIT_DISC attribute received from a neighboring AS MUST NOT be
- * propagated to other neighboring ASes.
- * Perhaps it would be better to undefine it.
- */
- a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
- if (a)
- bgp_attach_attr(attrs, pool, BA_MULTI_EXIT_DISC, 0);
- }
+ /* NEXT_HOP attribute - delegated to AF-specific hook */
+ a = bgp_find_attr(attrs0, BA_NEXT_HOP);
+ bgp_update_next_hop(&s, a, &attrs);
- /* iBGP -> keep next_hop, eBGP multi-hop -> use source_addr,
- * eBGP single-hop -> keep next_hop if on the same iface.
- * If the next_hop is zero (i.e. link-local), keep only if on the same iface.
- *
- * Note that same-iface-check uses iface from route, which is based on gw.
- */
- a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
- if (a && !p->cf->next_hop_self &&
- (p->cf->next_hop_keep ||
- (p->is_internal && ipa_nonzero(*((ip_addr *) a->u.ptr->data))) ||
- (p->neigh && (e->attrs->iface == p->neigh->iface))))
- {
- /* Leave the original next hop attribute, will check later where does it point */
- }
- else
- {
- /* Need to create new one */
- byte *b = bgp_attach_attr_wa(attrs, pool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
- set_next_hop(b, p->source_addr);
- }
+ /* LOCAL_PREF attribute - required for IBGP, attach if missing */
+ if (p->is_interior && ! bgp_find_attr(attrs0, BA_LOCAL_PREF))
+ bgp_set_attr_u32(&attrs, pool, BA_LOCAL_PREF, 0, p->cf->default_local_pref);
- if (rr)
- {
- /* Handling route reflection, RFC 4456 */
- struct bgp_proto *src = (struct bgp_proto *) e->attrs->src->proto;
+ /* IBGP route reflection, RFC 4456 */
+ if (src && src->is_internal && p->is_internal && (src->local_as == p->local_as))
+ {
+ /* ORIGINATOR_ID attribute - attach if not already set */
+ if (! bgp_find_attr(attrs0, BA_ORIGINATOR_ID))
+ bgp_set_attr_u32(&attrs, pool, BA_ORIGINATOR_ID, 0, src->remote_id);
- a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
- if (!a)
- bgp_attach_attr(attrs, pool, BA_ORIGINATOR_ID, src->remote_id);
+ /* CLUSTER_LIST attribute - prepend cluster ID */
+ a = bgp_find_attr(attrs0, BA_CLUSTER_LIST);
+ ad = a ? a->u.ptr : NULL;
- /* We attach proper cluster ID according to whether the route is entering or leaving the cluster */
- bgp_cluster_list_prepend(e, attrs, pool, src->rr_client ? src->rr_cluster_id : p->rr_cluster_id);
+ /* Prepend src cluster ID */
+ if (src->rr_cluster_id)
+ ad = int_set_prepend(pool, ad, src->rr_cluster_id);
- /* Two RR clients with different cluster ID, hmmm */
- if (src->rr_client && p->rr_client && (src->rr_cluster_id != p->rr_cluster_id))
- bgp_cluster_list_prepend(e, attrs, pool, p->rr_cluster_id);
- }
+ /* Prepend dst cluster ID if src and dst clusters are different */
+ if (p->rr_cluster_id && (src->rr_cluster_id != p->rr_cluster_id))
+ ad = int_set_prepend(pool, ad, p->rr_cluster_id);
- return 0; /* Leave decision to the filters */
-}
+ /* Should be at least one prepended cluster ID */
+ bgp_set_attr_ptr(&attrs, pool, BA_CLUSTER_LIST, 0, ad);
+ }
-static int
-bgp_community_filter(struct bgp_proto *p, rte *e)
-{
- eattr *a;
- struct adata *d;
+ /* AS4_* transition attributes, RFC 6793 4.2.2 */
+ if (! p->as4_session)
+ {
+ a = bgp_find_attr(attrs, BA_AS_PATH);
+ if (a && as_path_contains_as4(a->u.ptr))
+ {
+ bgp_set_attr_ptr(&attrs, pool, BA_AS_PATH, 0, as_path_to_old(pool, a->u.ptr));
+ bgp_set_attr_ptr(&attrs, pool, BA_AS4_PATH, 0, as_path_strip_confed(pool, a->u.ptr));
+ }
- /* Check if we aren't forbidden to export the route by communities */
- a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_COMMUNITY));
- if (a)
+ a = bgp_find_attr(attrs, BA_AGGREGATOR);
+ if (a && aggregator_contains_as4(a->u.ptr))
{
- d = a->u.ptr;
- if (int_set_contains(d, BGP_COMM_NO_ADVERTISE))
- {
- DBG("\tNO_ADVERTISE\n");
- return 1;
- }
- if (!p->is_internal &&
- (int_set_contains(d, BGP_COMM_NO_EXPORT) ||
- int_set_contains(d, BGP_COMM_NO_EXPORT_SUBCONFED)))
- {
- DBG("\tNO_EXPORT\n");
- return 1;
- }
+ bgp_set_attr_ptr(&attrs, pool, BA_AGGREGATOR, 0, aggregator_to_old(pool, a->u.ptr));
+ bgp_set_attr_ptr(&attrs, pool, BA_AS4_AGGREGATOR, 0, a->u.ptr);
}
+ }
- return 0;
+ /*
+ * Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above
+ * conditions. Presence and validity of quasi-mandatory NEXT_HOP attribute
+ * should be checked in AF-specific hooks.
+ */
+
+ /* Apply per-attribute export hooks for validatation and normalization */
+ return bgp_export_attrs(&s, attrs);
}
-int
-bgp_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool *pool)
+void
+bgp_rt_notify(struct proto *P, struct channel *C, net *n, rte *new, rte *old, ea_list *attrs)
{
- rte *e = *new;
- struct bgp_proto *p = (struct bgp_proto *) P;
- struct bgp_proto *new_bgp = (e->attrs->src->proto->proto == &proto_bgp) ?
- (struct bgp_proto *) e->attrs->src->proto : NULL;
+ struct bgp_proto *p = (void *) P;
+ struct bgp_channel *c = (void *) C;
+ struct bgp_bucket *buck;
+ struct bgp_prefix *px;
+ u32 path;
- if (p == new_bgp) /* Poison reverse updates */
- return -1;
- if (new_bgp)
- {
- /* We should check here for cluster list loop, because the receiving BGP instance
- might have different cluster ID */
- if (bgp_cluster_list_loopy(p, e->attrs))
- return -1;
-
- if (p->cf->interpret_communities && bgp_community_filter(p, e))
- return -1;
-
- if (p->local_as == new_bgp->local_as && p->is_internal && new_bgp->is_internal)
- {
- /* Redistribution of internal routes with IBGP */
- if (p->rr_client || new_bgp->rr_client)
- /* Route reflection, RFC 4456 */
- return bgp_update_attrs(p, e, attrs, pool, 1);
- else
- return -1;
- }
- else
- return bgp_update_attrs(p, e, attrs, pool, 0);
- }
+ if (new)
+ {
+ attrs = bgp_update_attrs(p, c, new, attrs, bgp_linpool2);
+
+ /* If attributes are invalid, we fail back to withdraw */
+ buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c);
+ path = new->attrs->src->global_id;
+
+ lp_flush(bgp_linpool2);
+ }
else
- return bgp_create_attrs(p, e, attrs, pool);
+ {
+ buck = bgp_get_withdraw_bucket(c);
+ path = old->attrs->src->global_id;
+ }
+
+ px = bgp_get_prefix(c, n->n.addr, c->add_path_tx ? path : 0);
+ add_tail(&buck->prefixes, &px->buck_node);
+
+ bgp_schedule_packet(p->conn, c, PKT_UPDATE);
}
+
static inline u32
bgp_get_neighbor(rte *r)
{
eattr *e = ea_find(r->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
u32 as;
- if (e && as_path_get_first(e->u.ptr, &as))
+ if (e && as_path_get_first_regular(e->u.ptr, &as))
return as;
- else
- return ((struct bgp_proto *) r->attrs->src->proto)->remote_as;
+
+ /* If AS_PATH is not defined, we treat rte as locally originated */
+ struct bgp_proto *p = (void *) r->attrs->src->proto;
+ return p->cf->confederation ?: p->local_as;
}
static inline int
rte_resolvable(rte *rt)
{
- int rd = rt->attrs->dest;
- return (rd == RTD_ROUTER) || (rd == RTD_DEVICE) || (rd == RTD_MULTIPATH);
+ return rt->attrs->dest == RTD_UNICAST;
}
int
@@ -1271,16 +1606,16 @@ bgp_rte_better(rte *new, rte *old)
/* RFC 4271 9.1.2.2. a) Use AS path lengths */
if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths)
- {
- x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
- y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
- n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN;
- o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN;
- if (n < o)
- return 1;
- if (n > o)
- return 0;
- }
+ {
+ x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
+ y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
+ n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN;
+ o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN;
+ if (n < o)
+ return 1;
+ if (n > o)
+ return 0;
+ }
/* RFC 4271 9.1.2.2. b) Use origins */
x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
@@ -1305,21 +1640,21 @@ bgp_rte_better(rte *new, rte *old)
*/
if (new_bgp->cf->med_metric || old_bgp->cf->med_metric ||
(bgp_get_neighbor(new) == bgp_get_neighbor(old)))
- {
- x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
- y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
- n = x ? x->u.data : new_bgp->cf->default_med;
- o = y ? y->u.data : old_bgp->cf->default_med;
- if (n < o)
- return 1;
- if (n > o)
- return 0;
- }
+ {
+ x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
+ y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
+ n = x ? x->u.data : new_bgp->cf->default_med;
+ o = y ? y->u.data : old_bgp->cf->default_med;
+ if (n < o)
+ return 1;
+ if (n > o)
+ return 0;
+ }
/* RFC 4271 9.1.2.2. d) Prefer external peers */
- if (new_bgp->is_internal > old_bgp->is_internal)
+ if (new_bgp->is_interior > old_bgp->is_interior)
return 0;
- if (new_bgp->is_internal < old_bgp->is_internal)
+ if (new_bgp->is_interior < old_bgp->is_interior)
return 1;
/* RFC 4271 9.1.2.2. e) Compare IGP metrics */
@@ -1331,7 +1666,7 @@ bgp_rte_better(rte *new, rte *old)
return 0;
/* RFC 4271 9.1.2.2. f) Compare BGP identifiers */
- /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighor ID */
+ /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighbor ID */
x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
n = x ? x->u.data : new_bgp->remote_id;
@@ -1390,18 +1725,18 @@ bgp_rte_mergable(rte *pri, rte *sec)
/* RFC 4271 9.1.2.2. a) Use AS path lengths */
if (pri_bgp->cf->compare_path_lengths || sec_bgp->cf->compare_path_lengths)
- {
- x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
- y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
- p = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN;
- s = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN;
+ {
+ x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
+ y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
+ p = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN;
+ s = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN;
- if (p != s)
- return 0;
+ if (p != s)
+ return 0;
-// if (DELTA(p, s) > pri_bgp->cf->relax_multipath)
-// return 0;
- }
+// if (DELTA(p, s) > pri_bgp->cf->relax_multipath)
+// return 0;
+ }
/* RFC 4271 9.1.2.2. b) Use origins */
x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
@@ -1414,17 +1749,17 @@ bgp_rte_mergable(rte *pri, rte *sec)
/* RFC 4271 9.1.2.2. c) Compare MED's */
if (pri_bgp->cf->med_metric || sec_bgp->cf->med_metric ||
(bgp_get_neighbor(pri) == bgp_get_neighbor(sec)))
- {
- x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
- y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
- p = x ? x->u.data : pri_bgp->cf->default_med;
- s = y ? y->u.data : sec_bgp->cf->default_med;
- if (p != s)
- return 0;
- }
+ {
+ x = ea_find(pri->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
+ y = ea_find(sec->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
+ p = x ? x->u.data : pri_bgp->cf->default_med;
+ s = y ? y->u.data : sec_bgp->cf->default_med;
+ if (p != s)
+ return 0;
+ }
/* RFC 4271 9.1.2.2. d) Prefer external peers */
- if (pri_bgp->is_internal != sec_bgp->is_internal)
+ if (pri_bgp->is_interior != sec_bgp->is_interior)
return 0;
/* RFC 4271 9.1.2.2. e) Compare IGP metrics */
@@ -1439,7 +1774,6 @@ bgp_rte_mergable(rte *pri, rte *sec)
}
-
static inline int
same_group(rte *r, u32 lpref, u32 lasn)
{
@@ -1484,7 +1818,7 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best)
* that this fn is not called for them.
*
* The idea is simple, the implementation is more problematic,
- * mostly because of optimizations in rte_recalculate() that
+ * mostly because of optimizations in rte_recalculate() that
* avoids full recalculation in most cases.
*
* We can assume that at least one of new, old is non-NULL and both
@@ -1496,14 +1830,14 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best)
/* If new and old are from different groups, we just process that
as two independent events */
if (new && old && !same_group(old, lpref, lasn))
- {
- int i1, i2;
- i1 = bgp_rte_recalculate(table, net, NULL, old, old_best);
- i2 = bgp_rte_recalculate(table, net, new, NULL, old_best);
- return i1 || i2;
- }
+ {
+ int i1, i2;
+ i1 = bgp_rte_recalculate(table, net, NULL, old, old_best);
+ i2 = bgp_rte_recalculate(table, net, new, NULL, old_best);
+ return i1 || i2;
+ }
- /*
+ /*
* We could find the best-in-group and then make some shortcuts like
* in rte_recalculate, but as we would have to walk through all
* net->routes just to find it, it is probably not worth. So we
@@ -1515,35 +1849,35 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best)
new->u.bgp.suppressed = 1;
if (old)
+ {
+ old_is_group_best = !old->u.bgp.suppressed;
+ old->u.bgp.suppressed = 1;
+ int new_is_better = new && bgp_rte_better(new, old);
+
+ /* The first case - replace not best with worse (or remove not best) */
+ if (!old_is_group_best && !new_is_better)
+ return 0;
+
+ /* The second case - replace the best with better */
+ if (old_is_group_best && new_is_better)
{
- old_is_group_best = !old->u.bgp.suppressed;
- old->u.bgp.suppressed = 1;
- int new_is_better = new && bgp_rte_better(new, old);
-
- /* The first case - replace not best with worse (or remove not best) */
- if (!old_is_group_best && !new_is_better)
- return 0;
-
- /* The second case - replace the best with better */
- if (old_is_group_best && new_is_better)
- {
- /* new is best-in-group, the see discussion below - this is
- a special variant of NBG && OBG. From OBG we can deduce
- that same_group(old_best) iff (old == old_best) */
- new->u.bgp.suppressed = 0;
- return (old == old_best);
- }
+ /* new is best-in-group, the see discussion below - this is
+ a special variant of NBG && OBG. From OBG we can deduce
+ that same_group(old_best) iff (old == old_best) */
+ new->u.bgp.suppressed = 0;
+ return (old == old_best);
}
+ }
/* The default case - find a new best-in-group route */
r = new; /* new may not be in the list */
for (s=net->routes; rte_is_valid(s); s=s->next)
if (use_deterministic_med(s) && same_group(s, lpref, lasn))
- {
- s->u.bgp.suppressed = 1;
- if (!r || bgp_rte_better(s, r))
- r = s;
- }
+ {
+ s->u.bgp.suppressed = 1;
+ if (!r || bgp_rte_better(s, r))
+ r = s;
+ }
/* Simple case - the last route in group disappears */
if (!r)
@@ -1582,397 +1916,77 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best)
return old_is_group_best;
}
-static struct adata *
-bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool)
-{
- struct adata *newa = lp_alloc(pool, sizeof(struct adata) + 8);
- newa->length = 8;
- aggregator_convert_to_new(old, newa->data);
- return newa;
-}
-
-/* Take last req_as ASNs from path old2 (in 2B format), convert to 4B format
- * and append path old4 (in 4B format).
+/*
+ * Reconstruct AS_PATH and AGGREGATOR according to RFC 6793 4.2.3
*/
-static struct adata *
-bgp_merge_as_paths(struct adata *old2, struct adata *old4, int req_as, struct linpool *pool)
-{
- byte buf[old2->length * 2];
-
- int ol = as_path_convert_to_new(old2, buf, req_as);
- int nl = ol + (old4 ? old4->length : 0);
-
- struct adata *newa = lp_alloc(pool, sizeof(struct adata) + nl);
- newa->length = nl;
- memcpy(newa->data, buf, ol);
- if (old4) memcpy(newa->data + ol, old4->data, old4->length);
-
- return newa;
-}
-
-static int
-as4_aggregator_valid(struct adata *aggr)
-{
- return aggr->length == 8;
-}
-
-
-/* Reconstruct 4B AS_PATH and AGGREGATOR according to RFC 4893 4.2.3 */
-static void
-bgp_reconstruct_4b_atts(struct bgp_proto *p, rta *a, struct linpool *pool)
-{
- eattr *p2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
- eattr *p4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_PATH));
- eattr *a2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AGGREGATOR));
- eattr *a4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR));
- int a4_removed = 0;
-
- if (a4 && !as4_aggregator_valid(a4->u.ptr))
- {
- log(L_WARN "%s: AS4_AGGREGATOR attribute is invalid, skipping attribute", p->p.name);
- a4 = NULL;
- a4_removed = 1;
- }
-
- if (a2)
- {
- u32 a2_as = get_u16(a2->u.ptr->data);
-
- if (a4)
- {
- if (a2_as != AS_TRANS)
- {
- /* Routes were aggregated by old router and therefore AS4_PATH
- * and AS4_AGGREGATOR is invalid
- *
- * Convert AS_PATH and AGGREGATOR to 4B format and finish.
- */
-
- a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool);
- p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool);
-
- return;
- }
- else
- {
- /* Common case, use AS4_AGGREGATOR attribute */
- a2->u.ptr = a4->u.ptr;
- }
- }
- else
- {
- /* Common case, use old AGGREGATOR attribute */
- a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool);
-
- if ((a2_as == AS_TRANS) && !a4_removed)
- log(L_WARN "%s: AGGREGATOR attribute contain AS_TRANS, but AS4_AGGREGATOR is missing", p->p.name);
- }
- }
- else
- if (a4)
- log(L_WARN "%s: AS4_AGGREGATOR attribute received, but AGGREGATOR attribute is missing", p->p.name);
-
- int p2_len = as_path_getlen_int(p2->u.ptr, 2);
- int p4_len = p4 ? validate_as4_path(p, p4->u.ptr) : -1;
-
- if (p4 && (p4_len < 0))
- log(L_WARN "%s: AS4_PATH attribute is malformed, skipping attribute", p->p.name);
-
- if ((p4_len <= 0) || (p2_len < p4_len))
- p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool);
- else
- p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, p4->u.ptr, p2_len - p4_len, pool);
-}
-
static void
-bgp_remove_as4_attrs(struct bgp_proto *p, rta *a)
+bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool)
{
- unsigned id1 = EA_CODE(EAP_BGP, BA_AS4_PATH);
- unsigned id2 = EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR);
- ea_list **el = &(a->eattrs);
+ eattr *p2 = bgp_find_attr(*attrs, BA_AS_PATH);
+ eattr *p4 = bgp_find_attr(*attrs, BA_AS4_PATH);
+ eattr *a2 = bgp_find_attr(*attrs, BA_AGGREGATOR);
+ eattr *a4 = bgp_find_attr(*attrs, BA_AS4_AGGREGATOR);
- /* We know that ea_lists constructed in bgp_decode attrs have one attribute per ea_list struct */
- while (*el != NULL)
- {
- unsigned fid = (*el)->attrs[0].id;
-
- if ((fid == id1) || (fid == id2))
- {
- *el = (*el)->next;
- if (p->as4_session)
- log(L_WARN "%s: Unexpected AS4_* attributes received", p->p.name);
- }
- else
- el = &((*el)->next);
- }
-}
+ /* First, unset AS4_* attributes */
+ if (p4) bgp_unset_attr(attrs, pool, BA_AS4_PATH);
+ if (a4) bgp_unset_attr(attrs, pool, BA_AS4_AGGREGATOR);
-/**
- * bgp_decode_attrs - check and decode BGP attributes
- * @conn: connection
- * @attr: start of attribute block
- * @len: length of attribute block
- * @pool: linear pool to make all the allocations in
- * @mandatory: 1 iff presence of mandatory attributes has to be checked
- *
- * This function takes a BGP attribute block (a part of an Update message), checks
- * its consistency and converts it to a list of BIRD route attributes represented
- * by a &rta.
- */
-struct rta *
-bgp_decode_attrs(struct bgp_conn *conn, byte *attr, uint len, struct linpool *pool, int mandatory)
-{
- struct bgp_proto *bgp = conn->bgp;
- rta *a = lp_alloc(pool, sizeof(struct rta));
- uint flags, code, l, i, type;
- int errcode;
- byte *z, *attr_start;
- byte seen[256/8];
- ea_list *ea;
- struct adata *ad;
- int withdraw = 0;
-
- bzero(a, sizeof(rta));
- a->source = RTS_BGP;
- a->scope = SCOPE_UNIVERSE;
- a->cast = RTC_UNICAST;
- /* a->dest = RTD_ROUTER; -- set in bgp_set_next_hop() */
- a->from = bgp->cf->remote_ip;
-
- /* Parse the attributes */
- bzero(seen, sizeof(seen));
- DBG("BGP: Parsing attributes\n");
- while (len)
- {
- if (len < 2)
- goto malformed;
- attr_start = attr;
- flags = *attr++;
- code = *attr++;
- len -= 2;
- if (flags & BAF_EXT_LEN)
- {
- if (len < 2)
- goto malformed;
- l = get_u16(attr);
- attr += 2;
- len -= 2;
- }
- else
- {
- if (len < 1)
- goto malformed;
- l = *attr++;
- len--;
- }
- if (l > len)
- goto malformed;
- len -= l;
- z = attr;
- attr += l;
- DBG("Attr %02x %02x %d\n", code, flags, l);
- if (seen[code/8] & (1 << (code%8)))
- goto malformed;
- if (ATTR_KNOWN(code))
- {
- struct attr_desc *desc = &bgp_attr_table[code];
- if (desc->expected_length >= 0 && desc->expected_length != (int) l)
- { errcode = 5; goto err; }
- if ((desc->expected_flags ^ flags) & (BAF_OPTIONAL | BAF_TRANSITIVE))
- { errcode = 4; goto err; }
- if (!bgp->is_internal)
- {
- if (!desc->allow_in_ebgp)
- continue;
- if ((code == BA_LOCAL_PREF) && !bgp->cf->allow_local_pref)
- continue;
- }
- if (desc->validate)
- {
- errcode = desc->validate(bgp, z, l);
- if (errcode > 0)
- goto err;
- if (errcode == IGNORE)
- continue;
- if (errcode <= WITHDRAW)
- {
- log(L_WARN "%s: Attribute %s is malformed, withdrawing update",
- bgp->p.name, desc->name);
- withdraw = 1;
- }
- }
- else if (code == BA_AS_PATH)
- {
- /* Special case as it might also trim the attribute */
- if (validate_as_path(bgp, z, &l) < 0)
- { errcode = 11; goto err; }
- }
- type = desc->type;
- }
- else /* Unknown attribute */
- {
- if (!(flags & BAF_OPTIONAL))
- { errcode = 2; goto err; }
- type = EAF_TYPE_OPAQUE;
- }
-
- // Only OPTIONAL and TRANSITIVE attributes may have non-zero PARTIAL flag
- // if (!((flags & BAF_OPTIONAL) && (flags & BAF_TRANSITIVE)) && (flags & BAF_PARTIAL))
- // { errcode = 4; goto err; }
-
- seen[code/8] |= (1 << (code%8));
- ea = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr));
- ea->next = a->eattrs;
- a->eattrs = ea;
- ea->flags = 0;
- ea->count = 1;
- ea->attrs[0].id = EA_CODE(EAP_BGP, code);
- ea->attrs[0].flags = flags;
- ea->attrs[0].type = type;
- if (type & EAF_EMBEDDED)
- ad = NULL;
- else
- {
- ad = lp_alloc(pool, sizeof(struct adata) + l);
- ea->attrs[0].u.ptr = ad;
- ad->length = l;
- memcpy(ad->data, z, l);
- }
- switch (type)
- {
- case EAF_TYPE_ROUTER_ID:
- case EAF_TYPE_INT:
- if (l == 1)
- ea->attrs[0].u.data = *z;
- else
- ea->attrs[0].u.data = get_u32(z);
- break;
- case EAF_TYPE_IP_ADDRESS:
- ipa_ntoh(*(ip_addr *)ad->data);
- break;
- case EAF_TYPE_INT_SET:
- case EAF_TYPE_LC_SET:
- case EAF_TYPE_EC_SET:
- {
- u32 *z = (u32 *) ad->data;
- for(i=0; i<ad->length/4; i++)
- z[i] = ntohl(z[i]);
- break;
- }
- }
- }
-
- if (withdraw)
- goto withdraw;
-
-#ifdef IPV6
- /* If we received MP_REACH_NLRI we should check mandatory attributes */
- if (bgp->mp_reach_len != 0)
- mandatory = 1;
-#endif
-
- /* If there is no (reachability) NLRI, we should exit now */
- if (! mandatory)
- return a;
-
- /* Check if all mandatory attributes are present */
- for(i=0; i < ARRAY_SIZE(bgp_mandatory_attrs); i++)
- {
- code = bgp_mandatory_attrs[i];
- if (!(seen[code/8] & (1 << (code%8))))
- {
- bgp_error(conn, 3, 3, &bgp_mandatory_attrs[i], 1);
- return NULL;
- }
- }
-
- /* When receiving attributes from non-AS4-aware BGP speaker,
- * we have to reconstruct 4B AS_PATH and AGGREGATOR attributes
- */
- if (! bgp->as4_session)
- bgp_reconstruct_4b_atts(bgp, a, pool);
-
- bgp_remove_as4_attrs(bgp, a);
-
- /* If the AS path attribute contains our AS, reject the routes */
- if (bgp_as_path_loopy(bgp, a))
- goto withdraw;
-
- /* Two checks for IBGP loops caused by route reflection, RFC 4456 */
- if (bgp_originator_id_loopy(bgp, a) ||
- bgp_cluster_list_loopy(bgp, a))
- goto withdraw;
+ /* Handle AGGREGATOR attribute */
+ if (a2 && a4)
+ {
+ u32 a2_asn = get_u32(a2->u.ptr->data);
- /* If there's no local preference, define one */
- if (!(seen[0] & (1 << BA_LOCAL_PREF)))
- bgp_attach_attr(&a->eattrs, pool, BA_LOCAL_PREF, bgp->cf->default_local_pref);
+ /* If routes were aggregated by an old router, then AS4_PATH and
+ AS4_AGGREGATOR are invalid. In that case we give up. */
+ if (a2_asn != AS_TRANS)
+ return;
- return a;
+ /* Use AS4_AGGREGATOR instead of AGGREGATOR */
+ a2->u.ptr = a4->u.ptr;
+ }
-withdraw:
- return NULL;
+ /* Handle AS_PATH attribute */
+ if (p2 && p4)
+ {
+ /* Both as_path_getlen() and as_path_cut() take AS_CONFED* as zero length */
+ int p2_len = as_path_getlen(p2->u.ptr);
+ int p4_len = as_path_getlen(p4->u.ptr);
-malformed:
- bgp_error(conn, 3, 1, NULL, 0);
- return NULL;
+ /* AS_PATH is too short, give up */
+ if (p2_len < p4_len)
+ return;
-err:
- bgp_error(conn, 3, errcode, attr_start, z+l-attr_start);
- return NULL;
+ /* Merge AS_PATH and AS4_PATH */
+ as_path_cut(p2->u.ptr, p2_len - p4_len);
+ p2->u.ptr = as_path_merge(pool, p2->u.ptr, p4->u.ptr);
+ }
}
int
bgp_get_attr(eattr *a, byte *buf, int buflen)
{
uint i = EA_ID(a->id);
- struct attr_desc *d;
+ const struct bgp_attr_desc *d;
int len;
- if (ATTR_KNOWN(i))
+ if (bgp_attr_known(i))
+ {
+ d = &bgp_attr_table[i];
+ len = bsprintf(buf, "%s", d->name);
+ buf += len;
+ if (d->format)
{
- d = &bgp_attr_table[i];
- len = bsprintf(buf, "%s", d->name);
- buf += len;
- if (d->format)
- {
- *buf++ = ':';
- *buf++ = ' ';
- d->format(a, buf, buflen - len - 2);
- return GA_FULL;
- }
- return GA_NAME;
+ *buf++ = ':';
+ *buf++ = ' ';
+ d->format(a, buf, buflen - len - 2);
+ return GA_FULL;
}
- bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? " [t]" : "");
- return GA_NAME;
-}
-
-void
-bgp_init_bucket_table(struct bgp_proto *p)
-{
- p->hash_size = 256;
- p->hash_limit = p->hash_size * 4;
- p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *));
- init_list(&p->bucket_queue);
- p->withdraw_bucket = NULL;
- // fib_init(&p->prefix_fib, p->p.pool, sizeof(struct bgp_prefix), 0, bgp_init_prefix);
-}
-
-void
-bgp_free_bucket_table(struct bgp_proto *p)
-{
- mb_free(p->bucket_hash);
- p->bucket_hash = NULL;
-
- struct bgp_bucket *b;
- WALK_LIST_FIRST(b, p->bucket_queue)
- {
- rem_node(&b->send_node);
- mb_free(b);
+ return GA_NAME;
}
- mb_free(p->withdraw_bucket);
- p->withdraw_bucket = NULL;
+ bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? " [t]" : "");
+ return GA_NAME;
}
void
@@ -1988,14 +2002,14 @@ bgp_get_route_info(rte *e, byte *buf, ea_list *attrs)
buf += bsprintf(buf, "-");
if (e->attrs->hostentry)
- {
- if (!rte_resolvable(e))
- buf += bsprintf(buf, "/-");
- else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN)
- buf += bsprintf(buf, "/?");
- else
- buf += bsprintf(buf, "/%d", e->attrs->igp_metric);
- }
+ {
+ if (!rte_resolvable(e))
+ buf += bsprintf(buf, "/-");
+ else if (e->attrs->igp_metric >= IGP_METRIC_UNKNOWN)
+ buf += bsprintf(buf, "/?");
+ else
+ buf += bsprintf(buf, "/%d", e->attrs->igp_metric);
+ }
buf += bsprintf(buf, ") [");
if (p && as_path_get_last(p->u.ptr, &origas))
diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c
index f706e76e..b9a1d157 100644
--- a/proto/bgp/bgp.c
+++ b/proto/bgp/bgp.c
@@ -2,6 +2,8 @@
* BIRD -- The Border Gateway Protocol
*
* (c) 2000 Martin Mares <mj@ucw.cz>
+ * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
+ * (c) 2008--2016 CZ.NIC z.s.p.o.
*
* Can be freely distributed and used under the terms of the GNU GPL.
*/
@@ -9,48 +11,52 @@
/**
* DOC: Border Gateway Protocol
*
- * The BGP protocol is implemented in three parts: |bgp.c| which takes care of the
- * connection and most of the interface with BIRD core, |packets.c| handling
+ * The BGP protocol is implemented in three parts: |bgp.c| which takes care of
+ * the connection and most of the interface with BIRD core, |packets.c| handling
* both incoming and outgoing BGP packets and |attrs.c| containing functions for
* manipulation with BGP attribute lists.
*
- * As opposed to the other existing routing daemons, BIRD has a sophisticated core
- * architecture which is able to keep all the information needed by BGP in the
- * primary routing table, therefore no complex data structures like a central
- * BGP table are needed. This increases memory footprint of a BGP router with
- * many connections, but not too much and, which is more important, it makes
- * BGP much easier to implement.
+ * As opposed to the other existing routing daemons, BIRD has a sophisticated
+ * core architecture which is able to keep all the information needed by BGP in
+ * the primary routing table, therefore no complex data structures like a
+ * central BGP table are needed. This increases memory footprint of a BGP router
+ * with many connections, but not too much and, which is more important, it
+ * makes BGP much easier to implement.
*
- * Each instance of BGP (corresponding to a single BGP peer) is described by a &bgp_proto
- * structure to which are attached individual connections represented by &bgp_connection
- * (usually, there exists only one connection, but during BGP session setup, there
- * can be more of them). The connections are handled according to the BGP state machine
- * defined in the RFC with all the timers and all the parameters configurable.
+ * Each instance of BGP (corresponding to a single BGP peer) is described by a
+ * &bgp_proto structure to which are attached individual connections represented
+ * by &bgp_connection (usually, there exists only one connection, but during BGP
+ * session setup, there can be more of them). The connections are handled
+ * according to the BGP state machine defined in the RFC with all the timers and
+ * all the parameters configurable.
*
- * In incoming direction, we listen on the connection's socket and each time we receive
- * some input, we pass it to bgp_rx(). It decodes packet headers and the markers and
- * passes complete packets to bgp_rx_packet() which distributes the packet according
- * to its type.
+ * In incoming direction, we listen on the connection's socket and each time we
+ * receive some input, we pass it to bgp_rx(). It decodes packet headers and the
+ * markers and passes complete packets to bgp_rx_packet() which distributes the
+ * packet according to its type.
*
- * In outgoing direction, we gather all the routing updates and sort them to buckets
- * (&bgp_bucket) according to their attributes (we keep a hash table for fast comparison
- * of &rta's and a &fib which helps us to find if we already have another route for
- * the same destination queued for sending, so that we can replace it with the new one
- * immediately instead of sending both updates). There also exists a special bucket holding
- * all the route withdrawals which cannot be queued anywhere else as they don't have any
- * attributes. If we have any packet to send (due to either new routes or the connection
- * tracking code wanting to send a Open, Keepalive or Notification message), we call
- * bgp_schedule_packet() which sets the corresponding bit in a @packet_to_send
- * bit field in &bgp_conn and as soon as the transmit socket buffer becomes empty,
- * we call bgp_fire_tx(). It inspects state of all the packet type bits and calls
- * the corresponding bgp_create_xx() functions, eventually rescheduling the same packet
- * type if we have more data of the same type to send.
+ * In outgoing direction, we gather all the routing updates and sort them to
+ * buckets (&bgp_bucket) according to their attributes (we keep a hash table for
+ * fast comparison of &rta's and a &fib which helps us to find if we already
+ * have another route for the same destination queued for sending, so that we
+ * can replace it with the new one immediately instead of sending both
+ * updates). There also exists a special bucket holding all the route
+ * withdrawals which cannot be queued anywhere else as they don't have any
+ * attributes. If we have any packet to send (due to either new routes or the
+ * connection tracking code wanting to send a Open, Keepalive or Notification
+ * message), we call bgp_schedule_packet() which sets the corresponding bit in a
+ * @packet_to_send bit field in &bgp_conn and as soon as the transmit socket
+ * buffer becomes empty, we call bgp_fire_tx(). It inspects state of all the
+ * packet type bits and calls the corresponding bgp_create_xx() functions,
+ * eventually rescheduling the same packet type if we have more data of the same
+ * type to send.
*
- * The processing of attributes consists of two functions: bgp_decode_attrs() for checking
- * of the attribute blocks and translating them to the language of BIRD's extended attributes
- * and bgp_encode_attrs() which does the converse. Both functions are built around a
- * @bgp_attr_table array describing all important characteristics of all known attributes.
- * Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
+ * The processing of attributes consists of two functions: bgp_decode_attrs()
+ * for checking of the attribute blocks and translating them to the language of
+ * BIRD's extended attributes and bgp_encode_attrs() which does the
+ * converse. Both functions are built around a @bgp_attr_table array describing
+ * all important characteristics of all known attributes. Unknown transitive
+ * attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
*
* BGP protocol implements graceful restart in both restarting (local restart)
* and receiving (neighbor restart) roles. The first is handled mostly by the
@@ -61,7 +67,39 @@
* point of view and therefore maintaining received routes. Routing table
* refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing
* stale routes after reestablishment of BGP session during graceful restart.
- */
+ *
+ * Supported standards:
+ * <itemize>
+ * <item> <rfc id="4271"> - Border Gateway Protocol 4 (BGP)
+ * <item> <rfc id="1997"> - BGP Communities Attribute
+ * <item> <rfc id="2385"> - Protection of BGP Sessions via TCP MD5 Signature
+ * <item> <rfc id="2545"> - Use of BGP Multiprotocol Extensions for IPv6
+ * <item> <rfc id="2918"> - Route Refresh Capability
+ * <item> <rfc id="3107"> - Carrying Label Information in BGP
+ * <item> <rfc id="4360"> - BGP Extended Communities Attribute
+ * <item> <rfc id="4364"> - BGP/MPLS IPv4 Virtual Private Networks
+ * <item> <rfc id="4456"> - BGP Route Reflection
+ * <item> <rfc id="4486"> - Subcodes for BGP Cease Notification Message
+ * <item> <rfc id="4659"> - BGP/MPLS IPv6 Virtual Private Networks
+ * <item> <rfc id="4724"> - Graceful Restart Mechanism for BGP
+ * <item> <rfc id="4760"> - Multiprotocol extensions for BGP
+ * <item> <rfc id="4798"> - Connecting IPv6 Islands over IPv4 MPLS
+ * <item> <rfc id="5065"> - AS confederations for BGP
+ * <item> <rfc id="5082"> - Generalized TTL Security Mechanism
+ * <item> <rfc id="5492"> - Capabilities Advertisement with BGP
+ * <item> <rfc id="5549"> - Advertising IPv4 NLRI with an IPv6 Next Hop
+ * <item> <rfc id="5575"> - Dissemination of Flow Specification Rules
+ * <item> <rfc id="5668"> - 4-Octet AS Specific BGP Extended Community
+ * <item> <rfc id="6286"> - AS-Wide Unique BGP Identifier
+ * <item> <rfc id="6608"> - Subcodes for BGP Finite State Machine Error
+ * <item> <rfc id="6793"> - BGP Support for 4-Octet AS Numbers
+ * <item> <rfc id="7313"> - Enhanced Route Refresh Capability for BGP
+ * <item> <rfc id="7606"> - Revised Error Handling for BGP UPDATE Messages
+ * <item> <rfc id="7911"> - Advertisement of Multiple Paths in BGP
+ * <item> <rfc id="7947"> - Internet Exchange BGP Route Server
+ * <item> <rfc id="8092"> - BGP Large Communities Attribute
+ * </itemize>
+*/
#undef LOCAL_DEBUG
@@ -80,70 +118,150 @@
struct linpool *bgp_linpool; /* Global temporary pool */
-static sock *bgp_listen_sk; /* Global listening socket */
-static int bgp_counter; /* Number of protocol instances using the listening socket */
+struct linpool *bgp_linpool2; /* Global temporary pool for bgp_rt_notify() */
+static list bgp_sockets; /* Global list of listening sockets */
+
-static void bgp_close(struct bgp_proto *p, int apply_md5);
static void bgp_connect(struct bgp_proto *p);
static void bgp_active(struct bgp_proto *p);
-static sock *bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags);
static void bgp_update_bfd(struct bgp_proto *p, int use_bfd);
+static int bgp_incoming_connection(sock *sk, uint dummy UNUSED);
+static void bgp_listen_sock_err(sock *sk UNUSED, int err);
/**
* bgp_open - open a BGP instance
* @p: BGP instance
*
- * This function allocates and configures shared BGP resources.
- * Should be called as the last step during initialization
- * (when lock is acquired and neighbor is ready).
- * When error, state changed to PS_DOWN, -1 is returned and caller
- * should return immediately.
+ * This function allocates and configures shared BGP resources, mainly listening
+ * sockets. Should be called as the last step during initialization (when lock
+ * is acquired and neighbor is ready). When error, caller should change state to
+ * PS_DOWN and return immediately.
*/
static int
bgp_open(struct bgp_proto *p)
{
- struct config *cfg = p->cf->c.global;
- int errcode;
+ struct bgp_socket *bs = NULL;
+ struct iface *ifa = p->cf->strict_bind ? p->cf->iface : NULL;
+ ip_addr addr = p->cf->strict_bind ? p->cf->local_ip :
+ (ipa_is_ip4(p->cf->remote_ip) ? IPA_NONE4 : IPA_NONE6);
+ uint port = p->cf->local_port;
- if (!bgp_listen_sk)
- bgp_listen_sk = bgp_setup_listen_sk(cfg->listen_bgp_addr, cfg->listen_bgp_port, cfg->listen_bgp_flags);
+ /* FIXME: Add some global init? */
+ if (!bgp_linpool)
+ init_list(&bgp_sockets);
- if (!bgp_listen_sk)
+ /* We assume that cf->iface is defined iff cf->local_ip is link-local */
+
+ WALK_LIST(bs, bgp_sockets)
+ if (ipa_equal(bs->sk->saddr, addr) && (bs->sk->iface == ifa) && (bs->sk->sport == port))
{
- errcode = BEM_NO_SOCKET;
- goto err;
+ bs->uc++;
+ p->sock = bs;
+ return 0;
}
- if (!bgp_linpool)
- bgp_linpool = lp_new(&root_pool, 4080);
+ sock *sk = sk_new(proto_pool);
+ sk->type = SK_TCP_PASSIVE;
+ sk->ttl = 255;
+ sk->saddr = addr;
+ sk->sport = port;
+ sk->flags = 0;
+ sk->tos = IP_PREC_INTERNET_CONTROL;
+ sk->rbsize = BGP_RX_BUFFER_SIZE;
+ sk->tbsize = BGP_TX_BUFFER_SIZE;
+ sk->rx_hook = bgp_incoming_connection;
+ sk->err_hook = bgp_listen_sock_err;
+
+ if (sk_open(sk) < 0)
+ goto err;
- bgp_counter++;
+ bs = mb_allocz(proto_pool, sizeof(struct bgp_socket));
+ bs->sk = sk;
+ bs->uc = 1;
+ p->sock = bs;
- if (p->cf->password)
- if (sk_set_md5_auth(bgp_listen_sk, p->cf->source_addr, p->cf->remote_ip,
- p->cf->iface, p->cf->password, p->cf->setkey) < 0)
- {
- sk_log_error(bgp_listen_sk, p->p.name);
- bgp_close(p, 0);
- errcode = BEM_INVALID_MD5;
- goto err;
- }
+ add_tail(&bgp_sockets, &bs->n);
+
+ if (!bgp_linpool)
+ {
+ bgp_linpool = lp_new(proto_pool, 4080);
+ bgp_linpool2 = lp_new(proto_pool, 4080);
+ }
return 0;
err:
- p->p.disabled = 1;
- bgp_store_error(p, NULL, BE_MISC, errcode);
- proto_notify_state(&p->p, PS_DOWN);
+ sk_log_error(sk, p->p.name);
+ log(L_ERR "%s: Cannot open listening socket", p->p.name);
+ rfree(sk);
return -1;
}
+/**
+ * bgp_close - close a BGP instance
+ * @p: BGP instance
+ *
+ * This function frees and deconfigures shared BGP resources.
+ */
+static void
+bgp_close(struct bgp_proto *p)
+{
+ struct bgp_socket *bs = p->sock;
+
+ ASSERT(bs && bs->uc);
+
+ if (--bs->uc)
+ return;
+
+ rfree(bs->sk);
+ rem_node(&bs->n);
+ mb_free(bs);
+
+ if (!EMPTY_LIST(bgp_sockets))
+ return;
+
+ rfree(bgp_linpool);
+ bgp_linpool = NULL;
+
+ rfree(bgp_linpool2);
+ bgp_linpool2 = NULL;
+}
+
+static inline int
+bgp_setup_auth(struct bgp_proto *p, int enable)
+{
+ if (p->cf->password)
+ {
+ int rv = sk_set_md5_auth(p->sock->sk,
+ p->cf->local_ip, p->cf->remote_ip, p->cf->iface,
+ enable ? p->cf->password : NULL, p->cf->setkey);
+
+ if (rv < 0)
+ sk_log_error(p->sock->sk, p->p.name);
+
+ return rv;
+ }
+ else
+ return 0;
+}
+
+static inline struct bgp_channel *
+bgp_find_channel(struct bgp_proto *p, u32 afi)
+{
+ struct bgp_channel *c;
+ WALK_LIST(c, p->p.channels)
+ if (c->afi == afi)
+ return c;
+
+ return NULL;
+}
+
static void
bgp_startup(struct bgp_proto *p)
{
BGP_TRACE(D_EVENTS, "Started");
- p->start_state = p->cf->capabilities ? BSS_CONNECT : BSS_CONNECT_NOCAP;
+ p->start_state = BSS_CONNECT;
if (!p->cf->passive)
bgp_active(p);
@@ -159,50 +277,36 @@ bgp_startup_timeout(timer *t)
static void
bgp_initiate(struct bgp_proto *p)
{
- int rv = bgp_open(p);
- if (rv < 0)
- return;
+ int err_val;
+
+ if (bgp_open(p) < 0)
+ { err_val = BEM_NO_SOCKET; goto err1; }
+
+ if (bgp_setup_auth(p, 1) < 0)
+ { err_val = BEM_INVALID_MD5; goto err2; }
if (p->cf->bfd)
bgp_update_bfd(p, p->cf->bfd);
if (p->startup_delay)
- {
- p->start_state = BSS_DELAY;
- BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay);
- bgp_start_timer(p->startup_timer, p->startup_delay);
- }
+ {
+ p->start_state = BSS_DELAY;
+ BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay);
+ bgp_start_timer(p->startup_timer, p->startup_delay);
+ }
else
bgp_startup(p);
-}
-/**
- * bgp_close - close a BGP instance
- * @p: BGP instance
- * @apply_md5: 0 to disable unsetting MD5 auth
- *
- * This function frees and deconfigures shared BGP resources.
- * @apply_md5 is set to 0 when bgp_close is called as a cleanup
- * from failed bgp_open().
- */
-static void
-bgp_close(struct bgp_proto *p, int apply_md5)
-{
- ASSERT(bgp_counter);
- bgp_counter--;
+ return;
- if (p->cf->password && apply_md5)
- if (sk_set_md5_auth(bgp_listen_sk, p->cf->source_addr, p->cf->remote_ip,
- p->cf->iface, NULL, p->cf->setkey) < 0)
- sk_log_error(bgp_listen_sk, p->p.name);
+err2:
+ bgp_close(p);
+err1:
+ p->p.disabled = 1;
+ bgp_store_error(p, NULL, BE_MISC, err_val);
+ proto_notify_state(&p->p, PS_DOWN);
- if (!bgp_counter)
- {
- rfree(bgp_listen_sk);
- bgp_listen_sk = NULL;
- rfree(bgp_linpool);
- bgp_linpool = NULL;
- }
+ return;
}
/**
@@ -210,19 +314,19 @@ bgp_close(struct bgp_proto *p, int apply_md5)
* @t: timer
* @value: time to fire (0 to disable the timer)
*
- * This functions calls tm_start() on @t with time @value and the
- * amount of randomization suggested by the BGP standard. Please use
- * it for all BGP timers.
+ * This functions calls tm_start() on @t with time @value and the amount of
+ * randomization suggested by the BGP standard. Please use it for all BGP
+ * timers.
*/
void
bgp_start_timer(timer *t, int value)
{
if (value)
- {
- /* The randomization procedure is specified in RFC 1771: 9.2.3.3 */
- t->randomize = value / 4;
- tm_start(t, value - t->randomize);
- }
+ {
+ /* The randomization procedure is specified in RFC 1771: 9.2.3.3 */
+ t->randomize = value / 4;
+ tm_start(t, value - t->randomize);
+ }
else
tm_stop(t);
}
@@ -231,8 +335,8 @@ bgp_start_timer(timer *t, int value)
* bgp_close_conn - close a BGP connection
* @conn: connection to close
*
- * This function takes a connection described by the &bgp_conn structure,
- * closes its socket and frees all resources associated with it.
+ * This function takes a connection described by the &bgp_conn structure, closes
+ * its socket and frees all resources associated with it.
*/
void
bgp_close_conn(struct bgp_conn *conn)
@@ -241,16 +345,22 @@ bgp_close_conn(struct bgp_conn *conn)
DBG("BGP: Closing connection\n");
conn->packets_to_send = 0;
- rfree(conn->connect_retry_timer);
- conn->connect_retry_timer = NULL;
+ conn->channels_to_send = 0;
+ rfree(conn->connect_timer);
+ conn->connect_timer = NULL;
rfree(conn->keepalive_timer);
conn->keepalive_timer = NULL;
rfree(conn->hold_timer);
conn->hold_timer = NULL;
- rfree(conn->sk);
- conn->sk = NULL;
rfree(conn->tx_ev);
conn->tx_ev = NULL;
+ rfree(conn->sk);
+ conn->sk = NULL;
+
+ mb_free(conn->local_caps);
+ conn->local_caps = NULL;
+ mb_free(conn->remote_caps);
+ conn->remote_caps = NULL;
}
@@ -258,9 +368,9 @@ bgp_close_conn(struct bgp_conn *conn)
* bgp_update_startup_delay - update a startup delay
* @p: BGP instance
*
- * This function updates a startup delay that is used to postpone next BGP connect.
- * It also handles disable_after_error and might stop BGP instance when error
- * happened and disable_after_error is on.
+ * This function updates a startup delay that is used to postpone next BGP
+ * connect. It also handles disable_after_error and might stop BGP instance
+ * when error happened and disable_after_error is on.
*
* It should be called when BGP protocol error happened.
*/
@@ -277,11 +387,11 @@ bgp_update_startup_delay(struct bgp_proto *p)
p->last_proto_error = now;
if (cf->disable_after_error)
- {
- p->startup_delay = 0;
- p->p.disabled = 1;
- return;
- }
+ {
+ p->startup_delay = 0;
+ p->p.disabled = 1;
+ return;
+ }
if (!p->startup_delay)
p->startup_delay = cf->error_delay_time_min;
@@ -290,32 +400,38 @@ bgp_update_startup_delay(struct bgp_proto *p)
}
static void
-bgp_graceful_close_conn(struct bgp_conn *conn, unsigned subcode)
+bgp_graceful_close_conn(struct bgp_conn *conn, uint subcode)
{
switch (conn->state)
- {
- case BS_IDLE:
- case BS_CLOSE:
- return;
- case BS_CONNECT:
- case BS_ACTIVE:
- bgp_conn_enter_idle_state(conn);
- return;
- case BS_OPENSENT:
- case BS_OPENCONFIRM:
- case BS_ESTABLISHED:
- bgp_error(conn, 6, subcode, NULL, 0);
- return;
- default:
- bug("bgp_graceful_close_conn: Unknown state %d", conn->state);
- }
+ {
+ case BS_IDLE:
+ case BS_CLOSE:
+ return;
+
+ case BS_CONNECT:
+ case BS_ACTIVE:
+ bgp_conn_enter_idle_state(conn);
+ return;
+
+ case BS_OPENSENT:
+ case BS_OPENCONFIRM:
+ case BS_ESTABLISHED:
+ bgp_error(conn, 6, subcode, NULL, 0);
+ return;
+
+ default:
+ bug("bgp_graceful_close_conn: Unknown state %d", conn->state);
+ }
}
static void
bgp_down(struct bgp_proto *p)
{
if (p->start_state > BSS_PREPARE)
- bgp_close(p, 1);
+ {
+ bgp_setup_auth(p, 0);
+ bgp_close(p);
+ }
BGP_TRACE(D_EVENTS, "Down");
proto_notify_state(&p->p, PS_DOWN);
@@ -327,20 +443,20 @@ bgp_decision(void *vp)
struct bgp_proto *p = vp;
DBG("BGP: Decision start\n");
- if ((p->p.proto_state == PS_START)
- && (p->outgoing_conn.state == BS_IDLE)
- && (p->incoming_conn.state != BS_OPENCONFIRM)
- && (!p->cf->passive))
+ if ((p->p.proto_state == PS_START) &&
+ (p->outgoing_conn.state == BS_IDLE) &&
+ (p->incoming_conn.state != BS_OPENCONFIRM) &&
+ !p->cf->passive)
bgp_active(p);
- if ((p->p.proto_state == PS_STOP)
- && (p->outgoing_conn.state == BS_IDLE)
- && (p->incoming_conn.state == BS_IDLE))
+ if ((p->p.proto_state == PS_STOP) &&
+ (p->outgoing_conn.state == BS_IDLE) &&
+ (p->incoming_conn.state == BS_IDLE))
bgp_down(p);
}
void
-bgp_stop(struct bgp_proto *p, unsigned subcode)
+bgp_stop(struct bgp_proto *p, uint subcode)
{
proto_notify_state(&p->p, PS_STOP);
bgp_graceful_close_conn(&p->outgoing_conn, subcode);
@@ -349,7 +465,7 @@ bgp_stop(struct bgp_proto *p, unsigned subcode)
}
static inline void
-bgp_conn_set_state(struct bgp_conn *conn, unsigned new_state)
+bgp_conn_set_state(struct bgp_conn *conn, uint new_state)
{
if (conn->bgp->p.mrtdump & MD_STATES)
mrt_dump_bgp_state_change(conn, conn->state, new_state);
@@ -364,13 +480,17 @@ bgp_conn_enter_openconfirm_state(struct bgp_conn *conn)
bgp_conn_set_state(conn, BS_OPENCONFIRM);
}
+static const struct bgp_af_caps dummy_af_caps = { };
+
void
bgp_conn_enter_established_state(struct bgp_conn *conn)
{
struct bgp_proto *p = conn->bgp;
+ struct bgp_caps *local = conn->local_caps;
+ struct bgp_caps *peer = conn->remote_caps;
+ struct bgp_channel *c;
BGP_TRACE(D_EVENTS, "BGP session established");
- DBG("BGP: UP!!!\n");
/* For multi-hop BGP sessions */
if (ipa_zero(p->source_addr))
@@ -381,30 +501,92 @@ bgp_conn_enter_established_state(struct bgp_conn *conn)
p->conn = conn;
p->last_error_class = 0;
p->last_error_code = 0;
- p->feed_state = BFS_NONE;
- p->load_state = BFS_NONE;
- bgp_init_bucket_table(p);
- bgp_init_prefix_table(p, 8);
- int peer_gr_ready = conn->peer_gr_aware && !(conn->peer_gr_flags & BGP_GRF_RESTART);
+ p->as4_session = conn->as4_session;
- if (p->p.gr_recovery && !peer_gr_ready)
- proto_graceful_restart_unlock(&p->p);
+ p->route_refresh = peer->route_refresh;
+ p->enhanced_refresh = local->enhanced_refresh && peer->enhanced_refresh;
- if (p->p.gr_recovery && (p->cf->gr_mode == BGP_GR_ABLE) && peer_gr_ready)
- p->p.gr_wait = 1;
+ /* Whether we may handle possible GR of peer (it has some AF GR-able) */
+ p->gr_ready = 0; /* Updated later */
- if (p->gr_active)
+ /* Whether peer is ready to handle our GR recovery */
+ int peer_gr_ready = peer->gr_aware && !(peer->gr_flags & BGP_GRF_RESTART);
+
+ if (p->gr_active_num)
tm_stop(p->gr_timer);
- if (p->gr_active && (!conn->peer_gr_able || !(conn->peer_gr_aflags & BGP_GRF_FORWARDING)))
- bgp_graceful_restart_done(p);
+ /* Number of active channels */
+ int num = 0;
+
+ WALK_LIST(c, p->p.channels)
+ {
+ const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi);
+ const struct bgp_af_caps *rem = bgp_find_af_caps(peer, c->afi);
+
+ /* Ignore AFIs that were not announced in multiprotocol capability */
+ if (!loc || !loc->ready)
+ loc = &dummy_af_caps;
+
+ if (!rem || !rem->ready)
+ rem = &dummy_af_caps;
+
+ int active = loc->ready && rem->ready;
+ c->c.disabled = !active;
+ c->c.reloadable = p->route_refresh;
+
+ c->index = active ? num++ : 0;
+
+ c->feed_state = BFS_NONE;
+ c->load_state = BFS_NONE;
+
+ /* Channels where peer may do GR */
+ c->gr_ready = active && local->gr_aware && rem->gr_able;
+ p->gr_ready = p->gr_ready || c->gr_ready;
+
+ /* Channels not able to recover gracefully */
+ if (p->p.gr_recovery && (!active || !peer_gr_ready))
+ channel_graceful_restart_unlock(&c->c);
+
+ /* Channels waiting for local convergence */
+ if (p->p.gr_recovery && loc->gr_able && peer_gr_ready)
+ c->c.gr_wait = 1;
+
+ /* Channels where peer is not able to recover gracefully */
+ if (c->gr_active && ! (c->gr_ready && (rem->gr_af_flags & BGP_GRF_FORWARDING)))
+ bgp_graceful_restart_done(c);
+
+ /* GR capability implies that neighbor will send End-of-RIB */
+ if (peer->gr_aware)
+ c->load_state = BFS_LOADING;
+
+ c->ext_next_hop = c->cf->ext_next_hop && (bgp_channel_is_ipv6(c) || rem->ext_next_hop);
+ c->add_path_rx = (loc->add_path & BGP_ADD_PATH_RX) && (rem->add_path & BGP_ADD_PATH_TX);
+ c->add_path_tx = (loc->add_path & BGP_ADD_PATH_TX) && (rem->add_path & BGP_ADD_PATH_RX);
- /* GR capability implies that neighbor will send End-of-RIB */
- if (conn->peer_gr_aware)
- p->load_state = BFS_LOADING;
+ /* Update RA mode */
+ if (c->add_path_tx)
+ c->c.ra_mode = RA_ANY;
+ else if (c->cf->secondary)
+ c->c.ra_mode = RA_ACCEPTED;
+ else
+ c->c.ra_mode = RA_OPTIMAL;
+ }
+
+ p->afi_map = mb_alloc(p->p.pool, num * sizeof(u32));
+ p->channel_map = mb_alloc(p->p.pool, num * sizeof(void *));
+ p->channel_count = num;
- /* proto_notify_state() will likely call bgp_feed_begin(), setting p->feed_state */
+ WALK_LIST(c, p->p.channels)
+ {
+ if (c->c.disabled)
+ continue;
+
+ p->afi_map[c->index] = c->afi;
+ p->channel_map[c->index] = c;
+ }
+
+ /* proto_notify_state() will likely call bgp_feed_begin(), setting c->feed_state */
bgp_conn_set_state(conn, BS_ESTABLISHED);
proto_notify_state(&p->p, PS_UP);
@@ -416,8 +598,9 @@ bgp_conn_leave_established_state(struct bgp_proto *p)
BGP_TRACE(D_EVENTS, "BGP session closed");
p->conn = NULL;
- bgp_free_prefix_table(p);
- bgp_free_bucket_table(p);
+ // XXXX free these tables to avoid memory leak during graceful restart
+ // bgp_free_prefix_table(p);
+ // bgp_free_bucket_table(p);
if (p->p.proto_state == PS_UP)
bgp_stop(p, 0);
@@ -471,34 +654,57 @@ bgp_handle_graceful_restart(struct bgp_proto *p)
ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready);
BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s",
- p->gr_active ? " - already pending" : "");
- proto_notify_state(&p->p, PS_START);
+ p->gr_active_num ? " - already pending" : "");
- if (p->gr_active)
- rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
+ p->gr_active_num = 0;
- p->gr_active = 1;
- bgp_start_timer(p->gr_timer, p->conn->peer_gr_time);
- rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook);
+ struct bgp_channel *c;
+ WALK_LIST(c, p->p.channels)
+ {
+ if (c->gr_ready)
+ {
+ if (c->gr_active)
+ rt_refresh_end(c->c.table, &c->c);
+
+ c->gr_active = 1;
+ p->gr_active_num++;
+ rt_refresh_begin(c->c.table, &c->c);
+ }
+ else
+ {
+ /* Just flush the routes */
+ rt_refresh_begin(c->c.table, &c->c);
+ rt_refresh_end(c->c.table, &c->c);
+ }
+ }
+
+ proto_notify_state(&p->p, PS_START);
+ bgp_start_timer(p->gr_timer, p->conn->local_caps->gr_time);
}
/**
* bgp_graceful_restart_done - finish active BGP graceful restart
- * @p: BGP instance
+ * @c: BGP channel
*
* This function is called when the active BGP graceful restart of the neighbor
- * should be finished - either successfully (the neighbor sends all paths and
- * reports end-of-RIB on the new session) or unsuccessfully (the neighbor does
- * not support BGP graceful restart on the new session). The function ends
- * routing table refresh cycle and stops BGP restart timer.
+ * should be finished for channel @c - either successfully (the neighbor sends
+ * all paths and reports end-of-RIB for given AFI/SAFI on the new session) or
+ * unsuccessfully (the neighbor does not support BGP graceful restart on the new
+ * session). The function ends the routing table refresh cycle.
*/
void
-bgp_graceful_restart_done(struct bgp_proto *p)
+bgp_graceful_restart_done(struct bgp_channel *c)
{
- BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");
- p->gr_active = 0;
- tm_stop(p->gr_timer);
- rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
+ struct bgp_proto *p = (void *) c->c.proto;
+
+ ASSERT(c->gr_active);
+ c->gr_active = 0;
+ p->gr_active_num--;
+
+ if (!p->gr_active_num)
+ BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");
+
+ rt_refresh_end(c->c.table, &c->c);
}
/**
@@ -522,7 +728,7 @@ bgp_graceful_restart_timeout(timer *t)
/**
* bgp_refresh_begin - start incoming enhanced route refresh sequence
- * @p: BGP instance
+ * @c: BGP channel
*
* This function is called when an incoming enhanced route refresh sequence is
* started by the neighbor, demarcated by the BoRR packet. The function updates
@@ -531,18 +737,20 @@ bgp_graceful_restart_timeout(timer *t)
* ensure that these two sequences do not overlap.
*/
void
-bgp_refresh_begin(struct bgp_proto *p)
+bgp_refresh_begin(struct bgp_channel *c)
{
- if (p->load_state == BFS_LOADING)
- { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; }
+ struct bgp_proto *p = (void *) c->c.proto;
- p->load_state = BFS_REFRESHING;
- rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook);
+ if (c->load_state == BFS_LOADING)
+ { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; }
+
+ c->load_state = BFS_REFRESHING;
+ rt_refresh_begin(c->c.table, &c->c);
}
/**
* bgp_refresh_end - finish incoming enhanced route refresh sequence
- * @p: BGP instance
+ * @c: BGP channel
*
* This function is called when an incoming enhanced route refresh sequence is
* finished by the neighbor, demarcated by the EoRR packet. The function updates
@@ -550,39 +758,26 @@ bgp_refresh_begin(struct bgp_proto *p)
* during the sequence are removed by the nest.
*/
void
-bgp_refresh_end(struct bgp_proto *p)
+bgp_refresh_end(struct bgp_channel *c)
{
- if (p->load_state != BFS_REFRESHING)
- { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; }
+ struct bgp_proto *p = (void *) c->c.proto;
- p->load_state = BFS_NONE;
- rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
+ if (c->load_state != BFS_REFRESHING)
+ { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; }
+
+ c->load_state = BFS_NONE;
+ rt_refresh_end(c->c.table, &c->c);
}
static void
bgp_send_open(struct bgp_conn *conn)
{
- conn->start_state = conn->bgp->start_state;
-
- // Default values, possibly changed by receiving capabilities.
- conn->advertised_as = 0;
- conn->peer_refresh_support = 0;
- conn->peer_as4_support = 0;
- conn->peer_add_path = 0;
- conn->peer_enhanced_refresh_support = 0;
- conn->peer_gr_aware = 0;
- conn->peer_gr_able = 0;
- conn->peer_gr_time = 0;
- conn->peer_gr_flags = 0;
- conn->peer_gr_aflags = 0;
- conn->peer_ext_messages_support = 0;
-
DBG("BGP: Sending open\n");
conn->sk->rx_hook = bgp_rx;
conn->sk->tx_hook = bgp_tx;
- tm_stop(conn->connect_retry_timer);
- bgp_schedule_packet(conn, PKT_OPEN);
+ tm_stop(conn->connect_timer);
+ bgp_schedule_packet(conn, NULL, PKT_OPEN);
bgp_conn_set_state(conn, BS_OPENSENT);
bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time);
}
@@ -605,10 +800,10 @@ bgp_connect_timeout(timer *t)
DBG("BGP: connect_timeout\n");
if (p->p.proto_state == PS_START)
- {
- bgp_close_conn(conn);
- bgp_connect(p);
- }
+ {
+ bgp_close_conn(conn);
+ bgp_connect(p);
+ }
else
bgp_conn_enter_idle_state(conn);
}
@@ -672,7 +867,7 @@ bgp_keepalive_timeout(timer *t)
struct bgp_conn *conn = t->data;
DBG("BGP: Keepalive timer\n");
- bgp_schedule_packet(conn, PKT_KEEPALIVE);
+ bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
/* Kick TX a bit faster */
if (ev_active(conn->tx_ev))
@@ -682,21 +877,18 @@ bgp_keepalive_timeout(timer *t)
static void
bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn)
{
- timer *t;
-
conn->sk = NULL;
conn->bgp = p;
+
conn->packets_to_send = 0;
+ conn->channels_to_send = 0;
+ conn->last_channel = 0;
+ conn->last_channel_count = 0;
+
+ conn->connect_timer = tm_new_set(p->p.pool, bgp_connect_timeout, conn, 0, 0);
+ conn->hold_timer = tm_new_set(p->p.pool, bgp_hold_timeout, conn, 0, 0);
+ conn->keepalive_timer = tm_new_set(p->p.pool, bgp_keepalive_timeout, conn, 0, 0);
- t = conn->connect_retry_timer = tm_new(p->p.pool);
- t->hook = bgp_connect_timeout;
- t->data = conn;
- t = conn->hold_timer = tm_new(p->p.pool);
- t->hook = bgp_hold_timeout;
- t->data = conn;
- t = conn->keepalive_timer = tm_new(p->p.pool);
- t->hook = bgp_keepalive_timeout;
- t->data = conn;
conn->tx_ev = ev_new(p->p.pool);
conn->tx_ev->hook = bgp_kick_tx;
conn->tx_ev->data = conn;
@@ -720,7 +912,7 @@ bgp_active(struct bgp_proto *p)
BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay);
bgp_setup_conn(p, conn);
bgp_conn_set_state(conn, BS_ACTIVE);
- bgp_start_timer(conn->connect_retry_timer, delay);
+ bgp_start_timer(conn->connect_timer, delay);
}
/**
@@ -734,12 +926,11 @@ bgp_active(struct bgp_proto *p)
static void
bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing connection */
{
- sock *s;
struct bgp_conn *conn = &p->outgoing_conn;
int hops = p->cf->multihop ? : 1;
DBG("BGP: Connecting\n");
- s = sk_new(p->p.pool);
+ sock *s = sk_new(p->p.pool);
s->type = SK_TCP_ACTIVE;
s->saddr = p->source_addr;
s->daddr = p->cf->remote_ip;
@@ -766,10 +957,10 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c
goto err;
DBG("BGP: Waiting for connect success\n");
- bgp_start_timer(conn->connect_retry_timer, p->cf->connect_retry_time);
+ bgp_start_timer(conn->connect_timer, p->cf->connect_retry_time);
return;
- err:
+err:
sk_log_error(s, p->p.name);
bgp_sock_err(s, 0);
return;
@@ -783,16 +974,15 @@ bgp_connect(struct bgp_proto *p) /* Enter Connect state and start establishing c
static struct bgp_proto *
bgp_find_proto(sock *sk)
{
- struct proto_config *pc;
+ struct bgp_proto *p;
- WALK_LIST(pc, config->protos)
- if ((pc->protocol == &proto_bgp) && pc->proto)
- {
- struct bgp_proto *p = (struct bgp_proto *) pc->proto;
- if (ipa_equal(p->cf->remote_ip, sk->daddr) &&
- (!p->cf->iface || (p->cf->iface == sk->iface)))
- return p;
- }
+ WALK_LIST(p, proto_list)
+ if ((p->p.proto == &proto_bgp) &&
+ ipa_equal(p->cf->remote_ip, sk->daddr) &&
+ (!p->cf->iface || (p->cf->iface == sk->iface)) &&
+ (ipa_zero(p->cf->local_ip) || ipa_equal(p->cf->local_ip, sk->saddr)) &&
+ (p->cf->local_port == sk->sport))
+ return p;
return NULL;
}
@@ -818,12 +1008,12 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED)
DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport);
p = bgp_find_proto(sk);
if (!p)
- {
- log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)",
- sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport);
- rfree(sk);
- return 0;
- }
+ {
+ log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)",
+ sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport);
+ rfree(sk);
+ return 0;
+ }
/*
* BIRD should keep multiple incoming connections in OpenSent state (for
@@ -836,26 +1026,26 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED)
(p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk);
if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
- {
- bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART);
- bgp_handle_graceful_restart(p);
- bgp_conn_enter_idle_state(p->conn);
- acc = 1;
-
- /* There might be separate incoming connection in OpenSent state */
- if (p->incoming_conn.state > BS_ACTIVE)
- bgp_close_conn(&p->incoming_conn);
- }
+ {
+ bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART);
+ bgp_handle_graceful_restart(p);
+ bgp_conn_enter_idle_state(p->conn);
+ acc = 1;
+
+ /* There might be separate incoming connection in OpenSent state */
+ if (p->incoming_conn.state > BS_ACTIVE)
+ bgp_close_conn(&p->incoming_conn);
+ }
BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL,
sk->dport, acc ? "accepted" : "rejected");
if (!acc)
- {
- rfree(sk);
- return 0;
- }
+ {
+ rfree(sk);
+ return 0;
+ }
hops = p->cf->multihop ? : 1;
@@ -867,11 +1057,11 @@ bgp_incoming_connection(sock *sk, uint dummy UNUSED)
goto err;
if (p->cf->enable_extended_messages)
- {
- sk->rbsize = BGP_RX_BUFFER_EXT_SIZE;
- sk->tbsize = BGP_TX_BUFFER_EXT_SIZE;
- sk_reallocate(sk);
- }
+ {
+ sk->rbsize = BGP_RX_BUFFER_EXT_SIZE;
+ sk->tbsize = BGP_TX_BUFFER_EXT_SIZE;
+ sk_reallocate(sk);
+ }
bgp_setup_conn(p, &p->incoming_conn);
bgp_setup_sk(&p->incoming_conn, sk);
@@ -894,34 +1084,6 @@ bgp_listen_sock_err(sock *sk UNUSED, int err)
log(L_ERR "BGP: Error on listening socket: %M", err);
}
-static sock *
-bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags)
-{
- sock *s = sk_new(&root_pool);
- DBG("BGP: Creating listening socket\n");
- s->type = SK_TCP_PASSIVE;
- s->ttl = 255;
- s->saddr = addr;
- s->sport = port ? port : BGP_PORT;
- s->flags = flags ? 0 : SKF_V6ONLY;
- s->tos = IP_PREC_INTERNET_CONTROL;
- s->rbsize = BGP_RX_BUFFER_SIZE;
- s->tbsize = BGP_TX_BUFFER_SIZE;
- s->rx_hook = bgp_incoming_connection;
- s->err_hook = bgp_listen_sock_err;
-
- if (sk_open(s) < 0)
- goto err;
-
- return s;
-
- err:
- sk_log_error(s, "BGP");
- log(L_ERR "BGP: Cannot open listening socket");
- rfree(s);
- return NULL;
-}
-
static void
bgp_start_neighbor(struct bgp_proto *p)
{
@@ -930,23 +1092,21 @@ bgp_start_neighbor(struct bgp_proto *p)
if (ipa_zero(p->source_addr))
p->source_addr = p->neigh->ifa->ip;
-#ifdef IPV6
+ if (ipa_is_link_local(p->source_addr))
+ p->link_addr = p->source_addr;
+ else
{
+ /* Find some link-local address for given iface */
struct ifa *a;
- p->local_link = IPA_NONE;
WALK_LIST(a, p->neigh->iface->addrs)
if (a->scope == SCOPE_LINK)
- {
- p->local_link = a->ip;
- break;
- }
-
- if (! ipa_nonzero(p->local_link))
- log(L_WARN "%s: Missing link local address on interface %s", p->p.name, p->neigh->iface->name);
+ {
+ p->link_addr = a->ip;
+ break;
+ }
- DBG("BGP: Selected link-level address %I\n", p->local_link);
+ DBG("%s: Selected link-local address %I\n", p->p.name, p->link_addr);
}
-#endif
bgp_initiate(p);
}
@@ -966,34 +1126,34 @@ bgp_neigh_notify(neighbor *n)
int prepare = (ps == PS_START) && (p->start_state == BSS_PREPARE);
if (n->scope <= 0)
+ {
+ if (!prepare)
{
- if (!prepare)
- {
- BGP_TRACE(D_EVENTS, "Neighbor lost");
- bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST);
- /* Perhaps also run bgp_update_startup_delay(p)? */
- bgp_stop(p, 0);
- }
+ BGP_TRACE(D_EVENTS, "Neighbor lost");
+ bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST);
+ /* Perhaps also run bgp_update_startup_delay(p)? */
+ bgp_stop(p, 0);
}
+ }
else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
+ {
+ if (!prepare)
{
- if (!prepare)
- {
- BGP_TRACE(D_EVENTS, "Link down");
- bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN);
- if (ps == PS_UP)
- bgp_update_startup_delay(p);
- bgp_stop(p, 0);
- }
+ BGP_TRACE(D_EVENTS, "Link down");
+ bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN);
+ if (ps == PS_UP)
+ bgp_update_startup_delay(p);
+ bgp_stop(p, 0);
}
+ }
else
+ {
+ if (prepare)
{
- if (prepare)
- {
- BGP_TRACE(D_EVENTS, "Neighbor ready");
- bgp_start_neighbor(p);
- }
+ BGP_TRACE(D_EVENTS, "Neighbor ready");
+ bgp_start_neighbor(p);
}
+ }
}
static void
@@ -1003,13 +1163,13 @@ bgp_bfd_notify(struct bfd_request *req)
int ps = p->p.proto_state;
if (req->down && ((ps == PS_START) || (ps == PS_UP)))
- {
- BGP_TRACE(D_EVENTS, "BFD session down");
- bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN);
- if (ps == PS_UP)
- bgp_update_startup_delay(p);
- bgp_stop(p, 0);
- }
+ {
+ BGP_TRACE(D_EVENTS, "BFD session down");
+ bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN);
+ if (ps == PS_UP)
+ bgp_update_startup_delay(p);
+ bgp_stop(p, 0);
+ }
}
static void
@@ -1021,71 +1181,72 @@ bgp_update_bfd(struct bgp_proto *p, int use_bfd)
bgp_bfd_notify, p);
if (!use_bfd && p->bfd_req)
- {
- rfree(p->bfd_req);
- p->bfd_req = NULL;
- }
+ {
+ rfree(p->bfd_req);
+ p->bfd_req = NULL;
+ }
}
-static int
-bgp_reload_routes(struct proto *P)
+static void
+bgp_reload_routes(struct channel *C)
{
- struct bgp_proto *p = (struct bgp_proto *) P;
- if (!p->conn || !p->conn->peer_refresh_support)
- return 0;
+ struct bgp_proto *p = (void *) C->proto;
+ struct bgp_channel *c = (void *) C;
- bgp_schedule_packet(p->conn, PKT_ROUTE_REFRESH);
- return 1;
+ ASSERT(p->conn && p->route_refresh);
+
+ bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH);
}
static void
-bgp_feed_begin(struct proto *P, int initial)
+bgp_feed_begin(struct channel *C, int initial)
{
- struct bgp_proto *p = (struct bgp_proto *) P;
+ struct bgp_proto *p = (void *) C->proto;
+ struct bgp_channel *c = (void *) C;
/* This should not happen */
if (!p->conn)
return;
if (initial && p->cf->gr_mode)
- p->feed_state = BFS_LOADING;
+ c->feed_state = BFS_LOADING;
/* It is refeed and both sides support enhanced route refresh */
- if (!initial && p->cf->enable_refresh &&
- p->conn->peer_enhanced_refresh_support)
- {
- /* BoRR must not be sent before End-of-RIB */
- if (p->feed_state == BFS_LOADING || p->feed_state == BFS_LOADED)
- return;
+ if (!initial && p->enhanced_refresh)
+ {
+ /* BoRR must not be sent before End-of-RIB */
+ if (c->feed_state == BFS_LOADING || c->feed_state == BFS_LOADED)
+ return;
- p->feed_state = BFS_REFRESHING;
- bgp_schedule_packet(p->conn, PKT_BEGIN_REFRESH);
- }
+ c->feed_state = BFS_REFRESHING;
+ bgp_schedule_packet(p->conn, c, PKT_BEGIN_REFRESH);
+ }
}
static void
-bgp_feed_end(struct proto *P)
+bgp_feed_end(struct channel *C)
{
- struct bgp_proto *p = (struct bgp_proto *) P;
+ struct bgp_proto *p = (void *) C->proto;
+ struct bgp_channel *c = (void *) C;
/* This should not happen */
if (!p->conn)
return;
/* Non-demarcated feed ended, nothing to do */
- if (p->feed_state == BFS_NONE)
+ if (c->feed_state == BFS_NONE)
return;
/* Schedule End-of-RIB packet */
- if (p->feed_state == BFS_LOADING)
- p->feed_state = BFS_LOADED;
+ if (c->feed_state == BFS_LOADING)
+ c->feed_state = BFS_LOADED;
/* Schedule EoRR packet */
- if (p->feed_state == BFS_REFRESHING)
- p->feed_state = BFS_REFRESHED;
+ if (c->feed_state == BFS_REFRESHING)
+ c->feed_state = BFS_REFRESHED;
/* Kick TX hook */
- bgp_schedule_packet(p->conn, PKT_UPDATE);
+ bgp_schedule_packet(p->conn, c, PKT_UPDATE);
}
@@ -1096,30 +1257,30 @@ bgp_start_locked(struct object_lock *lock)
struct bgp_config *cf = p->cf;
if (p->p.proto_state != PS_START)
- {
- DBG("BGP: Got lock in different state %d\n", p->p.proto_state);
- return;
- }
+ {
+ DBG("BGP: Got lock in different state %d\n", p->p.proto_state);
+ return;
+ }
DBG("BGP: Got lock\n");
if (cf->multihop)
- {
- /* Multi-hop sessions do not use neighbor entries */
- bgp_initiate(p);
- return;
- }
+ {
+ /* Multi-hop sessions do not use neighbor entries */
+ bgp_initiate(p);
+ return;
+ }
neighbor *n = neigh_find2(&p->p, &cf->remote_ip, cf->iface, NEF_STICKY);
if (!n)
- {
- log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface);
- /* As we do not start yet, we can just disable protocol */
- p->p.disabled = 1;
- bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP);
- proto_notify_state(&p->p, PS_DOWN);
- return;
- }
+ {
+ log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface);
+ /* As we do not start yet, we can just disable protocol */
+ p->p.disabled = 1;
+ bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP);
+ proto_notify_state(&p->p, PS_DOWN);
+ return;
+ }
p->neigh = n;
@@ -1144,9 +1305,7 @@ bgp_start(struct proto *P)
p->neigh = NULL;
p->bfd_req = NULL;
p->gr_ready = 0;
- p->gr_active = 0;
-
- rt_lock_table(p->igp_table);
+ p->gr_active_num = 0;
p->event = ev_new(p->p.pool);
p->event->hook = bgp_decision;
@@ -1165,15 +1324,20 @@ bgp_start(struct proto *P)
p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;
p->remote_id = 0;
- p->source_addr = p->cf->source_addr;
+ p->source_addr = p->cf->local_ip;
+ p->link_addr = IPA_NONE;
+ /* XXXX */
if (p->p.gr_recovery && p->cf->gr_mode)
- proto_graceful_restart_lock(P);
+ {
+ struct bgp_channel *c;
+ WALK_LIST(c, p->p.channels)
+ channel_graceful_restart_lock(&c->c);
+ }
/*
- * Before attempting to create the connection, we need to lock the
- * port, so that are sure we're the only instance attempting to talk
- * with that neighbor.
+ * Before attempting to create the connection, we need to lock the port,
+ * so that we are the only instance attempting to talk with that neighbor.
*/
lock = p->lock = olock_new(P->pool);
@@ -1194,78 +1358,64 @@ static int
bgp_shutdown(struct proto *P)
{
struct bgp_proto *p = (struct bgp_proto *) P;
- unsigned subcode = 0;
+ uint subcode = 0;
BGP_TRACE(D_EVENTS, "Shutdown requested");
switch (P->down_code)
- {
- case PDC_CF_REMOVE:
- case PDC_CF_DISABLE:
- subcode = 3; // Errcode 6, 3 - peer de-configured
- break;
-
- case PDC_CF_RESTART:
- subcode = 6; // Errcode 6, 6 - other configuration change
- break;
-
- case PDC_CMD_DISABLE:
- case PDC_CMD_SHUTDOWN:
- subcode = 2; // Errcode 6, 2 - administrative shutdown
- break;
-
- case PDC_CMD_RESTART:
- subcode = 4; // Errcode 6, 4 - administrative reset
- break;
-
- case PDC_RX_LIMIT_HIT:
- case PDC_IN_LIMIT_HIT:
- subcode = 1; // Errcode 6, 1 - max number of prefixes reached
- /* log message for compatibility */
- log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name);
- goto limit;
-
- case PDC_OUT_LIMIT_HIT:
- subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown
-
- limit:
- bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED);
- if (proto_restart)
- bgp_update_startup_delay(p);
- else
- p->startup_delay = 0;
- goto done;
- }
+ {
+ case PDC_CF_REMOVE:
+ case PDC_CF_DISABLE:
+ subcode = 3; // Errcode 6, 3 - peer de-configured
+ break;
+
+ case PDC_CF_RESTART:
+ subcode = 6; // Errcode 6, 6 - other configuration change
+ break;
+
+ case PDC_CMD_DISABLE:
+ case PDC_CMD_SHUTDOWN:
+ subcode = 2; // Errcode 6, 2 - administrative shutdown
+ break;
+
+ case PDC_CMD_RESTART:
+ subcode = 4; // Errcode 6, 4 - administrative reset
+ break;
+
+ case PDC_RX_LIMIT_HIT:
+ case PDC_IN_LIMIT_HIT:
+ subcode = 1; // Errcode 6, 1 - max number of prefixes reached
+ /* log message for compatibility */
+ log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name);
+ goto limit;
+
+ case PDC_OUT_LIMIT_HIT:
+ subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown
+
+ limit:
+ bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED);
+ if (proto_restart)
+ bgp_update_startup_delay(p);
+ else
+ p->startup_delay = 0;
+ goto done;
+ }
bgp_store_error(p, NULL, BE_MAN_DOWN, 0);
p->startup_delay = 0;
- done:
+done:
bgp_stop(p, subcode);
return p->p.proto_state;
}
-static void
-bgp_cleanup(struct proto *P)
-{
- struct bgp_proto *p = (struct bgp_proto *) P;
- rt_unlock_table(p->igp_table);
-}
-
-static rtable *
-get_igp_table(struct bgp_config *cf)
-{
- return cf->igp_table ? cf->igp_table->table : cf->c.table->table;
-}
-
static struct proto *
-bgp_init(struct proto_config *C)
+bgp_init(struct proto_config *CF)
{
- struct proto *P = proto_new(C, sizeof(struct bgp_proto));
- struct bgp_config *c = (struct bgp_config *) C;
+ struct proto *P = proto_new(CF);
struct bgp_proto *p = (struct bgp_proto *) P;
+ struct bgp_config *cf = (struct bgp_config *) CF;
- P->accept_ra_types = c->secondary ? RA_ACCEPTED : RA_OPTIMAL;
P->rt_notify = bgp_rt_notify;
P->import_control = bgp_import_control;
P->neigh_notify = bgp_neigh_notify;
@@ -1274,102 +1424,276 @@ bgp_init(struct proto_config *C)
P->feed_end = bgp_feed_end;
P->rte_better = bgp_rte_better;
P->rte_mergable = bgp_rte_mergable;
- P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL;
-
- p->cf = c;
- p->local_as = c->local_as;
- p->remote_as = c->remote_as;
- p->is_internal = (c->local_as == c->remote_as);
- p->rs_client = c->rs_client;
- p->rr_client = c->rr_client;
- p->igp_table = get_igp_table(c);
+ P->rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL;
+
+ p->cf = cf;
+ p->local_as = cf->local_as;
+ p->remote_as = cf->remote_as;
+ p->public_as = cf->local_as;
+ p->is_internal = (cf->local_as == cf->remote_as);
+ p->is_interior = p->is_internal || cf->confederation_member;
+ p->rs_client = cf->rs_client;
+ p->rr_client = cf->rr_client;
+
+ /* Confederation ID is used for truly external peers */
+ if (cf->confederation && !p->is_interior)
+ p->public_as = cf->confederation;
+
+ /* Add all channels */
+ struct bgp_channel_config *cc;
+ WALK_LIST(cc, CF->channels)
+ proto_add_channel(P, &cc->c);
return P;
}
+static void
+bgp_channel_init(struct channel *C, struct channel_config *CF)
+{
+ struct bgp_channel *c = (void *) C;
+ struct bgp_channel_config *cf = (void *) CF;
+
+ c->cf = cf;
+ c->afi = cf->afi;
+ c->desc = cf->desc;
+
+ if (cf->igp_table_ip4)
+ c->igp_table_ip4 = cf->igp_table_ip4->table;
+
+ if (cf->igp_table_ip6)
+ c->igp_table_ip6 = cf->igp_table_ip6->table;
+}
+
+static int
+bgp_channel_start(struct channel *C)
+{
+ struct bgp_proto *p = (void *) C->proto;
+ struct bgp_channel *c = (void *) C;
+ ip_addr src = p->source_addr;
+
+ if (c->igp_table_ip4)
+ rt_lock_table(c->igp_table_ip4);
+
+ if (c->igp_table_ip6)
+ rt_lock_table(c->igp_table_ip6);
+
+ c->pool = p->p.pool; // XXXX
+ bgp_init_bucket_table(c);
+ bgp_init_prefix_table(c);
+
+ c->next_hop_addr = c->cf->next_hop_addr;
+ c->link_addr = IPA_NONE;
+ c->packets_to_send = 0;
+
+ /* Try to use source address as next hop address */
+ if (ipa_zero(c->next_hop_addr))
+ {
+ if (bgp_channel_is_ipv4(c) && (ipa_is_ip4(src) || c->ext_next_hop))
+ c->next_hop_addr = src;
+
+ if (bgp_channel_is_ipv6(c) && (ipa_is_ip6(src) || c->ext_next_hop))
+ c->next_hop_addr = src;
+ }
+
+ /* Exit if no feasible next hop address is found */
+ if (ipa_zero(c->next_hop_addr))
+ {
+ log(L_WARN "%s: Missing next hop address", p->p.name);
+ return 0;
+ }
+
+ /* Set link-local address for IPv6 single-hop BGP */
+ if (ipa_is_ip6(c->next_hop_addr) && p->neigh)
+ {
+ c->link_addr = p->link_addr;
+
+ if (ipa_zero(c->link_addr))
+ log(L_WARN "%s: Missing link-local address", p->p.name);
+ }
+
+ /* Link local address is already in c->link_addr */
+ if (ipa_is_link_local(c->next_hop_addr))
+ c->next_hop_addr = IPA_NONE;
+
+ return 0; /* XXXX: Currently undefined */
+}
+
+static void
+bgp_channel_shutdown(struct channel *C)
+{
+ struct bgp_channel *c = (void *) C;
+
+ /* XXXX: cleanup bucket and prefix tables */
+
+ c->next_hop_addr = IPA_NONE;
+ c->link_addr = IPA_NONE;
+}
+
+static void
+bgp_channel_cleanup(struct channel *C)
+{
+ struct bgp_channel *c = (void *) C;
+
+ if (c->igp_table_ip4)
+ rt_unlock_table(c->igp_table_ip4);
+
+ if (c->igp_table_ip6)
+ rt_unlock_table(c->igp_table_ip6);
+}
+
+static inline struct bgp_channel_config *
+bgp_find_channel_config(struct bgp_config *cf, u32 afi)
+{
+ struct bgp_channel_config *cc;
+
+ WALK_LIST(cc, cf->c.channels)
+ if (cc->afi == afi)
+ return cc;
+
+ return NULL;
+}
+
+struct rtable_config *
+bgp_default_igp_table(struct bgp_config *cf, struct bgp_channel_config *cc, u32 type)
+{
+ struct bgp_channel_config *cc2;
+ struct rtable_config *tab;
+
+ /* First, try table connected by the channel */
+ if (cc->c.table->addr_type == type)
+ return cc->c.table;
+
+ /* Find paired channel with the same SAFI but the other AFI */
+ u32 afi2 = cc->afi ^ 0x30000;
+ cc2 = bgp_find_channel_config(cf, afi2);
+
+ /* Second, try IGP table configured in the paired channel */
+ if (cc2 && (tab = (type == NET_IP4) ? cc2->igp_table_ip4 : cc2->igp_table_ip6))
+ return tab;
+
+ /* Third, try table connected by the paired channel */
+ if (cc2 && (cc2->c.table->addr_type == type))
+ return cc2->c.table;
+
+ /* Last, try default table of given type */
+ if (tab = cf->c.global->def_tables[type])
+ return tab;
+
+ cf_error("Undefined IGP table");
+}
+
void
-bgp_check_config(struct bgp_config *c)
+bgp_postconfig(struct proto_config *CF)
{
- int internal = (c->local_as == c->remote_as);
+ struct bgp_config *cf = (void *) CF;
+ int internal = (cf->local_as == cf->remote_as);
/* Do not check templates at all */
- if (c->c.class == SYM_TEMPLATE)
+ if (cf->c.class == SYM_TEMPLATE)
return;
/* EBGP direct by default, IBGP multihop by default */
- if (c->multihop < 0)
- c->multihop = internal ? 64 : 0;
-
- /* Different default for gw_mode */
- if (!c->gw_mode)
- c->gw_mode = c->multihop ? GW_RECURSIVE : GW_DIRECT;
-
- /* Different default based on rs_client */
- if (!c->missing_lladdr)
- c->missing_lladdr = c->rs_client ? MLL_IGNORE : MLL_SELF;
-
- /* Disable after error incompatible with restart limit action */
- if (c->c.in_limit && (c->c.in_limit->action == PLA_RESTART) && c->disable_after_error)
- c->c.in_limit->action = PLA_DISABLE;
+ if (cf->multihop < 0)
+ cf->multihop = internal ? 64 : 0;
- if (!c->local_as)
+ if (!cf->local_as)
cf_error("Local AS number must be set");
- if (ipa_zero(c->remote_ip))
+ if (ipa_zero(cf->remote_ip))
cf_error("Neighbor must be configured");
- if (!c->remote_as)
+ if (!cf->remote_as)
cf_error("Remote AS number must be set");
- if (ipa_is_link_local(c->remote_ip) && !c->iface)
+ if (ipa_is_link_local(cf->remote_ip) && !cf->iface)
cf_error("Link-local neighbor address requires specified interface");
- if (!(c->capabilities && c->enable_as4) && (c->remote_as > 0xFFFF))
+ if (!(cf->capabilities && cf->enable_as4) && (cf->remote_as > 0xFFFF))
cf_error("Neighbor AS number out of range (AS4 not available)");
- if (!internal && c->rr_client)
+ if (!internal && cf->rr_client)
cf_error("Only internal neighbor can be RR client");
- if (internal && c->rs_client)
+ if (internal && cf->rs_client)
cf_error("Only external neighbor can be RS client");
- if (c->multihop && (c->gw_mode == GW_DIRECT))
- cf_error("Multihop BGP cannot use direct gateway mode");
+ if (!cf->confederation && cf->confederation_member)
+ cf_error("Confederation ID must be set for member sessions");
- if (c->multihop && (ipa_is_link_local(c->remote_ip) ||
- ipa_is_link_local(c->source_addr)))
+ if (cf->multihop && (ipa_is_link_local(cf->local_ip) ||
+ ipa_is_link_local(cf->remote_ip)))
cf_error("Multihop BGP cannot be used with link-local addresses");
- if (c->multihop && c->iface)
+ if (cf->multihop && cf->iface)
cf_error("Multihop BGP cannot be bound to interface");
- if (c->multihop && c->check_link)
+ if (cf->multihop && cf->check_link)
cf_error("Multihop BGP cannot depend on link state");
- if (c->multihop && c->bfd && ipa_zero(c->source_addr))
- cf_error("Multihop BGP with BFD requires specified source address");
+ if (cf->multihop && cf->bfd && ipa_zero(cf->local_ip))
+ cf_error("Multihop BGP with BFD requires specified local address");
- if ((c->gw_mode == GW_RECURSIVE) && c->c.table->sorted)
- cf_error("BGP in recursive mode prohibits sorted table");
- if (c->deterministic_med && c->c.table->sorted)
- cf_error("BGP with deterministic MED prohibits sorted table");
+ struct bgp_channel_config *cc;
+ WALK_LIST(cc, CF->channels)
+ {
+ /* Disable after error incompatible with restart limit action */
+ if ((cc->c.in_limit.action == PLA_RESTART) && cf->disable_after_error)
+ cc->c.in_limit.action = PLA_DISABLE;
+
+ /* Different default based on rs_client */
+ if (!cc->missing_lladdr)
+ cc->missing_lladdr = cf->rs_client ? MLL_IGNORE : MLL_SELF;
+
+ /* Different default for gw_mode */
+ if (!cc->gw_mode)
+ cc->gw_mode = cf->multihop ? GW_RECURSIVE : GW_DIRECT;
+
+ /* Default based on proto config */
+ if (cc->gr_able == 0xff)
+ cc->gr_able = (cf->gr_mode == BGP_GR_ABLE);
+
+ /* Default values of IGP tables */
+ if ((cc->gw_mode == GW_RECURSIVE) && !cc->desc->no_igp)
+ {
+ if (!cc->igp_table_ip4 && (bgp_cc_is_ipv4(cc) || cc->ext_next_hop))
+ cc->igp_table_ip4 = bgp_default_igp_table(cf, cc, NET_IP4);
+
+ if (!cc->igp_table_ip6 && (bgp_cc_is_ipv6(cc) || cc->ext_next_hop))
+ cc->igp_table_ip6 = bgp_default_igp_table(cf, cc, NET_IP6);
- if (c->secondary && !c->c.table->sorted)
- cf_error("BGP with secondary option requires sorted table");
+ if (cc->igp_table_ip4 && bgp_cc_is_ipv6(cc) && !cc->ext_next_hop)
+ cf_error("Mismatched IGP table type");
+
+ if (cc->igp_table_ip6 && bgp_cc_is_ipv4(cc) && !cc->ext_next_hop)
+ cf_error("Mismatched IGP table type");
+ }
+
+ if (cf->multihop && (cc->gw_mode == GW_DIRECT))
+ cf_error("Multihop BGP cannot use direct gateway mode");
+
+ if ((cc->gw_mode == GW_RECURSIVE) && cc->c.table->sorted)
+ cf_error("BGP in recursive mode prohibits sorted table");
+
+ if (cf->deterministic_med && cc->c.table->sorted)
+ cf_error("BGP with deterministic MED prohibits sorted table");
+
+ if (cc->secondary && !cc->c.table->sorted)
+ cf_error("BGP with secondary option requires sorted table");
+ }
}
static int
-bgp_reconfigure(struct proto *P, struct proto_config *C)
+bgp_reconfigure(struct proto *P, struct proto_config *CF)
{
- struct bgp_config *new = (struct bgp_config *) C;
- struct bgp_proto *p = (struct bgp_proto *) P;
+ struct bgp_proto *p = (void *) P;
+ struct bgp_config *new = (void *) CF;
struct bgp_config *old = p->cf;
- if (proto_get_router_id(C) != p->local_id)
+ if (proto_get_router_id(CF) != p->local_id)
return 0;
int same = !memcmp(((byte *) old) + sizeof(struct proto_config),
@@ -1377,8 +1701,26 @@ bgp_reconfigure(struct proto *P, struct proto_config *C)
// password item is last and must be checked separately
OFFSETOF(struct bgp_config, password) - sizeof(struct proto_config))
&& ((!old->password && !new->password)
- || (old->password && new->password && !strcmp(old->password, new->password)))
- && (get_igp_table(old) == get_igp_table(new));
+ || (old->password && new->password && !strcmp(old->password, new->password)));
+
+ /* FIXME: Move channel reconfiguration to generic protocol code ? */
+ struct channel *C, *C2;
+ struct bgp_channel_config *cc;
+
+ WALK_LIST(C, p->p.channels)
+ C->stale = 1;
+
+ WALK_LIST(cc, new->c.channels)
+ {
+ C = (struct channel *) bgp_find_channel(p, cc->afi);
+ same = proto_configure_channel(P, &C, &cc->c) && same;
+ C->stale = 0;
+ }
+
+ WALK_LIST_DELSAFE(C, C2, p->p.channels)
+ if (C->stale)
+ same = proto_configure_channel(P, &C, NULL) && same;
+
if (same && (p->start_state > BSS_PREPARE))
bgp_update_bfd(p, new->bfd);
@@ -1390,11 +1732,34 @@ bgp_reconfigure(struct proto *P, struct proto_config *C)
return same;
}
+#define IGP_TABLE(cf, sym) ((cf)->igp_table_##sym ? (cf)->igp_table_##sym ->table : NULL )
+
+static int
+bgp_channel_reconfigure(struct channel *C, struct channel_config *CC)
+{
+ struct bgp_channel *c = (void *) C;
+ struct bgp_channel_config *new = (void *) CC;
+ struct bgp_channel_config *old = c->cf;
+
+ if (memcmp(((byte *) old) + sizeof(struct channel_config),
+ ((byte *) new) + sizeof(struct channel_config),
+ /* Remaining items must be checked separately */
+ OFFSETOF(struct bgp_channel_config, rest) - sizeof(struct channel_config)))
+ return 0;
+
+ /* Check change in IGP tables */
+ if ((IGP_TABLE(old, ip4) != IGP_TABLE(new, ip4)) ||
+ (IGP_TABLE(old, ip6) != IGP_TABLE(new, ip6)))
+ return 0;
+
+ c->cf = new;
+ return 1;
+}
+
static void
-bgp_copy_config(struct proto_config *dest, struct proto_config *src)
+bgp_copy_config(struct proto_config *dest UNUSED, struct proto_config *src UNUSED)
{
/* Just a shallow copy */
- proto_copy_rest(dest, src, sizeof(struct bgp_config));
}
@@ -1411,14 +1776,14 @@ bgp_copy_config(struct proto_config *dest, struct proto_config *src)
* closes the connection.
*/
void
-bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int len)
+bgp_error(struct bgp_conn *c, uint code, uint subcode, byte *data, int len)
{
struct bgp_proto *p = c->bgp;
if (c->state == BS_CLOSE)
return;
- bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, (len > 0) ? len : -len);
+ bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, ABS(len));
bgp_store_error(p, c, BE_BGP_TX, (code << 16) | subcode);
bgp_conn_enter_close_state(c);
@@ -1426,13 +1791,13 @@ bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int l
c->notify_subcode = subcode;
c->notify_data = data;
c->notify_size = (len > 0) ? len : 0;
- bgp_schedule_packet(c, PKT_NOTIFICATION);
+ bgp_schedule_packet(c, NULL, PKT_NOTIFICATION);
if (code != 6)
- {
- bgp_update_startup_delay(p);
- bgp_stop(p, 0);
- }
+ {
+ bgp_update_startup_delay(p);
+ bgp_stop(p, 0);
+ }
}
/**
@@ -1471,19 +1836,19 @@ static const char *
bgp_last_errmsg(struct bgp_proto *p)
{
switch (p->last_error_class)
- {
- case BE_MISC:
- return bgp_misc_errors[p->last_error_code];
- case BE_SOCKET:
- return (p->last_error_code == 0) ? "Connection closed" : strerror(p->last_error_code);
- case BE_BGP_RX:
- case BE_BGP_TX:
- return bgp_error_dsc(p->last_error_code >> 16, p->last_error_code & 0xFF);
- case BE_AUTO_DOWN:
- return bgp_auto_errors[p->last_error_code];
- default:
- return "";
- }
+ {
+ case BE_MISC:
+ return bgp_misc_errors[p->last_error_code];
+ case BE_SOCKET:
+ return (p->last_error_code == 0) ? "Connection closed" : strerror(p->last_error_code);
+ case BE_BGP_RX:
+ case BE_BGP_TX:
+ return bgp_error_dsc(p->last_error_code >> 16, p->last_error_code & 0xFF);
+ case BE_AUTO_DOWN:
+ return bgp_auto_errors[p->last_error_code];
+ default:
+ return "";
+ }
}
static const char *
@@ -1514,48 +1879,164 @@ bgp_get_status(struct proto *P, byte *buf)
}
static void
+bgp_show_afis(int code, char *s, u32 *afis, uint count)
+{
+ buffer b;
+ LOG_BUFFER_INIT(b);
+
+ buffer_puts(&b, s);
+
+ for (u32 *af = afis; af < (afis + count); af++)
+ {
+ const struct bgp_af_desc *desc = bgp_get_af_desc(*af);
+ if (desc)
+ buffer_print(&b, " %s", desc->name);
+ else
+ buffer_print(&b, " <%u/%u>", BGP_AFI(*af), BGP_SAFI(*af));
+ }
+
+ if (b.pos == b.end)
+ strcpy(b.end - 32, " ... <too long>");
+
+ cli_msg(code, b.start);
+}
+
+static void
+bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)
+{
+ struct bgp_af_caps *ac;
+ uint any_mp_bgp = 0;
+ uint any_gr_able = 0;
+ uint any_add_path = 0;
+ uint any_ext_next_hop = 0;
+ u32 *afl1 = alloca(caps->af_count * sizeof(u32));
+ u32 *afl2 = alloca(caps->af_count * sizeof(u32));
+ uint afn1, afn2;
+
+ WALK_AF_CAPS(caps, ac)
+ {
+ any_mp_bgp |= ac->ready;
+ any_gr_able |= ac->gr_able;
+ any_add_path |= ac->add_path;
+ any_ext_next_hop |= ac->ext_next_hop;
+ }
+
+ if (any_mp_bgp)
+ {
+ cli_msg(-1006, " Multiprotocol");
+
+ afn1 = 0;
+ WALK_AF_CAPS(caps, ac)
+ if (ac->ready)
+ afl1[afn1++] = ac->afi;
+
+ bgp_show_afis(-1006, " AF announced:", afl1, afn1);
+ }
+
+ if (caps->route_refresh)
+ cli_msg(-1006, " Route refresh");
+
+ if (any_ext_next_hop)
+ {
+ cli_msg(-1006, " Extended next hop");
+
+ afn1 = 0;
+ WALK_AF_CAPS(caps, ac)
+ if (ac->ext_next_hop)
+ afl1[afn1++] = ac->afi;
+
+ bgp_show_afis(-1006, " IPv6 nexthop:", afl1, afn1);
+ }
+
+ if (caps->ext_messages)
+ cli_msg(-1006, " Extended message");
+
+ if (caps->gr_aware)
+ cli_msg(-1006, " Graceful restart");
+
+ if (any_gr_able)
+ {
+ /* Continues from gr_aware */
+ cli_msg(-1006, " Restart time: %u", caps->gr_time);
+ if (caps->gr_flags & BGP_GRF_RESTART)
+ cli_msg(-1006, " Restart recovery");
+
+ afn1 = afn2 = 0;
+ WALK_AF_CAPS(caps, ac)
+ {
+ if (ac->gr_able)
+ afl1[afn1++] = ac->afi;
+
+ if (ac->gr_af_flags & BGP_GRF_FORWARDING)
+ afl2[afn2++] = ac->afi;
+ }
+
+ bgp_show_afis(-1006, " AF supported:", afl1, afn1);
+ bgp_show_afis(-1006, " AF preserved:", afl2, afn2);
+ }
+
+ if (caps->as4_support)
+ cli_msg(-1006, " 4-octet AS numbers");
+
+ if (any_add_path)
+ {
+ cli_msg(-1006, " ADD-PATH");
+
+ afn1 = afn2 = 0;
+ WALK_AF_CAPS(caps, ac)
+ {
+ if (ac->add_path & BGP_ADD_PATH_RX)
+ afl1[afn1++] = ac->afi;
+
+ if (ac->add_path & BGP_ADD_PATH_TX)
+ afl2[afn2++] = ac->afi;
+ }
+
+ bgp_show_afis(-1006, " RX:", afl1, afn1);
+ bgp_show_afis(-1006, " TX:", afl2, afn2);
+ }
+
+ if (caps->enhanced_refresh)
+ cli_msg(-1006, " Enhanced refresh");
+}
+
+static void
bgp_show_proto_info(struct proto *P)
{
struct bgp_proto *p = (struct bgp_proto *) P;
- struct bgp_conn *c = p->conn;
-
- proto_show_basic_info(P);
cli_msg(-1006, " BGP state: %s", bgp_state_dsc(p));
cli_msg(-1006, " Neighbor address: %I%J", p->cf->remote_ip, p->cf->iface);
cli_msg(-1006, " Neighbor AS: %u", p->remote_as);
- if (p->gr_active)
+ if (p->gr_active_num)
cli_msg(-1006, " Neighbor graceful restart active");
if (P->proto_state == PS_START)
- {
- struct bgp_conn *oc = &p->outgoing_conn;
+ {
+ struct bgp_conn *oc = &p->outgoing_conn;
- if ((p->start_state < BSS_CONNECT) &&
- (p->startup_timer->expires))
- cli_msg(-1006, " Error wait: %d/%d",
- p->startup_timer->expires - now, p->startup_delay);
+ if ((p->start_state < BSS_CONNECT) &&
+ (p->startup_timer->expires))
+ cli_msg(-1006, " Error wait: %d/%d",
+ p->startup_timer->expires - now, p->startup_delay);
- if ((oc->state == BS_ACTIVE) &&
- (oc->connect_retry_timer->expires))
- cli_msg(-1006, " Connect delay: %d/%d",
- oc->connect_retry_timer->expires - now, p->cf->connect_delay_time);
+ if ((oc->state == BS_ACTIVE) &&
+ (oc->connect_timer->expires))
+ cli_msg(-1006, " Connect delay: %d/%d",
+ oc->connect_timer->expires - now, p->cf->connect_delay_time);
- if (p->gr_active && p->gr_timer->expires)
- cli_msg(-1006, " Restart timer: %d/-", p->gr_timer->expires - now);
- }
+ if (p->gr_active_num && p->gr_timer->expires)
+ cli_msg(-1006, " Restart timer: %d/-", p->gr_timer->expires - now);
+ }
else if (P->proto_state == PS_UP)
- {
- cli_msg(-1006, " Neighbor ID: %R", p->remote_id);
- cli_msg(-1006, " Neighbor caps: %s%s%s%s%s%s%s",
- c->peer_refresh_support ? " refresh" : "",
- c->peer_enhanced_refresh_support ? " enhanced-refresh" : "",
- c->peer_gr_able ? " restart-able" : (c->peer_gr_aware ? " restart-aware" : ""),
- c->peer_as4_support ? " AS4" : "",
- (c->peer_add_path & ADD_PATH_RX) ? " add-path-rx" : "",
- (c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : "",
- c->peer_ext_messages_support ? " ext-messages" : "");
+ {
+ cli_msg(-1006, " Neighbor ID: %R", p->remote_id);
+ cli_msg(-1006, " Local capabilities");
+ bgp_show_capabilities(p, p->conn->local_caps);
+ cli_msg(-1006, " Neighbor capabilities");
+ bgp_show_capabilities(p, p->conn->remote_caps);
+/* XXXX
cli_msg(-1006, " Session: %s%s%s%s%s%s%s%s",
p->is_internal ? "internal" : "external",
p->cf->multihop ? " multihop" : "",
@@ -1565,35 +2046,60 @@ bgp_show_proto_info(struct proto *P)
p->add_path_rx ? " add-path-rx" : "",
p->add_path_tx ? " add-path-tx" : "",
p->ext_messages ? " ext-messages" : "");
- cli_msg(-1006, " Source address: %I", p->source_addr);
- if (P->cf->in_limit)
- cli_msg(-1006, " Route limit: %d/%d",
- p->p.stats.imp_routes + p->p.stats.filt_routes, P->cf->in_limit->limit);
- cli_msg(-1006, " Hold timer: %d/%d",
- tm_remains(c->hold_timer), c->hold_time);
- cli_msg(-1006, " Keepalive timer: %d/%d",
- tm_remains(c->keepalive_timer), c->keepalive_time);
- }
+*/
+ cli_msg(-1006, " Source address: %I", p->source_addr);
+ cli_msg(-1006, " Hold timer: %d/%d",
+ tm_remains(p->conn->hold_timer), p->conn->hold_time);
+ cli_msg(-1006, " Keepalive timer: %d/%d",
+ tm_remains(p->conn->keepalive_timer), p->conn->keepalive_time);
+ }
if ((p->last_error_class != BE_NONE) &&
(p->last_error_class != BE_MAN_DOWN))
+ {
+ const char *err1 = bgp_err_classes[p->last_error_class];
+ const char *err2 = bgp_last_errmsg(p);
+ cli_msg(-1006, " Last error: %s%s", err1, err2);
+ }
+
+ {
+ /* XXXX ?? */
+ struct bgp_channel *c;
+ WALK_LIST(c, p->p.channels)
{
- const char *err1 = bgp_err_classes[p->last_error_class];
- const char *err2 = bgp_last_errmsg(p);
- cli_msg(-1006, " Last error: %s%s", err1, err2);
+ channel_show_info(&c->c);
+
+ if (c->igp_table_ip4)
+ cli_msg(-1006, " IGP IPv4 table: %s", c->igp_table_ip4->name);
+
+ if (c->igp_table_ip6)
+ cli_msg(-1006, " IGP IPv6 table: %s", c->igp_table_ip6->name);
}
+ }
}
+struct channel_class channel_bgp = {
+ .channel_size = sizeof(struct bgp_channel),
+ .config_size = sizeof(struct bgp_channel_config),
+ .init = bgp_channel_init,
+ .start = bgp_channel_start,
+ .shutdown = bgp_channel_shutdown,
+ .cleanup = bgp_channel_cleanup,
+ .reconfigure = bgp_channel_reconfigure,
+};
+
struct protocol proto_bgp = {
.name = "BGP",
.template = "bgp%d",
.attr_class = EAP_BGP,
.preference = DEF_PREF_BGP,
+ .channel_mask = NB_IP | NB_VPN | NB_FLOW,
+ .proto_size = sizeof(struct bgp_proto),
.config_size = sizeof(struct bgp_config),
+ .postconfig = bgp_postconfig,
.init = bgp_init,
.start = bgp_start,
.shutdown = bgp_shutdown,
- .cleanup = bgp_cleanup,
.reconfigure = bgp_reconfigure,
.copy_config = bgp_copy_config,
.get_status = bgp_get_status,
diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h
index e47a0eb1..7ffcb68a 100644
--- a/proto/bgp/bgp.h
+++ b/proto/bgp/bgp.h
@@ -2,6 +2,8 @@
* BIRD -- The Border Gateway Protocol
*
* (c) 2000 Martin Mares <mj@ucw.cz>
+ * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
+ * (c) 2008--2016 CZ.NIC z.s.p.o.
*
* Can be freely distributed and used under the terms of the GNU GPL.
*/
@@ -10,26 +12,80 @@
#define _BIRD_BGP_H_
#include <stdint.h>
+#include <setjmp.h>
+#include "nest/bird.h"
#include "nest/route.h"
#include "nest/bfd.h"
+//#include "lib/lists.h"
#include "lib/hash.h"
+#include "lib/socket.h"
struct linpool;
struct eattr;
+
+/* Address families */
+
+#define BGP_AFI_IPV4 1
+#define BGP_AFI_IPV6 2
+
+#define BGP_SAFI_UNICAST 1
+#define BGP_SAFI_MULTICAST 2
+#define BGP_SAFI_MPLS 4
+#define BGP_SAFI_MPLS_VPN 128
+#define BGP_SAFI_VPN_MULTICAST 129
+#define BGP_SAFI_FLOW 133
+
+/* Internal AF codes */
+
+#define BGP_AF(A, B) (((u32)(A) << 16) | (u32)(B))
+#define BGP_AFI(A) ((u32)(A) >> 16)
+#define BGP_SAFI(A) ((u32)(A) & 0xFFFF)
+
+#define BGP_AF_IPV4 BGP_AF( BGP_AFI_IPV4, BGP_SAFI_UNICAST )
+#define BGP_AF_IPV6 BGP_AF( BGP_AFI_IPV6, BGP_SAFI_UNICAST )
+#define BGP_AF_IPV4_MC BGP_AF( BGP_AFI_IPV4, BGP_SAFI_MULTICAST )
+#define BGP_AF_IPV6_MC BGP_AF( BGP_AFI_IPV6, BGP_SAFI_MULTICAST )
+#define BGP_AF_IPV4_MPLS BGP_AF( BGP_AFI_IPV4, BGP_SAFI_MPLS )
+#define BGP_AF_IPV6_MPLS BGP_AF( BGP_AFI_IPV6, BGP_SAFI_MPLS )
+#define BGP_AF_VPN4_MPLS BGP_AF( BGP_AFI_IPV4, BGP_SAFI_MPLS_VPN )
+#define BGP_AF_VPN6_MPLS BGP_AF( BGP_AFI_IPV6, BGP_SAFI_MPLS_VPN )
+#define BGP_AF_VPN4_MC BGP_AF( BGP_AFI_IPV4, BGP_SAFI_VPN_MULTICAST )
+#define BGP_AF_VPN6_MC BGP_AF( BGP_AFI_IPV6, BGP_SAFI_VPN_MULTICAST )
+#define BGP_AF_FLOW4 BGP_AF( BGP_AFI_IPV4, BGP_SAFI_FLOW )
+#define BGP_AF_FLOW6 BGP_AF( BGP_AFI_IPV6, BGP_SAFI_FLOW )
+
+
+struct bgp_write_state;
+struct bgp_parse_state;
+struct bgp_export_state;
+struct bgp_bucket;
+
+struct bgp_af_desc {
+ u32 afi;
+ u32 net;
+ u8 mpls;
+ u8 no_igp;
+ const char *name;
+ uint (*encode_nlri)(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size);
+ void (*decode_nlri)(struct bgp_parse_state *s, byte *pos, uint len, rta *a);
+ void (*update_next_hop)(struct bgp_export_state *s, eattr *nh, ea_list **to);
+ uint (*encode_next_hop)(struct bgp_write_state *s, eattr *nh, byte *buf, uint size);
+ void (*decode_next_hop)(struct bgp_parse_state *s, byte *pos, uint len, rta *a);
+};
+
+
struct bgp_config {
struct proto_config c;
u32 local_as, remote_as;
+ ip_addr local_ip; /* Source address to use */
ip_addr remote_ip;
- ip_addr source_addr; /* Source address to use */
struct iface *iface; /* Interface for link-local addresses */
+ u16 local_port; /* Local listening port */
u16 remote_port; /* Neighbor destination port */
int multihop; /* Number of hops if multihop */
- int ttl_security; /* Enable TTL security [RFC5082] */
- int next_hop_self; /* Always set next hop to local IP address */
- int next_hop_keep; /* Do not touch next hop attribute */
- int missing_lladdr; /* What we will do when we don' know link-local addr, see MLL_* */
- int gw_mode; /* How we compute route gateway from next_hop attr, see GW_* */
+ int strict_bind; /* Bind listening socket to local address */
+ int ttl_security; /* Enable TTL security [RFC 5082] */
int compare_path_lengths; /* Use path lengths when selecting best route */
int med_metric; /* Compare MULTI_EXIT_DISC even between routes from differen ASes */
int igp_metric; /* Use IGP metrics when selecting best route */
@@ -37,18 +93,17 @@ struct bgp_config {
int deterministic_med; /* Use more complicated algo to have strict RFC 4271 MED comparison */
u32 default_local_pref; /* Default value for LOCAL_PREF attribute */
u32 default_med; /* Default value for MULTI_EXIT_DISC attribute */
- int capabilities; /* Enable capability handshake [RFC3392] */
- int enable_refresh; /* Enable local support for route refresh [RFC2918] */
- int enable_as4; /* Enable local support for 4B AS numbers [RFC4893] */
+ int capabilities; /* Enable capability handshake [RFC 5492] */
+ int enable_refresh; /* Enable local support for route refresh [RFC 2918] */
+ int enable_as4; /* Enable local support for 4B AS numbers [RFC 6793] */
int enable_extended_messages; /* Enable local support for extended messages [draft] */
u32 rr_cluster_id; /* Route reflector cluster ID, if different from local ID */
int rr_client; /* Whether neighbor is RR client of me */
int rs_client; /* Whether neighbor is RS client of me */
- int advertise_ipv4; /* Whether we should add IPv4 capability advertisement to OPEN message */
+ u32 confederation; /* Confederation ID, or zero if confeds not active */
+ int confederation_member; /* Whether neighbor AS is member of our confederation */
int passive; /* Do not initiate outgoing connection */
int interpret_communities; /* Hardwired handling of well-known communities */
- int secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */
- int add_path; /* Use ADD-PATH extension [RFC7911] */
int allow_local_as; /* Allow that number of local ASNs in incoming AS_PATHs */
int allow_local_pref; /* Allow LOCAL_PREF in EBGP sessions */
int gr_mode; /* Graceful restart mode (BGP_GR_*) */
@@ -64,11 +119,31 @@ struct bgp_config {
unsigned disable_after_error; /* Disable the protocol when error is detected */
char *password; /* Password used for MD5 authentication */
- struct rtable_config *igp_table; /* Table used for recursive next hop lookups */
int check_link; /* Use iface link state for liveness detection */
int bfd; /* Use BFD for liveness detection */
};
+struct bgp_channel_config {
+ struct channel_config c;
+
+ u32 afi;
+ const struct bgp_af_desc *desc;
+
+ ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */
+ u8 next_hop_self; /* Always set next hop to local IP address */
+ u8 next_hop_keep; /* Do not touch next hop attribute */
+ u8 missing_lladdr; /* What we will do when we don' know link-local addr, see MLL_* */
+ u8 gw_mode; /* How we compute route gateway from next_hop attr, see GW_* */
+ u8 secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */
+ u8 gr_able; /* Allow full graceful restart for the channel */
+ u8 ext_next_hop; /* Allow both IPv4 and IPv6 next hops */
+ u8 add_path; /* Use ADD-PATH extension [RFC 7911] */
+
+ uint rest[0]; /* Remaining items are reconfigured separately */
+ struct rtable_config *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */
+ struct rtable_config *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */
+};
+
#define MLL_SELF 1
#define MLL_DROP 2
#define MLL_IGNORE 3
@@ -76,112 +151,241 @@ struct bgp_config {
#define GW_DIRECT 1
#define GW_RECURSIVE 2
-#define ADD_PATH_RX 1
-#define ADD_PATH_TX 2
-#define ADD_PATH_FULL 3
+#define BGP_ADD_PATH_RX 1
+#define BGP_ADD_PATH_TX 2
+#define BGP_ADD_PATH_FULL 3
-#define BGP_GR_ABLE 1
-#define BGP_GR_AWARE 2
+#define BGP_GR_ABLE 1
+#define BGP_GR_AWARE 2
-/* For peer_gr_flags */
+/* For GR capability common flags */
#define BGP_GRF_RESTART 0x80
-/* For peer_gr_aflags */
+/* For GR capability per-AF flags */
#define BGP_GRF_FORWARDING 0x80
+struct bgp_af_caps {
+ u32 afi;
+ u8 ready; /* Multiprotocol capability, RFC 4760 */
+ u8 gr_able; /* Graceful restart support, RFC 4724 */
+ u8 gr_af_flags; /* Graceful restart per-AF flags */
+ u8 ext_next_hop; /* Extended IPv6 next hop, RFC 5549 */
+ u8 add_path; /* Multiple paths support, RFC 7911 */
+};
+
+struct bgp_caps {
+ u32 as4_number; /* Announced ASN */
+
+ u8 as4_support; /* Four-octet AS capability, RFC 6793 */
+ u8 ext_messages; /* Extended message length, RFC draft */
+ u8 route_refresh; /* Route refresh capability, RFC 2918 */
+ u8 enhanced_refresh; /* Enhanced route refresh, RFC 7313 */
+
+ u8 gr_aware; /* Graceful restart capability, RFC 4724 */
+ u8 gr_flags; /* Graceful restart flags */
+ u16 gr_time; /* Graceful restart time in seconds */
+
+ u16 af_count; /* Number of af_data items */
+
+ struct bgp_af_caps af_data[0]; /* Per-AF capability data */
+};
+
+#define WALK_AF_CAPS(caps,ac) \
+ for (ac = caps->af_data; ac < &caps->af_data[caps->af_count]; ac++)
+
+
+struct bgp_socket {
+ node n; /* Node in global bgp_sockets */
+ sock *sk; /* Real listening socket */
+ u32 uc; /* Use count */
+};
+
struct bgp_conn {
struct bgp_proto *bgp;
struct birdsock *sk;
- uint state; /* State of connection state machine */
- struct timer *connect_retry_timer;
+ u8 state; /* State of connection state machine */
+ u8 as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */
+ u8 ext_messages; /* Session uses extended message length */
+
+ struct bgp_caps *local_caps;
+ struct bgp_caps *remote_caps;
+ struct timer *connect_timer;
struct timer *hold_timer;
struct timer *keepalive_timer;
struct event *tx_ev;
- int packets_to_send; /* Bitmap of packet types to be sent */
+ u32 packets_to_send; /* Bitmap of packet types to be sent */
+ u32 channels_to_send; /* Bitmap of channels with packets to be sent */
+ u8 last_channel; /* Channel used last time for TX */
+ u8 last_channel_count; /* Number of times the last channel was used in succession */
int notify_code, notify_subcode, notify_size;
byte *notify_data;
- u32 advertised_as; /* Temporary value for AS number received */
- int start_state; /* protocol start_state snapshot when connection established */
- u8 peer_refresh_support; /* Peer supports route refresh [RFC2918] */
- u8 peer_as4_support; /* Peer supports 4B AS numbers [RFC4893] */
- u8 peer_add_path; /* Peer supports ADD-PATH [RFC7911] */
- u8 peer_enhanced_refresh_support; /* Peer supports enhanced refresh [RFC7313] */
- u8 peer_gr_aware;
- u8 peer_gr_able;
- u16 peer_gr_time;
- u8 peer_gr_flags;
- u8 peer_gr_aflags;
- u8 peer_ext_messages_support; /* Peer supports extended message length [draft] */
- unsigned hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */
+
+ uint hold_time, keepalive_time; /* Times calculated from my and neighbor's requirements */
};
struct bgp_proto {
struct proto p;
struct bgp_config *cf; /* Shortcut to BGP configuration */
u32 local_as, remote_as;
- int start_state; /* Substates that partitions BS_START */
- u8 is_internal; /* Internal BGP connection (local_as == remote_as) */
- u8 as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */
- u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */
- u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */
- u8 ext_messages; /* Session allows to use extended messages (both sides support it) */
+ u32 public_as; /* Externally visible ASN (local_as or confederation id) */
u32 local_id; /* BGP identifier of this router */
u32 remote_id; /* BGP identifier of the neighbor */
u32 rr_cluster_id; /* Route reflector cluster ID */
- int rr_client; /* Whether neighbor is RR client of me */
- int rs_client; /* Whether neighbor is RS client of me */
+ int start_state; /* Substates that partitions BS_START */
+ u8 is_internal; /* Internal BGP session (local_as == remote_as) */
+ u8 is_interior; /* Internal or intra-confederation BGP session */
+ u8 as4_session; /* Session uses 4B AS numbers in AS_PATH (both sides support it) */
+ u8 rr_client; /* Whether neighbor is RR client of me */
+ u8 rs_client; /* Whether neighbor is RS client of me */
+ u8 route_refresh; /* Route refresh allowed to send [RFC 2918] */
+ u8 enhanced_refresh; /* Enhanced refresh is negotiated [RFC 7313] */
u8 gr_ready; /* Neighbor could do graceful restart */
- u8 gr_active; /* Neighbor is doing graceful restart */
- u8 feed_state; /* Feed state (TX) for EoR, RR packets, see BFS_* */
- u8 load_state; /* Load state (RX) for EoR, RR packets, see BFS_* */
+ u8 gr_active_num; /* Neighbor is doing GR, number of active channels */
+ u8 channel_count; /* Number of active channels */
+ u32 *afi_map; /* Map channel index -> AFI */
+ struct bgp_channel **channel_map; /* Map channel index -> channel */
struct bgp_conn *conn; /* Connection we have established */
struct bgp_conn outgoing_conn; /* Outgoing connection we're working with */
struct bgp_conn incoming_conn; /* Incoming connection we have neither accepted nor rejected yet */
struct object_lock *lock; /* Lock for neighbor connection */
struct neighbor *neigh; /* Neighbor entry corresponding to remote ip, NULL if multihop */
+ struct bgp_socket *sock; /* Shared listening socket */
struct bfd_request *bfd_req; /* BFD request, if BFD is used */
ip_addr source_addr; /* Local address used as an advertised next hop */
- rtable *igp_table; /* Table used for recursive next hop lookups */
+ ip_addr link_addr; /* Link-local version of source_addr */
struct event *event; /* Event for respawning and shutting process */
struct timer *startup_timer; /* Timer used to delay protocol startup due to previous errors (startup_delay) */
struct timer *gr_timer; /* Timer waiting for reestablishment after graceful restart */
- struct bgp_bucket **bucket_hash; /* Hash table of attribute buckets */
- uint hash_size, hash_count, hash_limit;
- HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */
- slab *prefix_slab; /* Slab holding prefix nodes */
- list bucket_queue; /* Queue of buckets to send */
- struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */
unsigned startup_delay; /* Time to delay protocol startup by due to errors */
bird_clock_t last_proto_error; /* Time of last error that leads to protocol stop */
u8 last_error_class; /* Error class of last error */
u32 last_error_code; /* Error code of last error. BGP protocol errors
are encoded as (bgp_err_code << 16 | bgp_err_subcode) */
-#ifdef IPV6
- byte *mp_reach_start, *mp_unreach_start; /* Multiprotocol BGP attribute notes */
- unsigned mp_reach_len, mp_unreach_len;
- ip_addr local_link; /* Link-level version of source_addr */
-#endif
+};
+
+struct bgp_channel {
+ struct channel c;
+
+ /* Rest are BGP specific data */
+ struct bgp_channel_config *cf;
+ pool *pool; /* XXXX */
+
+ u32 afi;
+ u32 index;
+ const struct bgp_af_desc *desc;
+
+ HASH(struct bgp_bucket) bucket_hash; /* Hash table of route buckets */
+ struct bgp_bucket *withdraw_bucket; /* Withdrawn routes */
+ list bucket_queue; /* Queue of buckets to send (struct bgp_bucket) */
+
+ HASH(struct bgp_prefix) prefix_hash; /* Prefixes to be sent */
+ slab *prefix_slab; /* Slab holding prefix nodes */
+
+ rtable *igp_table_ip4; /* Table for recursive IPv4 next hop lookups */
+ rtable *igp_table_ip6; /* Table for recursive IPv6 next hop lookups */
+ ip_addr next_hop_addr; /* Local address for NEXT_HOP attribute */
+ ip_addr link_addr; /* Link-local version of next_hop_addr */
+
+ u32 packets_to_send; /* Bitmap of packet types to be sent */
+
+ u8 gr_ready; /* Neighbor could do GR on this AF */
+ u8 gr_active; /* Neighbor is doing GR and keeping fwd state */
+
+ u8 ext_next_hop; /* Session allows both IPv4 and IPv6 next hops */
+
+ u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */
+ u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */
+
+ u8 feed_state; /* Feed state (TX) for EoR, RR packets, see BFS_* */
+ u8 load_state; /* Load state (RX) for EoR, RR packets, see BFS_* */
};
struct bgp_prefix {
- struct {
- ip_addr prefix;
- int pxlen;
- } n;
+ node buck_node; /* Node in per-bucket list */
+ struct bgp_prefix *next; /* Node in prefix hash table */
+ u32 hash;
u32 path_id;
- struct bgp_prefix *next;
- node bucket_node; /* Node in per-bucket list */
+ net_addr net[0];
};
struct bgp_bucket {
node send_node; /* Node in send queue */
- struct bgp_bucket *hash_next, *hash_prev; /* Node in bucket hash table */
- unsigned hash; /* Hash over extended attributes */
- list prefixes; /* Prefixes in this buckets */
+ struct bgp_bucket *next; /* Node in bucket hash table */
+ list prefixes; /* Prefixes in this bucket (struct bgp_prefix) */
+ u32 hash; /* Hash over extended attributes */
ea_list eattrs[0]; /* Per-bucket extended attributes */
};
+struct bgp_export_state {
+ struct bgp_proto *proto;
+ struct bgp_channel *channel;
+ struct linpool *pool;
+
+ struct bgp_proto *src;
+ rte *route;
+ int mpls;
+
+ u32 attrs_seen[1];
+ uint err_withdraw;
+};
+
+struct bgp_write_state {
+ struct bgp_proto *proto;
+ struct bgp_channel *channel;
+ struct linpool *pool;
+
+ int as4_session;
+ int add_path;
+ int mpls;
+
+ eattr *mp_next_hop;
+ adata *mpls_labels;
+};
+
+struct bgp_parse_state {
+ struct bgp_proto *proto;
+ struct bgp_channel *channel;
+ struct linpool *pool;
+
+ int as4_session;
+ int add_path;
+ int mpls;
+
+ u32 attrs_seen[256/32];
+
+ u32 mp_reach_af;
+ u32 mp_unreach_af;
+
+ uint attr_len;
+ uint ip_reach_len;
+ uint ip_unreach_len;
+ uint ip_next_hop_len;
+ uint mp_reach_len;
+ uint mp_unreach_len;
+ uint mp_next_hop_len;
+
+ byte *attrs;
+ byte *ip_reach_nlri;
+ byte *ip_unreach_nlri;
+ byte *ip_next_hop_data;
+ byte *mp_reach_nlri;
+ byte *mp_unreach_nlri;
+ byte *mp_next_hop_data;
+
+ uint err_withdraw;
+ uint err_subcode;
+ jmp_buf err_jmpbuf;
+
+ struct hostentry *hostentry;
+ adata *mpls_labels;
+
+ /* Cached state for bgp_rte_update() */
+ u32 last_id;
+ struct rte_src *last_src;
+ rta *cached_rta;
+};
+
#define BGP_PORT 179
#define BGP_VERSION 4
#define BGP_HEADER_LENGTH 19
@@ -192,10 +396,30 @@ struct bgp_bucket {
#define BGP_RX_BUFFER_EXT_SIZE 65535
#define BGP_TX_BUFFER_EXT_SIZE 65535
-static inline uint bgp_max_packet_length(struct bgp_proto *p)
-{ return p->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; }
+static inline int bgp_channel_is_ipv4(struct bgp_channel *c)
+{ return BGP_AFI(c->afi) == BGP_AFI_IPV4; }
+
+static inline int bgp_channel_is_ipv6(struct bgp_channel *c)
+{ return BGP_AFI(c->afi) == BGP_AFI_IPV6; }
+
+static inline int bgp_cc_is_ipv4(struct bgp_channel_config *c)
+{ return BGP_AFI(c->afi) == BGP_AFI_IPV4; }
+
+static inline int bgp_cc_is_ipv6(struct bgp_channel_config *c)
+{ return BGP_AFI(c->afi) == BGP_AFI_IPV6; }
+
+static inline uint bgp_max_packet_length(struct bgp_conn *conn)
+{ return conn->ext_messages ? BGP_MAX_EXT_MSG_LENGTH : BGP_MAX_MESSAGE_LENGTH; }
+
+static inline void
+bgp_parse_error(struct bgp_parse_state *s, uint subcode)
+{
+ s->err_subcode = subcode;
+ longjmp(s->err_jmpbuf, 1);
+}
extern struct linpool *bgp_linpool;
+extern struct linpool *bgp_linpool2;
void bgp_start_timer(struct timer *t, int value);
@@ -208,9 +432,9 @@ void bgp_conn_enter_established_state(struct bgp_conn *conn);
void bgp_conn_enter_close_state(struct bgp_conn *conn);
void bgp_conn_enter_idle_state(struct bgp_conn *conn);
void bgp_handle_graceful_restart(struct bgp_proto *p);
-void bgp_graceful_restart_done(struct bgp_proto *p);
-void bgp_refresh_begin(struct bgp_proto *p);
-void bgp_refresh_end(struct bgp_proto *p);
+void bgp_graceful_restart_done(struct bgp_channel *c);
+void bgp_refresh_begin(struct bgp_channel *c);
+void bgp_refresh_end(struct bgp_channel *c);
void bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code);
void bgp_stop(struct bgp_proto *p, unsigned subcode);
@@ -233,48 +457,71 @@ struct rte_source *bgp_get_source(struct bgp_proto *p, u32 path_id);
/* attrs.c */
-/* Hack: although BA_NEXT_HOP attribute has type EAF_TYPE_IP_ADDRESS, in IPv6
- * we store two addesses in it - a global address and a link local address.
- */
-#ifdef IPV6
-#define NEXT_HOP_LENGTH (2*sizeof(ip_addr))
-static inline void set_next_hop(byte *b, ip_addr addr) { ((ip_addr *) b)[0] = addr; ((ip_addr *) b)[1] = IPA_NONE; }
-#else
-#define NEXT_HOP_LENGTH sizeof(ip_addr)
-static inline void set_next_hop(byte *b, ip_addr addr) { ((ip_addr *) b)[0] = addr; }
-#endif
+static inline eattr *
+bgp_find_attr(ea_list *attrs, uint code)
+{
+ return ea_find(attrs, EA_CODE(EAP_BGP, code));
+}
+
+eattr *
+bgp_set_attr(ea_list **attrs, struct linpool *pool, uint code, uint flags, uintptr_t val);
+
+static inline void
+bgp_set_attr_u32(ea_list **to, struct linpool *pool, uint code, uint flags, u32 val)
+{ bgp_set_attr(to, pool, code, flags, (uintptr_t) val); }
+
+static inline void
+bgp_set_attr_ptr(ea_list **to, struct linpool *pool, uint code, uint flags, struct adata *val)
+{ bgp_set_attr(to, pool, code, flags, (uintptr_t) val); }
+
+static inline void
+bgp_set_attr_data(ea_list **to, struct linpool *pool, uint code, uint flags, void *data, uint len)
+{
+ struct adata *a = lp_alloc_adata(pool, len);
+ memcpy(a->data, data, len);
+ bgp_set_attr(to, pool, code, flags, (uintptr_t) a);
+}
+
+static inline void
+bgp_unset_attr(ea_list **to, struct linpool *pool, uint code)
+{ eattr *e = bgp_set_attr(to, pool, code, 0, 0); e->type = EAF_TYPE_UNDEF; }
+
+
+int bgp_encode_attrs(struct bgp_write_state *s, ea_list *attrs, byte *buf, byte *end);
+ea_list * bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len);
+
+void bgp_init_bucket_table(struct bgp_channel *c);
+void bgp_free_bucket(struct bgp_channel *c, struct bgp_bucket *b);
+void bgp_defer_bucket(struct bgp_channel *c, struct bgp_bucket *b);
+void bgp_withdraw_bucket(struct bgp_channel *c, struct bgp_bucket *b);
+
+void bgp_init_prefix_table(struct bgp_channel *c);
+void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *bp);
-void bgp_attach_attr(struct ea_list **to, struct linpool *pool, unsigned attr, uintptr_t val);
-byte *bgp_attach_attr_wa(struct ea_list **to, struct linpool *pool, unsigned attr, unsigned len);
-struct rta *bgp_decode_attrs(struct bgp_conn *conn, byte *a, uint len, struct linpool *pool, int mandatory);
-int bgp_get_attr(struct eattr *e, byte *buf, int buflen);
int bgp_rte_better(struct rte *, struct rte *);
int bgp_rte_mergable(rte *pri, rte *sec);
int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best);
-void bgp_rt_notify(struct proto *P, rtable *tbl UNUSED, net *n, rte *new, rte *old UNUSED, ea_list *attrs);
+void bgp_rt_notify(struct proto *P, struct channel *C, net *n, rte *new, rte *old, ea_list *attrs);
int bgp_import_control(struct proto *, struct rte **, struct ea_list **, struct linpool *);
-void bgp_init_bucket_table(struct bgp_proto *);
-void bgp_free_bucket_table(struct bgp_proto *p);
-void bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck);
-void bgp_init_prefix_table(struct bgp_proto *p, u32 order);
-void bgp_free_prefix_table(struct bgp_proto *p);
-void bgp_free_prefix(struct bgp_proto *p, struct bgp_prefix *bp);
-uint bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains);
+int bgp_get_attr(struct eattr *e, byte *buf, int buflen);
void bgp_get_route_info(struct rte *, byte *buf, struct ea_list *attrs);
-inline static void bgp_attach_attr_ip(struct ea_list **to, struct linpool *pool, unsigned attr, ip_addr a)
-{ *(ip_addr *) bgp_attach_attr_wa(to, pool, attr, sizeof(ip_addr)) = a; }
/* packets.c */
void mrt_dump_bgp_state_change(struct bgp_conn *conn, unsigned old, unsigned new);
-void bgp_schedule_packet(struct bgp_conn *conn, int type);
+const struct bgp_af_desc *bgp_get_af_desc(u32 afi);
+const struct bgp_af_caps *bgp_find_af_caps(struct bgp_caps *caps, u32 afi);
+void bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type);
void bgp_kick_tx(void *vconn);
void bgp_tx(struct birdsock *sk);
int bgp_rx(struct birdsock *sk, uint size);
const char * bgp_error_dsc(unsigned code, unsigned subcode);
void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len);
+void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to);
+
+
/* Packet types */
#define PKT_OPEN 0x01
@@ -292,26 +539,25 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi
#define BAF_PARTIAL 0x20
#define BAF_EXT_LEN 0x10
-#define BA_ORIGIN 0x01 /* [RFC1771] */ /* WM */
+#define BA_ORIGIN 0x01 /* RFC 4271 */ /* WM */
#define BA_AS_PATH 0x02 /* WM */
#define BA_NEXT_HOP 0x03 /* WM */
#define BA_MULTI_EXIT_DISC 0x04 /* ON */
#define BA_LOCAL_PREF 0x05 /* WD */
#define BA_ATOMIC_AGGR 0x06 /* WD */
#define BA_AGGREGATOR 0x07 /* OT */
-#define BA_COMMUNITY 0x08 /* [RFC1997] */ /* OT */
-#define BA_ORIGINATOR_ID 0x09 /* [RFC1966] */ /* ON */
-#define BA_CLUSTER_LIST 0x0a /* ON */
-/* We don't support these: */
-#define BA_DPA 0x0b /* ??? */
-#define BA_ADVERTISER 0x0c /* [RFC1863] */
-#define BA_RCID_PATH 0x0d
-#define BA_MP_REACH_NLRI 0x0e /* [RFC2283] */
-#define BA_MP_UNREACH_NLRI 0x0f
-#define BA_EXT_COMMUNITY 0x10 /* [RFC4360] */
-#define BA_AS4_PATH 0x11 /* [RFC4893] */
-#define BA_AS4_AGGREGATOR 0x12
-#define BA_LARGE_COMMUNITY 0x20 /* [RFC8092] */
+#define BA_COMMUNITY 0x08 /* RFC 1997 */ /* OT */
+#define BA_ORIGINATOR_ID 0x09 /* RFC 4456 */ /* ON */
+#define BA_CLUSTER_LIST 0x0a /* RFC 4456 */ /* ON */
+#define BA_MP_REACH_NLRI 0x0e /* RFC 4760 */
+#define BA_MP_UNREACH_NLRI 0x0f /* RFC 4760 */
+#define BA_EXT_COMMUNITY 0x10 /* RFC 4360 */
+#define BA_AS4_PATH 0x11 /* RFC 6793 */
+#define BA_AS4_AGGREGATOR 0x12 /* RFC 6793 */
+#define BA_LARGE_COMMUNITY 0x20 /* RFC 8092 */
+
+/* Bird's private internal BGP attributes */
+#define BA_MPLS_LABEL_STACK 0xfe /* MPLS label stack transfer attribute */
/* BGP connection states */
@@ -331,14 +577,12 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi
*
* When BGP protocol is started by core, it goes to BSS_PREPARE. When BGP
* protocol done what is neccessary to start itself (like acquiring the lock),
- * it goes to BSS_CONNECT. When some connection attempt failed because of
- * option or capability error, it goes to BSS_CONNECT_NOCAP.
+ * it goes to BSS_CONNECT.
*/
#define BSS_PREPARE 0 /* Used before ordinary BGP started, i. e. waiting for lock */
#define BSS_DELAY 1 /* Startup delay due to previous errors */
#define BSS_CONNECT 2 /* Ordinary BGP connecting */
-#define BSS_CONNECT_NOCAP 3 /* Legacy BGP connecting (without capabilities) */
/* BGP feed states (TX)
@@ -347,7 +591,7 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi
*
* RFC 7313 specifies that a route refresh should be demarcated by BoRR and EoRR packets.
*
- * These states (stored in p->feed_state) are used to keep track of these
+ * These states (stored in c->feed_state) are used to keep track of these
* requirements. When such feed is started, BFS_LOADING / BFS_REFRESHING is
* set. When it ended, BFS_LOADED / BFS_REFRESHED is set to schedule End-of-RIB
* or EoRR packet. When the packet is sent, the state returned to BFS_NONE.
@@ -403,15 +647,5 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi
#define ORIGIN_EGP 1
#define ORIGIN_INCOMPLETE 2
-/* Address families */
-
-#define BGP_AF_IPV4 1
-#define BGP_AF_IPV6 2
-
-#ifdef IPV6
-#define BGP_AF BGP_AF_IPV6
-#else
-#define BGP_AF BGP_AF_IPV4
-#endif
#endif
diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y
index 55c602f1..941ae5b6 100644
--- a/proto/bgp/config.Y
+++ b/proto/bgp/config.Y
@@ -13,28 +13,32 @@ CF_HDR
CF_DEFINES
#define BGP_CFG ((struct bgp_config *) this_proto)
+#define BGP_CC ((struct bgp_channel_config *) this_channel)
CF_DECLS
-CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY,
- KEEPALIVE, MULTIHOP, STARTUP, VIA, NEXT, HOP, SELF, DEFAULT,
- PATH, METRIC, ERROR, START, DELAY, FORGET, WAIT, ENABLE,
- DISABLE, AFTER, BGP_PATH, BGP_LOCAL_PREF, BGP_MED, BGP_ORIGIN,
- BGP_NEXT_HOP, BGP_ATOMIC_AGGR, BGP_AGGREGATOR, BGP_COMMUNITY,
- BGP_EXT_COMMUNITY, SOURCE, ADDRESS, PASSWORD, RR, RS, CLIENT,
- CLUSTER, ID, AS4, ADVERTISE, IPV4, CAPABILITIES, LIMIT, PASSIVE,
- PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH,
- INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP,
- TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC,
- SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX, GRACEFUL, RESTART, AWARE,
- CHECK, LINK, PORT, EXTENDED, MESSAGES, SETKEY, BGP_LARGE_COMMUNITY)
+CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE,
+ MULTIHOP, STARTUP, VIA, NEXT, HOP, SELF, DEFAULT, PATH, METRIC, ERROR,
+ START, DELAY, FORGET, WAIT, ENABLE, DISABLE, AFTER, BGP_PATH,
+ BGP_LOCAL_PREF, BGP_MED, BGP_ORIGIN, BGP_NEXT_HOP, BGP_ATOMIC_AGGR,
+ BGP_AGGREGATOR, BGP_COMMUNITY, BGP_EXT_COMMUNITY, BGP_LARGE_COMMUNITY,
+ SOURCE, ADDRESS, PASSWORD, RR, RS, CLIENT, CLUSTER, ID, AS4, ADVERTISE,
+ IPV4, CAPABILITIES, LIMIT, PASSIVE, PREFER, OLDER, MISSING, LLADDR,
+ DROP, IGNORE, ROUTE, REFRESH, INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID,
+ BGP_CLUSTER_LIST, IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL,
+ SECURITY, DETERMINISTIC, SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX,
+ GRACEFUL, RESTART, AWARE, CHECK, LINK, PORT, EXTENDED, MESSAGES, SETKEY,
+ STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6)
+
+%type <i32> bgp_afi
CF_GRAMMAR
-CF_ADDTO(proto, bgp_proto '}' { bgp_check_config(BGP_CFG); } )
+CF_ADDTO(proto, bgp_proto '}' )
bgp_proto_start: proto_start BGP {
this_proto = proto_config_new(&proto_bgp, $1);
+ BGP_CFG->local_port = BGP_PORT;
BGP_CFG->remote_port = BGP_PORT;
BGP_CFG->multihop = -1; /* undefined */
BGP_CFG->hold_time = 240;
@@ -49,26 +53,35 @@ bgp_proto_start: proto_start BGP {
BGP_CFG->enable_refresh = 1;
BGP_CFG->enable_as4 = 1;
BGP_CFG->capabilities = 2;
- BGP_CFG->advertise_ipv4 = 1;
BGP_CFG->interpret_communities = 1;
BGP_CFG->default_local_pref = 100;
BGP_CFG->gr_mode = BGP_GR_AWARE;
BGP_CFG->gr_time = 120;
BGP_CFG->setkey = 1;
- }
+ }
+ ;
+
+bgp_loc_opts:
+ /* empty */
+ | bgp_loc_opts PORT expr { BGP_CFG->local_port = $3; if (($3<1) || ($3>65535)) cf_error("Invalid port number"); }
+ | bgp_loc_opts AS expr { BGP_CFG->local_as = $3; }
;
bgp_nbr_opts:
/* empty */
- | bgp_nbr_opts PORT expr { BGP_CFG->remote_port = $3; if (($3<1) || ($3>65535)) cf_error("Invalid port number"); }
+ | bgp_nbr_opts PORT expr { BGP_CFG->remote_port = $3; if (($3<1) || ($3>65535)) cf_error("Invalid port number"); }
| bgp_nbr_opts AS expr { BGP_CFG->remote_as = $3; }
;
bgp_proto:
bgp_proto_start proto_name '{'
| bgp_proto proto_item ';'
- | bgp_proto LOCAL AS expr ';' { BGP_CFG->local_as = $4; }
- | bgp_proto LOCAL ipa AS expr ';' { BGP_CFG->source_addr = $3; BGP_CFG->local_as = $5; }
+ | bgp_proto bgp_proto_channel ';'
+ | bgp_proto LOCAL bgp_loc_opts ';'
+ | bgp_proto LOCAL ipa ipa_scope bgp_loc_opts ';' {
+ BGP_CFG->local_ip = $3;
+ if ($4) BGP_CFG->iface = $4;
+ }
| bgp_proto NEIGHBOR bgp_nbr_opts ';'
| bgp_proto NEIGHBOR ipa ipa_scope bgp_nbr_opts ';' {
if (ipa_nonzero(BGP_CFG->remote_ip))
@@ -78,20 +91,16 @@ bgp_proto:
}
| bgp_proto INTERFACE TEXT ';' { BGP_CFG->iface = if_get_by_name($3); }
| bgp_proto RR CLUSTER ID idval ';' { BGP_CFG->rr_cluster_id = $5; }
- | bgp_proto RR CLIENT ';' { BGP_CFG->rr_client = 1; }
- | bgp_proto RS CLIENT ';' { BGP_CFG->rs_client = 1; }
+ | bgp_proto RR CLIENT bool ';' { BGP_CFG->rr_client = $4; }
+ | bgp_proto RS CLIENT bool ';' { BGP_CFG->rs_client = $4; }
+ | bgp_proto CONFEDERATION expr ';' { BGP_CFG->confederation = $3; }
+ | bgp_proto CONFEDERATION MEMBER bool ';' { BGP_CFG->confederation_member = $4; }
| bgp_proto HOLD TIME expr ';' { BGP_CFG->hold_time = $4; }
| bgp_proto STARTUP HOLD TIME expr ';' { BGP_CFG->initial_hold_time = $5; }
| bgp_proto DIRECT ';' { BGP_CFG->multihop = 0; }
| bgp_proto MULTIHOP ';' { BGP_CFG->multihop = 64; }
| bgp_proto MULTIHOP expr ';' { BGP_CFG->multihop = $3; if (($3<1) || ($3>255)) cf_error("Multihop must be in range 1-255"); }
- | bgp_proto NEXT HOP SELF ';' { BGP_CFG->next_hop_self = 1; BGP_CFG->next_hop_keep = 0; }
- | bgp_proto NEXT HOP KEEP ';' { BGP_CFG->next_hop_keep = 1; BGP_CFG->next_hop_self = 0; }
- | bgp_proto MISSING LLADDR SELF ';' { BGP_CFG->missing_lladdr = MLL_SELF; }
- | bgp_proto MISSING LLADDR DROP ';' { BGP_CFG->missing_lladdr = MLL_DROP; }
- | bgp_proto MISSING LLADDR IGNORE ';' { BGP_CFG->missing_lladdr = MLL_IGNORE; }
- | bgp_proto GATEWAY DIRECT ';' { BGP_CFG->gw_mode = GW_DIRECT; }
- | bgp_proto GATEWAY RECURSIVE ';' { BGP_CFG->gw_mode = GW_RECURSIVE; }
+ | bgp_proto STRICT BIND bool ';' { BGP_CFG->strict_bind = $4; }
| bgp_proto PATH METRIC bool ';' { BGP_CFG->compare_path_lengths = $4; }
| bgp_proto MED METRIC bool ';' { BGP_CFG->med_metric = $4; }
| bgp_proto IGP METRIC bool ';' { BGP_CFG->igp_metric = $4; }
@@ -99,7 +108,7 @@ bgp_proto:
| bgp_proto DETERMINISTIC MED bool ';' { BGP_CFG->deterministic_med = $4; }
| bgp_proto DEFAULT BGP_MED expr ';' { BGP_CFG->default_med = $4; }
| bgp_proto DEFAULT BGP_LOCAL_PREF expr ';' { BGP_CFG->default_local_pref = $4; }
- | bgp_proto SOURCE ADDRESS ipa ';' { BGP_CFG->source_addr = $4; }
+ | bgp_proto SOURCE ADDRESS ipa ';' { BGP_CFG->local_ip = $4; }
| bgp_proto START DELAY TIME expr ';' { BGP_CFG->connect_delay_time = $5; log(L_WARN "%s: Start delay time option is deprecated, use connect delay time", this_proto->name); }
| bgp_proto CONNECT DELAY TIME expr ';' { BGP_CFG->connect_delay_time = $5; }
| bgp_proto CONNECT RETRY TIME expr ';' { BGP_CFG->connect_retry_time = $5; }
@@ -111,33 +120,101 @@ bgp_proto:
| bgp_proto ENABLE AS4 bool ';' { BGP_CFG->enable_as4 = $4; }
| bgp_proto ENABLE EXTENDED MESSAGES bool ';' { BGP_CFG->enable_extended_messages = $5; }
| bgp_proto CAPABILITIES bool ';' { BGP_CFG->capabilities = $3; }
- | bgp_proto ADVERTISE IPV4 bool ';' { BGP_CFG->advertise_ipv4 = $4; }
| bgp_proto PASSWORD text ';' { BGP_CFG->password = $3; }
| bgp_proto SETKEY bool ';' { BGP_CFG->setkey = $3; }
- | bgp_proto ROUTE LIMIT expr ';' {
- this_proto->in_limit = cfg_allocz(sizeof(struct proto_limit));
- this_proto->in_limit->limit = $4;
- this_proto->in_limit->action = PLA_RESTART;
- log(L_WARN "%s: Route limit option is deprecated, use import limit", this_proto->name);
- }
| bgp_proto PASSIVE bool ';' { BGP_CFG->passive = $3; }
| bgp_proto INTERPRET COMMUNITIES bool ';' { BGP_CFG->interpret_communities = $4; }
- | bgp_proto SECONDARY bool ';' { BGP_CFG->secondary = $3; }
- | bgp_proto ADD PATHS RX ';' { BGP_CFG->add_path = ADD_PATH_RX; }
- | bgp_proto ADD PATHS TX ';' { BGP_CFG->add_path = ADD_PATH_TX; }
- | bgp_proto ADD PATHS bool ';' { BGP_CFG->add_path = $4 ? ADD_PATH_FULL : 0; }
- | bgp_proto ALLOW BGP_LOCAL_PREF bool ';' { BGP_CFG->allow_local_pref = $4; }
| bgp_proto ALLOW LOCAL AS ';' { BGP_CFG->allow_local_as = -1; }
| bgp_proto ALLOW LOCAL AS expr ';' { BGP_CFG->allow_local_as = $5; }
+ | bgp_proto ALLOW BGP_LOCAL_PREF bool ';' { BGP_CFG->allow_local_pref = $4; }
| bgp_proto GRACEFUL RESTART bool ';' { BGP_CFG->gr_mode = $4; }
| bgp_proto GRACEFUL RESTART AWARE ';' { BGP_CFG->gr_mode = BGP_GR_AWARE; }
| bgp_proto GRACEFUL RESTART TIME expr ';' { BGP_CFG->gr_time = $5; }
- | bgp_proto IGP TABLE rtable ';' { BGP_CFG->igp_table = $4; }
| bgp_proto TTL SECURITY bool ';' { BGP_CFG->ttl_security = $4; }
| bgp_proto CHECK LINK bool ';' { BGP_CFG->check_link = $4; }
| bgp_proto BFD bool ';' { BGP_CFG->bfd = $3; cf_check_bfd($3); }
;
+bgp_afi:
+ IPV4 { $$ = BGP_AF_IPV4; }
+ | IPV6 { $$ = BGP_AF_IPV6; }
+ | IPV4 MULTICAST { $$ = BGP_AF_IPV4_MC; }
+ | IPV6 MULTICAST { $$ = BGP_AF_IPV6_MC; }
+ | IPV4 MPLS { $$ = BGP_AF_IPV4_MPLS; }
+ | IPV6 MPLS { $$ = BGP_AF_IPV6_MPLS; }
+ | VPN4 MPLS { $$ = BGP_AF_VPN4_MPLS; }
+ | VPN6 MPLS { $$ = BGP_AF_VPN6_MPLS; }
+ | VPN4 MULTICAST { $$ = BGP_AF_VPN4_MC; }
+ | VPN6 MULTICAST { $$ = BGP_AF_VPN6_MC; }
+ | FLOW4 { $$ = BGP_AF_FLOW4; }
+ | FLOW6 { $$ = BGP_AF_FLOW6; }
+ ;
+
+bgp_channel_start: bgp_afi
+{
+ const struct bgp_af_desc *desc = bgp_get_af_desc($1);
+
+ if (!desc)
+ cf_error("Unknown AFI/SAFI");
+
+ this_channel = channel_config_new(&channel_bgp, desc->net, this_proto);
+ BGP_CC->c.name = desc->name;
+ BGP_CC->c.ra_mode = RA_UNDEF;
+ BGP_CC->afi = $1;
+ BGP_CC->desc = desc;
+ BGP_CC->gr_able = 0xff; /* undefined */
+};
+
+bgp_channel_item:
+ channel_item
+ | NEXT HOP ADDRESS ipa { BGP_CC->next_hop_addr = $4; }
+ | NEXT HOP SELF { BGP_CC->next_hop_self = 1; BGP_CC->next_hop_keep = 0; }
+ | NEXT HOP KEEP { BGP_CC->next_hop_keep = 1; BGP_CC->next_hop_self = 0; }
+ | MISSING LLADDR SELF { BGP_CC->missing_lladdr = MLL_SELF; }
+ | MISSING LLADDR DROP { BGP_CC->missing_lladdr = MLL_DROP; }
+ | MISSING LLADDR IGNORE { BGP_CC->missing_lladdr = MLL_IGNORE; }
+ | GATEWAY DIRECT { BGP_CC->gw_mode = GW_DIRECT; }
+ | GATEWAY RECURSIVE { BGP_CC->gw_mode = GW_RECURSIVE; }
+ | SECONDARY bool { BGP_CC->secondary = $2; }
+ | GRACEFUL RESTART bool { BGP_CC->gr_able = $3; }
+ | EXTENDED NEXT HOP bool { BGP_CC->ext_next_hop = $4; }
+ | ADD PATHS RX { BGP_CC->add_path = BGP_ADD_PATH_RX; }
+ | ADD PATHS TX { BGP_CC->add_path = BGP_ADD_PATH_TX; }
+ | ADD PATHS bool { BGP_CC->add_path = $3 ? BGP_ADD_PATH_FULL : 0; }
+ | IGP TABLE rtable {
+ if (BGP_CC->desc->no_igp)
+ cf_error("IGP table not allowed here");
+
+ if ($3->addr_type == NET_IP4)
+ BGP_CC->igp_table_ip4 = $3;
+ else if ($3->addr_type == NET_IP6)
+ BGP_CC->igp_table_ip6 = $3;
+ else
+ cf_error("Mismatched IGP table type");
+ }
+ ;
+
+bgp_channel_opts:
+ /* empty */
+ | bgp_channel_opts bgp_channel_item ';'
+ ;
+
+bgp_channel_opt_list:
+ /* empty */
+ | '{' bgp_channel_opts '}'
+ ;
+
+bgp_channel_end:
+{
+ if (!this_channel->table)
+ cf_error("Routing table not specified");
+
+ this_channel = NULL;
+};
+
+bgp_proto_channel: bgp_channel_start bgp_channel_opt_list bgp_channel_end;
+
+
CF_ADDTO(dynamic_attr, BGP_ORIGIN
{ $$ = f_new_dynamic_attr(EAF_TYPE_INT, T_ENUM_BGP_ORIGIN, EA_CODE(EAP_BGP, BA_ORIGIN)); })
CF_ADDTO(dynamic_attr, BGP_PATH
diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c
index ab87bdcc..0e974746 100644
--- a/proto/bgp/packets.c
+++ b/proto/bgp/packets.c
@@ -2,12 +2,16 @@
* BIRD -- BGP Packet Processing
*
* (c) 2000 Martin Mares <mj@ucw.cz>
+ * (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
+ * (c) 2008--2016 CZ.NIC z.s.p.o.
*
* Can be freely distributed and used under the terms of the GNU GPL.
*/
#undef LOCAL_DEBUG
+#include <stdlib.h>
+
#include "nest/bird.h"
#include "nest/iface.h"
#include "nest/protocol.h"
@@ -16,6 +20,7 @@
#include "nest/mrtdump.h"
#include "conf/conf.h"
#include "lib/unaligned.h"
+#include "lib/flowspec.h"
#include "lib/socket.h"
#include "nest/cli.h"
@@ -27,6 +32,13 @@
#define BGP_RR_BEGIN 1
#define BGP_RR_END 2
+#define BGP_NLRI_MAX (4 + 1 + 32)
+
+#define BGP_MPLS_BOS 1 /* Bottom-of-stack bit */
+#define BGP_MPLS_MAX 10 /* Max number of labels that 24*n <= 255 */
+#define BGP_MPLS_NULL 3 /* Implicit NULL label */
+#define BGP_MPLS_MAGIC 0x800000 /* Magic withdraw label value, RFC 3107 3 */
+
static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
@@ -38,6 +50,46 @@ static byte fsm_err_subcode[BS_MAX] = {
[BS_ESTABLISHED] = 3
};
+
+static struct bgp_channel *
+bgp_get_channel(struct bgp_proto *p, u32 afi)
+{
+ uint i;
+
+ for (i = 0; i < p->channel_count; i++)
+ if (p->afi_map[i] == afi)
+ return p->channel_map[i];
+
+ return NULL;
+}
+
+static inline void
+put_af3(byte *buf, u32 id)
+{
+ put_u16(buf, id >> 16);
+ buf[2] = id & 0xff;
+}
+
+static inline void
+put_af4(byte *buf, u32 id)
+{
+ put_u16(buf, id >> 16);
+ buf[2] = 0;
+ buf[3] = id & 0xff;
+}
+
+static inline u32
+get_af3(byte *buf)
+{
+ return (get_u16(buf) << 16) | buf[2];
+}
+
+static inline u32
+get_af4(byte *buf)
+{
+ return (get_u16(buf) << 16) | buf[3];
+}
+
/*
* MRT Dump format is not semantically specified.
* We will use these values in appropriate fields:
@@ -58,31 +110,41 @@ static byte *
mrt_put_bgp4_hdr(byte *buf, struct bgp_conn *conn, int as4)
{
struct bgp_proto *p = conn->bgp;
+ uint v4 = ipa_is_ip4(p->cf->remote_ip);
if (as4)
- {
- put_u32(buf+0, p->remote_as);
- put_u32(buf+4, p->local_as);
- buf+=8;
- }
+ {
+ put_u32(buf+0, p->remote_as);
+ put_u32(buf+4, p->public_as);
+ buf+=8;
+ }
else
- {
- put_u16(buf+0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS);
- put_u16(buf+2, (p->local_as <= 0xFFFF) ? p->local_as : AS_TRANS);
- buf+=4;
- }
+ {
+ put_u16(buf+0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS);
+ put_u16(buf+2, (p->public_as <= 0xFFFF) ? p->public_as : AS_TRANS);
+ buf+=4;
+ }
put_u16(buf+0, (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0);
- put_u16(buf+2, BGP_AF);
+ put_u16(buf+2, v4 ? BGP_AFI_IPV4 : BGP_AFI_IPV6);
buf+=4;
- buf = put_ipa(buf, conn->sk ? conn->sk->daddr : IPA_NONE);
- buf = put_ipa(buf, conn->sk ? conn->sk->saddr : IPA_NONE);
+
+ if (v4)
+ {
+ buf = put_ip4(buf, conn->sk ? ipa_to_ip4(conn->sk->daddr) : IP4_NONE);
+ buf = put_ip4(buf, conn->sk ? ipa_to_ip4(conn->sk->saddr) : IP4_NONE);
+ }
+ else
+ {
+ buf = put_ip6(buf, conn->sk ? ipa_to_ip6(conn->sk->daddr) : IP6_NONE);
+ buf = put_ip6(buf, conn->sk ? ipa_to_ip6(conn->sk->saddr) : IP6_NONE);
+ }
return buf;
}
static void
-mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len)
+mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, uint len)
{
byte *buf = alloca(128+len); /* 128 is enough for MRT headers */
byte *bp = buf + MRTDUMP_HDR_LENGTH;
@@ -96,14 +158,14 @@ mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len)
}
static inline u16
-convert_state(unsigned state)
+convert_state(uint state)
{
/* Convert state from our BS_* values to values used in MRTDump */
return (state == BS_CLOSE) ? 1 : state + 1;
}
void
-mrt_dump_bgp_state_change(struct bgp_conn *conn, unsigned old, unsigned new)
+mrt_dump_bgp_state_change(struct bgp_conn *conn, uint old, uint new)
{
byte buf[128];
byte *bp = buf + MRTDUMP_HDR_LENGTH;
@@ -127,1303 +189,2426 @@ bgp_create_notification(struct bgp_conn *conn, byte *buf)
return buf + 2 + conn->notify_size;
}
-#ifdef IPV6
-static byte *
-bgp_put_cap_ipv6(struct bgp_proto *p UNUSED, byte *buf)
-{
- *buf++ = 1; /* Capability 1: Multiprotocol extensions */
- *buf++ = 4; /* Capability data length */
- *buf++ = 0; /* We support AF IPv6 */
- *buf++ = BGP_AF_IPV6;
- *buf++ = 0; /* RFU */
- *buf++ = 1; /* and SAFI 1 */
- return buf;
-}
-#else
+/* Capability negotiation as per RFC 5492 */
-static byte *
-bgp_put_cap_ipv4(struct bgp_proto *p UNUSED, byte *buf)
-{
- *buf++ = 1; /* Capability 1: Multiprotocol extensions */
- *buf++ = 4; /* Capability data length */
- *buf++ = 0; /* We support AF IPv4 */
- *buf++ = BGP_AF_IPV4;
- *buf++ = 0; /* RFU */
- *buf++ = 1; /* and SAFI 1 */
- return buf;
+const struct bgp_af_caps *
+bgp_find_af_caps(struct bgp_caps *caps, u32 afi)
+{
+ struct bgp_af_caps *ac;
+
+ WALK_AF_CAPS(caps, ac)
+ if (ac->afi == afi)
+ return ac;
+
+ return NULL;
}
-#endif
-static byte *
-bgp_put_cap_rr(struct bgp_proto *p UNUSED, byte *buf)
+static struct bgp_af_caps *
+bgp_get_af_caps(struct bgp_caps *caps, u32 afi)
{
- *buf++ = 2; /* Capability 2: Support for route refresh */
- *buf++ = 0; /* Capability data length */
- return buf;
+ struct bgp_af_caps *ac;
+
+ WALK_AF_CAPS(caps, ac)
+ if (ac->afi == afi)
+ return ac;
+
+ ac = &caps->af_data[caps->af_count++];
+ memset(ac, 0, sizeof(struct bgp_af_caps));
+ ac->afi = afi;
+
+ return ac;
}
-static byte *
-bgp_put_cap_ext_msg(struct bgp_proto *p UNUSED, byte *buf)
+static int
+bgp_af_caps_cmp(const void *X, const void *Y)
{
- *buf++ = 6; /* Capability 6: Support for extended messages */
- *buf++ = 0; /* Capability data length */
- return buf;
+ const struct bgp_af_caps *x = X, *y = Y;
+ return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 1 : 0;
}
+
static byte *
-bgp_put_cap_gr1(struct bgp_proto *p, byte *buf)
+bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
{
- *buf++ = 64; /* Capability 64: Support for graceful restart */
- *buf++ = 6; /* Capability data length */
+ struct bgp_proto *p = conn->bgp;
+ struct bgp_channel *c;
+ struct bgp_caps *caps;
+ struct bgp_af_caps *ac;
+ uint any_ext_next_hop = 0;
+ uint any_add_path = 0;
+ byte *data;
- put_u16(buf, p->cf->gr_time);
- if (p->p.gr_recovery)
- buf[0] |= BGP_GRF_RESTART;
- buf += 2;
+ /* Prepare bgp_caps structure */
+
+ int n = list_length(&p->p.channels);
+ caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + n * sizeof(struct bgp_af_caps));
+ conn->local_caps = caps;
+
+ caps->as4_support = p->cf->enable_as4;
+ caps->ext_messages = p->cf->enable_extended_messages;
+ caps->route_refresh = p->cf->enable_refresh;
+ caps->enhanced_refresh = p->cf->enable_refresh;
+
+ if (caps->as4_support)
+ caps->as4_number = p->public_as;
+
+ if (p->cf->gr_mode)
+ {
+ caps->gr_aware = 1;
+ caps->gr_time = p->cf->gr_time;
+ caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0;
+ }
+
+ /* Allocate and fill per-AF fields */
+ WALK_LIST(c, p->p.channels)
+ {
+ ac = &caps->af_data[caps->af_count++];
+ ac->afi = c->afi;
+ ac->ready = 1;
+
+ ac->ext_next_hop = bgp_channel_is_ipv4(c) && c->cf->ext_next_hop;
+ any_ext_next_hop |= ac->ext_next_hop;
+
+ ac->add_path = c->cf->add_path;
+ any_add_path |= ac->add_path;
+
+ if (c->cf->gr_able)
+ {
+ ac->gr_able = 1;
+
+ if (p->p.gr_recovery)
+ ac->gr_af_flags |= BGP_GRF_FORWARDING;
+ }
+ }
+
+ /* Sort capability fields by AFI/SAFI */
+ qsort(caps->af_data, caps->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp);
- *buf++ = 0; /* Appropriate AF */
- *buf++ = BGP_AF;
- *buf++ = 1; /* and SAFI 1 */
- *buf++ = p->p.gr_recovery ? BGP_GRF_FORWARDING : 0;
+
+ /* Create capability list in buffer */
+
+ /*
+ * Note that max length is ~ 20+14*af_count. With max 12 channels that is
+ * 188. Option limit is 253 and buffer size is 4096, so we cannot overflow
+ * unless we add new capabilities or more AFs.
+ */
+
+ WALK_AF_CAPS(caps, ac)
+ if (ac->ready)
+ {
+ *buf++ = 1; /* Capability 1: Multiprotocol extensions */
+ *buf++ = 4; /* Capability data length */
+ put_af4(buf, ac->afi);
+ buf += 4;
+ }
+
+ if (caps->route_refresh)
+ {
+ *buf++ = 2; /* Capability 2: Support for route refresh */
+ *buf++ = 0; /* Capability data length */
+ }
+
+ if (any_ext_next_hop)
+ {
+ *buf++ = 5; /* Capability 5: Support for extended next hop */
+ *buf++ = 0; /* Capability data length, will be fixed later */
+ data = buf;
+
+ WALK_AF_CAPS(caps, ac)
+ if (ac->ext_next_hop)
+ {
+ put_af4(buf, ac->afi);
+ put_u16(buf+4, BGP_AFI_IPV6);
+ buf += 6;
+ }
+
+ data[-1] = buf - data;
+ }
+
+ if (caps->ext_messages)
+ {
+ *buf++ = 6; /* Capability 6: Support for extended messages */
+ *buf++ = 0; /* Capability data length */
+ }
+
+ if (caps->gr_aware)
+ {
+ *buf++ = 64; /* Capability 64: Support for graceful restart */
+ *buf++ = 0; /* Capability data length, will be fixed later */
+ data = buf;
+
+ put_u16(buf, caps->gr_time);
+ buf[0] |= caps->gr_flags;
+ buf += 2;
+
+ WALK_AF_CAPS(caps, ac)
+ if (ac->gr_able)
+ {
+ put_af3(buf, ac->afi);
+ buf[3] = ac->gr_af_flags;
+ buf += 4;
+ }
+
+ data[-1] = buf - data;
+ }
+
+ if (caps->as4_support)
+ {
+ *buf++ = 65; /* Capability 65: Support for 4-octet AS number */
+ *buf++ = 4; /* Capability data length */
+ put_u32(buf, p->public_as);
+ buf += 4;
+ }
+
+ if (any_add_path)
+ {
+ *buf++ = 69; /* Capability 69: Support for ADD-PATH */
+ *buf++ = 0; /* Capability data length, will be fixed later */
+ data = buf;
+
+ WALK_AF_CAPS(caps, ac)
+ if (ac->add_path)
+ {
+ put_af3(buf, ac->afi);
+ buf[3] = ac->add_path;
+ buf += 4;
+ }
+
+ data[-1] = buf - data;
+ }
+
+ if (caps->enhanced_refresh)
+ {
+ *buf++ = 70; /* Capability 70: Support for enhanced route refresh */
+ *buf++ = 0; /* Capability data length */
+ }
return buf;
}
-static byte *
-bgp_put_cap_gr2(struct bgp_proto *p UNUSED, byte *buf)
+static void
+bgp_read_capabilities(struct bgp_conn *conn, struct bgp_caps *caps, byte *pos, int len)
{
- *buf++ = 64; /* Capability 64: Support for graceful restart */
- *buf++ = 2; /* Capability data length */
- put_u16(buf, 0);
- return buf + 2;
-}
+ struct bgp_proto *p = conn->bgp;
+ struct bgp_af_caps *ac;
+ int i, cl;
+ u32 af;
-static byte *
-bgp_put_cap_as4(struct bgp_proto *p, byte *buf)
-{
- *buf++ = 65; /* Capability 65: Support for 4-octet AS number */
- *buf++ = 4; /* Capability data length */
- put_u32(buf, p->local_as);
- return buf + 4;
-}
+ while (len > 0)
+ {
+ if (len < 2 || len < (2 + pos[1]))
+ goto err;
-static byte *
-bgp_put_cap_add_path(struct bgp_proto *p, byte *buf)
-{
- *buf++ = 69; /* Capability 69: Support for ADD-PATH */
- *buf++ = 4; /* Capability data length */
+ /* Capability length */
+ cl = pos[1];
- *buf++ = 0; /* Appropriate AF */
- *buf++ = BGP_AF;
- *buf++ = 1; /* SAFI 1 */
+ /* Capability type */
+ switch (pos[0])
+ {
+ case 1: /* Multiprotocol capability, RFC 4760 */
+ if (cl != 4)
+ goto err;
- *buf++ = p->cf->add_path;
+ af = get_af4(pos+2);
+ ac = bgp_get_af_caps(caps, af);
+ ac->ready = 1;
+ break;
- return buf;
+ case 2: /* Route refresh capability, RFC 2918 */
+ if (cl != 0)
+ goto err;
+
+ caps->route_refresh = 1;
+ break;
+
+ case 5: /* Extended next hop encoding capability, RFC 5549 */
+ if (cl % 6)
+ goto err;
+
+ for (i = 0; i < cl; i += 6)
+ {
+ /* Specified only for IPv4 prefixes with IPv6 next hops */
+ if ((get_u16(pos+2+i+0) != BGP_AFI_IPV4) ||
+ (get_u16(pos+2+i+4) != BGP_AFI_IPV6))
+ continue;
+
+ af = get_af4(pos+2+i);
+ ac = bgp_get_af_caps(caps, af);
+ ac->ext_next_hop = 1;
+ }
+ break;
+
+ case 6: /* Extended message length capability, RFC draft */
+ if (cl != 0)
+ goto err;
+
+ caps->ext_messages = 1;
+ break;
+
+ case 64: /* Graceful restart capability, RFC 4724 */
+ if (cl % 4 != 2)
+ goto err;
+
+ /* Only the last instance is valid */
+ WALK_AF_CAPS(caps, ac)
+ {
+ ac->gr_able = 0;
+ ac->gr_af_flags = 0;
+ }
+
+ caps->gr_aware = 1;
+ caps->gr_flags = pos[2] & 0xf0;
+ caps->gr_time = get_u16(pos + 2) & 0x0fff;
+
+ for (i = 2; i < cl; i += 4)
+ {
+ af = get_af3(pos+2+i);
+ ac = bgp_get_af_caps(caps, af);
+ ac->gr_able = 1;
+ ac->gr_af_flags = pos[2+i+3];
+ }
+ break;
+
+ case 65: /* AS4 capability, RFC 6793 */
+ if (cl != 4)
+ goto err;
+
+ caps->as4_support = 1;
+ caps->as4_number = get_u32(pos + 2);
+ break;
+
+ case 69: /* ADD-PATH capability, RFC 7911 */
+ if (cl % 4)
+ goto err;
+
+ for (i = 0; i < cl; i += 4)
+ {
+ byte val = pos[2+i+3];
+ if (!val || (val > BGP_ADD_PATH_FULL))
+ {
+ log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring",
+ p->p.name, val);
+ break;
+ }
+ }
+
+ for (i = 0; i < cl; i += 4)
+ {
+ af = get_af3(pos+2+i);
+ ac = bgp_get_af_caps(caps, af);
+ ac->add_path = pos[2+i+3];
+ }
+ break;
+
+ case 70: /* Enhanced route refresh capability, RFC 7313 */
+ if (cl != 0)
+ goto err;
+
+ caps->enhanced_refresh = 1;
+ break;
+
+ /* We can safely ignore all other capabilities */
+ }
+
+ ADVANCE(pos, len, 2 + cl);
+ }
+ return;
+
+err:
+ bgp_error(conn, 2, 0, NULL, 0);
+ return;
}
-static byte *
-bgp_put_cap_err(struct bgp_proto *p UNUSED, byte *buf)
+static int
+bgp_read_options(struct bgp_conn *conn, byte *pos, int len)
{
- *buf++ = 70; /* Capability 70: Support for enhanced route refresh */
- *buf++ = 0; /* Capability data length */
- return buf;
-}
+ struct bgp_proto *p = conn->bgp;
+ struct bgp_caps *caps;
+ int ol;
+
+ /* Max number of announced AFIs is limited by max option length (255) */
+ caps = alloca(sizeof(struct bgp_caps) + 64 * sizeof(struct bgp_af_caps));
+ memset(caps, 0, sizeof(struct bgp_caps));
+
+ while (len > 0)
+ {
+ if ((len < 2) || (len < (2 + pos[1])))
+ { bgp_error(conn, 2, 0, NULL, 0); return -1; }
+ ol = pos[1];
+ if (pos[0] == 2)
+ {
+ /* BGP capabilities, RFC 5492 */
+ if (p->cf->capabilities)
+ bgp_read_capabilities(conn, caps, pos + 2, ol);
+ }
+ else
+ {
+ /* Unknown option */
+ bgp_error(conn, 2, 4, pos, ol); /* FIXME: ol or ol+2 ? */
+ return -1;
+ }
+
+ ADVANCE(pos, len, 2 + ol);
+ }
+
+ uint n = sizeof(struct bgp_caps) + caps->af_count * sizeof(struct bgp_af_caps);
+ conn->remote_caps = mb_allocz(p->p.pool, n);
+ memcpy(conn->remote_caps, caps, n);
+
+ return 0;
+}
static byte *
bgp_create_open(struct bgp_conn *conn, byte *buf)
{
struct bgp_proto *p = conn->bgp;
- byte *cap;
- int cap_len;
BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
- BGP_VERSION, p->local_as, p->cf->hold_time, p->local_id);
+ BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id);
+
buf[0] = BGP_VERSION;
- put_u16(buf+1, (p->local_as < 0xFFFF) ? p->local_as : AS_TRANS);
+ put_u16(buf+1, (p->public_as < 0xFFFF) ? p->public_as : AS_TRANS);
put_u16(buf+3, p->cf->hold_time);
put_u32(buf+5, p->local_id);
- if (conn->start_state == BSS_CONNECT_NOCAP)
- {
- BGP_TRACE(D_PACKETS, "Skipping capabilities");
- buf[9] = 0;
- return buf + 10;
- }
+ if (p->cf->capabilities)
+ {
+ /* Prepare local_caps and write capabilities to buffer */
+ byte *end = bgp_write_capabilities(conn, buf+12);
+ uint len = end - (buf+12);
+
+ buf[9] = len + 2; /* Optional parameters length */
+ buf[10] = 2; /* Option 2: Capability list */
+ buf[11] = len; /* Option data length */
+
+ return end;
+ }
+ else
+ {
+ /* Prepare empty local_caps */
+ conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps));
+
+ buf[9] = 0; /* No optional parameters */
+ return buf + 10;
+ }
+
+ return buf;
+}
+
+static void
+bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
+{
+ struct bgp_proto *p = conn->bgp;
+ struct bgp_conn *other;
+ u32 asn, hold, id;
- /* Skipped 3 B for length field and Capabilities parameter header */
- cap = buf + 12;
+ /* Check state */
+ if (conn->state != BS_OPENSENT)
+ { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
+
+ /* Check message contents */
+ if (len < 29 || len != 29 + (uint) pkt[28])
+ { bgp_error(conn, 1, 2, pkt+16, 2); return; }
-#ifndef IPV6
- if (p->cf->advertise_ipv4)
- cap = bgp_put_cap_ipv4(p, cap);
-#endif
+ if (pkt[19] != BGP_VERSION)
+ { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; }
-#ifdef IPV6
- cap = bgp_put_cap_ipv6(p, cap);
-#endif
+ asn = get_u16(pkt+20);
+ hold = get_u16(pkt+22);
+ id = get_u32(pkt+24);
+ BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id);
- if (p->cf->enable_refresh)
- cap = bgp_put_cap_rr(p, cap);
+ if (bgp_read_options(conn, pkt+29, pkt[28]) < 0)
+ return;
- if (p->cf->gr_mode == BGP_GR_ABLE)
- cap = bgp_put_cap_gr1(p, cap);
- else if (p->cf->gr_mode == BGP_GR_AWARE)
- cap = bgp_put_cap_gr2(p, cap);
+ if (hold > 0 && hold < 3)
+ { bgp_error(conn, 2, 6, pkt+22, 2); return; }
- if (p->cf->enable_as4)
- cap = bgp_put_cap_as4(p, cap);
+ /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
+ if (!id || (p->is_internal && id == p->local_id))
+ { bgp_error(conn, 2, 3, pkt+24, -4); return; }
- if (p->cf->add_path)
- cap = bgp_put_cap_add_path(p, cap);
+ struct bgp_caps *caps = conn->remote_caps;
- if (p->cf->enable_refresh)
- cap = bgp_put_cap_err(p, cap);
+ if (caps->as4_support)
+ {
+ u32 as4 = caps->as4_number;
- if (p->cf->enable_extended_messages)
- cap = bgp_put_cap_ext_msg(p, cap);
+ if ((as4 != asn) && (asn != AS_TRANS))
+ log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
- cap_len = cap - buf - 12;
- if (cap_len > 0)
- {
- buf[9] = cap_len + 2; /* Optional params len */
- buf[10] = 2; /* Option: Capability list */
- buf[11] = cap_len; /* Option length */
- return cap;
- }
+ if (as4 != p->remote_as)
+ { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; }
+ }
else
+ {
+ if (asn != p->remote_as)
+ { bgp_error(conn, 2, 2, pkt+20, 2); return; }
+ }
+
+ /* Check the other connection */
+ other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
+ switch (other->state)
+ {
+ case BS_CONNECT:
+ case BS_ACTIVE:
+ /* Stop outgoing connection attempts */
+ bgp_conn_enter_idle_state(other);
+ break;
+
+ case BS_IDLE:
+ case BS_OPENSENT:
+ case BS_CLOSE:
+ break;
+
+ case BS_OPENCONFIRM:
+ /*
+ * Description of collision detection rules in RFC 4271 is confusing and
+ * contradictory, but it is essentially:
+ *
+ * 1. Router with higher ID is dominant
+ * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
+ * 3. When both connections are in OpenConfirm state, one initiated by
+ * the dominant router is kept.
+ *
+ * The first line in the expression below evaluates whether the neighbor
+ * is dominant, the second line whether the new connection was initiated
+ * by the neighbor. If both are true (or both are false), we keep the new
+ * connection, otherwise we keep the old one.
+ */
+ if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as)))
+ == (conn == &p->incoming_conn))
{
- buf[9] = 0; /* No optional parameters */
- return buf + 10;
+ /* Should close the other connection */
+ BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
+ bgp_error(other, 6, 7, NULL, 0);
+ break;
}
+ /* Fall thru */
+ case BS_ESTABLISHED:
+ /* Should close this connection */
+ BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
+ bgp_error(conn, 6, 7, NULL, 0);
+ return;
+
+ default:
+ bug("bgp_rx_open: Unknown state");
+ }
+
+ /* Update our local variables */
+ conn->hold_time = MIN(hold, p->cf->hold_time);
+ conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
+ conn->as4_session = conn->local_caps->as4_support && caps->as4_support;
+ conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages;
+ p->remote_id = id;
+
+ DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n",
+ conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session);
+
+ bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
+ bgp_start_timer(conn->hold_timer, conn->hold_time);
+ bgp_conn_enter_openconfirm_state(conn);
}
-static uint
-bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, uint remains)
+
+/*
+ * Next hop handling
+ */
+
+#define REPORT(msg, args...) \
+ ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); })
+
+#define DISCARD(msg, args...) \
+ ({ REPORT(msg, ## args); return; })
+
+#define WITHDRAW(msg, args...) \
+ ({ REPORT(msg, ## args); s->err_withdraw = 1; return; })
+
+#define BAD_AFI "Unexpected AF <%u/%u> in UPDATE"
+#define BAD_NEXT_HOP "Invalid NEXT_HOP attribute"
+#define NO_NEXT_HOP "Missing NEXT_HOP attribute"
+#define NO_LABEL_STACK "Missing MPLS stack"
+
+
+static void
+bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll)
{
- byte *start = w;
- ip_addr a;
- int bytes;
+ struct bgp_proto *p = s->proto;
+ struct bgp_channel *c = s->channel;
- while (!EMPTY_LIST(buck->prefixes) && (remains >= (5+sizeof(ip_addr))))
- {
- struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes));
- DBG("\tDequeued route %I/%d\n", px->n.prefix, px->n.pxlen);
+ if (c->cf->gw_mode == GW_DIRECT)
+ {
+ neighbor *nbr = NULL;
- if (p->add_path_tx)
- {
- put_u32(w, px->path_id);
- w += 4;
- remains -= 4;
- }
+ /* GW_DIRECT -> single_hop -> p->neigh != NULL */
+ if (ipa_nonzero(gw))
+ nbr = neigh_find2(&p->p, &gw, NULL, 0);
+ else if (ipa_nonzero(ll))
+ nbr = neigh_find2(&p->p, &ll, p->neigh->iface, 0);
- *w++ = px->n.pxlen;
- bytes = (px->n.pxlen + 7) / 8;
- a = px->n.prefix;
- ipa_hton(a);
- memcpy(w, &a, bytes);
- w += bytes;
- remains -= bytes + 1;
- rem_node(&px->bucket_node);
- bgp_free_prefix(p, px);
- // fib_delete(&p->prefix_fib, px);
- }
- return w - start;
+ if (!nbr || (nbr->scope == SCOPE_HOST))
+ WITHDRAW(BAD_NEXT_HOP);
+
+ a->dest = RTD_UNICAST;
+ a->nh.gw = nbr->addr;
+ a->nh.iface = nbr->iface;
+ }
+ else /* GW_RECURSIVE */
+ {
+ if (ipa_zero(gw))
+ WITHDRAW(BAD_NEXT_HOP);
+
+ rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6;
+ s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table);
+
+ if (!s->mpls)
+ rta_apply_hostentry(a, s->hostentry, NULL);
+
+ /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */
+ }
}
static void
-bgp_flush_prefixes(struct bgp_proto *p, struct bgp_bucket *buck)
+bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum)
{
- while (!EMPTY_LIST(buck->prefixes))
- {
- struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes));
- log(L_ERR "%s: - route %I/%d skipped", p->p.name, px->n.prefix, px->n.pxlen);
- rem_node(&px->bucket_node);
- bgp_free_prefix(p, px);
- // fib_delete(&p->prefix_fib, px);
- }
+ if (lnum > MPLS_MAX_LABEL_STACK)
+ {
+ REPORT("Too many MPLS labels ($u)", lnum);
+
+ a->dest = RTD_UNREACHABLE;
+ a->hostentry = NULL;
+ a->nh = (struct nexthop) { };
+ return;
+ }
+
+ /* Handle implicit NULL as empty MPLS stack */
+ if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL))
+ lnum = 0;
+
+ if (s->channel->cf->gw_mode == GW_DIRECT)
+ {
+ a->nh.labels = lnum;
+ memcpy(a->nh.label, labels, 4*lnum);
+ }
+ else /* GW_RECURSIVE */
+ {
+ mpls_label_stack ms;
+
+ ms.len = lnum;
+ memcpy(ms.stack, labels, 4*lnum);
+ rta_apply_hostentry(a, s->hostentry, &ms);
+ }
}
-#ifndef IPV6 /* IPv4 version */
-static byte *
-bgp_create_update(struct bgp_conn *conn, byte *buf)
+static inline int
+bgp_use_next_hop(struct bgp_export_state *s, eattr *a)
{
- struct bgp_proto *p = conn->bgp;
- struct bgp_bucket *buck;
- int remains = bgp_max_packet_length(p) - BGP_HEADER_LENGTH - 4;
- byte *w;
- int wd_size = 0;
- int r_size = 0;
- int a_size = 0;
-
- w = buf+2;
- if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
- {
- DBG("Withdrawn routes:\n");
- wd_size = bgp_encode_prefixes(p, w, buck, remains);
- w += wd_size;
- remains -= wd_size;
- }
- put_u16(buf, wd_size);
+ struct bgp_proto *p = s->proto;
+ ip_addr *nh = (void *) a->u.ptr->data;
- if (!wd_size)
- {
- while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next)
- {
- if (EMPTY_LIST(buck->prefixes))
- {
- DBG("Deleting empty bucket %p\n", buck);
- rem_node(&buck->send_node);
- bgp_free_bucket(p, buck);
- continue;
- }
-
- DBG("Processing bucket %p\n", buck);
- a_size = bgp_encode_attrs(p, w+2, buck->eattrs, remains - 1024);
-
- if (a_size < 0)
- {
- log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name);
- bgp_flush_prefixes(p, buck);
- rem_node(&buck->send_node);
- bgp_free_bucket(p, buck);
- continue;
- }
-
- put_u16(w, a_size);
- w += a_size + 2;
- r_size = bgp_encode_prefixes(p, w, buck, remains - a_size);
- w += r_size;
- break;
- }
- }
- if (!a_size) /* Attributes not already encoded */
+ if (s->channel->cf->next_hop_self)
+ return 0;
+
+ if (s->channel->cf->next_hop_keep)
+ return 1;
+
+ /* Keep it when explicitly set in export filter */
+ if (a->type & EAF_FRESH)
+ return 1;
+
+ /* Keep it when exported to internal peers */
+ if (p->is_interior && ipa_nonzero(*nh))
+ return 1;
+
+ /* Keep it when forwarded between single-hop BGPs on the same iface */
+ struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL;
+ return p->neigh && (p->neigh->iface == ifa);
+}
+
+static inline int
+bgp_use_gateway(struct bgp_export_state *s)
+{
+ struct bgp_proto *p = s->proto;
+ rta *ra = s->route->attrs;
+
+ if (s->channel->cf->next_hop_self)
+ return 0;
+
+ /* We need one valid global gateway */
+ if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw))
+ return 0;
+
+ /* Use it when exported to internal peers */
+ if (p->is_interior)
+ return 1;
+
+ /* Use it when forwarded to single-hop BGP peer on on the same iface */
+ return p->neigh && (p->neigh->iface == ra->nh.iface);
+}
+
+static void
+bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to)
+{
+ if (!a || !bgp_use_next_hop(s, a))
+ {
+ if (bgp_use_gateway(s))
{
- put_u16(w, 0);
- w += 2;
+ rta *ra = s->route->attrs;
+ ip_addr nh[1] = { ra->nh.gw };
+ bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16);
+
+ if (s->mpls)
+ {
+ u32 implicit_null = BGP_MPLS_NULL;
+ u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null;
+ uint lnum = ra->nh.labels ? ra->nh.labels : 1;
+ bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4);
+ }
}
- if (wd_size || r_size)
+ else
{
- BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
- return w;
+ ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr };
+ bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16);
+
+ /* TODO: Use local MPLS assigned label */
+ if (s->mpls)
+ bgp_unset_attr(to, s->pool, BA_MPLS_LABEL_STACK);
}
+ }
+
+ /* Check if next hop is valid */
+ a = bgp_find_attr(*to, BA_NEXT_HOP);
+ if (!a)
+ WITHDRAW(NO_NEXT_HOP);
+
+ ip_addr *nh = (void *) a->u.ptr->data;
+ ip_addr peer = s->proto->cf->remote_ip;
+ uint len = a->u.ptr->length;
+
+ /* Forbid zero next hop */
+ if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1])))
+ WITHDRAW(BAD_NEXT_HOP);
+
+ /* Forbid next hop equal to neighbor IP */
+ if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1])))
+ WITHDRAW(BAD_NEXT_HOP);
+
+ /* Forbid next hop with non-matching AF */
+ if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) &&
+ !s->channel->ext_next_hop)
+ WITHDRAW(BAD_NEXT_HOP);
+
+ /* Just check if MPLS stack */
+ if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK))
+ WITHDRAW(NO_LABEL_STACK);
+}
+
+static uint
+bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
+{
+ /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */
+ ip_addr *nh = (void *) a->u.ptr->data;
+ uint len = a->u.ptr->length;
+
+ ASSERT((len == 16) || (len == 32));
+
+ /*
+ * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
+ * is specified in RFC 5549 for IPv4 and in RFC 4798 for IPv6. The difference
+ * is that IPv4 address is directly encoded with IPv4 NLRI, but as IPv4-mapped
+ * IPv6 address with IPv6 NLRI.
+ */
+
+ if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
+ {
+ put_ip4(buf, ipa_to_ip4(nh[0]));
+ return 4;
+ }
+
+ put_ip6(buf, ipa_to_ip6(nh[0]));
+
+ if (len == 32)
+ put_ip6(buf+16, ipa_to_ip6(nh[1]));
+
+ return len;
+}
+
+static void
+bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a)
+{
+ struct bgp_channel *c = s->channel;
+ struct adata *ad = lp_alloc_adata(s->pool, 32);
+ ip_addr *nh = (void *) ad->data;
+
+ if (len == 4)
+ {
+ nh[0] = ipa_from_ip4(get_ip4(data));
+ nh[1] = IPA_NONE;
+ }
+ else if (len == 16)
+ {
+ nh[0] = ipa_from_ip6(get_ip6(data));
+ nh[1] = IPA_NONE;
+
+ if (ipa_is_link_local(nh[0]))
+ { nh[1] = nh[0]; nh[0] = IPA_NONE; }
+ }
+ else if (len == 32)
+ {
+ nh[0] = ipa_from_ip6(get_ip6(data));
+ nh[1] = ipa_from_ip6(get_ip6(data+16));
+
+ if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
+ nh[1] = IPA_NONE;
+ }
else
- return NULL;
+ bgp_parse_error(s, 9);
+
+ if (ipa_zero(nh[1]))
+ ad->length = 16;
+
+ if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
+ WITHDRAW(BAD_NEXT_HOP);
+
+ // XXXX validate next hop
+
+ bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
+ bgp_apply_next_hop(s, a, nh[0], nh[1]);
}
-static byte *
-bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
+static uint
+bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
{
- struct bgp_proto *p = conn->bgp;
- BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");
+ ip_addr *nh = (void *) a->u.ptr->data;
+ uint len = a->u.ptr->length;
- put_u32(buf, 0);
- return buf+4;
+ ASSERT((len == 16) || (len == 32));
+
+ /*
+ * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
+ * is specified in RFC 5549 for VPNv4 and in RFC 4659 for VPNv6. The difference
+ * is that IPv4 address is directly encoded with VPNv4 NLRI, but as IPv4-mapped
+ * IPv6 address with VPNv6 NLRI.
+ */
+
+ if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
+ {
+ put_u64(buf, 0); /* VPN RD is 0 */
+ put_ip4(buf+8, ipa_to_ip4(nh[0]));
+ return 12;
+ }
+
+ put_u64(buf, 0); /* VPN RD is 0 */
+ put_ip6(buf+8, ipa_to_ip6(nh[0]));
+
+ if (len == 16)
+ return 24;
+
+ put_u64(buf+24, 0); /* VPN RD is 0 */
+ put_ip6(buf+32, ipa_to_ip6(nh[1]));
+
+ return 48;
}
-#else /* IPv6 version */
+static void
+bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a)
+{
+ struct bgp_channel *c = s->channel;
+ struct adata *ad = lp_alloc_adata(s->pool, 32);
+ ip_addr *nh = (void *) ad->data;
-static inline int
-same_iface(struct bgp_proto *p, ip_addr *ip)
+ if (len == 12)
+ {
+ nh[0] = ipa_from_ip4(get_ip4(data+8));
+ nh[1] = IPA_NONE;
+ }
+ else if (len == 24)
+ {
+ nh[0] = ipa_from_ip6(get_ip6(data+8));
+ nh[1] = IPA_NONE;
+
+ if (ipa_is_link_local(nh[0]))
+ { nh[1] = nh[0]; nh[0] = IPA_NONE; }
+ }
+ else if (len == 48)
+ {
+ nh[0] = ipa_from_ip6(get_ip6(data+8));
+ nh[1] = ipa_from_ip6(get_ip6(data+32));
+
+ if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
+ nh[1] = IPA_NONE;
+ }
+ else
+ bgp_parse_error(s, 9);
+
+ if (ipa_zero(nh[1]))
+ ad->length = 16;
+
+ /* XXXX which error */
+ if ((get_u64(data) != 0) || ((len == 48) && (get_u64(data+24) != 0)))
+ bgp_parse_error(s, 9);
+
+ if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
+ WITHDRAW(BAD_NEXT_HOP);
+
+ // XXXX validate next hop
+
+ bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
+ bgp_apply_next_hop(s, a, nh[0], nh[1]);
+}
+
+
+
+static uint
+bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte *buf UNUSED, uint size UNUSED)
{
- neighbor *n = neigh_find(&p->p, ip, 0);
- return n && p->neigh && n->iface == p->neigh->iface;
+ return 0;
}
-static byte *
-bgp_create_update(struct bgp_conn *conn, byte *buf)
+static void
+bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED)
{
- struct bgp_proto *p = conn->bgp;
- struct bgp_bucket *buck;
- int size, second, rem_stored;
- int remains = bgp_max_packet_length(p) - BGP_HEADER_LENGTH - 4;
- byte *w, *w_stored, *tmp, *tstart;
- ip_addr *ipp, ip, ip_ll;
- ea_list *ea;
- eattr *nh;
+ /*
+ * Although we expect no next hop and RFC 7606 7.11 states that attribute
+ * MP_REACH_NLRI with unexpected next hop length is considered malformed,
+ * FlowSpec RFC 5575 4 states that next hop shall be ignored on receipt.
+ */
+
+ return;
+}
- put_u16(buf, 0);
- w = buf+4;
+static void
+bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to)
+{
+ /* NEXT_HOP shall not pass */
+ if (a)
+ bgp_unset_attr(to, s->pool, BA_NEXT_HOP);
+}
+
+
+/*
+ * UPDATE
+ */
+
+static void
+bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0)
+{
+ if (path_id != s->last_id)
+ {
+ s->last_src = rt_get_source(&s->proto->p, path_id);
+ s->last_id = path_id;
- if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
+ rta_free(s->cached_rta);
+ s->cached_rta = NULL;
+ }
+
+ if (!a0)
+ {
+ /* Route withdraw */
+ rte_update2(&s->channel->c, n, NULL, s->last_src);
+ return;
+ }
+
+ /* Prepare cached route attributes */
+ if (s->cached_rta == NULL)
+ {
+ a0->src = s->last_src;
+
+ /* Workaround for rta_lookup() breaking eattrs */
+ ea_list *ea = a0->eattrs;
+ s->cached_rta = rta_lookup(a0);
+ a0->eattrs = ea;
+ }
+
+ rta *a = rta_clone(s->cached_rta);
+ rte *e = rte_get_temp(a);
+
+ e->pflags = 0;
+ e->u.bgp.suppressed = 0;
+ rte_update2(&s->channel->c, n, e, s->last_src);
+}
+
+static void
+bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, adata *mpls, byte **pos, uint *size, byte *pxlen)
+{
+ u32 dummy = 0;
+ u32 *labels = mpls ? (u32 *) mpls->data : &dummy;
+ uint lnum = mpls ? (mpls->length / 4) : 1;
+
+ for (uint i = 0; i < lnum; i++)
+ {
+ put_u24(*pos, labels[i] << 4);
+ ADVANCE(*pos, *size, 3);
+ }
+
+ /* Add bottom-of-stack flag */
+ (*pos)[-1] |= BGP_MPLS_BOS;
+
+ *pxlen += 24 * lnum;
+}
+
+static void
+bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a)
+{
+ u32 labels[BGP_MPLS_MAX], label;
+ uint lnum = 0;
+
+ do {
+ if (*pxlen < 24)
+ bgp_parse_error(s, 1);
+
+ label = get_u24(*pos);
+ labels[lnum++] = label >> 4;
+ ADVANCE(*pos, *len, 3);
+ *pxlen -= 24;
+
+ /* Withdraw: Magic label stack value 0x800000 according to RFC 3107, section 3, last paragraph */
+ if (!a && !s->err_withdraw && (lnum == 1) && (label == BGP_MPLS_MAGIC))
+ break;
+ }
+ while (!(label & BGP_MPLS_BOS));
+
+ if (!a)
+ return;
+
+ /* Attach MPLS attribute unless we already have one */
+ if (!s->mpls_labels)
+ {
+ s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX);
+ bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels);
+ }
+
+ /* Overwrite data in the attribute */
+ s->mpls_labels->length = 4*lnum;
+ memcpy(s->mpls_labels->data, labels, 4*lnum);
+
+ /* Update next hop entry in rta */
+ bgp_apply_mpls_labels(s, a, labels, lnum);
+
+ /* Attributes were changed, invalidate cached entry */
+ rta_free(s->cached_rta);
+ s->cached_rta = NULL;
+
+ return;
+}
+
+static uint
+bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
+{
+ byte *pos = buf;
+
+ while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
+ {
+ struct bgp_prefix *px = HEAD(buck->prefixes);
+ struct net_addr_ip4 *net = (void *) px->net;
+
+ /* Encode path ID */
+ if (s->add_path)
{
- DBG("Withdrawn routes:\n");
- tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_UNREACH_NLRI, remains-8);
- *tmp++ = 0;
- *tmp++ = BGP_AF_IPV6;
- *tmp++ = 1;
- ea->attrs[0].u.ptr->length = 3 + bgp_encode_prefixes(p, tmp, buck, remains-11);
- size = bgp_encode_attrs(p, w, ea, remains);
- ASSERT(size >= 0);
- w += size;
- remains -= size;
+ put_u32(pos, px->path_id);
+ ADVANCE(pos, size, 4);
}
- else
+
+ /* Encode prefix length */
+ *pos = net->pxlen;
+ ADVANCE(pos, size, 1);
+
+ /* Encode MPLS labels */
+ if (s->mpls)
+ bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
+
+ /* Encode prefix body */
+ ip4_addr a = ip4_hton(net->prefix);
+ uint b = (net->pxlen + 7) / 8;
+ memcpy(pos, &a, b);
+ ADVANCE(pos, size, b);
+
+ bgp_free_prefix(s->channel, px);
+ }
+
+ return pos - buf;
+}
+
+static void
+bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
+{
+ while (len)
+ {
+ net_addr_ip4 net;
+ u32 path_id = 0;
+
+ /* Decode path ID */
+ if (s->add_path)
{
- while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next)
- {
- if (EMPTY_LIST(buck->prefixes))
- {
- DBG("Deleting empty bucket %p\n", buck);
- rem_node(&buck->send_node);
- bgp_free_bucket(p, buck);
- continue;
- }
-
- DBG("Processing bucket %p\n", buck);
- rem_stored = remains;
- w_stored = w;
-
- size = bgp_encode_attrs(p, w, buck->eattrs, remains - 1024);
- if (size < 0)
- {
- log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name);
- bgp_flush_prefixes(p, buck);
- rem_node(&buck->send_node);
- bgp_free_bucket(p, buck);
- continue;
- }
- w += size;
- remains -= size;
-
- /* We have two addresses here in NEXT_HOP eattr. Really.
- Unless NEXT_HOP was modified by filter */
- nh = ea_find(buck->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
- ASSERT(nh);
- second = (nh->u.ptr->length == NEXT_HOP_LENGTH);
- ipp = (ip_addr *) nh->u.ptr->data;
- ip = ipp[0];
- ip_ll = IPA_NONE;
-
- if (ipa_equal(ip, p->source_addr))
- ip_ll = p->local_link;
- else
- {
- /* If we send a route with 'third party' next hop destinated
- * in the same interface, we should also send a link local
- * next hop address. We use the received one (stored in the
- * other part of BA_NEXT_HOP eattr). If we didn't received
- * it (for example it is a static route), we can't use
- * 'third party' next hop and we have to use local IP address
- * as next hop. Sending original next hop address without
- * link local address seems to be a natural way to solve that
- * problem, but it is contrary to RFC 2545 and Quagga does not
- * accept such routes.
- *
- * There are two cases, either we have global IP, or
- * IPA_NONE if the neighbor is link-local. For IPA_NONE,
- * we suppose it is on the same iface, see bgp_update_attrs().
- */
-
- if (ipa_zero(ip) || same_iface(p, &ip))
- {
- if (second && ipa_nonzero(ipp[1]))
- ip_ll = ipp[1];
- else
- {
- switch (p->cf->missing_lladdr)
- {
- case MLL_SELF:
- ip = p->source_addr;
- ip_ll = p->local_link;
- break;
- case MLL_DROP:
- log(L_ERR "%s: Missing link-local next hop address, skipping corresponding routes", p->p.name);
- w = w_stored;
- remains = rem_stored;
- bgp_flush_prefixes(p, buck);
- rem_node(&buck->send_node);
- bgp_free_bucket(p, buck);
- continue;
- case MLL_IGNORE:
- break;
- }
- }
- }
- }
-
- tstart = tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_REACH_NLRI, remains-8);
- *tmp++ = 0;
- *tmp++ = BGP_AF_IPV6;
- *tmp++ = 1;
-
- if (ipa_is_link_local(ip))
- ip = IPA_NONE;
-
- if (ipa_nonzero(ip_ll))
- {
- *tmp++ = 32;
- ipa_hton(ip);
- memcpy(tmp, &ip, 16);
- ipa_hton(ip_ll);
- memcpy(tmp+16, &ip_ll, 16);
- tmp += 32;
- }
- else
- {
- *tmp++ = 16;
- ipa_hton(ip);
- memcpy(tmp, &ip, 16);
- tmp += 16;
- }
-
- *tmp++ = 0; /* No SNPA information */
- tmp += bgp_encode_prefixes(p, tmp, buck, remains - (8+3+32+1));
- ea->attrs[0].u.ptr->length = tmp - tstart;
- size = bgp_encode_attrs(p, w, ea, remains);
- ASSERT(size >= 0);
- w += size;
- break;
- }
+ if (len < 5)
+ bgp_parse_error(s, 1);
+
+ path_id = get_u32(pos);
+ ADVANCE(pos, len, 4);
}
- size = w - (buf+4);
- put_u16(buf+2, size);
- lp_flush(bgp_linpool);
- if (size)
+ /* Decode prefix length */
+ uint l = *pos;
+ ADVANCE(pos, len, 1);
+
+ if (len < ((l + 7) / 8))
+ bgp_parse_error(s, 1);
+
+ /* Decode MPLS labels */
+ if (s->mpls)
+ bgp_decode_mpls_labels(s, &pos, &len, &l, a);
+
+ if (l > IP4_MAX_PREFIX_LENGTH)
+ bgp_parse_error(s, 10);
+
+ /* Decode prefix body */
+ ip4_addr addr = IP4_NONE;
+ uint b = (l + 7) / 8;
+ memcpy(&addr, pos, b);
+ ADVANCE(pos, len, b);
+
+ net = NET_ADDR_IP4(ip4_ntoh(addr), l);
+ net_normalize_ip4(&net);
+
+ // XXXX validate prefix
+
+ bgp_rte_update(s, (net_addr *) &net, path_id, a);
+ }
+}
+
+
+static uint
+bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
+{
+ byte *pos = buf;
+
+ while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
+ {
+ struct bgp_prefix *px = HEAD(buck->prefixes);
+ struct net_addr_ip6 *net = (void *) px->net;
+
+ /* Encode path ID */
+ if (s->add_path)
{
- BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
- return w;
+ put_u32(pos, px->path_id);
+ ADVANCE(pos, size, 4);
}
- else
- return NULL;
+
+ /* Encode prefix length */
+ *pos = net->pxlen;
+ ADVANCE(pos, size, 1);
+
+ /* Encode MPLS labels */
+ if (s->mpls)
+ bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
+
+ /* Encode prefix body */
+ ip6_addr a = ip6_hton(net->prefix);
+ uint b = (net->pxlen + 7) / 8;
+ memcpy(pos, &a, b);
+ ADVANCE(pos, size, b);
+
+ bgp_free_prefix(s->channel, px);
+ }
+
+ return pos - buf;
}
-static byte *
-bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
+static void
+bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
{
- struct bgp_proto *p = conn->bgp;
- BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");
+ while (len)
+ {
+ net_addr_ip6 net;
+ u32 path_id = 0;
- put_u16(buf+0, 0);
- put_u16(buf+2, 6); /* length 4-9 */
- buf += 4;
+ /* Decode path ID */
+ if (s->add_path)
+ {
+ if (len < 5)
+ bgp_parse_error(s, 1);
- /* Empty MP_UNREACH_NLRI atribute */
- *buf++ = BAF_OPTIONAL;
- *buf++ = BA_MP_UNREACH_NLRI;
- *buf++ = 3; /* Length 7-9 */
- *buf++ = 0; /* AFI */
- *buf++ = BGP_AF_IPV6;
- *buf++ = 1; /* SAFI */
- return buf;
-}
+ path_id = get_u32(pos);
+ ADVANCE(pos, len, 4);
+ }
-#endif
+ /* Decode prefix length */
+ uint l = *pos;
+ ADVANCE(pos, len, 1);
-static inline byte *
-bgp_create_route_refresh(struct bgp_conn *conn, byte *buf)
-{
- struct bgp_proto *p = conn->bgp;
- BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");
+ if (len < ((l + 7) / 8))
+ bgp_parse_error(s, 1);
- /* Original original route refresh request, RFC 2918 */
- *buf++ = 0;
- *buf++ = BGP_AF;
- *buf++ = BGP_RR_REQUEST;
- *buf++ = 1; /* SAFI */
- return buf;
+ /* Decode MPLS labels */
+ if (s->mpls)
+ bgp_decode_mpls_labels(s, &pos, &len, &l, a);
+
+ if (l > IP6_MAX_PREFIX_LENGTH)
+ bgp_parse_error(s, 10);
+
+ /* Decode prefix body */
+ ip6_addr addr = IP6_NONE;
+ uint b = (l + 7) / 8;
+ memcpy(&addr, pos, b);
+ ADVANCE(pos, len, b);
+
+ net = NET_ADDR_IP6(ip6_ntoh(addr), l);
+ net_normalize_ip6(&net);
+
+ // XXXX validate prefix
+
+ bgp_rte_update(s, (net_addr *) &net, path_id, a);
+ }
}
-static inline byte *
-bgp_create_begin_refresh(struct bgp_conn *conn, byte *buf)
+static uint
+bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
{
- struct bgp_proto *p = conn->bgp;
- BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");
+ byte *pos = buf;
- /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */
- *buf++ = 0;
- *buf++ = BGP_AF;
- *buf++ = BGP_RR_BEGIN;
- *buf++ = 1; /* SAFI */
- return buf;
+ while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
+ {
+ struct bgp_prefix *px = HEAD(buck->prefixes);
+ struct net_addr_vpn4 *net = (void *) px->net;
+
+ /* Encode path ID */
+ if (s->add_path)
+ {
+ put_u32(pos, px->path_id);
+ ADVANCE(pos, size, 4);
+ }
+
+ /* Encode prefix length */
+ *pos = 64 + net->pxlen;
+ ADVANCE(pos, size, 1);
+
+ /* Encode MPLS labels */
+ if (s->mpls)
+ bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
+
+ /* Encode route distinguisher */
+ put_u64(pos, net->rd);
+ ADVANCE(pos, size, 8);
+
+ /* Encode prefix body */
+ ip4_addr a = ip4_hton(net->prefix);
+ uint b = (net->pxlen + 7) / 8;
+ memcpy(pos, &a, b);
+ ADVANCE(pos, size, b);
+
+ bgp_free_prefix(s->channel, px);
+ }
+
+ return pos - buf;
}
-static inline byte *
-bgp_create_end_refresh(struct bgp_conn *conn, byte *buf)
+static void
+bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
{
- struct bgp_proto *p = conn->bgp;
- BGP_TRACE(D_PACKETS, "Sending END-OF-RR");
+ while (len)
+ {
+ net_addr_vpn4 net;
+ u32 path_id = 0;
- /* Demarcation of ending of route refresh (EoRR), RFC 7313 */
- *buf++ = 0;
- *buf++ = BGP_AF;
- *buf++ = BGP_RR_END;
- *buf++ = 1; /* SAFI */
- return buf;
+ /* Decode path ID */
+ if (s->add_path)
+ {
+ if (len < 5)
+ bgp_parse_error(s, 1);
+
+ path_id = get_u32(pos);
+ ADVANCE(pos, len, 4);
+ }
+
+ /* Decode prefix length */
+ uint l = *pos;
+ ADVANCE(pos, len, 1);
+
+ if (len < ((l + 7) / 8))
+ bgp_parse_error(s, 1);
+
+ /* Decode MPLS labels */
+ if (s->mpls)
+ bgp_decode_mpls_labels(s, &pos, &len, &l, a);
+
+ /* Decode route distinguisher */
+ if (l < 64)
+ bgp_parse_error(s, 1);
+
+ u64 rd = get_u64(pos);
+ ADVANCE(pos, len, 8);
+ l -= 64;
+
+ if (l > IP4_MAX_PREFIX_LENGTH)
+ bgp_parse_error(s, 10);
+
+ /* Decode prefix body */
+ ip4_addr addr = IP4_NONE;
+ uint b = (l + 7) / 8;
+ memcpy(&addr, pos, b);
+ ADVANCE(pos, len, b);
+
+ net = NET_ADDR_VPN4(ip4_ntoh(addr), l, rd);
+ net_normalize_vpn4(&net);
+
+ // XXXX validate prefix
+
+ bgp_rte_update(s, (net_addr *) &net, path_id, a);
+ }
}
+static uint
+bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
+{
+ byte *pos = buf;
+
+ while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
+ {
+ struct bgp_prefix *px = HEAD(buck->prefixes);
+ struct net_addr_vpn6 *net = (void *) px->net;
+
+ /* Encode path ID */
+ if (s->add_path)
+ {
+ put_u32(pos, px->path_id);
+ ADVANCE(pos, size, 4);
+ }
+
+ /* Encode prefix length */
+ *pos = 64 + net->pxlen;
+ ADVANCE(pos, size, 1);
+
+ /* Encode MPLS labels */
+ bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
+
+ /* Encode route distinguisher */
+ put_u64(pos, net->rd);
+ ADVANCE(pos, size, 8);
+
+ /* Encode prefix body */
+ ip6_addr a = ip6_hton(net->prefix);
+ uint b = (net->pxlen + 7) / 8;
+ memcpy(pos, &a, b);
+ ADVANCE(pos, size, b);
+
+ bgp_free_prefix(s->channel, px);
+ }
+
+ return pos - buf;
+}
+
static void
-bgp_create_header(byte *buf, uint len, uint type)
+bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
{
- memset(buf, 0xff, 16); /* Marker */
- put_u16(buf+16, len);
- buf[18] = type;
+ while (len)
+ {
+ net_addr_vpn6 net;
+ u32 path_id = 0;
+
+ /* Decode path ID */
+ if (s->add_path)
+ {
+ if (len < 5)
+ bgp_parse_error(s, 1);
+
+ path_id = get_u32(pos);
+ ADVANCE(pos, len, 4);
+ }
+
+ /* Decode prefix length */
+ uint l = *pos;
+ ADVANCE(pos, len, 1);
+
+ if (len < ((l + 7) / 8))
+ bgp_parse_error(s, 1);
+
+ /* Decode MPLS labels */
+ if (s->mpls)
+ bgp_decode_mpls_labels(s, &pos, &len, &l, a);
+
+ /* Decode route distinguisher */
+ if (l < 64)
+ bgp_parse_error(s, 1);
+
+ u64 rd = get_u64(pos);
+ ADVANCE(pos, len, 8);
+ l -= 64;
+
+ if (l > IP6_MAX_PREFIX_LENGTH)
+ bgp_parse_error(s, 10);
+
+ /* Decode prefix body */
+ ip6_addr addr = IP6_NONE;
+ uint b = (l + 7) / 8;
+ memcpy(&addr, pos, b);
+ ADVANCE(pos, len, b);
+
+ net = NET_ADDR_VPN6(ip6_ntoh(addr), l, rd);
+ net_normalize_vpn6(&net);
+
+ // XXXX validate prefix
+
+ bgp_rte_update(s, (net_addr *) &net, path_id, a);
+ }
}
-/**
- * bgp_fire_tx - transmit packets
- * @conn: connection
- *
- * Whenever the transmit buffers of the underlying TCP connection
- * are free and we have any packets queued for sending, the socket functions
- * call bgp_fire_tx() which takes care of selecting the highest priority packet
- * queued (Notification > Keepalive > Open > Update), assembling its header
- * and body and sending it to the connection.
- */
-static int
-bgp_fire_tx(struct bgp_conn *conn)
+
+static uint
+bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
{
- struct bgp_proto *p = conn->bgp;
- uint s = conn->packets_to_send;
- sock *sk = conn->sk;
- byte *buf, *pkt, *end;
- int type;
+ byte *pos = buf;
+
+ while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
+ {
+ struct bgp_prefix *px = HEAD(buck->prefixes);
+ struct net_addr_flow4 *net = (void *) px->net;
+ uint flen = net->length - sizeof(net_addr_flow4);
- if (!sk)
+ /* Encode path ID */
+ if (s->add_path)
{
- conn->packets_to_send = 0;
- return 0;
+ put_u32(pos, px->path_id);
+ ADVANCE(pos, size, 4);
}
- buf = sk->tbuf;
- pkt = buf + BGP_HEADER_LENGTH;
- if (s & (1 << PKT_SCHEDULE_CLOSE))
+ if (flen > size)
+ break;
+
+ /* Copy whole flow data including length */
+ memcpy(pos, net->data, flen);
+ ADVANCE(pos, size, flen);
+
+ bgp_free_prefix(s->channel, px);
+ }
+
+ return pos - buf;
+}
+
+static void
+bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
+{
+ while (len)
+ {
+ u32 path_id = 0;
+
+ /* Decode path ID */
+ if (s->add_path)
{
- /* We can finally close connection and enter idle state */
- bgp_conn_enter_idle_state(conn);
- return 0;
+ if (len < 4)
+ bgp_parse_error(s, 1);
+
+ path_id = get_u32(pos);
+ ADVANCE(pos, len, 4);
}
- if (s & (1 << PKT_NOTIFICATION))
+
+ if (len < 2)
+ bgp_parse_error(s, 1);
+
+ /* Decode flow length */
+ uint hlen = flow_hdr_length(pos);
+ uint dlen = flow_read_length(pos);
+ uint flen = hlen + dlen;
+ byte *data = pos + hlen;
+
+ if (len < flen)
+ bgp_parse_error(s, 1);
+
+ /* Validate flow data */
+ enum flow_validated_state r = flow4_validate(data, dlen);
+ if (r != FLOW_ST_VALID)
{
- s = 1 << PKT_SCHEDULE_CLOSE;
- type = PKT_NOTIFICATION;
- end = bgp_create_notification(conn, pkt);
+ log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
+ bgp_parse_error(s, 1);
}
- else if (s & (1 << PKT_KEEPALIVE))
+
+ if (data[0] != FLOW_TYPE_DST_PREFIX)
{
- s &= ~(1 << PKT_KEEPALIVE);
- type = PKT_KEEPALIVE;
- end = pkt; /* Keepalives carry no data */
- BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
- bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
+ log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
+ bgp_parse_error(s, 1);
}
- else if (s & (1 << PKT_OPEN))
+
+ /* Decode dst prefix */
+ ip4_addr px = IP4_NONE;
+ uint pxlen = data[1];
+
+ // FIXME: Use some generic function
+ memcpy(&px, data, BYTES(pxlen));
+ px = ip4_and(px, ip4_mkmask(pxlen));
+
+ /* Prepare the flow */
+ net_addr *n = alloca(sizeof(struct net_addr_flow4) + flen);
+ net_fill_flow4(n, px, pxlen, pos, flen);
+ ADVANCE(pos, len, flen);
+
+ bgp_rte_update(s, n, path_id, a);
+ }
+}
+
+
+static uint
+bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
+{
+ byte *pos = buf;
+
+ while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
+ {
+ struct bgp_prefix *px = HEAD(buck->prefixes);
+ struct net_addr_flow6 *net = (void *) px->net;
+ uint flen = net->length - sizeof(net_addr_flow6);
+
+ /* Encode path ID */
+ if (s->add_path)
{
- s &= ~(1 << PKT_OPEN);
- type = PKT_OPEN;
- end = bgp_create_open(conn, pkt);
+ put_u32(pos, px->path_id);
+ ADVANCE(pos, size, 4);
}
- else if (s & (1 << PKT_ROUTE_REFRESH))
+
+ if (flen > size)
+ break;
+
+ /* Copy whole flow data including length */
+ memcpy(pos, net->data, flen);
+ ADVANCE(pos, size, flen);
+
+ bgp_free_prefix(s->channel, px);
+ }
+
+ return pos - buf;
+}
+
+static void
+bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
+{
+ while (len)
+ {
+ u32 path_id = 0;
+
+ /* Decode path ID */
+ if (s->add_path)
{
- s &= ~(1 << PKT_ROUTE_REFRESH);
- type = PKT_ROUTE_REFRESH;
- end = bgp_create_route_refresh(conn, pkt);
+ if (len < 4)
+ bgp_parse_error(s, 1);
+
+ path_id = get_u32(pos);
+ ADVANCE(pos, len, 4);
}
- else if (s & (1 << PKT_BEGIN_REFRESH))
+
+ if (len < 2)
+ bgp_parse_error(s, 1);
+
+ /* Decode flow length */
+ uint hlen = flow_hdr_length(pos);
+ uint dlen = flow_read_length(pos);
+ uint flen = hlen + dlen;
+ byte *data = pos + hlen;
+
+ if (len < flen)
+ bgp_parse_error(s, 1);
+
+ /* Validate flow data */
+ enum flow_validated_state r = flow6_validate(data, dlen);
+ if (r != FLOW_ST_VALID)
{
- s &= ~(1 << PKT_BEGIN_REFRESH);
- type = PKT_ROUTE_REFRESH; /* BoRR is a subtype of RR */
- end = bgp_create_begin_refresh(conn, pkt);
+ log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
+ bgp_parse_error(s, 1);
}
- else if (s & (1 << PKT_UPDATE))
+
+ if (data[0] != FLOW_TYPE_DST_PREFIX)
{
- type = PKT_UPDATE;
- end = bgp_create_update(conn, pkt);
+ log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
+ bgp_parse_error(s, 1);
+ }
- if (!end)
- {
- /* No update to send, perhaps we need to send End-of-RIB or EoRR */
+ /* Decode dst prefix */
+ ip6_addr px = IP6_NONE;
+ uint pxlen = data[1];
- conn->packets_to_send = 0;
+ // FIXME: Use some generic function
+ memcpy(&px, data, BYTES(pxlen));
+ px = ip6_and(px, ip6_mkmask(pxlen));
- if (p->feed_state == BFS_LOADED)
- {
- type = PKT_UPDATE;
- end = bgp_create_end_mark(conn, pkt);
- }
+ /* Prepare the flow */
+ net_addr *n = alloca(sizeof(struct net_addr_flow6) + flen);
+ net_fill_flow6(n, px, pxlen, pos, flen);
+ ADVANCE(pos, len, flen);
- else if (p->feed_state == BFS_REFRESHED)
- {
- type = PKT_ROUTE_REFRESH;
- end = bgp_create_end_refresh(conn, pkt);
- }
+ bgp_rte_update(s, n, path_id, a);
+ }
+}
- else /* Really nothing to send */
- return 0;
- p->feed_state = BFS_NONE;
- }
- }
- else
- return 0;
+static const struct bgp_af_desc bgp_af_table[] = {
+ {
+ .afi = BGP_AF_IPV4,
+ .net = NET_IP4,
+ .name = "ipv4",
+ .encode_nlri = bgp_encode_nlri_ip4,
+ .decode_nlri = bgp_decode_nlri_ip4,
+ .encode_next_hop = bgp_encode_next_hop_ip,
+ .decode_next_hop = bgp_decode_next_hop_ip,
+ .update_next_hop = bgp_update_next_hop_ip,
+ },
+ {
+ .afi = BGP_AF_IPV4_MC,
+ .net = NET_IP4,
+ .name = "ipv4-mc",
+ .encode_nlri = bgp_encode_nlri_ip4,
+ .decode_nlri = bgp_decode_nlri_ip4,
+ .encode_next_hop = bgp_encode_next_hop_ip,
+ .decode_next_hop = bgp_decode_next_hop_ip,
+ .update_next_hop = bgp_update_next_hop_ip,
+ },
+ {
+ .afi = BGP_AF_IPV4_MPLS,
+ .net = NET_IP4,
+ .mpls = 1,
+ .name = "ipv4-mpls",
+ .encode_nlri = bgp_encode_nlri_ip4,
+ .decode_nlri = bgp_decode_nlri_ip4,
+ .encode_next_hop = bgp_encode_next_hop_ip,
+ .decode_next_hop = bgp_decode_next_hop_ip,
+ .update_next_hop = bgp_update_next_hop_ip,
+ },
+ {
+ .afi = BGP_AF_IPV6,
+ .net = NET_IP6,
+ .name = "ipv6",
+ .encode_nlri = bgp_encode_nlri_ip6,
+ .decode_nlri = bgp_decode_nlri_ip6,
+ .encode_next_hop = bgp_encode_next_hop_ip,
+ .decode_next_hop = bgp_decode_next_hop_ip,
+ .update_next_hop = bgp_update_next_hop_ip,
+ },
+ {
+ .afi = BGP_AF_IPV6_MC,
+ .net = NET_IP6,
+ .name = "ipv6-mc",
+ .encode_nlri = bgp_encode_nlri_ip6,
+ .decode_nlri = bgp_decode_nlri_ip6,
+ .encode_next_hop = bgp_encode_next_hop_ip,
+ .decode_next_hop = bgp_decode_next_hop_ip,
+ .update_next_hop = bgp_update_next_hop_ip,
+ },
+ {
+ .afi = BGP_AF_IPV6_MPLS,
+ .net = NET_IP6,
+ .mpls = 1,
+ .name = "ipv6-mpls",
+ .encode_nlri = bgp_encode_nlri_ip6,
+ .decode_nlri = bgp_decode_nlri_ip6,
+ .encode_next_hop = bgp_encode_next_hop_ip,
+ .decode_next_hop = bgp_decode_next_hop_ip,
+ .update_next_hop = bgp_update_next_hop_ip,
+ },
+ {
+ .afi = BGP_AF_VPN4_MPLS,
+ .net = NET_VPN4,
+ .mpls = 1,
+ .name = "vpn4-mpls",
+ .encode_nlri = bgp_encode_nlri_vpn4,
+ .decode_nlri = bgp_decode_nlri_vpn4,
+ .encode_next_hop = bgp_encode_next_hop_vpn,
+ .decode_next_hop = bgp_decode_next_hop_vpn,
+ .update_next_hop = bgp_update_next_hop_ip,
+ },
+ {
+ .afi = BGP_AF_VPN6_MPLS,
+ .net = NET_VPN6,
+ .mpls = 1,
+ .name = "vpn6-mpls",
+ .encode_nlri = bgp_encode_nlri_vpn6,
+ .decode_nlri = bgp_decode_nlri_vpn6,
+ .encode_next_hop = bgp_encode_next_hop_vpn,
+ .decode_next_hop = bgp_decode_next_hop_vpn,
+ .update_next_hop = bgp_update_next_hop_ip,
+ },
+ {
+ .afi = BGP_AF_VPN4_MC,
+ .net = NET_VPN4,
+ .name = "vpn4-mc",
+ .encode_nlri = bgp_encode_nlri_vpn4,
+ .decode_nlri = bgp_decode_nlri_vpn4,
+ .encode_next_hop = bgp_encode_next_hop_vpn,
+ .decode_next_hop = bgp_decode_next_hop_vpn,
+ .update_next_hop = bgp_update_next_hop_ip,
+ },
+ {
+ .afi = BGP_AF_VPN6_MC,
+ .net = NET_VPN6,
+ .name = "vpn6-mc",
+ .encode_nlri = bgp_encode_nlri_vpn6,
+ .decode_nlri = bgp_decode_nlri_vpn6,
+ .encode_next_hop = bgp_encode_next_hop_vpn,
+ .decode_next_hop = bgp_decode_next_hop_vpn,
+ .update_next_hop = bgp_update_next_hop_ip,
+ },
+ {
+ .afi = BGP_AF_FLOW4,
+ .net = NET_FLOW4,
+ .no_igp = 1,
+ .name = "flow4",
+ .encode_nlri = bgp_encode_nlri_flow4,
+ .decode_nlri = bgp_decode_nlri_flow4,
+ .encode_next_hop = bgp_encode_next_hop_none,
+ .decode_next_hop = bgp_decode_next_hop_none,
+ .update_next_hop = bgp_update_next_hop_none,
+ },
+ {
+ .afi = BGP_AF_FLOW6,
+ .net = NET_FLOW6,
+ .no_igp = 1,
+ .name = "flow6",
+ .encode_nlri = bgp_encode_nlri_flow6,
+ .decode_nlri = bgp_decode_nlri_flow6,
+ .encode_next_hop = bgp_encode_next_hop_none,
+ .decode_next_hop = bgp_decode_next_hop_none,
+ .update_next_hop = bgp_update_next_hop_none,
+ },
+};
- conn->packets_to_send = s;
- bgp_create_header(buf, end - buf, type);
- return sk_send(sk, end - buf);
+const struct bgp_af_desc *
+bgp_get_af_desc(u32 afi)
+{
+ uint i;
+ for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++)
+ if (bgp_af_table[i].afi == afi)
+ return &bgp_af_table[i];
+
+ return NULL;
}
-/**
- * bgp_schedule_packet - schedule a packet for transmission
- * @conn: connection
- * @type: packet type
- *
- * Schedule a packet of type @type to be sent as soon as possible.
- */
-void
-bgp_schedule_packet(struct bgp_conn *conn, int type)
+static inline uint
+bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
- DBG("BGP: Scheduling packet type %d\n", type);
- conn->packets_to_send |= 1 << type;
- if (conn->sk && conn->sk->tpos == conn->sk->tbuf && !ev_active(conn->tx_ev))
- ev_schedule(conn->tx_ev);
+ return s->channel->desc->encode_nlri(s, buck, buf, end - buf);
}
-void
-bgp_kick_tx(void *vconn)
+static inline uint
+bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf)
{
- struct bgp_conn *conn = vconn;
-
- DBG("BGP: kicking TX\n");
- while (bgp_fire_tx(conn) > 0)
- ;
+ return s->channel->desc->encode_next_hop(s, nh, buf, 255);
}
void
-bgp_tx(sock *sk)
+bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to)
{
- struct bgp_conn *conn = sk->data;
-
- DBG("BGP: TX hook\n");
- while (bgp_fire_tx(conn) > 0)
- ;
+ s->channel->desc->update_next_hop(s, a, to);
}
-/* Capatibility negotiation as per RFC 2842 */
+#define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024)
-void
-bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len)
+static byte *
+bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
- // struct bgp_proto *p = conn->bgp;
- int i, cl;
+ /*
+ * 2 B Withdrawn Routes Length (zero)
+ * --- IPv4 Withdrawn Routes NLRI (unused)
+ * 2 B Total Path Attribute Length
+ * var Path Attributes
+ * var IPv4 Network Layer Reachability Information
+ */
+
+ int lr, la;
+
+ la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH);
+ if (la < 0)
+ {
+ /* Attribute list too long */
+ bgp_withdraw_bucket(s->channel, buck);
+ return NULL;
+ }
- while (len > 0)
- {
- if (len < 2 || len < 2 + opt[1])
- goto err;
+ put_u16(buf+0, 0);
+ put_u16(buf+2, la);
- cl = opt[1];
+ lr = bgp_encode_nlri(s, buck, buf+4+la, end);
- switch (opt[0])
- {
- case 2: /* Route refresh capability, RFC 2918 */
- if (cl != 0)
- goto err;
- conn->peer_refresh_support = 1;
- break;
+ return buf+4+la+lr;
+}
- case 6: /* Extended message length capability, draft */
- if (cl != 0)
- goto err;
- conn->peer_ext_messages_support = 1;
- break;
+static byte *
+bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
+{
+ /*
+ * 2 B IPv4 Withdrawn Routes Length (zero)
+ * --- IPv4 Withdrawn Routes NLRI (unused)
+ * 2 B Total Path Attribute Length
+ * 1 B MP_REACH_NLRI hdr - Attribute Flags
+ * 1 B MP_REACH_NLRI hdr - Attribute Type Code
+ * 2 B MP_REACH_NLRI hdr - Length of Attribute Data
+ * 2 B MP_REACH_NLRI data - Address Family Identifier
+ * 1 B MP_REACH_NLRI data - Subsequent Address Family Identifier
+ * 1 B MP_REACH_NLRI data - Length of Next Hop Network Address
+ * var MP_REACH_NLRI data - Network Address of Next Hop
+ * 1 B MP_REACH_NLRI data - Reserved (zero)
+ * var MP_REACH_NLRI data - Network Layer Reachability Information
+ * var Rest of Path Attributes
+ * --- IPv4 Network Layer Reachability Information (unused)
+ */
+
+ int lh, lr, la; /* Lengths of next hop, NLRI and attributes */
+
+ /* Begin of MP_REACH_NLRI atribute */
+ buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
+ buf[5] = BA_MP_REACH_NLRI;
+ put_u16(buf+6, 0); /* Will be fixed later */
+ put_af3(buf+8, s->channel->afi);
+ byte *pos = buf+11;
+
+ /* Encode attributes to temporary buffer */
+ byte *abuf = alloca(MAX_ATTRS_LENGTH);
+ la = bgp_encode_attrs(s, buck->eattrs, abuf, abuf + MAX_ATTRS_LENGTH);
+ if (la < 0)
+ {
+ /* Attribute list too long */
+ bgp_withdraw_bucket(s->channel, buck);
+ return NULL;
+ }
- case 64: /* Graceful restart capability, RFC 4724 */
- if (cl % 4 != 2)
- goto err;
- conn->peer_gr_aware = 1;
- conn->peer_gr_able = 0;
- conn->peer_gr_time = get_u16(opt + 2) & 0x0fff;
- conn->peer_gr_flags = opt[2] & 0xf0;
- conn->peer_gr_aflags = 0;
- for (i = 2; i < cl; i += 4)
- if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */
- {
- conn->peer_gr_able = 1;
- conn->peer_gr_aflags = opt[2+i+3];
- }
- break;
+ /* Encode the next hop */
+ lh = bgp_encode_next_hop(s, s->mp_next_hop, pos+1);
+ *pos = lh;
+ pos += 1+lh;
- case 65: /* AS4 capability, RFC 4893 */
- if (cl != 4)
- goto err;
- conn->peer_as4_support = 1;
- if (conn->bgp->cf->enable_as4)
- conn->advertised_as = get_u32(opt + 2);
- break;
+ /* Reserved field */
+ *pos++ = 0;
- case 69: /* ADD-PATH capability, RFC 7911 */
- if (cl % 4)
- goto err;
- for (i = 0; i < cl; i += 4)
- if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */
- conn->peer_add_path = opt[2+i+3];
- if (conn->peer_add_path > ADD_PATH_FULL)
- goto err;
- break;
+ /* Encode the NLRI */
+ lr = bgp_encode_nlri(s, buck, pos, end - la);
+ pos += lr;
- case 70: /* Enhanced route refresh capability, RFC 7313 */
- if (cl != 0)
- goto err;
- conn->peer_enhanced_refresh_support = 1;
- break;
+ /* End of MP_REACH_NLRI atribute, update data length */
+ put_u16(buf+6, pos-buf-8);
- /* We can safely ignore all other capabilities */
- }
- len -= 2 + cl;
- opt += 2 + cl;
- }
- return;
+ /* Copy remaining attributes */
+ memcpy(pos, abuf, la);
+ pos += la;
- err:
- bgp_error(conn, 2, 0, NULL, 0);
- return;
+ /* Initial UPDATE fields */
+ put_u16(buf+0, 0);
+ put_u16(buf+2, pos-buf-4);
+
+ return pos;
}
-static int
-bgp_parse_options(struct bgp_conn *conn, byte *opt, int len)
+#undef MAX_ATTRS_LENGTH
+
+static byte *
+bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
- struct bgp_proto *p = conn->bgp;
- int ol;
+ /*
+ * 2 B Withdrawn Routes Length
+ * var IPv4 Withdrawn Routes NLRI
+ * 2 B Total Path Attribute Length (zero)
+ * --- Path Attributes (unused)
+ * --- IPv4 Network Layer Reachability Information (unused)
+ */
- while (len > 0)
- {
- if (len < 2 || len < 2 + opt[1])
- { bgp_error(conn, 2, 0, NULL, 0); return 0; }
-#ifdef LOCAL_DEBUG
- {
- int i;
- DBG("\tOption %02x:", opt[0]);
- for(i=0; i<opt[1]; i++)
- DBG(" %02x", opt[2+i]);
- DBG("\n");
- }
-#endif
+ uint len = bgp_encode_nlri(s, buck, buf+2, end);
- ol = opt[1];
- switch (opt[0])
- {
- case 2:
- if (conn->start_state == BSS_CONNECT_NOCAP)
- BGP_TRACE(D_PACKETS, "Ignoring received capabilities");
- else
- bgp_parse_capabilities(conn, opt + 2, ol);
- break;
+ put_u16(buf+0, len);
+ put_u16(buf+2+len, 0);
- default:
- /*
- * BGP specs don't tell us to send which option
- * we didn't recognize, but it's common practice
- * to do so. Also, capability negotiation with
- * Cisco routers doesn't work without that.
- */
- bgp_error(conn, 2, 4, opt, ol);
- return 0;
- }
- len -= 2 + ol;
- opt += 2 + ol;
- }
- return 0;
+ return buf+4+len;
}
-static void
-bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
+static byte *
+bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
- struct bgp_conn *other;
- struct bgp_proto *p = conn->bgp;
- unsigned hold;
- u16 base_as;
- u32 id;
+ /*
+ * 2 B Withdrawn Routes Length (zero)
+ * --- IPv4 Withdrawn Routes NLRI (unused)
+ * 2 B Total Path Attribute Length
+ * 1 B MP_UNREACH_NLRI hdr - Attribute Flags
+ * 1 B MP_UNREACH_NLRI hdr - Attribute Type Code
+ * 2 B MP_UNREACH_NLRI hdr - Length of Attribute Data
+ * 2 B MP_UNREACH_NLRI data - Address Family Identifier
+ * 1 B MP_UNREACH_NLRI data - Subsequent Address Family Identifier
+ * var MP_UNREACH_NLRI data - Network Layer Reachability Information
+ * --- IPv4 Network Layer Reachability Information (unused)
+ */
+
+ uint len = bgp_encode_nlri(s, buck, buf+11, end);
- /* Check state */
- if (conn->state != BS_OPENSENT)
- { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
+ put_u16(buf+0, 0);
+ put_u16(buf+2, 7+len);
- /* Check message contents */
- if (len < 29 || len != 29U + pkt[28])
- { bgp_error(conn, 1, 2, pkt+16, 2); return; }
- if (pkt[19] != BGP_VERSION)
- { bgp_error(conn, 2, 1, pkt+19, 1); return; } /* RFC 1771 says 16 bits, draft-09 tells to use 8 */
- conn->advertised_as = base_as = get_u16(pkt+20);
- hold = get_u16(pkt+22);
- id = get_u32(pkt+24);
- BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%08x)", conn->advertised_as, hold, id);
+ /* Begin of MP_UNREACH_NLRI atribute */
+ buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
+ buf[5] = BA_MP_UNREACH_NLRI;
+ put_u16(buf+6, 3+len);
+ put_af3(buf+8, s->channel->afi);
- if (bgp_parse_options(conn, pkt+29, pkt[28]))
- return;
+ return buf+11+len;
+}
- if (hold > 0 && hold < 3)
- { bgp_error(conn, 2, 6, pkt+22, 2); return; }
+static byte *
+bgp_create_update(struct bgp_channel *c, byte *buf)
+{
+ struct bgp_proto *p = (void *) c->c.proto;
+ struct bgp_bucket *buck;
+ byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH);
+ byte *res = NULL;
+
+again: ;
+
+ /* Initialize write state */
+ struct bgp_write_state s = {
+ .proto = p,
+ .channel = c,
+ .pool = bgp_linpool,
+ .as4_session = p->as4_session,
+ .add_path = c->add_path_tx,
+ .mpls = c->desc->mpls,
+ };
+
+ /* Try unreachable bucket */
+ if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
+ {
+ res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ?
+ bgp_create_ip_unreach(&s, buck, buf, end):
+ bgp_create_mp_unreach(&s, buck, buf, end);
- /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
- if (!id || (p->is_internal && id == p->local_id))
- { bgp_error(conn, 2, 3, pkt+24, -4); return; }
+ goto done;
+ }
- if ((conn->advertised_as != base_as) && (base_as != AS_TRANS))
- log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
+ /* Try reachable buckets */
+ if (!EMPTY_LIST(c->bucket_queue))
+ {
+ buck = HEAD(c->bucket_queue);
- if (conn->advertised_as != p->remote_as)
+ /* Cleanup empty buckets */
+ if (EMPTY_LIST(buck->prefixes))
{
- if (conn->peer_as4_support)
- {
- u32 val = htonl(conn->advertised_as);
- bgp_error(conn, 2, 2, (byte *) &val, 4);
- }
- else
- bgp_error(conn, 2, 2, pkt+20, 2);
-
- return;
+ bgp_free_bucket(c, buck);
+ goto again;
}
- /* Check the other connection */
- other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
- switch (other->state)
- {
- case BS_CONNECT:
- case BS_ACTIVE:
- /* Stop outgoing connection attempts */
- bgp_conn_enter_idle_state(other);
- break;
+ res = (c->afi == BGP_AF_IPV4) && !c->ext_next_hop ?
+ bgp_create_ip_reach(&s, buck, buf, end):
+ bgp_create_mp_reach(&s, buck, buf, end);
- case BS_IDLE:
- case BS_OPENSENT:
- case BS_CLOSE:
- break;
+ if (EMPTY_LIST(buck->prefixes))
+ bgp_free_bucket(c, buck);
+ else
+ bgp_defer_bucket(c, buck);
- case BS_OPENCONFIRM:
- /*
- * Description of collision detection rules in RFC 4271 is confusing and
- * contradictory, but it is essentially:
- *
- * 1. Router with higher ID is dominant
- * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
- * 3. When both connections are in OpenConfirm state, one initiated by
- * the dominant router is kept.
- *
- * The first line in the expression below evaluates whether the neighbor
- * is dominant, the second line whether the new connection was initiated
- * by the neighbor. If both are true (or both are false), we keep the new
- * connection, otherwise we keep the old one.
- */
- if (((p->local_id < id) || ((p->local_id == id) && (p->local_as < p->remote_as)))
- == (conn == &p->incoming_conn))
- {
- /* Should close the other connection */
- BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
- bgp_error(other, 6, 7, NULL, 0);
- break;
- }
- /* Fall thru */
- case BS_ESTABLISHED:
- /* Should close this connection */
- BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
- bgp_error(conn, 6, 7, NULL, 0);
- return;
- default:
- bug("bgp_rx_open: Unknown state");
- }
+ if (!res)
+ goto again;
- /* Update our local variables */
- conn->hold_time = MIN(hold, p->cf->hold_time);
- conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
- p->remote_id = id;
- p->as4_session = p->cf->enable_as4 && conn->peer_as4_support;
- p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX);
- p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX);
- p->gr_ready = p->cf->gr_mode && conn->peer_gr_able;
- p->ext_messages = p->cf->enable_extended_messages && conn->peer_ext_messages_support;
-
- /* Update RA mode */
- if (p->add_path_tx)
- p->p.accept_ra_types = RA_ANY;
- else if (p->cf->secondary)
- p->p.accept_ra_types = RA_ACCEPTED;
- else
- p->p.accept_ra_types = RA_OPTIMAL;
+ goto done;
+ }
- DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, p->as4_session);
+ /* No more prefixes to send */
+ return NULL;
- bgp_schedule_packet(conn, PKT_KEEPALIVE);
- bgp_start_timer(conn->hold_timer, conn->hold_time);
- bgp_conn_enter_openconfirm_state(conn);
+done:
+ BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
+ lp_flush(s.pool);
+
+ return res;
}
+static byte *
+bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf)
+{
+ /* Empty update packet */
+ put_u32(buf, 0);
-static inline void
-bgp_rx_end_mark(struct bgp_proto *p)
+ return buf+4;
+}
+
+static byte *
+bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf)
{
- BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
+ put_u16(buf+0, 0);
+ put_u16(buf+2, 6); /* length 4--9 */
- if (p->load_state == BFS_LOADING)
- p->load_state = BFS_NONE;
+ /* Empty MP_UNREACH_NLRI atribute */
+ buf[4] = BAF_OPTIONAL;
+ buf[5] = BA_MP_UNREACH_NLRI;
+ buf[6] = 3; /* Length 7--9 */
+ put_af3(buf+7, c->afi);
- if (p->p.gr_recovery)
- proto_graceful_restart_unlock(&p->p);
-
- if (p->gr_active)
- bgp_graceful_restart_done(p);
-}
-
-
-#define DECODE_PREFIX(pp, ll) do { \
- if (p->add_path_rx) \
- { \
- if (ll < 5) { err=1; goto done; } \
- path_id = get_u32(pp); \
- pp += 4; \
- ll -= 4; \
- } \
- int b = *pp++; \
- int q; \
- ll--; \
- if (b > BITS_PER_IP_ADDRESS) { err=10; goto done; } \
- q = (b+7) / 8; \
- if (ll < q) { err=1; goto done; } \
- memcpy(&prefix, pp, q); \
- pp += q; \
- ll -= q; \
- ipa_ntoh(prefix); \
- prefix = ipa_and(prefix, ipa_mkmask(b)); \
- pxlen = b; \
-} while (0)
+ return buf+10;
+}
+static byte *
+bgp_create_end_mark(struct bgp_channel *c, byte *buf)
+{
+ struct bgp_proto *p = (void *) c->c.proto;
+
+ BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");
+
+ return (c->afi == BGP_AF_IPV4) ?
+ bgp_create_ip_end_mark(c, buf):
+ bgp_create_mp_end_mark(c, buf);
+}
static inline void
-bgp_rte_update(struct bgp_proto *p, ip_addr prefix, int pxlen,
- u32 path_id, u32 *last_id, struct rte_src **src,
- rta *a0, rta **a)
+bgp_rx_end_mark(struct bgp_parse_state *s, u32 afi)
{
- if (path_id != *last_id)
- {
- *src = rt_get_source(&p->p, path_id);
- *last_id = path_id;
+ struct bgp_proto *p = s->proto;
+ struct bgp_channel *c = bgp_get_channel(p, afi);
- if (*a)
- {
- rta_free(*a);
- *a = NULL;
- }
- }
+ BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
- /* Prepare cached route attributes */
- if (!*a)
- {
- a0->src = *src;
+ if (!c)
+ DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
- /* Workaround for rta_lookup() breaking eattrs */
- ea_list *ea = a0->eattrs;
- *a = rta_lookup(a0);
- a0->eattrs = ea;
- }
+ if (c->load_state == BFS_LOADING)
+ c->load_state = BFS_NONE;
- net *n = net_get(p->p.table, prefix, pxlen);
- rte *e = rte_get_temp(rta_clone(*a));
- e->net = n;
- e->pflags = 0;
- e->u.bgp.suppressed = 0;
- rte_update2(p->p.main_ahook, n, e, *src);
+ if (p->p.gr_recovery)
+ channel_graceful_restart_unlock(&c->c);
+
+ if (c->gr_active)
+ bgp_graceful_restart_done(c);
}
static inline void
-bgp_rte_withdraw(struct bgp_proto *p, ip_addr prefix, int pxlen,
- u32 path_id, u32 *last_id, struct rte_src **src)
+bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len)
{
- if (path_id != *last_id)
- {
- *src = rt_find_source(&p->p, path_id);
- *last_id = path_id;
- }
+ struct bgp_channel *c = bgp_get_channel(s->proto, afi);
+ rta *a = NULL;
- net *n = net_find(p->p.table, prefix, pxlen);
- rte_update2( p->p.main_ahook, n, NULL, *src);
-}
+ if (!c)
+ DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
-static inline int
-bgp_set_next_hop(struct bgp_proto *p, rta *a)
-{
- struct eattr *nh = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
- ip_addr *nexthop = (ip_addr *) nh->u.ptr->data;
+ s->channel = c;
+ s->add_path = c->add_path_rx;
+ s->mpls = c->desc->mpls;
-#ifdef IPV6
- int second = (nh->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(nexthop[1]);
+ s->last_id = 0;
+ s->last_src = s->proto->p.main_source;
- /* First address should not be link-local, but may be zero in direct mode */
- if (ipa_is_link_local(*nexthop))
- *nexthop = IPA_NONE;
-#else
- int second = 0;
-#endif
-
- if (p->cf->gw_mode == GW_DIRECT)
- {
- neighbor *ng = NULL;
+ /*
+ * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not
+ * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for
+ * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by
+ * decode_next_hop hooks) by restoring a->eattrs afterwards.
+ */
- if (ipa_nonzero(*nexthop))
- ng = neigh_find(&p->p, nexthop, 0);
- else if (second) /* GW_DIRECT -> single_hop -> p->neigh != NULL */
- ng = neigh_find2(&p->p, nexthop + 1, p->neigh->iface, 0);
+ if (ea)
+ {
+ a = allocz(RTA_MAX_SIZE);
- /* Fallback */
- if (!ng)
- ng = p->neigh;
+ a->source = RTS_BGP;
+ a->scope = SCOPE_UNIVERSE;
+ a->from = s->proto->cf->remote_ip;
+ a->eattrs = ea;
- if (ng->scope == SCOPE_HOST)
- return 0;
+ c->desc->decode_next_hop(s, nh, nh_len, a);
- a->dest = RTD_ROUTER;
- a->gw = ng->addr;
- a->iface = ng->iface;
- a->hostentry = NULL;
- a->igp_metric = 0;
- }
- else /* GW_RECURSIVE */
- {
- if (ipa_zero(*nexthop))
- return 0;
+ /* Handle withdraw during next hop decoding */
+ if (s->err_withdraw)
+ a = NULL;
+ }
- rta_set_recursive_next_hop(p->p.table, a, p->igp_table, nexthop, nexthop + second);
- }
+ c->desc->decode_nlri(s, nlri, len, a);
- return 1;
+ rta_free(s->cached_rta);
+ s->cached_rta = NULL;
}
-#ifndef IPV6 /* IPv4 version */
-
static void
-bgp_do_rx_update(struct bgp_conn *conn,
- byte *withdrawn, int withdrawn_len,
- byte *nlri, int nlri_len,
- byte *attrs, int attr_len)
+bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len)
{
struct bgp_proto *p = conn->bgp;
- struct rte_src *src = p->p.main_source;
- rta *a0, *a = NULL;
- ip_addr prefix;
- int pxlen, err = 0;
- u32 path_id = 0;
- u32 last_id = 0;
+ ea_list *ea = NULL;
- /* Check for End-of-RIB marker */
- if (!withdrawn_len && !attr_len && !nlri_len)
- {
- bgp_rx_end_mark(p);
- return;
- }
+ BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
- /* Withdraw routes */
- while (withdrawn_len)
- {
- DECODE_PREFIX(withdrawn, withdrawn_len);
- DBG("Withdraw %I/%d\n", prefix, pxlen);
+ /* Workaround for some BGP implementations that skip initial KEEPALIVE */
+ if (conn->state == BS_OPENCONFIRM)
+ bgp_conn_enter_established_state(conn);
- bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src);
- }
+ if (conn->state != BS_ESTABLISHED)
+ { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
- if (!attr_len && !nlri_len) /* shortcut */
- return;
+ bgp_start_timer(conn->hold_timer, conn->hold_time);
- a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, nlri_len);
+ /* Initialize parse state */
+ struct bgp_parse_state s = {
+ .proto = p,
+ .pool = bgp_linpool,
+ .as4_session = p->as4_session,
+ };
- if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */
- return;
+ /* Parse error handler */
+ if (setjmp(s.err_jmpbuf))
+ {
+ bgp_error(conn, 3, s.err_subcode, NULL, 0);
+ goto done;
+ }
- if (a0 && nlri_len && !bgp_set_next_hop(p, a0))
- a0 = NULL;
+ /* Check minimal length */
+ if (len < 23)
+ { bgp_error(conn, 1, 2, pkt+16, 2); return; }
- last_id = 0;
- src = p->p.main_source;
+ /* Skip fixed header */
+ uint pos = 19;
- while (nlri_len)
- {
- DECODE_PREFIX(nlri, nlri_len);
- DBG("Add %I/%d\n", prefix, pxlen);
+ /*
+ * UPDATE message format
+ *
+ * 2 B IPv4 Withdrawn Routes Length
+ * var IPv4 Withdrawn Routes NLRI
+ * 2 B Total Path Attribute Length
+ * var Path Attributes
+ * var IPv4 Reachable Routes NLRI
+ */
- if (a0)
- bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a);
- else /* Forced withdraw as a result of soft error */
- bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src);
- }
+ s.ip_unreach_len = get_u16(pkt + pos);
+ s.ip_unreach_nlri = pkt + pos + 2;
+ pos += 2 + s.ip_unreach_len;
- done:
- if (a)
- rta_free(a);
+ if (pos + 2 > len)
+ bgp_parse_error(&s, 1);
- if (err)
- bgp_error(conn, 3, err, NULL, 0);
+ s.attr_len = get_u16(pkt + pos);
+ s.attrs = pkt + pos + 2;
+ pos += 2 + s.attr_len;
+
+ if (pos > len)
+ bgp_parse_error(&s, 1);
+ s.ip_reach_len = len - pos;
+ s.ip_reach_nlri = pkt + pos;
+
+
+ if (s.attr_len)
+ ea = bgp_decode_attrs(&s, s.attrs, s.attr_len);
+
+ /* Check for End-of-RIB marker */
+ if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len)
+ { bgp_rx_end_mark(&s, BGP_AF_IPV4); goto done; }
+
+ /* Check for MP End-of-RIB marker */
+ if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len &&
+ !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af)
+ { bgp_rx_end_mark(&s, s.mp_unreach_af); goto done; }
+
+ if (s.ip_unreach_len)
+ bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0);
+
+ if (s.mp_unreach_len)
+ bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0);
+
+ if (s.ip_reach_len)
+ bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len,
+ ea, s.ip_next_hop_data, s.ip_next_hop_len);
+
+ if (s.mp_reach_len)
+ bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len,
+ ea, s.mp_next_hop_data, s.mp_next_hop_len);
+
+done:
+ rta_free(s.cached_rta);
+ lp_flush(s.pool);
return;
}
-#else /* IPv6 version */
-#define DO_NLRI(name) \
- x = p->name##_start; \
- len = len0 = p->name##_len; \
- if (len) \
- { \
- if (len < 3) { err=9; goto done; } \
- af = get_u16(x); \
- x += 3; \
- len -= 3; \
- DBG("\tNLRI AF=%d sub=%d len=%d\n", af, x[-1], len);\
- } \
- else \
- af = 0; \
- if (af == BGP_AF_IPV6)
+/*
+ * ROUTE-REFRESH
+ */
-static void
-bgp_attach_next_hop(rta *a0, byte *x)
+static inline byte *
+bgp_create_route_refresh(struct bgp_channel *c, byte *buf)
{
- ip_addr *nh = (ip_addr *) bgp_attach_attr_wa(&a0->eattrs, bgp_linpool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
- memcpy(nh, x+1, 16);
- ipa_ntoh(nh[0]);
+ struct bgp_proto *p = (void *) c->c.proto;
- /* We store received link local address in the other part of BA_NEXT_HOP eattr. */
- if (*x == 32)
- {
- memcpy(nh+1, x+17, 16);
- ipa_ntoh(nh[1]);
- }
- else
- nh[1] = IPA_NONE;
+ BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");
+
+ /* Original route refresh request, RFC 2918 */
+ put_af4(buf, c->afi);
+ buf[2] = BGP_RR_REQUEST;
+
+ return buf+4;
+}
+
+static inline byte *
+bgp_create_begin_refresh(struct bgp_channel *c, byte *buf)
+{
+ struct bgp_proto *p = (void *) c->c.proto;
+
+ BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");
+
+ /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */
+ put_af4(buf, c->afi);
+ buf[2] = BGP_RR_BEGIN;
+
+ return buf+4;
}
+static inline byte *
+bgp_create_end_refresh(struct bgp_channel *c, byte *buf)
+{
+ struct bgp_proto *p = (void *) c->c.proto;
+
+ BGP_TRACE(D_PACKETS, "Sending END-OF-RR");
+
+ /* Demarcation of ending of route refresh (EoRR), RFC 7313 */
+ put_af4(buf, c->afi);
+ buf[2] = BGP_RR_END;
+
+ return buf+4;
+}
static void
-bgp_do_rx_update(struct bgp_conn *conn,
- byte *withdrawn UNUSED, int withdrawn_len,
- byte *nlri UNUSED, int nlri_len,
- byte *attrs, int attr_len)
+bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
{
struct bgp_proto *p = conn->bgp;
- struct rte_src *src = p->p.main_source;
- byte *x;
- int len, len0;
- unsigned af;
- rta *a0, *a = NULL;
- ip_addr prefix;
- int pxlen, err = 0;
- u32 path_id = 0;
- u32 last_id = 0;
-
- p->mp_reach_len = 0;
- p->mp_unreach_len = 0;
- a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, 0);
-
- if (conn->state != BS_ESTABLISHED) /* fatal error during decoding */
- return;
- /* Check for End-of-RIB marker */
- if ((attr_len < 8) && !withdrawn_len && !nlri_len && !p->mp_reach_len &&
- (p->mp_unreach_len == 3) && (get_u16(p->mp_unreach_start) == BGP_AF_IPV6))
- {
- bgp_rx_end_mark(p);
- return;
- }
+ if (conn->state != BS_ESTABLISHED)
+ { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
- DO_NLRI(mp_unreach)
- {
- while (len)
- {
- DECODE_PREFIX(x, len);
- DBG("Withdraw %I/%d\n", prefix, pxlen);
- bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src);
- }
- }
+ if (!conn->local_caps->route_refresh)
+ { bgp_error(conn, 1, 3, pkt+18, 1); return; }
- DO_NLRI(mp_reach)
- {
- /* Create fake NEXT_HOP attribute */
- if (len < 1 || (*x != 16 && *x != 32) || len < *x + 2)
- { err = 9; goto done; }
+ if (len < (BGP_HEADER_LENGTH + 4))
+ { bgp_error(conn, 1, 2, pkt+16, 2); return; }
- if (a0)
- bgp_attach_next_hop(a0, x);
+ if (len > (BGP_HEADER_LENGTH + 4))
+ { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
- /* Also ignore one reserved byte */
- len -= *x + 2;
- x += *x + 2;
+ struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19));
+ if (!c)
+ {
+ log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring",
+ p->p.name, pkt[21], get_u16(pkt+19), pkt[22]);
+ return;
+ }
- if (a0 && ! bgp_set_next_hop(p, a0))
- a0 = NULL;
+ /* RFC 7313 redefined reserved field as RR message subtype */
+ uint subtype = p->enhanced_refresh ? pkt[21] : BGP_RR_REQUEST;
- last_id = 0;
- src = p->p.main_source;
+ switch (subtype)
+ {
+ case BGP_RR_REQUEST:
+ BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
+ channel_request_feeding(&c->c);
+ break;
- while (len)
- {
- DECODE_PREFIX(x, len);
- DBG("Add %I/%d\n", prefix, pxlen);
+ case BGP_RR_BEGIN:
+ BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
+ bgp_refresh_begin(c);
+ break;
- if (a0)
- bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a);
- else /* Forced withdraw as a result of soft error */
- bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src);
- }
- }
+ case BGP_RR_END:
+ BGP_TRACE(D_PACKETS, "Got END-OF-RR");
+ bgp_refresh_end(c);
+ break;
- done:
- if (a)
- rta_free(a);
+ default:
+ log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
+ p->p.name, subtype);
+ break;
+ }
+}
+
+static inline struct bgp_channel *
+bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn)
+{
+ uint i = conn->last_channel;
- if (err) /* Use subcode 9, not err */
- bgp_error(conn, 3, 9, NULL, 0);
+ /* Try the last channel, but at most several times */
+ if ((conn->channels_to_send & (1 << i)) &&
+ (conn->last_channel_count < 16))
+ goto found;
- return;
+ /* Find channel with non-zero channels_to_send */
+ do
+ {
+ i++;
+ if (i >= p->channel_count)
+ i = 0;
+ }
+ while (! (conn->channels_to_send & (1 << i)));
+
+ /* Use that channel */
+ conn->last_channel = i;
+ conn->last_channel_count = 0;
+
+found:
+ conn->last_channel_count++;
+ return p->channel_map[i];
}
-#endif
+static inline int
+bgp_send(struct bgp_conn *conn, uint type, uint len)
+{
+ sock *sk = conn->sk;
+ byte *buf = sk->tbuf;
-static void
-bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len)
+ memset(buf, 0xff, 16); /* Marker */
+ put_u16(buf+16, len);
+ buf[18] = type;
+
+ return sk_send(sk, len);
+}
+
+/**
+ * bgp_fire_tx - transmit packets
+ * @conn: connection
+ *
+ * Whenever the transmit buffers of the underlying TCP connection
+ * are free and we have any packets queued for sending, the socket functions
+ * call bgp_fire_tx() which takes care of selecting the highest priority packet
+ * queued (Notification > Keepalive > Open > Update), assembling its header
+ * and body and sending it to the connection.
+ */
+static int
+bgp_fire_tx(struct bgp_conn *conn)
{
struct bgp_proto *p = conn->bgp;
- byte *withdrawn, *attrs, *nlri;
- uint withdrawn_len, attr_len, nlri_len;
+ struct bgp_channel *c;
+ byte *buf, *pkt, *end;
+ uint s;
- BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
+ if (!conn->sk)
+ return 0;
- /* Workaround for some BGP implementations that skip initial KEEPALIVE */
- if (conn->state == BS_OPENCONFIRM)
- bgp_conn_enter_established_state(conn);
+ buf = conn->sk->tbuf;
+ pkt = buf + BGP_HEADER_LENGTH;
+ s = conn->packets_to_send;
- if (conn->state != BS_ESTABLISHED)
- { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
- bgp_start_timer(conn->hold_timer, conn->hold_time);
+ if (s & (1 << PKT_SCHEDULE_CLOSE))
+ {
+ /* We can finally close connection and enter idle state */
+ bgp_conn_enter_idle_state(conn);
+ return 0;
+ }
+ if (s & (1 << PKT_NOTIFICATION))
+ {
+ conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE;
+ end = bgp_create_notification(conn, pkt);
+ return bgp_send(conn, PKT_NOTIFICATION, end - buf);
+ }
+ else if (s & (1 << PKT_KEEPALIVE))
+ {
+ conn->packets_to_send &= ~(1 << PKT_KEEPALIVE);
+ BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
+ bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
+ return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH);
+ }
+ else if (s & (1 << PKT_OPEN))
+ {
+ conn->packets_to_send &= ~(1 << PKT_OPEN);
+ end = bgp_create_open(conn, pkt);
+ return bgp_send(conn, PKT_OPEN, end - buf);
+ }
+ else while (conn->channels_to_send)
+ {
+ c = bgp_get_channel_to_send(p, conn);
+ s = c->packets_to_send;
- /* Find parts of the packet and check sizes */
- if (len < 23)
+ if (s & (1 << PKT_ROUTE_REFRESH))
{
- bgp_error(conn, 1, 2, pkt+16, 2);
- return;
+ c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH);
+ end = bgp_create_route_refresh(c, pkt);
+ return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
}
- withdrawn = pkt + 21;
- withdrawn_len = get_u16(pkt + 19);
- if (withdrawn_len + 23 > len)
- goto malformed;
- attrs = withdrawn + withdrawn_len + 2;
- attr_len = get_u16(attrs - 2);
- if (withdrawn_len + attr_len + 23 > len)
- goto malformed;
- nlri = attrs + attr_len;
- nlri_len = len - withdrawn_len - attr_len - 23;
- if (!attr_len && nlri_len)
- goto malformed;
- DBG("Sizes: withdrawn=%d, attrs=%d, NLRI=%d\n", withdrawn_len, attr_len, nlri_len);
-
- lp_flush(bgp_linpool);
-
- bgp_do_rx_update(conn, withdrawn, withdrawn_len, nlri, nlri_len, attrs, attr_len);
- return;
+ else if (s & (1 << PKT_BEGIN_REFRESH))
+ {
+ /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */
+ c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH);
+ end = bgp_create_begin_refresh(c, pkt);
+ return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
+ }
+ else if (s & (1 << PKT_UPDATE))
+ {
+ end = bgp_create_update(c, pkt);
+ if (end)
+ return bgp_send(conn, PKT_UPDATE, end - buf);
+
+ /* No update to send, perhaps we need to send End-of-RIB or EoRR */
+ c->packets_to_send = 0;
+ conn->channels_to_send &= ~(1 << c->index);
+
+ if (c->feed_state == BFS_LOADED)
+ {
+ c->feed_state = BFS_NONE;
+ end = bgp_create_end_mark(c, pkt);
+ return bgp_send(conn, PKT_UPDATE, end - buf);
+ }
+
+ else if (c->feed_state == BFS_REFRESHED)
+ {
+ c->feed_state = BFS_NONE;
+ end = bgp_create_end_refresh(c, pkt);
+ return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
+ }
+ }
+ else if (s)
+ bug("Channel packets_to_send: %x", s);
+
+ c->packets_to_send = 0;
+ conn->channels_to_send &= ~(1 << c->index);
+ }
+
+ return 0;
+}
+
+/**
+ * bgp_schedule_packet - schedule a packet for transmission
+ * @conn: connection
+ * @c: channel
+ * @type: packet type
+ *
+ * Schedule a packet of type @type to be sent as soon as possible.
+ */
+void
+bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type)
+{
+ ASSERT(conn->sk);
+
+ DBG("BGP: Scheduling packet type %d\n", type);
+
+ if (c)
+ {
+ if (! conn->channels_to_send)
+ {
+ conn->last_channel = c->index;
+ conn->last_channel_count = 0;
+ }
+
+ c->packets_to_send |= 1 << type;
+ conn->channels_to_send |= 1 << c->index;
+ }
+ else
+ conn->packets_to_send |= 1 << type;
+
+ if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev))
+ ev_schedule(conn->tx_ev);
+}
+
+void
+bgp_kick_tx(void *vconn)
+{
+ struct bgp_conn *conn = vconn;
-malformed:
- bgp_error(conn, 3, 1, NULL, 0);
+ DBG("BGP: kicking TX\n");
+ while (bgp_fire_tx(conn) > 0)
+ ;
+}
+
+void
+bgp_tx(sock *sk)
+{
+ struct bgp_conn *conn = sk->data;
+
+ DBG("BGP: TX hook\n");
+ while (bgp_fire_tx(conn) > 0)
+ ;
}
+
static struct {
byte major, minor;
byte *msg;
@@ -1480,26 +2665,25 @@ static struct {
* which might be static string or given temporary buffer.
*/
const char *
-bgp_error_dsc(unsigned code, unsigned subcode)
+bgp_error_dsc(uint code, uint subcode)
{
static char buff[32];
- unsigned i;
+ uint i;
+
for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
- {
- return bgp_msg_table[i].msg;
- }
+ return bgp_msg_table[i].msg;
- bsprintf(buff, "Unknown error %d.%d", code, subcode);
+ bsprintf(buff, "Unknown error %u.%u", code, subcode);
return buff;
}
void
-bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len)
+bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len)
{
const byte *name;
byte *t, argbuf[36];
- unsigned i;
+ uint i;
/* Don't report Cease messages generated by myself */
if (code == 6 && class == BE_BGP_TX)
@@ -1515,7 +2699,7 @@ bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned
if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
{
/* Bad peer AS - we would like to print the AS */
- t += bsprintf(t, "%d", (len == 2) ? get_u16(data) : get_u32(data));
+ t += bsprintf(t, "%u", (len == 2) ? get_u16(data) : get_u32(data));
goto done;
}
if (len > 16)
@@ -1532,47 +2716,25 @@ static void
bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len)
{
struct bgp_proto *p = conn->bgp;
+
if (len < 21)
- {
- bgp_error(conn, 1, 2, pkt+16, 2);
- return;
- }
+ { bgp_error(conn, 1, 2, pkt+16, 2); return; }
- unsigned code = pkt[19];
- unsigned subcode = pkt[20];
+ uint code = pkt[19];
+ uint subcode = pkt[20];
int err = (code != 6);
bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
-#ifndef IPV6
- if ((code == 2) && ((subcode == 4) || (subcode == 7))
- /* Error related to capability:
- * 4 - Peer does not support capabilities at all.
- * 7 - Peer request some capability. Strange unless it is IPv6 only peer.
- */
- && (p->cf->capabilities == 2)
- /* Capabilities are not explicitly enabled or disabled, therefore heuristic is used */
- && (conn->start_state == BSS_CONNECT)
- /* Failed connection attempt have used capabilities */
- && (p->cf->remote_as <= 0xFFFF))
- /* Not possible with disabled capabilities */
- {
- /* We try connect without capabilities */
- log(L_WARN "%s: Capability related error received, retry with capabilities disabled", p->p.name);
- p->start_state = BSS_CONNECT_NOCAP;
- err = 0;
- }
-#endif
-
bgp_conn_enter_close_state(conn);
- bgp_schedule_packet(conn, PKT_SCHEDULE_CLOSE);
+ bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
- if (err)
- {
- bgp_update_startup_delay(p);
- bgp_stop(p, 0);
- }
+ if (err)
+ {
+ bgp_update_startup_delay(p);
+ bgp_stop(p, 0);
+ }
}
static void
@@ -1582,64 +2744,12 @@ bgp_rx_keepalive(struct bgp_conn *conn)
BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
bgp_start_timer(conn->hold_timer, conn->hold_time);
- switch (conn->state)
- {
- case BS_OPENCONFIRM:
- bgp_conn_enter_established_state(conn);
- break;
- case BS_ESTABLISHED:
- break;
- default:
- bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
- }
-}
-static void
-bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
-{
- struct bgp_proto *p = conn->bgp;
+ if (conn->state == BS_OPENCONFIRM)
+ { bgp_conn_enter_established_state(conn); return; }
if (conn->state != BS_ESTABLISHED)
- { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
-
- if (!p->cf->enable_refresh)
- { bgp_error(conn, 1, 3, pkt+18, 1); return; }
-
- if (len < (BGP_HEADER_LENGTH + 4))
- { bgp_error(conn, 1, 2, pkt+16, 2); return; }
-
- if (len > (BGP_HEADER_LENGTH + 4))
- { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
-
- /* FIXME - we ignore AFI/SAFI values, as we support
- just one value and even an error code for an invalid
- request is not defined */
-
- /* RFC 7313 redefined reserved field as RR message subtype */
- uint subtype = conn->peer_enhanced_refresh_support ? pkt[21] : BGP_RR_REQUEST;
-
- switch (subtype)
- {
- case BGP_RR_REQUEST:
- BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
- proto_request_feeding(&p->p);
- break;
-
- case BGP_RR_BEGIN:
- BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
- bgp_refresh_begin(p);
- break;
-
- case BGP_RR_END:
- BGP_TRACE(D_PACKETS, "Got END-OF-RR");
- bgp_refresh_end(p);
- break;
-
- default:
- log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
- p->p.name, subtype);
- break;
- }
+ bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
}
@@ -1653,7 +2763,7 @@ bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
* packet handler according to the packet type.
*/
static void
-bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len)
+bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len)
{
byte type = pkt[18];
@@ -1663,14 +2773,14 @@ bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len)
mrt_dump_bgp_packet(conn, pkt, len);
switch (type)
- {
- case PKT_OPEN: return bgp_rx_open(conn, pkt, len);
- case PKT_UPDATE: return bgp_rx_update(conn, pkt, len);
- case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len);
- case PKT_KEEPALIVE: return bgp_rx_keepalive(conn);
- case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len);
- default: bgp_error(conn, 1, 3, pkt+18, 1);
- }
+ {
+ case PKT_OPEN: return bgp_rx_open(conn, pkt, len);
+ case PKT_UPDATE: return bgp_rx_update(conn, pkt, len);
+ case PKT_NOTIFICATION: return bgp_rx_notification(conn, pkt, len);
+ case PKT_KEEPALIVE: return bgp_rx_keepalive(conn);
+ case PKT_ROUTE_REFRESH: return bgp_rx_route_refresh(conn, pkt, len);
+ default: bgp_error(conn, 1, 3, pkt+18, 1);
+ }
}
/**
@@ -1687,10 +2797,9 @@ int
bgp_rx(sock *sk, uint size)
{
struct bgp_conn *conn = sk->data;
- struct bgp_proto *p = conn->bgp;
byte *pkt_start = sk->rbuf;
byte *end = pkt_start + size;
- unsigned i, len;
+ uint i, len;
DBG("BGP: RX hook: Got %d bytes\n", size);
while (end >= pkt_start + BGP_HEADER_LENGTH)
@@ -1704,7 +2813,7 @@ bgp_rx(sock *sk, uint size)
break;
}
len = get_u16(pkt_start+16);
- if (len < BGP_HEADER_LENGTH || len > bgp_max_packet_length(p))
+ if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn)))
{
bgp_error(conn, 1, 2, pkt_start+16, 2);
break;