/*
 *	BIRD -- BGP Attributes
 *
 *	(c) 2000 Martin Mares <mj@ucw.cz>
 *	(c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
 *	(c) 2008--2016 CZ.NIC z.s.p.o.
 *
 *	Can be freely distributed and used under the terms of the GNU GPL.
 */

#undef LOCAL_DEBUG

#include <stdlib.h>

#include "nest/bird.h"
#include "nest/iface.h"
#include "nest/protocol.h"
#include "nest/rt.h"
#include "lib/attrs.h"
#include "conf/conf.h"
#include "lib/resource.h"
#include "lib/string.h"
#include "lib/unaligned.h"
#include "lib/macro.h"

#include "bgp.h"

/*
 *   UPDATE message error handling
 *
 * All checks from RFC 4271 6.3 are done as specified with these exceptions:
 *  - The semantic check of an IP address from NEXT_HOP attribute is missing.
 *  - Checks of some optional attribute values are missing.
 *  - Syntactic and semantic checks of NLRIs (done in DECODE_PREFIX())
 *    are probably inadequate.
 *
 * Loop detection based on AS_PATH causes updates to be withdrawn. RFC
 * 4271 does not explicitly specify the behavior in that case.
 *
 * Loop detection related to route reflection (based on ORIGINATOR_ID
 * and CLUSTER_LIST) causes updates to be withdrawn. RFC 4456 8
 * specifies that such updates should be ignored, but that is generally
 * a bad idea.
 *
 * BGP attribute table has several hooks:
 *
 * export - Hook that validates and normalizes attribute during export phase.
 * Receives eattr, may modify it (e.g., sort community lists for canonical
 * representation), UNSET() it (e.g., skip empty lists), or REJECT() the route
 * if necessary. May assume that eattr has value valid w.r.t. its type, but may
 * be invalid w.r.t. BGP constraints. Optional.
 *
 * encode - Hook that converts internal representation to external one during
 * packet writing. Receives eattr and puts it in the buffer (including attribute
 * header). Returns number of bytes, or -1 if not enough space. May assume that
 * eattr has value valid w.r.t. its type and validated by export hook. Mandatory
 * for all known attributes that exist internally after export phase (i.e., all
 * except pseudoattributes MP_(UN)REACH_NLRI).
 *
 * decode - Hook that converts external representation to internal one during
 * packet parsing. Receives attribute data in buffer, validates it and adds
 * attribute to ea_list. If data are invalid, steps DISCARD(), WITHDRAW() or
 * bgp_parse_error() may be used to escape. Mandatory for all known attributes.
 *
 * format - Optional hook that converts eattr to textual representation.
 */

/*
 * Descriptor of one BGP attribute. It can be viewed either as a plain
 * struct ea_class, or through the anonymous struct, which starts with
 * EA_CLASS_INSIDE and appends the BGP-specific flags and hooks. The
 * initializers of bgp_attr_table set members such as .name, .type and
 * .format through that second view.
 */
union bgp_attr_desc {
  struct ea_class class;
  struct {
    EA_CLASS_INSIDE;
    /* BAF_* flags of the attribute; bgp_export_attr() applies them on export */
    uint flags;
    /* Optional export-phase hook (validate / normalize / unset / reject) */
    void (*export)(struct bgp_export_state *s, eattr *a);
    /* Writes the attribute into buf; returns bytes written, or -1 if it does not fit */
    int  (*encode)(struct bgp_write_state *s, eattr *a, byte *buf, uint size);
    /* Parses received attribute data and stores the result */
    void (*decode)(struct bgp_parse_state *s, uint code, uint flags, byte *data, uint len, ea_list **to);
  };
};

static union bgp_attr_desc bgp_attr_table[];

/*
 * Look up the BGP descriptor of an extended attribute. The attribute class is
 * resolved from a->id; the result is NULL when that class does not point into
 * bgp_attr_table.
 */
static inline const union bgp_attr_desc *bgp_find_attr_desc(eattr *a)
{
  const struct ea_class *found = ea_class_find(a->id);
  const struct ea_class *first = &bgp_attr_table[0].class;
  const struct ea_class *limit = &bgp_attr_table[BGP_ATTR_MAX].class;

  int inside_table = (found >= first) && (found < limit);

  return inside_table ? (const union bgp_attr_desc *) found : NULL;
}

/* Attribute id stored in the bgp_attr_table entry of the given BGP attribute code */
#define BGP_EA_ID(code)	(bgp_attr_table[code].id)
/*
 * Reverse direction: index in bgp_attr_table of the class found for an
 * attribute id (the argument is an id as used by ea_class_find(), despite its
 * name). Only meaningful for classes that live in bgp_attr_table.
 */
#define EA_BGP_ID(code)	(((union bgp_attr_desc *) ea_class_find(code)) - bgp_attr_table)

/*
 * Set BGP attribute @code in the list @to to the embedded value @val.
 * The BAF_EXT_LEN bit is cleared from @flags before the attribute is stored.
 */
void bgp_set_attr_u32(ea_list **to, uint code, uint flags, u32 val)
{
  const union bgp_attr_desc *desc = bgp_attr_table + code;
  uint stored_flags = flags & ~BAF_EXT_LEN;

  ea_set_attr(to, EA_LITERAL_EMBEDDED(&desc->class, stored_flags, val));
}

/*
 * Set BGP attribute @code in the list @to to the already prepared adata @ad.
 * The BAF_EXT_LEN bit is cleared from @flags before the attribute is stored.
 */
void bgp_set_attr_ptr(ea_list **to, uint code, uint flags, const struct adata *ad)
{
  const union bgp_attr_desc *desc = bgp_attr_table + code;
  uint stored_flags = flags & ~BAF_EXT_LEN;

  ea_set_attr(to, EA_LITERAL_DIRECT_ADATA(&desc->class, stored_flags, ad));
}

/*
 * Set BGP attribute @code in the list @to from a raw buffer of @len bytes,
 * using EA_LITERAL_STORE_ADATA to build the value. The BAF_EXT_LEN bit is
 * cleared from @flags before the attribute is stored.
 */
void
bgp_set_attr_data(ea_list **to, uint code, uint flags, void *data, uint len)
{
  const union bgp_attr_desc *desc = bgp_attr_table + code;
  uint stored_flags = flags & ~BAF_EXT_LEN;

  ea_set_attr(to, EA_LITERAL_STORE_ADATA(&desc->class, stored_flags, data, len));
}

/* Unset BGP attribute @code in the list @to */
void
bgp_unset_attr(ea_list **to, uint code)
{
  const union bgp_attr_desc *desc = bgp_attr_table + code;

  ea_unset_attr(to, 0, &desc->class);
}

/*
 * Escape helpers for attribute hooks. All of them expect a variable named
 * 's' (the parse or export state) in scope, and all except REPORT() return
 * from the enclosing function, so they may only be used in void functions.
 */

/* Log a message about the remote side, prefixed with the protocol name */
#define REPORT(msg, args...) \
  ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); })

/* Log and leave the hook without storing the attribute */
#define DISCARD(msg, args...) \
  ({ REPORT(msg, ## args); return; })

/* Log, mark the update to be handled as withdraw (s->err_withdraw) and leave the hook */
#define WITHDRAW(msg, args...) \
  ({ REPORT(msg, ## args); s->err_withdraw = 1; return; })

/* Mark the attribute 'a' as undefined and leave the hook */
#define UNSET(a) \
  ({ a->undef = 1; return; })

/* Log an error, mark the route as rejected (s->err_reject) and leave the hook */
#define REJECT(msg, args...) \
  ({ log(L_ERR "%s: " msg, s->proto->p.name, ## args); s->err_reject = 1; return; })

/* Message templates for the helpers above; %s is the attribute name */
#define NEW_BGP		"Discarding %s attribute received from AS4-aware neighbor"
#define BAD_EBGP	"Discarding %s attribute received from EBGP neighbor"
#define BAD_LENGTH	"Malformed %s attribute - invalid length (%u)"
#define BAD_VALUE	"Malformed %s attribute - invalid value (%u)"
#define NO_MANDATORY	"Missing mandatory %s attribute"


/*
 * Write a 3-byte attribute header (flags, code, one-byte length) to @buf.
 * The BAF_EXT_LEN flag is always cleared. Returns the header size.
 */
static inline int
bgp_put_attr_hdr3(byte *buf, uint code, uint flags, uint len)
{
  buf[0] = flags & ~BAF_EXT_LEN;
  buf[1] = code;
  buf[2] = len;

  return 3;
}

/*
 * Write a 4-byte attribute header (flags, code, two-byte length) to @buf.
 * The BAF_EXT_LEN flag is always set. Returns the header size.
 */
static inline int
bgp_put_attr_hdr4(byte *buf, uint code, uint flags, uint len)
{
  buf[0] = flags | BAF_EXT_LEN;
  buf[1] = code;
  put_u16(buf + 2, len);

  return 4;
}

/*
 * Write an attribute header to @buf, picking the short form when the length
 * fits in one byte and the extended form otherwise. Returns the header size.
 */
static inline int
bgp_put_attr_hdr(byte *buf, uint code, uint flags, uint len)
{
  return (len < 256)
    ? bgp_put_attr_hdr3(buf, code, flags, len)
    : bgp_put_attr_hdr4(buf, code, flags, len);
}

/*
 * Encode hook for attributes carried as a single byte: short header plus the
 * low byte of a->u.data. Returns 4, or -1 when @size is too small.
 */
static int
bgp_encode_u8(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size)
{
  enum { HDR_LEN = 3, VAL_LEN = 1 };

  if (size < (HDR_LEN + VAL_LEN))
    return -1;

  bgp_put_attr_hdr3(buf, EA_BGP_ID(a->id), a->flags, VAL_LEN);
  buf[HDR_LEN] = a->u.data;

  return HDR_LEN + VAL_LEN;
}

/*
 * Encode hook for attributes carried as one 32-bit value: short header plus
 * a->u.data written by put_u32(). Returns 7, or -1 when @size is too small.
 */
static int
bgp_encode_u32(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size)
{
  enum { HDR_LEN = 3, VAL_LEN = 4 };

  if (size < (HDR_LEN + VAL_LEN))
    return -1;

  bgp_put_attr_hdr3(buf, EA_BGP_ID(a->id), a->flags, VAL_LEN);
  put_u32(buf + HDR_LEN, a->u.data);

  return HDR_LEN + VAL_LEN;
}

/*
 * Encode hook for attributes stored as an array of 32-bit values. The space
 * check reserves 4 bytes for the header even if the short form ends up being
 * used. Returns the number of bytes written, or -1 when @size is too small.
 */
static int
bgp_encode_u32s(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size)
{
  uint len = a->u.ptr->length;
  uint needed = 4 + len;

  if (size < needed)
    return -1;

  uint hdr = bgp_put_attr_hdr(buf, EA_BGP_ID(a->id), a->flags, len);
  byte *payload = buf + hdr;

  put_u32s(payload, (u32 *) a->u.ptr->data, len / 4);

  return hdr + len;
}

/*
 * Write a complete attribute (header followed by @len bytes of @data) to
 * @buf. The space check reserves 4 bytes for the header regardless of which
 * header form is used. Returns bytes written, or -1 when @size is too small.
 */
static int
bgp_put_attr(byte *buf, uint size, uint code, uint flags, const byte *data, uint len)
{
  uint needed = 4 + len;

  if (size < needed)
    return -1;

  uint hdr = bgp_put_attr_hdr(buf, code, flags, len);
  byte *payload = buf + hdr;

  memcpy(payload, data, len);

  return hdr + len;
}

/* Encode hook that copies the stored attribute data to the wire unchanged */
static int
bgp_encode_raw(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size)
{
  uint code = EA_BGP_ID(a->id);

  return bgp_put_attr(buf, size, code, a->flags, a->u.ptr->data, a->u.ptr->length);
}


/*
 *	AIGP handling
 */

/*
 * Check the TLV framing of AIGP attribute data.
 *
 * Every TLV needs a 3-byte header, a length of at least 3 that does not run
 * past the remaining data, and a BGP_AIGP_METRIC TLV must be exactly 11 bytes
 * long. Returns 1 when all TLVs pass. Otherwise returns 0 and, if @err is
 * non-NULL, formats a description (reason, offending value, offset) into it;
 * @err is set to an empty string when bsnprintf() reports failure.
 */
static int
bgp_aigp_valid(byte *data, uint len, char *err, uint elen)
{
  byte *pos = data;
  const char *reason = NULL;
  uint bad_val = 0;

  while (len)
  {
    if (len < 3)
    {
      reason = "TLV framing error";
      bad_val = len;
      break;
    }

    /* Process one TLV */
    uint ptype = pos[0];
    uint plen = get_u16(pos + 1);

    if (len < plen)
    {
      reason = "TLV framing error";
      bad_val = plen;
      break;
    }

    if (plen < 3)
    {
      reason = "Bad TLV length";
      bad_val = plen;
      break;
    }

    if ((ptype == BGP_AIGP_METRIC) && (plen != 11))
    {
      reason = "Bad AIGP TLV length";
      bad_val = plen;
      break;
    }

    ADVANCE(pos, len, plen);
  }

  if (!reason)
    return 1;

  if (err && (bsnprintf(err, elen, "%s (%u) at %d", reason, bad_val, (int) (pos - data)) < 0))
    err[0] = 0;

  return 0;
}

/*
 * Find the first TLV of the given type in AIGP attribute data. Returns a
 * pointer to the TLV header, or NULL when @ad is NULL or no such TLV exists.
 * No framing checks are done here; the data is walked as-is.
 */
static const byte *
bgp_aigp_get_tlv(const struct adata *ad, uint type)
{
  if (!ad)
    return NULL;

  const byte *pos = ad->data;
  uint left = ad->length;

  while (left)
  {
    if (pos[0] == type)
      return pos;

    uint plen = get_u16(pos + 1);
    ADVANCE(pos, left, plen);
  }

  return NULL;
}

/*
 * Build new AIGP attribute data with one TLV set to the given payload.
 *
 * The result is allocated from @pool with room for the old data plus one new
 * TLV (3-byte header + @dlen). Existing TLVs are copied in order; the first
 * TLV whose type equals @type is dropped, and the new TLV is placed before the
 * first existing TLV with type >= @type, or at the end if there is none.
 * @ad may be NULL, in which case the result holds only the new TLV.
 * The input is walked without framing checks.
 */
static const struct adata *
bgp_aigp_set_tlv(struct linpool *pool, const struct adata *ad, uint type, byte *data, uint dlen)
{
  uint len = ad ? ad->length : 0;
  const byte *pos = ad ? ad->data : NULL;
  struct adata *res = lp_alloc_adata(pool, len + 3 + dlen);
  byte *dst = res->data;
  byte *tlv = NULL;	/* Reserved slot for the new TLV, once found */
  int del = 0;		/* Set after the first matching old TLV was skipped */

  while (len)
  {
    uint ptype = pos[0];
    uint plen = get_u16(pos + 1);

    /* Find position for new TLV */
    if ((ptype >= type) && !tlv)
    {
      tlv = dst;
      dst += 3 + dlen;
    }

    /* Skip first matching TLV, copy others */
    if ((ptype == type) && !del)
      del = 1;
    else
    {
      memcpy(dst, pos, plen);
      dst += plen;
    }

    ADVANCE(pos, len, plen);
  }

  /* No existing TLV with type >= @type: append the new one */
  if (!tlv)
  {
    tlv = dst;
    dst += 3 + dlen;
  }

  /* Store the TLV */
  put_u8(tlv + 0, type);
  put_u16(tlv + 1, 3 + dlen);
  memcpy(tlv + 3, data, dlen);

  /* Update length (shorter than allocated when an old TLV was dropped) */
  res->length = dst - res->data;

  return res;
}

/*
 * Read the value of the BGP_AIGP_METRIC TLV from AIGP attribute data.
 * Returns @def when the TLV is not present.
 */
static u64 UNUSED
bgp_aigp_get_metric(const struct adata *ad, u64 def)
{
  const byte *tlv = bgp_aigp_get_tlv(ad, BGP_AIGP_METRIC);

  if (!tlv)
    return def;

  return get_u64(tlv + 3);
}

/*
 * Return new AIGP attribute data (allocated from @pool) in which the
 * BGP_AIGP_METRIC TLV carries @metric as an 8-byte value.
 */
static const struct adata *
bgp_aigp_set_metric(struct linpool *pool, const struct adata *ad, u64 metric)
{
  byte payload[8];

  put_u64(payload, metric);

  return bgp_aigp_set_tlv(pool, ad, BGP_AIGP_METRIC, payload, sizeof(payload));
}

/*
 * Compute the total AIGP metric of a route: the value from its AIGP metric
 * TLV plus a step derived from the IGP metric.
 *
 * The step is BGP_AIGP_MAX when the route is not resolvable or its IGP metric
 * is IGP_METRIC_UNKNOWN or more, and at least 1 otherwise. The sum saturates
 * at BGP_AIGP_MAX on overflow. Returns 0 (outputs untouched) when the route
 * has no AIGP attribute or no metric TLV; otherwise stores the total in
 * @metric, the attribute data in @ad, and returns 1.
 */
int
bgp_total_aigp_metric_(const rte *e, u64 *metric, const struct adata **ad)
{
  eattr *a = ea_find(e->attrs, BGP_EA_ID(BA_AIGP));
  const byte *tlv = a ? bgp_aigp_get_tlv(a->u.ptr, BGP_AIGP_METRIC) : NULL;

  if (!tlv)
    return 0;

  u64 aigp = get_u64(tlv + 3);
  u64 step = rt_get_igp_metric(e);

  if (!rte_resolvable(e) || (step >= IGP_METRIC_UNKNOWN))
    step = BGP_AIGP_MAX;

  if (step == 0)
    step = 1;

  /* Unsigned wrap-around means the sum overflowed */
  u64 total = aigp + step;
  if (total < aigp)
    total = BGP_AIGP_MAX;

  *ad = a->u.ptr;
  *metric = total;

  return 1;
}

/*
 * Initial AIGP metric for a route whose source is not RTS_BGP: the route's
 * IGP metric, with no existing attribute data (*ad = NULL). Returns 0 without
 * touching the outputs for RTS_BGP routes; otherwise returns whether the IGP
 * metric is below IGP_METRIC_UNKNOWN.
 */
static inline int
bgp_init_aigp_metric(rte *e, u64 *metric, const struct adata **ad)
{
  if (rt_get_source_attr(e) == RTS_BGP)
    return 0;

  u64 igp = rt_get_igp_metric(e);

  *metric = igp;
  *ad = NULL;

  return igp < IGP_METRIC_UNKNOWN;
}

u32
bgp_rte_igp_metric(const rte *rt)
{
  u64 metric = bgp_total_aigp_metric(rt);
  return (u32) MIN(metric, (u64) IGP_METRIC_UNKNOWN);
}


/*
 *	Attribute hooks
 */

/* Export hook: reject the route when ORIGIN holds a value above 2 */
static void
bgp_export_origin(struct bgp_export_state *s, eattr *a)
{
  if (a->u.data <= 2)
    return;

  REJECT(BAD_VALUE, "ORIGIN", a->u.data);
}

/*
 * Decode hook for ORIGIN: exactly one byte with a value of at most 2.
 * Anything else makes the update be treated as withdraw.
 */
static void
bgp_decode_origin(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  if (len != 1)
    WITHDRAW(BAD_LENGTH, "ORIGIN", len);

  uint origin = data[0];

  if (origin > 2)
    WITHDRAW(BAD_VALUE, "ORIGIN", origin);

  bgp_set_attr_u32(to, BA_ORIGIN, flags, origin);
}

/*
 * Format hook for ORIGIN: writes "IGP", "EGP" or "Incomplete" for values
 * 0..2 and "?" for anything else into @buf.
 */
static void
bgp_format_origin(const eattr *a, byte *buf, uint size UNUSED)
{
  static const char * const bgp_origin_names[] = { "IGP", "EGP", "Incomplete" };

  /* Pass the name as an argument; the format itself stays a string literal */
  bsprintf(buf, "%s", (a->u.data <= 2) ? bgp_origin_names[a->u.data] : "?");
}


/*
 * Test whether a 32-bit AS_PATH starts with a non-empty AS_PATH_SEQUENCE or
 * AS_PATH_CONFED_SEQUENCE segment whose first AS number equals @asn.
 */
static inline int
bgp_as_path_first_as_equal(const byte *data, uint len, u32 asn)
{
  /* Segment header (2 bytes) plus one 4-byte AS number */
  if (len < 6)
    return 0;

  int is_sequence = (data[0] == AS_PATH_SEQUENCE) || (data[0] == AS_PATH_CONFED_SEQUENCE);

  if (!is_sequence || (data[1] == 0))
    return 0;

  return get_u32(data+2) == asn;
}

/*
 * Encode hook for AS_PATH. On sessions without 4-byte AS support the stored
 * 32-bit path is first converted by as_path_32to16() into a stack buffer.
 */
static int
bgp_encode_as_path(struct bgp_write_state *s, eattr *a, byte *buf, uint size)
{
  const byte *data = a->u.ptr->data;
  uint len = a->u.ptr->length;

  if (s->as4_session)
    return bgp_put_attr(buf, size, BA_AS_PATH, a->flags, data, len);

  /* Prepare 16-bit AS_PATH (from 32-bit one) in a temporary buffer */
  byte *narrow = alloca(len);
  uint narrow_len = as_path_32to16(narrow, data, len);

  return bgp_put_attr(buf, size, BA_AS_PATH, a->flags, narrow, narrow_len);
}

/*
 * Decode hook for AS_PATH. Validates the received path, converts it to the
 * 32-bit form on sessions without 4-byte AS support, applies the
 * confederation and first-AS checks, and stores the result. Every failed
 * check makes the update be treated as withdraw.
 */
static void
bgp_decode_as_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  struct bgp_proto *p = s->proto;
  char err[128];

  /* Validation parameters given by the session and its configuration */
  int asn_size = s->as4_session ? 4 : 2;
  int sets_allowed = p->cf->allow_as_sets;
  int confed_allowed = p->cf->confederation && p->is_interior;

  if (!as_path_valid(data, len, asn_size, sets_allowed, confed_allowed, err, sizeof(err)))
    WITHDRAW("Malformed AS_PATH attribute - %s", err);

  if (!s->as4_session)
  {
    /* Prepare 32-bit AS_PATH (from 16-bit one) in a temporary buffer */
    byte *narrow = data;
    data = alloca(2*len);
    len = as_path_16to32(data, narrow, len);
  }

  /* In some circumstances check for initial AS_CONFED_SEQUENCE; RFC 5065 5.0 */
  int confed_external = p->is_interior && !p->is_internal;
  int confed_head = (len >= 2) && (data[0] == AS_PATH_CONFED_SEQUENCE);

  if (confed_external && !confed_head)
    WITHDRAW("Malformed AS_PATH attribute - %s", "missing initial AS_CONFED_SEQUENCE");

  /* Reject routes with first AS in AS_PATH not matching neighbor AS; RFC 4271 6.3 */
  int check_first_as = !p->is_internal && p->cf->enforce_first_as;

  if (check_first_as && !bgp_as_path_first_as_equal(data, len, p->remote_as))
    WITHDRAW("Malformed AS_PATH attribute - %s", "First AS differs from neigbor AS");

  bgp_set_attr_data(to, BA_AS_PATH, flags, data, len);
}


/*
 * Encode hook for NEXT_HOP.
 *
 * The NEXT_HOP attribute is used only in traditional (IPv4) BGP. In MP-BGP,
 * the next hop is encoded as a part of the MP_REACH_NLRI attribute, so it is
 * only remembered in s->mp_next_hop and encoded later by AFI-specific hooks.
 *
 * Returns 7 when the attribute was written, 0 when nothing was written, and
 * -1 when @size is too small.
 */
static int
bgp_encode_next_hop(struct bgp_write_state *s, eattr *a, byte *buf, uint size)
{
  if (s->mp_reach)
  {
    s->mp_next_hop = a;
    return 0;
  }

  /* FIXME: skip IPv6 next hops for IPv4 routes during MRT dump */
  ip_addr *addr = (void *) a->u.ptr->data;
  int single_ip4 = (a->u.ptr->length == sizeof(ip_addr)) && ipa_is_ip4(*addr);

  if (!single_ip4)
    return 0;

  if (size < (3+4))
    return -1;

  bgp_put_attr_hdr3(buf, BA_NEXT_HOP, a->flags, 4);
  put_ip4(buf+3, ipa_to_ip4(*addr));

  return 3+4;
}

/*
 * Decode hook for NEXT_HOP: only checks the length and remembers where the
 * address is; nothing is stored in the attribute list here.
 */
static void
bgp_decode_next_hop(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data, uint len, ea_list **to UNUSED)
{
  if (len == 4)
  {
    /* Semantic checks are done later */
    s->ip_next_hop_len = len;
    s->ip_next_hop_data = data;
    return;
  }

  WITHDRAW(BAD_LENGTH, "NEXT_HOP", len);
}

/*
 * Format hook for the next hop attribute. The data holds one or two
 * addresses (16 or 32 bytes); the second one is printed only when present
 * and non-zero.
 *
 * TODO: This function should use AF-specific hook
 */
static void
bgp_format_next_hop(const eattr *a, byte *buf, uint size UNUSED)
{
  uint len = a->u.ptr->length;
  ip_addr *nh = (void *) a->u.ptr->data;

  ASSERT((len == 16) || (len == 32));

  /* in IPv6, we may have two addresses in NEXT HOP */
  int only_first = (len == 16) || ipa_zero(nh[1]);

  if (only_first)
    bsprintf(buf, "%I", nh[0]);
  else
    bsprintf(buf, "%I %I", nh[0], nh[1]);
}


/* Decode hook for MULTI_EXIT_DISC: a single 32-bit value */
static void
bgp_decode_med(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  if (len != 4)
    WITHDRAW(BAD_LENGTH, "MULTI_EXIT_DISC", len);

  bgp_set_attr_u32(to, BA_MULTI_EXIT_DISC, flags, get_u32(data));
}


/*
 * Export hook for LOCAL_PREF: the attribute is kept only on interior
 * sessions or when the allow_local_pref option is set.
 */
static void
bgp_export_local_pref(struct bgp_export_state *s, eattr *a)
{
  if (s->proto->is_interior || s->proto->cf->allow_local_pref)
    return;

  UNSET(a);
}

/*
 * Decode hook for LOCAL_PREF. The attribute is discarded unless the session
 * is interior or allow_local_pref is set; a wrong length makes the update be
 * treated as withdraw.
 */
static void
bgp_decode_local_pref(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  int accepted = s->proto->is_interior || s->proto->cf->allow_local_pref;

  if (!accepted)
    DISCARD(BAD_EBGP, "LOCAL_PREF");

  if (len != 4)
    WITHDRAW(BAD_LENGTH, "LOCAL_PREF", len);

  bgp_set_attr_u32(to, BA_LOCAL_PREF, flags, get_u32(data));
}


/*
 * Decode hook for ATOMIC_AGGR: the attribute carries no data and is stored
 * with an empty value; a non-zero length only discards the attribute.
 */
static void
bgp_decode_atomic_aggr(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data UNUSED, uint len, ea_list **to)
{
  if (len)
    DISCARD(BAD_LENGTH, "ATOMIC_AGGR", len);

  bgp_set_attr_data(to, BA_ATOMIC_AGGR, flags, NULL, 0);
}

/*
 * Encode hook for AGGREGATOR. On sessions without 4-byte AS support the
 * stored value is first converted by aggregator_32to16() into a 6-byte
 * temporary.
 */
static int
bgp_encode_aggregator(struct bgp_write_state *s, eattr *a, byte *buf, uint size)
{
  /* Temporary for the 16-bit AGGREGATOR; must stay alive until bgp_put_attr() */
  byte narrow[6];
  const byte *data = a->u.ptr->data;
  uint len = a->u.ptr->length;

  if (!s->as4_session)
  {
    len = aggregator_32to16(narrow, data);
    data = narrow;
  }

  return bgp_put_attr(buf, size, BA_AGGREGATOR, a->flags, data, len);
}

/*
 * Decode hook for AGGREGATOR. The expected length is 8 bytes on 4-byte AS
 * sessions and 6 bytes otherwise; the short form is converted by
 * aggregator_16to32() before it is stored. A wrong length only discards the
 * attribute.
 */
static void
bgp_decode_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  /* Temporary for the 32-bit AGGREGATOR; must stay alive until it is stored */
  byte wide[8];
  uint expected = s->as4_session ? 8 : 6;

  if (len != expected)
    DISCARD(BAD_LENGTH, "AGGREGATOR", len);

  if (!s->as4_session)
  {
    len = aggregator_16to32(wide, data);
    data = wide;
  }

  bgp_set_attr_data(to, BA_AGGREGATOR, flags, data, len);
}

/*
 * Format hook for (AS4_)AGGREGATOR: prints the IPv4 address found at offset 4
 * followed by the 32-bit AS number found at offset 0.
 */
static void
bgp_format_aggregator(const eattr *a, byte *buf, uint size UNUSED)
{
  const byte *asn_field = a->u.ptr->data;
  const byte *addr_field = asn_field + 4;

  bsprintf(buf, "%I4 AS%u", get_ip4(addr_field), get_u32(asn_field));
}


/*
 * Export hook for COMMUNITY: an empty set is dropped, a non-empty one is
 * replaced by the result of int_set_sort() allocated from s->pool.
 */
static void
bgp_export_community(struct bgp_export_state *s, eattr *a)
{
  if (a->u.ptr->length)
  {
    a->u.ptr = int_set_sort(s->pool, a->u.ptr);
    return;
  }

  UNSET(a);
}

/*
 * Decode hook for COMMUNITY: the data must be a non-empty sequence of 4-byte
 * values, which are converted by get_u32s() into newly allocated adata.
 */
static void
bgp_decode_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  int bad_length = (len == 0) || ((len % 4) != 0);

  if (bad_length)
    WITHDRAW(BAD_LENGTH, "COMMUNITY", len);

  uint count = len / 4;
  struct adata *set = lp_alloc_adata(s->pool, len);

  get_u32s(data, (u32 *) set->data, count);
  bgp_set_attr_ptr(to, BA_COMMUNITY, flags, set);
}


/* Export hook for ORIGINATOR_ID: kept only on internal sessions */
static void
bgp_export_originator_id(struct bgp_export_state *s, eattr *a)
{
  if (s->proto->is_internal)
    return;

  UNSET(a);
}

/*
 * Decode hook for ORIGINATOR_ID: discarded on non-internal sessions,
 * otherwise stored as a single 32-bit value.
 */
static void
bgp_decode_originator_id(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  if (!s->proto->is_internal)
    DISCARD(BAD_EBGP, "ORIGINATOR_ID");

  if (len != 4)
    WITHDRAW(BAD_LENGTH, "ORIGINATOR_ID", len);

  bgp_set_attr_u32(to, BA_ORIGINATOR_ID, flags, get_u32(data));
}


static void
bgp_export_cluster_list(struct bgp_export_state *s UNUSED, eattr *a)
{
  if (!s->proto->is_internal)
    UNSET(a);

  if (a->u.ptr->length == 0)
    UNSET(a);
}

/*
 * Decode hook for CLUSTER_LIST: discarded on non-internal sessions; otherwise
 * the data must be a non-empty sequence of 4-byte values, which are converted
 * by get_u32s() into newly allocated adata.
 */
static void
bgp_decode_cluster_list(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  if (!s->proto->is_internal)
    DISCARD(BAD_EBGP, "CLUSTER_LIST");

  int bad_length = (len == 0) || ((len % 4) != 0);

  if (bad_length)
    WITHDRAW(BAD_LENGTH, "CLUSTER_LIST", len);

  uint count = len / 4;
  struct adata *list = lp_alloc_adata(s->pool, len);

  get_u32s(data, (u32 *) list->data, count);
  bgp_set_attr_ptr(to, BA_CLUSTER_LIST, flags, list);
}

/*
 * Format hook for CLUSTER_LIST: delegates to int_set_format() with the output
 * limited to @size bytes.
 * NOTE(review): the meaning of the constant arguments 0 and -1 is defined by
 * int_set_format() - confirm there before changing them.
 */
static void
bgp_format_cluster_list(const eattr *a, byte *buf, uint size)
{
  /* Truncates cluster lists larger than buflen, probably not a problem */
  int_set_format(a->u.ptr, 0, -1, buf, size);
}


/*
 * Encode the next hop as a limited MP_REACH_NLRI attribute. Returns the
 * number of bytes written, or -1 when @size is too small.
 */
int
bgp_encode_mp_reach_mrt(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size)
{
  /*
   *	Limited version of MP_REACH_NLRI used for MRT table dumps (IPv6 only):
   *
   *	3 B	MP_REACH_NLRI header
   *	1 B	MP_REACH_NLRI data - Length of Next Hop Network Address
   *	var	MP_REACH_NLRI data - Network Address of Next Hop
   */

  uint len = a->u.ptr->length;
  ip_addr *nh = (void *) a->u.ptr->data;

  ASSERT((len == 16) || (len == 32));

  uint total = 3+1+len;

  if (size < total)
    return -1;

  bgp_put_attr_hdr3(buf, BA_MP_REACH_NLRI, BAF_OPTIONAL, 1+len);
  buf[3] = len;

  /* One or two addresses follow the length byte */
  byte *addr = buf + 4;

  put_ip6(addr, ipa_to_ip6(nh[0]));

  if (len == 32)
    put_ip6(addr + 16, ipa_to_ip6(nh[1]));

  return total;
}

/*
 * Read a 3-byte address family identifier from @buf: the 16-bit value at
 * offset 0 (AFI) ends up in the upper half of the result and the byte at
 * offset 2 (SAFI) in the lowest byte.
 */
static inline u32
get_af3(byte *buf)
{
  /*
   * Widen before shifting. A 16-bit value is promoted to signed int, so
   * shifting a received AFI >= 0x8000 left by 16 would overflow it, which is
   * undefined behavior.
   */
  u32 afi = get_u16(buf);

  return (afi << 16) | buf[2];
}

/*
 * Decode hook for MP_REACH_NLRI: checks the framing and records the address
 * family, next hop and NLRI locations in the parse state. Nothing is stored
 * in the attribute list.
 */
static void
bgp_decode_mp_reach_nlri(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data, uint len, ea_list **to UNUSED)
{
  /*
   *	2 B	MP_REACH_NLRI data - Address Family Identifier
   *	1 B	MP_REACH_NLRI data - Subsequent Address Family Identifier
   *	1 B	MP_REACH_NLRI data - Length of Next Hop Network Address
   *	var	MP_REACH_NLRI data - Network Address of Next Hop
   *	1 B	MP_REACH_NLRI data - Reserved (zero)
   *	var	MP_REACH_NLRI data - Network Layer Reachability Information
   */

  /* The fixed fields and the announced next hop must fit in the attribute */
  if ((len < 5) || (len < (5 + (uint) data[3])))
    bgp_parse_error(s, 9);

  uint nh_len = data[3];
  byte *nh_data = data + 4;

  s->mp_reach_af = get_af3(data);
  s->mp_next_hop_len = nh_len;
  s->mp_next_hop_data = nh_data;
  s->mp_reach_len = len - 5 - nh_len;
  s->mp_reach_nlri = nh_data + 1 + nh_len;
}


/*
 * Decode hook for MP_UNREACH_NLRI: checks the minimal length and records the
 * address family and NLRI location in the parse state. Nothing is stored in
 * the attribute list.
 */
static void
bgp_decode_mp_unreach_nlri(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data, uint len, ea_list **to UNUSED)
{
  /*
   *	2 B	MP_UNREACH_NLRI data - Address Family Identifier
   *	1 B	MP_UNREACH_NLRI data - Subsequent Address Family Identifier
   *	var	MP_UNREACH_NLRI data - Network Layer Reachability Information
   */

  enum { AF_LEN = 3 };

  if (len < AF_LEN)
    bgp_parse_error(s, 9);

  s->mp_unreach_af = get_af3(data);
  s->mp_unreach_len = len - AF_LEN;
  s->mp_unreach_nlri = data + AF_LEN;
}


/*
 * Export hook for EXT_COMMUNITY. On interior sessions the set is sorted into
 * a new copy; on other sessions it is first filtered by
 * ec_set_del_nontrans() and the filtered copy is sorted in place. An empty
 * set drops the attribute in both cases.
 */
static void
bgp_export_ext_community(struct bgp_export_state *s, eattr *a)
{
  if (s->proto->is_interior)
  {
    if (a->u.ptr->length == 0)
      UNSET(a);

    a->u.ptr = ec_set_sort(s->pool, a->u.ptr);
    return;
  }

  struct adata *filtered = ec_set_del_nontrans(s->pool, a->u.ptr);

  if (filtered->length == 0)
    UNSET(a);

  ec_set_sort_x(filtered);
  a->u.ptr = filtered;
}

/*
 * Decode hook for EXT_COMMUNITY: the data must be a non-empty sequence of
 * 8-byte items, converted as 32-bit words by get_u32s() into new adata.
 */
static void
bgp_decode_ext_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  int bad_length = (len == 0) || ((len % 8) != 0);

  if (bad_length)
    WITHDRAW(BAD_LENGTH, "EXT_COMMUNITY", len);

  uint words = len / 4;
  struct adata *set = lp_alloc_adata(s->pool, len);

  get_u32s(data, (u32 *) set->data, words);
  bgp_set_attr_ptr(to, BA_EXT_COMMUNITY, flags, set);
}


/*
 * Decode hook for AS4_AGGREGATOR: discarded on 4-byte AS sessions and when
 * the length is not 8; otherwise stored as received.
 */
static void
bgp_decode_as4_aggregator(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  enum { AS4_AGGREGATOR_LEN = 8 };

  if (s->as4_session)
    DISCARD(NEW_BGP, "AS4_AGGREGATOR");

  if (len != AS4_AGGREGATOR_LEN)
    DISCARD(BAD_LENGTH, "AS4_AGGREGATOR", len);

  bgp_set_attr_data(to, BA_AS4_AGGREGATOR, flags, data, len);
}

/*
 * Decode hook for AS4_PATH. The attribute is discarded on 4-byte AS
 * sessions, when it is shorter than 6 bytes, or when as_path_valid() rejects
 * it. Confederation segments are stripped (with a log message) before the
 * path is stored.
 */
static void
bgp_decode_as4_path(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  struct bgp_proto *p = s->proto;
  int sets_allowed = p->cf->allow_as_sets;
  char err[128];

  if (s->as4_session)
    DISCARD(NEW_BGP, "AS4_PATH");

  if (len < 6)
    DISCARD(BAD_LENGTH, "AS4_PATH", len);

  if (!as_path_valid(data, len, 4, sets_allowed, 1, err, sizeof(err)))
    DISCARD("Malformed AS4_PATH attribute - %s", err);

  /* Work on a private copy of the received data */
  struct adata *path = lp_alloc_adata(s->pool, len);
  memcpy(path->data, data, len);

  /* AS_CONFED* segments are invalid in AS4_PATH; RFC 6793 6 */
  if (as_path_contains_confed(path))
  {
    REPORT("Discarding AS_CONFED* segment from AS4_PATH attribute");
    path = as_path_strip_confed(s->pool, path);
  }

  bgp_set_attr_ptr(to, BA_AS4_PATH, flags, path);
}


/* Export hook for AIGP: kept only when the channel has the aigp option set */
static void
bgp_export_aigp(struct bgp_export_state *s, eattr *a)
{
  if (s->channel->cf->aigp)
    return;

  UNSET(a);
}

/*
 * Decode hook for AIGP. The received optional/transitive bits must match the
 * ones in the attribute table and the TLV framing must pass
 * bgp_aigp_valid(); otherwise the attribute is discarded.
 */
static void
bgp_decode_aigp(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  char err[128];

  /* Acceptability test postponed to bgp_finish_attrs() */

  uint checked_bits = BAF_OPTIONAL | BAF_TRANSITIVE;
  uint mismatch = (flags ^ bgp_attr_table[BA_AIGP].flags) & checked_bits;

  if (mismatch)
    DISCARD("Malformed AIGP attribute - conflicting flags (%02x)", flags);

  if (!bgp_aigp_valid(data, len, err, sizeof(err)))
    DISCARD("Malformed AIGP attribute - %s", err);

  bgp_set_attr_data(to, BA_AIGP, flags, data, len);
}

/*
 * Format hook for AIGP: prints the value of the metric TLV, or "?" when the
 * attribute has no such TLV.
 */
static void
bgp_format_aigp(const eattr *a, byte *buf, uint size UNUSED)
{
  const byte *tlv = bgp_aigp_get_tlv(a->u.ptr, BGP_AIGP_METRIC);

  if (tlv)
    bsprintf(buf, "%lu", get_u64(tlv + 3));
  else
    bsprintf(buf, "?");
}


/*
 * Export hook for LARGE_COMMUNITY: an empty set is dropped, a non-empty one
 * is replaced by the result of lc_set_sort() allocated from s->pool.
 */
static void
bgp_export_large_community(struct bgp_export_state *s, eattr *a)
{
  if (a->u.ptr->length)
  {
    a->u.ptr = lc_set_sort(s->pool, a->u.ptr);
    return;
  }

  UNSET(a);
}

/*
 * Decode hook for LARGE_COMMUNITY: the data must be a non-empty sequence of
 * 12-byte items, converted as 32-bit words by get_u32s() into new adata.
 */
static void
bgp_decode_large_community(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  int bad_length = (len == 0) || ((len % 12) != 0);

  if (bad_length)
    WITHDRAW(BAD_LENGTH, "LARGE_COMMUNITY", len);

  uint words = len / 4;
  struct adata *set = lp_alloc_adata(s->pool, len);

  get_u32s(data, (u32 *) set->data, words);
  bgp_set_attr_ptr(to, BA_LARGE_COMMUNITY, flags, set);
}


/*
 * Decode hook for ONLY_TO_CUSTOMER: a single 32-bit value. A wrong length
 * makes the update be treated as withdraw.
 */
static void
bgp_decode_otc(struct bgp_parse_state *s, uint code UNUSED, uint flags, byte *data, uint len, ea_list **to)
{
  if (len != 4)
    WITHDRAW(BAD_LENGTH, "OTC", len);

  /* 'data' is read here, so it must not be annotated as UNUSED */
  u32 val = get_u32(data);
  bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, flags, val);
}


/*
 * Export hook for the MPLS label stack. The route is rejected when MPLS is
 * not expected (s->mpls unset), when the stack is empty, when the labels
 * together with the prefix would not fit into the NLRI length, or when any
 * label exceeds 20 bits.
 */
static void
bgp_export_mpls_label_stack(struct bgp_export_state *s, eattr *a)
{
  const net_addr *n = s->route->net;
  u32 *labels = (u32 *) a->u.ptr->data;
  uint lnum = a->u.ptr->length / 4;

  /* Perhaps we should just ignore it? */
  if (!s->mpls)
    REJECT("Unexpected MPLS stack");

  /* Empty MPLS stack is not allowed */
  if (lnum == 0)
    REJECT("Malformed MPLS stack - empty");

  /* This is ugly, but we must ensure that labels fit into NLRI field */
  uint nlri_bits = 24*lnum + (net_is_vpn(n) ? 64 : 0) + net_pxlen(n);

  if (nlri_bits > 255)
    REJECT("Malformed MPLS stack - too many labels (%u)", lnum);

  for (u32 *label = labels; label < labels + lnum; label++)
  {
    if (*label > 0xfffff)
      REJECT("Malformed MPLS stack - invalid label (%u)", *label);

    /* TODO: Check for special-purpose label values? */
  }
}

/*
 * Encode hook for the MPLS label stack. Writes nothing to the buffer and
 * always returns 0; the stack is only remembered in the write state.
 */
static int
bgp_encode_mpls_label_stack(struct bgp_write_state *s, eattr *a, byte *buf UNUSED, uint size UNUSED)
{
  /*
   * MPLS labels are encoded as a part of the NLRI in MP_REACH_NLRI attribute,
   * so we store MPLS_LABEL_STACK and encode it later by AFI-specific hooks.
   */

  s->mpls_labels = a->u.ptr;
  return 0;
}

/*
 * Decode hook for the MPLS label stack entry of the attribute table. Such an
 * attribute is never accepted from the wire: the hook only logs a message and
 * returns without storing anything.
 */
static void
bgp_decode_mpls_label_stack(struct bgp_parse_state *s, uint code UNUSED, uint flags UNUSED, byte *data UNUSED, uint len UNUSED, ea_list **to UNUSED)
{
  DISCARD("Discarding received attribute #0");
}

/*
 * Format hook for the MPLS label stack: prints the labels separated by
 * slashes. When fewer than 20 bytes of the buffer remain, the output is cut
 * short with "...".
 */
static void
bgp_format_mpls_label_stack(const eattr *a, byte *buf, uint size)
{
  u32 *labels = (u32 *) a->u.ptr->data;
  uint lnum = a->u.ptr->length / 4;
  char *pos = buf;
  uint i = 0;

  while (i < lnum)
  {
    if (size < 20)
    {
      bsprintf(pos, "...");
      return;
    }

    uint l = bsprintf(pos, "%d/", labels[i]);
    ADVANCE(pos, size, l);
    i++;
  }

  /* Clear last slash or terminate empty string */
  if (lnum)
    pos[-1] = 0;
  else
    pos[0] = 0;
}

/*
 * Export hook for unknown attributes: non-transitive ones are dropped,
 * transitive ones are passed on with the partial bit set.
 */
static inline void
bgp_export_unknown(struct bgp_export_state *s UNUSED, eattr *a)
{
  if (a->flags & BAF_TRANSITIVE)
  {
    a->flags |= BAF_PARTIAL;
    return;
  }

  UNSET(a);
}

static inline void
bgp_decode_unknown(struct bgp_parse_state *s UNUSED, uint code, uint flags, byte *data, uint len, ea_list **to)
{
  if (!(flags & BAF_OPTIONAL))
    WITHDRAW("Unknown attribute (code %u) - conflicting flags (%02x)", code, flags);

  /* Cannot use bgp_set_attr_data() as it works on known attributes only */
  ea_set_attr_data(to, &bgp_attr_table[code].class, flags, data, len);
}

/*
 * Format hook for unknown attributes: writes "(transitive)" for transitive
 * ones and leaves @buf untouched otherwise.
 */
static inline void
bgp_format_unknown(const eattr *a, byte *buf, uint size)
{
  if (!(a->flags & BAF_TRANSITIVE))
    return;

  bsnprintf(buf, size, "(transitive)");
}


/*
 *	Attribute table
 */

/*
 * Descriptors of known BGP attributes, indexed by attribute code (BA_*).
 * Entries not listed here stay zeroed until bgp_register_attrs() fills them
 * in as unknown attributes. The .flags value is what bgp_export_attr() puts
 * into exported attributes (keeping only the partial bit of the old flags).
 */
static union bgp_attr_desc bgp_attr_table[BGP_ATTR_MAX] = {
  [BA_ORIGIN] = {
    .name = "bgp_origin",
    .type = T_ENUM_BGP_ORIGIN,
    .flags = BAF_TRANSITIVE,
    .export = bgp_export_origin,
    .encode = bgp_encode_u8,
    .decode = bgp_decode_origin,
    .format = bgp_format_origin,
  },
  [BA_AS_PATH] = {
    .name = "bgp_path",
    .type = T_PATH,
    .flags = BAF_TRANSITIVE,
    .encode = bgp_encode_as_path,
    .decode = bgp_decode_as_path,
  },
  [BA_NEXT_HOP] = {
    .name = "bgp_next_hop",
    .type = T_IP,
    .flags = BAF_TRANSITIVE,
    .encode = bgp_encode_next_hop,
    .decode = bgp_decode_next_hop,
    .format = bgp_format_next_hop,
  },
  [BA_MULTI_EXIT_DISC] = {
    .name = "bgp_med",
    .type = T_INT,
    .flags = BAF_OPTIONAL,
    .encode = bgp_encode_u32,
    .decode = bgp_decode_med,
  },
  [BA_LOCAL_PREF] = {
    .name = "bgp_local_pref",
    .type = T_INT,
    .flags = BAF_TRANSITIVE,
    .export = bgp_export_local_pref,
    .encode = bgp_encode_u32,
    .decode = bgp_decode_local_pref,
  },
  [BA_ATOMIC_AGGR] = {
    .name = "bgp_atomic_aggr",
    .type = T_OPAQUE,
    .flags = BAF_TRANSITIVE,
    .encode = bgp_encode_raw,
    .decode = bgp_decode_atomic_aggr,
  },
  [BA_AGGREGATOR] = {
    .name = "bgp_aggregator",
    .type = T_OPAQUE,
    .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
    .encode = bgp_encode_aggregator,
    .decode = bgp_decode_aggregator,
    .format = bgp_format_aggregator,
  },
  [BA_COMMUNITY] = {
    .name = "bgp_community",
    .type = T_CLIST,
    .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
    .export = bgp_export_community,
    .encode = bgp_encode_u32s,
    .decode = bgp_decode_community,
  },
  [BA_ORIGINATOR_ID] = {
    .name = "bgp_originator_id",
    .type = T_QUAD,
    .flags = BAF_OPTIONAL,
    .export = bgp_export_originator_id,
    .encode = bgp_encode_u32,
    .decode = bgp_decode_originator_id,
  },
  [BA_CLUSTER_LIST] = {
    .name = "bgp_cluster_list",
    .type = T_CLIST,
    .flags = BAF_OPTIONAL,
    .export = bgp_export_cluster_list,
    .encode = bgp_encode_u32s,
    .decode = bgp_decode_cluster_list,
    .format = bgp_format_cluster_list,
  },
  /* Pseudoattribute: decode only, there is no encode hook */
  [BA_MP_REACH_NLRI] = {
    .name = "bgp_mp_reach_nlri",
    .type = T_OPAQUE,
    .hidden = 1,
    .flags = BAF_OPTIONAL,
    .decode = bgp_decode_mp_reach_nlri,
  },
  /* Pseudoattribute: decode only, there is no encode hook */
  [BA_MP_UNREACH_NLRI] = {
    .name = "bgp_mp_unreach_nlri",
    .type = T_OPAQUE,
    .hidden = 1,
    .flags = BAF_OPTIONAL,
    .decode = bgp_decode_mp_unreach_nlri,
  },
  [BA_EXT_COMMUNITY] = {
    .name = "bgp_ext_community",
    .type = T_ECLIST,
    .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
    .export = bgp_export_ext_community,
    .encode = bgp_encode_u32s,
    .decode = bgp_decode_ext_community,
  },
  [BA_AS4_PATH] = {
    .name = "bgp_as4_path",
    .type = T_PATH,
    .hidden = 1,
    .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
    .encode = bgp_encode_raw,
    .decode = bgp_decode_as4_path,
  },
  [BA_AS4_AGGREGATOR] = {
    .name = "bgp_as4_aggregator",
    .type = T_OPAQUE,
    .hidden = 1,
    .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
    .encode = bgp_encode_raw,
    .decode = bgp_decode_as4_aggregator,
    .format = bgp_format_aggregator,
  },
  /* bgp_decode_aigp() compares the received flags against this entry */
  [BA_AIGP] = {
    .name = "bgp_aigp",
    .type = T_OPAQUE,
    .flags = BAF_OPTIONAL | BAF_DECODE_FLAGS,
    .export = bgp_export_aigp,
    .encode = bgp_encode_raw,
    .decode = bgp_decode_aigp,
    .format = bgp_format_aigp,
  },
  [BA_LARGE_COMMUNITY] = {
    .name = "bgp_large_community",
    .type = T_LCLIST,
    .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
    .export = bgp_export_large_community,
    .encode = bgp_encode_u32s,
    .decode = bgp_decode_large_community,
  },
  [BA_ONLY_TO_CUSTOMER] = {
    .name = "otc",
    .type = T_INT,
    .flags = BAF_OPTIONAL | BAF_TRANSITIVE,
    .encode = bgp_encode_u32,
    .decode = bgp_decode_otc,
  },
  /* Encoded as a part of the NLRI, never as an attribute of its own */
  [BA_MPLS_LABEL_STACK] = {
    .name = "bgp_mpls_label_stack",
    .type = T_CLIST,
    .readonly = 1,
    .export = bgp_export_mpls_label_stack,
    .encode = bgp_encode_mpls_label_stack,
    .decode = bgp_decode_mpls_label_stack,
    .format = bgp_format_mpls_label_stack,
  },
};

/*
 * Find the BGP attribute with the given code in an attribute list. Returns
 * whatever ea_find() returns for the attribute id from bgp_attr_table.
 */
eattr *
bgp_find_attr(ea_list *attrs, uint code)
{
  return ea_find(attrs, BGP_EA_ID(code));
}

/*
 * Register every entry of bgp_attr_table as an attribute class. Entries
 * without a name are first filled in as read-only opaque "unknown" attributes
 * named after their code, with the generic unknown-attribute hooks.
 */
void
bgp_register_attrs(void)
{
  for (uint i=0; i<ARRAY_SIZE(bgp_attr_table); i++)
  {
    union bgp_attr_desc *desc = &bgp_attr_table[i];

    if (!desc->name)
      *desc = (union bgp_attr_desc) {
	.name = mb_sprintf(&root_pool, "bgp_unknown_0x%02x", i),
	.type = T_OPAQUE,
	.flags = BAF_OPTIONAL,
	.readonly = 1,
	.export = bgp_export_unknown,
	.encode = bgp_encode_raw,
	.decode = bgp_decode_unknown,
	.format = bgp_format_unknown,
      };

    ea_register_init(&desc->class);
  }
}

/*
 *	Attribute export
 */

/*
 * Export one attribute. Attributes without a BGP descriptor are skipped.
 * Otherwise the flags are rebuilt from the descriptor, the attribute's export
 * hook (if any) is called, and the attribute is appended to @to unless the
 * hook marked it as undefined.
 */
static inline void
bgp_export_attr(struct bgp_export_state *s, eattr *a, ea_list *to)
{
  const union bgp_attr_desc *desc = bgp_find_attr_desc(a);

  if (!desc)
    return;

  /* The flags might have been zero if the attr was added locally */
  a->flags = (a->flags & BAF_PARTIAL) | desc->flags;

  /* Set partial bit if new opt-trans attribute is attached to non-local route */
  int opt_trans = (a->flags & BAF_OPTIONAL) && (a->flags & BAF_TRANSITIVE);

  if (s->src && a->originated && opt_trans)
    a->flags |= BAF_PARTIAL;

  /* Call specific hook */
  CALL(desc->export, s, a);

  /* Attribute might become undefined in hook; append it only if it did not */
  if (!a->undef)
    to->attrs[to->count++] = *a;
}

/**
 * bgp_export_attrs - export BGP attributes
 * @s: BGP export state
 * @a: a list of extended attributes
 *
 * The bgp_export_attrs() function normalizes the attribute list into one
 * segment and runs every attribute through bgp_export_attr(). The segment is
 * compacted in place: its count is reset and only attributes that survive the
 * export are appended again. Attributes are validated and normalized by
 * type-specific export hooks and attribute flags are updated. Some attributes
 * may be eliminated (e.g. unknown non-transitive attributes, or empty
 * community sets).
 *
 * Result: one attribute list segment, or NULL if an export hook rejected the
 * route.
 */
static inline ea_list *
bgp_export_attrs(struct bgp_export_state *s, ea_list *a)
{
  /* Merge the attribute list */
  ea_list *new = ea_normalize(a, 0);
  ASSERT_DIE(new);

  /* Remember the original size, then refill the segment from its start */
  uint total = new->count;
  new->count = 0;

  /* Export each attribute */
  for (uint i = 0; i < total; i++)
    bgp_export_attr(s, &new->attrs[i], new);

  return s->err_reject ? NULL : new;
}


/*
 *	Attribute encoding
 */

/*
 * Encode one attribute @a into @buf (at most @size bytes) using the encode
 * hook of its descriptor. The descriptor must exist. Returns the hook result.
 */
static inline int
bgp_encode_attr(struct bgp_write_state *s, eattr *a, byte *buf, uint size)
{
  const union bgp_attr_desc *desc = bgp_find_attr_desc(a);
  ASSERT_DIE(desc);

  int written = desc->encode(s, a, buf, size);
  return written;
}

/**
 * bgp_encode_attrs - encode BGP attributes
 * @s: BGP write state
 * @attrs: a list of extended attributes
 * @buf: buffer
 * @end: buffer end
 *
 * The bgp_encode_attrs() function walks the attribute list and writes each
 * attribute in its BGP representation (a part of an Update message) to the
 * buffer, one after another.
 * BGP write state may be fake when called from MRT protocol.
 *
 * Result: Length of the attribute block generated or -1 if not enough space.
 */
int
bgp_encode_attrs(struct bgp_write_state *s, ea_list *attrs, byte *buf, byte *end)
{
  byte *wp = buf;
  int i;

  for (i = 0; i < attrs->count; i++)
  {
    /* Each attribute gets whatever space is left up to @end */
    int n = bgp_encode_attr(s, &attrs->attrs[i], wp, end - wp);

    /* A negative result aborts the whole block */
    if (n < 0)
      return -1;

    wp += n;
  }

  return wp - buf;
}


/*
 *	Attribute decoding
 */

/* Forward declaration; called from bgp_decode_attrs() for sessions that are not AS4-aware */
static void bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool);

/*
 * Test whether @asn appears in the AS_PATH of @attrs at least
 * (allow_local_as + 1) times, as judged by as_path_contains().
 * Returns 0 when there is no AS_PATH or when that threshold is not positive.
 */
static inline int
bgp_as_path_loopy(struct bgp_proto *p, ea_list *attrs, u32 asn)
{
  eattr *path = bgp_find_attr(attrs, BA_AS_PATH);
  int threshold = p->cf->allow_local_as + 1;

  if (!path)
    return 0;

  if (threshold <= 0)
    return 0;

  return as_path_contains(path->u.ptr, asn, threshold) ? 1 : 0;
}

/*
 * Test whether the ORIGINATOR_ID attribute in @attrs equals our local
 * router ID. Returns 0 when the attribute is absent.
 */
static inline int
bgp_originator_id_loopy(struct bgp_proto *p, ea_list *attrs)
{
  eattr *orig_id = bgp_find_attr(attrs, BA_ORIGINATOR_ID);

  if (!orig_id)
    return 0;

  return orig_id->u.data == p->local_id;
}

/*
 * Test whether the CLUSTER_LIST attribute in @attrs contains our route
 * reflector cluster ID. Returns 0 when the attribute is absent.
 */
static inline int
bgp_cluster_list_loopy(struct bgp_proto *p, ea_list *attrs)
{
  eattr *clist = bgp_find_attr(attrs, BA_CLUSTER_LIST);

  if (!clist)
    return 0;

  return int_set_contains(clist->u.ptr, p->rr_cluster_id) ? 1 : 0;
}

/*
 * Decode one received attribute.
 *
 * @s: BGP parse state; s->attrs_seen records which attribute codes were
 *     already met in the current attribute block
 * @code, @flags: attribute type code and flags byte as read from the wire
 * @data, @len: attribute payload
 * @to: attribute list handed over to the decode hook
 *
 * Duplicates and conflicting flags are handled before the decode hook from
 * bgp_attr_table is called.
 * NOTE(review): DISCARD() and WITHDRAW() are macros defined outside this
 * chunk; they presumably leave the function - confirm.
 */
static inline void
bgp_decode_attr(struct bgp_parse_state *s, byte code, byte flags, byte *data, uint len, ea_list **to)
{
  /* Handle duplicate attributes; RFC 7606 3 (g) */
  if (BIT32_TEST(s->attrs_seen, code))
  {
    /* Duplicate MP_(UN)REACH_NLRI is a parse error, any other duplicate is discarded */
    if ((code == BA_MP_REACH_NLRI) || (code == BA_MP_UNREACH_NLRI))
      bgp_parse_error(s, 1);
    else
      DISCARD("Discarding duplicate attribute (code %u)", code);
  }
  BIT32_SET(s->attrs_seen, code);

  /* Every table slot is expected to have been registered (nonzero id) */
  ASSERT_DIE(bgp_attr_table[code].id);
  const union bgp_attr_desc *desc = &bgp_attr_table[code];

  /* Handle conflicting flags; RFC 7606 3 (c) */
  /* Only the optional and transitive bits are compared; descriptors with
     BAF_DECODE_FLAGS set are exempt from this check */
  if (((flags ^ desc->flags) & (BAF_OPTIONAL | BAF_TRANSITIVE)) &&
      !(desc->flags & BAF_DECODE_FLAGS))
    WITHDRAW("Malformed %s attribute - conflicting flags (%02x, expected %02x)", desc->name, flags, desc->flags);

  desc->decode(s, code, flags, data, len, to);
}

/**
 * bgp_decode_attrs - check and decode BGP attributes
 * @s: BGP parse state
 * @data: start of attribute block
 * @len: length of attribute block
 *
 * This function takes a BGP attribute block (a part of an Update message),
 * checks its consistency and converts it to a list of BIRD route attributes
 * (an &ea_list).
 *
 * Result: the decoded attribute list, or NULL when there is no reachability
 * NLRI, when the update has to be treated as withdraw (s->err_withdraw is
 * set in that case), or when a routing loop was detected (s->err_withdraw is
 * left untouched).
 */
ea_list *
bgp_decode_attrs(struct bgp_parse_state *s, byte *data, uint len)
{
  struct bgp_proto *p = s->proto;
  ea_list *attrs = NULL;
  uint alen;
  byte code, flags;
  byte *pos = data;

  /* Parse the attributes */
  while (len)
  {
    /* Reset so that the framing error report never shows a stale length */
    alen = 0;

    /* Read attribute type */
    if (len < 2)
      goto framing_error;
    flags = pos[0];
    code = pos[1];
    ADVANCE(pos, len, 2);

    /* Read attribute length */
    /* Two-byte length with the extended-length flag, one byte otherwise */
    if (flags & BAF_EXT_LEN)
    {
      if (len < 2)
	goto framing_error;
      alen = get_u16(pos);
      ADVANCE(pos, len, 2);
    }
    else
    {
      if (len < 1)
	goto framing_error;
      alen = *pos;
      ADVANCE(pos, len, 1);
    }

    /* The payload must fit into the rest of the block */
    if (alen > len)
      goto framing_error;

    DBG("Attr %02x %02x %u\n", code, flags, alen);

    bgp_decode_attr(s, code, flags, pos, alen, &attrs);
    ADVANCE(pos, len, alen);
  }

  /* Some attribute handler requested treat-as-withdraw */
  if (s->err_withdraw)
    goto withdraw;

  /* If there is no reachability NLRI, we are finished */
  if (!s->ip_reach_len && !s->mp_reach_len)
    return NULL;


  /* Handle missing mandatory attributes; RFC 7606 3 (d) */
  if (!BIT32_TEST(s->attrs_seen, BA_ORIGIN))
  { REPORT(NO_MANDATORY, "ORIGIN"); goto withdraw; }

  if (!BIT32_TEST(s->attrs_seen, BA_AS_PATH))
  { REPORT(NO_MANDATORY, "AS_PATH"); goto withdraw; }

  /* NEXT_HOP is required only when the classic (non-MP) NLRI is present */
  if (s->ip_reach_len && !BIT32_TEST(s->attrs_seen, BA_NEXT_HOP))
  { REPORT(NO_MANDATORY, "NEXT_HOP"); goto withdraw; }

  /* When receiving attributes from non-AS4-aware BGP speaker, we have to
     reconstruct AS_PATH and AGGREGATOR attributes; RFC 6793 4.2.3 */
  if (!p->as4_session)
    bgp_process_as4_attrs(&attrs, s->pool);

  /* Reject routes with our ASN in AS_PATH attribute */
  if (bgp_as_path_loopy(p, attrs, p->local_as))
    goto loop;

  /* Reject routes with our Confederation ID in AS_PATH attribute; RFC 5065 4.0 */
  if ((p->public_as != p->local_as) && bgp_as_path_loopy(p, attrs, p->public_as))
    goto loop;

  /* Reject routes with our Router ID in ORIGINATOR_ID attribute; RFC 4456 8 */
  if (p->is_internal && bgp_originator_id_loopy(p, attrs))
    goto loop;

  /* Reject routes with our Cluster ID in CLUSTER_LIST attribute; RFC 4456 8 */
  if (p->rr_client && bgp_cluster_list_loopy(p, attrs))
    goto loop;

  /* If there is no local preference, define one */
  if (!BIT32_TEST(s->attrs_seen, BA_LOCAL_PREF))
    bgp_set_attr_u32(&attrs, BA_LOCAL_PREF, 0, p->cf->default_local_pref);

  return attrs;


framing_error:
  /* RFC 7606 4 - handle attribute framing errors */
  REPORT("Malformed attribute list - framing error (%u/%u) at %d",
	 alen, len, (int) (pos - s->attrs));

  /* A framing error continues into the withdraw handling below */
withdraw:
  /* RFC 7606 5.2 - handle missing NLRI during errors */
  if (!s->ip_reach_len && !s->mp_reach_len)
    bgp_parse_error(s, 1);

  s->err_withdraw = 1;
  return NULL;

loop:
  /* Loops are handled as withdraws, but ignored silently. Do not set err_withdraw. */
  return NULL;
}

/*
 * Final per-channel checks on decoded attributes in @to: drop AIGP received
 * on a channel without AIGP enabled and apply the OTC ingress procedure
 * when roles are applicable to the channel.
 * NOTE(review): WITHDRAW() is a macro defined outside this chunk; it
 * presumably leaves the function - confirm.
 */
void
bgp_finish_attrs(struct bgp_parse_state *s, ea_list **to)
{
  /* AIGP test here instead of in bgp_decode_aigp() - we need to know channel */
  if (BIT32_TEST(s->attrs_seen, BA_AIGP) && !s->channel->cf->aigp)
  {
    REPORT("Discarding AIGP attribute received on non-AIGP session");
    bgp_unset_attr(to, BA_AIGP);
  }

  /* Handle OTC ingress procedure, RFC 9234 */
  if (bgp_channel_is_role_applicable(s->channel))
  {
    struct bgp_proto *p = s->proto;
    eattr *e = bgp_find_attr(*to, BA_ONLY_TO_CUSTOMER);

    /* Reject routes from downstream if they are leaked */
    if (e && (p->cf->local_role == BGP_ROLE_PROVIDER ||
	      p->cf->local_role == BGP_ROLE_RS_SERVER))
      WITHDRAW("Route leak detected - OTC attribute from downstream");

    /* Reject routes from peers if they are leaked */
    if (e && (p->cf->local_role == BGP_ROLE_PEER) && (e->u.data != p->cf->remote_as))
      WITHDRAW("Route leak detected - OTC attribute with mismatched ASN (%u)",
	       (uint) e->u.data);

    /* Mark routes from upstream if that has not happened before */
    if (!e && (p->cf->local_role == BGP_ROLE_CUSTOMER ||
	       p->cf->local_role == BGP_ROLE_PEER ||
	       p->cf->local_role == BGP_ROLE_RS_CLIENT))
      bgp_set_attr_u32(to, BA_ONLY_TO_CUSTOMER, 0, p->cf->remote_as);
  }
}


/*
 *	Route bucket hash table
 */

/* Bucket key: the attribute list together with its stored hash */
#define RBH_KEY(b)		b->eattrs, b->hash
/* Link to the next bucket in a hash chain */
#define RBH_NEXT(b)		b->next
/* Compare the stored hashes first, the attribute lists only when they match */
#define RBH_EQ(a1,h1,a2,h2)	h1 == h2 && ea_same(a1, a2)
/* The stored hash value is used as the hash function result as it is */
#define RBH_FN(a,h)		h

#define RBH_REHASH		bgp_rbh_rehash
/* NOTE(review): parameter tuple consumed by the generic HASH_* macros
   (defined outside this file) - check there for the meaning of each value */
#define RBH_PARAMS		/8, *2, 2, 2, 12, 20


/* Defines the rehash function named by RBH_REHASH for struct bgp_bucket */
HASH_DEFINE_REHASH_FN(RBH, struct bgp_bucket)

/*
 * Prepare the bucket bookkeeping of pending TX state @c: an empty send
 * queue, no withdraw bucket yet, and a freshly initialized bucket hash.
 */
static void
bgp_init_bucket_table(struct bgp_pending_tx *c)
{
  c->withdraw_bucket = NULL;
  init_list(&c->bucket_queue);

  HASH_INIT(c->bucket_hash, c->pool, 8);
}

/*
 * Find the bucket holding an attribute list equal to @new, or create one.
 * A new bucket is allocated from c->pool with room for a private copy of
 * the attribute list and is inserted into the bucket hash.
 */
static struct bgp_bucket *
bgp_get_bucket(struct bgp_pending_tx *c, ea_list *new)
{
  u32 hash = ea_hash(new);
  struct bgp_bucket *b = HASH_FIND(c->bucket_hash, RBH, new, hash);

  if (!b)
  {
    /* Space for the attribute list copy, rounded up to CPU alignment */
    uint ea_size = BIRD_CPU_ALIGN(ea_list_size(new));

    /* The bucket and its attribute list live in one allocation */
    b = mb_alloc(c->pool, sizeof(struct bgp_bucket) + ea_size);
    *b = (struct bgp_bucket) { };
    b->hash = hash;
    init_list(&b->prefixes);

    /* Keep our own copy of the attributes */
    ea_list_copy(b->eattrs, new, ea_size);

    /* Make the bucket reachable for later lookups */
    HASH_INSERT2(c->bucket_hash, RBH, c->pool, b);
  }

  return b;
}

/*
 * Return the withdraw bucket of @c, allocating it (zeroed, with an empty
 * prefix list) on first use.
 */
static struct bgp_bucket *
bgp_get_withdraw_bucket(struct bgp_pending_tx *c)
{
  if (c->withdraw_bucket)
    return c->withdraw_bucket;

  c->withdraw_bucket = mb_allocz(c->pool, sizeof(struct bgp_bucket));
  init_list(&c->withdraw_bucket->prefixes);

  return c->withdraw_bucket;
}

/* Unlink bucket @b from the bucket hash of @c and release its memory */
static void
bgp_free_bucket(struct bgp_pending_tx *c, struct bgp_bucket *b)
{
  /* Remove from the hash first, while the bucket is still valid */
  HASH_REMOVE2(c->bucket_hash, RBH, c->pool, b);
  mb_free(b);
}

/*
 * Clean up bucket @b once it may have been drained. A bucket with no queued
 * prefixes is taken off the send queue; it is freed only if, in addition,
 * its px_uc counter is zero. The withdraw bucket is never touched.
 *
 * Returns 1 when the bucket was freed, 0 otherwise.
 */
int
bgp_done_bucket(struct bgp_channel *bc, struct bgp_bucket *b)
{
  struct bgp_pending_tx *c = bc->ptx;

  /* The withdraw bucket is permanent */
  if (b == c->withdraw_bucket)
    return 0;

  int drained = EMPTY_LIST(b->prefixes);

  /* Nothing left to send -> off the send queue */
  if (drained)
    rem_node(&b->send_node);

  /* Still queued prefixes, or still counted in px_uc -> keep it */
  if (!drained || b->px_uc)
    return 0;

  bgp_free_bucket(c, b);
  return 1;
}

/* Move bucket @b from its current position to the tail of the send queue */
void
bgp_defer_bucket(struct bgp_channel *bc, struct bgp_bucket *b)
{
  rem_node(&b->send_node);
  add_tail(&bc->ptx->bucket_queue, &b->send_node);
}

/*
 * Give up on sending bucket @b (its attribute list is too long): log the
 * problem and requeue every prefix of @b to the withdraw bucket instead.
 *
 * Each moved prefix gets its px->cur pointer redirected to the withdraw
 * bucket as well. Elsewhere in this file px->cur always names the bucket
 * whose prefix list the prefix is linked in (bgp_update_prefix() sets it on
 * enqueue, bgp_done_prefix() asserts it), and @b may be freed by the caller
 * once it is empty, so leaving px->cur pointing at @b would both break that
 * assertion and leave a dangling pointer.
 */
void
bgp_withdraw_bucket(struct bgp_channel *bc, struct bgp_bucket *b)
{
  struct bgp_proto *p = (void *) bc->c.proto;
  struct bgp_pending_tx *c = bc->ptx;
  struct bgp_bucket *wb = bgp_get_withdraw_bucket(c);

  log(L_ERR "%s: Attribute list too long", p->p.name);
  while (!EMPTY_LIST(b->prefixes))
  {
    struct bgp_prefix *px = HEAD(b->prefixes);

    log(L_ERR "%s: - withdrawing %N", p->p.name, px->net);

    /* Relink the prefix and keep px->cur in sync with the list it is on */
    rem_node(&px->buck_node_xx);
    add_tail(&wb->prefixes, &px->buck_node_xx);
    px->cur = wb;
  }
}


/*
 *	Prefix hash table
 */

/* Prefix key: network, path ID and the stored hash */
#define PXH_KEY(px)		px->net, px->path_id, px->hash
/* Link to the next prefix in a hash chain */
#define PXH_NEXT(px)		px->next
/* Path IDs are compared only when add_path_tx is nonzero; note that
   add_path_tx is not a macro argument and must be in scope at the place
   where the macro gets expanded */
#define PXH_EQ(n1,i1,h1,n2,i2,h2) h1 == h2 && (add_path_tx ? (i1 == i2) : 1) && net_equal(n1, n2)
/* The stored hash value is used as the hash function result as it is */
#define PXH_FN(n,i,h)		h

#define PXH_REHASH		bgp_pxh_rehash
/* NOTE(review): parameter tuple consumed by the generic HASH_* macros
   (defined outside this file) - check there for the meaning of each value */
#define PXH_PARAMS		/8, *2, 2, 2, 12, 24


/* Defines the rehash function named by PXH_REHASH for struct bgp_prefix */
HASH_DEFINE_REHASH_FN(PXH, struct bgp_prefix)

/*
 * Initialize the prefix hash of the channel's pending TX state. When the
 * channel's net type has a nonzero entry in net_addr_length, a slab sized
 * for struct bgp_prefix plus that address length is created; otherwise no
 * slab is used and prefixes are allocated individually.
 */
static void
bgp_init_prefix_table(struct bgp_channel *bc)
{
  struct bgp_pending_tx *c = bc->ptx;
  uint alen = net_addr_length[bc->c.net_type];

  HASH_INIT(c->prefix_hash, c->pool, 8);

  if (alen)
    c->prefix_slab = sl_new(c->pool, sizeof(struct bgp_prefix) + alen);
  else
    c->prefix_slab = NULL;
}

/*
 * Find the prefix record for @net (and, when @add_path_tx is set, for the
 * path ID of @src) in the prefix hash of @c, or create it.
 * A newly created record stores the path ID of @src and takes a lock on
 * the source; the lock is released in bgp_free_prefix().
 */
static struct bgp_prefix *
bgp_get_prefix(struct bgp_pending_tx *c, const net_addr *net, struct rte_src *src, int add_path_tx)
{
  u32 path_id = src->global_id;

  /* Without ADD-PATH the path ID takes no part in hashing and lookup */
  u32 path_id_hash = add_path_tx ? path_id : 0;

  /* We must use a different hash function than the rtable */
  u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id_hash));

  struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id_hash, hash);

  if (!px)
  {
    /* Slab when available, plain block sized by the actual address otherwise */
    px = c->prefix_slab
      ? sl_alloc(c->prefix_slab)
      : mb_alloc(c->pool, sizeof(struct bgp_prefix) + net->length);

    *px = (struct bgp_prefix) { };
    px->path_id = path_id;
    px->hash = hash;
    net_copy(px->net, net);
    rt_lock_source(src);

    HASH_INSERT2(c->prefix_hash, PXH, c->pool, px);
  }

  return px;
}

static void bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px);

/*
 * bgp_update_prefix - queue prefix @px for sending with bucket @b
 *
 * The prefix is unlinked from the bucket it is currently queued in (if any)
 * and linked to @b, unless @b is what is already queued, or what was sent
 * last (with an out table, a withdraw for a prefix never sent counts as
 * such too). In the latter case nothing is queued at all and a prefix
 * record without any sent version is released.
 *
 * Returns 1 when the prefix got queued for @b, 0 when nothing is to be sent.
 * After a 0 return the caller must not touch @px, it may have been freed.
 *
 * The IS_WITHDRAW_BUCKET() helper macro defined here stays defined for the
 * rest of the file; it expects a channel variable named `c' in scope.
 */
static inline int
bgp_update_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *b)
{
#define IS_WITHDRAW_BUCKET(b)	((b) == c->ptx->withdraw_bucket)
#define BPX_TRACE(what)	do { \
  if (c->c.debug & D_ROUTES) log(L_TRACE "%s.%s < %s %N %uG %s", \
      c->c.proto->name, c->c.name, what, \
      px->net, px->path_id, IS_WITHDRAW_BUCKET(b) ? "withdraw" : "update"); } while (0)
  px->lastmod = current_time();

  /* Already queued for the same bucket */
  if (px->cur == b)
  {
    BPX_TRACE("already queued");
    return 0;
  }

  /* Unqueue from the old bucket */
  if (px->cur)
  {
    rem_node(&px->buck_node_xx);
    bgp_done_bucket(c, px->cur);
  }

  /* The new bucket is the same as we sent before */
  if ((px->last == b) || (c->c.out_table && !px->last && IS_WITHDRAW_BUCKET(b)))
  {
    /* px->cur is only tested here, the old bucket may be gone already */
    if (px->cur)
      BPX_TRACE("reverted");
    else
      BPX_TRACE("already sent");

    /* Nothing is queued now. This has to be stored before the prefix is
       possibly freed below, writing to it afterwards is a use-after-free. */
    px->cur = NULL;

    /* Well, we haven't sent anything yet */
    if (!px->last)
      bgp_free_prefix(c->ptx, px);

    return 0;
  }

  /* Enqueue the bucket if it has been empty */
  if (!IS_WITHDRAW_BUCKET(b) && EMPTY_LIST(b->prefixes))
    add_tail(&c->ptx->bucket_queue, &b->send_node);

  /* Enqueue to the new bucket and indicate the change */
  add_tail(&b->prefixes, &px->buck_node_xx);
  px->cur = b;

  BPX_TRACE("queued");
  return 1;

#undef BPX_TRACE
}

/*
 * Release prefix record @px: unlink it from the prefix hash, drop the lock
 * on the route source found by its path ID and free the memory the same way
 * it was allocated (slab or plain block).
 */
static void
bgp_free_prefix(struct bgp_pending_tx *c, struct bgp_prefix *px)
{
  HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px);

  rt_unlock_source(rt_find_source_global(px->path_id));

  if (!c->prefix_slab)
    mb_free(px);
  else
    sl_free(px);
}

void
bgp_done_prefix(struct bgp_channel *c, struct bgp_prefix *px, struct bgp_bucket *buck)
{
  /* Cleanup: We're called from bucket senders. */
  ASSERT_DIE(px->cur == buck);
  rem_node(&px->buck_node_xx);

  /* We may want to store the updates */
  if (c->c.out_table)
  {
    /* Nothing to be sent right now */
    px->cur = NULL;

    /* Unref the previous sent version */
    if (px->last)
      px->last->px_uc--;

    /* Ref the current sent version */
    if (!IS_WITHDRAW_BUCKET(buck))
    {
      px->last = buck;
      px->last->px_uc++;
      return;
    }

    /* Prefixes belonging to the withdraw bucket are freed always */
  }

  bgp_free_prefix(c->ptx, px);
}

/*
 * Resource free hook of the pending TX structure: release the route source
 * lock held for every prefix still present in the prefix hash.
 */
static void
bgp_pending_tx_rfree(resource *r)
{
  /* Recover the enclosing structure from its embedded resource header */
  struct bgp_pending_tx *ptx = SKIP_BACK(struct bgp_pending_tx, r, r);

  /* Counterpart of rt_lock_source() done when a prefix record is created */
  HASH_WALK(ptx->prefix_hash, next, n)
    rt_unlock_source(rt_find_source_global(n->path_id));
  HASH_WALK_END;
}

/* Resource dump hook: prints just a newline, no details of the structure */
static void bgp_pending_tx_dump(resource *r UNUSED) { debug("\n"); }

/* Resource class of struct bgp_pending_tx, used by ralloc() in bgp_init_pending_tx() */
static struct resclass bgp_pending_tx_class = {
  .name = "BGP Pending TX",
  .size = sizeof(struct bgp_pending_tx),
  .free = bgp_pending_tx_rfree,
  .dump = bgp_pending_tx_dump,
};

/*
 * Create the pending TX state of channel @c. It gets a pool of its own
 * (a child of the channel pool) that holds the structure itself and is
 * remembered in it; bucket and prefix tables are initialized afterwards.
 * The channel must not have a pending TX state yet.
 */
void
bgp_init_pending_tx(struct bgp_channel *c)
{
  ASSERT_DIE(!c->ptx);

  pool *tx_pool = rp_new(c->pool, "BGP Pending TX");

  c->ptx = ralloc(tx_pool, &bgp_pending_tx_class);
  c->ptx->pool = tx_pool;

  bgp_init_bucket_table(c->ptx);
  bgp_init_prefix_table(c);
}

void
bgp_free_pending_tx(struct bgp_channel *c)
{
  ASSERT_DIE(c->ptx);
  ASSERT_DIE(c->ptx->pool);

  rfree(c->ptx->pool);
  c->ptx = NULL;
}


/*
 *	Prefix hash table exporter
 */

/* Export hook of the prefix hash exporter; the generic hook has to stay
   accessible as member h, SKIP_BACK() is used to get from it to this structure */
struct bgp_out_export_hook {
  struct rt_export_hook h;
  u32 hash_iter;				/* Iterator over hash */
};

/*
 * Feed event of the prefix hash exporter (installed as the hook event in
 * bgp_out_table_export_start()). Walks the prefix hash of the channel,
 * builds a temporary route for every prefix selected by the address mode of
 * the request and hands it to the export callback of the request.
 * When the walk leaves a nonzero iterator behind, the event reschedules
 * itself; otherwise the hook is switched to TES_READY.
 */
static void
bgp_out_table_feed(void *data)
{
  struct bgp_out_export_hook *hook = data;
  struct bgp_channel *bc = SKIP_BACK(struct bgp_channel, prefix_exporter, hook->h.table);
  struct bgp_pending_tx *c = bc->ptx;

  /* Budget of prefixes for one run, applies to multi-net exports only */
  int max = 512;

  /* neq: the exact net to export (given by the request, or found in the
     first pass of TE_ADDR_FOR); cand: the best TE_ADDR_FOR candidate so far */
  const net_addr *neq = (hook->h.req->addr_mode == TE_ADDR_EQUAL) ? hook->h.req->addr : NULL;
  const net_addr *cand = NULL;

  /* The walk is repeated only for TE_ADDR_FOR: the first pass picks the
     candidate, the second one exports the prefixes equal to it */
  do {
    HASH_WALK_ITER(c->prefix_hash, PXH, n, hook->hash_iter)
    {
      switch (hook->h.req->addr_mode)
      {
	case TE_ADDR_IN:
	  if (!net_in_netX(n->net, hook->h.req->addr))
	    continue;
	  /* fall through */
	case TE_ADDR_NONE:
	  /* Splitting only for multi-net exports */
	  if (--max <= 0)
	    HASH_WALK_ITER_PUT;
	  break;

	case TE_ADDR_FOR:
	  if (!neq)
	  {
	    /* First pass: remember the longest net containing the requested address */
	    if (net_in_netX(hook->h.req->addr, n->net) && (!cand || (n->net->length > cand->length)))
	      cand = n->net;
	    continue;
	  }
	  /* fall through */
	case TE_ADDR_EQUAL:
	  if (!net_equal(n->net, neq))
	    continue;
	  break;
      }

      /* The queued version is preferred to the last sent one */
      struct bgp_bucket *buck = n->cur ?: n->last;
      ea_list *ea = NULL;
      if (buck == c->withdraw_bucket)
	ea_set_dest(&ea, 0, RTD_UNREACHABLE);
      else
      {
	/* Bucket attributes plus a next hop attribute made of the first
	   address stored in BGP NEXT_HOP */
	ea = buck->eattrs;
	eattr *eanh = bgp_find_attr(ea, BA_NEXT_HOP);
	ASSERT_DIE(eanh);
	const ip_addr *nh = (const void *) eanh->u.ptr->data;

	struct nexthop_adata nhad = {
	  .ad = { .length = sizeof (struct nexthop_adata) - sizeof (struct adata), },
	  .nh = { .gw = nh[0], },
	};

	ea_set_attr(&ea, EA_LITERAL_DIRECT_ADATA(&ea_gen_nexthop, 0, tmp_copy_adata(&nhad.ad)));
      }

      /* Temporary route, marked as pending while the prefix is still queued */
      struct rte_storage es = {
	.rte = {
	  .attrs = ea,
	  .net = n->net,
	  .src = rt_find_source_global(n->path_id),
	  .sender = NULL,
	  .lastmod = n->lastmod,
	  .flags = n->cur ? REF_PENDING : 0,
	},
      };

      struct rt_pending_export rpe = {
	.new = &es, .new_best = &es,
      };

      /* The bulk callback takes precedence, it gets a one-route feed */
      if (hook->h.req->export_bulk)
      {
	rte *feed = &es.rte;
	hook->h.req->export_bulk(hook->h.req, n->net, &rpe, &feed, 1);
      }
      else if (hook->h.req->export_one)
	hook->h.req->export_one(hook->h.req, n->net, &rpe);
      else
	bug("No export method in export request");
    }
    HASH_WALK_ITER_END;

    /* A candidate (if any) becomes the net to export in the next pass */
    neq = cand;
    cand = NULL;
  } while (neq);

  /* NOTE(review): a nonzero iterator is taken as "walk suspended by
     HASH_WALK_ITER_PUT" - confirm against the hash macros */
  if (hook->hash_iter)
    ev_schedule_work(&hook->h.event);
  else
    rt_set_export_state(&hook->h, TES_READY);
}

/*
 * Exporter class hook: start serving export request @req from exporter @re.
 * Allocates a hook big enough for struct bgp_out_export_hook, binds it with
 * the request, installs bgp_out_table_feed() as its event hook and lets the
 * generic code initialize the export.
 */
static void
bgp_out_table_export_start(struct rt_exporter *re, struct rt_export_request *req)
{
  /* Bind the request and its new hook together, both ways */
  req->hook = rt_alloc_export(re, sizeof(struct bgp_out_export_hook));
  req->hook->req = req;

  struct bgp_out_export_hook *feeder = SKIP_BACK(struct bgp_out_export_hook, h, req->hook);
  feeder->h.event.hook = bgp_out_table_feed;

  rt_init_export(re, req->hook);
}

/*
 * Exporter class hook: the export is over. The request and its stopped
 * callback are read from the hook before rt_export_stopped() is called on
 * it; the callback (if any) is invoked afterwards.
 */
static void
bgp_out_table_export_done(void *data)
{
  struct bgp_out_export_hook *hook = data;

  /* Read out everything needed first, the hook is not used after
     rt_export_stopped() */
  void (*on_stopped)(struct rt_export_request *) = hook->h.stopped;
  struct rt_export_request *request = hook->h.req;

  rt_export_stopped(&hook->h);
  CALL(on_stopped, request);
}

/* Exporter class of the prefix hash exporter, see bgp_setup_out_table() */
static const struct rt_exporter_class bgp_out_table_export_class = {
  .start = bgp_out_table_export_start,
  .done = bgp_out_table_export_done,
};

/*
 * Set up the prefix exporter embedded in channel @c and publish it as the
 * out table of the channel. The channel must not have an out table yet.
 */
void
bgp_setup_out_table(struct bgp_channel *c)
{
  ASSERT_DIE(c->c.out_table == NULL);

  struct rt_exporter *exporter = &c->prefix_exporter;

  *exporter = (struct rt_exporter) {
    .class = &bgp_out_table_export_class,
    .addr_type = c->c.table->addr_type,
    .rp = c->c.proto->pool,
  };

  rt_exporter_init(exporter);

  c->c.out_table = exporter;
}


/*
 *	BGP protocol glue
 */

/*
 * Pre-export check of route @e for channel @C, done before export filters.
 *
 * Returns -1 to reject the route and 0 to let it continue: our own routes,
 * flowspec routes that failed validation, IBGP routes that must not be
 * reflected, routes carrying well-known communities that forbid the export
 * and OTC-marked routes heading upstream are rejected.
 */
int
bgp_preexport(struct channel *C, rte *e)
{
  struct bgp_channel *c = (struct bgp_channel *) C;
  struct bgp_proto *p = (struct bgp_proto *) C->proto;
  struct bgp_proto *src = bgp_rte_proto(e);

  /* Reject our routes */
  if (src == p)
    return -1;

  /* Accept non-BGP routes */
  if (!src)
    return 0;

  /* Reject flowspec that failed validation */
  if (net_is_flow(e->net))
  {
    switch (rt_get_flowspec_valid(e))
    {
      case FLOWSPEC_INVALID:
	return -1;

      case FLOWSPEC_VALID:
	break;

      case FLOWSPEC_UNKNOWN:
	ASSUME((rt_get_source_attr(e) != RTS_BGP) ||
	    !((struct bgp_channel *) SKIP_BACK(struct channel, in_req, e->sender->req))->base_table);
	break;

      case FLOWSPEC__MAX:
	bug("This never happens.");
    }
  }

  /* IBGP route reflection, RFC 4456 */
  if (p->is_internal && src->is_internal && (p->local_as == src->local_as))
  {
    /* Rejected unless configured as route reflector */
    if (!(p->rr_client || src->rr_client))
      return -1;

    /* Generally, this should be handled when path is received, but we check it
       also here as rr_cluster_id may be undefined or different in src. */
    if (p->rr_cluster_id && bgp_cluster_list_loopy(p, e->attrs))
      return -1;
  }

  /* Handle well-known communities, RFC 1997 */
  if (p->cf->interpret_communities)
  {
    eattr *comm = bgp_find_attr(e->attrs, BA_COMMUNITY);

    if (comm)
    {
      const struct adata *set = comm->u.ptr;

      /* Do not export anywhere */
      if (int_set_contains(set, BGP_COMM_NO_ADVERTISE))
	return -1;

      /* Do not export outside of AS (or member-AS) */
      if (!p->is_internal && int_set_contains(set, BGP_COMM_NO_EXPORT_SUBCONFED))
	return -1;

      /* Do not export outside of AS (or confederation) */
      if (!p->is_interior && int_set_contains(set, BGP_COMM_NO_EXPORT))
	return -1;

      /* Do not export LLGR_STALE routes to LLGR-ignorant peers */
      if (!p->conn->remote_caps->llgr_aware && int_set_contains(set, BGP_COMM_LLGR_STALE))
	return -1;
    }
  }

  /* Do not export routes marked with OTC to upstream, RFC 9234 */
  if (bgp_channel_is_role_applicable(c))
  {
    eattr *otc = bgp_find_attr(e->attrs, BA_ONLY_TO_CUSTOMER);

    if (otc && (p->cf->local_role==BGP_ROLE_CUSTOMER ||
		p->cf->local_role==BGP_ROLE_PEER ||
		p->cf->local_role==BGP_ROLE_RS_CLIENT))
      return -1;
  }

  return 0;
}

/*
 * Prepare attributes of route @e for sending on channel @c of protocol @p.
 *
 * @attrs0 is the attribute list of the route as received by this function;
 * lookups of the original values are done in @attrs0, while all changes are
 * accumulated in the local list `attrs' (which starts as @attrs0). Temporary
 * data are allocated from @pool.
 *
 * Result: whatever bgp_export_attrs() returns for the updated list, i.e. the
 * exported attribute list, or NULL if attributes are unsuitable.
 */
static ea_list *
bgp_update_attrs(struct bgp_proto *p, struct bgp_channel *c, rte *e, ea_list *attrs0, struct linpool *pool)
{
  /* src is the BGP protocol the route came from, NULL for non-BGP routes */
  struct bgp_proto *src = bgp_rte_proto(e);
  struct bgp_export_state s = { .proto = p, .channel = c, .pool = pool, .src = src, .route = e, .mpls = c->desc->mpls };
  ea_list *attrs = attrs0;
  eattr *a;
  const adata *ad;

  /* ORIGIN attribute - mandatory, attach if missing */
  if (! bgp_find_attr(attrs0, BA_ORIGIN))
    bgp_set_attr_u32(&attrs, BA_ORIGIN, 0, src ? ORIGIN_INCOMPLETE : ORIGIN_IGP);

  /* AS_PATH attribute - mandatory */
  a = bgp_find_attr(attrs0, BA_AS_PATH);
  ad = a ? a->u.ptr : &null_adata;

  /* AS_PATH attribute - strip AS_CONFED* segments outside confederation */
  if ((!p->cf->confederation || !p->is_interior) && as_path_contains_confed(ad))
    ad = as_path_strip_confed(pool, ad);

  /* AS_PATH attribute - keep or prepend ASN */
  if (p->is_internal || p->rs_client)
  {
    /* IBGP or route server -> just ensure there is one */
    /* NOTE(review): a path stripped of confederation segments above is not
       stored in this branch, the original attribute stays - confirm intended */
    if (!a)
      bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, &null_adata);
  }
  else if (p->is_interior)
  {
    /* Confederation -> prepend ASN as AS_CONFED_SEQUENCE */
    ad = as_path_prepend2(pool, ad, AS_PATH_CONFED_SEQUENCE, p->public_as);
    bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, ad);
  }
  else /* Regular EBGP (no RS, no confederation) */
  {
    /* Regular EBGP -> prepend ASN as regular sequence */
    ad = as_path_prepend2(pool, ad, AS_PATH_SEQUENCE, p->public_as);
    bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, ad);

    /* MULTI_EXIT_DISC attribute - accept only if set in export filter */
    a = bgp_find_attr(attrs0, BA_MULTI_EXIT_DISC);
    if (a && !(a->fresh))
      bgp_unset_attr(&attrs, BA_MULTI_EXIT_DISC);
  }

  /* NEXT_HOP attribute - delegated to AF-specific hook */
  a = bgp_find_attr(attrs0, BA_NEXT_HOP);
  bgp_update_next_hop(&s, a, &attrs);

  /* LOCAL_PREF attribute - required for IBGP, attach if missing */
  if (p->is_interior && ! bgp_find_attr(attrs0, BA_LOCAL_PREF))
    bgp_set_attr_u32(&attrs, BA_LOCAL_PREF, 0, p->cf->default_local_pref);

  /* AIGP attribute - accumulate local metric or originate new one */
  /* Done only when s.local_next_hop is set (presumably by the next hop
     update above - confirm); origination needs the aigp_originate option */
  u64 metric;
  if (s.local_next_hop &&
      (bgp_total_aigp_metric_(e, &metric, &ad) ||
       (c->cf->aigp_originate && bgp_init_aigp_metric(e, &metric, &ad))))
  {
    ad = bgp_aigp_set_metric(pool, ad, metric);
    bgp_set_attr_ptr(&attrs, BA_AIGP, 0, ad);
  }

  /* IBGP route reflection, RFC 4456 */
  if (src && src->is_internal && p->is_internal && (src->local_as == p->local_as))
  {
    /* ORIGINATOR_ID attribute - attach if not already set */
    if (! bgp_find_attr(attrs0, BA_ORIGINATOR_ID))
      bgp_set_attr_u32(&attrs, BA_ORIGINATOR_ID, 0, src->remote_id);

    /* CLUSTER_LIST attribute - prepend cluster ID */
    a = bgp_find_attr(attrs0, BA_CLUSTER_LIST);
    ad = a ? a->u.ptr : NULL;

    /* Prepend src cluster ID */
    if (src->rr_cluster_id)
      ad = int_set_prepend(pool, ad, src->rr_cluster_id);

    /* Prepend dst cluster ID if src and dst clusters are different */
    if (p->rr_cluster_id && (src->rr_cluster_id != p->rr_cluster_id))
      ad = int_set_prepend(pool, ad, p->rr_cluster_id);

    /* Should be at least one prepended cluster ID */
    bgp_set_attr_ptr(&attrs, BA_CLUSTER_LIST, 0, ad);
  }

  /* AS4_* transition attributes, RFC 6793 4.2.2 */
  /* These lookups are done in the updated list (attrs), not in attrs0.
     Note that `a' is used again after the first bgp_set_attr_ptr() call in
     each pair below; this relies on the call leaving the previously found
     attribute intact - confirm against bgp_set_attr_ptr(). */
  if (! p->as4_session)
  {
    a = bgp_find_attr(attrs, BA_AS_PATH);
    if (a && as_path_contains_as4(a->u.ptr))
    {
      bgp_set_attr_ptr(&attrs, BA_AS_PATH, 0, as_path_to_old(pool, a->u.ptr));
      bgp_set_attr_ptr(&attrs, BA_AS4_PATH, 0, as_path_strip_confed(pool, a->u.ptr));
    }

    a = bgp_find_attr(attrs, BA_AGGREGATOR);
    if (a && aggregator_contains_as4(a->u.ptr))
    {
      bgp_set_attr_ptr(&attrs, BA_AGGREGATOR, 0, aggregator_to_old(pool, a->u.ptr));
      bgp_set_attr_ptr(&attrs, BA_AS4_AGGREGATOR, 0, a->u.ptr);
    }
  }

  /* Mark routes for downstream with OTC, RFC 9234 */
  if (bgp_channel_is_role_applicable(c))
  {
    a = bgp_find_attr(attrs, BA_ONLY_TO_CUSTOMER);
    if (!a && (p->cf->local_role == BGP_ROLE_PROVIDER ||
	       p->cf->local_role == BGP_ROLE_PEER ||
	       p->cf->local_role == BGP_ROLE_RS_SERVER))
      bgp_set_attr_u32(&attrs, BA_ONLY_TO_CUSTOMER, 0, p->public_as);
  }

  /*
   * Presence of mandatory attributes ORIGIN and AS_PATH is ensured by above
   * conditions. Presence and validity of quasi-mandatory NEXT_HOP attribute
   * should be checked in AF-specific hooks.
   */

  /* Apply per-attribute export hooks for validation and normalization */
  return bgp_export_attrs(&s, attrs);
}

/*
 * Route notification hook: net @n is announced with route @new, or
 * withdrawn (@new is NULL, @old is the route going away).
 *
 * An announcement is queued in the bucket matching its updated attributes;
 * a withdraw, or an announcement whose attributes turned out to be invalid,
 * goes to the withdraw bucket. An UPDATE packet is scheduled when the
 * prefix really got queued.
 */
void
bgp_rt_notify(struct proto *P, struct channel *C, const net_addr *n, rte *new, const rte *old)
{
  struct bgp_proto *p = (void *) P;
  struct bgp_channel *c = (void *) C;
  struct bgp_bucket *buck;
  struct rte_src *path;

  if (!new)
  {
    buck = bgp_get_withdraw_bucket(c->ptx);
    path = old->src;
  }
  else
  {
    struct ea_list *attrs = bgp_update_attrs(p, c, new, new->attrs, tmp_linpool);

    if (attrs)
      buck = bgp_get_bucket(c->ptx, attrs);
    else
    {
      /* Error during attribute processing, we fail back to withdraw */
      log(L_ERR "%s: Invalid route %N withdrawn", p->p.name, n);
      buck = bgp_get_withdraw_bucket(c->ptx);
    }

    path = new->src;
  }

  struct bgp_prefix *px = bgp_get_prefix(c->ptx, n, path, c->add_path_tx);

  if (bgp_update_prefix(c, px, buck))
    bgp_schedule_packet(p->conn, c, PKT_UPDATE);
}


/*
 * Return the neighboring AS of route @r: the first regular ASN of its
 * AS_PATH if as_path_get_first_regular() finds one, otherwise the
 * confederation ID of the route's BGP protocol if set, or its local AS.
 */
static inline u32
bgp_get_neighbor(rte *r)
{
  u32 as;
  eattr *path = ea_find(r->attrs, BGP_EA_ID(BA_AS_PATH));

  if (path && as_path_get_first_regular(path->u.ptr, &as))
    return as;

  /* If AS_PATH is not defined, we treat rte as locally originated */
  struct bgp_proto *p = bgp_rte_proto(r);

  if (p->cf->confederation)
    return p->cf->confederation;

  return p->local_as;
}

/*
 * Tell whether route @r is LLGR-stale. The answer is cached in r->pflags
 * (BGP_REF_STALE / BGP_REF_NOT_STALE); when neither flag is set, it is
 * computed from the presence of BGP_COMM_LLGR_STALE in the community
 * attribute and stored.
 */
static inline int
rte_stale(rte *r)
{
  /* Cached answers first */
  if (r->pflags & BGP_REF_STALE)
    return 1;

  if (r->pflags & BGP_REF_NOT_STALE)
    return 0;

  /* If staleness is unknown, compute and cache it */
  eattr *comm = ea_find(r->attrs, BGP_EA_ID(BA_COMMUNITY));
  int stale = (comm && int_set_contains(comm->u.ptr, BGP_COMM_LLGR_STALE)) ? 1 : 0;

  if (stale)
    r->pflags |= BGP_REF_STALE;
  else
    r->pflags |= BGP_REF_NOT_STALE;

  return stale;
}

/*
 * Compare two BGP routes; returns 1 when @new is better than @old and 0
 * otherwise. The criteria are tried one by one in the order written below
 * and the first one on which the routes differ decides. Missing attributes
 * are replaced by defaults (mostly per-protocol configuration values).
 */
int
bgp_rte_better(rte *new, rte *old)
{
  struct bgp_proto *new_bgp = bgp_rte_proto(new);
  struct bgp_proto *old_bgp = bgp_rte_proto(old);
  eattr *x, *y;
  u32 n, o;

  /* Skip suppressed routes (see bgp_rte_recalculate()) */
  n = new->pflags & BGP_REF_SUPPRESSED;
  o = old->pflags & BGP_REF_SUPPRESSED;
  if (n > o)
    return 0;
  if (n < o)
    return 1;

  /* RFC 4271 9.1.2.1. Route resolvability test */
  n = rte_resolvable(new);
  o = rte_resolvable(old);
  if (n > o)
    return 1;
  if (n < o)
    return 0;

  /* LLGR draft - depreference stale routes */
  n = rte_stale(new);
  o = rte_stale(old);
  if (n > o)
    return 0;
  if (n < o)
    return 1;

 /* Start with local preferences */
  x = ea_find(new->attrs, BGP_EA_ID(BA_LOCAL_PREF));
  y = ea_find(old->attrs, BGP_EA_ID(BA_LOCAL_PREF));
  n = x ? x->u.data : new_bgp->cf->default_local_pref;
  o = y ? y->u.data : old_bgp->cf->default_local_pref;
  if (n > o)
    return 1;
  if (n < o)
    return 0;

  /* RFC 7311 4.1 - Apply AIGP metric */
  /* Lower total metric wins */
  u64 n2 = bgp_total_aigp_metric(new);
  u64 o2 = bgp_total_aigp_metric(old);
  if (n2 < o2)
    return 1;
  if (n2 > o2)
    return 0;

  /* RFC 4271 9.1.2.2. a)  Use AS path lengths */
  /* Applied when either protocol has compare_path_lengths enabled; a route
     without AS_PATH counts as having AS_PATH_MAXLEN */
  if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths)
  {
    x = ea_find(new->attrs, BGP_EA_ID(BA_AS_PATH));
    y = ea_find(old->attrs, BGP_EA_ID(BA_AS_PATH));
    n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN;
    o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN;
    if (n < o)
      return 1;
    if (n > o)
      return 0;
  }

  /* RFC 4271 9.1.2.2. b) Use origins */
  x = ea_find(new->attrs, BGP_EA_ID(BA_ORIGIN));
  y = ea_find(old->attrs, BGP_EA_ID(BA_ORIGIN));
  n = x ? x->u.data : ORIGIN_INCOMPLETE;
  o = y ? y->u.data : ORIGIN_INCOMPLETE;
  if (n < o)
    return 1;
  if (n > o)
    return 0;

  /* RFC 4271 9.1.2.2. c) Compare MED's */
  /* Proper RFC 4271 path selection cannot be interpreted as finding
   * the best path in some ordering. It is implemented partially in
   * bgp_rte_recalculate() when deterministic_med option is
   * active. Without that option, the behavior is just an
   * approximation, which in specific situations may lead to
   * persistent routing loops, because it is nondeterministic - it
   * depends on the order in which routes appeared. But it is also the
   * same behavior as used by default in Cisco routers, so it is
   * probably not a big issue.
   */
  /* MEDs are compared when either protocol has med_metric enabled, or when
     both routes come from the same neighboring AS */
  if (new_bgp->cf->med_metric || old_bgp->cf->med_metric ||
      (bgp_get_neighbor(new) == bgp_get_neighbor(old)))
  {
    x = ea_find(new->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC));
    y = ea_find(old->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC));
    n = x ? x->u.data : new_bgp->cf->default_med;
    o = y ? y->u.data : old_bgp->cf->default_med;
    if (n < o)
      return 1;
    if (n > o)
      return 0;
  }

  /* RFC 4271 9.1.2.2. d) Prefer external peers */
  if (new_bgp->is_interior > old_bgp->is_interior)
    return 0;
  if (new_bgp->is_interior < old_bgp->is_interior)
    return 1;

  /* RFC 4271 9.1.2.2. e) Compare IGP metrics */
  /* A protocol with igp_metric disabled contributes metric 0 */
  n = new_bgp->cf->igp_metric ? rt_get_igp_metric(new) : 0;
  o = old_bgp->cf->igp_metric ? rt_get_igp_metric(old) : 0;
  if (n < o)
    return 1;
  if (n > o)
    return 0;

  /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */
  /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighbor ID */
  x = ea_find(new->attrs, BGP_EA_ID(BA_ORIGINATOR_ID));
  y = ea_find(old->attrs, BGP_EA_ID(BA_ORIGINATOR_ID));
  n = x ? x->u.data : new_bgp->remote_id;
  o = y ? y->u.data : old_bgp->remote_id;

  /* RFC 5004 - prefer older routes */
  /* (if both are external and from different peer) */
  /* NOTE(review): only new_bgp->is_internal is tested here; the routes
     already agree on is_interior (step d), not necessarily on is_internal */
  if ((new_bgp->cf->prefer_older || old_bgp->cf->prefer_older) &&
      !new_bgp->is_internal && n != o)
    return 0;

  /* rest of RFC 4271 9.1.2.2. f) */
  if (n < o)
    return 1;
  if (n > o)
    return 0;

  /* RFC 4456 9. b) Compare cluster list lengths */
  x = ea_find(new->attrs, BGP_EA_ID(BA_CLUSTER_LIST));
  y = ea_find(old->attrs, BGP_EA_ID(BA_CLUSTER_LIST));
  n = x ? int_set_get_size(x->u.ptr) : 0;
  o = y ? int_set_get_size(y->u.ptr) : 0;
  if (n < o)
    return 1;
  if (n > o)
    return 0;

  /* RFC 4271 9.1.2.2. g) Compare peer IP addresses */
  return ipa_compare(new_bgp->remote_ip, old_bgp->remote_ip) < 0;
}


/*
 * Tell whether routes @pri and @sec are equally good with respect to the
 * criteria checked below, in which case 1 is returned; 0 is returned as
 * soon as the routes differ in any of them. The steps follow those of
 * bgp_rte_better() up to the IGP metric comparison; the remaining
 * tie-breakers are ignored.
 * NOTE(review): bgp_rte_better() also compares the AIGP metric (RFC 7311)
 * right after local preference, there is no such step here - confirm
 * whether routes with different AIGP metrics are meant to be mergable.
 */
int
bgp_rte_mergable(rte *pri, rte *sec)
{
  struct bgp_proto *pri_bgp = bgp_rte_proto(pri);
  struct bgp_proto *sec_bgp = bgp_rte_proto(sec);
  eattr *x, *y;
  u32 p, s;

  /* Skip suppressed routes (see bgp_rte_recalculate()) */
  if ((pri->pflags ^ sec->pflags) & BGP_REF_SUPPRESSED)
    return 0;

  /* RFC 4271 9.1.2.1. Route resolvability test */
  if (rte_resolvable(pri) != rte_resolvable(sec))
    return 0;

  /* LLGR draft - depreference stale routes */
  if (rte_stale(pri) != rte_stale(sec))
    return 0;

  /* Start with local preferences */
  x = ea_find(pri->attrs, BGP_EA_ID(BA_LOCAL_PREF));
  y = ea_find(sec->attrs, BGP_EA_ID(BA_LOCAL_PREF));
  p = x ? x->u.data : pri_bgp->cf->default_local_pref;
  s = y ? y->u.data : sec_bgp->cf->default_local_pref;
  if (p != s)
    return 0;

  /* RFC 4271 9.1.2.2. a)  Use AS path lengths */
  if (pri_bgp->cf->compare_path_lengths || sec_bgp->cf->compare_path_lengths)
  {
    x = ea_find(pri->attrs, BGP_EA_ID(BA_AS_PATH));
    y = ea_find(sec->attrs, BGP_EA_ID(BA_AS_PATH));
    p = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN;
    s = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN;

    if (p != s)
      return 0;

//    if (DELTA(p, s) > pri_bgp->cf->relax_multipath)
//      return 0;
  }

  /* RFC 4271 9.1.2.2. b) Use origins */
  x = ea_find(pri->attrs, BGP_EA_ID(BA_ORIGIN));
  y = ea_find(sec->attrs, BGP_EA_ID(BA_ORIGIN));
  p = x ? x->u.data : ORIGIN_INCOMPLETE;
  s = y ? y->u.data : ORIGIN_INCOMPLETE;
  if (p != s)
    return 0;

  /* RFC 4271 9.1.2.2. c) Compare MED's */
  /* Same applicability condition as in bgp_rte_better() */
  if (pri_bgp->cf->med_metric || sec_bgp->cf->med_metric ||
      (bgp_get_neighbor(pri) == bgp_get_neighbor(sec)))
  {
    x = ea_find(pri->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC));
    y = ea_find(sec->attrs, BGP_EA_ID(BA_MULTI_EXIT_DISC));
    p = x ? x->u.data : pri_bgp->cf->default_med;
    s = y ? y->u.data : sec_bgp->cf->default_med;
    if (p != s)
      return 0;
  }

  /* RFC 4271 9.1.2.2. d) Prefer external peers */
  if (pri_bgp->is_interior != sec_bgp->is_interior)
    return 0;

  /* RFC 4271 9.1.2.2. e) Compare IGP metrics */
  p = pri_bgp->cf->igp_metric ? rt_get_igp_metric(pri) : 0;
  s = sec_bgp->cf->igp_metric ? rt_get_igp_metric(sec) : 0;
  if (p != s)
    return 0;

  /* Remaining criteria are ignored */

  return 1;
}


/*
 * Check whether route @r belongs to the group identified by preference
 * @lpref and neighbor value @lasn (as returned by bgp_get_neighbor()).
 * Returns 1 if both match, 0 otherwise.
 */
static inline int
same_group(rte *r, u32 lpref, u32 lasn)
{
  if (rt_get_preference(r) != lpref)
    return 0;

  return bgp_get_neighbor(r) == lasn;
}

/*
 * Tell whether the stored route @r comes from a BGP protocol instance that
 * has deterministic_med set in its configuration. Returns 0 when
 * bgp_rte_proto() yields no protocol for the route.
 */
static inline int
use_deterministic_med(struct rte_storage *r)
{
  struct bgp_proto *bgp = bgp_rte_proto(&r->rte);

  if (!bgp)
    return 0;

  return bgp->cf->deterministic_med ? 1 : 0;
}

/*
 * bgp_rte_recalculate - per-group best route selection for deterministic MED
 *
 * @table:    routing table; only handed over to the recursive calls here
 * @net:      network whose route list (net->routes) is scanned
 * @new:      route being added, or NULL
 * @old:      route being replaced or removed, or NULL
 * @old_best: presumably the best route of @net before this change; may be
 *            NULL (it is checked before use)
 *
 * Recomputes the BGP_REF_SUPPRESSED flag for all routes in the group
 * (same preference and same neighbor) of the changed route: every route in
 * the group is first marked suppressed, then the best-in-group route and
 * all routes mergable with it are unmarked.
 *
 * Returns nonzero when the caller has to force a full recalculation,
 * zero otherwise (see the discussion before the final return).
 */
int
bgp_rte_recalculate(struct rtable_private *table, net *net, rte *new, rte *old, rte *old_best)
{
  /* The group is identified by the changed route; new takes precedence */
  rte *key = new ? new : old;
  u32 lpref = rt_get_preference(key);
  u32 lasn = bgp_get_neighbor(key);
  /* Remember the original flag of old, as it is overwritten below */
  int old_suppressed = old ? !!(old->pflags & BGP_REF_SUPPRESSED) : 0;

  /*
   * Proper RFC 4271 path selection is a bit complicated, it cannot be
   * implemented just by rte_better(), because it is not a linear
   * ordering. But it can be split into two levels, where the lower
   * level chooses the best routes in each group of routes from the
   * same neighboring AS and the higher level chooses the best route
   * (with a slightly different ordering) between the best-in-group
   * routes.
   *
   * When deterministic_med is disabled, we just ignore this issue and
   * choose the best route by bgp_rte_better() alone. If enabled, the
   * lower level of the route selection is done here (for the group
   * to which the changed route belongs), all routes in the group are
   * marked as suppressed, just the chosen best-in-group is not.
   *
   * Global best route selection then implements the higher level by
   * choosing between non-suppressed routes (as they are always
   * preferred over suppressed routes). Routes from BGP protocols
   * that do not set deterministic_med are just never suppressed. As
   * they do not participate in the lower level selection, it is OK
   * that this function is not called for them.
   *
   * The idea is simple, the implementation is more problematic,
   * mostly because of optimizations in rte_recalculate() that
   * avoid full recalculation in most cases.
   *
   * We can assume that at least one of new, old is non-NULL and both
   * are from the same protocol with enabled deterministic_med. We
   * group routes by both neighbor AS (lasn) and preference (lpref),
   * because bgp_rte_better() does not handle preference itself.
   */

  /* If new and old are from different groups, we just process that
     as two independent events */
  if (new && old && !same_group(old, lpref, lasn))
  {
    int i1, i2;
    i1 = bgp_rte_recalculate(table, net, NULL, old, old_best);
    i2 = bgp_rte_recalculate(table, net, new, NULL, old_best);
    return i1 || i2;
  }

  /*
   * We could find the best-in-group and then make some shortcuts like
   * in rte_recalculate, but as we would have to walk through all
   * net->routes just to find it, it is probably not worth it. So we
   * just have one simple fast case that uses just the old route.
   * We also set the suppressed flag to avoid using it in bgp_rte_better().
   */

  if (new)
    new->pflags |= BGP_REF_SUPPRESSED;

  if (old)
  {
    old->pflags |= BGP_REF_SUPPRESSED;

    /* The fast case - replace not best with worse (or remove not best) */
    if (old_suppressed && !(new && bgp_rte_better(new, old)))
      return 0;
  }

  /* The default case - find a new best-in-group route */
  rte *r = new; /* new may not be in the list */
  for (struct rte_storage *s = net->routes; rte_is_valid(RTE_OR_NULL(s)); s = s->next)
    if (use_deterministic_med(s) && same_group(&s->rte, lpref, lasn))
    {
      /* Suppress every group member first; the winners are unmarked below */
      s->rte.pflags |= BGP_REF_SUPPRESSED;
      if (!r || bgp_rte_better(&s->rte, r))
	r = &s->rte;
    }

  /* Simple case - the last route in group disappears */
  if (!r)
    return 0;

  /* Find out whether new is mergable with best-in-group */
  if (new && (new != r) && bgp_rte_mergable(r, new))
    new->pflags &= ~BGP_REF_SUPPRESSED;

  /* Find all existing routes mergable with best-in-group */
  for (struct rte_storage *s = net->routes; rte_is_valid(RTE_OR_NULL(s)); s = s->next)
    if (use_deterministic_med(s) && same_group(&s->rte, lpref, lasn))
      if ((&s->rte != r) && bgp_rte_mergable(r, &s->rte))
	s->rte.pflags &= ~BGP_REF_SUPPRESSED;

  /* Best-in-group is unmarked last, so it stays suppressed during the
     mergability checks above */
  r->pflags &= ~BGP_REF_SUPPRESSED;

  /*
   * There are generally two reasons why we have to force
   * recalculation (return 1): First, the new route may be wrongfully
   * chosen to be the best in the first case check in
   * rte_recalculate(), this may happen only if old_best is from the
   * same group. Second, another (different than new route)
   * best-in-group is chosen and that may be the proper best (although
   * rte_recalculate() alone would ignore that possibility).
   *
   * There are three possible cases according to whether the old route
   * was the best in group (OBG, i.e. !old_suppressed) and whether the
   * new route is the best in group (NBG, tested by r == new). These
   * cases work even if old or new is NULL.
   *
   * NBG -> new is a possible candidate for the best route, so we just
   *        check for the first reason using same_group().
   *
   * !NBG && OBG -> Second reason applies, return 1
   *
   * !NBG && !OBG -> Best in group does not change, old != old_best,
   *                 rte_better(new, old_best) is false and therefore
   *                 the first reason does not apply, return 0
   */

  if (r == new)
    return old_best && same_group(old_best, lpref, lasn);
  else
    return !old_suppressed;
}

/*
 * Walk the @count routes in @feed for network @n and re-import those that
 * belong to this channel's import hook with the LLGR_STALE community added
 * and the stale route flag set. Routes carrying the NO_LLGR community are
 * withdrawn instead; routes already carrying LLGR_STALE are left alone.
 *
 * @req is the export request embedded in the BGP channel as stale_feed;
 * @rpe is unused.
 */
void
bgp_rte_modify_stale(struct rt_export_request *req, const net_addr *n, struct rt_pending_export *rpe UNUSED, rte **feed, uint count)
{
  struct bgp_channel *bc = SKIP_BACK(struct bgp_channel, stale_feed, req);
  struct rt_import_hook *hook = bc->c.in_req.hook;

  for (uint idx = 0; idx < count; idx++)
  {
    rte *route = feed[idx];

    /* Invalid routes are of no interest */
    if (!rte_is_valid(route))
      continue;

    /* Routes imported through another hook are not ours */
    if (route->sender != hook)
      continue;

    /* A new route of the current cycle, do not mark it as stale */
    if (route->stale_cycle == hook->stale_set)
      continue;

    /* Fetch the community list; a missing attribute means an empty list */
    eattr *comm = ea_find(route->attrs, BGP_EA_ID(BA_COMMUNITY));
    const struct adata *list = NULL;
    uint flags = BAF_PARTIAL;

    if (comm)
    {
      list = comm->u.ptr;
      flags = comm->flags;
    }

    if (list)
    {
      /* LLGR is forbidden for this route, so withdraw it */
      if (int_set_contains(list, BGP_COMM_NO_LLGR))
      {
        rte_import(&bc->c.in_req, n, NULL, route->src);
        continue;
      }

      /* The route has been marked before, nothing to do */
      if (int_set_contains(list, BGP_COMM_LLGR_STALE))
        continue;
    }

    /* Remember the tmp_linpool state so the temporary data can be dropped */
    struct lp_state saved;
    lp_save(tmp_linpool, &saved);

    /* Build a modified copy of the route marked as LLGR stale */
    rte marked = *route;
    bgp_set_attr_ptr(&marked.attrs, BA_COMMUNITY, flags, int_set_add(tmp_linpool, list, BGP_COMM_LLGR_STALE));
    marked.pflags = (marked.pflags & ~BGP_REF_NOT_STALE) | BGP_REF_STALE;

    /*
     * The route has to be updated, yet it must stay stale: stale_set is
     * lowered just for the duration of the import and raised back after it.
     */
    ASSERT_DIE(hook->stale_set == hook->stale_valid + 1);
    hook->stale_set--;
    rte_import(&bc->c.in_req, n, &marked, route->src);
    hook->stale_set++;

    /* Drop everything allocated from tmp_linpool in this iteration */
    lp_restore(tmp_linpool, &saved);
  }
}


/*
 * Rebuild AS_PATH and AGGREGATOR from their AS4_* counterparts, following
 * RFC 6793 4.2.3.
 *
 * The AS4_PATH and AS4_AGGREGATOR attributes are always unset in @attrs.
 * If the AGGREGATOR does not carry AS_TRANS, or the AS_PATH is shorter than
 * the AS4_PATH, the function gives up without merging the paths.
 * New path data are allocated from @pool.
 */
static void
bgp_process_as4_attrs(ea_list **attrs, struct linpool *pool)
{
  eattr *path2 = bgp_find_attr(*attrs, BA_AS_PATH);
  eattr *path4 = bgp_find_attr(*attrs, BA_AS4_PATH);
  eattr *aggr2 = bgp_find_attr(*attrs, BA_AGGREGATOR);
  eattr *aggr4 = bgp_find_attr(*attrs, BA_AS4_AGGREGATOR);

  /* The AS4_* attributes are dropped in every case */
  if (path4)
    bgp_unset_attr(attrs, BA_AS4_PATH);

  if (aggr4)
    bgp_unset_attr(attrs, BA_AS4_AGGREGATOR);

  /* AGGREGATOR part */
  if (aggr2 && aggr4)
  {
    /* An AGGREGATOR with a real ASN means the routes were aggregated by
       an old router; AS4_PATH and AS4_AGGREGATOR are invalid then */
    if (get_u32(aggr2->u.ptr->data) != AS_TRANS)
      return;

    /* Let AGGREGATOR carry the AS4_AGGREGATOR data */
    aggr2->u.ptr = aggr4->u.ptr;
  }

  /* AS_PATH part - both attributes are needed */
  if (!path2 || !path4)
    return;

  /* as_path_getlen() and as_path_cut() both count AS_CONFED* as zero length */
  int len2 = as_path_getlen(path2->u.ptr);
  int len4 = as_path_getlen(path4->u.ptr);

  /* Nothing can be done with an AS_PATH shorter than the AS4_PATH */
  if (len2 < len4)
    return;

  /* Keep the leading part of AS_PATH and append AS4_PATH to it */
  struct adata *head = as_path_cut(pool, path2->u.ptr, len2 - len4);
  path2->u.ptr = as_path_merge(pool, head, path4->u.ptr);
}

/*
 * Format the BGP-specific route summary of @e into @buf.
 *
 * The text is built piecewise as
 *   " (<preference>[-][s][/<metric>]) [[AS<last ASN>][<origin char>]]"
 * where "-" marks a suppressed route, "s" a stale one, and the metric part
 * is either the total AIGP metric, or the IGP metric ("/-" for an
 * unresolvable route, "/?" for an unknown metric). The flag and metric
 * parts are skipped for flowspec networks.
 *
 * @buf has no size parameter, so the caller has to supply a buffer large
 * enough for the whole text; nothing is bounds-checked here.
 */
void
bgp_get_route_info(rte *e, byte *buf)
{
  /* Origin codes 0..2 map to these characters */
  static const char origin_chars[] = "ie?";

  eattr *p = ea_find(e->attrs, BGP_EA_ID(BA_AS_PATH));
  eattr *o = ea_find(e->attrs, BGP_EA_ID(BA_ORIGIN));
  u32 origas;

  buf += bsprintf(buf, " (%d", rt_get_preference(e));

  if (!net_is_flow(e->net))
  {
    if (e->pflags & BGP_REF_SUPPRESSED)
      buf += bsprintf(buf, "-");

    if (rte_stale(e))
      buf += bsprintf(buf, "s");

    u64 metric = bgp_total_aigp_metric(e);
    if (metric < BGP_AIGP_MAX)
    {
      buf += bsprintf(buf, "/%lu", metric);
    }
    else if ((metric = rt_get_igp_metric(e)) != 0)
    {
      /* No usable AIGP metric, fall back to a nonzero IGP metric */
      if (!rte_resolvable(e))
	buf += bsprintf(buf, "/-");
      else if (metric >= IGP_METRIC_UNKNOWN)
	buf += bsprintf(buf, "/?");
      else
	/* "%d" is assumed to consume an int (as in standard printf), so the
	   64-bit variable must not be passed as is. The value is below
	   IGP_METRIC_UNKNOWN here - presumably within int range, confirm. */
	buf += bsprintf(buf, "/%d", (int) metric);
    }
  }
  buf += bsprintf(buf, ") [");

  if (p && as_path_get_last(p->u.ptr, &origas))
    buf += bsprintf(buf, "AS%u", origas);

  if (o)
  {
    /* Never index past the table; unexpected origin values are shown as '?' */
    char oc = (o->u.data < sizeof(origin_chars) - 1) ? origin_chars[o->u.data] : '?';
    buf += bsprintf(buf, "%c", oc);
  }

  strcpy(buf, "]");
}