/*
 *	BIRD -- BGP Attributes
 *
 *	(c) 2000 Martin Mares <mj@ucw.cz>
 *
 *	Can be freely distributed and used under the terms of the GNU GPL.
 */

#undef LOCAL_DEBUG

#include <stdlib.h>

#include "nest/bird.h"
#include "nest/iface.h"
#include "nest/protocol.h"
#include "nest/route.h"
#include "nest/attrs.h"
#include "conf/conf.h"
#include "lib/resource.h"
#include "lib/string.h"
#include "lib/unaligned.h"

#include "bgp.h"

static byte bgp_mandatory_attrs[] = { BA_ORIGIN, BA_AS_PATH
#ifndef IPV6
,BA_NEXT_HOP
#endif
};

struct attr_desc {
  char *name;
  int expected_length;
  int expected_flags;
  int type;
  int allow_in_ebgp;
  int (*validate)(struct bgp_proto *p, byte *attr, int len);
  void (*format)(eattr *ea, byte *buf, int buflen);
};

static int
bgp_check_origin(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
{
  if (len > 2)
    return 6;
  return 0;
}

static void
bgp_format_origin(eattr *a, byte *buf, int buflen)
{
  static char *bgp_origin_names[] = { "IGP", "EGP", "Incomplete" };

  bsprintf(buf, bgp_origin_names[a->u.data]);
}

static int
bgp_check_path(byte *a, int len, int bs, int errcode)
{
  while (len)
    {
      DBG("Path segment %02x %02x\n", a[0], a[1]);
      if (len < 2 ||
	  (a[0] != AS_PATH_SET && a[0] != AS_PATH_SEQUENCE) ||
	  bs * a[1] + 2 > len)
	return errcode;
      len -= bs * a[1] + 2;
      a += bs * a[1] + 2;
    }
  return 0;
}

static int
bgp_check_as_path(struct bgp_proto *p, byte *a, int len)
{
  return bgp_check_path(a, len, p->as4_session ? 4 : 2, 11);
}

static int
bgp_check_as4_path(struct bgp_proto *p, byte *a, int len)
{
  if (bgp_as4_support && (! p->as4_session))
    return bgp_check_path(a, len, 4, 9);
  else 
    return 0;
}


static int
bgp_check_next_hop(struct bgp_proto *p UNUSED, byte *a, int len)
{
#ifdef IPV6
  return -1;
#else
  ip_addr addr;

  memcpy(&addr, a, len);
  ipa_ntoh(addr);
  if (ipa_classify(addr) & IADDR_HOST)
    return 0;
  else
    return 8;
#endif
}

static int
bgp_check_aggregator(struct bgp_proto *p, byte *a UNUSED, int len)
{
  int exp_len = p->as4_session ? 8 : 6;
  
  return (len == exp_len) ? 0 : 5;
}

static void
bgp_format_aggregator(eattr *a, byte *buf, int buflen UNUSED)
{
  struct adata *ad =  a->u.ptr;
  byte *data = ad->data;
  u32 as;

  if (bgp_as4_support)
    {
      as = get_u32(data);
      data += 4;
    }
  else
    {
      as = get_u16(data);
      data += 2;
    }

  bsprintf(buf, "%d.%d.%d.%d AS%d", data[0], data[1], data[2], data[3], as);
}

static int
bgp_check_cluster_list(struct bgp_proto *p UNUSED, byte *a UNUSED, int len)
{
  return ((len % 4) == 0) ? 0 : 5;
}

static void
bgp_format_cluster_list(eattr *a, byte *buf, int buflen UNUSED)
{
  int_set_format(a->u.ptr, 0, buf, buflen);
}

static int
bgp_check_reach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED)
{
#ifdef IPV6
  p->mp_reach_start = a;
  p->mp_reach_len = len;
#endif
  return -1;
}

static int
bgp_check_unreach_nlri(struct bgp_proto *p UNUSED, byte *a UNUSED, int len UNUSED)
{
#ifdef IPV6
  p->mp_unreach_start = a;
  p->mp_unreach_len = len;
#endif
  return -1;
}

static struct attr_desc bgp_attr_table[] = {
  { NULL, -1, 0, 0, 0,								/* Undefined */
    NULL, NULL },
  { "origin", 1, BAF_TRANSITIVE, EAF_TYPE_INT, 1,				/* BA_ORIGIN */
    bgp_check_origin, bgp_format_origin },
  { "as_path", -1, BAF_TRANSITIVE, EAF_TYPE_AS_PATH, 1,				/* BA_AS_PATH */
    bgp_check_as_path, NULL },
  { "next_hop", 4, BAF_TRANSITIVE, EAF_TYPE_IP_ADDRESS, 1,			/* BA_NEXT_HOP */
    bgp_check_next_hop, NULL },
  { "med", 4, BAF_OPTIONAL, EAF_TYPE_INT, 1,					/* BA_MULTI_EXIT_DISC */
    NULL, NULL },
  { "local_pref", 4, BAF_TRANSITIVE, EAF_TYPE_INT, 0,				/* BA_LOCAL_PREF */
    NULL, NULL },
  { "atomic_aggr", 0, BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1,			/* BA_ATOMIC_AGGR */
    NULL, NULL },
  { "aggregator", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1,	/* BA_AGGREGATOR */
    bgp_check_aggregator, bgp_format_aggregator },
  { "community", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_INT_SET, 1,	/* BA_COMMUNITY */
    NULL, NULL },
  { "originator_id", 4, BAF_OPTIONAL, EAF_TYPE_ROUTER_ID, 0,			/* BA_ORIGINATOR_ID */
    NULL, NULL },
  { "cluster_list", -1, BAF_OPTIONAL, EAF_TYPE_INT_SET, 0,			/* BA_CLUSTER_LIST */
    bgp_check_cluster_list, bgp_format_cluster_list }, 
  { NULL, },									/* BA_DPA */
  { NULL, },									/* BA_ADVERTISER */
  { NULL, },									/* BA_RCID_PATH */
  { "mp_reach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1,			/* BA_MP_REACH_NLRI */
    bgp_check_reach_nlri, NULL },
  { "mp_unreach_nlri", -1, BAF_OPTIONAL, EAF_TYPE_OPAQUE, 1,			/* BA_MP_UNREACH_NLRI */
    bgp_check_unreach_nlri, NULL },
  { NULL, },									/* BA_EXTENDED_COMM */
  { "as4_path", -1, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1,		/* BA_AS4_PATH */
    bgp_check_as4_path, NULL },
  { "as4_aggregator", 8, BAF_OPTIONAL | BAF_TRANSITIVE, EAF_TYPE_OPAQUE, 1,	/* BA_AS4_PATH */
    NULL, NULL }
};

/* BA_AS4_PATH is type EAF_TYPE_OPAQUE and not type EAF_TYPE_AS_PATH because
 * EAF_TYPE_AS_PATH is supposed to have different format (2 or 4 B for each ASN)
 * depending on bgp_as4_support variable.
 */

#define ATTR_KNOWN(code) ((code) < ARRAY_SIZE(bgp_attr_table) && bgp_attr_table[code].name)

static inline struct adata *
bgp_alloc_adata(struct linpool *pool, unsigned len)
{
  struct adata *ad = lp_alloc(pool, sizeof(struct adata) + len);
  ad->length = len;
  return ad;
}

static void
bgp_set_attr(eattr *e, unsigned attr, uintptr_t val)
{
  ASSERT(ATTR_KNOWN(attr));
  e->id = EA_CODE(EAP_BGP, attr);
  e->type = bgp_attr_table[attr].type;
  e->flags = bgp_attr_table[attr].expected_flags;
  if (e->type & EAF_EMBEDDED)
    e->u.data = val;
  else
    e->u.ptr = (struct adata *) val;
}

static byte *
bgp_set_attr_wa(eattr *e, struct linpool *pool, unsigned attr, unsigned len)
{
  struct adata *ad = bgp_alloc_adata(pool, len);
  bgp_set_attr(e, attr, (uintptr_t) ad);
  return ad->data;
}

void
bgp_attach_attr(ea_list **to, struct linpool *pool, unsigned attr, uintptr_t val)
{
  ea_list *a = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr));
  a->next = *to;
  *to = a;
  a->flags = EALF_SORTED;
  a->count = 1;
  bgp_set_attr(a->attrs, attr, val);
}

byte *
bgp_attach_attr_wa(ea_list **to, struct linpool *pool, unsigned attr, unsigned len)
{
  struct adata *ad = bgp_alloc_adata(pool, len);
  bgp_attach_attr(to, pool, attr, (uintptr_t) ad);
  return ad->data;
}

static int
bgp_encode_attr_hdr(byte *dst, unsigned int flags, unsigned code, int len)
{
  int wlen;

  DBG("\tAttribute %02x (%d bytes, flags %02x)\n", code, len, flags);

  if (len < 256)
    {
      *dst++ = flags;
      *dst++ = code;
      *dst++ = len;
      wlen = 3;
    }
  else
    {
      *dst++ = flags | BAF_EXT_LEN;
      *dst++ = code;
      put_u16(dst, len);
      wlen = 4;
    }

  return wlen;
}

static void
aggregator_convert_to_old(struct adata *aggr, byte *dst, int *new_used)
{
  byte *src = aggr->data;
  *new_used = 0;

  u32 as = get_u32(src);
  if (as > 0xFFFF) 
    {
      as = AS_TRANS;
      *new_used = 1;
    }
  put_u16(dst, as);

  /* Copy IPv4 address */
  memcpy(dst + 2, src + 4, 4);
}

static void
aggregator_convert_to_new(struct adata *aggr, byte *dst)
{
  byte *src = aggr->data;

  u32 as   = get_u16(src);
  put_u32(dst, as);

  /* Copy IPv4 address */
  memcpy(dst + 4, src + 2, 4);
}

static int
bgp_get_attr_len(eattr *a)
{
  int len;
  if (ATTR_KNOWN(EA_ID(a->id)))
    {
      int code = EA_ID(a->id);
      struct attr_desc *desc = &bgp_attr_table[code];
      len = desc->expected_length;
      if (len < 0)
	{
	  ASSERT(!(a->type & EAF_EMBEDDED));
	  len = a->u.ptr->length;
	}
    }
  else
    {
      ASSERT((a->type & EAF_TYPE_MASK) == EAF_TYPE_OPAQUE);
      len = a->u.ptr->length;
    }
  
  return len;
}

#define ADVANCE(w, r, l) do { r -= l; w += l; } while (0)

/**
 * bgp_encode_attrs - encode BGP attributes
 * @p: BGP instance
 * @w: buffer
 * @attrs: a list of extended attributes
 * @remains: remaining space in the buffer
 *
 * The bgp_encode_attrs() function takes a list of extended attributes
 * and converts it to its BGP representation (a part of an Update message).
 *
 * Result: Length of the attribute block generated.
 */
unsigned int
bgp_encode_attrs(struct bgp_proto *p, byte *w, ea_list *attrs, int remains)
{
  unsigned int i, code, flags;
  byte *start = w;
  int len, rv;

  for(i=0; i<attrs->count; i++)
    {
      eattr *a = &attrs->attrs[i];
      ASSERT(EA_PROTO(a->id) == EAP_BGP);
      code = EA_ID(a->id);
#ifdef IPV6
      /* When talking multiprotocol BGP, the NEXT_HOP attributes are used only temporarily. */
      if (code == BA_NEXT_HOP)
	continue;
#endif

      /* When AS4-aware BGP speaker is talking to non-AS4-aware BGP speaker,
       * we have to convert our 4B AS_PATH to 2B AS_PATH and send our AS_PATH 
       * as optional AS4_PATH attribute.
       */
      if ((code == BA_AS_PATH) && bgp_as4_support && (! p->as4_session))
	{
	  len = a->u.ptr->length;

	  if (remains < (len + 4))
	    goto err_no_buffer;

	  /* Using temporary buffer because don't know a length of created attr
	   * and therefore a length of a header. Perhaps i should better always
	   * use BAF_EXT_LEN. */
	  
	  byte buf[len];
	  int new_used;
	  int nl = as_path_convert_to_old(a->u.ptr, buf, &new_used);

	  DBG("BGP: Encoding old AS_PATH\n");
	  rv = bgp_encode_attr_hdr(w, BAF_TRANSITIVE, BA_AS_PATH, nl);
	  ADVANCE(w, remains, rv);
	  memcpy(w, buf, nl);
	  ADVANCE(w, remains, nl);

	  if (! new_used)
	    continue;

	  if (remains < (len + 4))
	    goto err_no_buffer;

	  /* We should discard AS_CONFED_SEQUENCE or AS_CONFED_SET path segments 
	   * here but we don't support confederations and such paths we already
	   * discarded in bgp_check_as_path().
	   */

	  DBG("BGP: Encoding AS4_PATH\n");
	  rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_PATH, len);
	  ADVANCE(w, remains, rv);
	  memcpy(w, a->u.ptr->data, len);
	  ADVANCE(w, remains, len);

	  continue;
	}

      /* The same issue with AGGREGATOR attribute */
      if ((code == BA_AGGREGATOR) && bgp_as4_support && (! p->as4_session))
	{
	  int new_used;

	  len = 6;
	  if (remains < (len + 3))
	    goto err_no_buffer;

	  rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AGGREGATOR, len);
	  ADVANCE(w, remains, rv);
	  aggregator_convert_to_old(a->u.ptr, w, &new_used);
	  ADVANCE(w, remains, len);

	  if (! new_used)
	    continue;

	  len = 8;
	  if (remains < (len + 3))
	    goto err_no_buffer;

	  rv = bgp_encode_attr_hdr(w, BAF_OPTIONAL | BAF_TRANSITIVE, BA_AS4_AGGREGATOR, len);
	  ADVANCE(w, remains, rv);
	  memcpy(w, a->u.ptr->data, len);
	  ADVANCE(w, remains, len);

	  continue;
	}

      /* Standard path continues here ... */

      flags = a->flags & (BAF_OPTIONAL | BAF_TRANSITIVE | BAF_PARTIAL);
      len = bgp_get_attr_len(a);

      if (remains < len + 4)
	goto err_no_buffer;

      rv = bgp_encode_attr_hdr(w, flags, code, len);
      ADVANCE(w, remains, rv);

      switch (a->type & EAF_TYPE_MASK)
	{
	case EAF_TYPE_INT:
	case EAF_TYPE_ROUTER_ID:
	  if (len == 4)
	    put_u32(w, a->u.data);
	  else
	    *w = a->u.data;
	  break;
	case EAF_TYPE_IP_ADDRESS:
	  {
	    ip_addr ip = *(ip_addr *)a->u.ptr->data;
	    ipa_hton(ip);
	    memcpy(w, &ip, len);
	    break;
	  }
	case EAF_TYPE_INT_SET:
	  {
	    u32 *z = (u32 *)a->u.ptr->data;
	    int i;
	    for(i=0; i<len; i+=4)
	      put_u32(w+i, *z++);
	    break;
	  }
	case EAF_TYPE_OPAQUE:
	case EAF_TYPE_AS_PATH:
	  memcpy(w, a->u.ptr->data, len);
	  break;
	default:
	  bug("bgp_encode_attrs: unknown attribute type %02x", a->type);
	}
      ADVANCE(w, remains, len);
    }
  return w - start;

 err_no_buffer:
  log(L_ERR "BGP: attribute list too long, ignoring the remaining attributes");
  return w - start;
}

static void
bgp_init_prefix(struct fib_node *N)
{
  struct bgp_prefix *p = (struct bgp_prefix *) N;
  p->bucket_node.next = NULL;
}

static int
bgp_compare_u32(const u32 *x, const u32 *y)
{
  return (*x < *y) ? -1 : (*x > *y) ? 1 : 0;
}

static void
bgp_normalize_set(u32 *dest, u32 *src, unsigned cnt)
{
  memcpy(dest, src, sizeof(u32) * cnt);
  qsort(dest, cnt, sizeof(u32), (int(*)(const void *, const void *)) bgp_compare_u32);
}

static void
bgp_rehash_buckets(struct bgp_proto *p)
{
  struct bgp_bucket **old = p->bucket_hash;
  struct bgp_bucket **new;
  unsigned oldn = p->hash_size;
  unsigned i, e, mask;
  struct bgp_bucket *b;

  p->hash_size = p->hash_limit;
  DBG("BGP: Rehashing bucket table from %d to %d\n", oldn, p->hash_size);
  p->hash_limit *= 4;
  if (p->hash_limit >= 65536)
    p->hash_limit = ~0;
  new = p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *));
  mask = p->hash_size - 1;
  for (i=0; i<oldn; i++)
    while (b = old[i])
      {
	old[i] = b->hash_next;
	e = b->hash & mask;
	b->hash_next = new[e];
	if (b->hash_next)
	  b->hash_next->hash_prev = b;
	b->hash_prev = NULL;
	new[e] = b;
      }
  mb_free(old);
}

static struct bgp_bucket *
bgp_new_bucket(struct bgp_proto *p, ea_list *new, unsigned hash)
{
  struct bgp_bucket *b;
  unsigned ea_size = sizeof(ea_list) + new->count * sizeof(eattr);
  unsigned ea_size_aligned = BIRD_ALIGN(ea_size, CPU_STRUCT_ALIGN);
  unsigned size = sizeof(struct bgp_bucket) + ea_size;
  unsigned i;
  byte *dest;
  unsigned index = hash & (p->hash_size - 1);

  /* Gather total size of non-inline attributes */
  for (i=0; i<new->count; i++)
    {
      eattr *a = &new->attrs[i];
      if (!(a->type & EAF_EMBEDDED))
	size += BIRD_ALIGN(sizeof(struct adata) + a->u.ptr->length, CPU_STRUCT_ALIGN);
    }

  /* Create the bucket and hash it */
  b = mb_alloc(p->p.pool, size);
  b->hash_next = p->bucket_hash[index];
  if (b->hash_next)
    b->hash_next->hash_prev = b;
  p->bucket_hash[index] = b;
  b->hash_prev = NULL;
  b->hash = hash;
  add_tail(&p->bucket_queue, &b->send_node);
  init_list(&b->prefixes);
  memcpy(b->eattrs, new, ea_size);
  dest = ((byte *)b->eattrs) + ea_size_aligned;

  /* Copy values of non-inline attributes */
  for (i=0; i<new->count; i++)
    {
      eattr *a = &b->eattrs->attrs[i];
      if (!(a->type & EAF_EMBEDDED))
	{
	  struct adata *oa = a->u.ptr;
	  struct adata *na = (struct adata *) dest;
	  memcpy(na, oa, sizeof(struct adata) + oa->length);
	  a->u.ptr = na;
	  dest += BIRD_ALIGN(sizeof(struct adata) + na->length, CPU_STRUCT_ALIGN);
	}
    }

  /* If needed, rehash */
  p->hash_count++;
  if (p->hash_count > p->hash_limit)
    bgp_rehash_buckets(p);

  return b;
}

static int
bgp_export_check(struct bgp_proto *p, ea_list *new)
{
  eattr *a;
  struct adata *d;

  /* Check if next hop is valid */
  a = ea_find(new, EA_CODE(EAP_BGP, BA_NEXT_HOP));
  if (!a || ipa_equal(p->next_hop, *(ip_addr *)a->u.ptr))
    {
      DBG("\tInvalid NEXT_HOP\n");
      return 0;
    }

  /* Check if we aren't forbidden to export the route by communities */
  a = ea_find(new, EA_CODE(EAP_BGP, BA_COMMUNITY));
  if (a)
    {
      d = a->u.ptr;
      if (int_set_contains(d, BGP_COMM_NO_ADVERTISE))
	{
	  DBG("\tNO_ADVERTISE\n");
	  return 0;
	}
      if (!p->is_internal &&
	  (int_set_contains(d, BGP_COMM_NO_EXPORT) ||
	   int_set_contains(d, BGP_COMM_NO_EXPORT_SUBCONFED)))
	{
	  DBG("\tNO_EXPORT\n");
	  return 0;
	}
    }

  return 1;
}

static struct bgp_bucket *
bgp_get_bucket(struct bgp_proto *p, ea_list *attrs, int originate)
{
  ea_list *new;
  unsigned i, cnt, hash, code;
  eattr *a, *d;
  u32 seen = 0;
  struct bgp_bucket *b;

  /* Merge the attribute list */
  new = alloca(ea_scan(attrs));
  ea_merge(attrs, new);
  ea_sort(new);

  /* Normalize attributes */
  d = new->attrs;
  cnt = new->count;
  new->count = 0;
  for(i=0; i<cnt; i++)
    {
      a = &new->attrs[i];
#ifdef LOCAL_DEBUG
      {
	byte buf[EA_FORMAT_BUF_SIZE];
	ea_format(a, buf);
	DBG("\t%s\n", buf);
      }
#endif
      if (EA_PROTO(a->id) != EAP_BGP)
	continue;
      code = EA_ID(a->id);
      if (ATTR_KNOWN(code))
	{
	  if (!bgp_attr_table[code].allow_in_ebgp && !p->is_internal)
	    continue;
	  /* The flags might have been zero if the attr was added by filters */
	  a->flags = (a->flags & BAF_PARTIAL) | bgp_attr_table[code].expected_flags;
	  if (code < 32)
	    seen |= 1 << code;
	}
      else
	{
	  /* Don't re-export unknown non-transitive attributes */
	  if (!(a->flags & BAF_TRANSITIVE))
	    continue;
	}
      *d = *a;
      if ((d->type & EAF_ORIGINATED) && !originate && (d->flags & BAF_TRANSITIVE) && (d->flags & BAF_OPTIONAL))
	d->flags |= BAF_PARTIAL;
      switch (d->type & EAF_TYPE_MASK)
	{
	case EAF_TYPE_INT_SET:
	  {
	    struct adata *z = alloca(sizeof(struct adata) + d->u.ptr->length);
	    z->length = d->u.ptr->length;
	    bgp_normalize_set((u32 *) z->data, (u32 *) d->u.ptr->data, z->length / 4);
	    d->u.ptr = z;
	    break;
	  }
	default: ;
	}
      d++;
      new->count++;
    }

  /* Hash */
  hash = ea_hash(new);
  for(b=p->bucket_hash[hash & (p->hash_size - 1)]; b; b=b->hash_next)
    if (b->hash == hash && ea_same(b->eattrs, new))
      {
	DBG("Found bucket.\n");
	return b;
      }

  /* Ensure that there are all mandatory attributes */
  for(i=0; i<ARRAY_SIZE(bgp_mandatory_attrs); i++)
    if (!(seen & (1 << bgp_mandatory_attrs[i])))
      {
	log(L_ERR "%s: Mandatory attribute %s missing", p->p.name, bgp_attr_table[bgp_mandatory_attrs[i]].name);
	return NULL;
      }

  if (!bgp_export_check(p, new))
    return NULL;

  /* Create new bucket */
  DBG("Creating bucket.\n");
  return bgp_new_bucket(p, new, hash);
}

void
bgp_free_bucket(struct bgp_proto *p, struct bgp_bucket *buck)
{
  if (buck->hash_next)
    buck->hash_next->hash_prev = buck->hash_prev;
  if (buck->hash_prev)
    buck->hash_prev->hash_next = buck->hash_next;
  else
    p->bucket_hash[buck->hash & (p->hash_size-1)] = buck->hash_next;
  mb_free(buck);
}

void
bgp_rt_notify(struct proto *P, net *n, rte *new, rte *old UNUSED, ea_list *attrs)
{
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct bgp_bucket *buck;
  struct bgp_prefix *px;

  DBG("BGP: Got route %I/%d %s\n", n->n.prefix, n->n.pxlen, new ? "up" : "down");

  if (new)
    {
      buck = bgp_get_bucket(p, attrs, new->attrs->source != RTS_BGP);
      if (!buck)			/* Inconsistent attribute list */
	return;
    }
  else
    {
      if (!(buck = p->withdraw_bucket))
	{
	  buck = p->withdraw_bucket = mb_alloc(P->pool, sizeof(struct bgp_bucket));
	  init_list(&buck->prefixes);
	}
    }
  px = fib_get(&p->prefix_fib, &n->n.prefix, n->n.pxlen);
  if (px->bucket_node.next)
    {
      DBG("\tRemoving old entry.\n");
      rem_node(&px->bucket_node);
    }
  add_tail(&buck->prefixes, &px->bucket_node);
  bgp_schedule_packet(p->conn, PKT_UPDATE);
}


static int
bgp_create_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool)
{
  ea_list *ea = lp_alloc(pool, sizeof(ea_list) + 4*sizeof(eattr));
  rta *rta = e->attrs;
  byte *z;

  ea->next = *attrs;
  *attrs = ea;
  ea->flags = EALF_SORTED;
  ea->count = 4;

  bgp_set_attr(ea->attrs, BA_ORIGIN,
       ((rta->source == RTS_OSPF_EXT1) || (rta->source == RTS_OSPF_EXT2)) ? ORIGIN_INCOMPLETE : ORIGIN_IGP);

  if (p->is_internal)
    bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, 0);
  else
    {
      z = bgp_set_attr_wa(ea->attrs+1, pool, BA_AS_PATH, bgp_as4_support ? 6 : 4);
      z[0] = AS_PATH_SEQUENCE;
      z[1] = 1;				/* 1 AS */

      if (bgp_as4_support)
	put_u32(z+2, p->local_as);
      else
	put_u16(z+2, p->local_as);
    }

  z = bgp_set_attr_wa(ea->attrs+2, pool, BA_NEXT_HOP, sizeof(ip_addr));
  if (p->cf->next_hop_self ||
      !p->is_internal ||
      rta->dest != RTD_ROUTER)
    {
      if (ipa_nonzero(p->cf->source_addr))
        *(ip_addr *)z = p->cf->source_addr;
      else
        *(ip_addr *)z = p->local_addr;
    }
  else
    *(ip_addr *)z = e->attrs->gw;

  bgp_set_attr(ea->attrs+3, BA_LOCAL_PREF, 0);

  return 0;				/* Leave decision to the filters */
}


static inline int
bgp_as_path_loopy(struct bgp_proto *p, rta *a)
{
  eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
  return (e && as_path_is_member(e->u.ptr, p->local_as));
}

static inline int
bgp_originator_id_loopy(struct bgp_proto *p, rta *a)
{
  eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
  return (e && (e->u.data == p->local_id));
}

static inline int
bgp_cluster_list_loopy(struct bgp_proto *p, rta *a)
{
  eattr *e = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
  return (e && p->rr_client && int_set_contains(e->u.ptr, p->rr_cluster_id));
}


static inline void
bgp_path_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 as)
{
  eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
  bgp_attach_attr(attrs, pool, BA_AS_PATH, (uintptr_t) as_path_prepend(pool, a->u.ptr, as));
}

static inline void
bgp_cluster_list_prepend(rte *e, ea_list **attrs, struct linpool *pool, u32 cid)
{
  eattr *a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
  bgp_attach_attr(attrs, pool, BA_CLUSTER_LIST, (uintptr_t) int_set_add(pool, a ? a->u.ptr : NULL, cid));
}

static int
bgp_update_attrs(struct bgp_proto *p, rte *e, ea_list **attrs, struct linpool *pool, int rr)
{
  eattr *a;

  if (!p->is_internal && !p->rs_client)
    {
      bgp_path_prepend(e, attrs, pool, p->local_as);

      /* The MULTI_EXIT_DISC attribute received from a neighboring AS MUST NOT be
       * propagated to other neighboring ASes.
       * Perhaps it would be better to undefine it.
       */
      a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
      if (a)
	bgp_attach_attr(attrs, pool, BA_MULTI_EXIT_DISC, 0);
    }

  a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
  if (a && (p->is_internal || (!p->is_internal && e->attrs->iface == p->neigh->iface)))
    {
      /* Leave the original next hop attribute, will check later where does it point */
    }
  else
    {
      /* Need to create new one */
      bgp_attach_attr_ip(attrs, pool, BA_NEXT_HOP, p->local_addr);
    }

  if (rr)
    {
      /* Handling route reflection, RFC 4456 */
      struct bgp_proto *src = (struct bgp_proto *) e->attrs->proto;

      a = ea_find(e->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
      if (!a)
	bgp_attach_attr(attrs, pool, BA_ORIGINATOR_ID, src->remote_id);

      /* We attach proper cluster ID according to whether the route is entering or leaving the cluster */
      bgp_cluster_list_prepend(e, attrs, pool, src->rr_client ? src->rr_cluster_id : p->rr_cluster_id);

      /* Two RR clients with different cluster ID, hmmm */
      if (src->rr_client && p->rr_client && (src->rr_cluster_id != p->rr_cluster_id))
	bgp_cluster_list_prepend(e, attrs, pool, p->rr_cluster_id);
    }

  return 0;				/* Leave decision to the filters */
}

int
bgp_import_control(struct proto *P, rte **new, ea_list **attrs, struct linpool *pool)
{
  rte *e = *new;
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct bgp_proto *new_bgp = (e->attrs->proto->proto == &proto_bgp) ? (struct bgp_proto *) e->attrs->proto : NULL;

  if (p == new_bgp)			/* Poison reverse updates */
    return -1;
  if (new_bgp)
    {
      /* We should check here for cluster list loop, because the receiving BGP instance
	 might have different cluster ID  */
      if (bgp_cluster_list_loopy(p, e->attrs))
	return -1;

      if (p->local_as == new_bgp->local_as && p->is_internal && new_bgp->is_internal)
	{
	  /* Redistribution of internal routes with IBGP */
	  if (p->rr_client || new_bgp->rr_client)
	    /* Route reflection, RFC 4456 */
	    return bgp_update_attrs(p, e, attrs, pool, 1);
	  else
	    return -1;
	}
      else
	return bgp_update_attrs(p, e, attrs, pool, 0);
    }
  else
    return bgp_create_attrs(p, e, attrs, pool);
}

static inline u32
bgp_get_neighbor(rte *r)
{
  eattr *e = ea_find(r->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
  u32 as;

  if (e && as_path_get_last(e->u.ptr, &as))
    return as;
  else
    return ((struct bgp_proto *) r->attrs->proto)->remote_as;
}

int
bgp_rte_better(rte *new, rte *old)
{
  struct bgp_proto *new_bgp = (struct bgp_proto *) new->attrs->proto;
  struct bgp_proto *old_bgp = (struct bgp_proto *) old->attrs->proto;
  eattr *x, *y;
  u32 n, o;

  /* Start with local preferences */
  x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF));
  y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_LOCAL_PREF));
  n = x ? x->u.data : new_bgp->cf->default_local_pref;
  o = y ? y->u.data : old_bgp->cf->default_local_pref;
  if (n > o)
    return 1;
  if (n < o)
    return 0;

  /* RFC 4271 9.1.2.2. a)  Use AS path lengths */
  if (new_bgp->cf->compare_path_lengths || old_bgp->cf->compare_path_lengths)
    {
      x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
      y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
      n = x ? as_path_getlen(x->u.ptr) : AS_PATH_MAXLEN;
      o = y ? as_path_getlen(y->u.ptr) : AS_PATH_MAXLEN;
      if (n < o)
	return 1;
      if (n > o)
	return 0;
    }

  /* RFC 4271 9.1.2.2. b) Use origins */
  x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
  y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGIN));
  n = x ? x->u.data : ORIGIN_INCOMPLETE;
  o = y ? y->u.data : ORIGIN_INCOMPLETE;
  if (n < o)
    return 1;
  if (n > o)
    return 0;

  /* RFC 4271 9.1.2.2. c) Compare MED's */

  if (bgp_get_neighbor(new) == bgp_get_neighbor(old))
    {
      x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
      y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_MULTI_EXIT_DISC));
      n = x ? x->u.data : new_bgp->cf->default_med;
      o = y ? y->u.data : old_bgp->cf->default_med;
      if (n < o)
	return 1;
      if (n > o)
	return 0;
    }

  /* RFC 4271 9.1.2.2. d) Prefer external peers */
  if (new_bgp->is_internal > old_bgp->is_internal)
    return 0;
  if (new_bgp->is_internal < old_bgp->is_internal)
    return 1;

  /* Skipping RFC 4271 9.1.2.2. e) */
  /* We don't have interior distances */

  /* RFC 4456 9. b) Compare cluster list lengths */
  x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
  y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_CLUSTER_LIST));
  n = x ? int_set_get_size(x->u.ptr) : 0;
  o = y ? int_set_get_size(y->u.ptr) : 0;
  if (n < o)
    return 1;
  if (n > o)
    return 0;

  /* RFC 4271 9.1.2.2. f) Compare BGP identifiers */
  /* RFC 4456 9. a) Use ORIGINATOR_ID instead of local neighor ID */
  x = ea_find(new->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
  y = ea_find(old->attrs->eattrs, EA_CODE(EAP_BGP, BA_ORIGINATOR_ID));
  n = x ? x->u.data : new_bgp->remote_id;
  o = y ? y->u.data : old_bgp->remote_id;
  if (n < o)
    return 1;
  if (n > o)
    return 0;


  /* RFC 4271 9.1.2.2. g) Compare peer IP adresses */
  return (ipa_compare(new_bgp->cf->remote_ip, old_bgp->cf->remote_ip) < 0);
}

static struct adata *
bgp_aggregator_convert_to_new(struct adata *old, struct linpool *pool)
{
  struct adata *newa = lp_alloc(pool, sizeof(struct adata) + 8);
  newa->length = 8;
  aggregator_convert_to_new(old, newa->data);
  return newa;
}


/* Take last req_as ASNs from path old2 (in 2B format), convert to 4B format
 * and append path old4 (in 4B format).
 */
static struct adata *
bgp_merge_as_paths(struct adata *old2, struct adata *old4, int req_as, struct linpool *pool)
{
  byte buf[old2->length * 2];

  int ol = as_path_convert_to_new(old2, buf, req_as);
  int nl = ol + (old4 ? old4->length : 0);

  struct adata *newa = lp_alloc(pool, sizeof(struct adata) + nl);
  newa->length = nl;
  memcpy(newa->data, buf, ol);
  if (old4) memcpy(newa->data + ol, old4->data, old4->length);

  return newa;
}


/* Reconstruct 4B AS_PATH and AGGREGATOR according to RFC 4893 4.2.3 */
static void
bgp_reconstruct_4b_atts(struct bgp_proto *p, rta *a, struct linpool *pool)
{
  eattr *p2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS_PATH));
  eattr *p4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_PATH));
  eattr *a2 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AGGREGATOR));
  eattr *a4 =ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR));

  if (a2)
    {
      u32 a2_as = get_u16(a2->u.ptr->data);

      if (a4)
	{
	  if (a2_as != AS_TRANS)
	    {
	      /* Routes were aggregated by old router and therefore AS4_PATH
	       * and AS4_AGGREGATOR is invalid
	       *
	       * Convert AS_PATH and AGGREGATOR to 4B format and finish.
	       */

	      a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool);
	      p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool);

	      return;
	    }
	  else
	    {
	      /* Common case, use AS4_AGGREGATOR attribute */
	      a2->u.ptr = a4->u.ptr;
	    }
	}
      else
	{
	  /* Common case, use old AGGREGATOR attribute */
	  a2->u.ptr = bgp_aggregator_convert_to_new(a2->u.ptr, pool);

	  if (a2_as == AS_TRANS)
	    log(L_WARN "BGP: AGGREGATOR attribute contain AS_TRANS, but AS4_AGGREGATOR is missing");
	}
    }
  else
    if (a4)
      log(L_WARN "BGP: AS4_AGGREGATOR attribute received, but AGGREGATOR attribute is missing");

  int p2_len = as_path_getlen(p2->u.ptr);
  int p4_len = p4 ? as_path_getlen(p4->u.ptr) : AS_PATH_MAXLEN;

  if (p2_len < p4_len)
    p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, NULL, AS_PATH_MAXLEN, pool);
  else
    p2->u.ptr = bgp_merge_as_paths(p2->u.ptr, p4->u.ptr, p2_len - p4_len, pool);

}

static void
bgp_remove_as4_attrs(struct bgp_proto *p, rta *a)
{
  unsigned id1 = EA_CODE(EAP_BGP, BA_AS4_PATH);
  unsigned id2 = EA_CODE(EAP_BGP, BA_AS4_AGGREGATOR);
  ea_list **el = &(a->eattrs);

  /* We know that ea_lists constructed in bgp_decode_attrs have one attribute per ea_list struct */
  while (*el != NULL)
    {
      unsigned fid = (*el)->attrs[0].id;

      if ((fid == id1) || (fid == id2))
	{
	  *el = (*el)->next;
	  if (p->as4_session)
	    log(L_WARN "BGP: Unexpected AS4_* attributes received");
	}
      else
	el = &((*el)->next);
    }
}

/**
 * bgp_decode_attrs - check and decode BGP attributes
 * @conn: connection
 * @attr: start of attribute block
 * @len: length of attribute block
 * @pool: linear pool to make all the allocations in
 * @mandatory: 1 iff presence of mandatory attributes has to be checked
 *
 * This function takes a BGP attribute block (a part of an Update message), checks
 * its consistency and converts it to a list of BIRD route attributes represented
 * by a &rta.
 */
struct rta *
bgp_decode_attrs(struct bgp_conn *conn, byte *attr, unsigned int len, struct linpool *pool, int mandatory)
{
  struct bgp_proto *bgp = conn->bgp;
  rta *a = lp_alloc(pool, sizeof(struct rta));
  unsigned int flags, code, l, i, type;
  int errcode;
  byte *z, *attr_start;
  byte seen[256/8];
  eattr *e;
  ea_list *ea;
  struct adata *ad;

  a->proto = &bgp->p;
  a->source = RTS_BGP;
  a->scope = SCOPE_UNIVERSE;
  a->cast = RTC_UNICAST;
  a->dest = RTD_ROUTER;
  a->flags = 0;
  a->aflags = 0;
  a->from = bgp->cf->remote_ip;
  a->eattrs = NULL;

  /* Parse the attributes */
  bzero(seen, sizeof(seen));
  DBG("BGP: Parsing attributes\n");
  while (len)
    {
      if (len < 2)
	goto malformed;
      attr_start = attr;
      flags = *attr++;
      code = *attr++;
      len -= 2;
      if (flags & BAF_EXT_LEN)
	{
	  if (len < 2)
	    goto malformed;
	  l = get_u16(attr);
	  attr += 2;
	  len -= 2;
	}
      else
	{
	  if (len < 1)
	    goto malformed;
	  l = *attr++;
	  len--;
	}
      if (l > len)
	goto malformed;
      len -= l;
      z = attr;
      attr += l;
      DBG("Attr %02x %02x %d\n", code, flags, l);
      if (seen[code/8] & (1 << (code%8)))
	goto malformed;
      if (ATTR_KNOWN(code))
	{
	  struct attr_desc *desc = &bgp_attr_table[code];
	  if (desc->expected_length >= 0 && desc->expected_length != (int) l)
	    { errcode = 5; goto err; }
	  if ((desc->expected_flags ^ flags) & (BAF_OPTIONAL | BAF_TRANSITIVE))
	    { errcode = 4; goto err; }
	  if (!desc->allow_in_ebgp && !bgp->is_internal)
	    continue;
	  if (desc->validate)
	    {
	      errcode = desc->validate(bgp, z, l);
	      if (errcode > 0)
		goto err;
	      if (errcode < 0)
		continue;
	    }
	  type = desc->type;
	}
      else				/* Unknown attribute */
	{
	  if (!(flags & BAF_OPTIONAL))
	    { errcode = 2; goto err; }
	  type = EAF_TYPE_OPAQUE;
	}
      seen[code/8] |= (1 << (code%8));
      ea = lp_alloc(pool, sizeof(ea_list) + sizeof(eattr));
      ea->next = a->eattrs;
      a->eattrs = ea;
      ea->flags = 0;
      ea->count = 1;
      ea->attrs[0].id = EA_CODE(EAP_BGP, code);
      ea->attrs[0].flags = flags;
      ea->attrs[0].type = type;
      if (type & EAF_EMBEDDED)
	ad = NULL;
      else
	{
	  ad = lp_alloc(pool, sizeof(struct adata) + l);
	  ea->attrs[0].u.ptr = ad;
	  ad->length = l;
	  memcpy(ad->data, z, l);
	}
      switch (type)
	{
	case EAF_TYPE_ROUTER_ID:
	case EAF_TYPE_INT:
	  if (l == 1)
	    ea->attrs[0].u.data = *z;
	  else
	    ea->attrs[0].u.data = get_u32(z);
	  break;
	case EAF_TYPE_IP_ADDRESS:
	  ipa_ntoh(*(ip_addr *)ad->data);
	  break;
	case EAF_TYPE_INT_SET:
	  {
	    u32 *z = (u32 *) ad->data;
	    for(i=0; i<ad->length/4; i++)
	      z[i] = ntohl(z[i]);
	    break;
	  }
	}
    }

#ifdef IPV6
  if (seen[BA_MP_REACH_NLRI / 8] & (1 << (BA_MP_REACH_NLRI % 8)))
    mandatory = 1;
#endif

  /* Check if all mandatory attributes are present */
  if (mandatory)
    {
      for(i=0; i < ARRAY_SIZE(bgp_mandatory_attrs); i++)
	{
	  code = bgp_mandatory_attrs[i];
	  if (!(seen[code/8] & (1 << (code%8))))
	    {
	      bgp_error(conn, 3, 3, &bgp_mandatory_attrs[i], 1);
	      return NULL;
	    }
	}
    }
  
  /* When receiving attributes from non-AS4-aware BGP speaker,
   * we have to reconstruct 4B AS_PATH and AGGREGATOR attributes
   */
  if (bgp_as4_support && (! bgp->as4_session))
    bgp_reconstruct_4b_atts(bgp, a, pool);

  if (bgp_as4_support)
    bgp_remove_as4_attrs(bgp, a);

  /* If the AS path attribute contains our AS, reject the routes */
  if (bgp_as_path_loopy(bgp, a))
    goto loop;

  /* Two checks for IBGP loops caused by route reflection, RFC 4456 */ 
  if (bgp_originator_id_loopy(bgp, a) ||
      bgp_cluster_list_loopy(bgp, a))
    goto loop;

  /* If there's no local preference, define one */
  if (!(seen[0] & (1 << BA_LOCAL_PREF)))
    bgp_attach_attr(&a->eattrs, pool, BA_LOCAL_PREF, 0);
  return a;

loop:
  DBG("BGP: Path loop!\n");
  return NULL;

malformed:
  bgp_error(conn, 3, 1, NULL, 0);
  return NULL;

err:
  bgp_error(conn, 3, errcode, attr_start, z+l-attr_start);
  return NULL;
}

int
bgp_get_attr(eattr *a, byte *buf, int buflen)
{
  unsigned int i = EA_ID(a->id);
  struct attr_desc *d;

  if (ATTR_KNOWN(i))
    {
      d = &bgp_attr_table[i];
      buf += bsprintf(buf, "%s", d->name);
      if (d->format)
	{
	  *buf++ = ':';
	  *buf++ = ' ';
	  d->format(a, buf, buflen);
	  return GA_FULL;
	}
      return GA_NAME;
    }
  bsprintf(buf, "%02x%s", i, (a->flags & BAF_TRANSITIVE) ? " [t]" : "");
  return GA_NAME;
}

void
bgp_attr_init(struct bgp_proto *p)
{
  p->hash_size = 256;
  p->hash_limit = p->hash_size * 4;
  p->bucket_hash = mb_allocz(p->p.pool, p->hash_size * sizeof(struct bgp_bucket *));
  init_list(&p->bucket_queue);
  p->withdraw_bucket = NULL;
  fib_init(&p->prefix_fib, p->p.pool, sizeof(struct bgp_prefix), 0, bgp_init_prefix);
}

void
bgp_get_route_info(rte *e, byte *buf, ea_list *attrs)
{
  eattr *p = ea_find(attrs, EA_CODE(EAP_BGP, BA_AS_PATH));
  eattr *o = ea_find(attrs, EA_CODE(EAP_BGP, BA_ORIGIN));
  u32 origas;

  buf += bsprintf(buf, " (%d) [", e->pref);
  if (p && as_path_get_first(p->u.ptr, &origas))
    buf += bsprintf(buf, "AS%u", origas);
  if (o)
    buf += bsprintf(buf, "%c", "ie?"[o->u.data]);
  strcpy(buf, "]");
}