summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOndrej Zajicek <santiago@crfreenet.org>2010-12-07 23:35:39 +0100
committerOndrej Zajicek <santiago@crfreenet.org>2010-12-07 23:35:39 +0100
commit57c574d82a44d10143aba7aaea6d1384d850c079 (patch)
tree095e00deaeedd0907652dd2e6a01f40968281bde
parent9852f81064a38d35ff1bd5cc9fab7fc33926c83c (diff)
Multipath support for OSPF
-rw-r--r--proto/ospf/config.Y7
-rw-r--r--proto/ospf/iface.c3
-rw-r--r--proto/ospf/ospf.c18
-rw-r--r--proto/ospf/ospf.h10
-rw-r--r--proto/ospf/rt.c600
-rw-r--r--proto/ospf/rt.h33
-rw-r--r--proto/ospf/topology.c4
-rw-r--r--proto/ospf/topology.h6
8 files changed, 445 insertions, 236 deletions
diff --git a/proto/ospf/config.Y b/proto/ospf/config.Y
index 3af879d6..59e1fbe0 100644
--- a/proto/ospf/config.Y
+++ b/proto/ospf/config.Y
@@ -51,7 +51,7 @@ CF_KEYWORDS(HELLO, TRANSMIT, PRIORITY, DEAD, NONBROADCAST, POINTOPOINT, TYPE)
CF_KEYWORDS(NONE, SIMPLE, AUTHENTICATION, STRICT, CRYPTOGRAPHIC)
CF_KEYWORDS(ELIGIBLE, POLL, NETWORKS, HIDDEN, VIRTUAL, CHECK, LINK)
CF_KEYWORDS(RX, BUFFER, LARGE, NORMAL, STUBNET, HIDDEN, SUMMARY)
-CF_KEYWORDS(WAIT, DELAY, LSADB)
+CF_KEYWORDS(WAIT, DELAY, LSADB, ECMP, LIMIT, WEIGHT)
%type <t> opttext
@@ -76,7 +76,9 @@ ospf_proto:
ospf_proto_item:
proto_item
| RFC1583COMPAT bool { OSPF_CFG->rfc1583 = $2; }
- | TICK expr { OSPF_CFG->tick = $2 ; if($2<=0) cf_error("Tick must be greater than zero"); }
+ | ECMP bool { OSPF_CFG->ecmp = $2 ? DEFAULT_ECMP_LIMIT : 0; }
+ | ECMP bool LIMIT expr { /* The limit ends up in the byte-sized proto_ospf.ecmp, so reject values that would be truncated there */ if ($4 < 0) cf_error("ECMP limit cannot be negative"); if ($4 > 255) cf_error("ECMP limit cannot be greater than 255"); OSPF_CFG->ecmp = $2 ? $4 : 0; }
+ | TICK expr { OSPF_CFG->tick = $2; if($2<=0) cf_error("Tick must be greater than zero"); }
| ospf_area '}'
;
@@ -193,6 +195,7 @@ ospf_iface_item:
| STRICT NONBROADCAST bool { OSPF_PATT->strictnbma = $3 ; }
| STUB bool { OSPF_PATT->stub = $2 ; }
| CHECK LINK bool { OSPF_PATT->check_link = $3; }
+ | ECMP WEIGHT expr { OSPF_PATT->ecmp_weight = $3 - 1; if (($3<1) || ($3>256)) cf_error("ECMP weight must be in range 1-256"); }
| NEIGHBORS '{' ipa_list '}'
| AUTHENTICATION NONE { OSPF_PATT->autype = OSPF_AUTH_NONE ; }
| AUTHENTICATION SIMPLE { OSPF_PATT->autype = OSPF_AUTH_SIMPLE ; }
diff --git a/proto/ospf/iface.c b/proto/ospf/iface.c
index 83ea1c29..8b21f94b 100644
--- a/proto/ospf/iface.c
+++ b/proto/ospf/iface.c
@@ -436,6 +436,7 @@ ospf_iface_new(struct proto_ospf *po, struct iface *iface, struct ifa *addr,
ifa->ioprob = OSPF_I_OK;
ifa->rxbuf = ip->rxbuf;
ifa->check_link = ip->check_link;
+ ifa->ecmp_weight = ip->ecmp_weight;
#ifdef OSPFv2
ifa->autype = ip->autype;
@@ -795,6 +796,8 @@ ospf_iface_info(struct ospf_iface *ifa)
ifa->stub ? "(stub)" : "");
cli_msg(-1015, "\tPriority: %u", ifa->priority);
cli_msg(-1015, "\tCost: %u", ifa->cost);
+ if (ifa->oa->po->ecmp)
+ cli_msg(-1015, "\tECMP weight: %d", ((int) ifa->ecmp_weight) + 1);
cli_msg(-1015, "\tHello timer: %u", ifa->helloint);
if (ifa->type == OSPF_IT_NBMA)
diff --git a/proto/ospf/ospf.c b/proto/ospf/ospf.c
index 026d9751..19e68e24 100644
--- a/proto/ospf/ospf.c
+++ b/proto/ospf/ospf.c
@@ -147,6 +147,7 @@ ospf_start(struct proto *p)
po->router_id = proto_get_router_id(p->cf);
po->rfc1583 = c->rfc1583;
po->ebit = 0;
+ po->ecmp = c->ecmp;
po->tick = c->tick;
po->disp_timer = tm_new(p->pool);
po->disp_timer->data = po;
@@ -157,6 +158,7 @@ ospf_start(struct proto *p)
po->lsab_size = 256;
po->lsab_used = 0;
po->lsab = mb_alloc(p->pool, po->lsab_size);
+ po->nhpool = lp_new(p->pool, 12*sizeof(struct mpnh));
init_list(&(po->iface_list));
init_list(&(po->area_list));
fib_init(&po->rtf, p->pool, sizeof(ort), 0, ospf_rt_initort);
@@ -514,6 +516,13 @@ ospf_shutdown(struct proto *p)
if (ifa->state > OSPF_IS_DOWN)
ospf_iface_shutdown(ifa);
+ /* Cleanup locked rta entries */
+ FIB_WALK(&po->rtf, nftmp)
+ {
+ rta_free(((ort *) nftmp)->old_rta);
+ }
+ FIB_WALK_END;
+
return PS_DOWN;
}
@@ -648,6 +657,7 @@ ospf_reconfigure(struct proto *p, struct proto_config *c)
schedule_rtcalc(po);
po->tick = new->tick;
+ po->ecmp = new->ecmp;
po->disp_timer->recurrent = po->tick;
tm_start(po->disp_timer, 1);
@@ -767,6 +777,14 @@ ospf_reconfigure(struct proto *p, struct proto_config *c)
ospf_iface_sm(ifa, ifa->check_link ? ISM_LOOP : ISM_UNLOOP);
}
+ /* ECMP weight */
+ if (oldip->ecmp_weight != newip->ecmp_weight)
+ {
+ ifa->ecmp_weight = newip->ecmp_weight;
+ OSPF_TRACE(D_EVENTS, "Changing ECMP weight of interface %s from %d to %d",
+ ifa->iface->name, (int)oldip->ecmp_weight + 1, (int)newip->ecmp_weight + 1);
+ }
+
/* strict nbma */
if ((oldip->strictnbma == 0) && (newip->strictnbma != 0))
{
diff --git a/proto/ospf/ospf.h b/proto/ospf/ospf.h
index 3345d4fc..2ef0180c 100644
--- a/proto/ospf/ospf.h
+++ b/proto/ospf/ospf.h
@@ -74,6 +74,7 @@ do { if ((p->debug & D_PACKETS) || OSPF_FORCE_DEBUG) \
#define DEFAULT_OSPFTICK 1
#define DEFAULT_RFC1583 0 /* compatibility with rfc1583 */
#define DEFAULT_STUB_COST 1000
+#define DEFAULT_ECMP_LIMIT 16
struct ospf_config
@@ -81,6 +82,7 @@ struct ospf_config
struct proto_config c;
unsigned tick;
int rfc1583;
+ int ecmp;
list area_list;
};
@@ -247,6 +249,7 @@ struct ospf_iface
u8 sk_dr; /* Socket is a member of DRouters group */
u16 rxbuf; /* Buffer size */
u8 check_link; /* Whether iface link change is used */
+ u8 ecmp_weight; /* Weight used for ECMP */
};
struct ospf_md5
@@ -730,11 +733,13 @@ struct proto_ospf
list area_list;
int areano; /* Number of area I belong to */
struct fib rtf; /* Routing table */
- int rfc1583; /* RFC1583 compatibility */
- int ebit; /* Did I originate any ext lsa? */
+ byte rfc1583; /* RFC1583 compatibility */
+ byte ebit; /* Did I originate any ext lsa? */
+ byte ecmp; /* Maximal number of nexthops in ECMP route, or 0 */
struct ospf_area *backbone; /* If exists */
void *lsab; /* LSA buffer used when originating router LSAs */
int lsab_size, lsab_used;
+ linpool *nhpool; /* Linpool used for next hops computed in SPF */
u32 router_id;
};
@@ -756,6 +761,7 @@ struct ospf_iface_patt
u32 vid;
u16 rxbuf;
u8 check_link;
+ u8 ecmp_weight;
#define OSPF_RXBUF_NORMAL 0
#define OSPF_RXBUF_LARGE 1
#define OSPF_RXBUF_MINSIZE 256 /* Minimal allowed size */
diff --git a/proto/ospf/rt.c b/proto/ospf/rt.c
index 55cd1cc3..6b8886ba 100644
--- a/proto/ospf/rt.c
+++ b/proto/ospf/rt.c
@@ -10,10 +10,7 @@
static void add_cand(list * l, struct top_hash_entry *en,
struct top_hash_entry *par, u32 dist,
- struct ospf_area *oa);
-static int calc_next_hop(struct ospf_area *oa,
- struct top_hash_entry *en,
- struct top_hash_entry *par);
+ struct ospf_area *oa, struct ospf_lsa_rt_link *rtl);
static void rt_sync(struct proto_ospf *po);
/* In ospf_area->rtr we store paths to routers, but we use RID (and not IP address)
@@ -25,20 +22,48 @@ static void rt_sync(struct proto_ospf *po);
#endif
-static inline void reset_ri(orta * orta)
+static inline void reset_ri(ort *ort)
{
- bzero(orta, sizeof(orta));
+ bzero(&ort->n, sizeof(orta)); /* zero the whole current-route record n (the parameter is no longer named orta, so sizeof(orta) is the struct size); old_* fields are left untouched */
}
void
ospf_rt_initort(struct fib_node *fn)
{
ort *ri = (ort *) fn;
- reset_ri(&ri->n);
- reset_ri(&ri->o);
+ reset_ri(ri);
+ ri->old_rta = NULL; /* no cached rta yet, i.e. no route was sent in a previous update */
ri->fn.x0 = 0;
}
+static inline int
+unresolved_vlink(struct mpnh *nhs)
+{
+ return nhs && !nhs->iface; /* non-empty list whose head has no iface, i.e. a dummy vlink nexthop */
+}
+/*
+ * Allocate a single nexthop node from po->nhpool and fill it with the given
+ * gateway, interface and weight. The node is returned unlinked (next == NULL).
+ * po->nhpool is flushed at the end of ospf_rt_spf(), so the node is only
+ * usable during one SPF calculation.
+ */
+static inline struct mpnh *
+new_nexthop(struct proto_ospf *po, ip_addr gw, struct iface *iface, unsigned char weight)
+{
+ struct mpnh *nh = lp_alloc(po->nhpool, sizeof(struct mpnh));
+ nh->gw = gw;
+ nh->iface = iface;
+ nh->next = NULL;
+ nh->weight = weight;
+ return nh;
+}
+/*
+ * Duplicate one nexthop node into po->nhpool. Only gw, iface and weight are
+ * copied; the copy is returned unlinked (next == NULL), so the rest of the
+ * source list is not followed.
+ */
+static inline struct mpnh *
+copy_nexthop(struct proto_ospf *po, struct mpnh *src)
+{
+ struct mpnh *nh = lp_alloc(po->nhpool, sizeof(struct mpnh));
+ nh->gw = src->gw;
+ nh->iface = src->iface;
+ nh->next = NULL;
+ nh->weight = src->weight;
+ return nh;
+}
+
/* If new is better return 1 */
static int
@@ -234,8 +259,7 @@ add_network(struct ospf_area *oa, ip_addr px, int pxlen, int metric, struct top_
.tag = 0,
.rid = en->lsa.rt,
.oa = oa,
- .ifa = en->nhi,
- .nh = en->nh
+ .nhs = en->nhs
};
if (en == oa->rt)
@@ -248,8 +272,8 @@ add_network(struct ospf_area *oa, ip_addr px, int pxlen, int metric, struct top_
* be removed in rt_sync().
*/
- nf.ifa = find_stub_src(oa, px, pxlen);
- nf.nh = IPA_NONE;
+ struct ospf_iface *ifa = find_stub_src(oa, px, pxlen);
+ nf.nhs = ifa ? new_nexthop(oa->po, IPA_NONE, ifa->iface, ifa->ecmp_weight) : NULL;
}
ri_install_net(oa->po, px, pxlen, &nf);
@@ -372,7 +396,7 @@ ospf_rt_spfa_rtlinks(struct ospf_area *oa, struct top_hash_entry *act, struct to
if (tmp)
DBG("Going to add cand, Mydist: %u, Req: %u\n",
tmp->dist, act->dist + rtl->metric);
- add_cand(&oa->cand, tmp, act, act->dist + rtl->metric, oa);
+ add_cand(&oa->cand, tmp, act, act->dist + rtl->metric, oa, rtl);
}
}
@@ -439,8 +463,7 @@ ospf_rt_spfa(struct ospf_area *oa)
.tag = 0,
.rid = act->lsa.rt,
.oa = oa,
- .ifa = act->nhi,
- .nh = act->nh
+ .nhs = act->nhs
};
ri_install_rt(oa, act->lsa.rt, &nf);
}
@@ -471,7 +494,7 @@ ospf_rt_spfa(struct ospf_area *oa)
DBG("Found :-)\n");
else
DBG("Not found!\n");
- add_cand(&oa->cand, tmp, act, act->dist, oa);
+ add_cand(&oa->cand, tmp, act, act->dist, oa, NULL);
}
break;
}
@@ -661,8 +684,7 @@ ospf_rt_sum(struct ospf_area *oa)
.tag = 0,
.rid = en->lsa.rt, /* ABR ID */
.oa = oa,
- .ifa = abr->n.ifa,
- .nh = abr->n.nh
+ .nhs = abr->n.nhs
};
if (type == ORT_NET)
@@ -762,13 +784,18 @@ ospf_rt_sum_tr(struct ospf_area *oa)
metric = abr->n.metric1 + metric; /* IAC */
/* 16.3. (5) */
- if (metric <= re->n.metric1)
+ if ((metric < re->n.metric1) ||
+ ((metric == re->n.metric1) && unresolved_vlink(re->n.nhs)))
{
/* We want to replace the next-hop even if the metric is equal
- to replace a virtual next-hop through vlink with a real one */
+ to replace a virtual next-hop through vlink with a real one.
+ Proper ECMP would merge nexthops here, but we do not do that.
+ We restrict nexthops to fit one area to simplify check
+ 12.4.3 p4 in decide_sum_lsa() */
+
re->n.metric1 = metric;
- re->n.nh = abr->n.nh;
- re->n.ifa = abr->n.ifa;
+ re->n.voa = oa;
+ re->n.nhs = abr->n.nhs;
}
}
}
@@ -811,7 +838,7 @@ decide_sum_lsa(struct ospf_area *oa, ort *nf, int dest)
return 0;
/* 12.4.3 p4 */
- if (nf->n.ifa && (nf->n.ifa->oa->areaid == oa->areaid))
+ if (nf->n.voa && (nf->n.voa->areaid == oa->areaid))
return 0;
/* 12.4.3 p5 */
@@ -912,18 +939,20 @@ ospf_check_vlinks(struct proto_ospf *po)
struct top_hash_entry *tmp;
tmp = ospf_hash_find_rt(po->gr, iface->voa->areaid, iface->vid);
- if (tmp && (tmp->color == INSPF) && ipa_nonzero(tmp->lb))
+ if (tmp && (tmp->color == INSPF) && ipa_nonzero(tmp->lb) && tmp->nhs)
{
+ struct ospf_iface *nhi = ospf_iface_find(po, tmp->nhs->iface);
+
if ((iface->state != OSPF_IS_PTP)
- || (iface->vifa != tmp->nhi)
+ || (iface->vifa != nhi)
|| !ipa_equal(iface->vip, tmp->lb))
{
OSPF_TRACE(D_EVENTS, "Vlink peer %R found", tmp->lsa.id);
ospf_iface_sm(iface, ISM_DOWN);
- iface->vifa = tmp->nhi;
- iface->iface = tmp->nhi->iface;
- iface->addr = tmp->nhi->addr;
- iface->sk = tmp->nhi->sk;
+ iface->vifa = nhi;
+ iface->iface = nhi->iface;
+ iface->addr = nhi->addr;
+ iface->sk = nhi->sk;
iface->cost = tmp->dist;
iface->vip = tmp->lb;
ospf_iface_sm(iface, ISM_UP);
@@ -959,8 +988,8 @@ ospf_rt_abr(struct proto_ospf *po)
/* RFC 2328 G.3 - incomplete resolution of virtual next hops */
- if (nf->n.type && nf->n.ifa && (nf->n.ifa->type == OSPF_IT_VLINK))
- reset_ri(&nf->n);
+ if (nf->n.type && unresolved_vlink(nf->n.nhs))
+ reset_ri(nf);
/* Compute condensed area networks */
@@ -979,7 +1008,7 @@ ospf_rt_abr(struct proto_ospf *po)
/* 16.2. (3) */
if (nfi->n.type == RTS_OSPF_IA)
- reset_ri(&nfi->n);
+ reset_ri(nfi);
}
if (anet->metric < nf->n.metric1)
@@ -1055,10 +1084,10 @@ ospf_ext_spf(struct proto_ospf *po)
struct proto *p = &po->proto;
struct ospf_lsa_ext *le;
int pxlen, ebit, rt_fwaddr_valid;
- ip_addr ip, nh, rtid, rt_fwaddr;
- struct ospf_iface *nhi = NULL;
+ ip_addr ip, rtid, rt_fwaddr;
u32 br_metric, rt_metric, rt_tag;
struct ospf_area *atmp;
+ struct mpnh* nhs = NULL;
OSPF_TRACE(D_EVENTS, "Starting routing table calculation for ext routes");
@@ -1119,8 +1148,6 @@ ospf_ext_spf(struct proto_ospf *po)
p->name, en->lsa.type, en->lsa.id, en->lsa.rt);
continue;
}
- nhi = NULL;
- nh = IPA_NONE;
/* 16.4. (3) */
/* If there are more areas, we already precomputed preferred ASBR entries
@@ -1138,8 +1165,7 @@ ospf_ext_spf(struct proto_ospf *po)
if (!rt_fwaddr_valid)
{
nf2 = nf1;
- nh = nf1->n.nh;
- nhi = nf1->n.ifa;
+ nhs = nf1->n.nhs;
br_metric = nf1->n.metric1;
}
else
@@ -1152,12 +1178,13 @@ ospf_ext_spf(struct proto_ospf *po)
continue;
/* Next-hop is a part of a configured stubnet */
- if (!nf2->n.ifa)
+ if (!nf2->n.nhs)
continue;
- /* If nh is zero, it is a device route */
- nh = ipa_nonzero(nf2->n.nh) ? nf2->n.nh : rt_fwaddr;
- nhi = nf2->n.ifa;
+ nhs = nf2->n.nhs;
+ /* If gw is zero, it is a device route */
+ if (ipa_zero(nhs->gw))
+ nhs = new_nexthop(po, rt_fwaddr, nhs->iface, nhs->weight);
br_metric = nf2->n.metric1;
}
@@ -1183,14 +1210,14 @@ ospf_ext_spf(struct proto_ospf *po)
nfa.tag = rt_tag;
nfa.rid = en->lsa.rt;
nfa.oa = nf1->n.oa; /* undefined in RFC 2328 */
- nfa.ifa = nhi;
- nfa.nh = nh;
+ nfa.voa = NULL;
+ nfa.nhs = nhs;
ri_install_ext(po, ip, pxlen, &nfa);
}
}
-/* Cleanup of routing tables and data Cleanup */
+/* Cleanup of routing tables and data */
void
ospf_rt_reset(struct proto_ospf *po)
{
@@ -1203,9 +1230,8 @@ ospf_rt_reset(struct proto_ospf *po)
FIB_WALK(&po->rtf, nftmp)
{
ri = (ort *) nftmp;
- memcpy(&ri->o, &ri->n, sizeof(orta)); /* Backup old data */
ri->fn.x0 = 0;
- reset_ri(&ri->n);
+ reset_ri(ri);
}
FIB_WALK_END;
@@ -1214,8 +1240,7 @@ ospf_rt_reset(struct proto_ospf *po)
{
en->color = OUTSPF;
en->dist = LSINFINITY;
- en->nhi = NULL;
- en->nh = IPA_NONE;
+ en->nhs = NULL;
en->lb = IPA_NONE;
}
@@ -1225,8 +1250,7 @@ ospf_rt_reset(struct proto_ospf *po)
FIB_WALK(&oa->rtr, nftmp)
{
ri = (ort *) nftmp;
- memcpy(&ri->o, &ri->n, sizeof(orta)); /* Backup old data */
- reset_ri(&ri->n);
+ reset_ri(ri);
}
FIB_WALK_END;
@@ -1288,15 +1312,220 @@ ospf_rt_spf(struct proto_ospf *po)
ospf_ext_spf(po);
rt_sync(po);
-
+ lp_flush(po->nhpool);
+
po->calcrt = 0;
}
+/*
+ * Return nonzero when LSA entry en matches the designated router recorded on
+ * ifa: the DR router ID must equal en->lsa.rt and, in OSPFv2, the DR IP
+ * address (as u32) must equal en->lsa.id; in OSPFv3 the DR interface ID must
+ * equal en->lsa.id. Used by calc_next_hop() for network-LSA entries.
+ */
+static inline int
+match_dr(struct ospf_iface *ifa, struct top_hash_entry *en)
+{
+#ifdef OSPFv2
+ return (ifa->drid == en->lsa.rt) && (ipa_to_u32(ifa->drip) == en->lsa.id);
+#else /* OSPFv3 */
+ return (ifa->drid == en->lsa.rt) && (ifa->dr_iface_id == en->lsa.id);
+#endif
+}
+
+/*
+ * Return nonzero when router-LSA link rtl describes the PTP interface ifa:
+ * the interface type must be OSPF_IT_PTP, its cost must equal the link metric,
+ * and the link must identify the interface. In OSPFv2 that is rtl->data equal
+ * to the interface index for unnumbered addresses or to the interface IP
+ * address (as u32) otherwise; in OSPFv3 it is rtl->lif equal to the interface
+ * index.
+ */
+static inline int
+match_rtlink(struct ospf_iface *ifa, struct ospf_lsa_rt_link *rtl)
+{
+#ifdef OSPFv2
+ return (ifa->type == OSPF_IT_PTP) && (ifa->cost == rtl->metric) &&
+ (((ifa->addr->flags & IA_UNNUMBERED) ? ifa->iface->index :
+ ipa_to_u32(ifa->addr->ip)) == rtl->data);
+#else /* OSPFv3 */
+ return (ifa->type == OSPF_IT_PTP) && (ifa->cost == rtl->metric) &&
+ (ifa->iface->index == rtl->lif);
+#endif
+}
+/*
+ * Decide whether a node may simply take over its parent's nexthop list pn.
+ * True for a non-empty list whose head is a gateway nexthop or a dummy vlink
+ * nexthop; false for an empty list and for a device nexthop (iface set, no
+ * gateway), where calc_next_hop() has to compute a real nexthop instead.
+ */
+static inline int
+inherit_nexthops(struct mpnh *pn)
+{
+ /* Proper nexthops (with defined GW) or dummy vlink nexthops (without iface) */
+ return pn && (ipa_nonzero(pn->gw) || !pn->iface);
+}
+/*
+ * Compute the nexthop list for node en when it is reached through parent par.
+ *
+ * rtl is the router-LSA link that is being followed from par to en. It may be
+ * NULL (ospf_rt_spfa() passes NULL in one of its add_cand() calls); it is only
+ * dereferenced in the case where en is a router and par is the root (oa->rt).
+ *
+ * Returns par's own list (shared, not copied) when it can be inherited, a
+ * newly allocated single nexthop otherwise, or NULL when no nexthop could be
+ * determined. Only the final fall-through ("bad") path logs an error; the
+ * other NULL returns are silent here.
+ */
+static struct mpnh *
+calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en,
+ struct top_hash_entry *par, struct ospf_lsa_rt_link *rtl)
+{
+ // struct proto *p = &oa->po->proto;
+ struct proto_ospf *po = oa->po;
+ struct mpnh *pn = par->nhs;
+ struct ospf_iface *ifa;
+ u32 rid = en->lsa.rt;
+
+ /* 16.1.1. The next hop calculation */
+ DBG(" Next hop calculating for id: %R rt: %R type: %u\n",
+ en->lsa.id, en->lsa.rt, en->lsa.type);
+
+ /* Usually, we inherit parent nexthops */
+ if (inherit_nexthops(pn))
+ return pn;
+
+ /*
+ * There are three cases:
+ * 1) en is a local network (and par is root)
+ * 2) en is a ptp or ptmp neighbor (and par is root)
+ * 3) en is a bcast or nbma neighbor (and par is local network)
+ */
+
+ /* The first case - local network */
+ if ((en->lsa.type == LSA_T_NET) && (par == oa->rt))
+ {
+ WALK_LIST(ifa, po->iface_list)
+ if (match_dr(ifa, en))
+ return new_nexthop(po, IPA_NONE, ifa->iface, ifa->ecmp_weight);
+
+ return NULL;
+ }
+
+ /* The second case - ptp or ptmp neighbor */
+ if ((en->lsa.type == LSA_T_RT) && (par == oa->rt))
+ {
+ if (rtl->type == LSART_VLNK)
+ return new_nexthop(po, IPA_NONE, NULL, 0); /* dummy vlink nexthop: no gw, no iface */
+
+ WALK_LIST(ifa, po->iface_list)
+ if (match_rtlink(ifa, rtl))
+ {
+ struct ospf_neighbor *m = find_neigh(ifa, rid);
+ if (m && (m->state == NEIGHBOR_FULL))
+ return new_nexthop(po, m->ip, ifa->iface, ifa->ecmp_weight);
+ }
+
+ return NULL;
+ }
+
+ /* The third case - bcast or nbma neighbor */
+ if ((en->lsa.type == LSA_T_RT) && (par->lsa.type == LSA_T_NET))
+ {
+ /* par->nhs (pn) should be defined from parent's calc_next_hop() */
+ if (!pn)
+ goto bad;
+
+#ifdef OSPFv2
+ /*
+ * In this case, next-hop is the same as link-back, which is
+ * already computed in link_back().
+ */
+ if (ipa_zero(en->lb))
+ goto bad;
+
+ return new_nexthop(po, en->lb, pn->iface, pn->weight);
+
+#else /* OSPFv3 */
+ /*
+ * Next-hop is taken from lladdr field of Link-LSA, en->lb_id
+ * is computed in link_back().
+ */
+ struct top_hash_entry *lhe;
+ lhe = ospf_hash_find(po->gr, pn->iface->index, en->lb_id, rid, LSA_T_LINK);
+
+ if (!lhe)
+ return NULL;
+
+ struct ospf_lsa_link *llsa = lhe->lsa_body;
+
+ if (ipa_zero(llsa->lladdr))
+ return NULL;
+
+ return new_nexthop(po, llsa->lladdr, pn->iface, pn->weight);
+#endif
+ }
+
+ bad:
+ /* Probably bug or some race condition, we log it */
+ log(L_ERR "Unexpected case in next hop calculation");
+ return NULL;
+}
+
+/* Compare nexthops during merge.
+ We need to maintain nhs sorted to eliminate duplicates.
+ Returns a negative value when s1 sorts before s2, positive when after,
+ zero when both describe the same nexthop. Order: higher weight first,
+ then gateway address (ipa_compare()), then interface index. A NULL
+ argument sorts after any node, so an exhausted list yields to the other
+ one. For two non-NULL nodes with equal weight and gateway the iface
+ pointers are dereferenced, so both must have an interface. */
+static int
+cmp_nhs(struct mpnh *s1, struct mpnh *s2)
+{
+ int r;
+
+ if (!s1)
+ return 1;
+
+ if (!s2)
+ return -1;
+
+ r = ((int) s2->weight) - ((int) s1->weight);
+ if (r)
+ return r;
+
+ r = ipa_compare(s1->gw, s2->gw);
+ if (r)
+ return r;
+
+ return ((int) s1->iface->index) - ((int) s2->iface->index);
+}
+/*
+ * Merge nexthop list new into en->nhs. Both lists are walked in cmp_nhs()
+ * order (they are expected to be sorted that way already); nodes that compare
+ * equal are taken only once, and at most po->ecmp nodes are kept. Nothing is
+ * done when en->nhs already is new. On return en->nhs is the merged list and
+ * en->nhs_reuse is set.
+ */
+static void
+merge_nexthops(struct proto_ospf *po, struct top_hash_entry *en,
+ struct top_hash_entry *par, struct mpnh *new)
+{
+ if (en->nhs == new)
+ return;
+
+ int r1 = en->nhs_reuse;
+ int r2 = (par->nhs != new);
+ int count = po->ecmp;
+ struct mpnh *s1 = en->nhs;
+ struct mpnh *s2 = new;
+ struct mpnh **n = &(en->nhs);
+
+ /*
+ * r1, r2 signal whether we can reuse nexthops from s1, s2.
+ * New nexthops (s2, new) can be reused if they are not inherited
+ * from the parent (i.e. they were allocated in calc_next_hop()).
+ * Current nexthops (s1, en->nhs) can be reused if they weren't
+ * inherited in previous steps (that is stored in nhs_reuse,
+ * i.e. created by merging or allocated in calc_next_hop()).
+ *
+ * Generally, a node first inherits shared nexthops from its
+ * parent and later possibly gets a reusable copy during merging.
+ */
+
+ while ((s1 || s2) && count--)
+ {
+ int cmp = cmp_nhs(s1, s2);
+ if (cmp < 0)
+ {
+ *n = r1 ? s1 : copy_nexthop(po, s1);
+ s1 = s1->next;
+ }
+ else if (cmp > 0)
+ {
+ *n = r2 ? s2 : copy_nexthop(po, s2);
+ s2 = s2->next;
+ }
+ else
+ {
+ *n = r1 ? s1 : (r2 ? s2 : copy_nexthop(po, s1)); /* same nexthop in both lists: keep exactly one node */
+ s1 = s1->next;
+ s2 = s2->next;
+ }
+ n = &((*n)->next);
+ }
+ *n = NULL; /* terminate the result, dropping anything beyond the limit */
+
+ en->nhs_reuse=1;
+}
+
/* Add LSA into list of candidates in Dijkstra's algorithm */
static void
add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par,
- u32 dist, struct ospf_area *oa)
+ u32 dist, struct ospf_area *oa, struct ospf_lsa_rt_link *rtl)
{
+ struct proto_ospf *po = oa->po;
node *prev, *n;
int added = 0;
struct top_hash_entry *act;
@@ -1321,24 +1550,48 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par,
return;
/* 16.1. (2d), also checks that dist < LSINFINITY */
- if (dist >= en->dist)
+ if (dist > en->dist)
return;
- /*
- * The line above (=) is not a bug, but we don't support multiple
- * next hops. I'll start as soon as nest will
- */
/* We should check whether there is a reverse link from en to par, */
if (!link_back(oa, en, par))
return;
- if (!calc_next_hop(oa, en, par))
+ struct mpnh *nhs = calc_next_hop(oa, en, par, rtl);
+ if (!nhs)
{
log(L_WARN "Cannot find next hop for LSA (Type: %04x, Id: %R, Rt: %R)",
en->lsa.type, en->lsa.id, en->lsa.rt);
return;
}
+ if (dist == en->dist)
+ {
+ /*
+ * For multipath, we should merge nexthops. We do not mix dummy
+ * vlink nexthops, device nexthops and gateway nexthops. We merge
+ * gateway nexthops only. We prefer device nexthops over gateway
+ * nexthops and gateway nexthops over vlink nexthops. We either
+ * keep old nexthops, merge old and new, or replace old with new.
+ *
+ * We know that en->color == CANDIDATE and en->nhs is defined.
+ */
+ struct mpnh *onhs = en->nhs;
+
+ /* Keep old ones */
+ if (!po->ecmp || !nhs->iface || (onhs->iface && ipa_zero(onhs->gw)))
+ return;
+
+ /* Merge old and new */
+ if (ipa_nonzero(nhs->gw) && ipa_nonzero(onhs->gw))
+ {
+ merge_nexthops(po, en, par, nhs);
+ return;
+ }
+
+ /* Fallback to replace old ones */
+ }
+
DBG(" Adding candidate: rt: %R, id: %R, type: %u\n",
en->lsa.rt, en->lsa.id, en->lsa.type);
@@ -1346,8 +1599,10 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par,
{ /* We found a shorter path */
rem_node(&en->cn);
}
+ en->nhs = nhs;
en->dist = dist;
en->color = CANDIDATE;
+ en->nhs_reuse = (par->nhs != nhs);
prev = NULL;
@@ -1361,8 +1616,7 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par,
{
act = SKIP_BACK(struct top_hash_entry, cn, n);
if ((act->dist > dist) ||
- ((act->dist == dist) && (act->lsa.type == LSA_T_NET)))
- /* FIXME - shouldn't be here LSA_T_RT ??? */
+ ((act->dist == dist) && (act->lsa.type == LSA_T_RT)))
{
if (prev == NULL)
add_head(l, &en->cn);
@@ -1381,132 +1635,16 @@ add_cand(list * l, struct top_hash_entry *en, struct top_hash_entry *par,
}
}
-
static inline int
-match_dr(struct ospf_iface *ifa, struct top_hash_entry *en)
+ort_changed(ort *nf, rta *nr)
{
-#ifdef OSPFv2
- return (ifa->drid == en->lsa.rt) && (ipa_to_u32(ifa->drip) == en->lsa.id);
-#else /* OSPFv3 */
- return (ifa->drid == en->lsa.rt) && (ifa->dr_iface_id == en->lsa.id);
-#endif
-}
-
-static int
-calc_next_hop(struct ospf_area *oa, struct top_hash_entry *en,
- struct top_hash_entry *par)
-{
- // struct proto *p = &oa->po->proto;
- struct ospf_neighbor *neigh, *m;
- struct proto_ospf *po = oa->po;
- struct ospf_iface *ifa;
-
- /* 16.1.1. The next hop calculation */
- DBG(" Next hop called.\n");
- if (ipa_zero(par->nh))
- {
- u32 rid = en->lsa.rt;
- DBG(" Next hop calculating for id: %R rt: %R type: %u\n",
- en->lsa.id, en->lsa.rt, en->lsa.type);
-
- /*
- * There are three cases:
- * 1) en is a local network (and par is root)
- * 2) en is a ptp or ptmp neighbor (and par is root)
- * 3) en is a bcast or nbma neighbor (and par is local network)
- */
-
- /* The first case - local network */
- if ((en->lsa.type == LSA_T_NET) && (par == oa->rt))
- {
- WALK_LIST(ifa, po->iface_list)
- if (match_dr(ifa, en))
- {
- en->nh = IPA_NONE;
- en->nhi = ifa;
- return 1;
- }
- return 0;
- }
-
- /* The second case - ptp or ptmp neighbor */
- if ((en->lsa.type == LSA_T_RT) && (par == oa->rt))
- {
- /*
- * We don't know which iface was used to reach this neighbor
- * (there might be more parallel ifaces) so we will find
- * the best PTP iface with given fully adjacent neighbor.
- */
- neigh = NULL;
- WALK_LIST(ifa, po->iface_list)
- if ((ifa->type == OSPF_IT_PTP) || (ifa->type == OSPF_IT_VLINK))
- {
- m = find_neigh(ifa, rid);
- if (m && (m->state == NEIGHBOR_FULL))
- {
- if (!neigh || (m->ifa->cost < neigh->ifa->cost))
- neigh = m;
- }
- }
-
- if (!neigh)
- return 0;
-
- en->nh = neigh->ip;
- en->nhi = neigh->ifa;
- return 1;
- }
-
- /* The third case - bcast or nbma neighbor */
- if ((en->lsa.type == LSA_T_RT) && (par->lsa.type == LSA_T_NET))
- {
- /* par->nhi should be defined from parent's calc_next_hop() */
- if (!par->nhi)
- goto bad;
-
-#ifdef OSPFv2
- /*
- * In this case, next-hop is the same as link-back, which is
- * already computed in link_back().
- */
- if (ipa_zero(en->lb))
- goto bad;
-
- en->nh = en->lb;
- en->nhi = par->nhi;
- return 1;
-
-#else /* OSPFv3 */
- /*
- * Next-hop is taken from lladdr field of Link-LSA, en->lb_id
- * is computed in link_back().
- */
- struct top_hash_entry *lhe;
- lhe = ospf_hash_find(po->gr, par->nhi->iface->index, en->lb_id, rid, LSA_T_LINK);
-
- if (!lhe)
- return 0;
-
- struct ospf_lsa_link *llsa = lhe->lsa_body;
-
- if (ipa_zero(llsa->lladdr))
- return 0;
-
- en->nh = llsa->lladdr;
- en->nhi = par->nhi;
- return 1;
-#endif
- }
-
- bad:
- /* Probably bug or some race condition, we log it */
- log(L_ERR "Unexpected case in next hop calculation");
- return 0;
- }
-
- en->nh = par->nh;
- en->nhi = par->nhi;
- return 1;
+ rta *or = nf->old_rta;
+ return !or ||
+ (nf->n.metric1 != nf->old_metric1) || (nf->n.metric2 != nf->old_metric2) ||
+ (nf->n.tag != nf->old_tag) || (nf->n.rid != nf->old_rid) ||
+ (nr->source != or->source) || (nr->dest != or->dest) ||
+ (nr->iface != or->iface) || !ipa_equal(nr->gw, or->gw) ||
+ !mpnh_same(nr->nexthops, or->nexthops);
}
static void
@@ -1530,57 +1668,83 @@ again1:
{
nf = (ort *) nftmp;
- /* Sanity check of next-hop address */
- if (nf->n.type && ipa_nonzero(nf->n.nh))
+ /* Sanity check of next-hop addresses, failure should not happen */
+ if (nf->n.type)
{
- neighbor *ng = neigh_find2(p, &nf->n.nh, nf->n.ifa->iface, 0);
- if (!ng || (ng->scope == SCOPE_HOST))
- reset_ri(&nf->n);
+ struct mpnh *nh;
+ for (nh = nf->n.nhs; nh; nh = nh->next)
+ if (ipa_nonzero(nh->gw))
+ {
+ neighbor *ng = neigh_find2(p, &nh->gw, nh->iface, 0);
+ if (!ng || (ng->scope == SCOPE_HOST))
+ { reset_ri(nf); break; }
+ }
}
if (po->areano > 1)
check_sum_net_lsa(po, nf);
/* Remove configured stubnets */
- if (!nf->n.ifa)
- reset_ri(&nf->n);
+ if (!nf->n.nhs)
+ reset_ri(nf);
- if (reload || memcmp(&nf->n, &nf->o, sizeof(orta)))
+ if (nf->n.type) /* Add the route */
{
- net *ne = net_get(p->table, nf->fn.prefix, nf->fn.pxlen);
-
- if (nf->n.type) /* Add the route */
+ rta a0 = {
+ .proto = p,
+ .source = nf->n.type,
+ .scope = SCOPE_UNIVERSE,
+ .cast = RTC_UNICAST,
+ };
+
+ if (nf->n.nhs->next)
{
- rta a0 = {
- .proto = p,
- .source = nf->n.type,
- .scope = SCOPE_UNIVERSE,
- .cast = RTC_UNICAST,
- .iface = nf->n.ifa->iface
- };
-
- if (ipa_nonzero(nf->n.nh))
- {
- a0.dest = RTD_ROUTER;
- a0.gw = nf->n.nh;
- }
- else
- a0.dest = RTD_DEVICE;
+ a0.dest = RTD_MULTIPATH;
+ a0.nexthops = nf->n.nhs;
+ }
+ else if (ipa_nonzero(nf->n.nhs->gw))
+ {
+ a0.dest = RTD_ROUTER;
+ a0.iface = nf->n.nhs->iface;
+ a0.gw = nf->n.nhs->gw;
+ }
+ else
+ {
+ a0.dest = RTD_DEVICE;
+ a0.iface = nf->n.nhs->iface;
+ }
- rte *e = rte_get_temp(&a0);
- e->u.ospf.metric1 = nf->n.metric1;
- e->u.ospf.metric2 = nf->n.metric2;
- e->u.ospf.tag = nf->n.tag;
- e->u.ospf.router_id = nf->n.rid;
+ if (reload || ort_changed(nf, &a0))
+ {
+ net *ne = net_get(p->table, nf->fn.prefix, nf->fn.pxlen);
+ rta *a = rta_lookup(&a0);
+ rte *e = rte_get_temp(a);
+
+ rta_free(nf->old_rta);
+ nf->old_rta = rta_clone(a);
+ e->u.ospf.metric1 = nf->old_metric1 = nf->n.metric1;
+ e->u.ospf.metric2 = nf->old_metric2 = nf->n.metric2;
+ e->u.ospf.tag = nf->old_tag = nf->n.tag;
+ e->u.ospf.router_id = nf->old_rid = nf->n.rid;
e->pflags = 0;
e->net = ne;
e->pref = p->preference;
+
+
+
DBG("Mod rte type %d - %I/%d via %I on iface %s, met %d\n",
a0.source, nf->fn.prefix, nf->fn.pxlen, a0.gw, a0.iface ? a0.iface->name : "(none)", nf->n.metric1);
rte_update(p->table, ne, p, p, e);
}
- else /* Remove the route */
- rte_update(p->table, ne, p, p, NULL);
+ }
+ else if (nf->old_rta)
+ {
+ /* Remove the route */
+ rta_free(nf->old_rta);
+ nf->old_rta = NULL;
+
+ net *ne = net_get(p->table, nf->fn.prefix, nf->fn.pxlen);
+ rte_update(p->table, ne, p, p, NULL);
}
/* Remove unused rt entry. Entries with fn.x0 == 1 are persistent. */
diff --git a/proto/ospf/rt.h b/proto/ospf/rt.h
index 78156c75..bf234f58 100644
--- a/proto/ospf/rt.h
+++ b/proto/ospf/rt.h
@@ -40,21 +40,32 @@ typedef struct orta
u32 tag;
u32 rid; /* Router ID of real advertising router */
struct ospf_area *oa;
- struct ospf_iface *ifa; /* Outgoing interface */
- ip_addr nh; /* Next hop */
+ struct ospf_area *voa; /* Used when route is replaced in ospf_rt_sum_tr(),
+ NULL otherwise */
+ struct mpnh *nhs; /* Next hops computed during SPF */
}
orta;
+// struct ospf_iface *ifa; /* Outgoing interface */
+// ip_addr nh; /* Next hop */
+
+
typedef struct ort
{
/*
* We use fn.x0 to mark persistent rt entries, that are needed for summary
* LSAs that don't have 'proper' rt entry (area networks + default to stubs)
* to keep uid stable (used for LSA ID in OSPFv3 - see fibnode_to_lsaid()).
+ *
+ * old_* values are here to represent the last route update. old_rta
+ * is cached (we keep a reference), mainly for multipath nexthops.
+ * old_rta == NULL means the route was not in the last update; in that
+ * case the other old_* values are not valid.
*/
struct fib_node fn;
orta n;
- orta o;
+ u32 old_metric1, old_metric2, old_tag, old_rid;
+ rta *old_rta;
}
ort;
@@ -64,18 +75,24 @@ ort;
* - only router, network and AS-external LSAs
* - lsa.age < LSA_MAXAGE
* - dist < LSINFINITY (or 2*LSINFINITY for ext-LSAs)
- * - nhi are non-NULL unless the node is oa->rt (calculating router itself)
- * - beware, nhi is not valid after SPF calculation
- * - nh is IFA_NONE iff the node is a local network
+ * - nhs is non-NULL unless the node is oa->rt (calculating router itself)
+ * - beware, nhs is not valid after SPF calculation
*
* Invariants for structs orta nodes of fib tables po->rtf, oa->rtr:
* - nodes may be invalid (fn.type == 0), in that case other invariants don't hold
* - n.metric1 may be at most a small multiple of LSINFINITY,
* therefore sums do not overflow
* - n.oa is always non-NULL
- * - n.ifa is always non-NULL with one exception - configured stubnet
- nodes (in po->rtf). In that case, n.nh is IFA_NONE.
+ * - n.nhs is always non-NULL with one exception - configured stubnet
+ * nodes (in po->rtf).
* - oa->rtr does not contain calculating router itself
+ *
+ * There are three types of nexthops in nhs fields:
+ * - gateway nexthops (non-NULL iface, gw != IPA_NONE)
+ * - device nexthops (non-NULL iface, gw == IPA_NONE)
+ * - dummy vlink nexthops (NULL iface, gw == IPA_NONE)
+ * These three types don't mix, nhs field contains either
+ * one device, one vlink node, or one/more gateway nodes.
*/
void ospf_rt_spf(struct proto_ospf *po);
diff --git a/proto/ospf/topology.c b/proto/ospf/topology.c
index 51e96c7f..e604bf87 100644
--- a/proto/ospf/topology.c
+++ b/proto/ospf/topology.c
@@ -1674,14 +1674,12 @@ ospf_hash_get(struct top_graph *f, u32 domain, u32 lsa, u32 rtr, u32 type)
e = sl_alloc(f->hash_slab);
e->color = OUTSPF;
e->dist = LSINFINITY;
- e->nhi = NULL;
- e->nh = IPA_NONE;
+ e->nhs = NULL;
e->lb = IPA_NONE;
e->lsa.id = lsa;
e->lsa.rt = rtr;
e->lsa.type = type;
e->lsa_body = NULL;
- e->nhi = NULL;
e->domain = domain;
e->next = *ee;
*ee = e;
diff --git a/proto/ospf/topology.h b/proto/ospf/topology.h
index b185c7f3..9521e3eb 100644
--- a/proto/ospf/topology.h
+++ b/proto/ospf/topology.h
@@ -20,9 +20,8 @@ struct top_hash_entry
// struct ospf_area *oa;
void *lsa_body;
bird_clock_t inst_t; /* Time of installation into DB */
- ip_addr nh; /* Next hop */
+ struct mpnh *nhs; /* Computed nexthops - valid only in ospf_rt_spf() */
ip_addr lb; /* In OSPFv2, link back address. In OSPFv3, any global address in the area useful for vlinks */
- struct ospf_iface *nhi; /* Next hop interface - valid only in ospf_rt_spf()*/
#ifdef OSPFv3
u32 lb_id; /* Interface ID of link back iface (for bcast or NBMA networks) */
#endif
@@ -32,7 +31,8 @@ struct top_hash_entry
#define OUTSPF 0
#define CANDIDATE 1
#define INSPF 2
- u8 padding;
+ u8 nhs_reuse; /* Whether nhs nodes can be reused during merging.
+ See a note in rt.c:merge_nexthops() */
};
struct top_graph