From 0c791f873aeb7c1052c97db7da4fe23873d69603 Mon Sep 17 00:00:00 2001 From: Ondrej Zajicek Date: Thu, 20 Mar 2014 14:07:12 +0100 Subject: BGP graceful restart support. Also significant core protocol state changes needed for that, global graceful restart recovery state and kernel proto support for recovery. --- nest/cmds.c | 3 + nest/config.Y | 6 + nest/proto.c | 388 +++++++++++++++++++++++++++++++++++++++++--------------- nest/protocol.h | 31 ++++- nest/route.h | 17 +++ nest/rt-table.c | 100 ++++++++++++--- 6 files changed, 422 insertions(+), 123 deletions(-) (limited to 'nest') diff --git a/nest/cmds.c b/nest/cmds.c index ec6bc762..70fbdaf8 100644 --- a/nest/cmds.c +++ b/nest/cmds.c @@ -7,6 +7,7 @@ */ #include "nest/bird.h" +#include "nest/protocol.h" #include "nest/route.h" #include "nest/cli.h" #include "conf/conf.h" @@ -32,6 +33,8 @@ cmd_show_status(void) tm_format_datetime(tim, &config->tf_base, config->load_time); cli_msg(-1011, "Last reconfiguration on %s", tim); + graceful_restart_show_status(); + if (shutting_down) cli_msg(13, "Shutdown in progress"); else if (configuring) diff --git a/nest/config.Y b/nest/config.Y index e9b8a21b..59d354b8 100644 --- a/nest/config.Y +++ b/nest/config.Y @@ -49,6 +49,7 @@ CF_KEYWORDS(PASSWORD, FROM, PASSIVE, TO, ID, EVENTS, PACKETS, PROTOCOLS, INTERFA CF_KEYWORDS(PRIMARY, STATS, COUNT, FOR, COMMANDS, PREEXPORT, GENERATE, ROA, MAX, FLUSH, AS) CF_KEYWORDS(LISTEN, BGP, V6ONLY, DUAL, ADDRESS, PORT, PASSWORDS, DESCRIPTION, SORTED) CF_KEYWORDS(RELOAD, IN, OUT, MRTDUMP, MESSAGES, RESTRICT, MEMORY, IGP_METRIC, CLASS, DSCP) +CF_KEYWORDS(GRACEFUL, RESTART, WAIT) CF_ENUM(T_ENUM_RTS, RTS_, DUMMY, STATIC, INHERIT, DEVICE, STATIC_DEVICE, REDIRECT, RIP, OSPF, OSPF_IA, OSPF_EXT1, OSPF_EXT2, BGP, PIPE) @@ -110,6 +111,11 @@ listen_opt: ; +CF_ADDTO(conf, gr_opts) + +gr_opts: GRACEFUL RESTART WAIT expr ';' { new_config->gr_wait = $4; } ; + + /* Creation of routing tables */ tab_sorted: diff --git a/nest/proto.c b/nest/proto.c index cfa6ff4b..2bc3e319 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -35,9 +35,18 @@ static struct proto *initial_device_proto; static event *proto_flush_event; static timer *proto_shutdown_timer; +static timer *gr_wait_timer; + +#define GRS_NONE 0 +#define GRS_INIT 1 +#define GRS_ACTIVE 2 +#define GRS_DONE 3 + +static int graceful_restart_state; +static u32 graceful_restart_locks; static char *p_states[] = { "DOWN", "START", "UP", "STOP" }; -static char *c_states[] = { "HUNGRY", "FEEDING", "HAPPY", "FLUSHING" }; +static char *c_states[] = { "HUNGRY", "???", "HAPPY", "FLUSHING" }; static void proto_flush_loop(void *); static void proto_shutdown_loop(struct timer *); @@ -51,10 +60,12 @@ proto_enqueue(list *l, struct proto *p) } static void -proto_relink(struct proto *p) +proto_set_core_state(struct proto *p, uint state) { list *l = NULL; + p->core_state = state; + if (p->debug & D_STATES) { char *name = proto_state_name(p); @@ -66,13 +77,13 @@ proto_relink(struct proto *p) } else p->last_state_name_announced = NULL; + rem_node(&p->n); switch (p->core_state) { case FS_HUNGRY: l = &inactive_proto_list; break; - case FS_FEEDING: case FS_HAPPY: l = &active_proto_list; break; @@ -126,6 +137,9 @@ proto_init_instance(struct proto *p) p->attn = ev_new(p->pool); p->attn->data = p; + if (graceful_restart_state == GRS_INIT) + p->gr_recovery = 1; + if (! 
p->proto->multitable) rt_lock_table(p->table); } @@ -169,7 +183,7 @@ proto_add_announce_hook(struct proto *p, struct rtable *t, struct proto_stats *s h->next = p->ahooks; p->ahooks = h; - if (p->rt_notify) + if (p->rt_notify && (p->export_state == ES_READY)) add_tail(&t->hooks, &h->n); return h; } @@ -193,6 +207,16 @@ proto_find_announce_hook(struct proto *p, struct rtable *t) return NULL; } +static void +proto_link_ahooks(struct proto *p) +{ + struct announce_hook *h; + + if (p->rt_notify) + for(h=p->ahooks; h; h=h->next) + add_tail(&h->table->hooks, &h->n); +} + static void proto_unlink_ahooks(struct proto *p) { @@ -362,6 +386,7 @@ proto_init(struct proto_config *c) q->proto_state = PS_DOWN; q->core_state = FS_HUNGRY; + q->export_state = ES_DOWN; q->last_state_change = now; proto_enqueue(&initial_proto_list, q); @@ -590,6 +615,7 @@ static void proto_rethink_goal(struct proto *p) { struct protocol *q; + byte goal; if (p->reconfiguring && p->core_state == FS_HUNGRY && p->proto_state == PS_DOWN) { @@ -606,22 +632,14 @@ proto_rethink_goal(struct proto *p) /* Determine what state we want to reach */ if (p->disabled || p->reconfiguring) - { - p->core_goal = FS_HUNGRY; - if (p->core_state == FS_HUNGRY && p->proto_state == PS_DOWN) - return; - } + goal = PS_DOWN; else - { - p->core_goal = FS_HAPPY; - if (p->core_state == FS_HAPPY && p->proto_state == PS_UP) - return; - } + goal = PS_UP; q = p->proto; - if (p->core_goal == FS_HAPPY) /* Going up */ + if (goal == PS_UP) /* Going up */ { - if (p->core_state == FS_HUNGRY && p->proto_state == PS_DOWN) + if (p->proto_state == PS_DOWN && p->core_state == FS_HUNGRY) { DBG("Kicking %s up\n", p->name); PD(p, "Starting"); @@ -640,6 +658,104 @@ proto_rethink_goal(struct proto *p) } } + + +static void graceful_restart_done(struct timer *t UNUSED); +static void proto_want_export_up(struct proto *p); + +void +graceful_restart_recovery(void) +{ + graceful_restart_state = GRS_INIT; +} + +void +graceful_restart_init(void) +{ + if (!graceful_restart_state) + return; + + log(L_INFO "Graceful restart started"); + + if (!graceful_restart_locks) + { + graceful_restart_done(NULL); + return; + } + + graceful_restart_state = GRS_ACTIVE; + gr_wait_timer = tm_new(proto_pool); + gr_wait_timer->hook = graceful_restart_done; + tm_start(gr_wait_timer, config->gr_wait); +} + +static void +graceful_restart_done(struct timer *t UNUSED) +{ + struct proto *p; + node *n; + + log(L_INFO "Graceful restart done"); + graceful_restart_state = GRS_DONE; + + WALK_LIST2(p, n, proto_list, glob_node) + { + if (!p->gr_recovery) + continue; + + /* Resume postponed export of routes */ + if ((p->proto_state == PS_UP) && p->gr_wait) + proto_want_export_up(p); + + /* Cleanup */ + p->gr_recovery = 0; + p->gr_wait = 0; + p->gr_lock = 0; + } + + graceful_restart_locks = 0; +} + +void +graceful_restart_show_status(void) +{ + if (graceful_restart_state != GRS_ACTIVE) + return; + + cli_msg(-24, "Graceful restart recovery in progress"); + cli_msg(-24, " Waiting for %d protocols to recover", graceful_restart_locks); + cli_msg(-24, " Wait timer is %d/%d", tm_remains(gr_wait_timer), config->gr_wait); +} + +/* Just from start hook */ +void +proto_graceful_restart_lock(struct proto *p) +{ + ASSERT(graceful_restart_state == GRS_INIT); + ASSERT(p->gr_recovery); + + if (p->gr_lock) + return; + + p->gr_lock = 1; + graceful_restart_locks++; +} + +void +proto_graceful_restart_unlock(struct proto *p) +{ + if (!p->gr_lock) + return; + + p->gr_lock = 0; + graceful_restart_locks--; + + if ((graceful_restart_state == 
GRS_ACTIVE) && !graceful_restart_locks) + tm_start(gr_wait_timer, 0); +} + + + /** * protos_dump_all - dump status of all protocols * @@ -751,6 +867,8 @@ protos_build(void) proto_flush_event->hook = proto_flush_loop; proto_shutdown_timer = tm_new(proto_pool); proto_shutdown_timer->hook = proto_shutdown_loop; + proto_shutdown_timer = tm_new(proto_pool); + proto_shutdown_timer->hook = proto_shutdown_loop; } static void @@ -779,15 +897,17 @@ proto_feed_more(void *P) { struct proto *p = P; - if (p->core_state != FS_FEEDING) + if (p->export_state != ES_FEEDING) return; DBG("Feeding protocol %s continued\n", p->name); if (rt_feed_baby(p)) { - p->core_state = FS_HAPPY; - proto_relink(p); - DBG("Protocol %s up and running\n", p->name); + DBG("Feeding protocol %s finished\n", p->name); + p->export_state = ES_READY; + + if (p->feed_done) + p->feed_done(p); } else { @@ -801,7 +921,7 @@ proto_feed_initial(void *P) { struct proto *p = P; - if (p->core_state != FS_FEEDING) + if (p->export_state != ES_FEEDING) return; DBG("Feeding protocol %s\n", p->name); @@ -814,40 +934,10 @@ static void proto_schedule_feed(struct proto *p, int initial) { DBG("%s: Scheduling meal\n", p->name); - p->core_state = FS_FEEDING; - p->refeeding = !initial; - - /* FIXME: This should be changed for better support of multitable protos */ - if (!initial) - { - struct announce_hook *ah; - for (ah = p->ahooks; ah; ah = ah->next) - proto_reset_limit(ah->out_limit); - - /* Hack: reset exp_routes during refeed, and do not decrease it later */ - p->stats.exp_routes = 0; - } - /* Connect protocol to routing table */ - if (initial && !p->proto->multitable) - { - p->main_source = rt_get_source(p, 0); - rt_lock_source(p->main_source); - - p->main_ahook = proto_add_announce_hook(p, p->table, &p->stats); - p->main_ahook->in_filter = p->cf->in_filter; - p->main_ahook->out_filter = p->cf->out_filter; - p->main_ahook->rx_limit = p->cf->rx_limit; - p->main_ahook->in_limit = p->cf->in_limit; - p->main_ahook->out_limit = p->cf->out_limit; - p->main_ahook->in_keep_filtered = p->cf->in_keep_filtered; - - proto_reset_limit(p->main_ahook->rx_limit); - proto_reset_limit(p->main_ahook->in_limit); - proto_reset_limit(p->main_ahook->out_limit); - } + p->export_state = ES_FEEDING; + p->refeeding = !initial; - proto_relink(p); p->attn->hook = initial ? proto_feed_initial : proto_feed_more; ev_schedule(p->attn); } @@ -877,7 +967,7 @@ proto_schedule_flush_loop(void) { p->flushing = 1; for (h=p->ahooks; h; h=h->next) - h->table->prune_state = 1; + rt_mark_for_prune(h->table); } ev_schedule(proto_flush_event); @@ -908,8 +998,7 @@ proto_flush_loop(void *unused UNUSED) DBG("Flushing protocol %s\n", p->name); p->flushing = 0; - p->core_state = FS_HUNGRY; - proto_relink(p); + proto_set_core_state(p, FS_HUNGRY); if (p->proto_state == PS_DOWN) proto_fell_down(p); goto again; @@ -921,19 +1010,6 @@ proto_flush_loop(void *unused UNUSED) proto_schedule_flush_loop(); } -static void -proto_schedule_flush(struct proto *p) -{ - /* Need to abort feeding */ - if (p->core_state == FS_FEEDING) - rt_feed_baby_abort(p); - - DBG("%s: Scheduling flush\n", p->name); - p->core_state = FS_FLUSHING; - proto_relink(p); - proto_unlink_ahooks(p); - proto_schedule_flush_loop(); -} /* Temporary hack to propagate restart to BGP */ int proto_restart; @@ -980,9 +1056,9 @@ proto_schedule_down(struct proto *p, byte restart, byte code) * * Sometimes it is needed to send again all routes to the * protocol. This is called feeding and can be requested by this - * function. 
This would cause protocol core state transition - * to FS_FEEDING (during feeding) and when completed, it will - * switch back to FS_HAPPY. This function can be called even + * function. This would cause protocol export state transition + * to ES_FEEDING (during feeding) and when completed, it will + * switch back to ES_READY. This function can be called even * when feeding is already running, in that case it is restarted. */ void @@ -991,7 +1067,7 @@ proto_request_feeding(struct proto *p) ASSERT(p->proto_state == PS_UP); /* If we are already feeding, we want to restart it */ - if (p->core_state == FS_FEEDING) + if (p->export_state == ES_FEEDING) { /* Unless feeding is in initial state */ if (p->attn->hook == proto_feed_initial) @@ -1000,6 +1076,14 @@ proto_request_feeding(struct proto *p) rt_feed_baby_abort(p); } + /* FIXME: This should be changed for better support of multitable protos */ + struct announce_hook *ah; + for (ah = p->ahooks; ah; ah = ah->next) + proto_reset_limit(ah->out_limit); + + /* Hack: reset exp_routes during refeed, and do not decrease it later */ + p->stats.exp_routes = 0; + proto_schedule_feed(p, 0); } @@ -1060,6 +1144,83 @@ proto_notify_limit(struct announce_hook *ah, struct proto_limit *l, int dir, u32 } } + +static void +proto_want_core_up(struct proto *p) +{ + ASSERT(p->core_state == FS_HUNGRY); + + if (!p->proto->multitable) + { + p->main_source = rt_get_source(p, 0); + rt_lock_source(p->main_source); + + /* Connect protocol to routing table */ + p->main_ahook = proto_add_announce_hook(p, p->table, &p->stats); + p->main_ahook->in_filter = p->cf->in_filter; + p->main_ahook->out_filter = p->cf->out_filter; + p->main_ahook->rx_limit = p->cf->rx_limit; + p->main_ahook->in_limit = p->cf->in_limit; + p->main_ahook->out_limit = p->cf->out_limit; + p->main_ahook->in_keep_filtered = p->cf->in_keep_filtered; + + proto_reset_limit(p->main_ahook->rx_limit); + proto_reset_limit(p->main_ahook->in_limit); + proto_reset_limit(p->main_ahook->out_limit); + } + + proto_set_core_state(p, FS_HAPPY); +} + +static void +proto_want_export_up(struct proto *p) +{ + ASSERT(p->core_state == CS_HAPPY); + ASSERT(p->export_state == ES_DOWN); + + proto_link_ahooks(p); + proto_schedule_feed(p, 1); /* Sets ES_FEEDING */ +} + +static void +proto_want_export_down(struct proto *p) +{ + ASSERT(p->export_state != ES_DOWN); + + /* Need to abort feeding */ + if (p->export_state == ES_FEEDING) + rt_feed_baby_abort(p); + + p->export_state = ES_DOWN; + proto_unlink_ahooks(p); +} + +static void +proto_want_core_down(struct proto *p) +{ + ASSERT(p->core_state == CS_HAPPY); + ASSERT(p->export_state == ES_DOWN); + + proto_set_core_state(p, FS_FLUSHING); + proto_schedule_flush_loop(); + + if (!p->proto->multitable) + { + rt_unlock_source(p->main_source); + p->main_source = NULL; + } +} + +static void +proto_falling_down(struct proto *p) +{ + p->gr_recovery = 0; + p->gr_wait = 0; + if (p->gr_lock) + proto_graceful_restart_unlock(p); +} + + /** * proto_notify_state - notify core about protocol state change * @p: protocol the state of which has changed @@ -1079,6 +1240,7 @@ proto_notify_state(struct proto *p, unsigned ps) { unsigned ops = p->proto_state; unsigned cs = p->core_state; + unsigned es = p->export_state; DBG("%s reporting state transition %s/%s -> */%s\n", p->name, c_states[cs], p_states[ops], p_states[ps]); if (ops == ps) @@ -1089,17 +1251,47 @@ proto_notify_state(struct proto *p, unsigned ps) switch (ps) { + case PS_START: + ASSERT(ops == PS_DOWN || ops == PS_UP); + ASSERT(cs == FS_HUNGRY || cs == 
FS_HAPPY); + + if (es != ES_DOWN) + proto_want_export_down(p); + break; + + case PS_UP: + ASSERT(ops == PS_DOWN || ops == PS_START); + ASSERT(cs == FS_HUNGRY || cs == FS_HAPPY); + ASSERT(es == ES_DOWN); + + if (cs == FS_HUNGRY) + proto_want_core_up(p); + if (!p->gr_wait) + proto_want_export_up(p); + break; + + case PS_STOP: + ASSERT(ops == PS_START || ops == PS_UP); + + p->down_sched = 0; + + if (es != ES_DOWN) + proto_want_export_down(p); + if (cs == FS_HAPPY) + proto_want_core_down(p); + proto_falling_down(p); + break; + case PS_DOWN: p->down_code = 0; p->down_sched = 0; - if ((cs == FS_FEEDING) || (cs == FS_HAPPY)) - proto_schedule_flush(p); - if (p->proto->multitable) - { - rt_unlock_source(p->main_source); - p->main_source = NULL; - } + if (es != ES_DOWN) + proto_want_export_down(p); + if (cs == FS_HAPPY) + proto_want_core_down(p); + if (ops != PS_STOP) + proto_falling_down(p); neigh_prune(); // FIXME convert neighbors to resource? rfree(p->pool); @@ -1111,22 +1303,9 @@ proto_notify_state(struct proto *p, unsigned ps) return; /* The protocol might have ceased to exist */ } break; - case PS_START: - ASSERT(ops == PS_DOWN); - ASSERT(cs == FS_HUNGRY); - break; - case PS_UP: - ASSERT(ops == PS_DOWN || ops == PS_START); - ASSERT(cs == FS_HUNGRY); - proto_schedule_feed(p, 1); - break; - case PS_STOP: - p->down_sched = 0; - if ((cs == FS_FEEDING) || (cs == FS_HAPPY)) - proto_schedule_flush(p); - break; + default: - bug("Invalid state transition for %s from %s/%s to */%s", p->name, c_states[cs], p_states[ops], p_states[ps]); + bug("%s: Invalid state %d", p->name, ps); } } @@ -1141,11 +1320,17 @@ proto_state_name(struct proto *p) switch (P(p->proto_state, p->core_state)) { case P(PS_DOWN, FS_HUNGRY): return "down"; - case P(PS_START, FS_HUNGRY): return "start"; - case P(PS_UP, FS_HUNGRY): - case P(PS_UP, FS_FEEDING): return "feed"; + case P(PS_START, FS_HUNGRY): + case P(PS_START, FS_HAPPY): return "start"; + case P(PS_UP, FS_HAPPY): + switch (p->export_state) + { + case ES_DOWN: return "wait"; + case ES_FEEDING: return "feed"; + case ES_READY: return "up"; + default: return "???"; + } case P(PS_STOP, FS_HUNGRY): return "stop"; - case P(PS_UP, FS_HAPPY): return "up"; case P(PS_STOP, FS_FLUSHING): case P(PS_DOWN, FS_FLUSHING): return "flush"; default: return "???"; @@ -1196,6 +1381,11 @@ proto_show_basic_info(struct proto *p) cli_msg(-1006, " Input filter: %s", filter_name(p->cf->in_filter)); cli_msg(-1006, " Output filter: %s", filter_name(p->cf->out_filter)); + if (graceful_restart_state == GRS_ACTIVE) + cli_msg(-1006, " GR recovery: %s%s", + p->gr_lock ? " pending" : "", + p->gr_wait ? 
" waiting" : ""); + proto_show_limit(p->cf->rx_limit, "Receive limit:"); proto_show_limit(p->cf->in_limit, "Import limit:"); proto_show_limit(p->cf->out_limit, "Export limit:"); diff --git a/nest/protocol.h b/nest/protocol.h index b58f9e67..ec779563 100644 --- a/nest/protocol.h +++ b/nest/protocol.h @@ -148,10 +148,13 @@ struct proto { byte disabled; /* Manually disabled */ byte proto_state; /* Protocol state machine (PS_*, see below) */ byte core_state; /* Core state machine (FS_*, see below) */ - byte core_goal; /* State we want to reach (FS_*, see below) */ + byte export_state; /* Route export state (ES_*, see below) */ byte reconfiguring; /* We're shutting down due to reconfiguration */ - byte refeeding; /* We are refeeding (valid only if core_state == FS_FEEDING) */ + byte refeeding; /* We are refeeding (valid only if export_state == ES_FEEDING) */ byte flushing; /* Protocol is flushed in current flush loop round */ + byte gr_recovery; /* Protocol should participate in graceful restart recovery */ + byte gr_lock; /* Graceful restart mechanism should wait for this proto */ + byte gr_wait; /* Route export to protocol is postponed until graceful restart */ byte down_sched; /* Shutdown is scheduled for later (PDS_*) */ byte down_code; /* Reason for shutdown (PDC_* codes) */ u32 hash_key; /* Random key used for hashing of neighbors */ @@ -175,6 +178,7 @@ struct proto { * reload_routes Request protocol to reload all its routes to the core * (using rte_update()). Returns: 0=reload cannot be done, * 1= reload is scheduled and will happen (asynchronously). + * feed_done Notify protocol about finish of route feeding. */ void (*if_notify)(struct proto *, unsigned flags, struct iface *i); @@ -185,6 +189,7 @@ struct proto { void (*store_tmp_attrs)(struct rte *rt, struct ea_list *attrs); int (*import_control)(struct proto *, struct rte **rt, struct ea_list **attrs, struct linpool *pool); int (*reload_routes)(struct proto *); + void (*feed_done)(struct proto *); /* * Routing entry hooks (called only for routes belonging to this protocol): @@ -242,6 +247,13 @@ static inline void proto_copy_rest(struct proto_config *dest, struct proto_config *src, unsigned size) { memcpy(dest + 1, src + 1, size - sizeof(struct proto_config)); } +void graceful_restart_recovery(void); +void graceful_restart_init(void); +void graceful_restart_show_status(void); +void proto_graceful_restart_lock(struct proto *p); +void proto_graceful_restart_unlock(struct proto *p); + +#define DEFAULT_GR_WAIT 240 void proto_show_limit(struct proto_limit *l, const char *dsc); void proto_show_basic_info(struct proto *p); @@ -343,10 +355,17 @@ void proto_notify_state(struct proto *p, unsigned state); * as a result of received ROUTE-REFRESH request). 
*/ -#define FS_HUNGRY 0 -#define FS_FEEDING 1 -#define FS_HAPPY 2 -#define FS_FLUSHING 3 +#define FS_HUNGRY 0 +#define FS_FEEDING 1 /* obsolete */ +#define FS_HAPPY 2 +#define FS_FLUSHING 3 + + +#define ES_DOWN 0 +#define ES_FEEDING 1 +#define ES_READY 2 + + /* * Debugging flags diff --git a/nest/route.h b/nest/route.h index f00f8b2b..82d9e202 100644 --- a/nest/route.h +++ b/nest/route.h @@ -148,6 +148,10 @@ typedef struct rtable { struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ } rtable; +#define RPS_NONE 0 +#define RPS_SCHEDULED 1 +#define RPS_RUNNING 2 + typedef struct network { struct fib_node n; /* FIB flags reserved for kernel syncer */ struct rte *routes; /* Available routes for this network */ @@ -222,6 +226,8 @@ typedef struct rte { #define REF_COW 1 /* Copy this rte on write */ #define REF_FILTERED 2 /* Route is rejected by import filter */ +#define REF_STALE 4 /* Route is stale in a refresh cycle */ +#define REF_DISCARD 8 /* Route is scheduled for discard */ /* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ static inline int rte_is_valid(rte *r) { return r && !(r->flags & REF_FILTERED); } @@ -257,6 +263,8 @@ void rte_update2(struct announce_hook *ah, net *net, rte *new, struct rte_src *s static inline void rte_update(struct proto *p, net *net, rte *new) { rte_update2(p->main_ahook, net, new, p->main_source); } void rte_discard(rtable *tab, rte *old); int rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter *filter); +void rt_refresh_begin(rtable *t, struct announce_hook *ah); +void rt_refresh_end(rtable *t, struct announce_hook *ah); void rte_dump(rte *); void rte_free(rte *); rte *rte_do_cow(rte *); @@ -268,6 +276,15 @@ void rt_feed_baby_abort(struct proto *p); int rt_prune_loop(void); struct rtable_config *rt_new_table(struct symbol *s); +static inline void +rt_mark_for_prune(rtable *tab) +{ + if (tab->prune_state == RPS_RUNNING) + fit_get(&tab->fib, &tab->prune_fit); + + tab->prune_state = RPS_SCHEDULED; +} + struct rt_show_data { ip_addr prefix; unsigned pxlen; diff --git a/nest/rt-table.c b/nest/rt-table.c index 8c91ea0a..bc911729 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -55,8 +55,10 @@ static void rt_free_hostcache(rtable *tab); static void rt_notify_hostcache(rtable *tab, net *net); static void rt_update_hostcache(rtable *tab); static void rt_next_hop_update(rtable *tab); - +static inline int rt_prune_table(rtable *tab); static inline void rt_schedule_gc(rtable *tab); +static inline void rt_schedule_prune(rtable *tab); + static inline struct ea_list * make_tmp_attrs(struct rte *rt, struct linpool *pool) @@ -570,7 +572,7 @@ rte_announce(rtable *tab, unsigned type, net *net, rte *new, rte *old, rte *befo struct announce_hook *a; WALK_LIST(a, tab->hooks) { - ASSERT(a->proto->core_state == FS_HAPPY || a->proto->core_state == FS_FEEDING); + ASSERT(a->proto->export_state != ES_DOWN); if (a->proto->accept_ra_types == type) if (type == RA_ACCEPTED) rt_notify_accepted(a, net, new, old, before_old, tmpa, 0); @@ -1108,6 +1110,46 @@ rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter return v > 0; } +void +rt_refresh_begin(rtable *t, struct announce_hook *ah) +{ + net *n; + rte *e; + + FIB_WALK(&t->fib, fn) + { + n = (net *) fn; + for (e = n->routes; e; e = e->next) + if (e->sender == ah) + e->flags |= REF_STALE; + } + FIB_WALK_END; +} + +void +rt_refresh_end(rtable *t, struct announce_hook *ah) +{ + int prune = 0; + net *n; + rte *e; + + FIB_WALK(&t->fib, 
fn) + { + n = (net *) fn; + for (e = n->routes; e; e = e->next) + if ((e->sender == ah) && (e->flags & REF_STALE)) + { + e->flags |= REF_DISCARD; + prune = 1; + } + } + FIB_WALK_END; + + if (prune) + rt_schedule_prune(t); +} + + /** * rte_dump - dump a route * @e: &rte to be dumped @@ -1169,6 +1211,13 @@ rt_dump_all(void) rt_dump(t); } +static inline void +rt_schedule_prune(rtable *tab) +{ + rt_mark_for_prune(tab); + ev_schedule(tab->rt_event); +} + static inline void rt_schedule_gc(rtable *tab) { @@ -1199,6 +1248,7 @@ rt_schedule_nhu(rtable *tab) tab->nhu_state |= 1; } + static void rt_prune_nets(rtable *tab) { @@ -1242,6 +1292,14 @@ rt_event(void *ptr) if (tab->nhu_state) rt_next_hop_update(tab); + if (tab->prune_state) + if (!rt_prune_table(tab)) + { + /* Table prune unfinished */ + ev_schedule(tab->rt_event); + return; + } + if (tab->gc_scheduled) { rt_prune_nets(tab); @@ -1283,8 +1341,8 @@ rt_init(void) } -static inline int -rt_prune_step(rtable *tab, int step, int *max_feed) +static int +rt_prune_step(rtable *tab, int step, int *limit) { static struct rate_limit rl_flush; struct fib_iterator *fit = &tab->prune_fit; @@ -1294,13 +1352,13 @@ rt_prune_step(rtable *tab, int step, int *max_feed) fib_check(&tab->fib); #endif - if (tab->prune_state == 0) + if (tab->prune_state == RPS_NONE) return 1; - if (tab->prune_state == 1) + if (tab->prune_state == RPS_SCHEDULED) { FIB_ITERATE_INIT(fit, &tab->fib); - tab->prune_state = 2; + tab->prune_state = RPS_RUNNING; } again: @@ -1312,9 +1370,10 @@ again: rescan: for (e=n->routes; e; e=e->next) if (e->sender->proto->flushing || + (e->flags & REF_DISCARD) || (step && e->attrs->src->proto->flushing)) { - if (*max_feed <= 0) + if (*limit <= 0) { FIB_ITERATE_PUT(fit, fn); return 0; @@ -1325,7 +1384,7 @@ again: n->n.prefix, n->n.pxlen, e->attrs->src->proto->name, tab->name); rte_discard(tab, e); - (*max_feed)--; + (*limit)--; goto rescan; } @@ -1342,10 +1401,17 @@ again: fib_check(&tab->fib); #endif - tab->prune_state = 0; + tab->prune_state = RPS_NONE; return 1; } +static inline int +rt_prune_table(rtable *tab) +{ + int limit = 512; + return rt_prune_step(tab, 0, &limit); +} + /** * rt_prune_loop - prune routing tables * @@ -1364,19 +1430,19 @@ int rt_prune_loop(void) { static int step = 0; - int max_feed = 512; + int limit = 512; rtable *t; again: WALK_LIST(t, routing_tables) - if (! rt_prune_step(t, step, &max_feed)) + if (! rt_prune_step(t, step, &limit)) return 0; if (step == 0) { /* Prepare for the second step */ WALK_LIST(t, routing_tables) - t->prune_state = 1; + t->prune_state = RPS_SCHEDULED; step = 1; goto again; @@ -1721,7 +1787,7 @@ again: (p->accept_ra_types == RA_ACCEPTED)) if (rte_is_valid(e)) { - if (p->core_state != FS_FEEDING) + if (p->export_state != ES_FEEDING) return 1; /* In the meantime, the protocol fell down. */ do_feed_baby(p, p->accept_ra_types, h, n, e); max_feed--; @@ -1730,7 +1796,7 @@ again: if (p->accept_ra_types == RA_ANY) for(e = n->routes; rte_is_valid(e); e = e->next) { - if (p->core_state != FS_FEEDING) + if (p->export_state != ES_FEEDING) return 1; /* In the meantime, the protocol fell down. 
*/ do_feed_baby(p, RA_ANY, h, n, e); max_feed--; @@ -2223,9 +2289,7 @@ rt_show_cont(struct cli *c) cli_printf(c, 8004, "Stopped due to reconfiguration"); goto done; } - if (d->export_protocol && - d->export_protocol->core_state != FS_HAPPY && - d->export_protocol->core_state != FS_FEEDING) + if (d->export_protocol && (d->export_protocol->export_state == ES_DOWN)) { cli_printf(c, 8005, "Protocol is down"); goto done; -- cgit v1.2.3 From 6eda3f135f5bab4db456531d25bc3e5f669ec22e Mon Sep 17 00:00:00 2001 From: Ondrej Zajicek Date: Sun, 23 Mar 2014 01:35:33 +0100 Subject: Documentation (and minor fixes) for BGP graceful restart. --- doc/bird.sgml | 99 ++++++++++++++++++++++++++++-------- nest/proto.c | 154 +++++++++++++++++++++++++++++++++++++++++--------------- nest/rt-table.c | 53 +++++++++++++++---- proto/bgp/bgp.c | 42 +++++++++++++++- 4 files changed, 276 insertions(+), 72 deletions(-) (limited to 'nest') diff --git a/doc/bird.sgml b/doc/bird.sgml index e9c61526..3ea90920 100644 --- a/doc/bird.sgml +++ b/doc/bird.sgml @@ -157,6 +157,9 @@ options. The most important ones are: -f run bird in foreground. + + -R + apply graceful restart recovery after start.

BIRD writes messages about its work to log files or syslog (according to config). @@ -187,6 +190,7 @@ configuration, but it is generally easy -- BIRD needs just the standard library, privileges to read the config file and create the control socket and the CAP_NET_* capabilities. + About routing tables

BIRD has one or more routing tables which may or may not be @@ -242,6 +246,20 @@ using comparison and ordering). Minor advantage is that routes are shown sorted in Graceful restart + +

When BIRD is started after restart or crash, it repopulates routing tables in +an uncoordinated manner, like after clean start. This may be impractical in some +cases, because if the forwarding plane (i.e. kernel routing tables) remains +intact, then its synchronization with BIRD would temporarily disrupt packet +forwarding until protocols converge. Graceful restart is a mechanism that could +help with this issue. Generally, it works by starting protocols and letting them +repopulate routing tables while deferring route propagation until protocols +acknowledge their convergence. Note that graceful restart behavior have to be +configured for all relevant protocols and requires protocol-specific support +(currently implemented for Kernel and BGP protocols), it is activated for +particular boot by option Configuration @@ -371,6 +389,12 @@ protocol rip { would accept IPv6 routes only). Such behavior was default in older versions of BIRD. + graceful restart wait + During graceful restart recovery, BIRD waits for convergence of routing + protocols. This option allows to specify a timeout for the recovery to + prevent waiting indefinitely if some protocols cannot converge. Default: + 240 seconds. + timeformat route|protocol|base|log " This option allows to specify a format of date/time used by BIRD. The first argument specifies for which purpose such @@ -1493,6 +1517,8 @@ extended communities (RFC 4360), route reflectors (RFC 4456), +graceful restart +(RFC 4724), multiprotocol extensions (RFC 4760), 4B AS numbers @@ -1502,9 +1528,7 @@ and 4B AS numbers in extended communities For IPv6, it uses the standard multiprotocol extensions defined in -RFC 2283 -including changes described in the -latest draft +RFC 4760 and applied to IPv6 according to RFC 2545. @@ -1716,6 +1740,26 @@ for each neighbor using the following configuration parameters: capability and accepts such requests. Even when disabled, BIRD can send route refresh requests. Default: on. + graceful restart + When a BGP speaker restarts or crashes, neighbors will discard all + received paths from the speaker, which disrupts packet forwarding even + when the forwarding plane of the speaker remains intact. RFC 4724 + specifies an optional graceful restart mechanism to alleviate this + issue. This option controls the mechanism. It has three states: + Disabled, when no support is provided. Aware, when the graceful restart + support is announced and the support for restarting neighbors is + provided, but no local graceful restart is allowed (i.e. receiving-only + role). Enabled, when the full graceful restart support is provided + (i.e. both restarting and receiving role). Note that proper support for + local graceful restart requires also configuration of other protocols. + Default: aware. + + graceful restart time + The restart time is announced in the BGP graceful restart capability + and specifies how long the neighbor would wait for the BGP session to + re-establish after a restart before deleting stale routes. Default: + 120 seconds. + interpret communities RFC 1997 demands that BGP speaker should process well-known communities like no-export (65535, 65281) or no-advertise (65535, 65282). For @@ -2063,25 +2107,36 @@ overcome using another routing table and the pipe protocol. Configuration

- persist Tell BIRD to leave all its routes in the - routing tables when it exits (instead of cleaning them up). - scan time Time in seconds between two consecutive scans of the - kernel routing table. - learn Enable learning of routes added to the kernel - routing tables by other routing daemons or by the system administrator. - This is possible only on systems which support identification of route - authorship. - - device routes Enable export of device - routes to the kernel routing table. By default, such routes - are rejected (with the exception of explicitly configured - device routes from the static protocol) regardless of the - export filter to protect device routes in kernel routing table - (managed by OS itself) from accidental overwriting or erasing. - - kernel table Select which kernel table should - this particular instance of the Kernel protocol work with. Available - only on systems supporting multiple routing tables. + persist + Tell BIRD to leave all its routes in the routing tables when it exits + (instead of cleaning them up). + + scan time + Time in seconds between two consecutive scans of the kernel routing + table. + + learn + Enable learning of routes added to the kernel routing tables by other + routing daemons or by the system administrator. This is possible only on + systems which support identification of route authorship. + + device routes + Enable export of device routes to the kernel routing table. By default, + such routes are rejected (with the exception of explicitly configured + device routes from the static protocol) regardless of the export filter + to protect device routes in kernel routing table (managed by OS itself) + from accidental overwriting or erasing. + + kernel table + Select which kernel table should this particular instance of the Kernel + protocol work with. Available only on systems supporting multiple + routing tables. + + graceful restart + Participate in graceful restart recovery. If this option is enabled and + a graceful restart recovery is active, the Kernel protocol will defer + synchronization of routing tables until the end of the recovery. Note + that import of kernel routes to BIRD is not affected. Attributes diff --git a/nest/proto.c b/nest/proto.c index 2bc3e319..e990b48f 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -51,6 +51,8 @@ static char *c_states[] = { "HUNGRY", "???", "HAPPY", "FLUSHING" }; static void proto_flush_loop(void *); static void proto_shutdown_loop(struct timer *); static void proto_rethink_goal(struct proto *p); +static void proto_want_export_up(struct proto *p); +static void proto_fell_down(struct proto *p); static char *proto_state_name(struct proto *p); static void @@ -151,21 +153,20 @@ extern pool *rt_table_pool; * @t: routing table to connect to * @stats: per-table protocol statistics * - * This function creates a connection between the protocol instance @p - * and the routing table @t, making the protocol hear all changes in - * the table. + * This function creates a connection between the protocol instance @p and the + * routing table @t, making the protocol hear all changes in the table. * - * The announce hook is linked in the protocol ahook list and, if the - * protocol accepts routes, also in the table ahook list. Announce - * hooks are allocated from the routing table resource pool, they are - * unlinked from the table ahook list after the protocol went down, - * (in proto_schedule_flush()) and they are automatically freed after the - * protocol is flushed (in proto_fell_down()). 
+ * The announce hook is linked in the protocol ahook list. Announce hooks are + * allocated from the routing table resource pool and when protocol accepts + * routes also in the table ahook list. The are linked to the table ahook list + * and unlinked from it depending on export_state (in proto_want_export_up() and + * proto_want_export_down()) and they are automatically freed after the protocol + * is flushed (in proto_fell_down()). * - * Unless you want to listen to multiple routing tables (as the Pipe - * protocol does), you needn't to worry about this function since the - * connection to the protocol's primary routing table is initialized - * automatically by the core code. + * Unless you want to listen to multiple routing tables (as the Pipe protocol + * does), you needn't to worry about this function since the connection to the + * protocol's primary routing table is initialized automatically by the core + * code. */ struct announce_hook * proto_add_announce_hook(struct proto *p, struct rtable *t, struct proto_stats *stats) @@ -183,7 +184,7 @@ proto_add_announce_hook(struct proto *p, struct rtable *t, struct proto_stats *s h->next = p->ahooks; p->ahooks = h; - if (p->rt_notify && (p->export_state == ES_READY)) + if (p->rt_notify && (p->export_state != ES_DOWN)) add_tail(&t->hooks, &h->n); return h; } @@ -659,16 +660,59 @@ proto_rethink_goal(struct proto *p) } +/** + * DOC: Graceful restart recovery + * + * Graceful restart of a router is a process when the routing plane (e.g. BIRD) + * restarts but both the forwarding plane (e.g kernel routing table) and routing + * neighbors keep proper routes, and therefore uninterrupted packet forwarding + * is maintained. + * + * BIRD implements graceful restart recovery by deferring export of routes to + * protocols until routing tables are refilled with the expected content. After + * start, protocols generate routes as usual, but routes are not propagated to + * them, until protocols report that they generated all routes. After that, + * graceful restart recovery is finished and the export (and the initial feed) + * to protocols is enabled. + * + * When graceful restart recovery need is detected during initialization, then + * enabled protocols are marked with @gr_recovery flag before start. Such + * protocols then decide how to proceed with graceful restart, participation is + * voluntary. Protocols could lock the recovery by proto_graceful_restart_lock() + * (stored in @gr_lock flag), which means that they want to postpone the end of + * the recovery until they converge and then unlock it. They also could set + * @gr_wait before advancing to %PS_UP, which means that the core should defer + * route export to that protocol until the end of the recovery. This should be + * done by protocols that expect their neigbors to keep the proper routes + * (kernel table, BGP sessions with BGP graceful restart capability). + * + * The graceful restart recovery is finished when either all graceful restart + * locks are unlocked or when graceful restart wait timer fires. + * + */ -static void graceful_restart_done(struct timer *t UNUSED); -static void proto_want_export_up(struct proto *p); +static void graceful_restart_done(struct timer *t); +/** + * graceful_restart_recovery - request initial graceful restart recovery + * + * Called by the platform initialization code if the need for recovery + * after graceful restart is detected during boot. Have to be called + * before protos_commit(). 
+ */ void graceful_restart_recovery(void) { graceful_restart_state = GRS_INIT; } +/** + * graceful_restart_init - initialize graceful restart + * + * When graceful restart recovery was requested, the function starts an active + * phase of the recovery and initializes graceful restart wait timer. The + * function have to be called after protos_commit(). + */ void graceful_restart_init(void) { @@ -689,6 +733,15 @@ graceful_restart_init(void) tm_start(gr_wait_timer, config->gr_wait); } +/** + * graceful_restart_done - finalize graceful restart + * + * When there are no locks on graceful restart, the functions finalizes the + * graceful restart recovery. Protocols postponing route export until the end of + * the recovery are awakened and the export to them is enabled. All other + * related state is cleared. The function is also called when the graceful + * restart wait timer fires (but there are still some locks). + */ static void graceful_restart_done(struct timer *t UNUSED) { @@ -727,7 +780,19 @@ graceful_restart_show_status(void) cli_msg(-24, " Wait timer is %d/%d", tm_remains(gr_wait_timer), config->gr_wait); } -/* Just from start hook */ +/** + * proto_graceful_restart_lock - lock graceful restart by protocol + * @p: protocol instance + * + * This function allows a protocol to postpone the end of graceful restart + * recovery until it converges. The lock is removed when the protocol calls + * proto_graceful_restart_unlock() or when the protocol is stopped. + * + * The function have to be called during the initial phase of graceful restart + * recovery and only for protocols that are part of graceful restart (i.e. their + * @gr_recovery is set), which means it should be called from protocol start + * hooks. + */ void proto_graceful_restart_lock(struct proto *p) { @@ -741,6 +806,13 @@ proto_graceful_restart_lock(struct proto *p) graceful_restart_locks++; } +/** + * proto_graceful_restart_unlock - unlock graceful restart by protocol + * @p: protocol instance + * + * This function unlocks a lock from proto_graceful_restart_lock(). It is also + * automatically called when the lock holding protocol went down. + */ void proto_graceful_restart_unlock(struct proto *p) { @@ -867,29 +939,6 @@ protos_build(void) proto_flush_event->hook = proto_flush_loop; proto_shutdown_timer = tm_new(proto_pool); proto_shutdown_timer->hook = proto_shutdown_loop; - proto_shutdown_timer = tm_new(proto_pool); - proto_shutdown_timer->hook = proto_shutdown_loop; -} - -static void -proto_fell_down(struct proto *p) -{ - DBG("Protocol %s down\n", p->name); - - u32 all_routes = p->stats.imp_routes + p->stats.filt_routes; - if (all_routes != 0) - log(L_ERR "Protocol %s is down but still has %d routes", p->name, all_routes); - - bzero(&p->stats, sizeof(struct proto_stats)); - proto_free_ahooks(p); - - if (! 
p->proto->multitable) - rt_unlock_table(p->table); - - if (p->proto->cleanup) - p->proto->cleanup(p); - - proto_rethink_goal(p); } static void @@ -1066,6 +1115,10 @@ proto_request_feeding(struct proto *p) { ASSERT(p->proto_state == PS_UP); + /* Do nothing if we are still waiting for feeding */ + if (p->export_state == ES_DOWN) + return; + /* If we are already feeding, we want to restart it */ if (p->export_state == ES_FEEDING) { @@ -1220,6 +1273,27 @@ proto_falling_down(struct proto *p) proto_graceful_restart_unlock(p); } +static void +proto_fell_down(struct proto *p) +{ + DBG("Protocol %s down\n", p->name); + + u32 all_routes = p->stats.imp_routes + p->stats.filt_routes; + if (all_routes != 0) + log(L_ERR "Protocol %s is down but still has %d routes", p->name, all_routes); + + bzero(&p->stats, sizeof(struct proto_stats)); + proto_free_ahooks(p); + + if (! p->proto->multitable) + rt_unlock_table(p->table); + + if (p->proto->cleanup) + p->proto->cleanup(p); + + proto_rethink_goal(p); +} + /** * proto_notify_state - notify core about protocol state change diff --git a/nest/rt-table.c b/nest/rt-table.c index bc911729..4295f836 100644 --- a/nest/rt-table.c +++ b/nest/rt-table.c @@ -1110,6 +1110,21 @@ rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter return v > 0; } + +/** + * rt_refresh_begin - start a refresh cycle + * @t: related routing table + * @ah: related announce hook + * + * This function starts a refresh cycle for given routing table and announce + * hook. The refresh cycle is a sequence where the protocol sends all its valid + * routes to the routing table (by rte_update()). After that, all protocol + * routes (more precisely routes with @ah as @sender) not sent during the + * refresh cycle but still in the table from the past are pruned. This is + * implemented by marking all related routes as stale by REF_STALE flag in + * rt_refresh_begin(), then marking all related stale routes with REF_DISCARD + * flag in rt_refresh_end() and then removing such routes in the prune loop. + */ void rt_refresh_begin(rtable *t, struct announce_hook *ah) { @@ -1126,6 +1141,14 @@ rt_refresh_begin(rtable *t, struct announce_hook *ah) FIB_WALK_END; } +/** + * rt_refresh_end - end a refresh cycle + * @t: related routing table + * @ah: related announce hook + * + * This function starts a refresh cycle for given routing table and announce + * hook. See rt_refresh_begin() for description of refresh cycles. + */ void rt_refresh_end(rtable *t, struct announce_hook *ah) { @@ -1405,6 +1428,19 @@ again: return 1; } +/** + * rt_prune_table - prune a routing table + * + * This function scans the routing table @tab and removes routes belonging to + * flushing protocols, discarded routes and also stale network entries, in a + * similar fashion like rt_prune_loop(). Returns 1 when all such routes are + * pruned. Contrary to rt_prune_loop(), this function is not a part of the + * protocol flushing loop, but it is called from rt_event() for just one routing + * table. + * + * Note that rt_prune_table() and rt_prune_loop() share (for each table) the + * prune state (@prune_state) and also the pruning iterator (@prune_fit). + */ static inline int rt_prune_table(rtable *tab) { @@ -1415,16 +1451,15 @@ rt_prune_table(rtable *tab) /** * rt_prune_loop - prune routing tables * - * The prune loop scans routing tables and removes routes belonging to - * flushing protocols and also stale network entries. Returns 1 when - * all such routes are pruned. It is a part of the protocol flushing - * loop. 
+ * The prune loop scans routing tables and removes routes belonging to flushing + * protocols, discarded routes and also stale network entries. Returns 1 when + * all such routes are pruned. It is a part of the protocol flushing loop. * - * The prune loop runs in two steps. In the first step it prunes just - * the routes with flushing senders (in explicitly marked tables) so - * the route removal is propagated as usual. In the second step, all - * remaining relevant routes are removed. Ideally, there shouldn't be - * any, but it happens when pipe filters are changed. + * The prune loop runs in two steps. In the first step it prunes just the routes + * with flushing senders (in explicitly marked tables) so the route removal is + * propagated as usual. In the second step, all remaining relevant routes are + * removed. Ideally, there shouldn't be any, but it happens when pipe filters + * are changed. */ int rt_prune_loop(void) diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index ae9f6877..326883dd 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -51,6 +51,16 @@ * and bgp_encode_attrs() which does the converse. Both functions are built around a * @bgp_attr_table array describing all important characteristics of all known attributes. * Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams. + * + * BGP protocol implements graceful restart in both restarting (local restart) + * and receiving (neighbor restart) roles. The first is handled mostly by the + * graceful restart code in the nest, BGP protocol just handles capabilities, + * sets @gr_wait and locks graceful restart until end-of-RIB mark is received. + * The second is implemented by internal restart of the BGP state to %BS_IDLE + * and protocol state to %PS_START, but keeping the protocol up from the core + * point of view and therefore maintaining received routes. Routing table + * refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing + * stale routes after reestablishment of BGP session during graceful restart. */ #undef LOCAL_DEBUG @@ -431,6 +441,17 @@ bgp_conn_enter_idle_state(struct bgp_conn *conn) bgp_conn_leave_established_state(p); } +/** + * bgp_handle_graceful_restart - handle detected BGP graceful restart + * @p: BGP instance + * + * This function is called when a BGP graceful restart of the neighbor is + * detected (when the TCP connection fails or when a new TCP connection + * appears). The function activates processing of the restart - starts routing + * table refresh cycle and activates BGP restart timer. The protocol state goes + * back to %PS_START, but changing BGP state back to %BS_IDLE is left for the + * caller. + */ void bgp_handle_graceful_restart(struct bgp_proto *p) { @@ -448,6 +469,16 @@ bgp_handle_graceful_restart(struct bgp_proto *p) rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook); } +/** + * bgp_graceful_restart_done - finish active BGP graceful restart + * @p: BGP instance + * + * This function is called when the active BGP graceful restart of the neighbor + * should be finished - either successfully (the neighbor sends all paths and + * reports end-of-RIB on the new session) or unsuccessfully (the neighbor does + * not support BGP graceful restart on the new session). The function ends + * routing table refresh cycle and stops BGP restart timer. 
+ */ void bgp_graceful_restart_done(struct bgp_proto *p) { @@ -457,6 +488,15 @@ bgp_graceful_restart_done(struct bgp_proto *p) rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook); } +/** + * bgp_graceful_restart_timeout - timeout of graceful restart 'restart timer' + * @t: timer + * + * This function is a timeout hook for @gr_timer, implementing BGP restart time + * limit for reestablisment of the BGP session after the graceful restart. When + * fired, we just proceed with the usual protocol restart. + */ + static void bgp_graceful_restart_timeout(timer *t) { @@ -968,7 +1008,7 @@ bgp_start(struct proto *P) p->remote_id = 0; p->source_addr = p->cf->source_addr; - if (P->gr_recovery) + if (p->p.gr_recovery && p->cf->gr_mode) proto_graceful_restart_lock(P); /* -- cgit v1.2.3 From 227af309e55a59f14d1a5a757f17900164bffc97 Mon Sep 17 00:00:00 2001 From: Ondrej Zajicek Date: Mon, 24 Mar 2014 12:32:12 +0100 Subject: Fixes some minor issues in graceful restart. --- nest/proto.c | 69 +++++++++++++++++++++++++++++++++------------------------ proto/bgp/bgp.c | 2 +- 2 files changed, 41 insertions(+), 30 deletions(-) (limited to 'nest') diff --git a/nest/proto.c b/nest/proto.c index e990b48f..13a0833a 100644 --- a/nest/proto.c +++ b/nest/proto.c @@ -56,31 +56,10 @@ static void proto_fell_down(struct proto *p); static char *proto_state_name(struct proto *p); static void -proto_enqueue(list *l, struct proto *p) -{ - add_tail(l, &p->n); -} - -static void -proto_set_core_state(struct proto *p, uint state) +proto_relink(struct proto *p) { list *l = NULL; - p->core_state = state; - - if (p->debug & D_STATES) - { - char *name = proto_state_name(p); - if (name != p->last_state_name_announced) - { - p->last_state_name_announced = name; - PD(p, "State changed to %s", proto_state_name(p)); - } - } - else - p->last_state_name_announced = NULL; - - rem_node(&p->n); switch (p->core_state) { case FS_HUNGRY: @@ -95,9 +74,28 @@ proto_set_core_state(struct proto *p, uint state) default: ASSERT(0); } - proto_enqueue(l, p); + + rem_node(&p->n); + add_tail(l, &p->n); +} + +static void +proto_log_state_change(struct proto *p) +{ + if (p->debug & D_STATES) + { + char *name = proto_state_name(p); + if (name != p->last_state_name_announced) + { + p->last_state_name_announced = name; + PD(p, "State changed to %s", proto_state_name(p)); + } + } + else + p->last_state_name_announced = NULL; } + /** * proto_new - create a new protocol instance * @c: protocol configuration @@ -390,7 +388,8 @@ proto_init(struct proto_config *c) q->export_state = ES_DOWN; q->last_state_change = now; - proto_enqueue(&initial_proto_list, q); + add_tail(&initial_proto_list, &q->n); + if (p == &proto_unix_iface) initial_device_proto = q; @@ -758,7 +757,10 @@ graceful_restart_done(struct timer *t UNUSED) /* Resume postponed export of routes */ if ((p->proto_state == PS_UP) && p->gr_wait) + { proto_want_export_up(p); + proto_log_state_change(p); + } /* Cleanup */ p->gr_recovery = 0; @@ -954,6 +956,7 @@ proto_feed_more(void *P) { DBG("Feeding protocol %s finished\n", p->name); p->export_state = ES_READY; + proto_log_state_change(p); if (p->feed_done) p->feed_done(p); @@ -1047,7 +1050,9 @@ proto_flush_loop(void *unused UNUSED) DBG("Flushing protocol %s\n", p->name); p->flushing = 0; - proto_set_core_state(p, FS_HUNGRY); + p->core_state = FS_HUNGRY; + proto_relink(p); + proto_log_state_change(p); if (p->proto_state == PS_DOWN) proto_fell_down(p); goto again; @@ -1138,6 +1143,7 @@ proto_request_feeding(struct proto *p) p->stats.exp_routes = 0; 
proto_schedule_feed(p, 0); + proto_log_state_change(p); } static const char * @@ -1222,7 +1228,8 @@ proto_want_core_up(struct proto *p) proto_reset_limit(p->main_ahook->out_limit); } - proto_set_core_state(p, FS_HAPPY); + p->core_state = FS_HAPPY; + proto_relink(p); } static void @@ -1254,7 +1261,8 @@ proto_want_core_down(struct proto *p) ASSERT(p->core_state == CS_HAPPY); ASSERT(p->export_state == ES_DOWN); - proto_set_core_state(p, FS_FLUSHING); + p->core_state = FS_FLUSHING; + proto_relink(p); proto_schedule_flush_loop(); if (!p->proto->multitable) @@ -1373,6 +1381,7 @@ proto_notify_state(struct proto *p, unsigned ps) if (cs == FS_HUNGRY) /* Shutdown finished */ { + proto_log_state_change(p); proto_fell_down(p); return; /* The protocol might have ceased to exist */ } @@ -1381,6 +1390,8 @@ proto_notify_state(struct proto *p, unsigned ps) default: bug("%s: Invalid state %d", p->name, ps); } + + proto_log_state_change(p); } /* @@ -1404,8 +1415,8 @@ proto_state_name(struct proto *p) case ES_READY: return "up"; default: return "???"; } - case P(PS_STOP, FS_HUNGRY): return "stop"; - case P(PS_STOP, FS_FLUSHING): + case P(PS_STOP, FS_HUNGRY): + case P(PS_STOP, FS_FLUSHING): return "stop"; case P(PS_DOWN, FS_FLUSHING): return "flush"; default: return "???"; } diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c index 326883dd..ca619f31 100644 --- a/proto/bgp/bgp.c +++ b/proto/bgp/bgp.c @@ -927,7 +927,7 @@ static void bgp_feed_done(struct proto *P) { struct bgp_proto *p = (struct bgp_proto *) P; - if (!p->conn || !p->cf->gr_mode) + if (!p->conn || !p->cf->gr_mode || p->p.refeeding) return; p->send_end_mark = 1; -- cgit v1.2.3
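
A minimal sketch of how a protocol could cooperate with the graceful restart core introduced by these commits. The foo_start()/foo_converged() hooks and the protocol itself are hypothetical; proto_graceful_restart_lock(), proto_graceful_restart_unlock(), the gr_recovery/gr_wait fields and the feed_done hook are the API actually added above (BGP and the kernel protocol use it in essentially this way).

#include "nest/bird.h"
#include "nest/protocol.h"

static int
foo_start(struct proto *P)
{
  if (P->gr_recovery)
  {
    /* Postpone the end of the global recovery until this protocol
       converges; allowed only from the start hook (GRS_INIT phase) */
    proto_graceful_restart_lock(P);

    /* Defer route export (and the initial feed) to this protocol
       until the recovery is finished */
    P->gr_wait = 1;
  }

  return PS_UP;
}

/* Hypothetical convergence point, e.g. an end-of-RIB mark was received */
static void
foo_converged(struct proto *P)
{
  /* Dropping the last lock lets graceful_restart_done() run, which
     resumes the deferred exports and clears the gr_* state */
  proto_graceful_restart_unlock(P);
}

On the configuration side, the documentation added in the second commit describes the matching knobs: the global "graceful restart wait <num>" timeout (default 240 seconds), "graceful restart" in the kernel protocol, "graceful restart" (disabled/aware/enabled) and "graceful restart time <num>" in BGP, and the -R command-line option that requests recovery after start.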