From 0c791f873aeb7c1052c97db7da4fe23873d69603 Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Thu, 20 Mar 2014 14:07:12 +0100
Subject: BGP graceful restart support.

Also significant core protocol state changes needed for that,
global graceful restart recovery state and kernel proto support
for recovery.
---
 proto/bgp/bgp.c     | 106 ++++++++++++++++++++++++++++++++++---
 proto/bgp/bgp.h     |  23 ++++++++
 proto/bgp/config.Y  |   7 ++-
 proto/bgp/packets.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++------
 4 files changed, 264 insertions(+), 22 deletions(-)

(limited to 'proto/bgp')

diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c
index a748669d..ae9f6877 100644
--- a/proto/bgp/bgp.c
+++ b/proto/bgp/bgp.c
@@ -319,6 +319,7 @@ bgp_decision(void *vp)
   DBG("BGP: Decision start\n");
   if ((p->p.proto_state == PS_START)
       && (p->outgoing_conn.state == BS_IDLE)
+      && (p->incoming_conn.state != BS_OPENCONFIRM)
       && (!p->cf->passive))
     bgp_active(p);
 
@@ -363,7 +364,7 @@ bgp_conn_enter_established_state(struct bgp_conn *conn)
 
   /* For multi-hop BGP sessions */
   if (ipa_zero(p->source_addr))
-    p->source_addr = conn->sk->saddr; 
+    p->source_addr = conn->sk->saddr;
 
   p->conn = conn;
   p->last_error_class = 0;
@@ -371,6 +372,20 @@ bgp_conn_enter_established_state(struct bgp_conn *conn)
   bgp_init_bucket_table(p);
   bgp_init_prefix_table(p, 8);
 
+  int peer_gr_ready = conn->peer_gr_aware && !(conn->peer_gr_flags & BGP_GRF_RESTART);
+
+  if (p->p.gr_recovery && !peer_gr_ready)
+    proto_graceful_restart_unlock(&p->p);
+
+  if (p->p.gr_recovery && (p->cf->gr_mode == BGP_GR_ABLE) && peer_gr_ready)
+    p->p.gr_wait = 1;
+
+  if (p->gr_active)
+    tm_stop(p->gr_timer);
+
+  if (p->gr_active && (!conn->peer_gr_able || !(conn->peer_gr_aflags & BGP_GRF_FORWARDING)))
+    bgp_graceful_restart_done(p);
+
   bgp_conn_set_state(conn, BS_ESTABLISHED);
   proto_notify_state(&p->p, PS_UP);
 }
@@ -416,16 +431,56 @@ bgp_conn_enter_idle_state(struct bgp_conn *conn)
     bgp_conn_leave_established_state(p);
 }
 
+void
+bgp_handle_graceful_restart(struct bgp_proto *p)
+{
+  ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready);
+
+  BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s",
+	    p->gr_active ? " - already pending" : "");
+  proto_notify_state(&p->p, PS_START);
+
+  if (p->gr_active)
+    rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
+
+  p->gr_active = 1;
+  bgp_start_timer(p->gr_timer, p->conn->peer_gr_time);
+  rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook);
+}
+
+void
+bgp_graceful_restart_done(struct bgp_proto *p)
+{
+  BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");
+  p->gr_active = 0;
+  tm_stop(p->gr_timer);
+  rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
+}
+
+static void
+bgp_graceful_restart_timeout(timer *t)
+{
+  struct bgp_proto *p = t->data;
+
+  BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout");
+  bgp_stop(p, 0);
+}
+
 static void
 bgp_send_open(struct bgp_conn *conn)
 {
   conn->start_state = conn->bgp->start_state;
 
   // Default values, possibly changed by receiving capabilities.
+  conn->advertised_as = 0;
   conn->peer_refresh_support = 0;
   conn->peer_as4_support = 0;
   conn->peer_add_path = 0;
-  conn->advertised_as = 0;
+  conn->peer_gr_aware = 0;
+  conn->peer_gr_able = 0;
+  conn->peer_gr_time = 0;
+  conn->peer_gr_flags = 0;
+  conn->peer_gr_aflags = 0;
 
   DBG("BGP: Sending open\n");
   conn->sk->rx_hook = bgp_rx;
@@ -484,6 +539,9 @@ bgp_sock_err(sock *sk, int err)
   else
     BGP_TRACE(D_EVENTS, "Connection closed");
 
+  if ((conn->state == BS_ESTABLISHED) && p->gr_ready)
+    bgp_handle_graceful_restart(p);
+
   bgp_conn_enter_idle_state(conn);
 }
 
@@ -649,6 +707,14 @@ bgp_incoming_connection(sock *sk, int dummy UNUSED)
 	    int acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) &&
 	      (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk);
 
+	    if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
+	    {
+	      bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART);
+	      bgp_handle_graceful_restart(p);
+	      bgp_conn_enter_idle_state(p->conn);
+	      acc = 1;
+	    }
+
 	    BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
 		      sk->daddr, ipa_has_link_scope(sk->daddr) ? sk->iface : NULL,
 		      sk->dport, acc ? "accepted" : "rejected");
@@ -817,6 +883,17 @@ bgp_reload_routes(struct proto *P)
   return 1;
 }
 
+static void
+bgp_feed_done(struct proto *P)
+{
+  struct bgp_proto *p = (struct bgp_proto *) P;
+  if (!p->conn || !p->cf->gr_mode)
+    return;
+
+  p->send_end_mark = 1;
+  bgp_schedule_packet(p->conn, PKT_UPDATE);
+}
+
 static void
 bgp_start_locked(struct object_lock *lock)
 {
@@ -867,6 +944,8 @@ bgp_start(struct proto *P)
   p->incoming_conn.state = BS_IDLE;
   p->neigh = NULL;
   p->bfd_req = NULL;
+  p->gr_ready = 0;
+  p->gr_active = 0;
 
   rt_lock_table(p->igp_table);
 
@@ -878,6 +957,10 @@ bgp_start(struct proto *P)
   p->startup_timer->hook = bgp_startup_timeout;
   p->startup_timer->data = p;
 
+  p->gr_timer = tm_new(p->p.pool);
+  p->gr_timer->hook = bgp_graceful_restart_timeout;
+  p->gr_timer->data = p;
+
   p->local_id = proto_get_router_id(P->cf);
   if (p->rr_client)
     p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;
@@ -885,6 +968,9 @@ bgp_start(struct proto *P)
   p->remote_id = 0;
   p->source_addr = p->cf->source_addr;
 
+  if (P->gr_recovery)
+    proto_graceful_restart_lock(P);
+
   /*
    *  Before attempting to create the connection, we need to lock the
    *  port, so that are sure we're the only instance attempting to talk
@@ -985,6 +1071,7 @@ bgp_init(struct proto_config *C)
   P->import_control = bgp_import_control;
   P->neigh_notify = bgp_neigh_notify;
   P->reload_routes = bgp_reload_routes;
+  P->feed_done = bgp_feed_done;
   P->rte_better = bgp_rte_better;
   P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL;
 
@@ -1164,7 +1251,7 @@ bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code)
 
 static char *bgp_state_names[] = { "Idle", "Connect", "Active", "OpenSent", "OpenConfirm", "Established", "Close" };
 static char *bgp_err_classes[] = { "", "Error: ", "Socket: ", "Received: ", "BGP Error: ", "Automatic shutdown: ", ""};
-static char *bgp_misc_errors[] = { "", "Neighbor lost", "Invalid next hop", "Kernel MD5 auth failed", "No listening socket", "BFD session down" };
+static char *bgp_misc_errors[] = { "", "Neighbor lost", "Invalid next hop", "Kernel MD5 auth failed", "No listening socket", "BFD session down", "Graceful restart"};
 static char *bgp_auto_errors[] = { "", "Route limit exceeded"};
 
 static const char *
@@ -1225,25 +1312,32 @@ bgp_show_proto_info(struct proto *P)
   cli_msg(-1006, "    Neighbor address: %I%J", p->cf->remote_ip, p->cf->iface);
   cli_msg(-1006, "    Neighbor AS:      %u", p->remote_as);
 
+  if (p->gr_active)
+    cli_msg(-1006, "    Neighbor graceful restart active");
+
   if (P->proto_state == PS_START)
     {
       struct bgp_conn *oc = &p->outgoing_conn;
 
       if ((p->start_state < BSS_CONNECT) &&
 	  (p->startup_timer->expires))
-	cli_msg(-1006, "    Error wait:       %d/%d", 
+	cli_msg(-1006, "    Error wait:       %d/%d",
 		p->startup_timer->expires - now, p->startup_delay);
 
       if ((oc->state == BS_ACTIVE) &&
 	  (oc->connect_retry_timer->expires))
-	cli_msg(-1006, "    Start delay:      %d/%d", 
+	cli_msg(-1006, "    Start delay:      %d/%d",
 		oc->connect_retry_timer->expires - now, p->cf->start_delay_time);
+
+      if (p->gr_active && p->gr_timer->expires)
+	cli_msg(-1006, "    Restart timer:    %d/-", p->gr_timer->expires - now);
     }
   else if (P->proto_state == PS_UP)
     {
       cli_msg(-1006, "    Neighbor ID:      %R", p->remote_id);
-      cli_msg(-1006, "    Neighbor caps:   %s%s%s%s",
+      cli_msg(-1006, "    Neighbor caps:   %s%s%s%s%s",
 	      c->peer_refresh_support ? " refresh" : "",
+	      c->peer_gr_able ? " restart-able" : (c->peer_gr_aware ? " restart-aware" : ""),
 	      c->peer_as4_support ? " AS4" : "",
 	      (c->peer_add_path & ADD_PATH_RX) ? " add-path-rx" : "",
 	      (c->peer_add_path & ADD_PATH_TX) ? " add-path-tx" : "");
diff --git a/proto/bgp/bgp.h b/proto/bgp/bgp.h
index 170b6bbe..da0114c2 100644
--- a/proto/bgp/bgp.h
+++ b/proto/bgp/bgp.h
@@ -48,6 +48,8 @@ struct bgp_config {
   int secondary;			/* Accept also non-best routes (i.e. RA_ACCEPTED) */
   int add_path;				/* Use ADD-PATH extension [draft] */
   int allow_local_as;			/* Allow that number of local ASNs in incoming AS_PATHs */
+  int gr_mode;				/* Graceful restart mode (BGP_GR_*) */
+  unsigned gr_time;			/* Graceful restart timeout */
   unsigned connect_retry_time;
   unsigned hold_time, initial_hold_time;
   unsigned keepalive_time;
@@ -73,6 +75,15 @@ struct bgp_config {
 #define ADD_PATH_TX 2
 #define ADD_PATH_FULL 3
 
+#define BGP_GR_ABLE 1
+#define BGP_GR_AWARE 2
+
+/* For peer_gr_flags */
+#define BGP_GRF_RESTART 0x80
+
+/* For peer_gr_aflags */
+#define BGP_GRF_FORWARDING 0x80
+
 
 struct bgp_conn {
   struct bgp_proto *bgp;
@@ -90,6 +101,11 @@ struct bgp_conn {
   u8 peer_refresh_support;		/* Peer supports route refresh [RFC2918] */
   u8 peer_as4_support;			/* Peer supports 4B AS numbers [RFC4893] */
   u8 peer_add_path;			/* Peer supports ADD-PATH [draft] */
+  u8 peer_gr_aware;
+  u8 peer_gr_able;
+  u16 peer_gr_time;
+  u8 peer_gr_flags;
+  u8 peer_gr_aflags;
   unsigned hold_time, keepalive_time;	/* Times calculated from my and neighbor's requirements */
 };
 
@@ -107,6 +123,8 @@ struct bgp_proto {
   u32 rr_cluster_id;			/* Route reflector cluster ID */
   int rr_client;			/* Whether neighbor is RR client of me */
   int rs_client;			/* Whether neighbor is RS client of me */
+  u8 gr_ready;				/* Neighbor could do graceful restart */
+  u8 gr_active;				/* Neighbor is doing graceful restart */
   struct bgp_conn *conn;		/* Connection we have established */
   struct bgp_conn outgoing_conn;	/* Outgoing connection we're working with */
   struct bgp_conn incoming_conn;	/* Incoming connection we have neither accepted nor rejected yet */
@@ -117,12 +135,14 @@ struct bgp_proto {
   rtable *igp_table;			/* Table used for recursive next hop lookups */
   struct event *event;			/* Event for respawning and shutting process */
   struct timer *startup_timer;		/* Timer used to delay protocol startup due to previous errors (startup_delay) */
+  struct timer *gr_timer;		/* Timer waiting for reestablishment after graceful restart */
   struct bgp_bucket **bucket_hash;	/* Hash table of attribute buckets */
   unsigned int hash_size, hash_count, hash_limit;
   HASH(struct bgp_prefix) prefix_hash;	/* Prefixes to be sent */
   slab *prefix_slab;			/* Slab holding prefix nodes */
   list bucket_queue;			/* Queue of buckets to send */
   struct bgp_bucket *withdraw_bucket;	/* Withdrawn routes */
+  unsigned send_end_mark;		/* End-of-RIB mark scheduled for transmit */
   unsigned startup_delay;		/* Time to delay protocol startup by due to errors */
   bird_clock_t last_proto_error;	/* Time of last error that leads to protocol stop */
   u8 last_error_class; 			/* Error class of last error */
@@ -172,6 +192,8 @@ void bgp_conn_enter_openconfirm_state(struct bgp_conn *conn);
 void bgp_conn_enter_established_state(struct bgp_conn *conn);
 void bgp_conn_enter_close_state(struct bgp_conn *conn);
 void bgp_conn_enter_idle_state(struct bgp_conn *conn);
+void bgp_handle_graceful_restart(struct bgp_proto *p);
+void bgp_graceful_restart_done(struct bgp_proto *p);
 void bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code);
 void bgp_stop(struct bgp_proto *p, unsigned subcode);
 
@@ -313,6 +335,7 @@ void bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsi
 #define BEM_INVALID_MD5		3	/* MD5 authentication kernel request failed (possibly not supported) */
 #define BEM_NO_SOCKET		4
 #define BEM_BFD_DOWN		5
+#define BEM_GRACEFUL_RESTART	6
 
 /* Automatic shutdown error codes */
 
diff --git a/proto/bgp/config.Y b/proto/bgp/config.Y
index 76a76470..6b885032 100644
--- a/proto/bgp/config.Y
+++ b/proto/bgp/config.Y
@@ -26,7 +26,7 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY,
 	PREFER, OLDER, MISSING, LLADDR, DROP, IGNORE, ROUTE, REFRESH,
 	INTERPRET, COMMUNITIES, BGP_ORIGINATOR_ID, BGP_CLUSTER_LIST, IGP,
 	TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL, SECURITY, DETERMINISTIC,
-	SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX)
+	SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX, GRACEFUL, RESTART, AWARE)
 
 CF_GRAMMAR
 
@@ -50,6 +50,8 @@ bgp_proto_start: proto_start BGP {
      BGP_CFG->advertise_ipv4 = 1;
      BGP_CFG->interpret_communities = 1;
      BGP_CFG->default_local_pref = 100;
+     BGP_CFG->gr_mode = BGP_GR_AWARE;
+     BGP_CFG->gr_time = 120;
  }
  ;
 
@@ -115,6 +117,9 @@ bgp_proto:
  | bgp_proto ADD PATHS bool ';' { BGP_CFG->add_path = $4 ? ADD_PATH_FULL : 0; }
  | bgp_proto ALLOW LOCAL AS ';' { BGP_CFG->allow_local_as = -1; }
  | bgp_proto ALLOW LOCAL AS expr ';' { BGP_CFG->allow_local_as = $5; }
+ | bgp_proto GRACEFUL RESTART bool ';' { BGP_CFG->gr_mode = $4; }
+ | bgp_proto GRACEFUL RESTART AWARE ';' { BGP_CFG->gr_mode = BGP_GR_AWARE; }
+ | bgp_proto GRACEFUL RESTART TIME expr ';' { BGP_CFG->gr_time = $5; }
  | bgp_proto IGP TABLE rtable ';' { BGP_CFG->igp_table = $4; }
  | bgp_proto TTL SECURITY bool ';' { BGP_CFG->ttl_security = $4; }
  | bgp_proto BFD bool ';' { BGP_CFG->bfd = $3; cf_check_bfd($3); }
diff --git a/proto/bgp/packets.c b/proto/bgp/packets.c
index 649d8078..2d4da8c9 100644
--- a/proto/bgp/packets.c
+++ b/proto/bgp/packets.c
@@ -122,7 +122,7 @@ bgp_create_notification(struct bgp_conn *conn, byte *buf)
 
 #ifdef IPV6
 static byte *
-bgp_put_cap_ipv6(struct bgp_conn *conn UNUSED, byte *buf)
+bgp_put_cap_ipv6(struct bgp_proto *p UNUSED, byte *buf)
 {
   *buf++ = 1;		/* Capability 1: Multiprotocol extensions */
   *buf++ = 4;		/* Capability data length */
@@ -136,7 +136,7 @@ bgp_put_cap_ipv6(struct bgp_conn *conn UNUSED, byte *buf)
 #else
 
 static byte *
-bgp_put_cap_ipv4(struct bgp_conn *conn UNUSED, byte *buf)
+bgp_put_cap_ipv4(struct bgp_proto *p UNUSED, byte *buf)
 {
   *buf++ = 1;		/* Capability 1: Multiprotocol extensions */
   *buf++ = 4;		/* Capability data length */
@@ -149,7 +149,7 @@ bgp_put_cap_ipv4(struct bgp_conn *conn UNUSED, byte *buf)
 #endif
 
 static byte *
-bgp_put_cap_rr(struct bgp_conn *conn UNUSED, byte *buf)
+bgp_put_cap_rr(struct bgp_proto *p UNUSED, byte *buf)
 {
   *buf++ = 2;		/* Capability 2: Support for route refresh */
   *buf++ = 0;		/* Capability data length */
@@ -157,16 +157,44 @@ bgp_put_cap_rr(struct bgp_conn *conn UNUSED, byte *buf)
 }
 
 static byte *
-bgp_put_cap_as4(struct bgp_conn *conn, byte *buf)
+bgp_put_cap_gr1(struct bgp_proto *p, byte *buf)	/* Full GR capability (restarting + receiving role); returns end of data */
+{
+  *buf++ = 64;		/* Capability 64: Support for graceful restart */
+  *buf++ = 6;		/* Capability data length */
+  /* Restart time is a 12-bit field; clamp so it cannot spill into the flag bits */
+  put_u16(buf, (p->cf->gr_time > 0x0fff) ? 0x0fff : p->cf->gr_time);
+  if (p->p.gr_recovery)
+    buf[0] |= BGP_GRF_RESTART;	/* Restart flag: set while we are in recovery */
+  buf += 2;
+
+  *buf++ = 0;		/* Appropriate AF */
+  *buf++ = BGP_AF;
+  *buf++ = 1;		/* and SAFI 1 */
+  *buf++ = p->p.gr_recovery ? BGP_GRF_FORWARDING : 0;	/* Forwarding flag only while in recovery */
+
+  return buf;
+}
+
+static byte *		/* Restart-aware-only GR capability: header only, no AFI/SAFI entries */
+bgp_put_cap_gr2(struct bgp_proto *p UNUSED, byte *buf)
+{
+  *buf++ = 64;		/* Capability 64: Support for graceful restart */
+  *buf++ = 2;		/* Capability data length */
+  put_u16(buf, 0);	/* Zero restart flags and zero restart time */
+  return buf + 2;
+}
+
+static byte *
+bgp_put_cap_as4(struct bgp_proto *p, byte *buf)
 {
   *buf++ = 65;		/* Capability 65: Support for 4-octet AS number */
   *buf++ = 4;		/* Capability data length */
-  put_u32(buf, conn->bgp->local_as);
+  put_u32(buf, p->local_as);
   return buf + 4;
 }
 
 static byte *
-bgp_put_cap_add_path(struct bgp_conn *conn, byte *buf)
+bgp_put_cap_add_path(struct bgp_proto *p, byte *buf)
 {
   *buf++ = 69;		/* Capability 69: Support for ADD-PATH */
   *buf++ = 4;		/* Capability data length */
@@ -175,7 +203,7 @@ bgp_put_cap_add_path(struct bgp_conn *conn, byte *buf)
   *buf++ = BGP_AF;
   *buf++ = 1;		/* SAFI 1 */
 
-  *buf++ = conn->bgp->cf->add_path;
+  *buf++ = p->cf->add_path;
 
   return buf;
 }
@@ -206,21 +234,26 @@ bgp_create_open(struct bgp_conn *conn, byte *buf)
 
 #ifndef IPV6
   if (p->cf->advertise_ipv4)
-    cap = bgp_put_cap_ipv4(conn, cap);
+    cap = bgp_put_cap_ipv4(p, cap);
 #endif
 
 #ifdef IPV6
-  cap = bgp_put_cap_ipv6(conn, cap);
+  cap = bgp_put_cap_ipv6(p, cap);
 #endif
 
   if (p->cf->enable_refresh)
-    cap = bgp_put_cap_rr(conn, cap);
+    cap = bgp_put_cap_rr(p, cap);
+
+  if (p->cf->gr_mode == BGP_GR_ABLE)
+    cap = bgp_put_cap_gr1(p, cap);
+  else if (p->cf->gr_mode == BGP_GR_AWARE)
+    cap = bgp_put_cap_gr2(p, cap);
 
   if (p->cf->enable_as4)
-    cap = bgp_put_cap_as4(conn, cap);
+    cap = bgp_put_cap_as4(p, cap);
 
   if (p->cf->add_path)
-    cap = bgp_put_cap_add_path(conn, cap);
+    cap = bgp_put_cap_add_path(p, cap);
 
   cap_len = cap - buf - 12;
   if (cap_len > 0)
@@ -351,6 +384,16 @@ bgp_create_update(struct bgp_conn *conn, byte *buf)
     return NULL;
 }
 
+static byte *		/* Builds an End-of-RIB marker at buf; returns pointer past the written data */
+bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
+{
+  struct bgp_proto *p = conn->bgp;	/* presumably referenced by BGP_TRACE -- confirm */
+  BGP_TRACE(D_PACKETS, "Sending End-of-RIB");
+
+  put_u32(buf, 0);	/* Four zero bytes; presumably the two 16-bit UPDATE length fields -- confirm */
+  return buf+4;
+}
+
 #else		/* IPv6 version */
 
 static inline int
@@ -520,6 +563,26 @@ bgp_create_update(struct bgp_conn *conn, byte *buf)
     return NULL;
 }
 
+static byte *		/* Builds an IPv6 End-of-RIB marker at buf; returns pointer past the written data */
+bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
+{
+  struct bgp_proto *p = conn->bgp;
+  BGP_TRACE(D_PACKETS, "Sending End-of-RIB");
+
+  put_u16(buf+0, 0);	/* First 16-bit field is zero (presumably withdrawn routes length) */
+  put_u16(buf+2, 6);	/* length 4-9 */
+  buf += 4;
+
+  /* Empty MP_UNREACH_NLRI attribute */
+  *buf++ = BAF_OPTIONAL;
+  *buf++ = BA_MP_UNREACH_NLRI;
+  *buf++ = 3;		/* Length 7-9 */
+  *buf++ = 0;		/* AFI */
+  *buf++ = BGP_AF_IPV6;
+  *buf++ = 1;		/* SAFI */
+  return buf;
+}
+
 #endif
 
 static byte *
@@ -606,10 +669,16 @@ bgp_fire_tx(struct bgp_conn *conn)
     {
       end = bgp_create_update(conn, pkt);
       type = PKT_UPDATE;
+
       if (!end)
 	{
 	  conn->packets_to_send = 0;
-	  return 0;
+
+	  if (!p->send_end_mark)
+	    return 0;
+
+	  p->send_end_mark = 0;
+	  end = bgp_create_end_mark(conn, pkt);
 	}
     }
   else
@@ -678,6 +747,22 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len)
 	  conn->peer_refresh_support = 1;
 	  break;
 
+	case 64: /* Graceful restart capability, RFC 4724 */
+	  if (cl % 4 != 2)
+	    goto err;
+	  conn->peer_gr_aware = 1;
+	  conn->peer_gr_able = 0;
+	  conn->peer_gr_time = get_u16(opt + 2) & 0x0fff;
+	  conn->peer_gr_flags = opt[2] & 0xf0;
+	  conn->peer_gr_aflags = 0;
+	  for (i = 2; i < cl; i += 4)
+	    if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */
+	      {
+		conn->peer_gr_able = 1;
+		conn->peer_gr_aflags = opt[2+i+3];
+	      }
+	  break;
+
 	case 65: /* AS4 capability, RFC 4893 */
 	  if (cl != 4)
 	    goto err;
@@ -704,7 +789,7 @@ bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len)
     }
   return;
 
-    err:
+ err:
   bgp_error(conn, 2, 0, NULL, 0);
   return;
 }
@@ -807,12 +892,17 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len)
   other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
   switch (other->state)
     {
-    case BS_IDLE:
     case BS_CONNECT:
     case BS_ACTIVE:
+      /* Stop outgoing connection attempts */
+      bgp_conn_enter_idle_state(other);
+      break;
+
+    case BS_IDLE:
     case BS_OPENSENT:
     case BS_CLOSE:
       break;
+
     case BS_OPENCONFIRM:
       if ((p->local_id < id) == (conn == &p->incoming_conn))
 	{
@@ -838,6 +928,7 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len)
   p->as4_session = p->cf->enable_as4 && conn->peer_as4_support;
   p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX);
   p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX);
+  p->gr_ready = p->cf->gr_mode && conn->peer_gr_able;
 
   if (p->add_path_tx)
     p->p.accept_ra_types = RA_ANY;
@@ -849,6 +940,20 @@ bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len)
   bgp_conn_enter_openconfirm_state(conn);
 }
 
+
+static inline void	/* Handles a received End-of-RIB marker */
+bgp_rx_end_mark(struct bgp_proto *p)
+{
+  BGP_TRACE(D_PACKETS, "Got End-of-RIB");
+
+  if (p->p.gr_recovery)		/* Local recovery flag set: release the graceful restart lock */
+    proto_graceful_restart_unlock(&p->p);
+
+  if (p->gr_active)		/* Neighbor graceful restart pending: finish it */
+    bgp_graceful_restart_done(p);
+}
+
+
 #define DECODE_PREFIX(pp, ll) do {		\
   if (p->add_path_rx)				\
   {						\
@@ -983,6 +1088,13 @@ bgp_do_rx_update(struct bgp_conn *conn,
   u32 path_id = 0;
   u32 last_id = 0;
 
+  /* Check for End-of-RIB marker */
+  if (!withdrawn_len && !attr_len && !nlri_len)
+    {
+      bgp_rx_end_mark(p);
+      return;
+    }
+
   /* Withdraw routes */
   while (withdrawn_len)
     {
@@ -1088,6 +1200,14 @@ bgp_do_rx_update(struct bgp_conn *conn,
   if (conn->state != BS_ESTABLISHED)	/* fatal error during decoding */
     return;
 
+  /* Check for End-of-RIB marker: only an empty MP_UNREACH_NLRI (6 attribute bytes) */
+  if ((attr_len < 8) && !withdrawn_len && !nlri_len &&
+      (p->mp_unreach_len == 3) && (get_u16(p->mp_unreach_start) == BGP_AF_IPV6))
+    {
+      bgp_rx_end_mark(p);
+      return;
+    }
+
   DO_NLRI(mp_unreach)
     {
       while (len)
-- 
cgit v1.2.3


From 6eda3f135f5bab4db456531d25bc3e5f669ec22e Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Sun, 23 Mar 2014 01:35:33 +0100
Subject: Documentation (and minor fixes) for BGP graceful restart.

---
 doc/bird.sgml   |  99 ++++++++++++++++++++++++++++--------
 nest/proto.c    | 154 +++++++++++++++++++++++++++++++++++++++++---------------
 nest/rt-table.c |  53 +++++++++++++++----
 proto/bgp/bgp.c |  42 +++++++++++++++-
 4 files changed, 276 insertions(+), 72 deletions(-)

(limited to 'proto/bgp')

diff --git a/doc/bird.sgml b/doc/bird.sgml
index e9c61526..3ea90920 100644
--- a/doc/bird.sgml
+++ b/doc/bird.sgml
@@ -157,6 +157,9 @@ options. The most important ones are:
 
 	<tag>-f</tag>
 	run bird in foreground.
+
+	<tag>-R</tag>
+	apply graceful restart recovery after start.
 </descrip>
 
 <p>BIRD writes messages about its work to log files or syslog (according to config).
@@ -187,6 +190,7 @@ configuration, but it is generally easy -- BIRD needs just the
 standard library, privileges to read the config file and create the
 control socket and the CAP_NET_* capabilities.
 
+
 <chapt>About routing tables
 
 <p>BIRD has one or more routing tables which may or may not be
@@ -242,6 +246,20 @@ using comparison and ordering). Minor advantage is that routes are
 shown sorted in <cf/show route/, minor disadvantage is that it is
 slightly more computationally expensive.
 
+<sect>Graceful restart
+
+<p>When BIRD is started after a restart or crash, it repopulates routing tables
+in an uncoordinated manner, like after a clean start. This may be impractical in some
+cases, because if the forwarding plane (i.e. kernel routing tables) remains
+intact, then its synchronization with BIRD would temporarily disrupt packet
+forwarding until protocols converge. Graceful restart is a mechanism that could
+help with this issue. Generally, it works by starting protocols and letting them
+repopulate routing tables while deferring route propagation until protocols
+acknowledge their convergence. Note that graceful restart behavior has to be
+configured for all relevant protocols and requires protocol-specific support
+(currently implemented for Kernel and BGP protocols). It is activated for a
+particular boot by the option <cf/-R/.
+
 
 <chapt>Configuration
 
@@ -371,6 +389,12 @@ protocol rip {
 	would accept IPv6 routes only). Such behavior was default in
 	older versions of BIRD.
 
+	<tag>graceful restart wait <m/number/</tag>
+	During graceful restart recovery, BIRD waits for convergence of routing
+	protocols. This option allows to specify a timeout for the recovery to
+	prevent waiting indefinitely if some protocols cannot converge. Default:
+	240 seconds.
+
 	<tag>timeformat route|protocol|base|log "<m/format1/" [<m/limit/ "<m/format2/"]</tag>
 	This option allows to specify a format of date/time used by
 	BIRD.  The first argument specifies for which purpose such
@@ -1493,6 +1517,8 @@ extended communities
 (RFC 4360<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4360.txt">),
 route reflectors 
 (RFC 4456<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4456.txt">),
+graceful restart
+(RFC 4724<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4724.txt">),
 multiprotocol extensions
 (RFC 4760<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4760.txt">),
 4B AS numbers 
@@ -1502,9 +1528,7 @@ and 4B AS numbers in extended communities
 
 
 For IPv6, it uses the standard multiprotocol extensions defined in
-RFC 2283<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc2283.txt">
-including changes described in the
-latest draft<htmlurl url="ftp://ftp.rfc-editor.org/internet-drafts/draft-ietf-idr-bgp4-multiprotocol-v2-05.txt">
+RFC 4760<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc4760.txt">
 and applied to IPv6 according to
 RFC 2545<htmlurl url="ftp://ftp.rfc-editor.org/in-notes/rfc2545.txt">.
 
@@ -1716,6 +1740,26 @@ for each neighbor using the following configuration parameters:
 	capability and accepts such requests. Even when disabled, BIRD
 	can send route refresh requests. Default: on.
 
+	<tag>graceful restart <m/switch/|aware</tag>
+	When a BGP speaker restarts or crashes, neighbors will discard all
+	received paths from the speaker, which disrupts packet forwarding even
+	when the forwarding plane of the speaker remains intact. RFC 4724
+	specifies an optional graceful restart mechanism to alleviate this
+	issue. This option controls the mechanism. It has three states:
+	Disabled, when no support is provided. Aware, when the graceful restart
+	support is announced and the support for restarting neighbors is
+	provided, but no local graceful restart is allowed (i.e. receiving-only
+	role). Enabled, when the full graceful restart support is provided
+	(i.e. both restarting and receiving role). Note that proper support for
+	local graceful restart requires also configuration of other protocols.
+	Default: aware.
+
+	<tag>graceful restart time <m/number/</tag>
+	The restart time is announced in the BGP graceful restart capability
+	and specifies how long the neighbor would wait for the BGP session to
+	re-establish after a restart before deleting stale routes. Default:
+	120 seconds.
+
 	<tag>interpret communities <m/switch/</tag> RFC 1997 demands
 	that BGP speaker should process well-known communities like
 	no-export (65535, 65281) or no-advertise (65535, 65282). For
@@ -2063,25 +2107,36 @@ overcome using another routing table and the pipe protocol.
 <sect1>Configuration
 
 <p><descrip>
-	<tag>persist <m/switch/</tag> Tell BIRD to leave all its routes in the
-	routing tables when it exits (instead of cleaning them up).
-	<tag>scan time <m/number/</tag> Time in seconds between two consecutive scans of the
-	kernel routing table.
-	<tag>learn <m/switch/</tag> Enable learning of routes added to the kernel
-	routing tables by other routing daemons or by the system administrator.
-	This is possible only on systems which support identification of route
-	authorship.
-
-	<tag>device routes <m/switch/</tag> Enable export of device
-	routes to the kernel routing table. By default, such routes
-	are rejected (with the exception of explicitly configured
-	device routes from the static protocol) regardless of the
-	export filter to protect device routes in kernel routing table
-	(managed by OS itself) from accidental overwriting or erasing.
-
-	<tag>kernel table <m/number/</tag> Select which kernel table should
-	this particular instance of the Kernel protocol work with. Available
-	only on systems supporting multiple routing tables.
+	<tag>persist <m/switch/</tag>
+	Tell BIRD to leave all its routes in the routing tables when it exits
+	(instead of cleaning them up).
+
+	<tag>scan time <m/number/</tag>
+	Time in seconds between two consecutive scans of the kernel routing
+	table.
+
+	<tag>learn <m/switch/</tag>
+	Enable learning of routes added to the kernel routing tables by other
+	routing daemons or by the system administrator. This is possible only on
+	systems which support identification of route authorship.
+
+	<tag>device routes <m/switch/</tag>
+	Enable export of device routes to the kernel routing table. By default,
+	such routes are rejected (with the exception of explicitly configured
+	device routes from the static protocol) regardless of the export filter
+	to protect device routes in kernel routing table (managed by OS itself)
+	from accidental overwriting or erasing.
+
+	<tag>kernel table <m/number/</tag>
+	Select which kernel table should this particular instance of the Kernel
+	protocol work with. Available only on systems supporting multiple
+	routing tables.
+
+	<tag>graceful restart <m/switch/</tag>
+	Participate in graceful restart recovery. If this option is enabled and
+	a graceful restart recovery is active, the Kernel protocol will defer
+	synchronization of routing tables until the end of the recovery. Note
+	that import of kernel routes to BIRD is not affected.
 </descrip>
 
 <sect1>Attributes
diff --git a/nest/proto.c b/nest/proto.c
index 2bc3e319..e990b48f 100644
--- a/nest/proto.c
+++ b/nest/proto.c
@@ -51,6 +51,8 @@ static char *c_states[] = { "HUNGRY", "???", "HAPPY", "FLUSHING" };
 static void proto_flush_loop(void *);
 static void proto_shutdown_loop(struct timer *);
 static void proto_rethink_goal(struct proto *p);
+static void proto_want_export_up(struct proto *p);
+static void proto_fell_down(struct proto *p);
 static char *proto_state_name(struct proto *p);
 
 static void
@@ -151,21 +153,20 @@ extern pool *rt_table_pool;
  * @t: routing table to connect to
  * @stats: per-table protocol statistics
  *
- * This function creates a connection between the protocol instance @p
- * and the routing table @t, making the protocol hear all changes in
- * the table.
+ * This function creates a connection between the protocol instance @p and the
+ * routing table @t, making the protocol hear all changes in the table.
  *
- * The announce hook is linked in the protocol ahook list and, if the
- * protocol accepts routes, also in the table ahook list. Announce
- * hooks are allocated from the routing table resource pool, they are
- * unlinked from the table ahook list after the protocol went down,
- * (in proto_schedule_flush()) and they are automatically freed after the
- * protocol is flushed (in proto_fell_down()).
+ * The announce hook is linked in the protocol ahook list and, when the protocol
+ * accepts routes, also in the table ahook list. Announce hooks are allocated
+ * from the routing table resource pool. They are linked to the table ahook list
+ * and unlinked from it depending on export_state (in proto_want_export_up() and
+ * proto_want_export_down()) and they are automatically freed after the protocol
+ * is flushed (in proto_fell_down()).
  *
- * Unless you want to listen to multiple routing tables (as the Pipe
- * protocol does), you needn't to worry about this function since the
- * connection to the protocol's primary routing table is initialized
- * automatically by the core code.
+ * Unless you want to listen to multiple routing tables (as the Pipe protocol
+ * does), you needn't to worry about this function since the connection to the
+ * protocol's primary routing table is initialized automatically by the core
+ * code.
  */
 struct announce_hook *
 proto_add_announce_hook(struct proto *p, struct rtable *t, struct proto_stats *stats)
@@ -183,7 +184,7 @@ proto_add_announce_hook(struct proto *p, struct rtable *t, struct proto_stats *s
   h->next = p->ahooks;
   p->ahooks = h;
 
-  if (p->rt_notify && (p->export_state == ES_READY))
+  if (p->rt_notify && (p->export_state != ES_DOWN))
     add_tail(&t->hooks, &h->n);
   return h;
 }
@@ -659,16 +660,59 @@ proto_rethink_goal(struct proto *p)
 }
 
 
+/**
+ * DOC: Graceful restart recovery
+ *
+ * Graceful restart of a router is a process when the routing plane (e.g. BIRD)
+ * restarts but both the forwarding plane (e.g. kernel routing table) and routing
+ * neighbors keep proper routes, and therefore uninterrupted packet forwarding
+ * is maintained.
+ *
+ * BIRD implements graceful restart recovery by deferring export of routes to
+ * protocols until routing tables are refilled with the expected content. After
+ * start, protocols generate routes as usual, but routes are not propagated to
+ * them, until protocols report that they generated all routes. After that,
+ * graceful restart recovery is finished and the export (and the initial feed)
+ * to protocols is enabled.
+ *
+ * When graceful restart recovery need is detected during initialization, then
+ * enabled protocols are marked with @gr_recovery flag before start. Such
+ * protocols then decide how to proceed with graceful restart, participation is
+ * voluntary. Protocols could lock the recovery by proto_graceful_restart_lock()
+ * (stored in @gr_lock flag), which means that they want to postpone the end of
+ * the recovery until they converge and then unlock it. They also could set
+ * @gr_wait before advancing to %PS_UP, which means that the core should defer
+ * route export to that protocol until the end of the recovery. This should be
+ * done by protocols that expect their neighbors to keep the proper routes
+ * (kernel table, BGP sessions with BGP graceful restart capability).
+ *
+ * The graceful restart recovery is finished when either all graceful restart
+ * locks are unlocked or when graceful restart wait timer fires.
+ *
+ */
 
-static void graceful_restart_done(struct timer *t UNUSED);
-static void proto_want_export_up(struct proto *p);
+static void graceful_restart_done(struct timer *t);
 
+/**
+ * graceful_restart_recovery - request initial graceful restart recovery
+ *
+ * Called by the platform initialization code if the need for recovery
+ * after graceful restart is detected during boot. Has to be called
+ * before protos_commit().
+ */
 void
 graceful_restart_recovery(void)
 {
   graceful_restart_state = GRS_INIT;
 }
 
+/**
+ * graceful_restart_init - initialize graceful restart
+ *
+ * When graceful restart recovery was requested, the function starts an active
+ * phase of the recovery and initializes graceful restart wait timer. The
+ * function has to be called after protos_commit().
+ */
 void
 graceful_restart_init(void)
 {
@@ -689,6 +733,15 @@ graceful_restart_init(void)
   tm_start(gr_wait_timer, config->gr_wait);
 }
 
+/**
+ * graceful_restart_done - finalize graceful restart
+ *
+ * When there are no locks on graceful restart, the function finalizes the
+ * graceful restart recovery. Protocols postponing route export until the end of
+ * the recovery are awakened and the export to them is enabled. All other
+ * related state is cleared. The function is also called when the graceful
+ * restart wait timer fires (but there are still some locks).
+ */
 static void
 graceful_restart_done(struct timer *t UNUSED)
 {
@@ -727,7 +780,19 @@ graceful_restart_show_status(void)
   cli_msg(-24, "  Wait timer is %d/%d", tm_remains(gr_wait_timer), config->gr_wait);
 }
 
-/* Just from start hook */
+/**
+ * proto_graceful_restart_lock - lock graceful restart by protocol
+ * @p: protocol instance
+ *
+ * This function allows a protocol to postpone the end of graceful restart
+ * recovery until it converges. The lock is removed when the protocol calls
+ * proto_graceful_restart_unlock() or when the protocol is stopped.
+ *
+ * The function has to be called during the initial phase of graceful restart
+ * recovery and only for protocols that are part of graceful restart (i.e. their
+ * @gr_recovery is set), which means it should be called from protocol start
+ * hooks.
+ */
 void
 proto_graceful_restart_lock(struct proto *p)
 {
@@ -741,6 +806,13 @@ proto_graceful_restart_lock(struct proto *p)
   graceful_restart_locks++;
 }
 
+/**
+ * proto_graceful_restart_unlock - unlock graceful restart by protocol
+ * @p: protocol instance
+ *
+ * This function unlocks a lock from proto_graceful_restart_lock(). It is also
+ * automatically called when the lock holding protocol went down.
+ */
 void
 proto_graceful_restart_unlock(struct proto *p)
 {
@@ -867,29 +939,6 @@ protos_build(void)
   proto_flush_event->hook = proto_flush_loop;
   proto_shutdown_timer = tm_new(proto_pool);
   proto_shutdown_timer->hook = proto_shutdown_loop;
-  proto_shutdown_timer = tm_new(proto_pool);
-  proto_shutdown_timer->hook = proto_shutdown_loop;
-}
-
-static void
-proto_fell_down(struct proto *p)
-{
-  DBG("Protocol %s down\n", p->name);
-
-  u32 all_routes = p->stats.imp_routes + p->stats.filt_routes;
-  if (all_routes != 0)
-    log(L_ERR "Protocol %s is down but still has %d routes", p->name, all_routes);
-
-  bzero(&p->stats, sizeof(struct proto_stats));
-  proto_free_ahooks(p);
-
-  if (! p->proto->multitable)
-    rt_unlock_table(p->table);
-
-  if (p->proto->cleanup)
-    p->proto->cleanup(p);
-
-  proto_rethink_goal(p);
 }
 
 static void
@@ -1066,6 +1115,10 @@ proto_request_feeding(struct proto *p)
 {
   ASSERT(p->proto_state == PS_UP);
 
+  /* Do nothing if we are still waiting for feeding */
+  if (p->export_state == ES_DOWN)
+    return;
+
   /* If we are already feeding, we want to restart it */
   if (p->export_state == ES_FEEDING)
     {
@@ -1220,6 +1273,27 @@ proto_falling_down(struct proto *p)
     proto_graceful_restart_unlock(p);
 }
 
+static void
+proto_fell_down(struct proto *p)
+{
+  DBG("Protocol %s down\n", p->name);
+
+  u32 all_routes = p->stats.imp_routes + p->stats.filt_routes;
+  if (all_routes != 0)
+    log(L_ERR "Protocol %s is down but still has %d routes", p->name, all_routes);
+
+  bzero(&p->stats, sizeof(struct proto_stats));
+  proto_free_ahooks(p);
+
+  if (! p->proto->multitable)
+    rt_unlock_table(p->table);
+
+  if (p->proto->cleanup)
+    p->proto->cleanup(p);
+
+  proto_rethink_goal(p);
+}
+
 
 /**
  * proto_notify_state - notify core about protocol state change
diff --git a/nest/rt-table.c b/nest/rt-table.c
index bc911729..4295f836 100644
--- a/nest/rt-table.c
+++ b/nest/rt-table.c
@@ -1110,6 +1110,21 @@ rt_examine(rtable *t, ip_addr prefix, int pxlen, struct proto *p, struct filter
   return v > 0;
 }
 
+
+/**
+ * rt_refresh_begin - start a refresh cycle
+ * @t: related routing table
+ * @ah: related announce hook
+ *
+ * This function starts a refresh cycle for given routing table and announce
+ * hook. The refresh cycle is a sequence where the protocol sends all its valid
+ * routes to the routing table (by rte_update()). After that, all protocol
+ * routes (more precisely routes with @ah as @sender) not sent during the
+ * refresh cycle but still in the table from the past are pruned. This is
+ * implemented by marking all related routes as stale by REF_STALE flag in
+ * rt_refresh_begin(), then marking all related stale routes with REF_DISCARD
+ * flag in rt_refresh_end() and then removing such routes in the prune loop.
+ */
 void
 rt_refresh_begin(rtable *t, struct announce_hook *ah)
 {
@@ -1126,6 +1141,14 @@ rt_refresh_begin(rtable *t, struct announce_hook *ah)
   FIB_WALK_END;
 }
 
+/**
+ * rt_refresh_end - end a refresh cycle
+ * @t: related routing table
+ * @ah: related announce hook
+ *
+ * This function ends a refresh cycle for given routing table and announce
+ * hook. See rt_refresh_begin() for description of refresh cycles.
+ */
 void
 rt_refresh_end(rtable *t, struct announce_hook *ah)
 {
@@ -1405,6 +1428,19 @@ again:
   return 1;
 }
 
+/**
+ * rt_prune_table - prune a routing table
+ *
+ * This function scans the routing table @tab and removes routes belonging to
+ * flushing protocols, discarded routes and also stale network entries, in a
+ * similar fashion to rt_prune_loop(). Returns 1 when all such routes are
+ * pruned. Contrary to rt_prune_loop(), this function is not a part of the
+ * protocol flushing loop, but it is called from rt_event() for just one routing
+ * table.
+ *
+ * Note that rt_prune_table() and rt_prune_loop() share (for each table) the
+ * prune state (@prune_state) and also the pruning iterator (@prune_fit).
+ */
 static inline int
 rt_prune_table(rtable *tab)
 {
@@ -1415,16 +1451,15 @@ rt_prune_table(rtable *tab)
 /**
  * rt_prune_loop - prune routing tables
  *
- * The prune loop scans routing tables and removes routes belonging to
- * flushing protocols and also stale network entries. Returns 1 when
- * all such routes are pruned. It is a part of the protocol flushing
- * loop.
+ * The prune loop scans routing tables and removes routes belonging to flushing
+ * protocols, discarded routes and also stale network entries. Returns 1 when
+ * all such routes are pruned. It is a part of the protocol flushing loop.
  *
- * The prune loop runs in two steps. In the first step it prunes just
- * the routes with flushing senders (in explicitly marked tables) so
- * the route removal is propagated as usual. In the second step, all
- * remaining relevant routes are removed. Ideally, there shouldn't be
- * any, but it happens when pipe filters are changed.
+ * The prune loop runs in two steps. In the first step it prunes just the routes
+ * with flushing senders (in explicitly marked tables) so the route removal is
+ * propagated as usual. In the second step, all remaining relevant routes are
+ * removed. Ideally, there shouldn't be any, but it happens when pipe filters
+ * are changed.
  */
 int
 rt_prune_loop(void)
diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c
index ae9f6877..326883dd 100644
--- a/proto/bgp/bgp.c
+++ b/proto/bgp/bgp.c
@@ -51,6 +51,16 @@
  * and bgp_encode_attrs() which does the converse. Both functions are built around a
  * @bgp_attr_table array describing all important characteristics of all known attributes.
  * Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
+ *
+ * BGP protocol implements graceful restart in both restarting (local restart)
+ * and receiving (neighbor restart) roles. The first is handled mostly by the
+ * graceful restart code in the nest, BGP protocol just handles capabilities,
+ * sets @gr_wait and locks graceful restart until end-of-RIB mark is received.
+ * The second is implemented by internal restart of the BGP state to %BS_IDLE
+ * and protocol state to %PS_START, but keeping the protocol up from the core
+ * point of view and therefore maintaining received routes. Routing table
+ * refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing
+ * stale routes after reestablishment of BGP session during graceful restart.
  */
 
 #undef LOCAL_DEBUG
@@ -431,6 +441,17 @@ bgp_conn_enter_idle_state(struct bgp_conn *conn)
     bgp_conn_leave_established_state(p);
 }
 
+/**
+ * bgp_handle_graceful_restart - handle detected BGP graceful restart
+ * @p: BGP instance
+ *
+ * This function is called when a BGP graceful restart of the neighbor is
+ * detected (when the TCP connection fails or when a new TCP connection
+ * appears). The function activates processing of the restart - starts routing
+ * table refresh cycle and activates BGP restart timer. The protocol state goes
+ * back to %PS_START, but changing BGP state back to %BS_IDLE is left for the
+ * caller.
+ */
 void
 bgp_handle_graceful_restart(struct bgp_proto *p)
 {
@@ -448,6 +469,16 @@ bgp_handle_graceful_restart(struct bgp_proto *p)
   rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook);
 }
 
+/**
+ * bgp_graceful_restart_done - finish active BGP graceful restart
+ * @p: BGP instance
+ *
+ * This function is called when the active BGP graceful restart of the neighbor
+ * should be finished - either successfully (the neighbor sends all paths and
+ * reports end-of-RIB on the new session) or unsuccessfully (the neighbor does
+ * not support BGP graceful restart on the new session). The function ends
+ * routing table refresh cycle and stops BGP restart timer.
+ */
 void
 bgp_graceful_restart_done(struct bgp_proto *p)
 {
@@ -457,6 +488,15 @@ bgp_graceful_restart_done(struct bgp_proto *p)
   rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
 }
 
+/**
+ * bgp_graceful_restart_timeout - timeout of graceful restart 'restart timer'
+ * @t: timer
+ *
+ * This function is a timeout hook for @gr_timer, implementing BGP restart time
+ * limit for reestablishment of the BGP session after the graceful restart. When
+ * fired, we just proceed with the usual protocol restart.
+ */
+
 static void
 bgp_graceful_restart_timeout(timer *t)
 {
@@ -968,7 +1008,7 @@ bgp_start(struct proto *P)
   p->remote_id = 0;
   p->source_addr = p->cf->source_addr;
 
-  if (P->gr_recovery)
+  if (p->p.gr_recovery && p->cf->gr_mode)
     proto_graceful_restart_lock(P);
 
   /*
-- 
cgit v1.2.3


From 227af309e55a59f14d1a5a757f17900164bffc97 Mon Sep 17 00:00:00 2001
From: Ondrej Zajicek <santiago@crfreenet.org>
Date: Mon, 24 Mar 2014 12:32:12 +0100
Subject: Fixes some minor issues in graceful restart.

---
 nest/proto.c    | 69 +++++++++++++++++++++++++++++++++------------------------
 proto/bgp/bgp.c |  2 +-
 2 files changed, 41 insertions(+), 30 deletions(-)

(limited to 'proto/bgp')

diff --git a/nest/proto.c b/nest/proto.c
index e990b48f..13a0833a 100644
--- a/nest/proto.c
+++ b/nest/proto.c
@@ -56,31 +56,10 @@ static void proto_fell_down(struct proto *p);
 static char *proto_state_name(struct proto *p);
 
 static void
-proto_enqueue(list *l, struct proto *p)
-{
-  add_tail(l, &p->n);
-}
-
-static void
-proto_set_core_state(struct proto *p, uint state)
+proto_relink(struct proto *p)
 {
   list *l = NULL;
 
-  p->core_state = state;
-
-  if (p->debug & D_STATES)
-    {
-      char *name = proto_state_name(p);
-      if (name != p->last_state_name_announced)
-	{
-	  p->last_state_name_announced = name;
-	  PD(p, "State changed to %s", proto_state_name(p));
-	}
-    }
-  else
-    p->last_state_name_announced = NULL;
-
-  rem_node(&p->n);
   switch (p->core_state)
     {
     case FS_HUNGRY:
@@ -95,9 +74,28 @@ proto_set_core_state(struct proto *p, uint state)
     default:
       ASSERT(0);
     }
-  proto_enqueue(l, p);
+
+  rem_node(&p->n);
+  add_tail(l, &p->n);
+}
+
+static void
+proto_log_state_change(struct proto *p)
+{
+  if (p->debug & D_STATES)
+    {
+      char *name = proto_state_name(p);
+      if (name != p->last_state_name_announced)
+	{
+	  p->last_state_name_announced = name;
+	  PD(p, "State changed to %s", proto_state_name(p));
+	}
+    }
+  else
+    p->last_state_name_announced = NULL;
 }
 
+
 /**
  * proto_new - create a new protocol instance
  * @c: protocol configuration
@@ -390,7 +388,8 @@ proto_init(struct proto_config *c)
   q->export_state = ES_DOWN;
   q->last_state_change = now;
 
-  proto_enqueue(&initial_proto_list, q);
+  add_tail(&initial_proto_list, &q->n);
+
   if (p == &proto_unix_iface)
     initial_device_proto = q;
 
@@ -758,7 +757,10 @@ graceful_restart_done(struct timer *t UNUSED)
 
       /* Resume postponed export of routes */
       if ((p->proto_state == PS_UP) && p->gr_wait)
+      {
 	proto_want_export_up(p);
+	proto_log_state_change(p);
+      }
 
       /* Cleanup */
       p->gr_recovery = 0;
@@ -954,6 +956,7 @@ proto_feed_more(void *P)
     {
       DBG("Feeding protocol %s finished\n", p->name);
       p->export_state = ES_READY;
+      proto_log_state_change(p);
 
       if (p->feed_done)
 	p->feed_done(p);
@@ -1047,7 +1050,9 @@ proto_flush_loop(void *unused UNUSED)
 
 	DBG("Flushing protocol %s\n", p->name);
 	p->flushing = 0;
-	proto_set_core_state(p, FS_HUNGRY);
+	p->core_state = FS_HUNGRY;
+	proto_relink(p);
+	proto_log_state_change(p);
 	if (p->proto_state == PS_DOWN)
 	  proto_fell_down(p);
 	goto again;
@@ -1138,6 +1143,7 @@ proto_request_feeding(struct proto *p)
   p->stats.exp_routes = 0;
 
   proto_schedule_feed(p, 0);
+  proto_log_state_change(p);
 }
 
 static const char *
@@ -1222,7 +1228,8 @@ proto_want_core_up(struct proto *p)
       proto_reset_limit(p->main_ahook->out_limit);
     }
 
-  proto_set_core_state(p, FS_HAPPY);
+  p->core_state = FS_HAPPY;
+  proto_relink(p);
 }
 
 static void
@@ -1254,7 +1261,8 @@ proto_want_core_down(struct proto *p)
   ASSERT(p->core_state == CS_HAPPY);
   ASSERT(p->export_state == ES_DOWN);
 
-  proto_set_core_state(p, FS_FLUSHING);
+  p->core_state = FS_FLUSHING;
+  proto_relink(p);
   proto_schedule_flush_loop();
 
   if (!p->proto->multitable)
@@ -1373,6 +1381,7 @@ proto_notify_state(struct proto *p, unsigned ps)
 
       if (cs == FS_HUNGRY)		/* Shutdown finished */
 	{
+	  proto_log_state_change(p);
 	  proto_fell_down(p);
 	  return;			/* The protocol might have ceased to exist */
 	}
@@ -1381,6 +1390,8 @@ proto_notify_state(struct proto *p, unsigned ps)
     default:
       bug("%s: Invalid state %d", p->name, ps);
     }
+
+  proto_log_state_change(p);
 }
 
 /*
@@ -1404,8 +1415,8 @@ proto_state_name(struct proto *p)
 	case ES_READY:			return "up";
 	default:      			return "???";
 	}
-    case P(PS_STOP, FS_HUNGRY):		return "stop";
-    case P(PS_STOP, FS_FLUSHING):
+    case P(PS_STOP, FS_HUNGRY):
+    case P(PS_STOP, FS_FLUSHING):	return "stop";
     case P(PS_DOWN, FS_FLUSHING):	return "flush";
     default:      			return "???";
     }
diff --git a/proto/bgp/bgp.c b/proto/bgp/bgp.c
index 326883dd..ca619f31 100644
--- a/proto/bgp/bgp.c
+++ b/proto/bgp/bgp.c
@@ -927,7 +927,7 @@ static void
 bgp_feed_done(struct proto *P)
 {
   struct bgp_proto *p = (struct bgp_proto *) P;
-  if (!p->conn || !p->cf->gr_mode)
+  if (!p->conn || !p->cf->gr_mode || p->p.refeeding)
     return;
 
   p->send_end_mark = 1;
-- 
cgit v1.2.3