diff options
Diffstat (limited to 'sysdep/unix')
-rw-r--r-- | sysdep/unix/Makefile | 2 | ||||
-rw-r--r-- | sysdep/unix/alloc.c | 239 | ||||
-rw-r--r-- | sysdep/unix/coroutine.c | 206 | ||||
-rw-r--r-- | sysdep/unix/io-loop.c | 620 | ||||
-rw-r--r-- | sysdep/unix/io-loop.h | 58 | ||||
-rw-r--r-- | sysdep/unix/io.c | 235 | ||||
-rw-r--r-- | sysdep/unix/krt.c | 447 | ||||
-rw-r--r-- | sysdep/unix/krt.h | 14 | ||||
-rw-r--r-- | sysdep/unix/log.c | 76 | ||||
-rw-r--r-- | sysdep/unix/main.c | 28 | ||||
-rw-r--r-- | sysdep/unix/unix.h | 1 |
11 files changed, 1416 insertions, 510 deletions
diff --git a/sysdep/unix/Makefile b/sysdep/unix/Makefile index d0d36b5f..07f454ab 100644 --- a/sysdep/unix/Makefile +++ b/sysdep/unix/Makefile @@ -1,4 +1,4 @@ -src := alloc.c io.c krt.c log.c main.c random.c +src := alloc.c io.c io-loop.c krt.c log.c main.c random.c coroutine.c obj := $(src-o-files) $(all-daemon) $(cf-local) diff --git a/sysdep/unix/alloc.c b/sysdep/unix/alloc.c index 5dd70c99..c09a8356 100644 --- a/sysdep/unix/alloc.c +++ b/sysdep/unix/alloc.c @@ -11,77 +11,73 @@ #include "lib/lists.h" #include "lib/event.h" +#include "sysdep/unix/io-loop.h" + #include <stdlib.h> #include <unistd.h> +#include <stdatomic.h> +#include <errno.h> #ifdef HAVE_MMAP #include <sys/mman.h> #endif -#ifdef HAVE_MMAP -#define KEEP_PAGES 512 +long page_size = 0; -static u64 page_size = 0; +#ifdef HAVE_MMAP +#if DEBUGGING +#define FP_NODE_OFFSET 42 +#else +#define FP_NODE_OFFSET 1 +#endif static _Bool use_fake = 0; - -uint pages_kept = 0; -static list pages_list; - -static void cleanup_pages(void *data); -static event page_cleanup_event = { .hook = cleanup_pages }; - #else -static const u64 page_size = 4096; /* Fake page size */ +static _Bool use_fake = 1; #endif -u64 get_page_size(void) +static void * +alloc_sys_page(void) { - if (page_size) - return page_size; + void *ptr = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); -#ifdef HAVE_MMAP - if (page_size = sysconf(_SC_PAGESIZE)) - { - if ((u64_popcount(page_size) > 1) || (page_size > 16384)) - { - /* Too big or strange page, use the aligned allocator instead */ - page_size = 4096; - use_fake = 1; - } - return page_size; - } + if (ptr == MAP_FAILED) + bug("mmap(%lu) failed: %m", page_size); - bug("Page size must be non-zero"); -#endif + return ptr; } +extern int shutting_down; /* Shutdown requested. */ + void * alloc_page(void) { #ifdef HAVE_MMAP - if (pages_kept) - { - node *page = TAIL(pages_list); - rem_node(page); - pages_kept--; - memset(page, 0, get_page_size()); - return page; - } - if (!use_fake) { - void *ret = mmap(NULL, get_page_size(), PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (ret == MAP_FAILED) - bug("mmap(%lu) failed: %m", page_size); - return ret; + struct free_pages *fp = &birdloop_current->pages; + if (!fp->cnt) + return alloc_sys_page(); + + node *n = HEAD(fp->list); + rem_node(n); + if ((--fp->cnt < fp->min) && !shutting_down) + ev_send(fp->cleanup->list, fp->cleanup); + + void *ptr = n - FP_NODE_OFFSET; + memset(ptr, 0, page_size); + return ptr; } else #endif { +#ifdef HAVE_ALIGNED_ALLOC void *ret = aligned_alloc(page_size, page_size); if (!ret) bug("aligned_alloc(%lu) failed", page_size); return ret; +#else + bug("BIRD should have already died on fatal error."); +#endif } } @@ -91,14 +87,14 @@ free_page(void *ptr) #ifdef HAVE_MMAP if (!use_fake) { - if (!pages_kept) - init_list(&pages_list); - - memset(ptr, 0, sizeof(node)); - add_tail(&pages_list, ptr); - - if (++pages_kept > KEEP_PAGES) - ev_schedule(&page_cleanup_event); + struct free_pages *fp = &birdloop_current->pages; + struct node *n = ptr; + n += FP_NODE_OFFSET; + + memset(n, 0, sizeof(node)); + add_tail(&fp->list, n); + if ((++fp->cnt > fp->max) && !shutting_down) + ev_send(fp->cleanup->list, fp->cleanup); } else #endif @@ -106,24 +102,147 @@ free_page(void *ptr) } #ifdef HAVE_MMAP + +#define GFP (&main_birdloop.pages) + +void +flush_pages(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_inside(loop->parent->loop)); + + struct free_pages *fp = &loop->pages; + struct free_pages *pfp = &loop->parent->loop->pages; + + add_tail_list(&pfp->list, &fp->list); + pfp->cnt += fp->cnt; + + fp->cnt = 0; + fp->list = (list) {}; + fp->min = 0; + fp->max = 0; + + rfree(fp->cleanup); + fp->cleanup = NULL; +} + +static void +cleanup_pages(void *data) +{ + struct birdloop *loop = data; + birdloop_enter(loop); + + ASSERT_DIE(birdloop_inside(loop->parent->loop)); + + struct free_pages *fp = &loop->pages; + struct free_pages *pfp = &loop->parent->loop->pages; + + while ((fp->cnt < fp->min) && (pfp->cnt > pfp->min)) + { + node *n = HEAD(pfp->list); + rem_node(n); + add_tail(&fp->list, n); + fp->cnt++; + pfp->cnt--; + } + + while (fp->cnt < fp->min) + { + node *n = alloc_sys_page(); + add_tail(&fp->list, n + FP_NODE_OFFSET); + fp->cnt++; + } + + while (fp->cnt > fp->max) + { + node *n = HEAD(fp->list); + rem_node(n); + add_tail(&pfp->list, n); + fp->cnt--; + pfp->cnt++; + } + + birdloop_leave(loop); + + if (!shutting_down && (pfp->cnt > pfp->max)) + ev_send(pfp->cleanup->list, pfp->cleanup); +} + static void -cleanup_pages(void *data UNUSED) +cleanup_global_pages(void *data UNUSED) { - for (uint seen = 0; (pages_kept > KEEP_PAGES) && (seen < KEEP_PAGES); seen++) + while (GFP->cnt < GFP->max) { - void *ptr = HEAD(pages_list); - rem_node(ptr); - if (munmap(ptr, get_page_size()) == 0) - pages_kept--; -#ifdef ENOMEM + node *n = alloc_sys_page(); + add_tail(&GFP->list, n + FP_NODE_OFFSET); + GFP->cnt++; + } + + for (uint limit = GFP->cnt; (limit > 0) && (GFP->cnt > GFP->max); limit--) + { + node *n = TAIL(GFP->list); + rem_node(n); + + if (munmap(n - FP_NODE_OFFSET, page_size) == 0) + GFP->cnt--; else if (errno == ENOMEM) - add_tail(&pages_list, ptr); -#endif + add_head(&GFP->list, n); else - bug("munmap(%p) failed: %m", ptr); + bug("munmap(%p) failed: %m", n - FP_NODE_OFFSET); + } +} + +void +init_pages(struct birdloop *loop) +{ + struct free_pages *fp = &loop->pages; + + init_list(&fp->list); + fp->cleanup = ev_new_init(loop->parent->loop->pool, cleanup_pages, loop); + fp->cleanup->list = (loop->parent->loop == &main_birdloop) ? &global_work_list : birdloop_event_list(loop->parent->loop); + fp->min = 4; + fp->max = 16; + + for (fp->cnt = 0; fp->cnt < fp->min; fp->cnt++) + { + node *n = alloc_sys_page(); + add_tail(&fp->list, n + FP_NODE_OFFSET); + } +} + +static event global_free_pages_cleanup_event = { .hook = cleanup_global_pages, .list = &global_work_list }; + +void resource_sys_init(void) +{ + if (!(page_size = sysconf(_SC_PAGESIZE))) + die("System page size must be non-zero"); + + if (u64_popcount(page_size) == 1) + { + init_list(&GFP->list); + GFP->cleanup = &global_free_pages_cleanup_event; + GFP->min = 0; + GFP->max = 256; + return; } - if (pages_kept > KEEP_PAGES) - ev_schedule(&page_cleanup_event); +#ifdef HAVE_ALIGNED_ALLOC + log(L_WARN "Got strange memory page size (%lu), using the aligned allocator instead", page_size); +#else + die("Got strange memory page size (%lu) and aligned_alloc is not available", page_size); +#endif + + /* Too big or strange page, use the aligned allocator instead */ + page_size = 4096; + use_fake = 1; } + +#else + +void +resource_sys_init(void) +{ + page_size = 4096; + use_fake = 1; +} + #endif diff --git a/sysdep/unix/coroutine.c b/sysdep/unix/coroutine.c new file mode 100644 index 00000000..4747d01a --- /dev/null +++ b/sysdep/unix/coroutine.c @@ -0,0 +1,206 @@ +/* + * BIRD Coroutines + * + * (c) 2017 Martin Mares <mj@ucw.cz> + * (c) 2020 Maria Matejka <mq@jmq.cz> + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#undef LOCAL_DEBUG + +#undef DEBUG_LOCKING + +#include "lib/birdlib.h" +#include "lib/locking.h" +#include "lib/coro.h" +#include "lib/rcu.h" +#include "lib/resource.h" +#include "lib/timer.h" + +#include "conf/conf.h" + +#define CORO_STACK_SIZE 65536 + +/* + * Implementation of coroutines based on POSIX threads + */ + +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <pthread.h> +#include <semaphore.h> +#include <stdatomic.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +/* + * Locking subsystem + */ + +_Thread_local struct lock_order locking_stack = {}; +_Thread_local struct domain_generic **last_locked = NULL; + +#define ASSERT_NO_LOCK ASSERT_DIE(last_locked == NULL) + +struct domain_generic { + pthread_mutex_t mutex; + uint order; + struct domain_generic **prev; + struct lock_order *locked_by; + const char *name; +}; + +#define DOMAIN_INIT(_name, _order) { .mutex = PTHREAD_MUTEX_INITIALIZER, .name = _name, .order = _order } + +static struct domain_generic the_bird_domain_gen = DOMAIN_INIT("The BIRD", OFFSETOF(struct lock_order, the_bird)); + +DOMAIN(the_bird) the_bird_domain = { .the_bird = &the_bird_domain_gen }; + +struct domain_generic * +domain_new(const char *name, uint order) +{ + ASSERT_DIE(order < sizeof(struct lock_order)); + struct domain_generic *dg = xmalloc(sizeof(struct domain_generic)); + *dg = (struct domain_generic) DOMAIN_INIT(name, order); + return dg; +} + +void +domain_free(struct domain_generic *dg) +{ + pthread_mutex_destroy(&dg->mutex); + xfree(dg); +} + +uint dg_order(struct domain_generic *dg) +{ + return dg->order; +} + +void do_lock(struct domain_generic *dg, struct domain_generic **lsp) +{ + if ((char *) lsp - (char *) &locking_stack != dg->order) + bug("Trying to lock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack); + + if (lsp <= last_locked) + bug("Trying to lock in a bad order"); + if (*lsp) + bug("Inconsistent locking stack state on lock"); + + btime lock_begin = current_time(); + pthread_mutex_lock(&dg->mutex); + btime duration = current_time() - lock_begin; + if (config && (duration > config->watchdog_warning)) + log(L_WARN "Locking of %s took %d ms", dg->name, (int) (duration TO_MS)); + + if (dg->prev || dg->locked_by) + bug("Previous unlock not finished correctly"); + dg->prev = last_locked; + *lsp = dg; + last_locked = lsp; + dg->locked_by = &locking_stack; +} + +void do_unlock(struct domain_generic *dg, struct domain_generic **lsp) +{ + if ((char *) lsp - (char *) &locking_stack != dg->order) + bug("Trying to unlock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack); + + if (dg->locked_by != &locking_stack) + bug("Inconsistent domain state on unlock"); + if ((last_locked != lsp) || (*lsp != dg)) + bug("Inconsistent locking stack state on unlock"); + dg->locked_by = NULL; + last_locked = dg->prev; + *lsp = NULL; + dg->prev = NULL; + pthread_mutex_unlock(&dg->mutex); +} + +/* Coroutines */ +struct coroutine { + resource r; + pthread_t id; + pthread_attr_t attr; + struct rcu_coro rcu; + void (*entry)(void *); + void *data; +}; + +static _Thread_local _Bool coro_cleaned_up = 0; + +static void coro_free(resource *r) +{ + struct coroutine *c = (void *) r; + rcu_coro_stop(&c->rcu); + ASSERT_DIE(pthread_equal(pthread_self(), c->id)); + pthread_attr_destroy(&c->attr); + coro_cleaned_up = 1; +} + +static void coro_dump(resource *r UNUSED) { } + +static struct resclass coro_class = { + .name = "Coroutine", + .size = sizeof(struct coroutine), + .free = coro_free, + .dump = coro_dump, +}; + +_Thread_local struct coroutine *this_coro = NULL; + +static void *coro_entry(void *p) +{ + struct coroutine *c = p; + + ASSERT_DIE(c->entry); + + this_coro = c; + rcu_coro_start(&c->rcu); + + c->entry(c->data); + ASSERT_DIE(coro_cleaned_up); + + return NULL; +} + +struct coroutine *coro_run(pool *p, void (*entry)(void *), void *data) +{ + ASSERT_DIE(entry); + ASSERT_DIE(p); + + struct coroutine *c = ralloc(p, &coro_class); + + c->entry = entry; + c->data = data; + + int e = 0; + + if (e = pthread_attr_init(&c->attr)) + die("pthread_attr_init() failed: %M", e); + + if (e = pthread_attr_setstacksize(&c->attr, CORO_STACK_SIZE)) + die("pthread_attr_setstacksize(%u) failed: %M", CORO_STACK_SIZE, e); + + if (e = pthread_attr_setdetachstate(&c->attr, PTHREAD_CREATE_DETACHED)) + die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e); + + if (e = pthread_create(&c->id, &c->attr, coro_entry, c)) + die("pthread_create() failed: %M", e); + + return c; +} + +void +coro_yield(void) +{ + const struct timespec req = { .tv_nsec = 100 }; + nanosleep(&req, NULL); +} diff --git a/sysdep/unix/io-loop.c b/sysdep/unix/io-loop.c new file mode 100644 index 00000000..9b107a1a --- /dev/null +++ b/sysdep/unix/io-loop.c @@ -0,0 +1,620 @@ +/* + * BIRD -- I/O and event loop + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <poll.h> +#include <pthread.h> +#include <time.h> +#include <sys/time.h> + +#include "nest/bird.h" + +#include "lib/buffer.h" +#include "lib/coro.h" +#include "lib/lists.h" +#include "lib/resource.h" +#include "lib/event.h" +#include "lib/timer.h" +#include "lib/socket.h" + +#include "lib/io-loop.h" +#include "sysdep/unix/io-loop.h" +#include "conf/conf.h" + +/* + * Current thread context + */ + +_Thread_local struct birdloop *birdloop_current = NULL; +static _Thread_local struct birdloop *birdloop_wakeup_masked; +static _Thread_local uint birdloop_wakeup_masked_count; + +event_list * +birdloop_event_list(struct birdloop *loop) +{ + return &loop->event_list; +} + +struct timeloop * +birdloop_time_loop(struct birdloop *loop) +{ + return &loop->time; +} + +pool * +birdloop_pool(struct birdloop *loop) +{ + return loop->pool; +} + +_Bool +birdloop_inside(struct birdloop *loop) +{ + for (struct birdloop *c = birdloop_current; c; c = c->prev_loop) + if (loop == c) + return 1; + + return 0; +} + +/* + * Wakeup code for birdloop + */ + +static void +pipe_new(int *pfds) +{ + int rv = pipe(pfds); + if (rv < 0) + die("pipe: %m"); + + if (fcntl(pfds[0], F_SETFL, O_NONBLOCK) < 0) + die("fcntl(O_NONBLOCK): %m"); + + if (fcntl(pfds[1], F_SETFL, O_NONBLOCK) < 0) + die("fcntl(O_NONBLOCK): %m"); +} + +void +pipe_drain(int fd) +{ + char buf[64]; + int rv; + + try: + rv = read(fd, buf, 64); + if (rv < 0) + { + if (errno == EINTR) + goto try; + if (errno == EAGAIN) + return; + die("wakeup read: %m"); + } + if (rv == 64) + goto try; +} + +void +pipe_kick(int fd) +{ + u64 v = 1; + int rv; + + try: + rv = write(fd, &v, sizeof(u64)); + if (rv < 0) + { + if (errno == EINTR) + goto try; + if (errno == EAGAIN) + return; + die("wakeup write: %m"); + } +} + +static inline void +wakeup_init(struct birdloop *loop) +{ + pipe_new(loop->wakeup_fds); +} + +static inline void +wakeup_drain(struct birdloop *loop) +{ + pipe_drain(loop->wakeup_fds[0]); +} + +static inline void +wakeup_do_kick(struct birdloop *loop) +{ + pipe_kick(loop->wakeup_fds[1]); +} + +void +birdloop_ping(struct birdloop *loop) +{ + u32 ping_sent = atomic_fetch_add_explicit(&loop->ping_sent, 1, memory_order_acq_rel); + if (ping_sent) + return; + + if (loop == birdloop_wakeup_masked) + birdloop_wakeup_masked_count++; + else + wakeup_do_kick(loop); +} + + +/* + * Sockets + */ + +static void +sockets_init(struct birdloop *loop) +{ + init_list(&loop->sock_list); + loop->sock_num = 0; + + BUFFER_INIT(loop->poll_sk, loop->pool, 4); + BUFFER_INIT(loop->poll_fd, loop->pool, 4); + loop->poll_changed = 1; /* add wakeup fd */ +} + +static void +sockets_add(struct birdloop *loop, sock *s) +{ + ASSERT_DIE(!enlisted(&s->n)); + + add_tail(&loop->sock_list, &s->n); + loop->sock_num++; + + s->loop = loop; + s->index = -1; + loop->poll_changed = 1; + + birdloop_ping(loop); +} + +void +sk_start(sock *s) +{ + ASSERT_DIE(birdloop_current != &main_birdloop); + sockets_add(birdloop_current, s); +} + +static void +sockets_remove(struct birdloop *loop, sock *s) +{ + ASSERT_DIE(s->loop == loop); + + if (!enlisted(&s->n)) + return; + + rem_node(&s->n); + loop->sock_num--; + + if (s->index >= 0) + { + loop->poll_sk.data[s->index] = NULL; + s->index = -1; + loop->poll_changed = 1; + birdloop_ping(loop); + } + + s->loop = NULL; +} + +void +sk_stop(sock *s) +{ + sockets_remove(birdloop_current, s); +} + +static inline uint sk_want_events(sock *s) +{ + uint out = ((s->ttx != s->tpos) ? POLLOUT : 0); + if (s->rx_hook) + if (s->cork) + { + LOCK_DOMAIN(cork, s->cork->lock); + if (!enlisted(&s->cork_node)) + if (s->cork->count) + { +// log(L_TRACE "Socket %p corked", s); + add_tail(&s->cork->sockets, &s->cork_node); + } + else + out |= POLLIN; + UNLOCK_DOMAIN(cork, s->cork->lock); + } + else + out |= POLLIN; + +// log(L_TRACE "sk_want_events(%p) = %x", s, out); + return out; +} + + +void +sk_ping(sock *s) +{ + s->loop->poll_changed = 1; + birdloop_ping(s->loop); +} + +/* +FIXME: this should be called from sock code + +static void +sockets_update(struct birdloop *loop, sock *s) +{ + if (s->index >= 0) + loop->poll_fd.data[s->index].events = sk_want_events(s); +} +*/ + +static void +sockets_prepare(struct birdloop *loop) +{ + BUFFER_SET(loop->poll_sk, loop->sock_num + 1); + BUFFER_SET(loop->poll_fd, loop->sock_num + 1); + + struct pollfd *pfd = loop->poll_fd.data; + sock **psk = loop->poll_sk.data; + uint i = 0; + node *n; + + WALK_LIST(n, loop->sock_list) + { + sock *s = SKIP_BACK(sock, n, n); + + ASSERT(i < loop->sock_num); + + s->index = i; + *psk = s; + pfd->fd = s->fd; + pfd->events = sk_want_events(s); + pfd->revents = 0; + + pfd++; + psk++; + i++; + } + + ASSERT(i == loop->sock_num); + + /* Add internal wakeup fd */ + *psk = NULL; + pfd->fd = loop->wakeup_fds[0]; + pfd->events = POLLIN; + pfd->revents = 0; + + loop->poll_changed = 0; +} + +int sk_read(sock *s, int revents); +int sk_write(sock *s); + +static void +sockets_fire(struct birdloop *loop) +{ + struct pollfd *pfd = loop->poll_fd.data; + sock **psk = loop->poll_sk.data; + int poll_num = loop->poll_fd.used - 1; + + times_update(); + + /* Last fd is internal wakeup fd */ + if (pfd[poll_num].revents & POLLIN) + { + wakeup_drain(loop); + loop->poll_changed = 1; + } + + int i; + for (i = 0; i < poll_num; pfd++, psk++, i++) + { + if (!*psk) + continue; + + if (! pfd->revents) + continue; + + if (pfd->revents & POLLNVAL) + bug("poll: invalid fd %d", pfd->fd); + + int e = 1; + + if (pfd->revents & POLLIN) + while (e && *psk && (*psk)->rx_hook) + e = sk_read(*psk, pfd->revents); + + e = 1; + if (pfd->revents & POLLOUT) + { + loop->poll_changed = 1; + while (e && *psk) + e = sk_write(*psk); + } + } +} + + +/* + * Birdloop + */ + +struct birdloop main_birdloop; + +static void birdloop_enter_locked(struct birdloop *loop); + +void +birdloop_init(void) +{ + wakeup_init(&main_birdloop); + + main_birdloop.time.domain = the_bird_domain.the_bird; + main_birdloop.time.loop = &main_birdloop; + + times_update(); + timers_init(&main_birdloop.time, &root_pool); + + root_pool.loop = &main_birdloop; + main_birdloop.pool = &root_pool; + + birdloop_enter_locked(&main_birdloop); +} + +static void birdloop_main(void *arg); + +void +birdloop_free(resource *r) +{ + struct birdloop *loop = (void *) r; + + ASSERT_DIE(loop->links == 0); + domain_free(loop->time.domain); +} + +void +birdloop_dump(resource *r) +{ + struct birdloop *loop = (void *) r; + + debug("%s\n", loop->pool->name); +} + +struct resmem birdloop_memsize(resource *r) +{ + struct birdloop *loop = (void *) r; + + return (struct resmem) { + .effective = sizeof(struct birdloop) - sizeof(resource) - ALLOC_OVERHEAD, + .overhead = ALLOC_OVERHEAD + sizeof(resource) + page_size * list_length(&loop->pages.list), + }; +} + +struct resclass birdloop_class = { + .name = "IO Loop", + .size = sizeof(struct birdloop), + .free = birdloop_free, + .dump = birdloop_dump, + .memsize = birdloop_memsize, +}; + +struct birdloop * +birdloop_new(pool *pp, uint order, const char *name) +{ + struct domain_generic *dg = domain_new(name, order); + + struct birdloop *loop = ralloc(pp, &birdloop_class); + + loop->time.domain = dg; + loop->time.loop = loop; + + birdloop_enter(loop); + + loop->pool = rp_new(pp, loop, name); + loop->parent = pp; + rmove(&loop->r, loop->pool); + + wakeup_init(loop); + ev_init_list(&loop->event_list, loop, name); + timers_init(&loop->time, loop->pool); + sockets_init(loop); + + init_pages(loop); + + loop->time.coro = coro_run(loop->pool, birdloop_main, loop); + + birdloop_leave(loop); + + return loop; +} + +static void +birdloop_do_stop(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + loop->stopped = stopped; + loop->stop_data = data; + wakeup_do_kick(loop); +} + +void +birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + DG_LOCK(loop->time.domain); + birdloop_do_stop(loop, stopped, data); + DG_UNLOCK(loop->time.domain); +} + +void +birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + ASSERT_DIE(loop == birdloop_current); + ASSERT_DIE(DG_IS_LOCKED(loop->time.domain)); + + birdloop_do_stop(loop, stopped, data); +} + +static void +birdloop_enter_locked(struct birdloop *loop) +{ + ASSERT_DIE(DG_IS_LOCKED(loop->time.domain)); + ASSERT_DIE(!birdloop_inside(loop)); + + /* Store the old context */ + loop->prev_loop = birdloop_current; + + /* Put the new context */ + birdloop_current = loop; +} + +void +birdloop_enter(struct birdloop *loop) +{ + DG_LOCK(loop->time.domain); + return birdloop_enter_locked(loop); +} + +static void +birdloop_leave_locked(struct birdloop *loop) +{ + /* Check the current context */ + ASSERT_DIE(birdloop_current == loop); + + /* Restore the old context */ + birdloop_current = loop->prev_loop; +} + +void +birdloop_leave(struct birdloop *loop) +{ + birdloop_leave_locked(loop); + DG_UNLOCK(loop->time.domain); +} + +void +birdloop_mask_wakeups(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_wakeup_masked == NULL); + birdloop_wakeup_masked = loop; +} + +void +birdloop_unmask_wakeups(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_wakeup_masked == loop); + birdloop_wakeup_masked = NULL; + if (birdloop_wakeup_masked_count) + wakeup_do_kick(loop); + + birdloop_wakeup_masked_count = 0; +} + +void +birdloop_link(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_inside(loop)); + loop->links++; +} + +void +birdloop_unlink(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_inside(loop)); + ASSERT_DIE(loop->links); + if (!--loop->links) + birdloop_ping(loop); +} + +static void +birdloop_main(void *arg) +{ + struct birdloop *loop = arg; + timer *t; + int rv, timeout; + + btime loop_begin = current_time(); + + birdloop_enter(loop); + while (1) + { + timers_fire(&loop->time, 0); + if (ev_run_list(&loop->event_list)) + timeout = 0; + else if (t = timers_first(&loop->time)) + timeout = (tm_remains(t) TO_MS) + 1; + else + timeout = -1; + + if (loop->poll_changed) + sockets_prepare(loop); + + btime duration = current_time_update() - loop_begin; + if (duration > config->watchdog_warning) + log(L_WARN "I/O loop cycle took %d ms", (int) (duration TO_MS)); + + birdloop_leave(loop); + + try: + rv = poll(loop->poll_fd.data, loop->poll_fd.used, timeout); + if (rv < 0) + { + if (errno == EINTR || errno == EAGAIN) + goto try; + die("poll: %m"); + } + + birdloop_enter(loop); + + if (loop->stopped && !loop->links) + break; + + loop_begin = current_time_update(); + + if (rv) + sockets_fire(loop); + + atomic_exchange_explicit(&loop->ping_sent, 0, memory_order_acq_rel); + } + + /* Flush remaining events */ + ASSERT_DIE(!ev_run_list(&loop->event_list)); + + /* No timers allowed */ + ASSERT_DIE(timers_count(&loop->time) == 0); + ASSERT_DIE(EMPTY_LIST(loop->sock_list)); + ASSERT_DIE(loop->sock_num == 0); + + birdloop_leave(loop); + + /* Lock parent loop */ + pool *parent = loop->parent; + birdloop_enter(parent->loop); + + /* Move the loop temporarily to parent pool */ + birdloop_enter(loop); + rmove(&loop->r, parent); + birdloop_leave(loop); + + /* Announce loop stop */ + loop->stopped(loop->stop_data); + + /* Free the pool and loop */ + birdloop_enter(loop); + rp_free(loop->pool, parent); + flush_pages(loop); + birdloop_leave(loop); + rfree(&loop->r); + + /* And finally leave the parent loop before finishing */ + birdloop_leave(parent->loop); +} diff --git a/sysdep/unix/io-loop.h b/sysdep/unix/io-loop.h new file mode 100644 index 00000000..1727637a --- /dev/null +++ b/sysdep/unix/io-loop.h @@ -0,0 +1,58 @@ +/* + * BIRD -- I/O and event loop + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_SYSDEP_UNIX_IO_LOOP_H_ +#define _BIRD_SYSDEP_UNIX_IO_LOOP_H_ + +#include "nest/bird.h" + +#include "lib/lists.h" +#include "lib/event.h" +#include "lib/timer.h" + +struct free_pages +{ + list list; /* List of empty pages */ + event *cleanup; /* Event to call when number of pages is outside bounds */ + u16 min, max; /* Minimal and maximal number of free pages kept */ + uint cnt; /* Number of empty pages */ +}; + +struct birdloop +{ + resource r; + + pool *pool; + pool *parent; + + struct timeloop time; + event_list event_list; + list sock_list; + uint sock_num; + + BUFFER(sock *) poll_sk; + BUFFER(struct pollfd) poll_fd; + u8 poll_changed; + + _Atomic u32 ping_sent; + int wakeup_fds[2]; + + uint links; + + struct free_pages pages; + + void (*stopped)(void *data); + void *stop_data; + + struct birdloop *prev_loop; +}; + +extern _Thread_local struct birdloop *birdloop_current; + +void init_pages(struct birdloop *loop); +void flush_pages(struct birdloop *loop); + +#endif diff --git a/sysdep/unix/io.c b/sysdep/unix/io.c index 4fd77453..0e5adc14 100644 --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@ -36,12 +36,14 @@ #include "lib/resource.h" #include "lib/socket.h" #include "lib/event.h" +#include "lib/locking.h" #include "lib/timer.h" #include "lib/string.h" #include "nest/iface.h" #include "conf/conf.h" #include "sysdep/unix/unix.h" +#include "sysdep/unix/io-loop.h" #include CONFIG_INCLUDE_SYSIO_H /* Maximum number of calls of tx handler for one socket in one @@ -122,55 +124,50 @@ rf_fileno(struct rfile *f) btime boot_time; + void -times_init(struct timeloop *loop) +times_update(void) { struct timespec ts; int rv; + btime old_time = current_time(); + btime old_real_time = current_real_time(); + rv = clock_gettime(CLOCK_MONOTONIC, &ts); if (rv < 0) die("Monotonic clock is missing"); if ((ts.tv_sec < 0) || (((u64) ts.tv_sec) > ((u64) 1 << 40))) log(L_WARN "Monotonic clock is crazy"); - - loop->last_time = ts.tv_sec S + ts.tv_nsec NS; - loop->real_time = 0; -} - -void -times_update(struct timeloop *loop) -{ - struct timespec ts; - int rv; - - rv = clock_gettime(CLOCK_MONOTONIC, &ts); - if (rv < 0) - die("clock_gettime: %m"); - + btime new_time = ts.tv_sec S + ts.tv_nsec NS; - if (new_time < loop->last_time) + if (new_time < old_time) log(L_ERR "Monotonic clock is broken"); - loop->last_time = new_time; - loop->real_time = 0; -} - -void -times_update_real_time(struct timeloop *loop) -{ - struct timespec ts; - int rv; - rv = clock_gettime(CLOCK_REALTIME, &ts); if (rv < 0) die("clock_gettime: %m"); - loop->real_time = ts.tv_sec S + ts.tv_nsec NS; -} + btime new_real_time = ts.tv_sec S + ts.tv_nsec NS; + + if (!atomic_compare_exchange_strong_explicit( + &last_time, + &old_time, + new_time, + memory_order_acq_rel, + memory_order_relaxed)) + DBG("Time update collision: last_time"); + if (!atomic_compare_exchange_strong_explicit( + &real_time, + &old_real_time, + new_real_time, + memory_order_acq_rel, + memory_order_relaxed)) + DBG("Time update collision: real_time"); +} /** * DOC: Sockets @@ -797,6 +794,7 @@ sk_free(resource *r) { sock *s = (sock *) r; + ASSERT_DIE(!s->loop || birdloop_inside(s->loop)); sk_free_bufs(s); #ifdef HAVE_LIBSSH @@ -804,20 +802,30 @@ sk_free(resource *r) sk_ssh_free(s); #endif - if (s->fd < 0) - return; + if (s->cork) + { + LOCK_DOMAIN(cork, s->cork->lock); + if (enlisted(&s->cork_node)) + rem_node(&s->cork_node); + UNLOCK_DOMAIN(cork, s->cork->lock); + } - /* FIXME: we should call sk_stop() for SKF_THREAD sockets */ - if (!(s->flags & SKF_THREAD)) + if (!s->loop) + ; + else if (s->flags & SKF_THREAD) + sk_stop(s); + else { if (s == current_sock) current_sock = sk_next(s); if (s == stored_sock) stored_sock = sk_next(s); - rem_node(&s->n); + + if (enlisted(&s->n)) + rem_node(&s->n); } - if (s->type != SK_SSH && s->type != SK_SSH_ACTIVE) + if (s->type != SK_SSH && s->type != SK_SSH_ACTIVE && s->fd != -1) close(s->fd); s->fd = -1; @@ -1108,7 +1116,15 @@ sk_passive_connected(sock *s, int type) return 1; } - sk_insert(t); + if (s->flags & SKF_PASSIVE_THREAD) + t->flags |= SKF_THREAD; + else + { + ASSERT_DIE(s->loop == &main_birdloop); + t->loop = &main_birdloop; + sk_insert(t); + } + sk_alloc_bufs(t); s->rx_hook(t, 0); return 1; @@ -1329,6 +1345,17 @@ sk_open(sock *s) ip_addr bind_addr = IPA_NONE; sockaddr sa; + if (s->flags & SKF_THREAD) + { + ASSERT_DIE(s->loop && (s->loop != &main_birdloop)); + ASSERT_DIE(birdloop_inside(s->loop)); + } + else + { + ASSERT_DIE(!s->loop); + s->loop = &main_birdloop; + } + if (s->type <= SK_IP) { /* @@ -1512,10 +1539,41 @@ sk_open_unix(sock *s, char *name) return -1; s->fd = fd; + s->loop = &main_birdloop; sk_insert(s); return 0; } +static void +sk_reloop_hook(void *_vs) +{ + sock *s = _vs; + if (birdloop_inside(&main_birdloop)) + { + s->flags &= ~SKF_THREAD; + sk_insert(s); + } + else + { + s->flags |= SKF_THREAD; + sk_start(s); + } +} + +void +sk_reloop(sock *s, struct birdloop *loop) +{ + if (enlisted(&s->n)) + rem_node(&s->n); + + s->reloop = (event) { + .hook = sk_reloop_hook, + .data = s, + }; + + ev_send_loop(loop, &s->reloop); +} + #define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \ CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL) @@ -1661,6 +1719,7 @@ sk_maybe_write(sock *s) s->err_hook(s, (errno != EPIPE) ? errno : 0); return -1; } + sk_ping(s); return 0; } s->ttx += e; @@ -1868,6 +1927,25 @@ sk_read(sock *s, int revents) case SK_TCP: case SK_UNIX: { + if (s->cork) + { + int cont = 0; + LOCK_DOMAIN(cork, s->cork->lock); + if (!enlisted(&s->cork_node)) + if (s->cork->count) + { +// log(L_TRACE "Socket %p corked", s); + add_tail(&s->cork->sockets, &s->cork_node); + sk_ping(s); + } + else + cont = 1; + UNLOCK_DOMAIN(cork, s->cork->lock); + + if (!cont) + return 0; + } + int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos); if (c < 0) @@ -2020,30 +2098,17 @@ struct event_log_entry static struct event_log_entry event_log[EVENT_LOG_LENGTH]; static struct event_log_entry *event_open; static int event_log_pos, event_log_num, watchdog_active; -static btime last_time; +static btime last_io_time; static btime loop_time; static void io_update_time(void) { - struct timespec ts; - int rv; - - /* - * This is third time-tracking procedure (after update_times() above and - * times_update() in BFD), dedicated to internal event log and latency - * tracking. Hopefully, we consolidate these sometimes. - */ - - rv = clock_gettime(CLOCK_MONOTONIC, &ts); - if (rv < 0) - die("clock_gettime: %m"); - - last_time = ts.tv_sec S + ts.tv_nsec NS; + last_io_time = current_time_update(); if (event_open) { - event_open->duration = last_time - event_open->timestamp; + event_open->duration = last_io_time - event_open->timestamp; if (event_open->duration > config->latency_limit) log(L_WARN "Event 0x%p 0x%p took %d ms", @@ -2072,7 +2137,7 @@ io_log_event(void *hook, void *data) en->hook = hook; en->data = data; - en->timestamp = last_time; + en->timestamp = last_io_time; en->duration = 0; event_log_num++; @@ -2100,14 +2165,14 @@ io_log_dump(void) struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH; if (en->hook) log(L_DEBUG " Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data, - (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS)); + (int) ((last_io_time - en->timestamp) TO_MS), (int) (en->duration TO_MS)); } } void watchdog_sigalrm(int sig UNUSED) { - /* Update last_time and duration, but skip latency check */ + /* Update last_io_time and duration, but skip latency check */ config->latency_limit = 0xffffffff; io_update_time(); @@ -2120,7 +2185,7 @@ watchdog_start1(void) { io_update_time(); - loop_time = last_time; + loop_time = last_io_time; } static inline void @@ -2128,7 +2193,7 @@ watchdog_start(void) { io_update_time(); - loop_time = last_time; + loop_time = last_io_time; event_log_num = 0; if (config->watchdog_timeout) @@ -2149,7 +2214,7 @@ watchdog_stop(void) watchdog_active = 0; } - btime duration = last_time - loop_time; + btime duration = last_io_time - loop_time; if (duration > config->watchdog_warning) log(L_WARN "I/O loop cycle took %d ms for %d events", (int) (duration TO_MS), event_log_num); @@ -2164,8 +2229,9 @@ void io_init(void) { init_list(&sock_list); - init_list(&global_event_list); - init_list(&global_work_list); + ev_init_list(&global_event_list, &main_birdloop, "Global event list"); + ev_init_list(&global_work_list, &main_birdloop, "Global work list"); + ev_init_list(&main_birdloop.event_list, &main_birdloop, "Global fast event list"); krt_io_init(); // XXX init_times(); // XXX update_times(); @@ -2179,11 +2245,15 @@ static int short_loops = 0; #define SHORT_LOOP_MAX 10 #define WORK_EVENTS_MAX 10 +void pipe_drain(int fd); +void check_stored_pages(void); + void io_loop(void) { int poll_tout, timeout; int nfds, events, pout; + int reload_requested = 0; timer *t; sock *s; node *n; @@ -2193,27 +2263,39 @@ io_loop(void) watchdog_start1(); for(;;) { - times_update(&main_timeloop); + times_update(); events = ev_run_list(&global_event_list); events = ev_run_list_limited(&global_work_list, WORK_EVENTS_MAX) || events; - timers_fire(&main_timeloop); + events = ev_run_list(&main_birdloop.event_list) || events; + timers_fire(&main_birdloop.time, 1); io_close_event(); +#if DEBUGGING +#define PERIODIC_WAKEUP 86400000 +#else +#define PERIODIC_WAKEUP 3000 +#endif +restart_poll: // FIXME - poll_tout = (events ? 0 : 3000); /* Time in milliseconds */ - if (t = timers_first(&main_timeloop)) + poll_tout = ((reload_requested || events) ? 0 : PERIODIC_WAKEUP); /* Time in milliseconds */ + if (t = timers_first(&main_birdloop.time)) { - times_update(&main_timeloop); + times_update(); timeout = (tm_remains(t) TO_MS) + 1; poll_tout = MIN(poll_tout, timeout); } - nfds = 0; + /* A hack to reload main io_loop() when something has changed asynchronously. */ + pfd[0].fd = main_birdloop.wakeup_fds[0]; + pfd[0].events = POLLIN; + + nfds = 1; + WALK_LIST(n, sock_list) { pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */ s = SKIP_BACK(sock, n, n); - if (s->rx_hook) + if (s->rx_hook && !ev_corked(s->cork)) { pfd[nfds].fd = s->fd; pfd[nfds].events |= POLLIN; @@ -2267,7 +2349,9 @@ io_loop(void) /* And finally enter poll() to find active sockets */ watchdog_stop(); + birdloop_leave(&main_birdloop); pout = poll(pfd, nfds, poll_tout); + birdloop_enter(&main_birdloop); watchdog_start(); if (pout < 0) @@ -2276,9 +2360,24 @@ io_loop(void) continue; die("poll: %m"); } + + if (pout && (pfd[0].revents & POLLIN)) + { + /* IO loop reload requested */ + pipe_drain(main_birdloop.wakeup_fds[0]); + reload_requested = 1; + goto restart_poll; + } + + if (reload_requested) + { + reload_requested = 0; + atomic_exchange_explicit(&main_birdloop.ping_sent, 0, memory_order_acq_rel); + } + if (pout) { - times_update(&main_timeloop); + times_update(); /* guaranteed to be non-empty */ current_sock = SKIP_BACK(sock, n, HEAD(sock_list)); diff --git a/sysdep/unix/krt.c b/sysdep/unix/krt.c index 7c2614b1..0cb86213 100644 --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@ -74,7 +74,7 @@ static list krt_proto_list; void krt_io_init(void) { - krt_pool = rp_new(&root_pool, "Kernel Syncer"); + krt_pool = rp_new(&root_pool, &main_birdloop, "Kernel Syncer"); krt_filter_lp = lp_new_default(krt_pool); init_list(&krt_proto_list); krt_sys_io_init(); @@ -251,14 +251,14 @@ static inline void krt_trace_in(struct krt_proto *p, rte *e, char *msg) { if (p->p.debug & D_PACKETS) - log(L_TRACE "%s: %N: %s", p->p.name, e->net->n.addr, msg); + log(L_TRACE "%s: %N: %s", p->p.name, e->net, msg); } static inline void krt_trace_in_rl(struct tbf *f, struct krt_proto *p, rte *e, char *msg) { if (p->p.debug & D_PACKETS) - log_rl(f, L_TRACE "%s: %N: %s", p->p.name, e->net->n.addr, msg); + log_rl(f, L_TRACE "%s: %N: %s", p->p.name, e->net, msg); } /* @@ -277,261 +277,33 @@ static struct tbf rl_alien = TBF_DEFAULT_LOG_LIMITS; * the same key. */ -static inline int -krt_same_key(rte *a, rte *b) +static inline u32 +krt_metric(rte *a) { - return a->u.krt.metric == b->u.krt.metric; + eattr *ea = ea_find(a->attrs->eattrs, EA_KRT_METRIC); + return ea ? ea->u.data : 0; } static inline int -krt_uptodate(rte *a, rte *b) -{ - if (a->attrs != b->attrs) - return 0; - - if (a->u.krt.proto != b->u.krt.proto) - return 0; - - return 1; -} - -static void -krt_learn_announce_update(struct krt_proto *p, rte *e) -{ - net *n = e->net; - rta *aa = rta_clone(e->attrs); - rte *ee = rte_get_temp(aa); - ee->pflags = EA_ID_FLAG(EA_KRT_SOURCE) | EA_ID_FLAG(EA_KRT_METRIC); - ee->u.krt = e->u.krt; - rte_update(&p->p, n->n.addr, ee); -} - -static void -krt_learn_announce_delete(struct krt_proto *p, net *n) +krt_rte_better(rte *a, rte *b) { - rte_update(&p->p, n->n.addr, NULL); + return (krt_metric(a) > krt_metric(b)); } /* Called when alien route is discovered during scan */ static void -krt_learn_scan(struct krt_proto *p, rte *e) -{ - net *n0 = e->net; - net *n = net_get(p->krt_table, n0->n.addr); - rte *m, **mm; - - e->attrs = rta_lookup(e->attrs); - - for(mm=&n->routes; m = *mm; mm=&m->next) - if (krt_same_key(m, e)) - break; - if (m) - { - if (krt_uptodate(m, e)) - { - krt_trace_in_rl(&rl_alien, p, e, "[alien] seen"); - rte_free(e); - m->u.krt.seen = 1; - } - else - { - krt_trace_in(p, e, "[alien] updated"); - *mm = m->next; - rte_free(m); - m = NULL; - } - } - else - krt_trace_in(p, e, "[alien] created"); - if (!m) - { - e->next = n->routes; - n->routes = e; - e->u.krt.seen = 1; - } -} - -static void -krt_learn_prune(struct krt_proto *p) -{ - struct fib *fib = &p->krt_table->fib; - struct fib_iterator fit; - - KRT_TRACE(p, D_EVENTS, "Pruning inherited routes"); - - FIB_ITERATE_INIT(&fit, fib); -again: - FIB_ITERATE_START(fib, &fit, net, n) - { - rte *e, **ee, *best, **pbest, *old_best; - - /* - * Note that old_best may be NULL even if there was an old best route in - * the previous step, because it might be replaced in krt_learn_scan(). - * But in that case there is a new valid best route. - */ - - old_best = NULL; - best = NULL; - pbest = NULL; - ee = &n->routes; - while (e = *ee) - { - if (e->u.krt.best) - old_best = e; - - if (!e->u.krt.seen) - { - *ee = e->next; - rte_free(e); - continue; - } - - if (!best || best->u.krt.metric > e->u.krt.metric) - { - best = e; - pbest = ee; - } - - e->u.krt.seen = 0; - e->u.krt.best = 0; - ee = &e->next; - } - if (!n->routes) - { - DBG("%I/%d: deleting\n", n->n.prefix, n->n.pxlen); - if (old_best) - krt_learn_announce_delete(p, n); - - FIB_ITERATE_PUT(&fit); - fib_delete(fib, n); - goto again; - } - - best->u.krt.best = 1; - *pbest = best->next; - best->next = n->routes; - n->routes = best; - - if ((best != old_best) || p->reload) - { - DBG("%I/%d: announcing (metric=%d)\n", n->n.prefix, n->n.pxlen, best->u.krt.metric); - krt_learn_announce_update(p, best); - } - else - DBG("%I/%d: uptodate (metric=%d)\n", n->n.prefix, n->n.pxlen, best->u.krt.metric); - } - FIB_ITERATE_END; - - p->reload = 0; -} - -static void -krt_learn_async(struct krt_proto *p, rte *e, int new) +krt_learn_rte(struct krt_proto *p, rte *e) { - net *n0 = e->net; - net *n = net_get(p->krt_table, n0->n.addr); - rte *g, **gg, *best, **bestp, *old_best; - - e->attrs = rta_lookup(e->attrs); - - old_best = n->routes; - for(gg=&n->routes; g = *gg; gg = &g->next) - if (krt_same_key(g, e)) - break; - if (new) - { - if (g) - { - if (krt_uptodate(g, e)) - { - krt_trace_in(p, e, "[alien async] same"); - rte_free(e); - return; - } - krt_trace_in(p, e, "[alien async] updated"); - *gg = g->next; - rte_free(g); - } - else - krt_trace_in(p, e, "[alien async] created"); - - e->next = n->routes; - n->routes = e; - } - else if (!g) - { - krt_trace_in(p, e, "[alien async] delete failed"); - rte_free(e); - return; - } - else - { - krt_trace_in(p, e, "[alien async] removed"); - *gg = g->next; - rte_free(e); - rte_free(g); - } - best = n->routes; - bestp = &n->routes; - for(gg=&n->routes; g=*gg; gg=&g->next) - { - if (best->u.krt.metric > g->u.krt.metric) - { - best = g; - bestp = gg; - } - - g->u.krt.best = 0; - } - - if (best) - { - best->u.krt.best = 1; - *bestp = best->next; - best->next = n->routes; - n->routes = best; - } - - if (best != old_best) - { - DBG("krt_learn_async: distributing change\n"); - if (best) - krt_learn_announce_update(p, best); - else - krt_learn_announce_delete(p, n); - } + struct rte_src *src = e->src = rt_get_source(&p->p, krt_metric(e)); + rte_update(p->p.main_channel, e->net, e, e->src); + rt_unlock_source(src); } static void krt_learn_init(struct krt_proto *p) { if (KRT_CF->learn) - { - struct rtable_config *cf = mb_allocz(p->p.pool, sizeof(struct rtable_config)); - cf->name = "Inherited"; - cf->addr_type = p->p.net_type; - cf->internal = 1; - - p->krt_table = rt_setup(p->p.pool, cf); - } -} - -static void -krt_dump(struct proto *P) -{ - struct krt_proto *p = (struct krt_proto *) P; - - if (!KRT_CF->learn) - return; - debug("KRT: Table of inheritable routes\n"); - rt_dump(p->krt_table); -} - -static void -krt_dump_attrs(rte *e) -{ - debug(" [m=%d,p=%d]", e->u.krt.metric, e->u.krt.proto); + channel_setup_in_table(p->p.main_channel, 1); } #endif @@ -543,47 +315,58 @@ krt_dump_attrs(rte *e) static inline int krt_is_installed(struct krt_proto *p, net *n) { - return n->routes && bmap_test(&p->p.main_channel->export_map, n->routes->id); + return n->routes && bmap_test(&p->p.main_channel->export_map, n->routes->rte.id); } -static void -krt_flush_routes(struct krt_proto *p) +static uint +rte_feed_count(net *n) { - struct rtable *t = p->p.main_channel->table; + uint count = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + if (rte_is_valid(RTES_OR_NULL(e))) + count++; + return count; +} - KRT_TRACE(p, D_EVENTS, "Flushing kernel routes"); - FIB_WALK(&t->fib, net, n) +static void +rte_feed_obtain(net *n, rte **feed, uint count) +{ + uint i = 0; + for (struct rte_storage *e = n->routes; e; e = e->next) + if (rte_is_valid(RTES_OR_NULL(e))) { - if (krt_is_installed(p, n)) - { - /* FIXME: this does not work if gw is changed in export filter */ - krt_replace_rte(p, n, NULL, n->routes); - } + ASSERT_DIE(i < count); + feed[i++] = &e->rte; } - FIB_WALK_END; + ASSERT_DIE(i == count); } static struct rte * -krt_export_net(struct krt_proto *p, net *net, rte **rt_free) +krt_export_net(struct krt_proto *p, net *net) { struct channel *c = p->p.main_channel; const struct filter *filter = c->out_filter; - rte *rt; if (c->ra_mode == RA_MERGED) - return rt_export_merged(c, net, rt_free, krt_filter_lp, 1); + { + uint count = rte_feed_count(net); + if (!count) + return NULL; - rt = net->routes; - *rt_free = NULL; + rte **feed = alloca(count * sizeof(rte *)); + rte_feed_obtain(net, feed, count); + return rt_export_merged(c, feed, count, krt_filter_lp, 1); + } - if (!rte_is_valid(rt)) + static _Thread_local rte rt; + rt = net->routes->rte; + + if (!rte_is_valid(&rt)) return NULL; if (filter == FILTER_REJECT) return NULL; - rte_make_tmp_attrs(&rt, krt_filter_lp, NULL); - /* We could run krt_preexport() here, but it is already handled by krt_is_installed() */ if (filter == FILTER_ACCEPT) @@ -594,13 +377,9 @@ krt_export_net(struct krt_proto *p, net *net, rte **rt_free) accept: - if (rt != net->routes) - *rt_free = rt; - return rt; + return &rt; reject: - if (rt != net->routes) - rte_free(rt); return NULL; } @@ -624,13 +403,13 @@ krt_same_dest(rte *k, rte *e) */ void -krt_got_route(struct krt_proto *p, rte *e) +krt_got_route(struct krt_proto *p, rte *e, s8 src) { - rte *new = NULL, *rt_free = NULL; - net *n = e->net; + rte *new = NULL; + e->pflags = 0; #ifdef KRT_ALLOW_LEARN - switch (e->u.krt.src) + switch (src) { case KRT_SRC_KERNEL: goto ignore; @@ -640,26 +419,28 @@ krt_got_route(struct krt_proto *p, rte *e) case KRT_SRC_ALIEN: if (KRT_CF->learn) - krt_learn_scan(p, e); + krt_learn_rte(p, e); else - { - krt_trace_in_rl(&rl_alien, p, e, "[alien] ignored"); - rte_free(e); - } + krt_trace_in_rl(&rl_alien, p, e, "[alien] ignored"); return; } #endif /* The rest is for KRT_SRC_BIRD (or KRT_SRC_UNKNOWN) */ + RT_LOCK(p->p.main_channel->table); + /* Deleting all routes if flush is requested */ + if (p->flush_routes) + goto delete; /* We wait for the initial feed to have correct installed state */ if (!p->ready) goto ignore; - if (!krt_is_installed(p, n)) + net *net = net_find(RT_PRIV(p->p.main_channel->table), e->net); + if (!net || !krt_is_installed(p, net)) goto delete; - new = krt_export_net(p, n, &rt_free); + new = krt_export_net(p, net); /* Rejected by filters */ if (!new) @@ -692,20 +473,16 @@ ignore: update: krt_trace_in(p, new, "updating"); - krt_replace_rte(p, n, new, e); + krt_replace_rte(p, e->net, new, e); goto done; delete: krt_trace_in(p, e, "deleting"); - krt_replace_rte(p, n, NULL, e); + krt_replace_rte(p, e->net, NULL, e); goto done; done: - rte_free(e); - - if (rt_free) - rte_free(rt_free); - + RT_UNLOCK(p->p.main_channel->table); lp_flush(krt_filter_lp); } @@ -713,50 +490,65 @@ static void krt_init_scan(struct krt_proto *p) { bmap_reset(&p->seen_map, 1024); + +#ifdef KRT_ALLOW_LEARN + if (KRT_CF->learn) + channel_refresh_begin(p->p.main_channel); +#endif } static void krt_prune(struct krt_proto *p) { - struct rtable *t = p->p.main_channel->table; + RT_LOCK(p->p.main_channel->table); + rtable_private *t = RT_PRIV(p->p.main_channel->table); KRT_TRACE(p, D_EVENTS, "Pruning table %s", t->name); FIB_WALK(&t->fib, net, n) { - if (p->ready && krt_is_installed(p, n) && !bmap_test(&p->seen_map, n->routes->id)) + if (p->ready && krt_is_installed(p, n) && !bmap_test(&p->seen_map, n->routes->rte.id)) { - rte *rt_free = NULL; - rte *new = krt_export_net(p, n, &rt_free); + rte *new = krt_export_net(p, n); if (new) { krt_trace_in(p, new, "installing"); - krt_replace_rte(p, n, new, NULL); + krt_replace_rte(p, n->n.addr, new, NULL); } - if (rt_free) - rte_free(rt_free); - lp_flush(krt_filter_lp); } } FIB_WALK_END; + RT_UNLOCK(p->p.main_channel->table); + #ifdef KRT_ALLOW_LEARN if (KRT_CF->learn) - krt_learn_prune(p); + channel_refresh_end(p->p.main_channel); #endif if (p->ready) p->initialized = 1; } +static void +krt_flush_routes(struct krt_proto *p) +{ + KRT_TRACE(p, D_EVENTS, "Flushing kernel routes"); + p->flush_routes = 1; + krt_init_scan(p); + krt_do_scan(p); + /* No prune! */ + p->flush_routes = 0; +} + void -krt_got_route_async(struct krt_proto *p, rte *e, int new) +krt_got_route_async(struct krt_proto *p, rte *e, int new, s8 src) { - net *net = e->net; + e->pflags = 0; - switch (e->u.krt.src) + switch (src) { case KRT_SRC_BIRD: /* Should be filtered by the back end */ @@ -766,7 +558,7 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new) if (new) { krt_trace_in(p, e, "[redirect] deleting"); - krt_replace_rte(p, net, NULL, e); + krt_replace_rte(p, e->net, NULL, e); } /* If !new, it is probably echo of our deletion */ break; @@ -775,12 +567,11 @@ krt_got_route_async(struct krt_proto *p, rte *e, int new) case KRT_SRC_ALIEN: if (KRT_CF->learn) { - krt_learn_async(p, e, new); + krt_learn_rte(p, e); return; } #endif } - rte_free(e); } /* @@ -886,30 +677,15 @@ krt_scan_timer_kick(struct krt_proto *p) * Updates */ -static void -krt_make_tmp_attrs(struct rte *rt, struct linpool *pool) -{ - rte_init_tmp_attrs(rt, pool, 2); - rte_make_tmp_attr(rt, EA_KRT_SOURCE, EAF_TYPE_INT, rt->u.krt.proto); - rte_make_tmp_attr(rt, EA_KRT_METRIC, EAF_TYPE_INT, rt->u.krt.metric); -} - -static void -krt_store_tmp_attrs(struct rte *rt, struct linpool *pool) -{ - rte_init_tmp_attrs(rt, pool, 2); - rt->u.krt.proto = rte_store_tmp_attr(rt, EA_KRT_SOURCE); - rt->u.krt.metric = rte_store_tmp_attr(rt, EA_KRT_METRIC); -} - static int -krt_preexport(struct proto *P, rte **new, struct linpool *pool UNUSED) +krt_preexport(struct channel *c, rte *e) { - // struct krt_proto *p = (struct krt_proto *) P; - rte *e = *new; - - if (e->attrs->src->proto == P) + if (e->src->owner == &c->proto->sources) +#ifdef CONFIG_SINGLE_ROUTE + return 1; /* Passing the route directly for rt_notify() to ignore */ +#else return -1; +#endif if (!krt_capable(e)) return -1; @@ -918,8 +694,8 @@ krt_preexport(struct proto *P, rte **new, struct linpool *pool UNUSED) } static void -krt_rt_notify(struct proto *P, struct channel *ch UNUSED, net *net, - rte *new, rte *old) +krt_rt_notify(struct proto *P, struct channel *ch UNUSED, const net_addr *net, + rte *new, const rte *old) { struct krt_proto *p = (struct krt_proto *) P; @@ -928,13 +704,10 @@ krt_rt_notify(struct proto *P, struct channel *ch UNUSED, net *net, #ifdef CONFIG_SINGLE_ROUTE /* - * Implicit withdraw - when the imported kernel route becomes the best one, - * we know that the previous one exported to the kernel was already removed, - * but if we processed the update as usual, we would send withdraw to the - * kernel, which would remove the new imported route instead. + * When the imported kernel route becomes the best one, we get it directly and + * we simply know that it is already there. Nothing to do. */ - rte *best = net->routes; - if (!new && best && (best->attrs->src->proto == P)) + if (new->src->owner == &P->sources) return; #endif @@ -983,14 +756,6 @@ krt_feed_end(struct channel *C) } -static int -krt_rte_same(rte *a, rte *b) -{ - /* src is always KRT_SRC_ALIEN and type is irrelevant */ - return (a->u.krt.proto == b->u.krt.proto) && (a->u.krt.metric == b->u.krt.metric); -} - - /* * Protocol glue */ @@ -1049,9 +814,7 @@ krt_init(struct proto_config *CF) p->p.if_notify = krt_if_notify; p->p.reload_routes = krt_reload_routes; p->p.feed_end = krt_feed_end; - p->p.make_tmp_attrs = krt_make_tmp_attrs; - p->p.store_tmp_attrs = krt_store_tmp_attrs; - p->p.rte_same = krt_rte_same; + p->p.rte_better = krt_rte_better; krt_sys_init(p); return &p->p; @@ -1207,8 +970,4 @@ struct protocol proto_unix_kernel = { .reconfigure = krt_reconfigure, .copy_config = krt_copy_config, .get_attr = krt_get_attr, -#ifdef KRT_ALLOW_LEARN - .dump = krt_dump, - .dump_attrs = krt_dump_attrs, -#endif }; diff --git a/sysdep/unix/krt.h b/sysdep/unix/krt.h index 62228f08..968c5b16 100644 --- a/sysdep/unix/krt.h +++ b/sysdep/unix/krt.h @@ -24,6 +24,9 @@ struct kif_proto; #define EA_KRT_SOURCE EA_CODE(PROTOCOL_KERNEL, 0) #define EA_KRT_METRIC EA_CODE(PROTOCOL_KERNEL, 1) +#define KRT_REF_SEEN 0x1 /* Seen in table */ +#define KRT_REF_BEST 0x2 /* Best in table */ + /* Whenever we recognize our own routes, we allow learing of foreign routes */ #ifdef CONFIG_SELF_CONSCIOUS @@ -48,10 +51,6 @@ struct krt_proto { struct proto p; struct krt_state sys; /* Sysdep state */ -#ifdef KRT_ALLOW_LEARN - struct rtable *krt_table; /* Internal table of inherited routes */ -#endif - #ifndef CONFIG_ALL_TABLES_AT_ONCE timer *scan_timer; #endif @@ -63,6 +62,7 @@ struct krt_proto { byte ready; /* Initial feed has been finished */ byte initialized; /* First scan has been finished */ byte reload; /* Next scan is doing reload */ + byte flush_routes; /* Scanning to flush */ }; extern pool *krt_pool; @@ -76,8 +76,8 @@ extern pool *krt_pool; struct proto_config * kif_init_config(int class); void kif_request_scan(void); -void krt_got_route(struct krt_proto *p, struct rte *e); -void krt_got_route_async(struct krt_proto *p, struct rte *e, int new); +void krt_got_route(struct krt_proto *p, struct rte *e, s8 src); +void krt_got_route_async(struct krt_proto *p, struct rte *e, int new, s8 src); static inline int krt_get_sync_error(struct krt_proto *p, struct rte *e) @@ -140,7 +140,7 @@ void krt_sys_copy_config(struct krt_config *, struct krt_config *); int krt_capable(rte *e); void krt_do_scan(struct krt_proto *); -void krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old); +void krt_replace_rte(struct krt_proto *p, const net_addr *n, rte *new, const rte *old); int krt_sys_get_attr(const eattr *a, byte *buf, int buflen); diff --git a/sysdep/unix/log.c b/sysdep/unix/log.c index 14d18c01..68a04e78 100644 --- a/sysdep/unix/log.c +++ b/sysdep/unix/log.c @@ -15,6 +15,7 @@ * user's manual. */ +#include <stdatomic.h> #include <stdio.h> #include <stdlib.h> #include <stdarg.h> @@ -35,8 +36,10 @@ static FILE *dbgf; static list *current_log_list; static char *current_syslog_name; /* NULL -> syslog closed */ +static _Atomic uint max_coro_id = ATOMIC_VAR_INIT(1); +static _Thread_local uint this_coro_id; -#ifdef USE_PTHREADS +#define THIS_CORO_ID (this_coro_id ?: (this_coro_id = atomic_fetch_add_explicit(&max_coro_id, 1, memory_order_acq_rel))) #include <pthread.h> @@ -48,15 +51,6 @@ static pthread_t main_thread; void main_thread_init(void) { main_thread = pthread_self(); } static int main_thread_self(void) { return pthread_equal(pthread_self(), main_thread); } -#else - -static inline void log_lock(void) { } -static inline void log_unlock(void) { } -void main_thread_init(void) { } -static int main_thread_self(void) { return 1; } - -#endif - #ifdef HAVE_SYSLOG_H #include <sys/syslog.h> @@ -189,7 +183,7 @@ log_commit(int class, buffer *buf) l->pos += msg_len; } - fprintf(l->fh, "%s <%s> ", tbuf, class_names[class]); + fprintf(l->fh, "%s [%04x] <%s> ", tbuf, THIS_CORO_ID, class_names[class]); } fputs(buf->start, l->fh); fputc('\n', l->fh); @@ -299,6 +293,8 @@ die(const char *msg, ...) exit(1); } +static struct timespec dbg_time_start; + /** * debug - write to debug output * @msg: a printf-like message @@ -309,22 +305,36 @@ die(const char *msg, ...) void debug(const char *msg, ...) { -#define MAX_DEBUG_BUFSIZE 65536 +#define MAX_DEBUG_BUFSIZE 16384 va_list args; - static uint bufsize = 4096; - static char *buf = NULL; - - if (!buf) - buf = mb_alloc(&root_pool, bufsize); + char buf[MAX_DEBUG_BUFSIZE], *pos = buf; + int max = MAX_DEBUG_BUFSIZE; va_start(args, msg); if (dbgf) { - while (bvsnprintf(buf, bufsize, msg, args) < 0) - if (bufsize >= MAX_DEBUG_BUFSIZE) - bug("Extremely long debug output, split it."); - else - buf = mb_realloc(buf, (bufsize *= 2)); + struct timespec dbg_time; + clock_gettime(CLOCK_MONOTONIC, &dbg_time); + uint nsec; + uint sec; + + if (dbg_time.tv_nsec > dbg_time_start.tv_nsec) + { + nsec = dbg_time.tv_nsec - dbg_time_start.tv_nsec; + sec = dbg_time.tv_sec - dbg_time_start.tv_sec; + } + else + { + nsec = 1000000000 + dbg_time.tv_nsec - dbg_time_start.tv_nsec; + sec = dbg_time.tv_sec - dbg_time_start.tv_sec - 1; + } + + int n = bsnprintf(pos, max, "%u.%09u: [%04x] ", sec, nsec, THIS_CORO_ID); + pos += n; + max -= n; + + if (bvsnprintf(pos, max, msg, args) < 0) + bug("Extremely long debug output, split it."); fputs(buf, dbgf); } @@ -378,6 +388,21 @@ default_log_list(int initial, const char **syslog_name) } void +log_cleanup(int syslog) +{ + struct log_config *l; + + if (current_log_list) + WALK_LIST(l, *current_log_list) + if (l->rf) + log_close(l); + + if (syslog && current_syslog_name) + closelog(); +} + + +void log_switch(int initial, list *logs, const char *new_syslog_name) { struct log_config *l; @@ -389,10 +414,7 @@ log_switch(int initial, list *logs, const char *new_syslog_name) logs = default_log_list(initial, &new_syslog_name); /* Close the logs to avoid pinning them on disk when deleted */ - if (current_log_list) - WALK_LIST(l, *current_log_list) - if (l->rf) - log_close(l); + log_cleanup(0); /* Reopen the logs, needed for 'configure undo' */ if (logs) @@ -429,6 +451,8 @@ done: void log_init_debug(char *f) { + clock_gettime(CLOCK_MONOTONIC, &dbg_time_start); + if (dbgf && dbgf != stderr) fclose(dbgf); if (!f) diff --git a/sysdep/unix/main.c b/sysdep/unix/main.c index 392aff9d..c326ba2b 100644 --- a/sysdep/unix/main.c +++ b/sysdep/unix/main.c @@ -28,6 +28,7 @@ #include "lib/resource.h" #include "lib/socket.h" #include "lib/event.h" +#include "lib/locking.h" #include "lib/timer.h" #include "lib/string.h" #include "nest/route.h" @@ -51,7 +52,7 @@ async_dump(void) { debug("INTERNAL STATE DUMP\n\n"); - rdump(&root_pool); + rp_dump(&root_pool); sk_dump_all(); // XXXX tm_dump_all(); if_dump_all(); @@ -201,7 +202,9 @@ sysdep_preconfig(struct config *c) int sysdep_commit(struct config *new, struct config *old UNUSED) { - log_switch(0, &new->logfiles, new->syslog_name); + if (!new->shutdown) + log_switch(0, &new->logfiles, new->syslog_name); + return 0; } @@ -479,6 +482,14 @@ cli_err(sock *s, int err) cli_free(s->data); } +static void +cli_connect_err(sock *s UNUSED, int err) +{ + ASSERT_DIE(err); + if (config->cli_debug) + log(L_INFO "Failed to accept CLI connection: %s", strerror(err)); +} + static int cli_connect(sock *s, uint size UNUSED) { @@ -507,6 +518,7 @@ cli_init_unix(uid_t use_uid, gid_t use_gid) s = cli_sk = sk_new(cli_pool); s->type = SK_UNIX_PASSIVE; s->rx_hook = cli_connect; + s->err_hook = cli_connect_err; s->rbsize = 1024; s->fast_rx = 1; @@ -603,6 +615,7 @@ sysdep_shutdown_done(void) unlink_pid_file(); unlink(path_control_socket); log_msg(L_FATAL "Shutdown completed"); + log_cleanup(1); exit(0); } @@ -673,7 +686,7 @@ signal_init(void) * Parsing of command-line arguments */ -static char *opt_list = "bc:dD:ps:P:u:g:flRh"; +static char *opt_list = "c:dD:ps:P:u:g:flRh"; int parse_and_exit; char *bird_name; static char *use_user; @@ -852,6 +865,8 @@ parse_args(int argc, char **argv) } } +void resource_sys_init(void); + /* * Hic Est main() */ @@ -864,13 +879,17 @@ main(int argc, char **argv) dmalloc_debug(0x2f03d00); #endif + times_update(); + resource_sys_init(); parse_args(argc, argv); log_switch(1, NULL, NULL); + the_bird_lock(); + random_init(); net_init(); resource_init(); - timer_init(); + birdloop_init(); olock_init(); io_init(); rt_init(); @@ -920,6 +939,7 @@ main(int argc, char **argv) dup2(0, 2); } + main_thread_init(); write_pid_file(); diff --git a/sysdep/unix/unix.h b/sysdep/unix/unix.h index ad85d1ea..51ec1b1e 100644 --- a/sysdep/unix/unix.h +++ b/sysdep/unix/unix.h @@ -122,6 +122,7 @@ void krt_io_init(void); void main_thread_init(void); void log_init_debug(char *); /* Initialize debug dump to given file (NULL=stderr, ""=off) */ void log_switch(int initial, list *l, const char *); +void log_cleanup(int syslog); struct log_config { node n; |