summary | refs | log | tree | commit | diff
path: root/sysdep/unix/alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'sysdep/unix/alloc.c')
-rw-r--r-- sysdep/unix/alloc.c 397
1 files changed, 241 insertions, 156 deletions
diff --git a/sysdep/unix/alloc.c b/sysdep/unix/alloc.c
index c09a8356..cafcc8dd 100644
--- a/sysdep/unix/alloc.c
+++ b/sysdep/unix/alloc.c
@@ -10,239 +10,324 @@
#include "lib/resource.h"
#include "lib/lists.h"
#include "lib/event.h"
+#include "lib/rcu.h"
-#include "sysdep/unix/io-loop.h"
-
+#include <errno.h>
#include <stdlib.h>
#include <unistd.h>
-#include <stdatomic.h>
-#include <errno.h>
#ifdef HAVE_MMAP
#include <sys/mman.h>
#endif
+#ifdef CONFIG_DISABLE_THP
+#include <sys/prctl.h>
+#ifndef PR_SET_THP_DISABLE
+#define PR_SET_THP_DISABLE 41
+#endif
+#endif
+
long page_size = 0;
#ifdef HAVE_MMAP
-#if DEBUGGING
-#define FP_NODE_OFFSET 42
-#else
-#define FP_NODE_OFFSET 1
-#endif
+#define KEEP_PAGES_MAX 512
+#define KEEP_PAGES_MIN 32
+#define KEEP_PAGES_MAX_LOCAL 16
+#define ALLOC_PAGES_AT_ONCE 8
+
+STATIC_ASSERT(KEEP_PAGES_MIN * 4 < KEEP_PAGES_MAX);
+STATIC_ASSERT(ALLOC_PAGES_AT_ONCE < KEEP_PAGES_MAX_LOCAL);
+
static _Bool use_fake = 0;
+static _Bool initialized = 0;
+
+#if DEBUGGING
+struct free_page {
+ node unused[42];
+ struct free_page * _Atomic next;
+};
#else
-static _Bool use_fake = 1;
+struct free_page {
+ struct free_page * _Atomic next;
+};
#endif
+#define EP_POS_MAX ((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *))
+
+struct empty_pages {
+ struct empty_pages *next;
+ uint pos;
+ void *pages[0];
+};
+
+DEFINE_DOMAIN(resource);
+static DOMAIN(resource) empty_pages_domain;
+static struct empty_pages *empty_pages = NULL;
+
+static struct free_page * _Atomic page_stack = NULL;
+static _Thread_local struct free_page * local_page_stack = NULL;
+
+static void page_cleanup(void *);
+static event page_cleanup_event = { .hook = page_cleanup, };
+#define SCHEDULE_CLEANUP do if (initialized && !shutting_down) ev_send(&global_event_list, &page_cleanup_event); while (0)
+
+_Atomic int pages_kept = 0;
+_Atomic int pages_kept_locally = 0;
+static _Thread_local int pages_kept_here = 0;
+
static void *
alloc_sys_page(void)
{
- void *ptr = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ void *ptr = mmap(NULL, page_size * ALLOC_PAGES_AT_ONCE, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (ptr == MAP_FAILED)
- bug("mmap(%lu) failed: %m", page_size);
+ die("mmap(%ld) failed: %m", (s64) page_size);
return ptr;
}
extern int shutting_down; /* Shutdown requested. */
+#else // ! HAVE_MMAP
+#define use_fake 1
+#endif
+
void *
alloc_page(void)
{
-#ifdef HAVE_MMAP
- if (!use_fake)
+ /* If the system page allocator is goofy, we use posix_memalign to get aligned blocks of memory. */
+ if (use_fake)
{
- struct free_pages *fp = &birdloop_current->pages;
- if (!fp->cnt)
- return alloc_sys_page();
-
- node *n = HEAD(fp->list);
- rem_node(n);
- if ((--fp->cnt < fp->min) && !shutting_down)
- ev_send(fp->cleanup->list, fp->cleanup);
-
- void *ptr = n - FP_NODE_OFFSET;
- memset(ptr, 0, page_size);
+ void *ptr = NULL;
+ int err = posix_memalign(&ptr, page_size, page_size);
+
+ if (err || !ptr)
+ die("posix_memalign(%ld) failed", (s64) page_size);
+
return ptr;
}
- else
-#endif
- {
-#ifdef HAVE_ALIGNED_ALLOC
- void *ret = aligned_alloc(page_size, page_size);
- if (!ret)
- bug("aligned_alloc(%lu) failed", page_size);
- return ret;
-#else
- bug("BIRD should have already died on fatal error.");
-#endif
- }
-}
-void
-free_page(void *ptr)
-{
#ifdef HAVE_MMAP
- if (!use_fake)
+ /* If there is any free page kept hot in this thread, we use it. */
+ struct free_page *fp = local_page_stack;
+ if (fp)
{
- struct free_pages *fp = &birdloop_current->pages;
- struct node *n = ptr;
- n += FP_NODE_OFFSET;
-
- memset(n, 0, sizeof(node));
- add_tail(&fp->list, n);
- if ((++fp->cnt > fp->max) && !shutting_down)
- ev_send(fp->cleanup->list, fp->cleanup);
+ local_page_stack = atomic_load_explicit(&fp->next, memory_order_acquire);
+ atomic_fetch_sub_explicit(&pages_kept_locally, 1, memory_order_relaxed);
+ pages_kept_here--;
+ return fp;
}
- else
-#endif
- free(ptr);
-}
-#ifdef HAVE_MMAP
+ ASSERT_DIE(pages_kept_here == 0);
-#define GFP (&main_birdloop.pages)
+ /* If there is any free page kept hot in global storage, we use it. */
+ rcu_read_lock();
+ fp = atomic_load_explicit(&page_stack, memory_order_acquire);
+ while (fp && !atomic_compare_exchange_strong_explicit(
+ &page_stack, &fp, atomic_load_explicit(&fp->next, memory_order_acquire),
+ memory_order_acq_rel, memory_order_acquire))
+ ;
+ rcu_read_unlock();
-void
-flush_pages(struct birdloop *loop)
-{
- ASSERT_DIE(birdloop_inside(loop->parent->loop));
+ if (fp)
+ {
+ atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed);
+ return fp;
+ }
- struct free_pages *fp = &loop->pages;
- struct free_pages *pfp = &loop->parent->loop->pages;
+ /* If there is any free page kept cold, we use that. */
+ LOCK_DOMAIN(resource, empty_pages_domain);
+ if (empty_pages) {
+ if (empty_pages->pos)
+ /* Either the keeper page contains at least one cold page pointer, return that */
+ fp = empty_pages->pages[--empty_pages->pos];
+ else
+ {
+ /* Or the keeper page has no more cold page pointers, return the keeper page itself */
+ fp = (struct free_page *) empty_pages;
+ empty_pages = empty_pages->next;
+ }
+ }
+ UNLOCK_DOMAIN(resource, empty_pages_domain);
- add_tail_list(&pfp->list, &fp->list);
- pfp->cnt += fp->cnt;
-
- fp->cnt = 0;
- fp->list = (list) {};
- fp->min = 0;
- fp->max = 0;
+ if (fp)
+ return fp;
- rfree(fp->cleanup);
- fp->cleanup = NULL;
+ /* And in the worst case, allocate some new pages by mmap() */
+ void *ptr = alloc_sys_page();
+ for (int i=1; i<ALLOC_PAGES_AT_ONCE; i++)
+ free_page(ptr + page_size * i);
+
+ return ptr;
+#endif
}
-static void
-cleanup_pages(void *data)
+void
+free_page(void *ptr)
{
- struct birdloop *loop = data;
- birdloop_enter(loop);
-
- ASSERT_DIE(birdloop_inside(loop->parent->loop));
-
- struct free_pages *fp = &loop->pages;
- struct free_pages *pfp = &loop->parent->loop->pages;
-
- while ((fp->cnt < fp->min) && (pfp->cnt > pfp->min))
+ /* If the system page allocator is goofy, we just free the block and care no more. */
+ if (use_fake)
{
- node *n = HEAD(pfp->list);
- rem_node(n);
- add_tail(&fp->list, n);
- fp->cnt++;
- pfp->cnt--;
- }
-
- while (fp->cnt < fp->min)
- {
- node *n = alloc_sys_page();
- add_tail(&fp->list, n + FP_NODE_OFFSET);
- fp->cnt++;
+ free(ptr);
+ return;
}
- while (fp->cnt > fp->max)
+#ifdef HAVE_MMAP
+ /* We primarily try to keep the pages locally. */
+ struct free_page *fp = ptr;
+ if (shutting_down || (pages_kept_here < KEEP_PAGES_MAX_LOCAL))
{
- node *n = HEAD(fp->list);
- rem_node(n);
- add_tail(&pfp->list, n);
- fp->cnt--;
- pfp->cnt++;
+ atomic_store_explicit(&fp->next, local_page_stack, memory_order_relaxed);
+ local_page_stack = fp;
+
+ atomic_fetch_add_explicit(&pages_kept_locally, 1, memory_order_relaxed);
+ pages_kept_here++;
+ return;
}
- birdloop_leave(loop);
+ /* If there are too many local pages, we add the free page to the global hot-free-page list */
+ rcu_read_lock();
+ struct free_page *next = atomic_load_explicit(&page_stack, memory_order_acquire);
- if (!shutting_down && (pfp->cnt > pfp->max))
- ev_send(pfp->cleanup->list, pfp->cleanup);
+ do atomic_store_explicit(&fp->next, next, memory_order_release);
+ while (!atomic_compare_exchange_strong_explicit(
+ &page_stack, &next, fp,
+ memory_order_acq_rel, memory_order_acquire));
+ rcu_read_unlock();
+
+ /* And if there are too many global hot free pages, we ask for page cleanup */
+ if (atomic_fetch_add_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX)
+ SCHEDULE_CLEANUP;
+#endif
}
-static void
-cleanup_global_pages(void *data UNUSED)
+/* When the routine is going to sleep for a long time, we flush the local
+ * hot page cache to not keep dirty pages for nothing. */
+void
+flush_local_pages(void)
{
- while (GFP->cnt < GFP->max)
- {
- node *n = alloc_sys_page();
- add_tail(&GFP->list, n + FP_NODE_OFFSET);
- GFP->cnt++;
- }
+ if (use_fake || !local_page_stack || shutting_down)
+ return;
- for (uint limit = GFP->cnt; (limit > 0) && (GFP->cnt > GFP->max); limit--)
+ /* We first count the pages to enable consistency checking.
+ * Also, we need to know the last page. */
+ struct free_page *last = local_page_stack, *next;
+ int check_count = 1;
+ while (next = atomic_load_explicit(&last->next, memory_order_acquire))
{
- node *n = TAIL(GFP->list);
- rem_node(n);
-
- if (munmap(n - FP_NODE_OFFSET, page_size) == 0)
- GFP->cnt--;
- else if (errno == ENOMEM)
- add_head(&GFP->list, n);
- else
- bug("munmap(%p) failed: %m", n - FP_NODE_OFFSET);
+ check_count++;
+ last = next;
}
+
+ /* The actual number of pages must be equal to the counter value. */
+ ASSERT_DIE(check_count == pages_kept_here);
+
+ /* Repeatedly try to insert the whole local page list into the global page stack at once. */
+ rcu_read_lock();
+ next = atomic_load_explicit(&page_stack, memory_order_acquire);
+
+ /* First we set the outwards pointer (from our last),
+ * then we try to set the inwards pointer to our first page. */
+ do atomic_store_explicit(&last->next, next, memory_order_release);
+ while (!atomic_compare_exchange_strong_explicit(
+ &page_stack, &next, local_page_stack,
+ memory_order_acq_rel, memory_order_acquire));
+ rcu_read_unlock();
+
+ /* Finished. Now the local stack is empty. */
+ local_page_stack = NULL;
+ pages_kept_here = 0;
+
+ /* Check the state of global page cache and maybe schedule its cleanup. */
+ atomic_fetch_sub_explicit(&pages_kept_locally, check_count, memory_order_relaxed);
+ if (atomic_fetch_add_explicit(&pages_kept, check_count, memory_order_relaxed) >= KEEP_PAGES_MAX)
+ SCHEDULE_CLEANUP;
}
-void
-init_pages(struct birdloop *loop)
+#ifdef HAVE_MMAP
+static void
+page_cleanup(void *_ UNUSED)
{
- struct free_pages *fp = &loop->pages;
+ /* Cleanup on shutdown is ignored. All pages may be kept hot, OS will take care. */
+ if (shutting_down)
+ return;
+
+ struct free_page *stack = atomic_exchange_explicit(&page_stack, NULL, memory_order_acq_rel);
+ if (!stack)
+ return;
- init_list(&fp->list);
- fp->cleanup = ev_new_init(loop->parent->loop->pool, cleanup_pages, loop);
- fp->cleanup->list = (loop->parent->loop == &main_birdloop) ? &global_work_list : birdloop_event_list(loop->parent->loop);
- fp->min = 4;
- fp->max = 16;
- for (fp->cnt = 0; fp->cnt < fp->min; fp->cnt++)
+ do {
+ synchronize_rcu();
+ struct free_page *fp = stack;
+ stack = atomic_load_explicit(&fp->next, memory_order_acquire);
+
+ LOCK_DOMAIN(resource, empty_pages_domain);
+ /* Empty pages are stored as pointers. To store them, we need a pointer block. */
+ if (!empty_pages || (empty_pages->pos == EP_POS_MAX))
+ {
+ /* There is either no pointer block or the last block is full. We use this block as a pointer block. */
+ empty_pages = (struct empty_pages *) fp;
+ *empty_pages = (struct empty_pages) {};
+ }
+ else
+ {
+ /* We store this block as a pointer into the first free place
+ * and tell the OS that the underlying memory is trash. */
+ empty_pages->pages[empty_pages->pos++] = fp;
+ if (madvise(fp, page_size,
+#ifdef CONFIG_MADV_DONTNEED_TO_FREE
+ MADV_DONTNEED
+#else
+ MADV_FREE
+#endif
+ ) < 0)
+ bug("madvise(%p) failed: %m", fp);
+ }
+ UNLOCK_DOMAIN(resource, empty_pages_domain);
+ }
+ while ((atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX / 2) && stack);
+
+ while (stack)
{
- node *n = alloc_sys_page();
- add_tail(&fp->list, n + FP_NODE_OFFSET);
+ struct free_page *f = stack;
+ stack = atomic_load_explicit(&f->next, memory_order_acquire);
+ free_page(f);
+
+ atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed);
}
}
+#endif
-static event global_free_pages_cleanup_event = { .hook = cleanup_global_pages, .list = &global_work_list };
-
-void resource_sys_init(void)
+void
+resource_sys_init(void)
{
+#ifdef CONFIG_DISABLE_THP
+ /* Disable transparent huge pages, they do not work properly with madvise(MADV_DONTNEED) */
+ if (prctl(PR_SET_THP_DISABLE, (unsigned long) 1, (unsigned long) 0, (unsigned long) 0, (unsigned long) 0) < 0)
+ log(L_WARN "Cannot disable transparent huge pages: prctl(PR_SET_THP_DISABLE) failed: %m");
+#endif
+
+#ifdef HAVE_MMAP
+ /* Check what page size the system supports */
if (!(page_size = sysconf(_SC_PAGESIZE)))
die("System page size must be non-zero");
- if (u64_popcount(page_size) == 1)
+ if ((u64_popcount(page_size) == 1) && (page_size >= (1 << 10)) && (page_size <= (1 << 18)))
{
- init_list(&GFP->list);
- GFP->cleanup = &global_free_pages_cleanup_event;
- GFP->min = 0;
- GFP->max = 256;
+ /* We assume that the page size has exactly one bit set (i.e. is a power of two) and is between 1K and 256K (incl.).
+ * Otherwise, the assumptions in lib/slab.c (sl_head's num_full range) aren't met. */
+
+ empty_pages_domain = DOMAIN_NEW(resource, "Empty Pages");
+ initialized = 1;
return;
}
-#ifdef HAVE_ALIGNED_ALLOC
- log(L_WARN "Got strange memory page size (%lu), using the aligned allocator instead", page_size);
-#else
- die("Got strange memory page size (%lu) and aligned_alloc is not available", page_size);
-#endif
-
/* Too big or strange page, use the aligned allocator instead */
- page_size = 4096;
+ log(L_WARN "Got strange memory page size (%ld), using the aligned allocator instead", (s64) page_size);
use_fake = 1;
-}
-
-#else
+#endif
-void
-resource_sys_init(void)
-{
page_size = 4096;
- use_fake = 1;
+ initialized = 1;
}
-
-#endif