Diffstat (limited to 'sysdep/unix/alloc.c')
-rw-r--r-- | sysdep/unix/alloc.c | 397 |
1 file changed, 241 insertions, 156 deletions
diff --git a/sysdep/unix/alloc.c b/sysdep/unix/alloc.c
index c09a8356..cafcc8dd 100644
--- a/sysdep/unix/alloc.c
+++ b/sysdep/unix/alloc.c
@@ -10,239 +10,324 @@
 #include "lib/resource.h"
 #include "lib/lists.h"
 #include "lib/event.h"
+#include "lib/rcu.h"
 
-#include "sysdep/unix/io-loop.h"
-
+#include <errno.h>
 #include <stdlib.h>
 #include <unistd.h>
-#include <stdatomic.h>
-#include <errno.h>
 
 #ifdef HAVE_MMAP
 #include <sys/mman.h>
 #endif
+#ifdef CONFIG_DISABLE_THP
+#include <sys/prctl.h>
+#ifndef PR_SET_THP_DISABLE
+#define PR_SET_THP_DISABLE 41
+#endif
+#endif
+
 long page_size = 0;
 
 #ifdef HAVE_MMAP
-#if DEBUGGING
-#define FP_NODE_OFFSET 42
-#else
-#define FP_NODE_OFFSET 1
-#endif
+#define KEEP_PAGES_MAX 512
+#define KEEP_PAGES_MIN 32
+#define KEEP_PAGES_MAX_LOCAL 16
+#define ALLOC_PAGES_AT_ONCE 8
+
+STATIC_ASSERT(KEEP_PAGES_MIN * 4 < KEEP_PAGES_MAX);
+STATIC_ASSERT(ALLOC_PAGES_AT_ONCE < KEEP_PAGES_MAX_LOCAL);
+
 static _Bool use_fake = 0;
+static _Bool initialized = 0;
+
+#if DEBUGGING
+struct free_page {
+  node unused[42];
+  struct free_page * _Atomic next;
+};
 #else
-static _Bool use_fake = 1;
+struct free_page {
+  struct free_page * _Atomic next;
+};
 #endif
+
+#define EP_POS_MAX ((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *))
+
+struct empty_pages {
+  struct empty_pages *next;
+  uint pos;
+  void *pages[0];
+};
+
+DEFINE_DOMAIN(resource);
+static DOMAIN(resource) empty_pages_domain;
+static struct empty_pages *empty_pages = NULL;
+
+static struct free_page * _Atomic page_stack = NULL;
+static _Thread_local struct free_page * local_page_stack = NULL;
+
+static void page_cleanup(void *);
+static event page_cleanup_event = { .hook = page_cleanup, };
+#define SCHEDULE_CLEANUP do if (initialized && !shutting_down) ev_send(&global_event_list, &page_cleanup_event); while (0)
+
+_Atomic int pages_kept = 0;
+_Atomic int pages_kept_locally = 0;
+static _Thread_local int pages_kept_here = 0;
+
 static void *
 alloc_sys_page(void)
 {
-  void *ptr = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  void *ptr = mmap(NULL, page_size * ALLOC_PAGES_AT_ONCE, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 
   if (ptr == MAP_FAILED)
-    bug("mmap(%lu) failed: %m", page_size);
+    die("mmap(%ld) failed: %m", (s64) page_size);
 
   return ptr;
 }
 
 extern int shutting_down; /* Shutdown requested. */
 
+#else // ! HAVE_MMAP
+#define use_fake 1
+#endif
+
 void *
 alloc_page(void)
 {
-#ifdef HAVE_MMAP
-  if (!use_fake)
+  /* If the system page allocator is goofy, we use posix_memalign to get aligned blocks of memory. */
+  if (use_fake)
   {
-    struct free_pages *fp = &birdloop_current->pages;
-    if (!fp->cnt)
-      return alloc_sys_page();
-
-    node *n = HEAD(fp->list);
-    rem_node(n);
-    if ((--fp->cnt < fp->min) && !shutting_down)
-      ev_send(fp->cleanup->list, fp->cleanup);
-
-    void *ptr = n - FP_NODE_OFFSET;
-    memset(ptr, 0, page_size);
+    void *ptr = NULL;
+    int err = posix_memalign(&ptr, page_size, page_size);
+
+    if (err || !ptr)
+      die("posix_memalign(%ld) failed", (s64) page_size);
+
     return ptr;
   }
-  else
-#endif
-  {
-#ifdef HAVE_ALIGNED_ALLOC
-    void *ret = aligned_alloc(page_size, page_size);
-    if (!ret)
-      bug("aligned_alloc(%lu) failed", page_size);
-    return ret;
-#else
-    bug("BIRD should have already died on fatal error.");
-#endif
-  }
-}
 
-void
-free_page(void *ptr)
-{
 #ifdef HAVE_MMAP
-  if (!use_fake)
+  /* If there is any free page kept hot in this thread, we use it. */
+  struct free_page *fp = local_page_stack;
+  if (fp)
   {
-    struct free_pages *fp = &birdloop_current->pages;
-    struct node *n = ptr;
-    n += FP_NODE_OFFSET;
-
-    memset(n, 0, sizeof(node));
-    add_tail(&fp->list, n);
-    if ((++fp->cnt > fp->max) && !shutting_down)
-      ev_send(fp->cleanup->list, fp->cleanup);
+    local_page_stack = atomic_load_explicit(&fp->next, memory_order_acquire);
+    atomic_fetch_sub_explicit(&pages_kept_locally, 1, memory_order_relaxed);
+    pages_kept_here--;
+    return fp;
   }
-  else
-#endif
-    free(ptr);
-}
 
-#ifdef HAVE_MMAP
+  ASSERT_DIE(pages_kept_here == 0);
 
-#define GFP (&main_birdloop.pages)
+  /* If there is any free page kept hot in global storage, we use it. */
+  rcu_read_lock();
+  fp = atomic_load_explicit(&page_stack, memory_order_acquire);
+  while (fp && !atomic_compare_exchange_strong_explicit(
+        &page_stack, &fp, atomic_load_explicit(&fp->next, memory_order_acquire),
+        memory_order_acq_rel, memory_order_acquire))
+    ;
+  rcu_read_unlock();
 
-void
-flush_pages(struct birdloop *loop)
-{
-  ASSERT_DIE(birdloop_inside(loop->parent->loop));
+  if (fp)
+  {
+    atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed);
+    return fp;
+  }
 
-  struct free_pages *fp = &loop->pages;
-  struct free_pages *pfp = &loop->parent->loop->pages;
+  /* If there is any free page kept cold, we use that. */
+  LOCK_DOMAIN(resource, empty_pages_domain);
+  if (empty_pages) {
+    if (empty_pages->pos)
+      /* Either the keeper page contains at least one cold page pointer, return that */
+      fp = empty_pages->pages[--empty_pages->pos];
+    else
+    {
+      /* Or the keeper page has no more cold page pointers, return the keeper page itself */
+      fp = (struct free_page *) empty_pages;
+      empty_pages = empty_pages->next;
+    }
+  }
+  UNLOCK_DOMAIN(resource, empty_pages_domain);
 
-  add_tail_list(&pfp->list, &fp->list);
-  pfp->cnt += fp->cnt;
-
-  fp->cnt = 0;
-  fp->list = (list) {};
-  fp->min = 0;
-  fp->max = 0;
+  if (fp)
+    return fp;
 
-  rfree(fp->cleanup);
-  fp->cleanup = NULL;
+  /* And in the worst case, allocate some new pages by mmap() */
+  void *ptr = alloc_sys_page();
+  for (int i=1; i<ALLOC_PAGES_AT_ONCE; i++)
+    free_page(ptr + page_size * i);
+
+  return ptr;
+#endif
 }
 
-static void
-cleanup_pages(void *data)
+void
+free_page(void *ptr)
 {
-  struct birdloop *loop = data;
-  birdloop_enter(loop);
-
-  ASSERT_DIE(birdloop_inside(loop->parent->loop));
-
-  struct free_pages *fp = &loop->pages;
-  struct free_pages *pfp = &loop->parent->loop->pages;
-
-  while ((fp->cnt < fp->min) && (pfp->cnt > pfp->min))
+  /* If the system page allocator is goofy, we just free the block and care no more. */
+  if (use_fake)
   {
-    node *n = HEAD(pfp->list);
-    rem_node(n);
-    add_tail(&fp->list, n);
-    fp->cnt++;
-    pfp->cnt--;
-  }
-
-  while (fp->cnt < fp->min)
-  {
-    node *n = alloc_sys_page();
-    add_tail(&fp->list, n + FP_NODE_OFFSET);
-    fp->cnt++;
+    free(ptr);
+    return;
   }
 
-  while (fp->cnt > fp->max)
+#ifdef HAVE_MMAP
+  /* We primarily try to keep the pages locally. */
+  struct free_page *fp = ptr;
+  if (shutting_down || (pages_kept_here < KEEP_PAGES_MAX_LOCAL))
   {
-    node *n = HEAD(fp->list);
-    rem_node(n);
-    add_tail(&pfp->list, n);
-    fp->cnt--;
-    pfp->cnt++;
+    atomic_store_explicit(&fp->next, local_page_stack, memory_order_relaxed);
+    local_page_stack = fp;
+
+    atomic_fetch_add_explicit(&pages_kept_locally, 1, memory_order_relaxed);
+    pages_kept_here++;
+    return;
   }
 
-  birdloop_leave(loop);
+  /* If there are too many local pages, we add the free page to the global hot-free-page list */
+  rcu_read_lock();
+  struct free_page *next = atomic_load_explicit(&page_stack, memory_order_acquire);
 
-  if (!shutting_down && (pfp->cnt > pfp->max))
-    ev_send(pfp->cleanup->list, pfp->cleanup);
+  do atomic_store_explicit(&fp->next, next, memory_order_release);
+  while (!atomic_compare_exchange_strong_explicit(
+        &page_stack, &next, fp,
+        memory_order_acq_rel, memory_order_acquire));
+  rcu_read_unlock();
+
+  /* And if there are too many global hot free pages, we ask for page cleanup */
+  if (atomic_fetch_add_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX)
+    SCHEDULE_CLEANUP;
+#endif
 }
 
-static void
-cleanup_global_pages(void *data UNUSED)
+/* When the routine is going to sleep for a long time, we flush the local
+ * hot page cache to not keep dirty pages for nothing. */
+void
+flush_local_pages(void)
 {
-  while (GFP->cnt < GFP->max)
-  {
-    node *n = alloc_sys_page();
-    add_tail(&GFP->list, n + FP_NODE_OFFSET);
-    GFP->cnt++;
-  }
+  if (use_fake || !local_page_stack || shutting_down)
+    return;
 
-  for (uint limit = GFP->cnt; (limit > 0) && (GFP->cnt > GFP->max); limit--)
+  /* We first count the pages to enable consistency checking.
+   * Also, we need to know the last page. */
+  struct free_page *last = local_page_stack, *next;
+  int check_count = 1;
+  while (next = atomic_load_explicit(&last->next, memory_order_acquire))
   {
-    node *n = TAIL(GFP->list);
-    rem_node(n);
-
-    if (munmap(n - FP_NODE_OFFSET, page_size) == 0)
-      GFP->cnt--;
-    else if (errno == ENOMEM)
-      add_head(&GFP->list, n);
-    else
-      bug("munmap(%p) failed: %m", n - FP_NODE_OFFSET);
+    check_count++;
+    last = next;
   }
+
+  /* The actual number of pages must be equal to the counter value. */
+  ASSERT_DIE(check_count == pages_kept_here);
+
+  /* Repeatedly try to insert the whole page list into the global page stack at once. */
+  rcu_read_lock();
+  next = atomic_load_explicit(&page_stack, memory_order_acquire);
+
+  /* First we set the outwards pointer (from our last),
+   * then we try to set the inwards pointer to our first page. */
+  do atomic_store_explicit(&last->next, next, memory_order_release);
+  while (!atomic_compare_exchange_strong_explicit(
+        &page_stack, &next, local_page_stack,
+        memory_order_acq_rel, memory_order_acquire));
+  rcu_read_unlock();
+
+  /* Finished. Now the local stack is empty. */
+  local_page_stack = NULL;
+  pages_kept_here = 0;
+
+  /* Check the state of global page cache and maybe schedule its cleanup. */
+  atomic_fetch_sub_explicit(&pages_kept_locally, check_count, memory_order_relaxed);
+  if (atomic_fetch_add_explicit(&pages_kept, check_count, memory_order_relaxed) >= KEEP_PAGES_MAX)
+    SCHEDULE_CLEANUP;
 }
 
-void
-init_pages(struct birdloop *loop)
+#ifdef HAVE_MMAP
+static void
+page_cleanup(void *_ UNUSED)
 {
-  struct free_pages *fp = &loop->pages;
+  /* Cleanup on shutdown is ignored. All pages may be kept hot, OS will take care. */
+  if (shutting_down)
+    return;
+
+  struct free_page *stack = atomic_exchange_explicit(&page_stack, NULL, memory_order_acq_rel);
+  if (!stack)
+    return;
 
-  init_list(&fp->list);
-  fp->cleanup = ev_new_init(loop->parent->loop->pool, cleanup_pages, loop);
-  fp->cleanup->list = (loop->parent->loop == &main_birdloop) ? &global_work_list : birdloop_event_list(loop->parent->loop);
-  fp->min = 4;
-  fp->max = 16;
-
-  for (fp->cnt = 0; fp->cnt < fp->min; fp->cnt++)
+  do {
+    synchronize_rcu();
+    struct free_page *fp = stack;
+    stack = atomic_load_explicit(&fp->next, memory_order_acquire);
+
+    LOCK_DOMAIN(resource, empty_pages_domain);
+    /* Empty pages are stored as pointers. To store them, we need a pointer block. */
+    if (!empty_pages || (empty_pages->pos == EP_POS_MAX))
+    {
+      /* There is either no pointer block or the last block is full. We use this block as a pointer block. */
+      empty_pages = (struct empty_pages *) fp;
+      *empty_pages = (struct empty_pages) {};
+    }
+    else
+    {
+      /* We store this block as a pointer into the first free place
+       * and tell the OS that the underlying memory is trash. */
+      empty_pages->pages[empty_pages->pos++] = fp;
+      if (madvise(fp, page_size,
+#ifdef CONFIG_MADV_DONTNEED_TO_FREE
+            MADV_DONTNEED
+#else
+            MADV_FREE
+#endif
+            ) < 0)
+        bug("madvise(%p) failed: %m", fp);
+    }
+    UNLOCK_DOMAIN(resource, empty_pages_domain);
+  }
+  while ((atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX / 2) && stack);
+
+  while (stack)
   {
-    node *n = alloc_sys_page();
-    add_tail(&fp->list, n + FP_NODE_OFFSET);
+    struct free_page *f = stack;
+    stack = atomic_load_explicit(&f->next, memory_order_acquire);
+    free_page(f);
+
+    atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed);
   }
 }
+#endif
 
-static event global_free_pages_cleanup_event = { .hook = cleanup_global_pages, .list = &global_work_list };
-
-void resource_sys_init(void)
+void
+resource_sys_init(void)
 {
+#ifdef CONFIG_DISABLE_THP
+  /* Disable transparent huge pages, they do not work properly with madvise(MADV_DONTNEED) */
+  if (prctl(PR_SET_THP_DISABLE, (unsigned long) 1, (unsigned long) 0, (unsigned long) 0, (unsigned long) 0) < 0)
+    log(L_WARN "Cannot disable transparent huge pages: prctl(PR_SET_THP_DISABLE) failed: %m");
+#endif
+
+#ifdef HAVE_MMAP
+  /* Check what page size the system supports */
   if (!(page_size = sysconf(_SC_PAGESIZE)))
     die("System page size must be non-zero");
 
-  if (u64_popcount(page_size) == 1)
+  if ((u64_popcount(page_size) == 1) && (page_size >= (1 << 10)) && (page_size <= (1 << 18)))
   {
-    init_list(&GFP->list);
-    GFP->cleanup = &global_free_pages_cleanup_event;
-    GFP->min = 0;
-    GFP->max = 256;
+    /* We assume that page size has only one bit set and is between 1K and 256K (incl.).
+     * Otherwise, the assumptions in lib/slab.c (sl_head's num_full range) aren't met. */
+
+    empty_pages_domain = DOMAIN_NEW(resource, "Empty Pages");
+    initialized = 1;
     return;
   }
 
-#ifdef HAVE_ALIGNED_ALLOC
-  log(L_WARN "Got strange memory page size (%lu), using the aligned allocator instead", page_size);
-#else
-  die("Got strange memory page size (%lu) and aligned_alloc is not available", page_size);
-#endif
-
   /* Too big or strange page, use the aligned allocator instead */
-  page_size = 4096;
+  log(L_WARN "Got strange memory page size (%ld), using the aligned allocator instead", (s64) page_size);
   use_fake = 1;
-}
-
-#else
+#endif
 
-void
-resource_sys_init(void)
-{
   page_size = 4096;
-  use_fake = 1;
+  initialized = 1;
 }
-
-#endif