-rw-r--r--   sysdep/cf/linux.h     1
-rw-r--r--   sysdep/unix/alloc.c 113
2 files changed, 92 insertions, 22 deletions
diff --git a/sysdep/cf/linux.h b/sysdep/cf/linux.h
index c640bef4..56ecf017 100644
--- a/sysdep/cf/linux.h
+++ b/sysdep/cf/linux.h
@@ -20,6 +20,7 @@
#define CONFIG_RESTRICTED_PRIVILEGES
#define CONFIG_INCLUDE_SYSPRIV_H "sysdep/linux/syspriv.h"
+#define CONFIG_MADV_DONTNEED_TO_FREE
#ifndef AF_MPLS
#define AF_MPLS 28
diff --git a/sysdep/unix/alloc.c b/sysdep/unix/alloc.c
index 847def30..3ea10c32 100644
--- a/sysdep/unix/alloc.c
+++ b/sysdep/unix/alloc.c
@@ -45,6 +45,18 @@ struct free_page {
};
#endif
+#define EP_POS_MAX ((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *))
+
+struct empty_pages {
+ struct empty_pages *next;
+ uint pos;
+ void *pages[0];
+};
+
+DEFINE_DOMAIN(resource);
+static DOMAIN(resource) empty_pages_domain;
+static struct empty_pages *empty_pages = NULL;
+
static struct free_page * _Atomic page_stack = NULL;
static _Thread_local struct free_page * local_page_stack = NULL;
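For reference: a keeper page stores cold page pointers inline after its small
header, so EP_POS_MAX is simply how many void pointers fit into the rest of
the page. A standalone sketch of that arithmetic, assuming a 4096-byte page
and plain offsetof() in place of BIRD's OFFSETOF macro:

#include <stddef.h>
#include <stdio.h>

struct empty_pages {
  struct empty_pages *next;
  unsigned pos;
  void *pages[];          /* flexible array member; BIRD spells it pages[0] */
};

int main(void)
{
  size_t page_size = 4096;  /* assumed; the real code asks sysconf(_SC_PAGESIZE) */
  size_t ep_pos_max = (page_size - offsetof(struct empty_pages, pages))
                      / sizeof(void *);
  printf("%zu cold page pointers per keeper page\n", ep_pos_max);
  /* prints 510 on a typical LP64 system: (4096 - 16) / 8 */
  return 0;
}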
@@ -76,6 +88,7 @@ extern int shutting_down; /* Shutdown requested. */
void *
alloc_page(void)
{
+ /* If the system page allocator is goofy, we use posix_memalign to get aligned blocks of memory. */
if (use_fake)
{
void *ptr = NULL;
@@ -88,6 +101,7 @@ alloc_page(void)
}
#ifdef HAVE_MMAP
+ /* If there is any free page kept hot in this thread, we use it. */
struct free_page *fp = local_page_stack;
if (fp)
{
@@ -97,6 +111,7 @@ alloc_page(void)
return fp;
}
+ /* If there is any free page kept hot in global storage, we use it. */
rcu_read_lock();
fp = atomic_load_explicit(&page_stack, memory_order_acquire);
while (fp && !atomic_compare_exchange_strong_explicit(
@@ -105,22 +120,43 @@ alloc_page(void)
;
rcu_read_unlock();
- if (!fp)
+ if (fp)
{
- void *ptr = alloc_sys_page();
- for (int i=1; i<ALLOC_PAGES_AT_ONCE; i++)
- free_page(ptr + page_size * i);
- return ptr;
+ atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed);
+ return fp;
+ }
+
+ /* If there is any free page kept cold, we use that. */
+ LOCK_DOMAIN(resource, empty_pages_domain);
+ if (empty_pages) {
+ if (empty_pages->pos)
+ /* Either the keeper page contains at least one cold page pointer; return that one */
+ fp = empty_pages->pages[--empty_pages->pos];
+ else
+ {
+ /* Or the keeper page has no cold page pointers left; return the keeper page itself */
+ fp = (struct free_page *) empty_pages;
+ empty_pages = empty_pages->next;
+ }
}
+ UNLOCK_DOMAIN(resource, empty_pages_domain);
- atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed);
- return fp;
+ if (fp)
+ return fp;
+
+ /* And in the worst case, allocate some new pages by mmap() */
+ void *ptr = alloc_sys_page();
+ for (int i=1; i<ALLOC_PAGES_AT_ONCE; i++)
+ free_page(ptr + page_size * i);
+
+ return ptr;
#endif
}
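The hot-path refill above is a lock-free (Treiber-style) stack pop wrapped in
an RCU read section, so a concurrent page_cleanup() cannot unmap a node while
a racing thread still dereferences its next pointer. A self-contained sketch
of just the CAS loop, with illustrative names (node, stack_top) and the RCU
calls left out:

#include <stdatomic.h>
#include <stddef.h>

struct node { struct node * _Atomic next; };
static struct node * _Atomic stack_top = NULL;

static struct node *pop(void)
{
  struct node *n = atomic_load_explicit(&stack_top, memory_order_acquire);
  /* Retry until the head is swung from n to n->next, or the stack drains;
   * on CAS failure, n is reloaded with the current head. */
  while (n && !atomic_compare_exchange_strong_explicit(
        &stack_top, &n,
        atomic_load_explicit(&n->next, memory_order_acquire),
        memory_order_acq_rel, memory_order_acquire))
    ;
  return n;
}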
void
free_page(void *ptr)
{
+ /* If the system page allocator is goofy, we just free the block and care no more. */
if (use_fake)
{
free(ptr);
@@ -128,6 +164,7 @@ free_page(void *ptr)
}
#ifdef HAVE_MMAP
+ /* We primarily try to keep the pages locally. */
struct free_page *fp = ptr;
if (shutting_down || (pages_kept_here < KEEP_PAGES_MAX_LOCAL))
{
@@ -137,6 +174,7 @@ free_page(void *ptr)
return;
}
+ /* If there are too many local pages, we add the free page to the global hot-free-page list */
rcu_read_lock();
struct free_page *next = atomic_load_explicit(&page_stack, memory_order_acquire);
@@ -146,17 +184,22 @@ free_page(void *ptr)
memory_order_acq_rel, memory_order_acquire));
rcu_read_unlock();
+ /* And if there are too many global hot free pages, we ask for page cleanup */
if (atomic_fetch_add_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX)
SCHEDULE_CLEANUP;
#endif
}
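The matching push in free_page() is the usual two-step: publish the node's
next pointer with a release store, then try to swing the global head to the
node, retrying while other threads move the head underneath us. A sketch with
the same illustrative types as above:

static void push(struct node *n)
{
  struct node *next = atomic_load_explicit(&stack_top, memory_order_acquire);
  /* The release store must precede the CAS that makes n reachable;
   * on CAS failure, next holds the new head and the store is redone. */
  do atomic_store_explicit(&n->next, next, memory_order_release);
  while (!atomic_compare_exchange_strong_explicit(
        &stack_top, &next, n,
        memory_order_acq_rel, memory_order_acquire));
}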
+/* When a routine is going to sleep for a long time, we flush the local
+ * hot page cache so we don't keep dirty pages around for nothing. */
void
flush_local_pages(void)
{
if (use_fake || !local_page_stack || shutting_down)
return;
+ /* We first count the pages to enable consistency checking.
+ * Also, we need to know the last page. */
struct free_page *last = local_page_stack, *next;
int check_count = 1;
while (next = atomic_load_explicit(&last->next, memory_order_acquire))
@@ -165,20 +208,26 @@ flush_local_pages(void)
last = next;
}
+ /* The actual number of pages must be equal to the counter value. */
ASSERT_DIE(check_count == pages_kept_here);
+ /* Repeatedly try to insert the whole page list into the global page stack at once. */
rcu_read_lock();
next = atomic_load_explicit(&page_stack, memory_order_acquire);
+ /* First we point our last page's next at the current global head,
+ * then we try to swing the head to our first page. */
do atomic_store_explicit(&last->next, next, memory_order_release);
while (!atomic_compare_exchange_strong_explicit(
&page_stack, &next, local_page_stack,
memory_order_acq_rel, memory_order_acquire));
rcu_read_unlock();
+ /* Finished. Now the local stack is empty. */
local_page_stack = NULL;
pages_kept_here = 0;
+ /* Check the state of global page cache and maybe schedule its cleanup. */
atomic_fetch_sub_explicit(&pages_kept_locally, check_count, memory_order_relaxed);
if (atomic_fetch_add_explicit(&pages_kept, check_count, memory_order_relaxed) >= KEEP_PAGES_MAX)
SCHEDULE_CLEANUP;
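flush_local_pages() reuses that pattern to splice the entire local list in a
single CAS: the tail's next pointer is aimed at the current global head, then
the head is swapped to the local first page, so the whole chain becomes
visible atomically. Sketch, same illustrative types:

static void push_chain(struct node *first, struct node *last)
{
  struct node *next = atomic_load_explicit(&stack_top, memory_order_acquire);
  /* Outward link first (tail -> old head), then the inward CAS
   * (head -> our first node); only the tail link is redone on failure. */
  do atomic_store_explicit(&last->next, next, memory_order_release);
  while (!atomic_compare_exchange_strong_explicit(
        &stack_top, &next, first,
        memory_order_acq_rel, memory_order_acquire));
}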
@@ -188,6 +237,7 @@ flush_local_pages(void)
static void
page_cleanup(void *_ UNUSED)
{
+ /* Cleanup on shutdown is skipped; all pages may stay hot, the OS will take care of them. */
if (shutting_down)
return;
@@ -195,18 +245,37 @@ page_cleanup(void *_ UNUSED)
if (!stack)
return;
- synchronize_rcu();
+ /* Cleanup gets called when the hot free page cache is too big;
+ * move some pages to the cold free page cache. */
do {
- struct free_page *f = stack;
- stack = atomic_load_explicit(&f->next, memory_order_acquire);
-
- if (munmap(f, page_size) == 0)
- continue;
- else if (errno != ENOMEM)
- bug("munmap(%p) failed: %m", f);
+ synchronize_rcu();
+ struct free_page *fp = stack;
+ stack = atomic_load_explicit(&fp->next, memory_order_acquire);
+
+ LOCK_DOMAIN(resource, empty_pages_domain);
+ /* Empty pages are stored as pointers. To store them, we need a pointer block. */
+ if (!empty_pages || (empty_pages->pos == EP_POS_MAX))
+ {
+ /* There is either no pointer block or the last one is full; we turn this freed page into a new pointer block. */
+ empty_pages = (struct empty_pages *) fp;
+ *empty_pages = (struct empty_pages) {};
+ }
else
- free_page(f);
+ {
+ /* We store this block as a pointer into the first free place
+ * and tell the OS that the underlying memory is trash. */
+ empty_pages->pages[empty_pages->pos++] = fp;
+ if (madvise(fp, page_size,
+#ifdef CONFIG_MADV_DONTNEED_TO_FREE
+ MADV_DONTNEED
+#else
+ MADV_FREE
+#endif
+ ) < 0)
+ bug("madvise(%p) failed: %m", fp);
+ }
+ UNLOCK_DOMAIN(resource, empty_pages_domain);
}
while ((atomic_fetch_sub_explicit(&pages_kept, 1, memory_order_relaxed) >= KEEP_PAGES_MAX / 2) && stack);
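The madvise() call is what makes a page cold: the virtual mapping stays
valid, but the kernel may reclaim the physical frame, and the next write
simply faults in fresh zeroed memory. On Linux (which defines
CONFIG_MADV_DONTNEED_TO_FREE above), MADV_DONTNEED frees the memory
immediately; the MADV_FREE branch covers systems where MADV_DONTNEED lacks
that effect. A small standalone demonstration of the Linux semantics on a
private anonymous mapping:

#include <assert.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
  long psz = sysconf(_SC_PAGESIZE);
  char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  assert(p != MAP_FAILED);

  memset(p, 0xab, psz);                         /* dirty the page */
  assert(madvise(p, psz, MADV_DONTNEED) == 0);  /* hand the frame back */
  assert(p[0] == 0);      /* still mapped; access faults in a zero page */

  munmap(p, psz);
  return 0;
}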
@@ -225,22 +294,22 @@ void
resource_sys_init(void)
{
#ifdef HAVE_MMAP
+ /* Check what page size the system supports */
if (!(page_size = sysconf(_SC_PAGESIZE)))
die("System page size must be non-zero");
- if (u64_popcount(page_size) == 1)
+ if ((u64_popcount(page_size) == 1) && (page_size >= (1 << 10)) && (page_size <= (1 << 18)))
{
+ /* We assume the page size is a power of two (single bit set) between 1K and 256K (incl.).
+ * Otherwise, the assumptions in lib/slab.c (sl_head's num_full range) aren't met. */
- for (int i = 0; i < (KEEP_PAGES_MIN * 2); i++)
- free_page(alloc_page());
-
- page_cleanup(NULL);
+ empty_pages_domain = DOMAIN_NEW(resource, "Empty Pages");
initialized = 1;
return;
}
/* Too big or strange page, use the aligned allocator instead */
- log(L_WARN "Got strange memory page size (%lu), using the aligned allocator instead", page_size);
+ log(L_WARN "Got strange memory page size (%ld), using the aligned allocator instead", (s64) page_size);
use_fake = 1;
#endif
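The new guard accepts only page sizes that are a power of two between 1 KiB
and 256 KiB inclusive, per the lib/slab.c constraint noted in the comment;
anything else falls through to the posix_memalign-based fake allocator. The
condition is equivalent to this sketch (page_size_usable is an illustrative
name):

#include <stdbool.h>

static bool page_size_usable(unsigned long ps)
{
  bool pow2 = ps && !(ps & (ps - 1));   /* same test as u64_popcount(ps) == 1 */
  return pow2 && ps >= (1UL << 10) && ps <= (1UL << 18);
}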