Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,7 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de
$(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h
# Header dependencies of the GC objects: each rule lists the release object (.o)
# and the matching debug object (.dbg.obj) built from the same source file.
$(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
$(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
$(BUILDDIR)/gc-stacks.o $(BUILDDIR)/gc-stacks.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
$(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc-stock.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h
$(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h
$(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc-alloc-profiler.h
Expand Down
53 changes: 30 additions & 23 deletions src/gc-stacks.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// This file is a part of Julia. License is MIT: https://julialang.org/license

#include "gc-common.h"
#include "gc-stock.h"
#include "threading.h"
#ifndef _OS_WINDOWS_
# include <sys/resource.h>
Expand Down Expand Up @@ -202,7 +203,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO
return stk;
}

void sweep_stack_pools(void) JL_NOTSAFEPOINT
void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT
{
// Stack sweeping algorithm:
// // deallocate stacks if we have too many sitting around unused
Expand All @@ -215,33 +216,38 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT
// bufsz = t->bufsz
// if (stkbuf)
// push(free_stacks[sz], stkbuf)
assert(gc_n_threads);
for (int i = 0; i < gc_n_threads; i++) {
jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
while (1) {
int i = jl_atomic_fetch_add_relaxed(&gc_ptls_sweep_idx, -1);
if (i < 0)
break;
jl_ptls_t ptls2 = gc_all_tls_states[i];
if (ptls2 == NULL)
continue;

// free half of stacks that remain unused since last sweep
for (int p = 0; p < JL_N_STACK_POOLS; p++) {
small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p];
size_t n_to_free;
if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
n_to_free = al->len; // not alive yet or dead, so it does not need these anymore
}
else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) {
n_to_free = al->len / 2;
if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL))
n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL;
}
else {
n_to_free = 0;
}
for (int n = 0; n < n_to_free; n++) {
void *stk = small_arraylist_pop(al);
free_stack(stk, pool_sizes[p]);
}
if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
small_arraylist_free(al);
if (i == jl_atomic_load_relaxed(&gc_stack_free_idx)) {
for (int p = 0; p < JL_N_STACK_POOLS; p++) {
small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p];
size_t n_to_free;
if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
n_to_free = al->len; // not alive yet or dead, so it does not need these anymore
}
else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) {
n_to_free = al->len / 2;
if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL))
n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL;
}
else {
n_to_free = 0;
}
for (int n = 0; n < n_to_free; n++) {
void *stk = small_arraylist_pop(al);
free_stack(stk, pool_sizes[p]);
}
if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
small_arraylist_free(al);
}
}
}
if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
Expand Down Expand Up @@ -287,6 +293,7 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT
}
live_tasks->len -= ndel;
}
jl_atomic_fetch_add(&gc_n_threads_sweeping, -1);
}

JL_DLLEXPORT jl_array_t *jl_live_tasks(void)
Expand Down
63 changes: 57 additions & 6 deletions src/gc-stock.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ _Atomic(int) gc_n_threads_sweeping;
_Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch;
// `tid` of mutator thread that triggered GC
_Atomic(int) gc_master_tid;
// work-sharing cursor for stack-pool sweeping: `sweep_stack_pools` stores
// `gc_n_threads - 1` here and each sweeper claims an index into
// `gc_all_tls_states` by atomically decrementing it; once the value read is
// negative there is nothing left to claim
_Atomic(int) gc_ptls_sweep_idx;
// round-robin index advanced once per `sweep_stack_pools` call; while sweeping,
// the ptls whose index equals this value has its pooled free stacks handed to
// `free_stack` (i.e. given back to the OS)
_Atomic(int) gc_stack_free_idx = 0;
// `tid` of first GC thread
int gc_first_tid;
// Mutex/cond used to synchronize wakeup of GC threads on parallel marking
Expand Down Expand Up @@ -994,13 +998,50 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa
// sweep over all memory that is being used and not in a pool
// Calls, in order, the sweep routines listed in the body; `ptls` is forwarded to
// `sweep_big` and `gc_all_tls_states` to `jl_engine_sweep`.
// `sweep_full` is not read anywhere in the body shown here.
static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
{
// NOTE(review): this zero-argument call does not match the
// `sweep_stack_pools(jl_ptls_t)` definition later in this file, and
// `_jl_gc_collect` already calls `sweep_stack_pools(ptls)` right before
// `gc_sweep_other` -- this looks like the pre-change line of the diff; confirm
// it is absent from the final file.
sweep_stack_pools();
gc_sweep_foreign_objs();
sweep_malloced_memory();
sweep_big(ptls);
jl_engine_sweep(gc_all_tls_states);
}

// Ask every parallel collector thread to help sweep the stack pools.
//
// While holding `gc_threads_lock`, increments `gc_stack_sweep_requested` on the
// ptls of each thread in the parallel-collector tid range, then broadcasts on
// `gc_threads_cond` so that threads waiting on that condition re-check their
// wake-up predicate. `ptls` is not used by the body.
void gc_sweep_wake_all_stacks(jl_ptls_t ptls) JL_NOTSAFEPOINT
{
    uv_mutex_lock(&gc_threads_lock);
    int tid = gc_first_parallel_collector_thread_id();
    int last_tid = gc_last_parallel_collector_thread_id();
    while (tid <= last_tid) {
        jl_ptls_t collector = gc_all_tls_states[tid];
        gc_check_ptls_of_parallel_collector_thread(collector);
        jl_atomic_fetch_add(&collector->gc_tls.gc_stack_sweep_requested, 1);
        tid++;
    }
    uv_cond_broadcast(&gc_threads_cond);
    uv_mutex_unlock(&gc_threads_lock);
}

// Spin until the stack-pool sweep is over, i.e. until both conditions hold:
//   - `gc_ptls_sweep_idx` is negative (every index has been claimed), and
//   - `gc_n_threads_sweeping` is zero (no thread is still inside the sweep loop).
// The second counter is only read once the first condition is satisfied.
void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT
{
    for (;;) {
        if (jl_atomic_load_acquire(&gc_ptls_sweep_idx) < 0 &&
            jl_atomic_load_acquire(&gc_n_threads_sweeping) == 0) {
            break;
        }
        jl_cpu_pause();
    }
}

// Sweep the per-thread stack pools, sharing the work with the parallel
// collector threads.
//
// Steps:
//   1. advance `gc_stack_free_idx` by one, wrapping to 0 when the next value
//      would equal `gc_n_threads`;
//   2. publish `gc_n_threads - 1` in `gc_ptls_sweep_idx` as the first index to
//      be claimed;
//   3. wake the collector threads, run the sweep loop on the calling thread as
//      well, then spin until the whole sweep has finished.
void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT
{
    assert(gc_n_threads);

    // rotate the index that `sweep_stack_pool_loop` compares against when it
    // decides whose pooled stacks to free during this collection
    int prev_free_idx = jl_atomic_load_relaxed(&gc_stack_free_idx);
    int next_free_idx = (prev_free_idx + 1 == gc_n_threads) ? 0 : prev_free_idx + 1;
    jl_atomic_store_relaxed(&gc_stack_free_idx, next_free_idx);

    // sweepers claim indices downwards, starting from the last ptls
    jl_atomic_store_release(&gc_ptls_sweep_idx, gc_n_threads - 1);

    gc_sweep_wake_all_stacks(ptls);
    sweep_stack_pool_loop();
    gc_sweep_wait_for_all_stacks();
}

static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT
{
assert(pg->fl_begin_offset != UINT16_MAX);
Expand Down Expand Up @@ -1076,7 +1117,7 @@ int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_sc
}

// wake up all threads to sweep the pages
void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
void gc_sweep_wake_all_pages(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
{
int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch);
if (parallel_sweep_worthwhile && !page_profile_enabled) {
Expand Down Expand Up @@ -1112,7 +1153,7 @@ void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_
}

// wait for all threads to finish sweeping
void gc_sweep_wait_for_all(void)
void gc_sweep_wait_for_all_pages(void)
{
jl_atomic_store(&gc_allocd_scratch, NULL);
while (jl_atomic_load_acquire(&gc_n_threads_sweeping) != 0) {
Expand Down Expand Up @@ -1258,9 +1299,9 @@ static void gc_sweep_pool(void)
// the actual sweeping
jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
jl_ptls_t ptls = jl_current_task->ptls;
gc_sweep_wake_all(ptls, new_gc_allocd_scratch);
gc_sweep_wake_all_pages(ptls, new_gc_allocd_scratch);
gc_sweep_pool_parallel(ptls);
gc_sweep_wait_for_all();
gc_sweep_wait_for_all_pages();

// reset half-pages pointers
for (int t_i = 0; t_i < n_threads; t_i++) {
Expand Down Expand Up @@ -3069,6 +3110,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
#endif
current_sweep_full = sweep_full;
sweep_weak_refs();
sweep_stack_pools(ptls);
gc_sweep_other(ptls, sweep_full);
gc_scrub();
gc_verify_tags();
Expand Down Expand Up @@ -3480,6 +3522,10 @@ STATIC_INLINE int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT
return (jl_atomic_load(&ptls->gc_tls.gc_sweeps_requested) > 0);
}

// Returns nonzero when `ptls` has a pending stack-pool sweep request, i.e. when
// its `gc_stack_sweep_requested` counter is above zero.
STATIC_INLINE int may_sweep_stack(jl_ptls_t ptls) JL_NOTSAFEPOINT
{
    int pending = jl_atomic_load(&ptls->gc_tls.gc_stack_sweep_requested) > 0;
    return pending;
}
// parallel gc thread function
void jl_parallel_gc_threadfun(void *arg)
{
Expand All @@ -3502,12 +3548,17 @@ void jl_parallel_gc_threadfun(void *arg)

while (1) {
uv_mutex_lock(&gc_threads_lock);
while (!may_mark() && !may_sweep(ptls)) {
while (!may_mark() && !may_sweep(ptls) && !may_sweep_stack(ptls)) {
uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
}
uv_mutex_unlock(&gc_threads_lock);
assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
gc_mark_loop_parallel(ptls, 0);
if (may_sweep_stack(ptls)) {
assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
sweep_stack_pool_loop();
jl_atomic_fetch_add(&ptls->gc_tls.gc_stack_sweep_requested, -1);
}
if (may_sweep(ptls)) {
assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
gc_sweep_pool_parallel(ptls);
Expand Down
4 changes: 3 additions & 1 deletion src/gc-stock.h
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,8 @@ extern uv_cond_t gc_threads_cond;
extern uv_sem_t gc_sweep_assists_needed;
extern _Atomic(int) gc_n_threads_marking;
extern _Atomic(int) gc_n_threads_sweeping;
// next index into `gc_all_tls_states` to be claimed during stack-pool sweeping;
// sweepers decrement it atomically and stop once the value read is negative
extern _Atomic(int) gc_ptls_sweep_idx;
// round-robin index of the ptls whose pooled free stacks are freed during the
// current stack-pool sweep
extern _Atomic(int) gc_stack_free_idx;
extern _Atomic(int) n_threads_running;
extern uv_barrier_t thread_init_done;
void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
Expand All @@ -521,7 +523,7 @@ void gc_mark_loop_serial(jl_ptls_t ptls);
void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
void gc_sweep_pool_parallel(jl_ptls_t ptls);
void gc_free_pages(void);
// NOTE(review): gc-stock.c defines `sweep_stack_pools(jl_ptls_t ptls)`; this
// zero-argument prototype looks like the pre-change line of the diff -- confirm
// it is absent from the final header.
void sweep_stack_pools(void) JL_NOTSAFEPOINT;
// Work-sharing loop of the stack-pool sweep (defined in gc-stacks.c); called by
// `sweep_stack_pools` on the collecting thread and by `jl_parallel_gc_threadfun`
// on each collector thread that has a stack sweep request pending.
void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT;
void jl_gc_debug_init(void);

// GC pages
Expand Down
1 change: 1 addition & 0 deletions src/gc-tls.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ typedef struct {
jl_gc_markqueue_t mark_queue;
jl_gc_mark_cache_t gc_cache;
_Atomic(size_t) gc_sweeps_requested;
_Atomic(uint8_t) gc_stack_sweep_requested;
arraylist_t sweep_objs;
} jl_gc_tls_states_t;

Expand Down