Skip to content

Commit

Permalink
Prevent pthread startup hangs (#411)
Browse files Browse the repository at this point in the history
The hang scenario is as follows:

1. thread A starts executing a pthread function, which takes a critical
pthread lock.
2. thread B is creating a new thread, and enters thread creation/busy
state. It tries to take the same lock, and pauses.
3. thread C starts a collection, and signals all threads
4. thread A pauses waiting for the resume signal with the lock held
5. deadlock -- thread A holds the critical lock. Thread B cannot
continue, but is in a busy state (sets state to Delayed). Thread C is
waiting for all threads to leave the busy state.

The solution is to absolutely forbid sending signals while a thread is
starting.

In order to achieve this, we allocate a bit of the `startingThreadCount`
atomic uint. If this bit is set, it means a thread has taken the lock,
and crucially, when it was set *no threads were starting*.

Any thread that then starts will increment `startingThreadCount`, but
find the bit is set. If the bit is set, and it's not the thread that is
running the collection (the collection thread is allowed to start worker
threads), then the `stopTheWorld` lock is taken to pause that thread
until the collection is over.

The thread stopping the world is now free to send signals to all
threads, knowing that there will be no thread that is in the critical
stage of starting up.
  • Loading branch information
schveiguy authored Jan 14, 2025
1 parent 45fdd67 commit 427d047
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 40 deletions.
3 changes: 2 additions & 1 deletion sdlib/d/gc/scanner.d
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ public:

// First thing, start the worker threads, so they can do work ASAP.
foreach (ref tid; threads) {
pthread_create(&tid, null, markThreadEntry, cast(void*) &this);
import d.rt.trampoline;
createGCThread(&tid, null, markThreadEntry, cast(void*) &this);
}

// Scan the roots.
Expand Down
47 changes: 26 additions & 21 deletions sdlib/d/gc/thread.d
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,13 @@ void createProcess() {
registerTlsSegments();
}

void createThread() {
void createThread(bool AllowStopTheWorld)() {
enterBusyState();
scope(exit) {
exitThreadCreation();
if (AllowStopTheWorld) {
allowStopTheWorld();
}

exitBusyState();
}

Expand All @@ -49,12 +52,12 @@ void destroyThread() {
gThreadState.remove(&threadCache);
}

void enterThreadCreation() {
gThreadState.enterThreadCreation();
void preventStopTheWorld() {
gThreadState.preventStopTheWorld();
}

void exitThreadCreation() {
gThreadState.exitThreadCreation();
void allowStopTheWorld() {
gThreadState.allowStopTheWorld();
}

uint getRegisteredThreadCount() {
Expand Down Expand Up @@ -114,9 +117,6 @@ void initThread() {

struct ThreadState {
private:
import d.sync.atomic;
Atomic!uint startingThreadCount;

import d.sync.mutex;
shared Mutex mStats;

Expand All @@ -126,21 +126,24 @@ private:
Mutex mThreadList;
ThreadRing registeredThreads;

Mutex stopTheWorldMutex;
import d.sync.sharedlock;
shared SharedLock stopTheWorldLock;

public:
/**
* Thread management.
* Stop the world prevention.
*/
void enterThreadCreation() shared {
startingThreadCount.fetchAdd(1);
void preventStopTheWorld() shared {
stopTheWorldLock.sharedLock();
}

void exitThreadCreation() shared {
auto s = startingThreadCount.fetchSub(1);
assert(s > 0, "enterThreadCreation was not called!");
void allowStopTheWorld() shared {
stopTheWorldLock.sharedUnlock();
}

/**
* Thread management.
*/
void register(ThreadCache* tcache) shared {
mThreadList.lock();
scope(exit) mThreadList.unlock();
Expand Down Expand Up @@ -181,11 +184,12 @@ public:
import d.gc.hooks;
__sd_gc_pre_stop_the_world_hook();

stopTheWorldMutex.lock();
// Prevent any new threads from being created
stopTheWorldLock.exclusiveLock();

uint count;
while (suspendRunningThreads(count++)
|| startingThreadCount.load() > 0) {

while (suspendRunningThreads(count++)) {
import sys.posix.sched;
sched_yield();
}
Expand All @@ -197,14 +201,15 @@ public:
sched_yield();
}

stopTheWorldMutex.unlock();
// Allow thread creation again.
stopTheWorldLock.exclusiveUnlock();

import d.gc.hooks;
__sd_gc_post_restart_the_world_hook();
}

void scanSuspendedThreads(ScanDg scan) shared {
assert(stopTheWorldMutex.isHeld());
assert(stopTheWorldLock.count == SharedLock.Exclusive);

mThreadList.lock();
scope(exit) mThreadList.unlock();
Expand Down
38 changes: 20 additions & 18 deletions sdlib/d/rt/trampoline.d
Original file line number Diff line number Diff line change
Expand Up @@ -12,31 +12,33 @@ extern(C) int pthread_create(pthread_t* thread, const pthread_attr_t* attr,
PthreadFunction start_routine, void* arg) {
auto runner = new ThreadRunner(start_routine, arg);

/**
* We do not want the GC to stop this specific thread
* while it is creating another thread.
*/
enterBusyState();
scope(exit) exitBusyState();

/**
* We notify the GC that we are starting a new thread.
* This allows the GC to not stop the workd while a thread
* is being created.
*/
enterThreadCreation();
// Stop the world cannot happen during thread startup.
preventStopTheWorld();

auto ret =
pthread_create_trampoline(thread, attr, cast(PthreadFunction) runThread,
runner);
pthread_create_trampoline(thread, attr,
cast(PthreadFunction) runThread!true, runner);
if (ret != 0) {
// The spawned thread will call this when there are no errors.
exitThreadCreation();
allowStopTheWorld();
}

return ret;
}

/**
* This special hook does not involve preventing "Stop the world". otherwise
* creates a thread in the same way.
*/
int createGCThread(pthread_t* thread, const pthread_attr_t* attr,
PthreadFunction start_routine, void* arg) {
auto runner = new ThreadRunner(start_routine, arg);

auto ret = pthread_create_trampoline(
thread, attr, cast(PthreadFunction) runThread!false, runner);
return ret;
}

private:

struct ThreadRunner {
Expand All @@ -49,11 +51,11 @@ struct ThreadRunner {
}
}

void* runThread(ThreadRunner* runner) {
void* runThread(bool AllowStopTheWorld)(ThreadRunner* runner) {
auto fun = runner.fun;
auto arg = runner.arg;

createThread();
createThread!AllowStopTheWorld();
__sd_gc_free(runner);

// Make sure we clean up after ourselves.
Expand Down

0 comments on commit 427d047

Please sign in to comment.