Debug:Qutex: Add deadlock detection based on elapsed time

We now flag a deadlock as likely once a Lockvoker has existed for at
least DEBUG_CONFIG_QUTEX_DEADLOCK_TIMEOUT_MS milliseconds without
acquiring its locks. This is the preliminary work required to do a
backtrace through the call stack and figure out whether a deadlock has
really occurred.

To do this, we'd have to go through the async call chain and
search for a previous caller which acquired the same qutex as
the one that first failed during this Lockvoker LockSet acquisition
attempt.
This commit is contained in:
2025-09-21 15:11:28 -04:00
parent dbc9569775
commit d2ed525106
6 changed files with 59 additions and 9 deletions
+8
View File
@@ -26,6 +26,14 @@ math(EXPR MIND_VOSCILLATOR_FREQ_MS "1000 / ${MIND_VOSCILLATOR_PERIOD_MS}")
# World thread configuration
option(WORLD_USE_BODY_THREAD
"Use body thread for world component instead of separate world thread" OFF)
# Qutex deadlock detection configuration.
# The value is pasted verbatim into a C/C++ #define by the config header
# template, so it has to be a plain decimal integer literal:
#   - no fractional/exponent forms ("0.5", "1e3"), which a numeric GREATER
#     comparison would let through;
#   - no leading zero ("0500"), which C/C++ would read as an octal literal.
set(DEBUG_CONFIG_QUTEX_DEADLOCK_TIMEOUT_MS 500 CACHE STRING
    "Timeout in milliseconds for deadlock detection in qutex system")
if(NOT DEBUG_CONFIG_QUTEX_DEADLOCK_TIMEOUT_MS MATCHES "^[1-9][0-9]*$")
    message(FATAL_ERROR
        "DEBUG_CONFIG_QUTEX_DEADLOCK_TIMEOUT_MS must be a positive integer > 0 "
        "(got '${DEBUG_CONFIG_QUTEX_DEADLOCK_TIMEOUT_MS}')")
endif()
# Test configuration
option(ENABLE_TESTS "Enable building tests" OFF)
+3
View File
@@ -12,6 +12,9 @@
/* World thread configuration */
#cmakedefine WORLD_USE_BODY_THREAD
/* Qutex deadlock detection configuration */
#define DEBUG_CONFIG_QUTEX_DEADLOCK_TIMEOUT_MS @DEBUG_CONFIG_QUTEX_DEADLOCK_TIMEOUT_MS@
/* Cross-compilation configuration */
#cmakedefine CMAKE_CROSSCOMPILING
+4 -1
View File
@@ -81,9 +81,12 @@ public:
/**
* @brief Try to acquire all locks in order; back off if acquisition fails
* @param lockvoker The LockerAndInvoker attempting to acquire the locks
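* @param firstFailedQutex Optional output parameter. When non-null and
* acquisition fails, it receives the first Qutex that failed acquisition;
* pass nullptr to skip this reporting.
* NOTE(review): the caller passes the address of a `Qutex *` and the
* definition stores a Qutex address through this parameter, so it
* probably needs to be declared `Qutex **` -- confirm.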
* @return true if all locks were acquired, false otherwise
*/
bool tryAcquireOrBackOff(LockerAndInvokerBase &lockvoker);
bool tryAcquireOrBackOff(
LockerAndInvokerBase &lockvoker, Qutex *firstFailedQutex = nullptr);
void unregisterFromQutexQueues();
/**
+3 -2
View File
@@ -24,8 +24,8 @@ public:
/**
* @brief Constructor
*/
Qutex()
: isOwned(false)
// A new qutex starts out unowned; _name is copied into `name` as its label.
Qutex(const std::string &_name)
    : isOwned{false},
      name{_name}
{
}
/**
@@ -88,6 +88,7 @@ public:
SpinLock lock;
LockerAndInvokerBase::List queue;
bool isOwned;
std::string name;
};
} // namespace smo
+34 -5
View File
@@ -4,6 +4,8 @@
#include <functional>
#include <memory>
#include <atomic>
#include <chrono>
#include <iostream>
#include <componentThread.h>
#include <lockSet.h>
#include <asynchronousContinuation.h>
@@ -63,7 +65,8 @@ public:
: LockerAndInvokerBase(&serializedContinuation),
serializedContinuation(serializedContinuation),
target(target),
invocationTarget(std::move(invocationTarget))
invocationTarget(std::move(invocationTarget)),
creationTimestamp(std::chrono::steady_clock::now())
{
firstWake();
}
@@ -81,11 +84,28 @@ public:
"executing on wrong ComponentThread");
}
Qutex *firstFailedQutexPtr = nullptr;
bool isDeadlockLikely = isDeadlockLikely();
if (!serializedContinuation.requiredLocks.tryAcquireOrBackOff(
*this))
*this, (isDeadlockLikely ? &firstFailedQutexPtr : nullptr)))
{
// Just allow this lockvoker to be dropped from its io_service.
allowAwakening();
if (!isDeadlockLikely)
{ return; }
Qutex &firstFailedQutex = *firstFailedQutexPtr;
std::cerr << __func__ << ": Deadlock likely: "
<< "Lockvoker has been waiting for "
<< std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - creationTimestamp)
.count()
<< "ms, failed on qutex @" << &firstFailedQutex
<< " (" << firstFailedQutex.name << ")" << std::endl;
return;
}
@@ -137,9 +157,8 @@ public:
target->getIoService().post(*this);
}
/**
* @brief Allow awakening by resetting the awake flag
*/
private:
// Stores false into the continuation's isAwakeOrBeingAwakened flag.
// Called after a failed lock acquisition attempt.
void allowAwakening()
{
    serializedContinuation.isAwakeOrBeingAwakened.store(false);
}
@@ -158,11 +177,21 @@ public:
awaken(true);
}
// Reports whether at least DEBUG_CONFIG_QUTEX_DEADLOCK_TIMEOUT_MS
// milliseconds have passed since creationTimestamp was recorded.
bool isDeadlockLikely() const
{
    using std::chrono::milliseconds;
    using std::chrono::steady_clock;

    const milliseconds waited = std::chrono::duration_cast<milliseconds>(
        steady_clock::now() - creationTimestamp);

    return waited.count() >= DEBUG_CONFIG_QUTEX_DEADLOCK_TIMEOUT_MS;
}
private:
SerializedAsynchronousContinuation<OriginalCbFnT>
&serializedContinuation;
InvocationTargetT invocationTarget;
std::shared_ptr<ComponentThread> target;
std::chrono::steady_clock::time_point creationTimestamp;
};
};
+7 -1
View File
@@ -47,7 +47,9 @@ void LockSet<OriginalCbFnT>::registerInQutexQueues(
}
template <class OriginalCbFnT>
bool LockSet<OriginalCbFnT>::tryAcquireOrBackOff(LockerAndInvokerBase &lockvoker)
bool LockSet<OriginalCbFnT>::tryAcquireOrBackOff(
LockerAndInvokerBase &lockvoker, Qutex *firstFailedQutex
)
{
if (!registeredInQutexQueues)
{
@@ -72,6 +74,10 @@ bool LockSet<OriginalCbFnT>::tryAcquireOrBackOff(LockerAndInvokerBase &lockvoker
if (!lockUsageDesc.first.get().tryAcquire(
lockvoker, nRequiredLocks))
{
// Set the first failed qutex for debugging
if (firstFailedQutex) {
*firstFailedQutex = &lockUsageDesc.first.get();
}
break;
}