2025-09-22 21:29:43 -04:00
|
|
|
#include <config.h>
|
|
|
|
|
#include <serializedAsynchronousContinuation.h>
|
|
|
|
|
#include <qutex.h>
|
|
|
|
|
|
|
|
|
|
namespace smo {
|
|
|
|
|
|
|
|
|
|
#ifdef CONFIG_ENABLE_DEBUG_LOCKS
|
2025-09-27 20:51:20 -04:00
|
|
|
|
|
|
|
|
template <class OriginalCbFnT>
|
|
|
|
|
template <class InvocationTargetT>
|
|
|
|
|
bool
|
|
|
|
|
SerializedAsynchronousContinuation<OriginalCbFnT>
|
|
|
|
|
::LockerAndInvoker<InvocationTargetT>
|
|
|
|
|
::traceContinuationHistoryForDeadlockOn(Qutex& firstFailedQutex)
|
|
|
|
|
{
|
|
|
|
|
/** EXPLANATION:
|
|
|
|
|
* In this function we will trace through the chain of continuations that
|
|
|
|
|
* led up to this Lockvoker's continuation. For each continuation which is
|
|
|
|
|
* a SerializedAsynchronousContinuation, we check through its LockSet to see
|
|
|
|
|
* if it contains the lock that failed acquisition. If it does, we have a
|
|
|
|
|
* deadlock.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/* We can't start with the continuation directly referenced by this starting
|
|
|
|
|
* Lockvoker as it would contain the all locks we're currently trying to
|
|
|
|
|
* acquire...and rightly so because it's the continuation for this current
|
|
|
|
|
* lockvoker.
|
|
|
|
|
*/
|
|
|
|
|
for (std::shared_ptr<AsynchronousContinuationChainLink> currContin =
|
2025-09-29 14:37:16 -04:00
|
|
|
this->serializedContinuation.getCallersContinuationShPtr();
|
2025-09-27 20:51:20 -04:00
|
|
|
currContin != nullptr;
|
2025-09-29 14:37:16 -04:00
|
|
|
currContin = currContin->getCallersContinuationShPtr())
|
2025-09-27 20:51:20 -04:00
|
|
|
{
|
|
|
|
|
auto serializedCont = std::dynamic_pointer_cast<
|
|
|
|
|
SerializedAsynchronousContinuation<OriginalCbFnT>>(currContin);
|
|
|
|
|
|
|
|
|
|
if (serializedCont == nullptr) { continue; }
|
|
|
|
|
|
|
|
|
|
// Check if the firstFailedQutex is in this continuation's LockSet
|
|
|
|
|
try {
|
|
|
|
|
const auto& lockUsageDesc = serializedCont->requiredLocks
|
|
|
|
|
.getLockUsageDesc(firstFailedQutex);
|
|
|
|
|
} catch (const std::runtime_error& e) {
|
|
|
|
|
std::cerr << __func__ << ": " << e.what() << std::endl;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::cout << __func__ << ":Deadlock detected: Found "
|
|
|
|
|
<< "firstFailedQutex @" << &firstFailedQutex
|
|
|
|
|
<< " (" << firstFailedQutex.name << ") in LockSet of "
|
|
|
|
|
<< "SerializedAsynchronousContinuation @"
|
|
|
|
|
<< serializedCont.get() << std::endl;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-29 12:58:41 -04:00
|
|
|
template <class OriginalCbFnT>
|
|
|
|
|
template <class InvocationTargetT>
|
|
|
|
|
bool
|
|
|
|
|
SerializedAsynchronousContinuation<OriginalCbFnT>
|
|
|
|
|
::LockerAndInvoker<InvocationTargetT>
|
2025-09-29 18:14:10 -04:00
|
|
|
::obsolete::traceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
|
2025-09-29 12:58:41 -04:00
|
|
|
{
|
2025-09-29 13:38:53 -04:00
|
|
|
/** EXPLANATION:
|
|
|
|
|
* In this function we check for gridlocks which are slightly different
|
|
|
|
|
* from deadlocks. In a gridlock, two requests are waiting for locks that
|
|
|
|
|
* are held by the other. I.e:
|
|
|
|
|
*
|
|
|
|
|
* R1 holds LockA and is waiting for LockB.
|
|
|
|
|
* R2 holds LockB and is waiting for LockA.
|
|
|
|
|
*
|
|
|
|
|
* This differs from deadlocks because it's not a single request which is
|
|
|
|
|
* attempting to re-acquire a lock that it already holds.
|
|
|
|
|
*
|
|
|
|
|
* To detect this condition, we wait until the acquisition timeout has
|
|
|
|
|
* expired. Then: we extract the current owner of the first lock we're
|
|
|
|
|
* failing to acquire.
|
|
|
|
|
*
|
|
|
|
|
* From there, we go through each of the locks in the foreign owner's
|
|
|
|
|
* current (i.e: immediate, most recent continuation's) required LockSet.
|
|
|
|
|
* For each of the locks in the foreign owner's most immediate required
|
|
|
|
|
* LockSet, we trace backward in our *OWN* history to see if any of *OUR*
|
|
|
|
|
* continuations (excluding our most immediate continuation) contains that
|
|
|
|
|
* lock.
|
|
|
|
|
*
|
|
|
|
|
* If we find a match, that means that we're holding a lock that the foreign
|
|
|
|
|
* owner is waiting for. And we already know that the foreign owner is
|
|
|
|
|
* holding a lock that we're waiting for (when we extracted the current
|
|
|
|
|
* owner of the first failed lock in our most immediate Lockset).
|
|
|
|
|
*
|
|
|
|
|
* Hence, we have a gridlock.
|
|
|
|
|
*/
|
|
|
|
|
|
2025-09-29 14:37:16 -04:00
|
|
|
std::shared_ptr<LockerAndInvokerBase> foreignOwnerShPtr =
|
|
|
|
|
firstFailedQutex.getCurrOwner();
|
2025-09-29 13:38:53 -04:00
|
|
|
// If no current owner, can't be a gridlock
|
2025-09-29 14:37:16 -04:00
|
|
|
if (foreignOwnerShPtr == nullptr)
|
2025-09-29 13:38:53 -04:00
|
|
|
{ return false; }
|
|
|
|
|
|
|
|
|
|
// Use reference for the rest of the function for safety.
|
2025-09-29 14:37:16 -04:00
|
|
|
LockerAndInvokerBase &foreignOwner = *foreignOwnerShPtr;
|
2025-09-29 13:38:53 -04:00
|
|
|
|
|
|
|
|
/* For each lock in the foreign owner's LockSet, check if we hold it
|
|
|
|
|
* in any of our previous continuations (excluding our most immediate one)
|
|
|
|
|
*/
|
|
|
|
|
for (size_t i = 0; i < foreignOwner.getLockSetSize(); ++i)
|
|
|
|
|
{
|
|
|
|
|
Qutex& foreignLock = foreignOwner.getLockAt(i);
|
|
|
|
|
|
|
|
|
|
/* Skip the firstFailedQutex since we already know the foreign owner
|
|
|
|
|
* holds it -- hence it's impossible for any of our previous
|
|
|
|
|
* continuations to hold it.
|
|
|
|
|
*/
|
|
|
|
|
if (&foreignLock == &firstFailedQutex)
|
|
|
|
|
{ continue; }
|
|
|
|
|
|
|
|
|
|
/** EXPLANATION:
|
|
|
|
|
* Trace backward through our continuation history (excluding our most
|
|
|
|
|
* immediate continuation).
|
|
|
|
|
*
|
|
|
|
|
* The reason we exclude our most immediate continuation is because the
|
|
|
|
|
* LockSet acquisition algorithm backs off if it fails to acquire ALL
|
|
|
|
|
* locks in the set. So if the lock that the foreign owner is waiting
|
|
|
|
|
* for is in our most immediate continuation, and NOT in one of our
|
|
|
|
|
* previous continuations, then we will back off and the foreign owner
|
|
|
|
|
* should eventually be able to acquire that lock.
|
|
|
|
|
*/
|
|
|
|
|
for (std::shared_ptr<AsynchronousContinuationChainLink> currContin =
|
2025-09-29 14:37:16 -04:00
|
|
|
this->serializedContinuation.getCallersContinuationShPtr();
|
2025-09-29 13:38:53 -04:00
|
|
|
currContin != nullptr;
|
2025-09-29 14:37:16 -04:00
|
|
|
currContin = currContin->getCallersContinuationShPtr())
|
2025-09-29 13:38:53 -04:00
|
|
|
{
|
|
|
|
|
auto serializedCont = std::dynamic_pointer_cast<
|
|
|
|
|
SerializedAsynchronousContinuation<OriginalCbFnT>>(currContin);
|
|
|
|
|
|
|
|
|
|
if (serializedCont == nullptr) { continue; }
|
|
|
|
|
|
|
|
|
|
// Check if this continuation holds the foreign lock
|
|
|
|
|
try {
|
|
|
|
|
const auto& lockUsageDesc = serializedCont->requiredLocks
|
|
|
|
|
.getLockUsageDesc(foreignLock);
|
|
|
|
|
|
|
|
|
|
// Matched! We hold a lock that the foreign owner is waiting for
|
|
|
|
|
std::cout << __func__ << ": Gridlock detected: We hold lock @"
|
|
|
|
|
<< &foreignLock << " (" << foreignLock.name << ") in "
|
|
|
|
|
"continuation @" << serializedCont.get()
|
|
|
|
|
<< ", while foreign owner @" << &foreignOwner
|
|
|
|
|
<< " holds lock @" << &firstFailedQutex << " ("
|
|
|
|
|
<< firstFailedQutex.name << ") that we're waiting for"
|
|
|
|
|
<< std::endl;
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
} catch (const std::runtime_error& e) {
|
|
|
|
|
// This continuation doesn't hold the foreign lock. Continue.
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-29 12:58:41 -04:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-29 18:14:10 -04:00
|
|
|
/** EXPLANATION - GRIDLOCK DETECTION ALGORITHM:
|
|
|
|
|
* This file implements gridlock detection algorithms that use a central
|
|
|
|
|
* acquisition history to track all lockvokers suspected of being gridlocked.
|
|
|
|
|
*
|
|
|
|
|
* ALGORITHM OVERVIEW:
|
|
|
|
|
* 1. When a lockvoker finds that DEADLOCK_TIMEOUT_MS has elapsed and it
|
|
|
|
|
* still can't acquire a particular lock (firstFailedQutex), it creates
|
|
|
|
|
* a new entry in a global acquisition history.
|
|
|
|
|
*
|
|
|
|
|
* 2. The acquisition history is an unordered_map with:
|
|
|
|
|
* - Key: std::shared_ptr<LockerAndInvokerBase>
|
|
|
|
|
* (the timed-out lockvoker -- aka, itself)
|
|
|
|
|
* - Value: std::pair<
|
|
|
|
|
* std::reference_wrapper<Qutex>,
|
|
|
|
|
* std::forward_list<std::reference_wrapper<Qutex>>>
|
|
|
|
|
* * pair.first: The firstFailedQutex that this lockvoker WANTS but
|
|
|
|
|
* can't acquire. This metadata is essential for later-arriving
|
|
|
|
|
* entrants to analyze what their predecessor timed-out sequences
|
|
|
|
|
* want.
|
|
|
|
|
* * pair.second: A list of all acquired Qutexes in this lockvoker's
|
|
|
|
|
* continuation history.
|
|
|
|
|
*
|
|
|
|
|
* 3. Each timed-out lockvoker:
|
|
|
|
|
* a) Adds itself to the acquisition history map with its wanted lock and
|
|
|
|
|
* acquired locks
|
|
|
|
|
* b) Iterates through all OTHER entries in the map (excluding itself)
|
|
|
|
|
* c) For each other entry, checks if that entry's acquired locks
|
|
|
|
|
* (pair.second) contains the lock that this lockvoker wants
|
|
|
|
|
* (aka: firstFailedQutex/pair.first)
|
|
|
|
|
* d) If found, we have detected a gridlock: two sequences where at least
|
|
|
|
|
* one wants a lock held by the other, and the other wants a lock that
|
|
|
|
|
* it can't acquire.
|
|
|
|
|
*
|
|
|
|
|
* GRIDLOCK CONDITION:
|
|
|
|
|
* A gridlock exists when we find a circular chain of dependencies:
|
|
|
|
|
* - Lockvoker A wants LockX but can't acquire it (held by Lockvoker B)
|
|
|
|
|
* - Lockvoker B wants LockY but can't acquire it (held by Lockvoker C, D, etc.)
|
|
|
|
|
* - The chain must be circular (eventually leading back to Lockvoker A or another
|
|
|
|
|
* lockvoker in the chain) to ensure it's a true gridlock, not just a delay
|
|
|
|
|
*
|
|
|
|
|
* TIMED DELAY, I/O DELAY, or LONG-RUNNING OPERATION FALSE-POSITIVE:
|
|
|
|
|
* Without circularity detection, we could incorrectly flag a simple delay, I/O
|
|
|
|
|
* delay, or long-running operation as a gridlock. For example: Lockvoker A
|
|
|
|
|
* wants LockX (held by Lockvoker B), and Lockvoker B is currently in a 10-second
|
|
|
|
|
* sleep/delay. When B wakes up, it will release LockX, allowing A to proceed.
|
|
|
|
|
* This is not a gridlock - it's just A waiting longer than DEADLOCK_TIMEOUT_MS
|
|
|
|
|
* for B to finish its work. True gridlocks require circular dependencies where
|
|
|
|
|
* no sequence can make progress because they're all waiting for each other in
|
|
|
|
|
* a cycle.
|
|
|
|
|
*
|
|
|
|
|
* The central history metadata enables us to detect complex gridlocks involving
|
|
|
|
|
* multiple lockvokers (2, 3, 4, 5+ sequences) by building up the acquisition
|
|
|
|
|
* history over time as different lockvokers time out and add their information.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
template <class OriginalCbFnT>
|
|
|
|
|
template <class InvocationTargetT>
|
|
|
|
|
bool
|
|
|
|
|
SerializedAsynchronousContinuation<OriginalCbFnT>
|
|
|
|
|
::LockerAndInvoker<InvocationTargetT>
|
|
|
|
|
::heuristicallyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
|
|
|
|
|
{
|
|
|
|
|
/** HEURISTIC APPROACH:
|
|
|
|
|
* Due to the computational complexity of full circularity detection,
|
|
|
|
|
* we implement a heuristically adequate check: when we find 2 sequences
|
|
|
|
|
* where one depends on the other, and the other has reached timeout,
|
|
|
|
|
* we assume this is a likely gridlock. This approach is not
|
|
|
|
|
* algorithmically complete (it may miss some complex circular
|
|
|
|
|
* dependencies or flag false positives), but it is heuristically useful
|
|
|
|
|
* for debugging and identifying potential concurrency issues in
|
|
|
|
|
* practice.
|
|
|
|
|
*
|
|
|
|
|
* See the file-local comment above for the complete algorithm
|
|
|
|
|
* explanation.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <class OriginalCbFnT>
|
|
|
|
|
template <class InvocationTargetT>
|
|
|
|
|
bool
|
|
|
|
|
SerializedAsynchronousContinuation<OriginalCbFnT>
|
|
|
|
|
::LockerAndInvoker<InvocationTargetT>
|
|
|
|
|
::completelyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
|
|
|
|
|
{
|
|
|
|
|
/** ALGORITHMICALLY COMPLETE VERSION:
|
|
|
|
|
* This function is intended to implement the algorithmically complete
|
|
|
|
|
* version of gridlock detection that performs full circularity detection.
|
|
|
|
|
* This would involve building a dependency graph from the acquisition
|
|
|
|
|
* history and using graph traversal algorithms (such as DFS with cycle
|
|
|
|
|
* detection) to identify true circular dependencies.
|
|
|
|
|
*
|
|
|
|
|
* See the file-local comment above for the complete algorithm
|
|
|
|
|
* explanation.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-22 21:29:43 -04:00
|
|
|
template <class OriginalCbFnT>
|
|
|
|
|
template <class InvocationTargetT>
|
|
|
|
|
void
|
|
|
|
|
SerializedAsynchronousContinuation<OriginalCbFnT>
|
|
|
|
|
::LockerAndInvoker<InvocationTargetT>
|
2025-09-27 20:51:20 -04:00
|
|
|
::handleDeadlock(Qutex& firstFailedQutex)
|
2025-09-22 21:29:43 -04:00
|
|
|
{
|
2025-09-27 20:51:20 -04:00
|
|
|
std::cerr << __func__ << ": Deadlock: "
|
2025-09-22 21:29:43 -04:00
|
|
|
<< "Lockvoker has been waiting for "
|
|
|
|
|
<< std::chrono::duration_cast<std::chrono::milliseconds>(
|
|
|
|
|
std::chrono::steady_clock::now() - this->creationTimestamp)
|
|
|
|
|
.count()
|
|
|
|
|
<< "ms, failed on qutex @" << &firstFailedQutex
|
|
|
|
|
<< " (" << firstFailedQutex.name << ")" << std::endl;
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
// Explicit template instantiations for the types we need
|
|
|
|
|
// Add more as needed for your specific use cases
|
|
|
|
|
|
|
|
|
|
} // namespace smo
|