Files
salmanoff/smocore/serializedAsynchronousContinuation.cpp
T
hayodea 8123ec1227 Qutex: Deprecate old gridlock trace; add skeleton new trace
We add the skeleton of a correct history tracer for gridlocks. The
previous history tracer made the incorrect assumption that we would
find the foreign sequence's currently desired LockSet inside of the
lockvoker that it has stored inside of Qutex::currOwner.
2025-09-29 18:14:10 -04:00

287 lines
11 KiB
C++

#include <config.h>
#include <serializedAsynchronousContinuation.h>
#include <qutex.h>
namespace smo {
#ifdef CONFIG_ENABLE_DEBUG_LOCKS
template <class OriginalCbFnT>
template <class InvocationTargetT>
bool
SerializedAsynchronousContinuation<OriginalCbFnT>
::LockerAndInvoker<InvocationTargetT>
::traceContinuationHistoryForDeadlockOn(Qutex& firstFailedQutex)
{
/** EXPLANATION:
* In this function we will trace through the chain of continuations that
* led up to this Lockvoker's continuation. For each continuation which is
* a SerializedAsynchronousContinuation, we check through its LockSet to see
* if it contains the lock that failed acquisition. If it does, we have a
* deadlock.
*/
/* We can't start with the continuation directly referenced by this starting
* Lockvoker as it would contain the all locks we're currently trying to
* acquire...and rightly so because it's the continuation for this current
* lockvoker.
*/
for (std::shared_ptr<AsynchronousContinuationChainLink> currContin =
this->serializedContinuation.getCallersContinuationShPtr();
currContin != nullptr;
currContin = currContin->getCallersContinuationShPtr())
{
auto serializedCont = std::dynamic_pointer_cast<
SerializedAsynchronousContinuation<OriginalCbFnT>>(currContin);
if (serializedCont == nullptr) { continue; }
// Check if the firstFailedQutex is in this continuation's LockSet
try {
const auto& lockUsageDesc = serializedCont->requiredLocks
.getLockUsageDesc(firstFailedQutex);
} catch (const std::runtime_error& e) {
std::cerr << __func__ << ": " << e.what() << std::endl;
continue;
}
std::cout << __func__ << ":Deadlock detected: Found "
<< "firstFailedQutex @" << &firstFailedQutex
<< " (" << firstFailedQutex.name << ") in LockSet of "
<< "SerializedAsynchronousContinuation @"
<< serializedCont.get() << std::endl;
return true;
}
return false;
}
template <class OriginalCbFnT>
template <class InvocationTargetT>
bool
SerializedAsynchronousContinuation<OriginalCbFnT>
::LockerAndInvoker<InvocationTargetT>
::obsolete::traceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
{
/** EXPLANATION:
* In this function we check for gridlocks which are slightly different
* from deadlocks. In a gridlock, two requests are waiting for locks that
* are held by the other. I.e:
*
* R1 holds LockA and is waiting for LockB.
* R2 holds LockB and is waiting for LockA.
*
* This differs from deadlocks because it's not a single request which is
* attempting to re-acquire a lock that it already holds.
*
* To detect this condition, we wait until the acquisition timeout has
* expired. Then: we extract the current owner of the first lock we're
* failing to acquire.
*
* From there, we go through each of the locks in the foreign owner's
* current (i.e: immediate, most recent continuation's) required LockSet.
* For each of the locks in the foreign owner's most immediate required
* LockSet, we trace backward in our *OWN* history to see if any of *OUR*
* continuations (excluding our most immediate continuation) contains that
* lock.
*
* If we find a match, that means that we're holding a lock that the foreign
* owner is waiting for. And we already know that the foreign owner is
* holding a lock that we're waiting for (when we extracted the current
* owner of the first failed lock in our most immediate Lockset).
*
* Hence, we have a gridlock.
*/
std::shared_ptr<LockerAndInvokerBase> foreignOwnerShPtr =
firstFailedQutex.getCurrOwner();
// If no current owner, can't be a gridlock
if (foreignOwnerShPtr == nullptr)
{ return false; }
// Use reference for the rest of the function for safety.
LockerAndInvokerBase &foreignOwner = *foreignOwnerShPtr;
/* For each lock in the foreign owner's LockSet, check if we hold it
* in any of our previous continuations (excluding our most immediate one)
*/
for (size_t i = 0; i < foreignOwner.getLockSetSize(); ++i)
{
Qutex& foreignLock = foreignOwner.getLockAt(i);
/* Skip the firstFailedQutex since we already know the foreign owner
* holds it -- hence it's impossible for any of our previous
* continuations to hold it.
*/
if (&foreignLock == &firstFailedQutex)
{ continue; }
/** EXPLANATION:
* Trace backward through our continuation history (excluding our most
* immediate continuation).
*
* The reason we exclude our most immediate continuation is because the
* LockSet acquisition algorithm backs off if it fails to acquire ALL
* locks in the set. So if the lock that the foreign owner is waiting
* for is in our most immediate continuation, and NOT in one of our
* previous continuations, then we will back off and the foreign owner
* should eventually be able to acquire that lock.
*/
for (std::shared_ptr<AsynchronousContinuationChainLink> currContin =
this->serializedContinuation.getCallersContinuationShPtr();
currContin != nullptr;
currContin = currContin->getCallersContinuationShPtr())
{
auto serializedCont = std::dynamic_pointer_cast<
SerializedAsynchronousContinuation<OriginalCbFnT>>(currContin);
if (serializedCont == nullptr) { continue; }
// Check if this continuation holds the foreign lock
try {
const auto& lockUsageDesc = serializedCont->requiredLocks
.getLockUsageDesc(foreignLock);
// Matched! We hold a lock that the foreign owner is waiting for
std::cout << __func__ << ": Gridlock detected: We hold lock @"
<< &foreignLock << " (" << foreignLock.name << ") in "
"continuation @" << serializedCont.get()
<< ", while foreign owner @" << &foreignOwner
<< " holds lock @" << &firstFailedQutex << " ("
<< firstFailedQutex.name << ") that we're waiting for"
<< std::endl;
return true;
} catch (const std::runtime_error& e) {
// This continuation doesn't hold the foreign lock. Continue.
continue;
}
}
}
return false;
}
/** EXPLANATION - GRIDLOCK DETECTION ALGORITHM:
* This file implements gridlock detection algorithms that use a central
* acquisition history to track all lockvokers suspected of being gridlocked.
*
* ALGORITHM OVERVIEW:
* 1. When a lockvoker finds that DEADLOCK_TIMEOUT_MS has elapsed and it
* still can't acquire a particular lock (firstFailedQutex), it creates
* a new entry in a global acquisition history.
*
* 2. The acquisition history is an unordered_map with:
* - Key: std::shared_ptr<LockerAndInvokerBase>
* (the timed-out lockvoker -- aka, itself)
* - Value: std::pair<
* std::reference_wrapper<Qutex>,
* std::forward_list<std::reference_wrapper<Qutex>>>
* * pair.first: The firstFailedQutex that this lockvoker WANTS but
* can't acquire. This metadata is essential for later-arriving
* entrants to analyze what their predecessor timed-out sequences
* want.
* * pair.second: A list of all acquired Qutexes in this lockvoker's
* continuation history.
*
* 3. Each timed-out lockvoker:
* a) Adds itself to the acquisition history map with its wanted lock and
* acquired locks
* b) Iterates through all OTHER entries in the map (excluding itself)
* c) For each other entry, checks if that entry's acquired locks
* (pair.second) contains the lock that this lockvoker wants
* (aka: firstFailedQutex/pair.first)
* d) If found, we have detected a gridlock: two sequences where at least
* one wants a lock held by the other, and the other wants a lock that
* it can't acquire.
*
* GRIDLOCK CONDITION:
* A gridlock exists when we find a circular chain of dependencies:
* - Lockvoker A wants LockX but can't acquire it (held by Lockvoker B)
* - Lockvoker B wants LockY but can't acquire it (held by Lockvoker C, D, etc.)
* - The chain must be circular (eventually leading back to Lockvoker A or another
* lockvoker in the chain) to ensure it's a true gridlock, not just a delay
*
* TIMED DELAY, I/O DELAY, or LONG-RUNNING OPERATION FALSE-POSITIVE:
* Without circularity detection, we could incorrectly flag a simple delay, I/O
* delay, or long-running operation as a gridlock. For example: Lockvoker A
* wants LockX (held by Lockvoker B), and Lockvoker B is currently in a 10-second
* sleep/delay. When B wakes up, it will release LockX, allowing A to proceed.
* This is not a gridlock - it's just A waiting longer than DEADLOCK_TIMEOUT_MS
* for B to finish its work. True gridlocks require circular dependencies where
* no sequence can make progress because they're all waiting for each other in
* a cycle.
*
* The central history metadata enables us to detect complex gridlocks involving
* multiple lockvokers (2, 3, 4, 5+ sequences) by building up the acquisition
* history over time as different lockvokers timeout and add their information.
*/
template <class OriginalCbFnT>
template <class InvocationTargetT>
bool
SerializedAsynchronousContinuation<OriginalCbFnT>
::LockerAndInvoker<InvocationTargetT>
::heuristicallyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
{
/** HEURISTIC APPROACH:
* Due to the computational complexity of full circularity detection,
* we implement a heuristically adequate check: when we find 2 sequences
* where one depends on the other, and the other has reached timeout,
* we assume this is a likely gridlock. This approach is not
* algorithmically complete (it may miss some complex circular
* dependencies or flag false positives), but it is heuristically useful
* for debugging and identifying potential concurrency issues in
* practice.
*
* See the file-local comment above for the complete algorithm
* explanation.
*/
return false;
}
template <class OriginalCbFnT>
template <class InvocationTargetT>
bool
SerializedAsynchronousContinuation<OriginalCbFnT>
::LockerAndInvoker<InvocationTargetT>
::completelyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
{
/** ALGORITHMICALLY COMPLETE VERSION:
* This function is intended to implement the algorithmically complete
* version of gridlock detection that performs full circularity detection.
* This would involve building a dependency graph from the acquisition
* history and using graph traversal algorithms (such as DFS with cycle
* detection) to identify true circular dependencies.
*
* See the file-local comment above for the complete algorithm
* explanation.
*/
return false;
}
template <class OriginalCbFnT>
template <class InvocationTargetT>
void
SerializedAsynchronousContinuation<OriginalCbFnT>
::LockerAndInvoker<InvocationTargetT>
::handleDeadlock(Qutex& firstFailedQutex)
{
std::cerr << __func__ << ": Deadlock: "
<< "Lockvoker has been waiting for "
<< std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::steady_clock::now() - this->creationTimestamp)
.count()
<< "ms, failed on qutex @" << &firstFailedQutex
<< " (" << firstFailedQutex.name << ")" << std::endl;
}
#endif
// Explicit template instantiations for the types we need
// Add more as needed for your specific use cases
} // namespace smo