diff --git a/include/serializedAsynchronousContinuation.h b/include/serializedAsynchronousContinuation.h index 7f02155..bf98a11 100644 --- a/include/serializedAsynchronousContinuation.h +++ b/include/serializedAsynchronousContinuation.h @@ -116,7 +116,7 @@ public: bool isDeadlock = traceContinuationHistoryForDeadlockOn( firstFailedQutex); - bool isGridlock = traceContinuationHistoryForGridlockOn( + bool isGridlock = heuristicallyTraceContinuationHistoryForGridlockOn( firstFailedQutex); if (!isDeadlock && !isGridlock) @@ -222,8 +222,15 @@ public: { return isDeadlockLikely(); } #ifdef CONFIG_ENABLE_DEBUG_LOCKS + struct obsolete { + bool traceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex); + }; + bool traceContinuationHistoryForDeadlockOn(Qutex &firstFailedQutex); - bool traceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex); + bool heuristicallyTraceContinuationHistoryForGridlockOn( + Qutex &firstFailedQutex); + bool completelyTraceContinuationHistoryForGridlockOn( + Qutex &firstFailedQutex); bool traceContinuationHistoryForDeadlock(void) { for (auto& lockUsageDesc diff --git a/smocore/serializedAsynchronousContinuation.cpp b/smocore/serializedAsynchronousContinuation.cpp index 47446d4..2eb85c1 100644 --- a/smocore/serializedAsynchronousContinuation.cpp +++ b/smocore/serializedAsynchronousContinuation.cpp @@ -62,7 +62,7 @@ template bool SerializedAsynchronousContinuation ::LockerAndInvoker -::traceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex) +::obsolete::traceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex) { /** EXPLANATION: * In this function we check for gridlocks which are slightly different @@ -163,6 +163,106 @@ SerializedAsynchronousContinuation return false; } +/** EXPLANATION - GRIDLOCK DETECTION ALGORITHM: + * This file implements gridlock detection algorithms that use a central + * acquisition history to track all lockvokers suspected of being gridlocked. + * + * ALGORITHM OVERVIEW: + * 1. 
When a lockvoker finds that DEADLOCK_TIMEOUT_MS has elapsed and it + * still can't acquire a particular lock (firstFailedQutex), it creates + * a new entry in a global acquisition history. + * + * 2. The acquisition history is an unordered_map with: + * - Key: std::shared_ptr + * (the timed-out lockvoker -- i.e., itself) + * - Value: std::pair< + * std::reference_wrapper<Qutex>, + * std::forward_list<std::reference_wrapper<Qutex>>> + * * pair.first: The firstFailedQutex that this lockvoker WANTS but + * can't acquire. This metadata is essential for later-arriving + * entrants to analyze what their predecessor timed-out sequences + * want. + * * pair.second: A list of all acquired Qutexes in this lockvoker's + * continuation history. + * + * 3. Each timed-out lockvoker: + * a) Adds itself to the acquisition history map with its wanted lock and + * acquired locks + * b) Iterates through all OTHER entries in the map (excluding itself) + * c) For each other entry, checks if that entry's acquired locks + * (pair.second) contain the lock that this lockvoker wants + * (i.e., firstFailedQutex/pair.first) + * d) If found, we have detected a likely gridlock: two sequences where at + * least one wants a lock held by the other, and the other wants a lock + * that it can't acquire. + * + * GRIDLOCK CONDITION: + * A gridlock exists when we find a circular chain of dependencies: + * - Lockvoker A wants LockX but can't acquire it (held by Lockvoker B) + * - Lockvoker B wants LockY but can't acquire it (held by Lockvoker C, D, etc.) + * - The chain must be circular (eventually leading back to Lockvoker A or another + * lockvoker in the chain) to ensure it's a true gridlock, not just a delay + * + * TIMED DELAY, I/O DELAY, or LONG-RUNNING OPERATION FALSE-POSITIVE: + * Without circularity detection, we could incorrectly flag a simple delay, I/O + * delay, or long-running operation as a gridlock. For example: Lockvoker A + * wants LockX (held by Lockvoker B), and Lockvoker B is currently in a 10-second + * sleep/delay.
When B wakes up, it will release LockX, allowing A to proceed. + * This is not a gridlock - it's just A waiting longer than DEADLOCK_TIMEOUT_MS + * for B to finish its work. True gridlocks require circular dependencies where + * no sequence can make progress because they're all waiting for each other in + * a cycle. + * + * The central history metadata enables us to detect complex gridlocks involving + * multiple lockvokers (2, 3, 4, 5+ sequences) by building up the acquisition + * history over time as different lockvokers timeout and add their information. + */ + +template +template +bool +SerializedAsynchronousContinuation +::LockerAndInvoker +::heuristicallyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex) +{ + /** HEURISTIC APPROACH: + * Due to the computational complexity of full circularity detection, + * the intended check is heuristic: when we find 2 sequences where one + * depends on the other, and the other has reached timeout, we assume + * this is a likely gridlock. This approach is not algorithmically + * complete (it may miss some complex circular dependencies or flag + * false positives), but it is heuristically useful for debugging and + * identifying potential concurrency issues in practice. + * + * NOTE: Not yet implemented. This stub ignores firstFailedQutex and + * always returns false, i.e. it never reports a gridlock. + * See the file-local comment above for the algorithm explanation. + */ + + return false; +} + +template +template +bool +SerializedAsynchronousContinuation +::LockerAndInvoker +::completelyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex) +{ + /** ALGORITHMICALLY COMPLETE VERSION: + * This function is intended to implement the algorithmically complete + * version of gridlock detection that performs full circularity detection. + * This would involve building a dependency graph from the acquisition + * history and using graph traversal algorithms (such as DFS with cycle + * detection) to identify true circular dependencies.
+ * + * NOTE: Not yet implemented; this stub always returns false. See the + * file-local comment above for the complete algorithm explanation. + */ + + return false; +} + template template void