#include "qutexAcquisitionHistoryTracker.h"
#include "serializedAsynchronousContinuation.h"
#include "qutex.h"
/* NOTE(review): the three directives below lost their <header> names when
 * this file was extracted (everything between angle brackets was stripped).
 * They are kept exactly as found; restore the names from version control.
 */
#include
#include
#include

namespace smo {

/** EXPLANATION - GRIDLOCK DETECTION ALGORITHM:
 * This file implements gridlock detection algorithms that use a central
 * acquisition history to track all lockvokers suspected of being gridlocked.
 *
 * ALGORITHM OVERVIEW:
 * 1. When a lockvoker finds that DEADLOCK_TIMEOUT_MS has elapsed and it
 *    still can't acquire a particular lock (firstFailedQutex), it creates
 *    a new entry in a global acquisition history.
 *
 * 2. The acquisition history is an unordered_map with:
 *    - Key: a std::shared_ptr to the timed-out lockvoker's continuation.
 *    - Value: a std::pair of
 *        first:  a std::reference_wrapper to a Qutex, and
 *        second: a std::unique_ptr to a list of std::reference_wrapper
 *                entries, one per acquired Qutex.
 *
 *    pair.first: The firstFailedQutex that this lockvoker WANTS but
 *    can't acquire. This metadata is essential for later-arriving
 *    entrants to analyze what their predecessor timed-out sequences
 *    want.
 *
 *    pair.second: A unique_ptr to a list of all acquired Qutexes in this
 *    lockvoker's continuation history.
 *
 * 3. Each timed-out lockvoker:
 *    a) Adds itself to the acquisition history map with its wanted lock and
 *       acquired locks.
 *    b) Iterates through all OTHER entries in the map (excluding itself).
 *    c) For each other entry, checks whether that entry's acquired locks
 *       (pair.second) contain the lock that this lockvoker wants
 *       (aka: firstFailedQutex/pair.first).
 *    d) If found, we have detected a gridlock: two sequences where at least
 *       one wants a lock held by the other, and the other wants a lock that
 *       it can't acquire.
 *
 * GRIDLOCK CONDITION:
 * A gridlock exists when we find a circular chain of dependencies:
 * - Lockvoker A wants LockX but can't acquire it (held by Lockvoker B).
 * - Lockvoker B wants LockY but can't acquire it (held by Lockvoker C, D,
 *   etc.).
 * - The chain must be circular (eventually leading back to Lockvoker A or
 *   another lockvoker in the chain) to ensure it's a true gridlock, not just
 *   a delay.
 *
 * TIMED DELAY, I/O DELAY, or LONG-RUNNING OPERATION FALSE-POSITIVE:
 * Without circularity detection, we could incorrectly flag a simple delay, I/O
 * delay, or long-running operation as a gridlock. For example: Lockvoker A
 * wants LockX (held by Lockvoker B), and Lockvoker B is currently in a
 * 10-second sleep/delay. When B wakes up, it will release LockX, allowing A to
 * proceed. This is not a gridlock - it's just A waiting longer than
 * DEADLOCK_TIMEOUT_MS for B to finish its work. True gridlocks require
 * circular dependencies where no sequence can make progress because they're
 * all waiting for each other in a cycle.
 *
 * The central history metadata enables us to detect complex gridlocks involving
 * multiple lockvokers (2, 3, 4, 5+ sequences) by building up the acquisition
 * history over time as different lockvokers time out and add their
 * information.
 */

namespace {

/** Scope guard for locks that expose acquire()/release().
 *
 * Calls acquire() on construction and release() on destruction, so the lock
 * is dropped on every path out of the enclosing scope (including early
 * returns) without a hand-written release() at each exit.
 */
template <typename Lock>
class AcquisitionHistoryLockGuard
{
public:
    explicit AcquisitionHistoryLockGuard(Lock& lock)
    : heldLock(lock)
    {
        heldLock.acquire();
    }

    ~AcquisitionHistoryLockGuard()
    {
        heldLock.release();
    }

    // A guard represents exactly one acquire/release pair; copying it would
    // release the same lock twice.
    AcquisitionHistoryLockGuard(const AcquisitionHistoryLockGuard&) = delete;
    AcquisitionHistoryLockGuard& operator=(
        const AcquisitionHistoryLockGuard&) = delete;

private:
    Lock& heldLock;
};

} // namespace

/** Heuristic gridlock check for a lockvoker that timed out on a lock.
 *
 * Scans every entry of acquisitionHistory other than the caller's own and
 * reports whether any of them lists firstFailedQutex among its held locks.
 * acquisitionHistoryLock is held for the whole scan.
 *
 * @param firstFailedQutex The lock the calling lockvoker failed to acquire.
 *     Matching is by object address, not by value.
 * @param currentContinuation The caller's own key in the acquisition history;
 *     the entry stored under this key is skipped.
 * @return true if another entry's held-lock list contains firstFailedQutex
 *     (a suspected gridlock); false otherwise.
 *
 * NOTE(review): the template argument of the `std::shared_ptr` parameter was
 * lost when this file was extracted. The token is kept exactly as found; it
 * must be restored to match the declaration in
 * qutexAcquisitionHistoryTracker.h.
 */
bool QutexAcquisitionHistoryTracker
::heuristicallyTraceContinuationHistoryForGridlockOn(
    Qutex &firstFailedQutex,
    std::shared_ptr& currentContinuation)
{
    /** HEURISTIC APPROACH:
     * Due to the computational complexity of full circularity detection,
     * we implement a heuristically adequate check: when we find 2 sequences
     * where one depends on the other, and the other has reached timeout,
     * we assume this is a likely gridlock. This approach is not
     * algorithmically complete (it may miss some complex circular
     * dependencies or flag false positives), but it is heuristically useful
     * for debugging and identifying potential concurrency issues in
     * practice.
     *
     * See the file-local comment above for the complete algorithm
     * explanation.
     */

    /** NOTICE:
     * Generally we should have all global data structures owned by a single
     * ComponentThread; and qutexes really should only be used to serialize
     * async sequences being enqueued on the same ComponentThread. But this
     * doesn't prevent multiple CPUs from trying to add/remove entries to/from
     * the acquisition history at the same time. Why? The acquisition history
     * isn't per-CPU, it's global.
     *
     * The problem with using a SpinLock here is that if the STL uses mutexes
     * internally to lock containers, we could end up in a situation where
     * spinning waiters are busy-spinning while the owner is sleeping.
     *
     * But this should not happen, since the nature of the order of operations
     * is that the spinlock ensures that only one CPU at a time can be
     * adding/removing entries; and thus every time a method is called on the
     * unordered_map, the caller will always succeed at acquiring the
     * underlying STL mutex.
     *
     * So it should be safe to use a SpinLock here.
     */
    AcquisitionHistoryLockGuard<decltype(acquisitionHistoryLock)>
        historyGuard(acquisitionHistoryLock);

    // Iterate through all entries in the acquisition history.
    for (const auto& entry : acquisitionHistory)
    {
        const auto& continuation = entry.first;
        const auto& historyEntry = entry.second;

        // Skip the current continuation (don't compare with itself).
        if (continuation == currentContinuation) {
            continue;
        }

        /* historyEntry.second owns this continuation's held-lock list. The
         * type is deduced rather than spelled out so that it always matches
         * the map's declared value type.
         */
        const auto& heldLocks = historyEntry.second;
        if (!heldLocks) {
            // No held-lock list was recorded for this entry.
            continue;
        }

        for (const auto& heldLock : *heldLocks)
        {
            /* Found firstFailedQutex in another continuation's held locks.
             * This indicates a potential gridlock. The guard releases
             * acquisitionHistoryLock as we return.
             */
            if (&heldLock.get() == &firstFailedQutex) {
                return true;
            }
        }
    }

    return false;
}

/** Placeholder for the algorithmically complete gridlock check.
 *
 * Not implemented yet: the acquisition history is not consulted and the
 * function unconditionally returns false.
 *
 * @param firstFailedQutex The lock the calling lockvoker failed to acquire.
 *     Currently unused.
 * @return Always false.
 */
bool QutexAcquisitionHistoryTracker
::completelyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
{
    /** ALGORITHMICALLY COMPLETE VERSION:
     * This function is intended to implement the algorithmically complete
     * version of gridlock detection that performs full circularity detection.
     * This would involve building a dependency graph from the acquisition
     * history and using graph traversal algorithms (such as DFS with cycle
     * detection) to identify true circular dependencies.
     *
     * See the file-local comment above for the complete algorithm
     * explanation.
     */

    // Unused until the traversal below is implemented.
    (void)firstFailedQutex;

    // acquisitionHistoryLock.acquire();
    // TODO: Implement full circularity detection algorithm
    // acquisitionHistoryLock.release();

    return false;
}

} // namespace smo