diff --git a/include/asynchronousContinuationChainLink.h b/include/asynchronousContinuationChainLink.h
index 3e1ac74..84a2663 100644
--- a/include/asynchronousContinuationChainLink.h
+++ b/include/asynchronousContinuationChainLink.h
@@ -13,8 +13,14 @@ namespace smo {
  *
  * The chain walking logic can use dynamic_cast to determine the most
  * derived type and perform appropriate operations.
+ *
+ * Inherits from enable_shared_from_this to allow objects to obtain a
+ * shared_ptr to themselves, which is useful for gridlock detection tracking.
+ * shared_from_this() may only be called on an object that is already owned
+ * by a std::shared_ptr.
  */
 class AsynchronousContinuationChainLink
+: public std::enable_shared_from_this<AsynchronousContinuationChainLink>
 {
 public:
 	virtual ~AsynchronousContinuationChainLink() = default;
diff --git a/include/qutexAcquisitionHistoryTracker.h b/include/qutexAcquisitionHistoryTracker.h
new file mode 100644
index 0000000..48da2e0
--- /dev/null
+++ b/include/qutexAcquisitionHistoryTracker.h
@@ -0,0 +1,140 @@
+#ifndef QUTEX_ACQUISITION_HISTORY_TRACKER_H
+#define QUTEX_ACQUISITION_HISTORY_TRACKER_H
+
+#include <forward_list>
+#include <functional>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+
+
+namespace smo {
+
+// Forward declarations
+class Qutex;
+class AsynchronousContinuationChainLink;
+
+/**
+ * @brief QutexAcquisitionHistoryTracker - Tracks acquisition history for
+ * gridlock detection
+ *
+ * This class maintains a central acquisition history to track all lockvokers
+ * suspected of being gridlocked. It stores information about what locks each
+ * timed-out lockvoker wants and what locks they hold in their continuation
+ * history.
+ *
+ * NOTE(review): nothing in this class guards acquisitionHistory with a lock;
+ * confirm that callers serialize access before sharing it across threads.
+ */
+class QutexAcquisitionHistoryTracker
+{
+public:
+	/**
+	 * @brief Type definition for the acquisition history entry
+	 *
+	 * pair.first: The firstFailedQutex that this lockvoker WANTS but can't
+	 *	acquire
+	 * pair.second: A unique_ptr to a list of all acquired Qutexes in this
+	 *	lockvoker's continuation history
+	 */
+	typedef std::pair<
+		std::reference_wrapper<Qutex>,
+		std::unique_ptr<std::forward_list<std::reference_wrapper<Qutex>>>
+	> AcquisitionHistoryEntry;
+
+	/**
+	 * @brief Type definition for the acquisition history map
+	 *
+	 * Key: std::shared_ptr<AsynchronousContinuationChainLink>
+	 *	(the continuation that contains the timed-out lockvoker)
+	 * Value: AcquisitionHistoryEntry
+	 *	(its wanted lock (aka: firstFailedQutex/pair.first) + held locks)
+	 */
+	typedef std::unordered_map<
+		std::shared_ptr<AsynchronousContinuationChainLink>,
+		AcquisitionHistoryEntry
+	> AcquisitionHistoryMap;
+
+public:
+	/**
+	 * @brief Access the single shared tracker instance
+	 * @return Reference to the function-local static tracker
+	 */
+	static QutexAcquisitionHistoryTracker& getInstance()
+	{
+		static QutexAcquisitionHistoryTracker instance;
+		return instance;
+	}
+
+	/**
+	 * @brief Add a continuation to the acquisition history if it doesn't
+	 * already exist
+	 *
+	 * The continuation is taken by const reference so that callers may pass
+	 * the temporary returned by shared_from_this() directly.
+	 *
+	 * @param continuation Shared pointer to the
+	 *	AsynchronousContinuationChainLink
+	 * @param wantedLock The lock that this continuation wants but can't
+	 *	acquire
+	 * @param heldLocks Unique pointer to list of locks held in this
+	 *	continuation's history (moved into the new entry; dropped if the
+	 *	continuation is already tracked)
+	 */
+	void addIfNotExists(
+		const std::shared_ptr<AsynchronousContinuationChainLink>
+			&continuation,
+		Qutex& wantedLock,
+		std::unique_ptr<std::forward_list<std::reference_wrapper<Qutex>>>
+			heldLocks
+		)
+	{
+		// If a continuation already exists, don't add it again
+		if (acquisitionHistory.find(continuation)
+			!= acquisitionHistory.end())
+		{
+			return;
+		}
+
+		acquisitionHistory.emplace(continuation, std::make_pair(
+			std::ref(wantedLock), std::move(heldLocks)));
+	}
+
+	/**
+	 * @brief Remove a continuation from the acquisition history
+	 * @param continuation Shared pointer to the AsynchronousContinuationChainLink
+	 *	to remove (const reference, so temporaries are accepted)
+	 * @return true if the continuation was found and removed, false if not found
+	 */
+	bool remove(
+		const std::shared_ptr<AsynchronousContinuationChainLink>
+			&continuation)
+	{
+		// erase-by-key returns the number of entries removed (0 or 1)
+		return acquisitionHistory.erase(continuation) > 0;
+	}
+
+	// Gridlock checks against the recorded history; both are defined in
+	// smocore/qutexAcquisitionHistoryTracker.cpp
+	bool heuristicallyTraceContinuationHistoryForGridlockOn(
+		Qutex &firstFailedQutex) const;
+	bool completelyTraceContinuationHistoryForGridlockOn(
+		Qutex &firstFailedQutex) const;
+
+	// Disable copy constructor and assignment operator
+	QutexAcquisitionHistoryTracker(
+		const QutexAcquisitionHistoryTracker&) = delete;
+	QutexAcquisitionHistoryTracker& operator=(
+		const QutexAcquisitionHistoryTracker&) = delete;
+
+private:
+	QutexAcquisitionHistoryTracker() = default;
+	~QutexAcquisitionHistoryTracker() = default;
+
+private:
+	AcquisitionHistoryMap acquisitionHistory;
+};
+
+} // namespace smo
+
+#endif // QUTEX_ACQUISITION_HISTORY_TRACKER_H
diff --git a/include/serializedAsynchronousContinuation.h b/include/serializedAsynchronousContinuation.h
index bf98a11..6af0bc4 100644
--- a/include/serializedAsynchronousContinuation.h
+++ b/include/serializedAsynchronousContinuation.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include
 
 namespace smo {
 
@@ -36,6 +37,10 @@ public:
 			std::forward(args)...);
 	}
 
+	// Return list of all qutexes in predecessors' LockSets; excludes self.
+	std::unique_ptr<std::forward_list<std::reference_wrapper<Qutex>>>
+	getAcquiredQutexHistory() const;
+
 public:
 	LockSet requiredLocks;
 	std::atomic isAwakeOrBeingAwakened{false};
@@ -116,8 +121,33 @@ public:
 
 			bool isDeadlock = traceContinuationHistoryForDeadlockOn(
 				firstFailedQutex);
-			bool isGridlock = heuristicallyTraceContinuationHistoryForGridlockOn(
-				firstFailedQutex);
+			bool gridlockIsHeuristicallyLikely = false;
+			bool gridlockIsAlgorithmicallyLikely = false;
+
+			if (gridlockLikely)
+			{
+				// By reference: the singleton's copy constructor is deleted
+				auto& tracker = QutexAcquisitionHistoryTracker::getInstance();
+
+				auto heldLocks = serializedContinuation
+					.getAcquiredQutexHistory();
+
+				// Add this continuation to the tracker
+				tracker.addIfNotExists(
+					serializedContinuation.shared_from_this(),
+					firstFailedQutex, std::move(heldLocks));
+
+				gridlockIsHeuristicallyLikely = tracker
+					.heuristicallyTraceContinuationHistoryForGridlockOn(
+						firstFailedQutex);
+
+				gridlockIsAlgorithmicallyLikely = tracker
+					.completelyTraceContinuationHistoryForGridlockOn(
+						firstFailedQutex);
+			}
+
+			bool isGridlock = (gridlockIsHeuristicallyLikely
+				|| gridlockIsAlgorithmicallyLikely);
 
 			if (!isDeadlock && !isGridlock)
 				{ return; }
@@ -145,6 +175,28 @@ public:
 			 * can't acquire the locks anyway.
 			 */
 			serializedContinuation.requiredLocks.unregisterFromQutexQueues();
+
+#ifdef CONFIG_ENABLE_DEBUG_LOCKS
+			/** EXPLANATION:
+			 * If we were being tracked for gridlock detection but successfully
+			 * acquired all locks, it was a false positive due to timed delay,
+			 * long-running operation, or I/O delay
+			 */
+			if (gridlockLikely)
+			{
+				bool removed = QutexAcquisitionHistoryTracker::getInstance()
+					.remove(serializedContinuation.shared_from_this());
+
+				if (removed)
+				{
+					std::cerr << "LockerAndInvoker::operator(): False positive gridlock "
+						"detection - continuation was being tracked but successfully "
+						"acquired all locks. This was likely due to timed delay, "
+						"long-running operation, or I/O delay."
<< std::endl; + } + } +#endif + invocationTarget(); } @@ -227,10 +279,6 @@ public: }; bool traceContinuationHistoryForDeadlockOn(Qutex &firstFailedQutex); - bool heuristicallyTraceContinuationHistoryForGridlockOn( - Qutex &firstFailedQutex); - bool completelyTraceContinuationHistoryForGridlockOn( - Qutex &firstFailedQutex); bool traceContinuationHistoryForDeadlock(void) { for (auto& lockUsageDesc diff --git a/smocore/CMakeLists.txt b/smocore/CMakeLists.txt index b3a9755..97bb585 100644 --- a/smocore/CMakeLists.txt +++ b/smocore/CMakeLists.txt @@ -13,6 +13,7 @@ add_library(smocore STATIC lockerAndInvokerBase.cpp lockSet.cpp serializedAsynchronousContinuation.cpp + qutexAcquisitionHistoryTracker.cpp # Body body/body.cpp diff --git a/smocore/qutexAcquisitionHistoryTracker.cpp b/smocore/qutexAcquisitionHistoryTracker.cpp new file mode 100644 index 0000000..ff78e55 --- /dev/null +++ b/smocore/qutexAcquisitionHistoryTracker.cpp @@ -0,0 +1,104 @@ +#include "qutexAcquisitionHistoryTracker.h" +#include "serializedAsynchronousContinuation.h" +#include "qutex.h" +#include +#include +#include + +namespace smo { + +/** EXPLANATION - GRIDLOCK DETECTION ALGORITHM: + * This file implements gridlock detection algorithms that use a central + * acquisition history to track all lockvokers suspected of being gridlocked. + * + * ALGORITHM OVERVIEW: + * 1. When a lockvoker finds that DEADLOCK_TIMEOUT_MS has elapsed and it + * still can't acquire a particular lock (firstFailedQutex), it creates + * a new entry in a global acquisition history. + * + * 2. The acquisition history is an unordered_map with: + * - Key: std::shared_ptr + * (the timed-out lockvoker's continuation) + * - Value: std::pair< + * std::reference_wrapper, + * std::unique_ptr>>> + * * pair.first: The firstFailedQutex that this lockvoker WANTS but + * can't acquire. This metadata is essential for later-arriving + * entrants to analyze what their predecessor timed-out sequences + * want. 
+ * * pair.second: A unique_ptr to a list of all acquired Qutexes in this + * lockvoker's continuation history. + * + * 3. Each timed-out lockvoker: + * a) Adds itself to the acquisition history map with its wanted lock and + * acquired locks + * b) Iterates through all OTHER entries in the map (excluding itself) + * c) For each other entry, checks if that entry's acquired locks + * (pair.second) contains the lock that this lockvoker wants + * (aka: firstFailedQutex/pair.first) + * d) If found, we have detected a gridlock: two sequences where at least + * one wants a lock held by the other, and the other wants a lock that + * it can't acquire. + * + * GRIDLOCK CONDITION: + * A gridlock exists when we find a circular chain of dependencies: + * - Lockvoker A wants LockX but can't acquire it (held by Lockvoker B) + * - Lockvoker B wants LockY but can't acquire it (held by Lockvoker C, D, etc.) + * - The chain must be circular (eventually leading back to Lockvoker A or another + * lockvoker in the chain) to ensure it's a true gridlock, not just a delay + * + * TIMED DELAY, I/O DELAY, or LONG-RUNNING OPERATION FALSE-POSITIVE: + * Without circularity detection, we could incorrectly flag a simple delay, I/O + * delay, or long-running operation as a gridlock. For example: Lockvoker A + * wants LockX (held by Lockvoker B), and Lockvoker B is currently in a 10-second + * sleep/delay. When B wakes up, it will release LockX, allowing A to proceed. + * This is not a gridlock - it's just A waiting longer than DEADLOCK_TIMEOUT_MS + * for B to finish its work. True gridlocks require circular dependencies where + * no sequence can make progress because they're all waiting for each other in + * a cycle. + * + * The central history metadata enables us to detect complex gridlocks involving + * multiple lockvokers (2, 3, 4, 5+ sequences) by building up the acquisition + * history over time as different lockvokers timeout and add their information. 
+ */
+
+bool QutexAcquisitionHistoryTracker
+::heuristicallyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
+const
+{
+	/** HEURISTIC APPROACH:
+	 * Due to the computational complexity of full circularity detection,
+	 * we implement a heuristically adequate check: when we find 2 sequences
+	 * where one depends on the other, and the other has reached timeout,
+	 * we assume this is a likely gridlock. This approach is not
+	 * algorithmically complete (it may miss some complex circular
+	 * dependencies or flag false positives), but it is heuristically useful
+	 * for debugging and identifying potential concurrency issues in
+	 * practice.
+	 * NOTE: not implemented yet -- this stub ignores firstFailedQutex and
+	 * always returns false. See the file-local comment above for the
+	 * complete algorithm explanation.
+	 */
+
+	return false;
+}
+
+bool QutexAcquisitionHistoryTracker
+::completelyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
+const
+{
+	/** ALGORITHMICALLY COMPLETE VERSION:
+	 * This function is intended to implement the algorithmically complete
+	 * version of gridlock detection that performs full circularity detection.
+	 * This would involve building a dependency graph from the acquisition
+	 * history and using graph traversal algorithms (such as DFS with cycle
+	 * detection) to identify true circular dependencies.
+	 * NOTE: placeholder -- firstFailedQutex is unused and the result is
+	 * always false. See the file-local comment above for the complete
+	 * algorithm explanation.
+	 */
+
+	return false;
+}
+
+} // namespace smo
diff --git a/smocore/serializedAsynchronousContinuation.cpp b/smocore/serializedAsynchronousContinuation.cpp
index 2eb85c1..4d76bda 100644
--- a/smocore/serializedAsynchronousContinuation.cpp
+++ b/smocore/serializedAsynchronousContinuation.cpp
@@ -163,106 +163,6 @@ SerializedAsynchronousContinuation
 	return false;
 }
 
-/** EXPLANATION - GRIDLOCK DETECTION ALGORITHM:
- * This file implements gridlock detection algorithms that use a central
- * acquisition history to track all lockvokers suspected of being gridlocked.
- *
- * ALGORITHM OVERVIEW:
- * 1.
When a lockvoker finds that DEADLOCK_TIMEOUT_MS has elapsed and it - * still can't acquire a particular lock (firstFailedQutex), it creates - * a new entry in a global acquisition history. - * - * 2. The acquisition history is an unordered_map with: - * - Key: std::shared_ptr - * (the timed-out lockvoker -- aka, itself) - * - Value: std::pair< - * std::reference_wrapper, - * std::forward_list>> - * * pair.first: The firstFailedQutex that this lockvoker WANTS but - * can't acquire. This metadata is essential for later-arriving - * entrants to analyze what their predecessor timed-out sequences - * want. - * * pair.second: A list of all acquired Qutexes in this lockvoker's - * continuation history. - * - * 3. Each timed-out lockvoker: - * a) Adds itself to the acquisition history map with its wanted lock and - * acquired locks - * b) Iterates through all OTHER entries in the map (excluding itself) - * c) For each other entry, checks if that entry's acquired locks - * (pair.second) contains the lock that this lockvoker wants - * (aka: firstFailedQutex/pair.first) - * d) If found, we have detected a gridlock: two sequences where at least - * one wants a lock held by the other, and the other wants a lock that - * it can't acquire. - * - * GRIDLOCK CONDITION: - * A gridlock exists when we find a circular chain of dependencies: - * - Lockvoker A wants LockX but can't acquire it (held by Lockvoker B) - * - Lockvoker B wants LockY but can't acquire it (held by Lockvoker C, D, etc.) - * - The chain must be circular (eventually leading back to Lockvoker A or another - * lockvoker in the chain) to ensure it's a true gridlock, not just a delay - * - * TIMED DELAY, I/O DELAY, or LONG-RUNNING OPERATION FALSE-POSITIVE: - * Without circularity detection, we could incorrectly flag a simple delay, I/O - * delay, or long-running operation as a gridlock. For example: Lockvoker A - * wants LockX (held by Lockvoker B), and Lockvoker B is currently in a 10-second - * sleep/delay. 
When B wakes up, it will release LockX, allowing A to proceed. - * This is not a gridlock - it's just A waiting longer than DEADLOCK_TIMEOUT_MS - * for B to finish its work. True gridlocks require circular dependencies where - * no sequence can make progress because they're all waiting for each other in - * a cycle. - * - * The central history metadata enables us to detect complex gridlocks involving - * multiple lockvokers (2, 3, 4, 5+ sequences) by building up the acquisition - * history over time as different lockvokers timeout and add their information. - */ - -template -template -bool -SerializedAsynchronousContinuation -::LockerAndInvoker -::heuristicallyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex) -{ - /** HEURISTIC APPROACH: - * Due to the computational complexity of full circularity detection, - * we implement a heuristically adequate check: when we find 2 sequences - * where one depends on the other, and the other has reached timeout, - * we assume this is a likely gridlock. This approach is not - * algorithmically complete (it may miss some complex circular - * dependencies or flag false positives), but it is heuristically useful - * for debugging and identifying potential concurrency issues in - * practice. - * - * See the file-local comment above for the complete algorithm - * explanation. - */ - - return false; -} - -template -template -bool -SerializedAsynchronousContinuation -::LockerAndInvoker -::completelyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex) -{ - /** ALGORITHMICALLY COMPLETE VERSION: - * This function is intended to implement the algorithmically complete - * version of gridlock detection that performs full circularity detection. - * This would involve building a dependency graph from the acquisition - * history and using graph traversal algorithms (such as DFS with cycle - * detection) to identify true circular dependencies. 
- *
- * See the file-local comment above for the complete algorithm
- * explanation.
- */
-
-	return false;
-}
-
 template
 template
 void
@@ -278,8 +178,49 @@ SerializedAsynchronousContinuation
 		<< "ms, failed on qutex @" << &firstFailedQutex
 		<< " (" << firstFailedQutex.name << ")" << std::endl;
 }
+
 #endif
 
+/**
+ * Collect the qutexes named in the LockSets of every predecessor in this
+ * continuation's caller chain.
+ *
+ * This continuation's own requiredLocks are left out on purpose: it is the
+ * one that failed to acquire its locks and is backing off, so the walk
+ * begins at the caller's continuation.
+ *
+ * @return Newly allocated list of references to the predecessors' qutexes.
+ */
+template <class OriginalCbFnT>
+std::unique_ptr<std::forward_list<std::reference_wrapper<Qutex>>>
+SerializedAsynchronousContinuation<OriginalCbFnT>::getAcquiredQutexHistory()
+const
+{
+	using QutexRefList = std::forward_list<std::reference_wrapper<Qutex>>;
+
+	auto acquired = std::make_unique<QutexRefList>();
+	std::shared_ptr<AsynchronousContinuationChainLink> link =
+		this->getCallersContinuationShPtr();
+
+	while (link != nullptr)
+	{
+		auto predecessor = std::dynamic_pointer_cast<
+			SerializedAsynchronousContinuation<OriginalCbFnT>>(link);
+
+		// Links that are not of this continuation type are skipped
+		if (predecessor != nullptr)
+		{
+			for (auto& lockEntry : predecessor->requiredLocks.locks) {
+				acquired->push_front(lockEntry.first);
+			}
+		}
+
+		link = link->getCallersContinuationShPtr();
+	}
+
+	return acquired;
+}
+
 // Explicit template instantiations for the types we need
 // Add more as needed for your specific use cases