Add QutexAcquisitionHistoryTracker; integrate plumbing

We add the new Qutex acquisition history tracker that allows us
to dynamically detect qutex gridlocks. We've integrated it into
LockerAndInvoker::operator() in a preliminary way.

We also moved all of the trace*ForGridlockOn() methods into the
new QutexAcquisitionHistoryTracker singleton class. They're
more appropriately located there. They're still unimplemented
though.
This commit is contained in:
2025-09-29 19:27:02 -04:00
parent 8123ec1227
commit 71564b4d83
6 changed files with 325 additions and 106 deletions
@@ -13,8 +13,12 @@ namespace smo {
*
* The chain walking logic can use dynamic_cast to determine the most
* derived type and perform appropriate operations.
*
* Inherits from enable_shared_from_this to allow objects to obtain a
* shared_ptr to themselves, which is useful for gridlock detection tracking.
*/
class AsynchronousContinuationChainLink
: public std::enable_shared_from_this<AsynchronousContinuationChainLink>
{
public:
virtual ~AsynchronousContinuationChainLink() = default;
+126
View File
@@ -0,0 +1,126 @@
#ifndef QUTEX_ACQUISITION_HISTORY_TRACKER_H
#define QUTEX_ACQUISITION_HISTORY_TRACKER_H
#include <unordered_map>
#include <memory>
#include <forward_list>
#include <functional>
namespace smo {
// Forward declarations
class Qutex;
class AsynchronousContinuationChainLink;
/**
* @brief QutexAcquisitionHistoryTracker - Tracks acquisition history for
* gridlock detection
*
* This class maintains a central acquisition history to track all lockvokers
* suspected of being gridlocked. It stores information about what locks each
* timed-out lockvoker wants and what locks they hold in their continuation
* history.
*/
class QutexAcquisitionHistoryTracker
{
public:
/**
* @brief Type definition for the acquisition history entry
*
* pair.first: The firstFailedQutex that this lockvoker WANTS but can't
* acquire
* pair.second: A unique_ptr to a list of all acquired Qutexes in this
* lockvoker's continuation history
*/
typedef std::pair<
std::reference_wrapper<Qutex>,
std::unique_ptr<std::forward_list<std::reference_wrapper<Qutex>>>
> AcquisitionHistoryEntry;
/**
* @brief Type definition for the acquisition history map
*
* Key: std::shared_ptr<AsynchronousContinuationChainLink>
* (the continuation that contains the timed-out lockvoker)
* Value: AcquisitionHistoryEntry
* (its wanted lock (aka: firstFailedQutex/pair.first) + held locks)
*/
typedef std::unordered_map<
std::shared_ptr<AsynchronousContinuationChainLink>,
AcquisitionHistoryEntry
> AcquisitionHistoryMap;
public:
static QutexAcquisitionHistoryTracker& getInstance()
{
static QutexAcquisitionHistoryTracker instance;
return instance;
}
/**
* @brief Add a continuation to the acquisition history if it doesn't
* already exist
* @param continuation Shared pointer to the
* AsynchronousContinuationChainLink
* @param wantedLock The lock that this continuation wants but can't
* acquire
* @param heldLocks Unique pointer to list of locks held in this
* continuation's history (will be moved)
*/
void addIfNotExists(
std::shared_ptr<AsynchronousContinuationChainLink> &continuation,
Qutex& wantedLock,
std::unique_ptr<std::forward_list<std::reference_wrapper<Qutex>>>
heldLocks
)
{
auto it = acquisitionHistory.find(continuation);
// If a continuation already exists, don't add it again
if (it != acquisitionHistory.end()) {
return;
}
acquisitionHistory.emplace(continuation, std::make_pair(
std::ref(wantedLock), std::move(heldLocks)));
}
/**
* @brief Remove a continuation from the acquisition history
* @param continuation Shared pointer to the AsynchronousContinuationChainLink
* to remove
* @return true if the continuation was found and removed, false if not found
*/
bool remove(std::shared_ptr<AsynchronousContinuationChainLink> &continuation)
{
auto it = acquisitionHistory.find(continuation);
if (it != acquisitionHistory.end()) {
acquisitionHistory.erase(it);
return true;
}
return false;
}
bool heuristicallyTraceContinuationHistoryForGridlockOn(
Qutex &firstFailedQutex) const;
bool completelyTraceContinuationHistoryForGridlockOn(
Qutex &firstFailedQutex) const;
// Disable copy constructor and assignment operator
QutexAcquisitionHistoryTracker(
const QutexAcquisitionHistoryTracker&) = delete;
QutexAcquisitionHistoryTracker& operator=(
const QutexAcquisitionHistoryTracker&) = delete;
private:
QutexAcquisitionHistoryTracker() = default;
~QutexAcquisitionHistoryTracker() = default;
private:
AcquisitionHistoryMap acquisitionHistory;
};
} // namespace smo
#endif // QUTEX_ACQUISITION_HISTORY_TRACKER_H
+54 -6
View File
@@ -12,6 +12,7 @@
#include <asynchronousContinuation.h>
#include <lockerAndInvokerBase.h>
#include <callback.h>
#include <qutexAcquisitionHistoryTracker.h>
namespace smo {
@@ -36,6 +37,10 @@ public:
std::forward<Args>(args)...);
}
// Return list of all qutexes in predecessors' LockSets; excludes self.
std::unique_ptr<std::forward_list<std::reference_wrapper<Qutex>>>
getAcquiredQutexHistory() const;
public:
LockSet<OriginalCbFnT> requiredLocks;
std::atomic<bool> isAwakeOrBeingAwakened{false};
@@ -116,8 +121,33 @@ public:
bool isDeadlock = traceContinuationHistoryForDeadlockOn(
firstFailedQutex);
bool isGridlock = heuristicallyTraceContinuationHistoryForGridlockOn(
firstFailedQutex);
bool gridlockIsHeuristicallyLikely = false;
bool gridlockIsAlgorithmicallyLikely = false;
if (gridlockLikely)
{
    /* getInstance() returns a reference to a non-copyable singleton, so it
     * must be bound by reference; plain `auto` would try to invoke the
     * deleted copy constructor.
     */
    auto &tracker = QutexAcquisitionHistoryTracker::getInstance();

    auto heldLocks = serializedContinuation.getAcquiredQutexHistory();

    /* Keep the shared_ptr in a named lvalue: it binds to the tracker's
     * reference parameter whether or not that parameter is const-qualified,
     * whereas the temporary returned by shared_from_this() cannot bind to a
     * non-const lvalue reference.
     */
    std::shared_ptr<AsynchronousContinuationChainLink> thisContinuation =
        serializedContinuation.shared_from_this();

    // Add this continuation to the tracker
    tracker.addIfNotExists(
        thisContinuation, firstFailedQutex, std::move(heldLocks));

    gridlockIsHeuristicallyLikely = tracker
        .heuristicallyTraceContinuationHistoryForGridlockOn(
            firstFailedQutex);
    gridlockIsAlgorithmicallyLikely = tracker
        .completelyTraceContinuationHistoryForGridlockOn(
            firstFailedQutex);
}
bool isGridlock = (gridlockIsHeuristicallyLikely
|| gridlockIsAlgorithmicallyLikely);
if (!isDeadlock && !isGridlock)
{ return; }
@@ -145,6 +175,28 @@ public:
* can't acquire the locks anyway.
*/
serializedContinuation.requiredLocks.unregisterFromQutexQueues();
#ifdef CONFIG_ENABLE_DEBUG_LOCKS
/** EXPLANATION:
* If we were being tracked for gridlock detection but successfully
* acquired all locks, it was a false positive due to timed delay,
* long-running operation, or I/O delay
*/
if (gridlockLikely)
{
bool removed = QutexAcquisitionHistoryTracker::getInstance()
.remove(serializedContinuation.shared_from_this());
if (removed)
{
std::cerr << "LockerAndInvoker::operator(): False positive gridlock "
"detection - continuation was being tracked but successfully "
"acquired all locks. This was likely due to timed delay, "
"long-running operation, or I/O delay." << std::endl;
}
}
#endif
invocationTarget();
}
@@ -227,10 +279,6 @@ public:
};
bool traceContinuationHistoryForDeadlockOn(Qutex &firstFailedQutex);
bool heuristicallyTraceContinuationHistoryForGridlockOn(
Qutex &firstFailedQutex);
bool completelyTraceContinuationHistoryForGridlockOn(
Qutex &firstFailedQutex);
bool traceContinuationHistoryForDeadlock(void)
{
for (auto& lockUsageDesc
+1
View File
@@ -13,6 +13,7 @@ add_library(smocore STATIC
lockerAndInvokerBase.cpp
lockSet.cpp
serializedAsynchronousContinuation.cpp
qutexAcquisitionHistoryTracker.cpp
# Body
body/body.cpp
+104
View File
@@ -0,0 +1,104 @@
#include "qutexAcquisitionHistoryTracker.h"
#include "serializedAsynchronousContinuation.h"
#include "qutex.h"
#include <memory>
#include <forward_list>
#include <functional>
namespace smo {
/** EXPLANATION - GRIDLOCK DETECTION ALGORITHM:
* This file implements gridlock detection algorithms that use a central
* acquisition history to track all lockvokers suspected of being gridlocked.
*
* ALGORITHM OVERVIEW:
* 1. When a lockvoker finds that DEADLOCK_TIMEOUT_MS has elapsed and it
* still can't acquire a particular lock (firstFailedQutex), it creates
* a new entry in a global acquisition history.
*
* 2. The acquisition history is an unordered_map with:
* - Key: std::shared_ptr<AsynchronousContinuationChainLink>
* (the timed-out lockvoker's continuation)
* - Value: std::pair<
* std::reference_wrapper<Qutex>,
* std::unique_ptr<std::forward_list<std::reference_wrapper<Qutex>>>>
* * pair.first: The firstFailedQutex that this lockvoker WANTS but
* can't acquire. This metadata is essential for later-arriving
* entrants to analyze what their predecessor timed-out sequences
* want.
* * pair.second: A unique_ptr to a list of all acquired Qutexes in this
* lockvoker's continuation history.
*
* 3. Each timed-out lockvoker:
* a) Adds itself to the acquisition history map with its wanted lock and
* acquired locks
* b) Iterates through all OTHER entries in the map (excluding itself)
* c) For each other entry, checks if that entry's acquired locks
* (pair.second) contains the lock that this lockvoker wants
* (aka: firstFailedQutex/pair.first)
* d) If found, we have detected a gridlock: two sequences where at least
* one wants a lock held by the other, and the other wants a lock that
* it can't acquire.
*
* GRIDLOCK CONDITION:
* A gridlock exists when we find a circular chain of dependencies:
* - Lockvoker A wants LockX but can't acquire it (held by Lockvoker B)
* - Lockvoker B wants LockY but can't acquire it (held by Lockvoker C, D, etc.)
* - The chain must be circular (eventually leading back to Lockvoker A or another
* lockvoker in the chain) to ensure it's a true gridlock, not just a delay
*
* TIMED DELAY, I/O DELAY, or LONG-RUNNING OPERATION FALSE-POSITIVE:
* Without circularity detection, we could incorrectly flag a simple delay, I/O
* delay, or long-running operation as a gridlock. For example: Lockvoker A
* wants LockX (held by Lockvoker B), and Lockvoker B is currently in a 10-second
* sleep/delay. When B wakes up, it will release LockX, allowing A to proceed.
* This is not a gridlock - it's just A waiting longer than DEADLOCK_TIMEOUT_MS
* for B to finish its work. True gridlocks require circular dependencies where
* no sequence can make progress because they're all waiting for each other in
* a cycle.
*
* The central history metadata enables us to detect complex gridlocks involving
* multiple lockvokers (2, 3, 4, 5+ sequences) by building up the acquisition
* history over time as different lockvokers time out and add their information.
*/
bool QutexAcquisitionHistoryTracker
::heuristicallyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
    const
{
    /** HEURISTIC APPROACH (intended design):
     * Full circularity detection is computationally expensive, so the plan
     * for this function is a cheaper, heuristically adequate check: when 2
     * sequences are found where one depends on the other, and the other has
     * itself reached timeout, assume a gridlock is likely. Such a check is
     * not algorithmically complete (it may miss some complex circular
     * dependencies or flag false positives), but it is useful in practice
     * for debugging and for spotting potential concurrency issues.
     *
     * See the file-local comment above for the complete algorithm
     * explanation.
     *
     * CURRENT STATE:
     * Placeholder only. No analysis is performed and the function always
     * reports "no gridlock".
     */
    static_cast<void>(firstFailedQutex); // Unused until implemented.
    return false;
}
bool QutexAcquisitionHistoryTracker
::completelyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
    const
{
    /** ALGORITHMICALLY COMPLETE VERSION (intended design):
     * This is where full circularity detection is meant to live: build a
     * dependency graph from the acquisition history, then run a graph
     * traversal (such as DFS with cycle detection) over it to identify true
     * circular dependencies.
     *
     * See the file-local comment above for the complete algorithm
     * explanation.
     *
     * CURRENT STATE:
     * Placeholder only. No graph is built and the function always reports
     * "no gridlock".
     */
    static_cast<void>(firstFailedQutex); // Unused until implemented.
    return false;
}
} // namespace smo
+36 -100
View File
@@ -163,106 +163,6 @@ SerializedAsynchronousContinuation<OriginalCbFnT>
return false;
}
/** EXPLANATION - GRIDLOCK DETECTION ALGORITHM:
* This file implements gridlock detection algorithms that use a central
* acquisition history to track all lockvokers suspected of being gridlocked.
*
* ALGORITHM OVERVIEW:
* 1. When a lockvoker finds that DEADLOCK_TIMEOUT_MS has elapsed and it
* still can't acquire a particular lock (firstFailedQutex), it creates
* a new entry in a global acquisition history.
*
* 2. The acquisition history is an unordered_map with:
* - Key: std::shared_ptr<LockerAndInvokerBase>
* (the timed-out lockvoker -- aka, itself)
* - Value: std::pair<
* std::reference_wrapper<Qutex>,
* std::forward_list<std::reference_wrapper<Qutex>>>
* * pair.first: The firstFailedQutex that this lockvoker WANTS but
* can't acquire. This metadata is essential for later-arriving
* entrants to analyze what their predecessor timed-out sequences
* want.
* * pair.second: A list of all acquired Qutexes in this lockvoker's
* continuation history.
*
* 3. Each timed-out lockvoker:
* a) Adds itself to the acquisition history map with its wanted lock and
* acquired locks
* b) Iterates through all OTHER entries in the map (excluding itself)
* c) For each other entry, checks if that entry's acquired locks
* (pair.second) contains the lock that this lockvoker wants
* (aka: firstFailedQutex/pair.first)
* d) If found, we have detected a gridlock: two sequences where at least
* one wants a lock held by the other, and the other wants a lock that
* it can't acquire.
*
* GRIDLOCK CONDITION:
* A gridlock exists when we find a circular chain of dependencies:
* - Lockvoker A wants LockX but can't acquire it (held by Lockvoker B)
* - Lockvoker B wants LockY but can't acquire it (held by Lockvoker C, D, etc.)
* - The chain must be circular (eventually leading back to Lockvoker A or another
* lockvoker in the chain) to ensure it's a true gridlock, not just a delay
*
* TIMED DELAY, I/O DELAY, or LONG-RUNNING OPERATION FALSE-POSITIVE:
* Without circularity detection, we could incorrectly flag a simple delay, I/O
* delay, or long-running operation as a gridlock. For example: Lockvoker A
* wants LockX (held by Lockvoker B), and Lockvoker B is currently in a 10-second
* sleep/delay. When B wakes up, it will release LockX, allowing A to proceed.
* This is not a gridlock - it's just A waiting longer than DEADLOCK_TIMEOUT_MS
* for B to finish its work. True gridlocks require circular dependencies where
* no sequence can make progress because they're all waiting for each other in
* a cycle.
*
* The central history metadata enables us to detect complex gridlocks involving
* multiple lockvokers (2, 3, 4, 5+ sequences) by building up the acquisition
* history over time as different lockvokers timeout and add their information.
*/
template <class OriginalCbFnT>
template <class InvocationTargetT>
bool
SerializedAsynchronousContinuation<OriginalCbFnT>
::LockerAndInvoker<InvocationTargetT>
::heuristicallyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
{
    /** HEURISTIC APPROACH (intended design):
     * Full circularity detection is computationally expensive, so the plan
     * for this function is a cheaper, heuristically adequate check: when 2
     * sequences are found where one depends on the other, and the other has
     * itself reached timeout, assume a gridlock is likely. Such a check is
     * not algorithmically complete (it may miss some complex circular
     * dependencies or flag false positives), but it is useful in practice
     * for debugging and for spotting potential concurrency issues.
     *
     * See the file-local comment above for the complete algorithm
     * explanation.
     *
     * CURRENT STATE:
     * Placeholder only. No analysis is performed and the function always
     * reports "no gridlock".
     */
    static_cast<void>(firstFailedQutex); // Unused until implemented.
    return false;
}
template <class OriginalCbFnT>
template <class InvocationTargetT>
bool
SerializedAsynchronousContinuation<OriginalCbFnT>
::LockerAndInvoker<InvocationTargetT>
::completelyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
{
    /** ALGORITHMICALLY COMPLETE VERSION (intended design):
     * This is where full circularity detection is meant to live: build a
     * dependency graph from the acquisition history, then run a graph
     * traversal (such as DFS with cycle detection) over it to identify true
     * circular dependencies.
     *
     * See the file-local comment above for the complete algorithm
     * explanation.
     *
     * CURRENT STATE:
     * Placeholder only. No graph is built and the function always reports
     * "no gridlock".
     */
    static_cast<void>(firstFailedQutex); // Unused until implemented.
    return false;
}
template <class OriginalCbFnT>
template <class InvocationTargetT>
void
@@ -278,8 +178,44 @@ SerializedAsynchronousContinuation<OriginalCbFnT>
<< "ms, failed on qutex @" << &firstFailedQutex
<< " (" << firstFailedQutex.name << ")" << std::endl;
}
#endif
template <class OriginalCbFnT>
std::unique_ptr<std::forward_list<std::reference_wrapper<Qutex>>>
SerializedAsynchronousContinuation<OriginalCbFnT>::getAcquiredQutexHistory()
    const
{
    using QutexRefList = std::forward_list<std::reference_wrapper<Qutex>>;
    using SameTypedContinuation =
        SerializedAsynchronousContinuation<OriginalCbFnT>;

    auto collectedLocks = std::make_unique<QutexRefList>();

    /** EXPLANATION:
     * Gather the qutexes listed in the LockSets of every predecessor in the
     * continuation chain.
     *
     * This continuation's own LockSet is deliberately left out: it is the
     * one that is failing to acquire and backing off, so the walk begins at
     * the caller's continuation and follows the chain until it ends.
     *
     * NOTE(review): only predecessors that are
     * SerializedAsynchronousContinuation<OriginalCbFnT> (the same callback
     * type as this object) pass the dynamic cast below; any other link type
     * contributes nothing. Confirm that this is intended.
     */
    std::shared_ptr<AsynchronousContinuationChainLink> link =
        this->getCallersContinuationShPtr();

    while (link != nullptr)
    {
        if (auto predecessor =
            std::dynamic_pointer_cast<SameTypedContinuation>(link))
        {
            // push_front: the resulting list is in reverse visiting order.
            for (auto &lockEntry : predecessor->requiredLocks.locks)
                { collectedLocks->push_front(lockEntry.first); }
        }

        link = link->getCallersContinuationShPtr();
    }

    return collectedLocks;
}
// Explicit template instantiations for the types we need
// Add more as needed for your specific use cases