2025-09-29 19:27:02 -04:00
|
|
|
#include "qutexAcquisitionHistoryTracker.h"
|
|
|
|
|
#include "serializedAsynchronousContinuation.h"
|
|
|
|
|
#include "qutex.h"
|
|
|
|
|
#include <memory>
|
|
|
|
|
#include <forward_list>
|
|
|
|
|
#include <functional>
|
2025-09-29 20:47:04 -04:00
|
|
|
#include <iostream>
|
2025-09-29 19:27:02 -04:00
|
|
|
|
|
|
|
|
namespace smo {
|
|
|
|
|
|
|
|
|
|
/**	EXPLANATION - GRIDLOCK DETECTION ALGORITHM:
 * This file implements gridlock detection algorithms that use a central
 * acquisition history to track all lockvokers suspected of being gridlocked.
 *
 * ALGORITHM OVERVIEW:
 * 1. When a lockvoker finds that DEADLOCK_TIMEOUT_MS has elapsed and it
 *    still can't acquire a particular lock (firstFailedQutex), it creates
 *    a new entry in a global acquisition history.
 *
 * 2. The acquisition history is an unordered_map with:
 *    - Key: std::shared_ptr<AsynchronousContinuationChainLink>
 *      (the timed-out lockvoker's continuation)
 *    - Value: std::pair<
 *        std::reference_wrapper<Qutex>,
 *        std::unique_ptr<std::forward_list<std::reference_wrapper<Qutex>>>>
 *      * pair.first: The firstFailedQutex that this lockvoker WANTS but
 *        can't acquire. This metadata is essential for later-arriving
 *        entrants to analyze what their predecessor timed-out sequences
 *        want.
 *      * pair.second: A unique_ptr to a list of all acquired Qutexes in this
 *        lockvoker's continuation history.
 *
 * 3. Each timed-out lockvoker:
 *    a) Adds itself to the acquisition history map with its wanted lock and
 *       acquired locks
 *    b) Iterates through all OTHER entries in the map (excluding itself)
 *    c) For each other entry, checks if that entry's acquired locks
 *       (pair.second) contains the lock that this lockvoker wants
 *       (aka: firstFailedQutex/pair.first)
 *    d) If found, we have detected a gridlock: two sequences where at least
 *       one wants a lock held by the other, and the other wants a lock that
 *       it can't acquire.
 *
 * GRIDLOCK CONDITION:
 * A gridlock exists when we find a circular chain of dependencies:
 * - Lockvoker A wants LockX but can't acquire it (held by Lockvoker B)
 * - Lockvoker B wants LockY but can't acquire it (held by Lockvoker C, D, etc.)
 * - The chain must be circular (eventually leading back to Lockvoker A or
 *   another lockvoker in the chain) to ensure it's a true gridlock, not just
 *   a delay
 *
 * TIMED DELAY, I/O DELAY, or LONG-RUNNING OPERATION FALSE-POSITIVE:
 * Without circularity detection, we could incorrectly flag a simple delay, I/O
 * delay, or long-running operation as a gridlock. For example: Lockvoker A
 * wants LockX (held by Lockvoker B), and Lockvoker B is currently in a
 * 10-second sleep/delay. When B wakes up, it will release LockX, allowing A to
 * proceed. This is not a gridlock - it's just A waiting longer than
 * DEADLOCK_TIMEOUT_MS for B to finish its work. True gridlocks require
 * circular dependencies where no sequence can make progress because they're
 * all waiting for each other in a cycle.
 *
 * The central history metadata enables us to detect complex gridlocks involving
 * multiple lockvokers (2, 3, 4, 5+ sequences) by building up the acquisition
 * history over time as different lockvokers time out and add their
 * information.
 */
|
|
|
|
|
|
|
|
|
|
bool QutexAcquisitionHistoryTracker
|
2025-09-29 20:02:23 -04:00
|
|
|
::heuristicallyTraceContinuationHistoryForGridlockOn(
|
|
|
|
|
Qutex &firstFailedQutex,
|
|
|
|
|
std::shared_ptr<AsynchronousContinuationChainLink>& currentContinuation)
|
2025-09-29 19:27:02 -04:00
|
|
|
{
|
|
|
|
|
/** HEURISTIC APPROACH:
|
|
|
|
|
* Due to the computational complexity of full circularity detection,
|
|
|
|
|
* we implement a heuristically adequate check: when we find 2 sequences
|
|
|
|
|
* where one depends on the other, and the other has reached timeout,
|
|
|
|
|
* we assume this is a likely gridlock. This approach is not
|
|
|
|
|
* algorithmically complete (it may miss some complex circular
|
|
|
|
|
* dependencies or flag false positives), but it is heuristically useful
|
|
|
|
|
* for debugging and identifying potential concurrency issues in
|
|
|
|
|
* practice.
|
|
|
|
|
*
|
|
|
|
|
* See the file-local comment above for the complete algorithm
|
|
|
|
|
* explanation.
|
|
|
|
|
*/
|
|
|
|
|
|
2025-09-29 20:34:56 -04:00
|
|
|
/** NOTICE:
|
|
|
|
|
* Generally we should have all global data structures owned by a single
|
|
|
|
|
* ComponentThread; and qutexes really should only be used to serialize
|
|
|
|
|
* async sequences being enqueued on the same ComponentThread. But this
|
|
|
|
|
* doesn't prevent multiple CPUs from trying to add/remove entries to/from
|
|
|
|
|
* the acquisition history at the same time. Why? The acquisition history
|
|
|
|
|
* isn't per-CPU, it's global.
|
|
|
|
|
*
|
|
|
|
|
* The problem with using a SpinLock here is that if the STL uses mutexes
|
|
|
|
|
* internally to lock containers, we could end up in a situation where
|
|
|
|
|
* spinning waiters will be busy-spinning while the owner is sleeping?
|
|
|
|
|
*
|
|
|
|
|
* But this should not happen since the nature of the order of operations is
|
|
|
|
|
* that the spinlock ensures that only one CPU at a time can be
|
|
|
|
|
* adding/removing entries; and thus everytime an method is called on the
|
|
|
|
|
* unordered_map, the caller will always succeed at acquiring the underlying
|
|
|
|
|
* STL mutex.
|
|
|
|
|
*
|
|
|
|
|
* So it should be safe to use a SpinLock here.
|
|
|
|
|
*/
|
|
|
|
|
acquisitionHistoryLock.acquire();
|
|
|
|
|
|
2025-09-29 20:02:23 -04:00
|
|
|
// Iterate through all entries in the acquisition history
|
|
|
|
|
for (const auto& entry : acquisitionHistory) {
|
|
|
|
|
const auto& continuation = entry.first;
|
|
|
|
|
const auto& historyEntry = entry.second;
|
|
|
|
|
|
|
|
|
|
// Skip the current continuation (don't compare with itself)
|
|
|
|
|
if (continuation == currentContinuation) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Check if firstFailedQutex is in this continuation's held locks
|
|
|
|
|
const std::unique_ptr<std::forward_list<std::reference_wrapper<Qutex>>>&
|
|
|
|
|
heldLocks = historyEntry.second;
|
|
|
|
|
|
|
|
|
|
if (!heldLocks)
|
|
|
|
|
{ continue; }
|
|
|
|
|
|
|
|
|
|
for (const auto& heldLock : *heldLocks)
|
|
|
|
|
{
|
|
|
|
|
/* Found firstFailedQutex in another continuation's held locks
|
|
|
|
|
* This indicates a potential gridlock
|
|
|
|
|
*/
|
2025-09-29 20:47:04 -04:00
|
|
|
if (&heldLock.get() != &firstFailedQutex)
|
|
|
|
|
{ continue; }
|
|
|
|
|
|
|
|
|
|
acquisitionHistoryLock.release();
|
|
|
|
|
|
|
|
|
|
std::cerr << __func__ << ": GRIDLOCK DETECTED: Current "
|
|
|
|
|
"continuation @" << currentContinuation.get()
|
|
|
|
|
<< " wants lock '" << firstFailedQutex.name
|
|
|
|
|
<< "' which is held by continuation @"
|
|
|
|
|
<< continuation.get() << std::endl;
|
|
|
|
|
|
|
|
|
|
return true;
|
2025-09-29 20:02:23 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2025-09-29 20:34:56 -04:00
|
|
|
acquisitionHistoryLock.release();
|
2025-09-29 19:27:02 -04:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool QutexAcquisitionHistoryTracker
|
|
|
|
|
::completelyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
|
|
|
|
|
{
|
2025-09-29 20:48:49 -04:00
|
|
|
(void)firstFailedQutex;
|
|
|
|
|
|
2025-09-29 19:27:02 -04:00
|
|
|
/** ALGORITHMICALLY COMPLETE VERSION:
|
|
|
|
|
* This function is intended to implement the algorithmically complete
|
|
|
|
|
* version of gridlock detection that performs full circularity detection.
|
|
|
|
|
* This would involve building a dependency graph from the acquisition
|
|
|
|
|
* history and using graph traversal algorithms (such as DFS with cycle
|
|
|
|
|
* detection) to identify true circular dependencies.
|
|
|
|
|
*
|
|
|
|
|
* See the file-local comment above for the complete algorithm
|
|
|
|
|
* explanation.
|
|
|
|
|
*/
|
|
|
|
|
|
2025-09-29 20:34:56 -04:00
|
|
|
// acquisitionHistoryLock.acquire();
|
|
|
|
|
// TODO: Implement full circularity detection algorithm
|
|
|
|
|
// acquisitionHistoryLock.release();
|
2025-09-29 19:27:02 -04:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
} // namespace smo
|