// salmanoff/smocore/serializedAsynchronousContinuation.cpp

#include <config.h>
#include <serializedAsynchronousContinuation.h>
#include <qutex.h>

namespace smo {

#ifdef CONFIG_ENABLE_DEBUG_LOCKS

template <class OriginalCbFnT>
template <class InvocationTargetT>
bool
SerializedAsynchronousContinuation<OriginalCbFnT>
::LockerAndInvoker<InvocationTargetT>
::traceContinuationHistoryForDeadlockOn(Qutex& firstFailedQutex)
{
	/**	EXPLANATION:
	 * In this function we trace through the chain of continuations that
	 * led up to this Lockvoker's continuation. For each continuation which is
	 * a SerializedAsynchronousContinuation<OriginalCbFnT>, we look up the
	 * lock that failed acquisition in its LockSet. If the lookup succeeds, we
	 * have a deadlock.
	 *
	 *	RETURNS:
	 * true (after logging the match to std::cout) for the first ancestor
	 * continuation whose LockSet contains firstFailedQutex; false once the
	 * chain has been exhausted without a match.
	 *
	 *	NOTE:
	 * Chain links for which the dynamic cast to
	 * SerializedAsynchronousContinuation<OriginalCbFnT> fails are skipped
	 * without being inspected.
	 */

	/* We can't start with the continuation directly referenced by this
	 * Lockvoker, as it contains all of the locks we're currently trying to
	 * acquire...and rightly so, because it's the continuation for this
	 * current lockvoker. So the walk begins at our caller's continuation.
	 */
	for (std::shared_ptr<AsynchronousContinuationChainLink> currContin =
			this->serializedContinuation.getCallersContinuationShPtr();
		 currContin != nullptr;
		 currContin = currContin->getCallersContinuationShPtr())
	{
		auto serializedCont = std::dynamic_pointer_cast<
			SerializedAsynchronousContinuation<OriginalCbFnT>>(currContin);

		if (serializedCont == nullptr) { continue; }

		/* Check if the firstFailedQutex is in this continuation's LockSet.
		 * Only whether the lookup succeeds matters, so the returned
		 * descriptor is explicitly discarded rather than bound to an unused
		 * local. A std::runtime_error from the lookup is treated as "not in
		 * this LockSet": it is logged and the walk moves on.
		 */
		try {
			static_cast<void>(serializedCont->requiredLocks
				.getLockUsageDesc(firstFailedQutex));
		} catch (const std::runtime_error& e) {
			std::cerr << __func__ << ": " << e.what() << std::endl;
			continue;
		}

		std::cout << __func__ << ":Deadlock detected: Found "
			<< "firstFailedQutex @" << &firstFailedQutex
			<< " (" << firstFailedQutex.name << ") in LockSet of "
			<< "SerializedAsynchronousContinuation @"
			<< serializedCont.get() << std::endl;

		return true;
	}

	return false;
}

template <class OriginalCbFnT>
template <class InvocationTargetT>
bool
SerializedAsynchronousContinuation<OriginalCbFnT>
::LockerAndInvoker<InvocationTargetT>
::obsolete::traceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
{
	/**	EXPLANATION:
	 * In this function we check for gridlocks which are slightly different
	 * from deadlocks. In a gridlock, two requests are waiting for locks that
	 * are held by the other. I.e:
	 *
	 * R1 holds LockA and is waiting for LockB.
	 * R2 holds LockB and is waiting for LockA.
	 *
	 * This differs from deadlocks because it's not a single request which is
	 * attempting to re-acquire a lock that it already holds.
	 *
	 * To detect this condition, we wait until the acquisition timeout has
	 * expired. Then: we extract the current owner of the first lock we're
	 * failing to acquire.
	 *
	 * From there, we go through each of the locks in the foreign owner's
	 * current (i.e: immediate, most recent continuation's) required LockSet.
	 * For each of the locks in the foreign owner's most immediate required
	 * LockSet, we trace backward in our *OWN* history to see if any of *OUR*
	 * continuations (excluding our most immediate continuation) contains that
	 * lock.
	 *
	 * If we find a match, that means that we're holding a lock that the foreign
	 * owner is waiting for. And we already know that the foreign owner is
	 * holding a lock that we're waiting for (when we extracted the current
	 * owner of the first failed lock in our most immediate Lockset).
	 *
	 * Hence, we have a gridlock.
	 *
	 *	RETURNS:
	 * true (after logging the match to std::cout) on the first match found;
	 * false if firstFailedQutex has no current owner, or if no match is found.
	 */

	std::shared_ptr<LockerAndInvokerBase> foreignOwnerShPtr =
		firstFailedQutex.getCurrOwner();
	// If no current owner, can't be a gridlock
	if (foreignOwnerShPtr == nullptr)
		{ return false; }

	// Use reference for the rest of the function for safety.
	LockerAndInvokerBase &foreignOwner = *foreignOwnerShPtr;

	/* For each lock in the foreign owner's LockSet, check if we hold it
	 * in any of our previous continuations (excluding our most immediate one)
	 */
	for (size_t i = 0; i < foreignOwner.getLockSetSize(); ++i)
	{
		Qutex& foreignLock = foreignOwner.getLockAt(i);

		/* Skip the firstFailedQutex since we already know the foreign owner
		 * holds it -- hence it's impossible for any of our previous
		 * continuations to hold it.
		 */
		if (&foreignLock == &firstFailedQutex)
			{ continue; }

		/**	EXPLANATION:
		 * Trace backward through our continuation history (excluding our most
		 * immediate continuation).
		 *
		 * The reason we exclude our most immediate continuation is because the
		 * LockSet acquisition algorithm backs off if it fails to acquire ALL
		 * locks in the set. So if the lock that the foreign owner is waiting
		 * for is in our most immediate continuation, and NOT in one of our
		 * previous continuations, then we will back off and the foreign owner
		 * should eventually be able to acquire that lock.
		 */
		for (std::shared_ptr<AsynchronousContinuationChainLink> currContin =
				this->serializedContinuation.getCallersContinuationShPtr();
			 currContin != nullptr;
			 currContin = currContin->getCallersContinuationShPtr())
		{
			auto serializedCont = std::dynamic_pointer_cast<
				SerializedAsynchronousContinuation<OriginalCbFnT>>(currContin);

			if (serializedCont == nullptr) { continue; }

			/* Check if this continuation holds the foreign lock. Only the
			 * success or failure of the lookup matters, so the returned
			 * descriptor is explicitly discarded. The try block guards the
			 * lookup alone so that a std::runtime_error raised anywhere
			 * else cannot be mistaken for "lock not held".
			 */
			try {
				static_cast<void>(serializedCont->requiredLocks
					.getLockUsageDesc(foreignLock));
			} catch (const std::runtime_error&) {
				// This continuation doesn't hold the foreign lock. Continue.
				continue;
			}

			// Matched! We hold a lock that the foreign owner is waiting for
			std::cout << __func__ << ": Gridlock detected: We hold lock @"
				<< &foreignLock << " (" << foreignLock.name << ") in "
				"continuation @" << serializedCont.get()
				<< ", while foreign owner @" << &foreignOwner
				<< " holds lock @" << &firstFailedQutex << " ("
				<< firstFailedQutex.name << ") that we're waiting for"
				<< std::endl;

			return true;
		}
	}

	return false;
}

/**	EXPLANATION - GRIDLOCK DETECTION ALGORITHM:
 * This file implements gridlock detection algorithms that use a central
 * acquisition history to track all lockvokers suspected of being gridlocked.
 *
 *	ALGORITHM OVERVIEW:
 * 1. When a lockvoker finds that DEADLOCK_TIMEOUT_MS has elapsed and it
 *    still can't acquire a particular lock (firstFailedQutex), it creates
 *    a new entry in a global acquisition history.
 *
 * 2. The acquisition history is an unordered_map with:
 *    - Key: std::shared_ptr<LockerAndInvokerBase>
 *		(the timed-out lockvoker -- aka, itself)
 *    - Value: std::pair<
 *					std::reference_wrapper<Qutex>,
 *					std::forward_list<std::reference_wrapper<Qutex>>>
 *      * pair.first: The firstFailedQutex that this lockvoker WANTS but
 *			can't acquire. This metadata is essential for later-arriving
 *			entrants to analyze what their predecessor timed-out sequences
 *			want.
 *      * pair.second: A list of all acquired Qutexes in this lockvoker's
 *			continuation history.
 *
 * 3. Each timed-out lockvoker:
 *    a) Adds itself to the acquisition history map with its wanted lock and
 *		acquired locks
 *    b) Iterates through all OTHER entries in the map (excluding itself)
 *    c) For each other entry, checks if that entry's acquired locks
 *		(pair.second) contains the lock that this lockvoker wants
 *		(aka: firstFailedQutex/pair.first)
 *    d) If found, we have detected a gridlock: two sequences where at least
 *		one wants a lock held by the other, and the other wants a lock that
 *		it can't acquire.
 *
 *	GRIDLOCK CONDITION:
 * A gridlock exists when we find a circular chain of dependencies:
 * - Lockvoker A wants LockX but can't acquire it (held by Lockvoker B)
 * - Lockvoker B wants LockY but can't acquire it (held by Lockvoker C, D, etc.)
 * - The chain must be circular (eventually leading back to Lockvoker A or another
 *   lockvoker in the chain) to ensure it's a true gridlock, not just a delay
 *
 *	TIMED DELAY, I/O DELAY, or LONG-RUNNING OPERATION FALSE-POSITIVE:
 * Without circularity detection, we could incorrectly flag a simple delay, I/O
 * delay, or long-running operation as a gridlock. For example: Lockvoker A
 * wants LockX (held by Lockvoker B), and Lockvoker B is currently in a 10-second
 * sleep/delay. When B wakes up, it will release LockX, allowing A to proceed.
 * This is not a gridlock - it's just A waiting longer than DEADLOCK_TIMEOUT_MS
 * for B to finish its work. True gridlocks require circular dependencies where
 * no sequence can make progress because they're all waiting for each other in
 * a cycle.
 *
 * The central history metadata enables us to detect complex gridlocks involving
 * multiple lockvokers (2, 3, 4, 5+ sequences) by building up the acquisition
 * history over time as different lockvokers timeout and add their information.
 */

template <class OriginalCbFnT>
template <class InvocationTargetT>
bool
SerializedAsynchronousContinuation<OriginalCbFnT>
::LockerAndInvoker<InvocationTargetT>
::heuristicallyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
{
	/**	EXPLANATION:
	 * This is the entry point for the HEURISTIC flavour of gridlock
	 * detection. Full circularity detection is computationally expensive,
	 * so the heuristic settles for something cheaper: as soon as 2
	 * sequences are found where one depends on the other, and that other
	 * one has itself reached timeout, the situation is assumed to be a
	 * likely gridlock.
	 *
	 * That approach is not algorithmically complete -- it can miss some
	 * complex circular dependencies, and it can flag false positives -- but
	 * it is useful in practice for debugging and for spotting potential
	 * concurrency issues.
	 *
	 * The complete algorithm is explained in the file-local comment that
	 * precedes this function.
	 *
	 *	CURRENT STATE:
	 * The body below performs no analysis. firstFailedQutex is not examined
	 * and the function unconditionally reports that no gridlock was found.
	 */
	const bool gridlockSuspected = false;

	return gridlockSuspected;
}

template <class OriginalCbFnT>
template <class InvocationTargetT>
bool
SerializedAsynchronousContinuation<OriginalCbFnT>
::LockerAndInvoker<InvocationTargetT>
::completelyTraceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex)
{
	/**	EXPLANATION:
	 * This is the entry point for the ALGORITHMICALLY COMPLETE flavour of
	 * gridlock detection, i.e. the one that performs full circularity
	 * detection. The intended design is to build a dependency graph out of
	 * the acquisition history and then run a graph traversal over it (for
	 * example DFS with cycle detection) so that only true circular
	 * dependencies are reported.
	 *
	 * The complete algorithm is explained in the file-local comment that
	 * precedes these functions.
	 *
	 *	CURRENT STATE:
	 * The body below performs no analysis. firstFailedQutex is not examined
	 * and the function unconditionally reports that no gridlock was found.
	 */
	const bool gridlockConfirmed = false;

	return gridlockConfirmed;
}

template <class OriginalCbFnT>
template <class InvocationTargetT>
void
SerializedAsynchronousContinuation<OriginalCbFnT>
::LockerAndInvoker<InvocationTargetT>
::handleDeadlock(Qutex& firstFailedQutex)
{
	/**	EXPLANATION:
	 * Reports a deadlock on std::cerr. The report states how long ago this
	 * Lockvoker's creationTimestamp was (in whole milliseconds, measured
	 * against std::chrono::steady_clock) along with the address and name of
	 * the qutex on which acquisition failed. Nothing other than the log
	 * line is produced here.
	 */
	const auto timeWaited =
		std::chrono::steady_clock::now() - this->creationTimestamp;
	const auto msWaited =
		std::chrono::duration_cast<std::chrono::milliseconds>(timeWaited)
			.count();

	std::cerr << __func__ << ": Deadlock: ";
	std::cerr << "Lockvoker has been waiting for " << msWaited;
	std::cerr << "ms, failed on qutex @" << &firstFailedQutex;
	std::cerr << " (" << firstFailedQutex.name << ")" << std::endl;
}
#endif

// Explicit template instantiations for the types we need
// Add more as needed for your specific use cases

} // namespace smo