From 462247d74390eb43ec154a6cbf0532c3eae71f53 Mon Sep 17 00:00:00 2001 From: Hayodea Hekol Date: Mon, 29 Sep 2025 13:38:53 -0400 Subject: [PATCH] Qutex: Add gridlock detection --- include/lockerAndInvokerBase.h | 6 ++ include/serializedAsynchronousContinuation.h | 9 ++ .../serializedAsynchronousContinuation.cpp | 96 ++++++++++++++++++- 3 files changed, 110 insertions(+), 1 deletion(-) diff --git a/include/lockerAndInvokerBase.h b/include/lockerAndInvokerBase.h index cdf5fde..f0991c1 100644 --- a/include/lockerAndInvokerBase.h +++ b/include/lockerAndInvokerBase.h @@ -45,6 +45,12 @@ public: */ virtual void awaken(bool forceAwaken = false) = 0; + /* These two are used to iterate through the lockset of a Lockvoker in a + * template-erased manner. We use them in the gridlock detection algorithm. + */ + virtual size_t getLockSetSize() const = 0; + virtual Qutex& getLockAt(size_t index) const = 0; + /** * @brief Equality operator * diff --git a/include/serializedAsynchronousContinuation.h b/include/serializedAsynchronousContinuation.h index 33c556e..5b80d08 100644 --- a/include/serializedAsynchronousContinuation.h +++ b/include/serializedAsynchronousContinuation.h @@ -175,6 +175,15 @@ public: target->getIoService().post(*this); } + size_t getLockSetSize() const override + { return serializedContinuation.requiredLocks.locks.size(); } + + Qutex& getLockAt(size_t index) const override + { + return serializedContinuation.requiredLocks.locks[index] + .first.get(); + } + private: // Allow awakening by resetting the awake flag void allowAwakening() diff --git a/smocore/serializedAsynchronousContinuation.cpp b/smocore/serializedAsynchronousContinuation.cpp index 69f9c5a..4f0038b 100644 --- a/smocore/serializedAsynchronousContinuation.cpp +++ b/smocore/serializedAsynchronousContinuation.cpp @@ -64,7 +64,101 @@ SerializedAsynchronousContinuation ::LockerAndInvoker ::traceContinuationHistoryForGridlockOn(Qutex &firstFailedQutex) { - // Empty implementation - to be filled in later + 
/** EXPLANATION: + * In this function we check for gridlocks which are slightly different + * from deadlocks. In a gridlock, two requests are waiting for locks that + * are held by the other. I.e: + * + * R1 holds LockA and is waiting for LockB. + * R2 holds LockB and is waiting for LockA. + * + * This differs from deadlocks because it's not a single request which is + * attempting to re-acquire a lock that it already holds. + * + * To detect this condition, we wait until the acquisition timeout has + * expired. Then: we extract the current owner of the first lock we're + * failing to acquire. + * + * From there, we go through each of the locks in the foreign owner's + * current (i.e: immediate, most recent continuation's) required LockSet. + * For each of the locks in the foreign owner's most immediate required + * LockSet, we trace backward in our *OWN* history to see if any of *OUR* + * continuations (excluding our most immediate continuation) contains that + * lock. + * + * If we find a match, that means that we're holding a lock that the foreign + * owner is waiting for. And we already know that the foreign owner is + * holding a lock that we're waiting for (when we extracted the current + * owner of the first failed lock in our most immediate Lockset). + * + * Hence, we have a gridlock. + */ + + LockerAndInvokerBase* foreignOwnerPtr = firstFailedQutex.getCurrOwner(); + // If no current owner, can't be a gridlock + if (foreignOwnerPtr == nullptr) + { return false; } + + // Use reference for the rest of the function for safety. 
+ LockerAndInvokerBase& foreignOwner = *foreignOwnerPtr; + + /* For each lock in the foreign owner's LockSet, check if we hold it + * in any of our previous continuations (excluding our most immediate one) + */ + for (size_t i = 0; i < foreignOwner.getLockSetSize(); ++i) + { + Qutex& foreignLock = foreignOwner.getLockAt(i); + + /* Skip the firstFailedQutex since we already know the foreign owner + * holds it -- hence it's impossible for any of our previous + * continuations to hold it. + */ + if (&foreignLock == &firstFailedQutex) + { continue; } + + /** EXPLANATION: + * Trace backward through our continuation history (excluding our most + * immediate continuation). + * + * The reason we exclude our most immediate continuation is because the + * LockSet acquisition algorithm backs off if it fails to acquire ALL + * locks in the set. So if the lock that the foreign owner is waiting + * for is in our most immediate continuation, and NOT in one of our + * previous continuations, then we will back off and the foreign owner + * should eventually be able to acquire that lock. + */ + for (std::shared_ptr currContin = + this->serializedContinuation.getCallersContinuation(); + currContin != nullptr; + currContin = currContin->getCallersContinuation()) + { + auto serializedCont = std::dynamic_pointer_cast< + SerializedAsynchronousContinuation>(currContin); + + if (serializedCont == nullptr) { continue; } + + // Check if this continuation holds the foreign lock + try { + const auto& lockUsageDesc = serializedCont->requiredLocks + .getLockUsageDesc(foreignLock); + + // Matched! 
We hold a lock that the foreign owner is waiting for + std::cout << __func__ << ": Gridlock detected: We hold lock @" + << &foreignLock << " (" << foreignLock.name << ") in " + "continuation @" << serializedCont.get() + << ", while foreign owner @" << &foreignOwner + << " holds lock @" << &firstFailedQutex << " (" + << firstFailedQutex.name << ") that we're waiting for" + << std::endl; + + return true; + } catch (const std::runtime_error& e) { + // This continuation doesn't hold the foreign lock. Continue. + continue; + } + } + } + return false; }