OClCollMeshEngn: Rearrange steps in startCollateKernel

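Just to make it match startCompactKernel. No other reason for the reordering; the relocated map calls now also check their return values and return false on failure.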
This commit is contained in:
2025-11-14 18:01:48 -04:00
parent c08e075763
commit 2e75dd40aa
@@ -424,21 +424,6 @@ bool OpenClCollatingAndMeshingEngine::startCollateKernel(
// Store the caller's callback
collateKernelCb = std::move(callback);
/** EXPLANATION:
* It shouldn't be necessary to map the assembly/collation buffers here
* since we don't need to read/write them on the host CPUs (unless we're
* intervening to debug; in which case we should map them as CL_MAP_READ).
*
* Otherwise, the foreign GPU's view of the data in the assembly buffer
* is currently up to date; and the collation buffer's state is undefined...
* and also irrelevant since it's only going to be used for output anyway.
*/
mapAssemblyBuffer(CL_MAP_WRITE_INVALIDATE_REGION);
unmapAssemblyBuffer();
mapCollationBuffer(CL_MAP_WRITE);
unmapCollationBuffer();
// Validate buffers callable
auto validateBuffers = [this, &assemblyBuff, &collationBuff]() {
struct iovec assemblyIov = assemblyBuff.getClEngineIovec();
@@ -458,6 +443,31 @@ bool OpenClCollatingAndMeshingEngine::startCollateKernel(
return setupCollateDgramsArgs(assemblyBuff);
};
/** EXPLANATION:
* It shouldn't be necessary to map the assembly/collation buffers here
* since we don't need to read/write them on the host CPUs (unless we're
* intervening to debug; in which case we should map them as CL_MAP_READ).
*
* Otherwise, the foreign GPU's view of the data in the assembly buffer
* is currently up to date; and the collation buffer's state is undefined...
* and also irrelevant since it's only going to be used for output anyway.
*/
if (!mapAssemblyBuffer(CL_MAP_WRITE_INVALIDATE_REGION))
{
std::cerr << __func__ << ": failed to map assembly buffer" << std::endl;
return false;
}
unmapAssemblyBuffer();
// Map the collation buffer; on failure, log which buffer could not be
// mapped and return false.
if (!mapCollationBuffer(CL_MAP_WRITE))
{
	std::cerr << __func__ << ": failed to map collation buffer" << std::endl;
	return false;
}
unmapCollationBuffer();
// Calculate global work size (just num slots in the frame)
size_t globalWorkSize = static_cast<uint32_t>(frameAssemblyDesc->numSlots);