From 79df8b3f74ddd5307a130755188d754076f262c1 Mon Sep 17 00:00:00 2001 From: Hayodea Hekol Date: Sun, 23 Nov 2025 05:54:51 -0400 Subject: [PATCH] OClCollMeshEngn,PcloudStimProd: Produce into intensity stimbuff PcloudStimulusBuffer::produceFrameReq(): Now correctly produces into the stim frames for the PcloudIntensityStimulusBuffer object that's attached to the PcloudStimulusProducer. If there's no attached I stimbuff, then the OpenCL kernel will simply not write out the intensity data. This is the first moment when we actually use the SP-MC ringbuffer properly and actually cycle through the frames, producing into them one by one. --- stimBuffApis/livoxGen1/collateDgrams.cl | 54 +++++++-- .../openClCollatingAndMeshingEngine.cpp | 108 +++++++++++++++--- .../openClCollatingAndMeshingEngine.h | 11 +- .../livoxGen1/pcloudStimulusProducer.cpp | 38 ++++-- 4 files changed, 175 insertions(+), 36 deletions(-) diff --git a/stimBuffApis/livoxGen1/collateDgrams.cl b/stimBuffApis/livoxGen1/collateDgrams.cl index 3462a17..29dc2c2 100644 --- a/stimBuffApis/livoxGen1/collateDgrams.cl +++ b/stimBuffApis/livoxGen1/collateDgrams.cl @@ -24,6 +24,7 @@ inline int readInt32LE(__global uchar* ptr) __kernel void collate( __global uchar* assembly, __global float* collation, + __global float* intensityBuffer, uint slotStride, uint nPointsPerSlot, uint nDgramsPerFrame) @@ -44,9 +45,12 @@ __kernel void collate( __global uchar* pointsArray = slotStart + 18; // Base offset in collation buffer for this slot (in floats) - // Each PointXYZI is 4 floats (x, y, z, intensity) - #define FLOATS_PER_POINT 4 + // Each PointXYZ is 3 floats (x, y, z) + #define FLOATS_PER_POINT 3 uint collationBaseOffset = slotIndex * nPointsPerSlot * FLOATS_PER_POINT; + // Base offset in intensity buffer for this slot (in floats) + // Each intensity is 1 float + uint intensityBaseOffset = slotIndex * nPointsPerSlot; DBG_PRINTF("Running kernel: about to process points in slot.\n"); // Process based on data type using nested ifs (outer) with loops (inner) @@ -79,12 +83,16 @@ __kernel void collate( slotIndex, i, intensity); } - // Write to collation buffer + // Write XYZ to collation buffer uint offset = collationBaseOffset + (i * FLOATS_PER_POINT); collation[offset + 0] = x; collation[offset + 1] = y; collation[offset + 2] = z; - collation[offset + 3] = intensity; + // Write intensity conditionally - divert to intensity buffer if attached, else discard + if (intensityBuffer != NULL) { + intensityBuffer[intensityBaseOffset + i] = intensity; + } + // Don't write intensity to collation buffer } } else if (dataType == 2) @@ -117,12 +125,16 @@ __kernel void collate( slotIndex, i, intensity); } - // Write to collation buffer + // Write XYZ to collation buffer uint offset = collationBaseOffset + (i * FLOATS_PER_POINT); collation[offset + 0] = x; collation[offset + 1] = y; collation[offset + 2] = z; - collation[offset + 3] = intensity; + // Write intensity conditionally - divert to intensity buffer if attached, else discard + if (intensityBuffer != NULL) { + intensityBuffer[intensityBaseOffset + i] = intensity; + } + // Don't write intensity to collation buffer } } else if (dataType == 4) @@ -161,7 +173,11 @@ __kernel void collate( collation[offset1 + 0] = x1; collation[offset1 + 1] = y1; collation[offset1 + 2] = z1; - collation[offset1 + 3] = intensity1; + // Write intensity conditionally - divert to intensity buffer if attached, else discard + if (intensityBuffer != NULL) { + intensityBuffer[intensityBaseOffset + pointIndex] = intensity1; + } + // Don't write intensity to collation buffer ++pointIndex; // Process second point @@ -188,7 +204,11 @@ __kernel void collate( collation[offset2 + 0] = x2; collation[offset2 + 1] = y2; collation[offset2 + 2] = z2; - collation[offset2 + 3] = intensity2; + // Write intensity conditionally - divert to intensity buffer if attached, else discard + if (intensityBuffer != NULL) { + intensityBuffer[intensityBaseOffset + pointIndex] = intensity2; + } + // Don't write intensity to collation buffer ++pointIndex; } } @@ -228,7 +248,11 @@ __kernel void collate( collation[offset1 + 0] = x1; collation[offset1 + 1] = y1; collation[offset1 + 2] = z1; - collation[offset1 + 3] = intensity1; + // Write intensity conditionally - divert to intensity buffer if attached, else discard + if (intensityBuffer != NULL) { + intensityBuffer[intensityBaseOffset + pointIndex] = intensity1; + } + // Don't write intensity to collation buffer ++pointIndex; // Process second point @@ -255,7 +279,11 @@ __kernel void collate( collation[offset2 + 0] = x2; collation[offset2 + 1] = y2; collation[offset2 + 2] = z2; - collation[offset2 + 3] = intensity2; + // Write intensity conditionally - divert to intensity buffer if attached, else discard + if (intensityBuffer != NULL) { + intensityBuffer[intensityBaseOffset + pointIndex] = intensity2; + } + // Don't write intensity to collation buffer ++pointIndex; // Process third point @@ -282,7 +310,11 @@ __kernel void collate( collation[offset3 + 0] = x3; collation[offset3 + 1] = y3; collation[offset3 + 2] = z3; - collation[offset3 + 3] = intensity3; + // Write intensity conditionally - divert to intensity buffer if attached, else discard + if (intensityBuffer != NULL) { + intensityBuffer[intensityBaseOffset + pointIndex] = intensity3; + } + // Don't write intensity to collation buffer ++pointIndex; } } diff --git a/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.cpp b/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.cpp index 2057329..5bdfabe 100644 --- a/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.cpp +++ b/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.cpp @@ -165,7 +165,7 @@ void OpenClCollatingAndMeshingEngine::finalize() // Complete any running kernels if (compactIsRunning) { compactKernelComplete(true); } - if (collateIsRunning) { collateKernelComplete(true); } + if (collateIsRunning) { collateKernelComplete(std::nullopt, true); } // Release OpenCL buffers via smo hooks if (smoHooksPtr && smoHooksPtr->ComputeManager_releaseUseHostPtrBuffer) @@ -325,6 +325,7 @@ bool OpenClCollatingAndMeshingEngine::startCompactKernel( bool OpenClCollatingAndMeshingEngine::startCollateKernel( StagingBuffer& assemblyBuff, StagingBuffer& collationBuff, + std::optional> intensityStimFrame, collateKernelCbFn callback) { // Store the caller's callback @@ -345,8 +346,8 @@ bool OpenClCollatingAndMeshingEngine::startCollateKernel( }; // Setup args callable - auto setupArgs = [this, &assemblyBuff]() { - return setupCollateDgramsArgs(assemblyBuff); + auto setupArgs = [this, &assemblyBuff, intensityStimFrame]() { + return setupCollateDgramsArgs(assemblyBuff, intensityStimFrame); }; /** EXPLANATION: @@ -374,6 +375,28 @@ bool OpenClCollatingAndMeshingEngine::startCollateKernel( unmapCollationBuffer(); + // Map/unmap intensity buffer if it exists + if (intensityStimFrame.has_value()) + { + StimulusFrame& intensityFrame = intensityStimFrame->get(); + cl_mem intensityClBuffer = intensityFrame.clBuffer + ->getAssociatedBufferHandleForDevice(computeDevice); + + if (intensityClBuffer) + { + void* mappedIntensityBuffer = nullptr; + if (!mapBuffer(intensityClBuffer, intensityFrame.slotDesc.nBytes, + CL_MAP_WRITE_INVALIDATE_REGION, mappedIntensityBuffer)) + { + std::cerr << __func__ << ": failed to map intensity buffer" + << std::endl; + return false; + } + + unmapBuffer(intensityClBuffer, mappedIntensityBuffer); + } + } + // Calculate global work size (just num slots in the frame) size_t globalWorkSize = static_cast(frameAssemblyDesc->numSlots); @@ -528,7 +551,8 @@ bool OpenClCollatingAndMeshingEngine::setupSlotCompactorsArgs( } bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs( - StagingBuffer& assemblyBuff) + StagingBuffer& assemblyBuff, + std::optional> intensityStimFrame) { // Extract parameters for collateDgrams kernel uint32_t slotStride = static_cast(assemblyBuff.slotStrideNBytes); @@ -563,7 +587,14 @@ bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs( return false; } - err = clSetKernelArg(collateKernel, 2, sizeof(uint32_t), &slotStride); + // Set intensity buffer argument (arg 2) + cl_mem intensityClBuffer = nullptr; + if (intensityStimFrame.has_value()) + { + intensityClBuffer = intensityStimFrame->get().clBuffer + ->getAssociatedBufferHandleForDevice(computeDevice); + } + err = clSetKernelArg(collateKernel, 2, sizeof(cl_mem), &intensityClBuffer); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 2: " << err @@ -571,7 +602,7 @@ bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs( return false; } - err = clSetKernelArg(collateKernel, 3, sizeof(uint32_t), &nPointsPerSlot); + err = clSetKernelArg(collateKernel, 3, sizeof(uint32_t), &slotStride); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 3: " << err @@ -579,7 +610,7 @@ bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs( return false; } - err = clSetKernelArg(collateKernel, 4, sizeof(uint32_t), &nDgramsPerFrame); + err = clSetKernelArg(collateKernel, 4, sizeof(uint32_t), &nPointsPerSlot); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 4: " << err @@ -587,6 +618,14 @@ bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs( return false; } + err = clSetKernelArg(collateKernel, 5, sizeof(uint32_t), &nDgramsPerFrame); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 5: " << err + << std::endl; + return false; + } + return true; } @@ -611,8 +650,9 @@ void OpenClCollatingAndMeshingEngine::compactKernelComplete(bool isFinalizing) if (isFinalizing) { mapFlags = CL_MAP_WRITE_INVALIDATE_REGION; } else { mapFlags = CL_MAP_READ; } - mapAssemblyBuffer(mapFlags); - unmapAssemblyBuffer(); + if (mapAssemblyBuffer(mapFlags)) { + unmapAssemblyBuffer(); + } clFlush(computeDevice->commandQueue); // Stop only compact kernel @@ -628,7 +668,9 @@ void OpenClCollatingAndMeshingEngine::compactKernelComplete(bool isFinalizing) compactIsRunning = false; } -void OpenClCollatingAndMeshingEngine::collateKernelComplete(bool isFinalizing) +void OpenClCollatingAndMeshingEngine::collateKernelComplete( + std::optional> intensityStimFrame, + bool isFinalizing) { cl_map_flags mapFlags; /** EXPLANATION: @@ -638,8 +680,28 @@ void OpenClCollatingAndMeshingEngine::collateKernelComplete(bool isFinalizing) if (isFinalizing) { mapFlags = CL_MAP_WRITE_INVALIDATE_REGION; } else { mapFlags = CL_MAP_READ; } - mapCollationBuffer(mapFlags); - unmapCollationBuffer(); + if (mapCollationBuffer(mapFlags)) { + unmapCollationBuffer(); + } + + // Map/unmap intensity buffer if it exists + if (intensityStimFrame.has_value()) + { + StimulusFrame& intensityFrame = intensityStimFrame->get(); + cl_mem intensityClBuffer = intensityFrame.clBuffer + ->getAssociatedBufferHandleForDevice(computeDevice); + + if (intensityClBuffer) + { + void* mappedIntensityBuffer = nullptr; + if (mapBuffer(intensityClBuffer, intensityFrame.slotDesc.nBytes, + CL_MAP_READ, mappedIntensityBuffer)) + { + unmapBuffer(intensityClBuffer, mappedIntensityBuffer); + } + } + } + clFlush(computeDevice->commandQueue); // Stop only collate kernel @@ -770,18 +832,21 @@ private: OpenClCollatingAndMeshingEngine& engine; AsynchronousLoop frameAssemblyResult; StimulusFrame& stimulusFrame; + std::optional> intensityStimFrame; public: CompactCollateAndMeshFrameReq( OpenClCollatingAndMeshingEngine& engine_, AsynchronousLoop& asyncLoop, StimulusFrame& stimulusFrame_, + std::optional> intensityStimFrame_, const std::shared_ptr& caller, Callback cb) : PostedAsynchronousContinuation( caller, cb), engine(engine_), - frameAssemblyResult(asyncLoop), stimulusFrame(stimulusFrame_) + frameAssemblyResult(asyncLoop), stimulusFrame(stimulusFrame_), + intensityStimFrame(intensityStimFrame_) {} public: @@ -876,6 +941,7 @@ public: bool success = engine.startCollateKernel( engine.parent.assemblyBuffer, engine.parent.collationBuffer, + context->intensityStimFrame, std::bind( &CompactCollateAndMeshFrameReq ::compactCollateAndMeshFrameReq4_collateDone_maybePosted, @@ -884,7 +950,7 @@ public: if (!success) { - engine.collateKernelComplete(); + engine.collateKernelComplete(context->intensityStimFrame); callOriginalCallback(false); return; } @@ -904,7 +970,16 @@ public: return; } - engine.collateKernelComplete(); + /** EXPLANATION: + * The reason we don't call collateKernelComplete before checking + * shouldAcceptRequests is because if shouldAcceptRequests is false, then + * we shouldn't touch any of the collate cycle state...or any state within + * the engine at all, since finalize() may well have been called. + * + * Therefore it's finalize()'s responsibility to ensure that it properly + * completes/cleans up any in-flight operations. + */ + engine.collateKernelComplete(context->intensityStimFrame); // Record collate kernel end time engine.collateKernelEndTime = std::chrono::high_resolution_clock::now(); @@ -947,6 +1022,7 @@ public: void OpenClCollatingAndMeshingEngine::compactCollateAndMeshFrameReq( AsynchronousLoop& asyncLoop, StimulusFrame& stimulusFrame, + std::optional> intensityStimFrame, Callback callback) { { @@ -960,7 +1036,7 @@ void OpenClCollatingAndMeshingEngine::compactCollateAndMeshFrameReq( auto caller = smoHooksPtr->ComponentThread_getSelf(); auto request = std::make_shared( - *this, asyncLoop, stimulusFrame, + *this, asyncLoop, stimulusFrame, intensityStimFrame, caller, std::move(callback)); diff --git a/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.h b/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.h index 59325be..8eb63c7 100644 --- a/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.h +++ b/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,7 @@ public: compactCollateAndMeshFrameReqCbFn; void compactCollateAndMeshFrameReq( AsynchronousLoop& asyncLoop, StimulusFrame& stimulusFrame, + std::optional> intensityStimFrame, Callback callback); private: @@ -60,10 +62,13 @@ private: compactKernelCbFn callback); bool startCollateKernel( StagingBuffer& assemblyBuff, StagingBuffer& collationBuff, + std::optional> intensityStimFrame, collateKernelCbFn callback); void compactKernelComplete(bool isFinalizing=false); - void collateKernelComplete(bool isFinalizing=false); + void collateKernelComplete( + std::optional> intensityStimFrame, + bool isFinalizing=false); bool stop(); public: @@ -131,7 +136,9 @@ private: bool compileAndPrepareKernels(); bool setupSlotCompactorsArgs( StagingBuffer& assemblyBuff, uint32_t nSucceeded); - bool setupCollateDgramsArgs(StagingBuffer& assemblyBuff); + bool setupCollateDgramsArgs( + StagingBuffer& assemblyBuff, + std::optional> intensityStimFrame); // Generic buffer mapping/unmapping for zero-copy synchronization bool mapBuffer( diff --git a/stimBuffApis/livoxGen1/pcloudStimulusProducer.cpp b/stimBuffApis/livoxGen1/pcloudStimulusProducer.cpp index 435846b..625507d 100644 --- a/stimBuffApis/livoxGen1/pcloudStimulusProducer.cpp +++ b/stimBuffApis/livoxGen1/pcloudStimulusProducer.cpp @@ -22,9 +22,9 @@ static StagingBuffer::IOEngineConstraints openClInputConstraints( * This should eventually be aligned to 4B and padded to 12B. */ // slotStartAlignmentByteVal (page alignment) - sizeof(float) * 4, - // slotPadToNBytes (pointer size) - sizeof(float) * 4, + sizeof(float), + // slotPadToNBytes (XYZ = 3 floats per point) + sizeof(float) * 3, // frameStartAlignmentByteVal (page alignment) static_cast(sysconf(_SC_PAGE_SIZE)), // framePadToNBytes (pointer size) @@ -34,7 +34,7 @@ static StagingBuffer::IOEngineConstraints openClInputConstraints( static StagingBuffer::IOEngineConstraints openClMeshInputConstraints( // slotStartAlignmentByteVal (page alignment) static_cast(sysconf(_SC_PAGE_SIZE)), - // slotPadToNBytes (pointer size) + // slotPadToNBytes: This is dynamically calculated based on the return mode. sizeof(float) * 3, // frameStartAlignmentByteVal (page alignment) static_cast(sysconf(_SC_PAGE_SIZE)), @@ -44,7 +44,7 @@ static StagingBuffer::IOEngineConstraints openClMeshInputConstraints( static StagingBuffer::IOEngineConstraints openClIntensityInputConstraints( // slotStartAlignmentByteVal (page alignment) static_cast(sysconf(_SC_PAGE_SIZE)), - // slotPadToNBytes (intensity value size) + // slotPadToNBytes: This is dynamically calculated based on the return mode. sizeof(float), // frameStartAlignmentByteVal (page alignment) static_cast(sysconf(_SC_PAGE_SIZE)), @@ -54,7 +54,7 @@ static StagingBuffer::IOEngineConstraints openClIntensityInputConstraints( static StagingBuffer::IOEngineConstraints openClAmbienceInputConstraints( // slotStartAlignmentByteVal (page alignment) static_cast(sysconf(_SC_PAGE_SIZE)), - // slotPadToNBytes (pointer size) + // slotPadToNBytes: This is dynamically calculated based on the return mode. sizeof(float), // frameStartAlignmentByteVal (page alignment) static_cast(sysconf(_SC_PAGE_SIZE)), @@ -335,6 +335,7 @@ private: PcloudStimulusProducer& pcloudProducer; AsynchronousLoop frameAssemblyResult; StimulusFrame& stimulusFrame; + std::optional> intensityStimFrame; public: ProduceFrameReq( @@ -392,8 +393,26 @@ public: context->frameAssemblyResult = loop; + // Check if intensity buffer is attached and acquire frame if so + if (pcloudProducer.intensityStimulusBuffer) + { + size_t intensityRingbuffIndex = pcloudProducer + .intensityStimulusBuffer->ringBuffer.getIndexToProduceInto(); + + StimulusFrame& intensityStimFrame = pcloudProducer + .intensityStimulusBuffer->ringBuffer.getDataAtSlot( + intensityRingbuffIndex); + + intensityStimFrame.lock.writeAcquire(); + context->intensityStimFrame = std::make_optional( + std::ref(intensityStimFrame)); + } + else { + context->intensityStimFrame = std::nullopt; + } + pcloudProducer.openClCollatingAndMeshingEngine.compactCollateAndMeshFrameReq( - loop, stimulusFrame, + loop, stimulusFrame, context->intensityStimFrame, {context, std::bind( &ProduceFrameReq::produceFrameReq3_compactCollateDone, context.get(), context, @@ -404,6 +423,11 @@ public: [[maybe_unused]] std::shared_ptr context, bool success, StimulusFrame& /*stimulusFrame*/) { + // Release intensity frame if it was used + if (context->intensityStimFrame.has_value()) { + context->intensityStimFrame->get().lock.writeRelease(); + } + SpinLock::Guard lock(pcloudProducer.shouldContinueLock); if (!pcloudProducer.shouldContinue) {