OClCollMeshEngn,PcloudStimProd: Produce into intensity stimbuff

PcloudStimulusBuffer::produceFrameReq():
Now correctly produces into the stim frames for the
PcloudIntensityStimulusBuffer object that's attached to the
PcloudStimulusProducer. If there's no attached I stimbuff, then
the OpenCL kernel will simply not write out the intensity data.

This is the first moment when we actually use the SP-MC ringbuffer
properly and actually cycle through the frames, producing into
them one by one.
This commit is contained in:
2025-11-23 05:54:51 -04:00
parent a025d13fce
commit 79df8b3f74
4 changed files with 175 additions and 36 deletions
+43 -11
View File
@@ -24,6 +24,7 @@ inline int readInt32LE(__global uchar* ptr)
__kernel void collate(
__global uchar* assembly,
__global float* collation,
__global float* intensityBuffer,
uint slotStride,
uint nPointsPerSlot,
uint nDgramsPerFrame)
@@ -44,9 +45,12 @@ __kernel void collate(
__global uchar* pointsArray = slotStart + 18;
// Base offset in collation buffer for this slot (in floats)
// Each PointXYZI is 4 floats (x, y, z, intensity)
#define FLOATS_PER_POINT 4
// Each PointXYZ is 3 floats (x, y, z)
#define FLOATS_PER_POINT 3
uint collationBaseOffset = slotIndex * nPointsPerSlot * FLOATS_PER_POINT;
// Base offset in intensity buffer for this slot (in floats)
// Each intensity is 1 float
uint intensityBaseOffset = slotIndex * nPointsPerSlot;
DBG_PRINTF("Running kernel: about to process points in slot.\n");
// Process based on data type using nested ifs (outer) with loops (inner)
@@ -79,12 +83,16 @@ __kernel void collate(
slotIndex, i, intensity);
}
// Write to collation buffer
// Write XYZ to collation buffer
uint offset = collationBaseOffset + (i * FLOATS_PER_POINT);
collation[offset + 0] = x;
collation[offset + 1] = y;
collation[offset + 2] = z;
collation[offset + 3] = intensity;
// Write intensity conditionally - divert to intensity buffer if attached, else discard
if (intensityBuffer != NULL) {
intensityBuffer[intensityBaseOffset + i] = intensity;
}
// Don't write intensity to collation buffer
}
}
else if (dataType == 2)
@@ -117,12 +125,16 @@ __kernel void collate(
slotIndex, i, intensity);
}
// Write to collation buffer
// Write XYZ to collation buffer
uint offset = collationBaseOffset + (i * FLOATS_PER_POINT);
collation[offset + 0] = x;
collation[offset + 1] = y;
collation[offset + 2] = z;
collation[offset + 3] = intensity;
// Write intensity conditionally - divert to intensity buffer if attached, else discard
if (intensityBuffer != NULL) {
intensityBuffer[intensityBaseOffset + i] = intensity;
}
// Don't write intensity to collation buffer
}
}
else if (dataType == 4)
@@ -161,7 +173,11 @@ __kernel void collate(
collation[offset1 + 0] = x1;
collation[offset1 + 1] = y1;
collation[offset1 + 2] = z1;
collation[offset1 + 3] = intensity1;
// Write intensity conditionally - divert to intensity buffer if attached, else discard
if (intensityBuffer != NULL) {
intensityBuffer[intensityBaseOffset + pointIndex] = intensity1;
}
// Don't write intensity to collation buffer
++pointIndex;
// Process second point
@@ -188,7 +204,11 @@ __kernel void collate(
collation[offset2 + 0] = x2;
collation[offset2 + 1] = y2;
collation[offset2 + 2] = z2;
collation[offset2 + 3] = intensity2;
// Write intensity conditionally - divert to intensity buffer if attached, else discard
if (intensityBuffer != NULL) {
intensityBuffer[intensityBaseOffset + pointIndex] = intensity2;
}
// Don't write intensity to collation buffer
++pointIndex;
}
}
@@ -228,7 +248,11 @@ __kernel void collate(
collation[offset1 + 0] = x1;
collation[offset1 + 1] = y1;
collation[offset1 + 2] = z1;
collation[offset1 + 3] = intensity1;
// Write intensity conditionally - divert to intensity buffer if attached, else discard
if (intensityBuffer != NULL) {
intensityBuffer[intensityBaseOffset + pointIndex] = intensity1;
}
// Don't write intensity to collation buffer
++pointIndex;
// Process second point
@@ -255,7 +279,11 @@ __kernel void collate(
collation[offset2 + 0] = x2;
collation[offset2 + 1] = y2;
collation[offset2 + 2] = z2;
collation[offset2 + 3] = intensity2;
// Write intensity conditionally - divert to intensity buffer if attached, else discard
if (intensityBuffer != NULL) {
intensityBuffer[intensityBaseOffset + pointIndex] = intensity2;
}
// Don't write intensity to collation buffer
++pointIndex;
// Process third point
@@ -282,7 +310,11 @@ __kernel void collate(
collation[offset3 + 0] = x3;
collation[offset3 + 1] = y3;
collation[offset3 + 2] = z3;
collation[offset3 + 3] = intensity3;
// Write intensity conditionally - divert to intensity buffer if attached, else discard
if (intensityBuffer != NULL) {
intensityBuffer[intensityBaseOffset + pointIndex] = intensity3;
}
// Don't write intensity to collation buffer
++pointIndex;
}
}
@@ -165,7 +165,7 @@ void OpenClCollatingAndMeshingEngine::finalize()
// Complete any running kernels
if (compactIsRunning) { compactKernelComplete(true); }
if (collateIsRunning) { collateKernelComplete(true); }
if (collateIsRunning) { collateKernelComplete(std::nullopt, true); }
// Release OpenCL buffers via smo hooks
if (smoHooksPtr && smoHooksPtr->ComputeManager_releaseUseHostPtrBuffer)
@@ -325,6 +325,7 @@ bool OpenClCollatingAndMeshingEngine::startCompactKernel(
bool OpenClCollatingAndMeshingEngine::startCollateKernel(
StagingBuffer& assemblyBuff, StagingBuffer& collationBuff,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
collateKernelCbFn callback)
{
// Store the caller's callback
@@ -345,8 +346,8 @@ bool OpenClCollatingAndMeshingEngine::startCollateKernel(
};
// Setup args callable
auto setupArgs = [this, &assemblyBuff]() {
return setupCollateDgramsArgs(assemblyBuff);
auto setupArgs = [this, &assemblyBuff, intensityStimFrame]() {
return setupCollateDgramsArgs(assemblyBuff, intensityStimFrame);
};
/** EXPLANATION:
@@ -374,6 +375,28 @@ bool OpenClCollatingAndMeshingEngine::startCollateKernel(
unmapCollationBuffer();
// Map/unmap intensity buffer if it exists
if (intensityStimFrame.has_value())
{
StimulusFrame& intensityFrame = intensityStimFrame->get();
cl_mem intensityClBuffer = intensityFrame.clBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
if (intensityClBuffer)
{
void* mappedIntensityBuffer = nullptr;
if (!mapBuffer(intensityClBuffer, intensityFrame.slotDesc.nBytes,
CL_MAP_WRITE_INVALIDATE_REGION, mappedIntensityBuffer))
{
std::cerr << __func__ << ": failed to map intensity buffer"
<< std::endl;
return false;
}
unmapBuffer(intensityClBuffer, mappedIntensityBuffer);
}
}
// Calculate global work size (just num slots in the frame)
size_t globalWorkSize = static_cast<uint32_t>(frameAssemblyDesc->numSlots);
@@ -528,7 +551,8 @@ bool OpenClCollatingAndMeshingEngine::setupSlotCompactorsArgs(
}
bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
StagingBuffer& assemblyBuff)
StagingBuffer& assemblyBuff,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame)
{
// Extract parameters for collateDgrams kernel
uint32_t slotStride = static_cast<uint32_t>(assemblyBuff.slotStrideNBytes);
@@ -563,7 +587,14 @@ bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
return false;
}
err = clSetKernelArg(collateKernel, 2, sizeof(uint32_t), &slotStride);
// Set intensity buffer argument (arg 2)
cl_mem intensityClBuffer = nullptr;
if (intensityStimFrame.has_value())
{
intensityClBuffer = intensityStimFrame->get().clBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
}
err = clSetKernelArg(collateKernel, 2, sizeof(cl_mem), &intensityClBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 2: " << err
@@ -571,7 +602,7 @@ bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
return false;
}
err = clSetKernelArg(collateKernel, 3, sizeof(uint32_t), &nPointsPerSlot);
err = clSetKernelArg(collateKernel, 3, sizeof(uint32_t), &slotStride);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 3: " << err
@@ -579,7 +610,7 @@ bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
return false;
}
err = clSetKernelArg(collateKernel, 4, sizeof(uint32_t), &nDgramsPerFrame);
err = clSetKernelArg(collateKernel, 4, sizeof(uint32_t), &nPointsPerSlot);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 4: " << err
@@ -587,6 +618,14 @@ bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
return false;
}
err = clSetKernelArg(collateKernel, 5, sizeof(uint32_t), &nDgramsPerFrame);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 5: " << err
<< std::endl;
return false;
}
return true;
}
@@ -611,8 +650,9 @@ void OpenClCollatingAndMeshingEngine::compactKernelComplete(bool isFinalizing)
if (isFinalizing) { mapFlags = CL_MAP_WRITE_INVALIDATE_REGION; }
else { mapFlags = CL_MAP_READ; }
mapAssemblyBuffer(mapFlags);
if (mapAssemblyBuffer(mapFlags)) {
unmapAssemblyBuffer();
}
clFlush(computeDevice->commandQueue);
// Stop only compact kernel
@@ -628,7 +668,9 @@ void OpenClCollatingAndMeshingEngine::compactKernelComplete(bool isFinalizing)
compactIsRunning = false;
}
void OpenClCollatingAndMeshingEngine::collateKernelComplete(bool isFinalizing)
void OpenClCollatingAndMeshingEngine::collateKernelComplete(
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
bool isFinalizing)
{
cl_map_flags mapFlags;
/** EXPLANATION:
@@ -638,8 +680,28 @@ void OpenClCollatingAndMeshingEngine::collateKernelComplete(bool isFinalizing)
if (isFinalizing) { mapFlags = CL_MAP_WRITE_INVALIDATE_REGION; }
else { mapFlags = CL_MAP_READ; }
mapCollationBuffer(mapFlags);
if (mapCollationBuffer(mapFlags)) {
unmapCollationBuffer();
}
// Map/unmap intensity buffer if it exists
if (intensityStimFrame.has_value())
{
StimulusFrame& intensityFrame = intensityStimFrame->get();
cl_mem intensityClBuffer = intensityFrame.clBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
if (intensityClBuffer)
{
void* mappedIntensityBuffer = nullptr;
if (mapBuffer(intensityClBuffer, intensityFrame.slotDesc.nBytes,
CL_MAP_READ, mappedIntensityBuffer))
{
unmapBuffer(intensityClBuffer, mappedIntensityBuffer);
}
}
}
clFlush(computeDevice->commandQueue);
// Stop only collate kernel
@@ -770,18 +832,21 @@ private:
OpenClCollatingAndMeshingEngine& engine;
AsynchronousLoop frameAssemblyResult;
StimulusFrame& stimulusFrame;
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame;
public:
CompactCollateAndMeshFrameReq(
OpenClCollatingAndMeshingEngine& engine_,
AsynchronousLoop& asyncLoop,
StimulusFrame& stimulusFrame_,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame_,
const std::shared_ptr<ComponentThread>& caller,
Callback<compactCollateAndMeshFrameReqCbFn> cb)
: PostedAsynchronousContinuation<compactCollateAndMeshFrameReqCbFn>(
caller, cb),
engine(engine_),
frameAssemblyResult(asyncLoop), stimulusFrame(stimulusFrame_)
frameAssemblyResult(asyncLoop), stimulusFrame(stimulusFrame_),
intensityStimFrame(intensityStimFrame_)
{}
public:
@@ -876,6 +941,7 @@ public:
bool success = engine.startCollateKernel(
engine.parent.assemblyBuffer, engine.parent.collationBuffer,
context->intensityStimFrame,
std::bind(
&CompactCollateAndMeshFrameReq
::compactCollateAndMeshFrameReq4_collateDone_maybePosted,
@@ -884,7 +950,7 @@ public:
if (!success)
{
engine.collateKernelComplete();
engine.collateKernelComplete(context->intensityStimFrame);
callOriginalCallback(false);
return;
}
@@ -904,7 +970,16 @@ public:
return;
}
engine.collateKernelComplete();
/** EXPLANATION:
* The reason we don't call collateKernelComplete before checking
* shouldAcceptRequests is because if shouldAcceptRequests is false, then
* we shouldn't touch any of the collate cycle state...or any state within
* the engine at all, since finalize() may well have been called.
*
* Therefore it's finalize()'s responsibility to ensure that it properly
* completes/cleans up any in-flight operations.
*/
engine.collateKernelComplete(context->intensityStimFrame);
// Record collate kernel end time
engine.collateKernelEndTime = std::chrono::high_resolution_clock::now();
@@ -947,6 +1022,7 @@ public:
void OpenClCollatingAndMeshingEngine::compactCollateAndMeshFrameReq(
AsynchronousLoop& asyncLoop, StimulusFrame& stimulusFrame,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
Callback<compactCollateAndMeshFrameReqCbFn> callback)
{
{
@@ -960,7 +1036,7 @@ void OpenClCollatingAndMeshingEngine::compactCollateAndMeshFrameReq(
auto caller = smoHooksPtr->ComponentThread_getSelf();
auto request = std::make_shared<CompactCollateAndMeshFrameReq>(
*this, asyncLoop, stimulusFrame,
*this, asyncLoop, stimulusFrame, intensityStimFrame,
caller,
std::move(callback));
@@ -6,6 +6,7 @@
#include <cstddef>
#include <memory>
#include <functional>
#include <optional>
#include <iostream>
#include <stdexcept>
#include <chrono>
@@ -48,6 +49,7 @@ public:
compactCollateAndMeshFrameReqCbFn;
void compactCollateAndMeshFrameReq(
AsynchronousLoop& asyncLoop, StimulusFrame& stimulusFrame,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
Callback<compactCollateAndMeshFrameReqCbFn> callback);
private:
@@ -60,10 +62,13 @@ private:
compactKernelCbFn callback);
bool startCollateKernel(
StagingBuffer& assemblyBuff, StagingBuffer& collationBuff,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
collateKernelCbFn callback);
void compactKernelComplete(bool isFinalizing=false);
void collateKernelComplete(bool isFinalizing=false);
void collateKernelComplete(
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
bool isFinalizing=false);
bool stop();
public:
@@ -131,7 +136,9 @@ private:
bool compileAndPrepareKernels();
bool setupSlotCompactorsArgs(
StagingBuffer& assemblyBuff, uint32_t nSucceeded);
bool setupCollateDgramsArgs(StagingBuffer& assemblyBuff);
bool setupCollateDgramsArgs(
StagingBuffer& assemblyBuff,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame);
// Generic buffer mapping/unmapping for zero-copy synchronization
bool mapBuffer(
@@ -22,9 +22,9 @@ static StagingBuffer::IOEngineConstraints openClInputConstraints(
* This should eventually be aligned to 4B and padded to 12B.
*/
// slotStartAlignmentByteVal (page alignment)
sizeof(float) * 4,
// slotPadToNBytes (pointer size)
sizeof(float) * 4,
sizeof(float),
// slotPadToNBytes (XYZ = 3 floats per point)
sizeof(float) * 3,
// frameStartAlignmentByteVal (page alignment)
static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
// framePadToNBytes (pointer size)
@@ -34,7 +34,7 @@ static StagingBuffer::IOEngineConstraints openClInputConstraints(
static StagingBuffer::IOEngineConstraints openClMeshInputConstraints(
// slotStartAlignmentByteVal (page alignment)
static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
// slotPadToNBytes (pointer size)
// slotPadToNBytes: This is dynamically calculated based on the return mode.
sizeof(float) * 3,
// frameStartAlignmentByteVal (page alignment)
static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
@@ -44,7 +44,7 @@ static StagingBuffer::IOEngineConstraints openClMeshInputConstraints(
static StagingBuffer::IOEngineConstraints openClIntensityInputConstraints(
// slotStartAlignmentByteVal (page alignment)
static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
// slotPadToNBytes (intensity value size)
// slotPadToNBytes: This is dynamically calculated based on the return mode.
sizeof(float),
// frameStartAlignmentByteVal (page alignment)
static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
@@ -54,7 +54,7 @@ static StagingBuffer::IOEngineConstraints openClIntensityInputConstraints(
static StagingBuffer::IOEngineConstraints openClAmbienceInputConstraints(
// slotStartAlignmentByteVal (page alignment)
static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
// slotPadToNBytes (pointer size)
// slotPadToNBytes: This is dynamically calculated based on the return mode.
sizeof(float),
// frameStartAlignmentByteVal (page alignment)
static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
@@ -335,6 +335,7 @@ private:
PcloudStimulusProducer& pcloudProducer;
AsynchronousLoop frameAssemblyResult;
StimulusFrame& stimulusFrame;
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame;
public:
ProduceFrameReq(
@@ -392,8 +393,26 @@ public:
context->frameAssemblyResult = loop;
// Check if intensity buffer is attached and acquire frame if so
if (pcloudProducer.intensityStimulusBuffer)
{
size_t intensityRingbuffIndex = pcloudProducer
.intensityStimulusBuffer->ringBuffer.getIndexToProduceInto();
StimulusFrame& intensityStimFrame = pcloudProducer
.intensityStimulusBuffer->ringBuffer.getDataAtSlot(
intensityRingbuffIndex);
intensityStimFrame.lock.writeAcquire();
context->intensityStimFrame = std::make_optional(
std::ref(intensityStimFrame));
}
else {
context->intensityStimFrame = std::nullopt;
}
pcloudProducer.openClCollatingAndMeshingEngine.compactCollateAndMeshFrameReq(
loop, stimulusFrame,
loop, stimulusFrame, context->intensityStimFrame,
{context, std::bind(
&ProduceFrameReq::produceFrameReq3_compactCollateDone,
context.get(), context,
@@ -404,6 +423,11 @@ public:
[[maybe_unused]] std::shared_ptr<ProduceFrameReq> context,
bool success, StimulusFrame& /*stimulusFrame*/)
{
// Release intensity frame if it was used
if (context->intensityStimFrame.has_value()) {
context->intensityStimFrame->get().lock.writeRelease();
}
SpinLock::Guard lock(pcloudProducer.shouldContinueLock);
if (!pcloudProducer.shouldContinue)
{