Files
salmanoff/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.cpp
T
hayodea 5cce473e01 PcloudStimProd: make sh_ptrs to Pcloud*StimBuff atomic<>
This change is a bit pedantic, but since these vars aren't accessed
in any hotpath, it's fine to be pedantic. We made these sh_ptrs
atomic so we can use acquire and release side effects when loading
and storing them. This doesn't eliminate the problem of seeing
inconsistent state across microcontrollers, but it helps with simple
accesses like these ones we already do.
2025-11-26 00:14:15 -04:00

1205 lines
33 KiB
C++

#include <boostAsioLinkageFix.h>
#include <stdexcept>
#include <iostream>
#include <cstring>
#include <cstddef>
#include <vector>
#include <string>
#include <string_view>
#include <boost/system/error_code.hpp>
#include <asynchronousContinuation.h>
#include <callback.h>
#include <asynchronousLoop.h>
#include <componentThread.h>
#include <user/stimulusFrame.h>
#include <livoxProto1/device.h>
#include "livoxGen1.h"
#include "openClCollatingAndMeshingEngine.h"
#include "pcloudStimulusProducer.h"
#include "openClKernels.h"
#include <user/frameAssemblyDesc.h>
#include "ioUringAssemblyEngine.h"
#include <user/senseApiDesc.h>
extern const smo::stim_buff::SmoCallbacks* smoHooksPtr;
namespace smo {
namespace stim_buff {
OpenClCollatingAndMeshingEngine::OpenClCollatingAndMeshingEngine(
PcloudStimulusProducer& parent_)
: parent(parent_),
computeDevice(nullptr),
slotCompactorProgram(nullptr), collateProgram(nullptr),
slotCompactorKernel(nullptr), collateKernel(nullptr),
clAssemblyBufferClBuffer(nullptr),
clCollationBufferClBuffer(nullptr),
clAssemblyBuffer(nullptr),
clCollationBuffer(nullptr),
shouldAcceptRequests(false),
compactIsRunning(false),
collateIsRunning(false),
currentCompactKernelEvent(nullptr), currentCollateKernelEvent(nullptr),
assemblyBufferPtr(nullptr),
assemblyBufferSize(0),
collationBufferPtr(nullptr),
collationBufferSize(0),
mappedAssemblyBuffer(nullptr),
mappedCollationBuffer(nullptr),
frameAssemblyDesc(nullptr)
{
}
OpenClCollatingAndMeshingEngine::~OpenClCollatingAndMeshingEngine()
{
finalize();
}
bool OpenClCollatingAndMeshingEngine::setup()
{
// Defensive check to prevent double-calling
{
SpinLock::Guard lock(shouldAcceptRequestsLock);
if (shouldAcceptRequests)
{
throw std::runtime_error(std::string(__func__) + ": setup() called "
"while already set up");
}
}
if (!smoHooksPtr || !smoHooksPtr->ComputeManager_getDevice)
{
std::cerr << __func__ << ": smo hooks not available" << std::endl;
return false;
}
// Get ComputeDevice from smo hooks
computeDevice = smoHooksPtr->ComputeManager_getDevice();
if (!computeDevice)
{
std::cerr << __func__ << ": failed to get compute device" << std::endl;
return false;
}
// Get StagingBuffer memory pointers from parent
struct iovec assemblyIov = parent.assemblyBuffer.getClEngineIovec();
struct iovec collationIov = parent.collationBuffer.getClEngineIovec();
assemblyBufferPtr = assemblyIov.iov_base;
assemblyBufferSize = assemblyIov.iov_len;
collationBufferPtr = collationIov.iov_base;
collationBufferSize = collationIov.iov_len;
// Get FrameAssemblyDesc from assembly buffer
frameAssemblyDesc = static_cast<std::shared_ptr<FrameAssemblyDesc>>(
parent.assemblyBuffer);
if (!frameAssemblyDesc || frameAssemblyDesc->slots.empty())
{
std::cerr << __func__ << ": invalid frame descriptor" << std::endl;
goto cleanup;
}
// Create OpenCL buffers using smo hooks
if (!smoHooksPtr->ComputeManager_createUseHostPtrBuffer)
{
std::cerr << __func__ << ": createUseHostPtrBuffer hook not available"
<< std::endl;
goto cleanup;
}
clAssemblyBufferClBuffer = smoHooksPtr
->ComputeManager_createUseHostPtrBuffer(
assemblyBufferPtr, assemblyBufferSize, CL_MEM_READ_WRITE);
if (!clAssemblyBufferClBuffer)
{
std::cerr << __func__ << ": failed to create assembly buffer"
<< std::endl;
goto cleanup;
}
clCollationBufferClBuffer = smoHooksPtr
->ComputeManager_createUseHostPtrBuffer(
collationBufferPtr, collationBufferSize, CL_MEM_WRITE_ONLY);
if (!clCollationBufferClBuffer)
{
std::cerr << __func__ << ": failed to create collation buffer"
<< std::endl;
goto cleanup;
}
// Cache cl_mem handles for the device we're using
clAssemblyBuffer = clAssemblyBufferClBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
clCollationBuffer = clCollationBufferClBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
if (!clAssemblyBuffer || !clCollationBuffer)
{
std::cerr << __func__ << ": failed to get buffer handles for device"
<< std::endl;
goto cleanup;
}
// Compile and prepare both kernels
if (!compileAndPrepareKernels()) {
goto cleanup;
}
clFlush(computeDevice->commandQueue);
clFinish(computeDevice->commandQueue);
shouldAcceptRequests = true;
return true;
cleanup:
finalize();
return false;
}
void OpenClCollatingAndMeshingEngine::finalize()
{
// Call stop() to set shouldAcceptRequests to false and get previous state
bool wasAcceptingRequests = stop();
(void)wasAcceptingRequests;
// Complete any running kernels
if (compactIsRunning) { compactKernelComplete(true); }
if (collateIsRunning) { collateKernelComplete(std::nullopt, std::nullopt, true); }
// Release OpenCL buffers via smo hooks
if (smoHooksPtr && smoHooksPtr->ComputeManager_releaseUseHostPtrBuffer)
{
if (clCollationBufferClBuffer)
{
smoHooksPtr->ComputeManager_releaseUseHostPtrBuffer(
clCollationBufferClBuffer);
clCollationBufferClBuffer.reset();
}
if (clAssemblyBufferClBuffer)
{
smoHooksPtr->ComputeManager_releaseUseHostPtrBuffer(
clAssemblyBufferClBuffer);
clAssemblyBufferClBuffer.reset();
}
}
// Reset cached cl_mem handles
clCollationBuffer = nullptr;
clAssemblyBuffer = nullptr;
// Release kernels
if (slotCompactorKernel)
{
clReleaseKernel(slotCompactorKernel);
slotCompactorKernel = nullptr;
}
if (collateKernel)
{
clReleaseKernel(collateKernel);
collateKernel = nullptr;
}
// Release programs
if (slotCompactorProgram)
{
clReleaseProgram(slotCompactorProgram);
slotCompactorProgram = nullptr;
}
if (collateProgram)
{
clReleaseProgram(collateProgram);
collateProgram = nullptr;
}
// Release compute device via smo hooks
if (smoHooksPtr && smoHooksPtr->ComputeManager_releaseDevice
&& computeDevice)
{
smoHooksPtr->ComputeManager_releaseDevice(computeDevice);
computeDevice.reset();
}
// Reset state variables
compactIsRunning = false;
collateIsRunning = false;
currentCompactKernelEvent = nullptr;
currentCollateKernelEvent = nullptr;
assemblyBufferPtr = nullptr;
assemblyBufferSize = 0;
collationBufferPtr = nullptr;
collationBufferSize = 0;
frameAssemblyDesc = nullptr;
}
// Static callback for compact kernel event
void CL_CALLBACK OpenClCollatingAndMeshingEngine::compactKernelEventCallback(
cl_event /*event*/, cl_int event_command_exec_status, void* user_data)
{
OpenClCollatingAndMeshingEngine* engine =
static_cast<OpenClCollatingAndMeshingEngine*>(user_data);
if (!engine || !engine->compactKernelCb)
{ return; }
// Post to io_service to call callback on the correct thread
if (engine->parent.device && engine->parent.device->componentThread)
{
engine->parent.device->componentThread->getIoService().post(
std::bind(engine->compactKernelCb, event_command_exec_status));
}
}
// Static callback for collate kernel event
void CL_CALLBACK OpenClCollatingAndMeshingEngine::collateKernelEventCallback(
cl_event /*event*/, cl_int event_command_exec_status, void* user_data)
{
OpenClCollatingAndMeshingEngine* engine =
static_cast<OpenClCollatingAndMeshingEngine*>(user_data);
if (!engine || !engine->collateKernelCb)
{ return; }
// Post to io_service to call callback on the correct thread
if (engine->parent.device && engine->parent.device->componentThread)
{
engine->parent.device->componentThread->getIoService().post(
std::bind(engine->collateKernelCb, event_command_exec_status));
}
}
bool OpenClCollatingAndMeshingEngine::startCompactKernel(
StagingBuffer& assemblyBuff, uint32_t nSucceeded,
compactKernelCbFn callback)
{
// Store the caller's callback
compactKernelCb = std::move(callback);
// Validate buffers callable
auto validateBuffers = [this, &assemblyBuff]() {
struct iovec assemblyIov = assemblyBuff.getClEngineIovec();
if (assemblyIov.iov_base != assemblyBufferPtr
|| assemblyIov.iov_len != assemblyBufferSize)
{
throw std::runtime_error(
std::string(__func__) + ": buffer mismatch - buffers have "
"changed");
}
};
// Setup args callable
auto setupArgs = [this, &assemblyBuff, nSucceeded]() {
return setupSlotCompactorsArgs(assemblyBuff, nSucceeded);
};
/** EXPLANAITON:
* Map assembly buffer as WRITE_INVALIDATE_REGION to inform OpenCL that host
* (io_uring) has written data, and that the host doesn't care what the
* prior contents of the device's cache see. The device must invalidate
* its own view of the HOST_PTR and accept our view.
*
* Then immediately unmap to let OpenCL make the changes visible to the GPU
*/
if (!mapAssemblyBuffer(CL_MAP_WRITE_INVALIDATE_REGION))
{
std::cerr << __func__ << ": failed to map assembly buffer" << std::endl;
return false;
}
// Unmap immediately to sync host writes to GPU
unmapAssemblyBuffer();
bool success = startKernel(
slotCompactorKernel,
&currentCompactKernelEvent,
setupArgs,
validateBuffers,
1, // globalWorkSize
compactKernelEventCallback,
"slotCompactor",
compactIsRunning);
if (!success) { return false; }
return true;
}
bool OpenClCollatingAndMeshingEngine::startCollateKernel(
StagingBuffer& assemblyBuff, StagingBuffer& collationBuff,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame,
collateKernelCbFn callback)
{
// Store the caller's callback
collateKernelCb = std::move(callback);
// Validate buffers callable
auto validateBuffers = [this, &assemblyBuff, &collationBuff]() {
struct iovec assemblyIov = assemblyBuff.getClEngineIovec();
struct iovec collationIov = collationBuff.getClEngineIovec();
if (assemblyIov.iov_base != assemblyBufferPtr
|| assemblyIov.iov_len != assemblyBufferSize
|| collationIov.iov_base != collationBufferPtr
|| collationIov.iov_len != collationBufferSize)
{
throw std::runtime_error(
std::string(__func__) + ": buffer mismatch - buffers have changed");
}
};
// Setup args callable
auto setupArgs = [
this, &assemblyBuff, intensityStimFrame, ambienceStimFrame
]()
{
return setupCollateDgramsArgs(
assemblyBuff, intensityStimFrame, ambienceStimFrame);
};
/** EXPLANATION:
* It shouldn't be necessary to map the assembly/collation buffers here
* since we don't need to read/write them on the host CPUs (unless we're
* intervening to debug; in which case we should map them as CL_MAP_READ).
*
* Otherwise, the foreign GPU's view of the data in the assembly buffer
* is currently up to date; and the collation buffer's state is undefined...
* and also irrelevant since it's only going to be used for output anyway.
*/
if (!mapAssemblyBuffer(CL_MAP_WRITE_INVALIDATE_REGION))
{
std::cerr << __func__ << ": failed to map assembly buffer" << std::endl;
return false;
}
unmapAssemblyBuffer();
if (!mapCollationBuffer(CL_MAP_WRITE))
{
std::cerr << __func__ << ": failed to map assembly buffer" << std::endl;
return false;
}
unmapCollationBuffer();
// Map/unmap intensity buffer if it exists
if (intensityStimFrame.has_value())
{
StimulusFrame& intensityFrame = intensityStimFrame->get();
cl_mem intensityClBuffer = intensityFrame.clBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
if (intensityClBuffer)
{
void* mappedIntensityBuffer = nullptr;
if (!mapBuffer(
intensityClBuffer, intensityFrame.slotDesc.nBytes,
CL_MAP_WRITE_INVALIDATE_REGION, mappedIntensityBuffer))
{
std::cerr << __func__ << ": failed to map intensity buffer"
<< std::endl;
return false;
}
unmapBuffer(intensityClBuffer, mappedIntensityBuffer);
}
}
// Map/unmap ambience buffer if it exists
if (ambienceStimFrame.has_value())
{
StimulusFrame& ambienceFrame = ambienceStimFrame->get();
cl_mem ambienceClBuffer = ambienceFrame.clBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
if (ambienceClBuffer)
{
void* mappedAmbienceBuffer = nullptr;
if (!mapBuffer(
ambienceClBuffer, ambienceFrame.slotDesc.nBytes,
CL_MAP_WRITE_INVALIDATE_REGION, mappedAmbienceBuffer))
{
std::cerr << __func__ << ": failed to map ambience buffer"
<< std::endl;
return false;
}
unmapBuffer(ambienceClBuffer, mappedAmbienceBuffer);
}
}
// Calculate global work size (just num slots in the frame)
size_t globalWorkSize = static_cast<uint32_t>(frameAssemblyDesc->numSlots);
bool success = startKernel(
collateKernel,
&currentCollateKernelEvent,
setupArgs,
validateBuffers,
globalWorkSize,
collateKernelEventCallback,
"collateDgrams",
collateIsRunning);
if (!success) { return false; }
return true;
}
bool OpenClCollatingAndMeshingEngine::compileAndPrepareKernel(
const char* kernelSource, size_t kernelSourceLen,
const char* kernelName, cl_program& program, cl_kernel& kernel)
{
cl_int err;
// Create program from source
program = clCreateProgramWithSource(
computeDevice->context, 1, &kernelSource, &kernelSourceLen, &err);
if (err != CL_SUCCESS || !program)
{
std::cerr << __func__ << ": failed to create " << kernelName
<< " program: " << err << std::endl;
return false;
}
// Build program
err = clBuildProgram(program, 1, &computeDevice->device,
nullptr, nullptr, nullptr);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to build " << kernelName
<< " program: " << err << std::endl;
// Print build log if available
size_t logSize = 0;
clGetProgramBuildInfo(
program, computeDevice->device, CL_PROGRAM_BUILD_LOG,
0, nullptr, &logSize);
if (logSize > 0)
{
std::vector<char> log(logSize);
clGetProgramBuildInfo(
program, computeDevice->device, CL_PROGRAM_BUILD_LOG,
logSize, log.data(), nullptr);
std::cerr << kernelName << " build log: " << log.data()
<< std::endl;
}
return false;
}
// Create kernel
kernel = clCreateKernel(program, kernelName, &err);
if (err != CL_SUCCESS || !kernel)
{
std::cerr << __func__ << ": failed to create " << kernelName
<< " kernel: " << err << std::endl;
return false;
}
return true;
}
bool OpenClCollatingAndMeshingEngine::compileAndPrepareKernels()
{
// Compile slotCompactor kernel
if (!compileAndPrepareKernel(
slotCompactorKernelStart, slotCompactorKernelNBytes,
"slotCompactor", slotCompactorProgram, slotCompactorKernel))
{
return false;
}
// Compile collateDgrams kernel
if (!compileAndPrepareKernel(
collateKernelStart, collateKernelNBytes,
"collate", collateProgram, collateKernel))
{
return false;
}
return true;
}
bool OpenClCollatingAndMeshingEngine::setupSlotCompactorsArgs(
StagingBuffer& assemblyBuff, uint32_t nSucceeded)
{
// Extract parameters for slotCompactor kernel
uint32_t numSlots = static_cast<uint32_t>(frameAssemblyDesc->numSlots);
uint32_t slotStride = static_cast<uint32_t>(assemblyBuff.slotStrideNBytes);
uint32_t slotSize = static_cast<uint32_t>(frameAssemblyDesc->slotSizeBytes);
uint32_t nSucceededUint = static_cast<uint32_t>(nSucceeded);
// Set kernel arguments for slotCompactor
cl_int err;
err = clSetKernelArg(
slotCompactorKernel, 0, sizeof(cl_mem), &clAssemblyBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 0: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(slotCompactorKernel, 1, sizeof(uint32_t), &numSlots);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 1: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(slotCompactorKernel, 2, sizeof(uint32_t), &slotStride);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 2: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(slotCompactorKernel, 3, sizeof(uint32_t), &slotSize);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 3: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(
slotCompactorKernel, 4, sizeof(uint32_t), &nSucceededUint);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 4: " << err
<< std::endl;
return false;
}
return true;
}
bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
StagingBuffer& assemblyBuff,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame)
{
// Extract parameters for collateDgrams kernel
uint32_t slotStride = static_cast<uint32_t>(assemblyBuff.slotStrideNBytes);
// Calculate nPointsPerSlot from device return mode
if (!parent.device)
{
std::cerr << __func__ << ": device not available" << std::endl;
return false;
}
int returnMode = static_cast<int>(parent.device->currentReturnMode);
uint32_t nPointsPerSlot = static_cast<uint32_t>(
livoxProto1::Device::getNPointsPerDgram(returnMode));
uint32_t nDgramsPerFrame = static_cast<uint32_t>(
frameAssemblyDesc->numSlots);
// Set kernel arguments for collateDgrams
cl_int err;
err = clSetKernelArg(collateKernel, 0, sizeof(cl_mem), &clAssemblyBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 0: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 1, sizeof(cl_mem), &clCollationBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 1: " << err
<< std::endl;
return false;
}
// Set intensity buffer argument (arg 2)
cl_mem intensityClBuffer = nullptr;
if (intensityStimFrame.has_value())
{
intensityClBuffer = intensityStimFrame->get().clBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
}
err = clSetKernelArg(collateKernel, 2, sizeof(cl_mem), &intensityClBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 2: " << err
<< std::endl;
return false;
}
// Set ambience buffer argument (arg 3)
cl_mem ambienceClBuffer = nullptr;
if (ambienceStimFrame.has_value())
{
ambienceClBuffer = ambienceStimFrame->get().clBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
}
err = clSetKernelArg(collateKernel, 3, sizeof(cl_mem), &ambienceClBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 3: " << err
<< std::endl;
return false;
}
// Set ambienceHighVal argument (arg 4)
uint32_t ambienceHighVal = 0;
std::shared_ptr<PcloudAmbienceStimulusBuffer> ambienceBuff = nullptr;
if (ambienceStimFrame.has_value()
&& (ambienceBuff = parent.ambienceStimulusBuffer.load(
std::memory_order_acquire)))
{
ambienceHighVal = ambienceBuff->ambienceHighVal;
}
err = clSetKernelArg(collateKernel, 4, sizeof(uint32_t), &ambienceHighVal);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 4: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 5, sizeof(uint32_t), &slotStride);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 3: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 6, sizeof(uint32_t), &nPointsPerSlot);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 6: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 7, sizeof(uint32_t), &nDgramsPerFrame);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 7: " << err
<< std::endl;
return false;
}
return true;
}
bool OpenClCollatingAndMeshingEngine::stop()
{
// Acquire and release lock tightly around setting the flag
SpinLock::Guard lock(shouldAcceptRequestsLock);
bool wasAcceptingRequests = shouldAcceptRequests;
shouldAcceptRequests = false;
return wasAcceptingRequests;
}
void OpenClCollatingAndMeshingEngine::compactKernelComplete(bool isFinalizing)
{
cl_map_flags mapFlags;
/** EXPLANATION:
* Technically we should only need to do this if we plan to read the
* compacted slots for debugging purposes. Otherwise this is unnecessary.
*/
if (isFinalizing) { mapFlags = CL_MAP_WRITE_INVALIDATE_REGION; }
else { mapFlags = CL_MAP_READ; }
if (mapAssemblyBuffer(mapFlags)) {
unmapAssemblyBuffer();
}
clFlush(computeDevice->commandQueue);
// Stop only compact kernel
if (compactIsRunning && currentCompactKernelEvent)
{
clWaitForEvents(1, &currentCompactKernelEvent);
clReleaseEvent(currentCompactKernelEvent);
currentCompactKernelEvent = nullptr;
}
clFinish(computeDevice->commandQueue);
compactKernelCb = [](cl_int){};
compactIsRunning = false;
}
void OpenClCollatingAndMeshingEngine::collateKernelComplete(
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame,
bool isFinalizing)
{
cl_map_flags mapFlags;
/** EXPLANATION:
* Technically we should only need to do this if we plan to read the
* collated dgrams for debugging purposes. Otherwise this is unnecessary.
*/
if (isFinalizing) { mapFlags = CL_MAP_WRITE_INVALIDATE_REGION; }
else { mapFlags = CL_MAP_READ; }
if (mapCollationBuffer(mapFlags)) {
unmapCollationBuffer();
}
// Map/unmap intensity buffer if it exists
if (intensityStimFrame.has_value())
{
StimulusFrame& intensityFrame = intensityStimFrame->get();
cl_mem intensityClBuffer = intensityFrame.clBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
if (intensityClBuffer)
{
void* mappedIntensityBuffer = nullptr;
if (mapBuffer(
intensityClBuffer, intensityFrame.slotDesc.nBytes,
CL_MAP_READ, mappedIntensityBuffer))
{
unmapBuffer(intensityClBuffer, mappedIntensityBuffer);
}
}
}
// Map/unmap ambience buffer if it exists
if (ambienceStimFrame.has_value())
{
StimulusFrame& ambienceFrame = ambienceStimFrame->get();
cl_mem ambienceClBuffer = ambienceFrame.clBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
if (ambienceClBuffer)
{
void* mappedAmbienceBuffer = nullptr;
if (mapBuffer(
ambienceClBuffer, ambienceFrame.slotDesc.nBytes,
CL_MAP_READ, mappedAmbienceBuffer))
{
unmapBuffer(ambienceClBuffer, mappedAmbienceBuffer);
}
}
}
clFlush(computeDevice->commandQueue);
// Stop only collate kernel
if (collateIsRunning && currentCollateKernelEvent)
{
clWaitForEvents(1, &currentCollateKernelEvent);
clReleaseEvent(currentCollateKernelEvent);
currentCollateKernelEvent = nullptr;
}
clFinish(computeDevice->commandQueue);
collateKernelCb = [](cl_int){};
collateIsRunning = false;
}
bool OpenClCollatingAndMeshingEngine::mapBuffer(
cl_mem buffer, size_t size, cl_map_flags mapFlags, void*& mappedPtr)
{
if (!computeDevice->commandQueue || !buffer)
{
std::cerr << __func__ << ": engine not set up or invalid buffer"
<< std::endl;
return false;
}
// If already mapped, return early with success.
if (mappedPtr != nullptr) { return true; }
cl_int err;
mappedPtr = clEnqueueMapBuffer(
computeDevice->commandQueue, buffer, CL_TRUE, mapFlags,
0, size, 0, nullptr, nullptr, &err);
if (err != CL_SUCCESS || !mappedPtr)
{
std::cerr << __func__ << ": failed to map buffer: " << err
<< std::endl;
goto cleanup;
}
return true;
cleanup:
mappedPtr = nullptr;
return false;
}
bool OpenClCollatingAndMeshingEngine::unmapBuffer(
cl_mem buffer, void*& mappedPtr
)
{
if (mappedPtr == nullptr)
{
// Already unmapped
return true;
}
if (!computeDevice->commandQueue || !buffer)
{
std::cerr << __func__ << ": engine not set up or invalid buffer.\n";
return false;
}
cl_int err;
cl_event unmapEvent = nullptr;
err = clEnqueueUnmapMemObject(
computeDevice->commandQueue, buffer, mappedPtr,
0, nullptr, &unmapEvent);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to unmap buffer: " << err
<< std::endl;
goto cleanup;
}
// Wait for unmap to complete and release the event
err = clWaitForEvents(1, &unmapEvent);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to wait for unmap event: " << err
<< std::endl;
goto cleanup_unmapEvent;
}
clReleaseEvent(unmapEvent);
mappedPtr = nullptr;
return true;
cleanup_unmapEvent:
clReleaseEvent(unmapEvent);
cleanup:
return false;
}
bool OpenClCollatingAndMeshingEngine::mapAssemblyBuffer(cl_map_flags mapFlags)
{
return mapBuffer(
clAssemblyBuffer, assemblyBufferSize, mapFlags, mappedAssemblyBuffer);
}
bool OpenClCollatingAndMeshingEngine::unmapAssemblyBuffer()
{
unmapBuffer(clAssemblyBuffer, mappedAssemblyBuffer);
mappedAssemblyBuffer = nullptr;
return true;
}
bool OpenClCollatingAndMeshingEngine::mapCollationBuffer(cl_map_flags mapFlags)
{
return mapBuffer(
clCollationBuffer, collationBufferSize, mapFlags,
mappedCollationBuffer);
}
bool OpenClCollatingAndMeshingEngine::unmapCollationBuffer()
{
unmapBuffer(clCollationBuffer, mappedCollationBuffer);
mappedCollationBuffer = nullptr;
return true;
}
class OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq
: public PostedAsynchronousContinuation<compactCollateAndMeshFrameReqCbFn>
{
private:
OpenClCollatingAndMeshingEngine& engine;
AsynchronousLoop frameAssemblyResult;
StimulusFrame& stimulusFrame;
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame;
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame;
public:
CompactCollateAndMeshFrameReq(
OpenClCollatingAndMeshingEngine& engine_,
AsynchronousLoop& asyncLoop,
StimulusFrame& stimulusFrame_,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame_,
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame_,
const std::shared_ptr<ComponentThread>& caller,
Callback<compactCollateAndMeshFrameReqCbFn> cb)
: PostedAsynchronousContinuation<compactCollateAndMeshFrameReqCbFn>(
caller, cb),
engine(engine_),
frameAssemblyResult(asyncLoop), stimulusFrame(stimulusFrame_),
intensityStimFrame(intensityStimFrame_),
ambienceStimFrame(ambienceStimFrame_)
{}
public:
void callOriginalCallback(bool success)
{ callOriginalCb(success, std::ref(stimulusFrame)); }
public:
void compactCollateAndMeshFrameReq1_doCompact_posted(
std::shared_ptr<CompactCollateAndMeshFrameReq> context)
{
SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
if (!engine.shouldAcceptRequests)
{
callOriginalCallback(false);
return;
}
// Record compact kernel start time
engine.compactKernelStartTime = std::chrono::high_resolution_clock::now();
bool success = engine.startCompactKernel(
engine.parent.assemblyBuffer,
static_cast<uint32_t>(context->frameAssemblyResult.nSucceeded.load()),
std::bind(
&CompactCollateAndMeshFrameReq
::compactCollateAndMeshFrameReq2_compactDone_posted,
context.get(), context,
std::placeholders::_1));
if (!success)
{
engine.compactKernelComplete();
callOriginalCallback(false);
return;
}
}
void compactCollateAndMeshFrameReq2_compactDone_posted(
std::shared_ptr<CompactCollateAndMeshFrameReq> context,
cl_int compactStatus)
{
SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
if (!engine.shouldAcceptRequests)
{
/** EXPLANATION:
* We intentionally don't call compactKernelComplete() here because
* if shouldAcceptRequests is false, then the caller that called
* finalize() will also be forced to call compactKernelComplete()
* inside of finalize().
*/
callOriginalCallback(false);
return;
}
engine.compactKernelComplete();
// Record compact kernel end time
engine.compactKernelEndTime = std::chrono::high_resolution_clock::now();
// If compact failed, call callback directly with failure
if (compactStatus != CL_SUCCESS)
{
callOriginalCallback(false);
return;
}
#if 0
// Print first 4 bytes of each slot
if (engine.frameAssemblyDesc)
{
for (size_t i = 0; i < engine.frameAssemblyDesc->numSlots; ++i) {
engine.parent.ioUringAssemblyEngine.printSlotBytes(i, 4);
}
}
#endif
lock.unlockPrematurely();
context->compactCollateAndMeshFrameReq3_doCollate_posted(context);
}
void compactCollateAndMeshFrameReq3_doCollate_posted(
std::shared_ptr<CompactCollateAndMeshFrameReq> context)
{
SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
if (!engine.shouldAcceptRequests)
{
callOriginalCallback(false);
return;
}
// Record collate kernel start time
engine.collateKernelStartTime = std::chrono::high_resolution_clock::now();
bool success = engine.startCollateKernel(
engine.parent.assemblyBuffer, engine.parent.collationBuffer,
context->intensityStimFrame, context->ambienceStimFrame,
std::bind(
&CompactCollateAndMeshFrameReq
::compactCollateAndMeshFrameReq4_collateDone_maybePosted,
context.get(), context,
std::placeholders::_1));
if (!success)
{
engine.collateKernelComplete(
context->intensityStimFrame, context->ambienceStimFrame);
callOriginalCallback(false);
return;
}
}
void compactCollateAndMeshFrameReq4_collateDone_maybePosted(
[[maybe_unused]] std::shared_ptr<CompactCollateAndMeshFrameReq> context,
cl_int collateStatus)
{
SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
if (!engine.shouldAcceptRequests)
{
/* We intentionally don't call collateKernelComplete() here for the
* same reason as above.
*/
callOriginalCallback(false);
return;
}
/** EXPLANATION:
* The reason we don't call collateKernelComplete before checking
* shouldAcceptRequests is because if shouldAcceptRequests is false, then
* we shouldn't touch any of the collate cycle state...or any state within
* the engine at all, since finalize() may well have been called.
*
* Therefore it's finalize()'s responsibility to ensure that it properly
* completes/cleans up any in-flight operations.
*/
engine.collateKernelComplete(
context->intensityStimFrame, context->ambienceStimFrame);
// Record collate kernel end time
engine.collateKernelEndTime = std::chrono::high_resolution_clock::now();
bool success = (collateStatus == CL_SUCCESS);
// Early callback + return pattern
if (!success)
{
callOriginalCallback(false);
return;
}
int returnMode = static_cast<int>(engine.parent.device->currentReturnMode);
size_t pointsPerDgram = livoxProto1::Device::getNPointsPerDgram(
returnMode);
uint32_t nSucceeded = context->frameAssemblyResult.nSucceeded.load();
size_t totalPoints = nSucceeded * pointsPerDgram;
// Count points with intensity greater than 116
size_t highIntensityCount = 0;
if (context->intensityStimFrame.has_value())
{
StimulusFrame& intensityFrame = context->intensityStimFrame->get();
float* intensityFloats = reinterpret_cast<float*>(intensityFrame.slotDesc.vaddr);
for (size_t i = 0; i < totalPoints; ++i)
{
float intensity = intensityFloats[i];
if (intensity >= 116.0f)
{
++highIntensityCount;
}
}
}
// Sum up ambience counts from ambience buffer
uint32_t ambienceCountSum = 0;
if (context->ambienceStimFrame.has_value())
{
StimulusFrame& ambienceFrame = context->ambienceStimFrame->get();
uint32_t* ambienceCounts = reinterpret_cast<uint32_t*>(ambienceFrame.slotDesc.vaddr);
for (uint32_t i = 0; i < nSucceeded; ++i)
{
ambienceCountSum += ambienceCounts[i];
}
}
std::cout << __func__ << ": intensityRingBufferIndex="
<< (context->intensityStimFrame.has_value() ?
context->intensityStimFrame->get().ringBufferIndex : SIZE_MAX)
<< ", ambienceRingBufferIndex="
<< (context->ambienceStimFrame.has_value() ?
context->ambienceStimFrame->get().ringBufferIndex : SIZE_MAX)
<< ", pointsPerDgram=" << pointsPerDgram
<< ", nSucceeded=" << nSucceeded
<< ", totalPoints=" << totalPoints
<< ", highIntensityCount=" << highIntensityCount
<< ", ambienceCountSum=" << ambienceCountSum << std::endl;
callOriginalCallback(success);
}
};
void OpenClCollatingAndMeshingEngine::compactCollateAndMeshFrameReq(
AsynchronousLoop& asyncLoop, StimulusFrame& stimulusFrame,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame,
Callback<compactCollateAndMeshFrameReqCbFn> callback)
{
{
SpinLock::Guard lock(shouldAcceptRequestsLock);
if (!shouldAcceptRequests)
{
callback.callbackFn(false, stimulusFrame);
return;
}
}
auto caller = smoHooksPtr->ComponentThread_getSelf();
auto request = std::make_shared<CompactCollateAndMeshFrameReq>(
*this, asyncLoop, stimulusFrame, intensityStimFrame, ambienceStimFrame,
caller,
std::move(callback));
// Check if compaction is needed
bool needsCompaction = IoUringAssemblyEngine::compactionIsNeeded(
asyncLoop.nSucceeded.load(), asyncLoop.nTotal);
// Start with compaction if needed, then chain to collation
if (needsCompaction)
{
parent.device->componentThread->getIoService().post(
STC(std::bind(
&CompactCollateAndMeshFrameReq
::compactCollateAndMeshFrameReq1_doCompact_posted,
request.get(), request)));
}
else
{
// Skip compaction, go straight to collation
parent.device->componentThread->getIoService().post(
STC(std::bind(
&CompactCollateAndMeshFrameReq
::compactCollateAndMeshFrameReq3_doCollate_posted,
request.get(), request)));
}
}
std::chrono::milliseconds OpenClCollatingAndMeshingEngine::getCompactKernelDuration() const
{
auto duration = compactKernelEndTime - compactKernelStartTime;
if (duration.count() < 0)
{
return std::chrono::milliseconds(0);
}
return std::chrono::duration_cast<std::chrono::milliseconds>(duration);
}
std::chrono::milliseconds OpenClCollatingAndMeshingEngine::getCollateKernelDuration() const
{
auto duration = collateKernelEndTime - collateKernelStartTime;
if (duration.count() < 0)
{
return std::chrono::milliseconds(0);
}
return std::chrono::duration_cast<std::chrono::milliseconds>(duration);
}
} // namespace stim_buff
} // namespace smo