Files
salmanoff/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.cpp
T
hayodea 280b6f7d1c OClCollMeshEngn: Produce ambience count; set postrin threshold
We modify the semantics/meaning of the ambience stim feature.
It now represents the number of frames whose average intensity
is below the ambienceLowVal.

We can now implement the postrin as the event wherein the number
of frames whose intensity <= ambienceLowVal exceeds
postrin-interest-threshold.
2025-11-28 02:55:24 -04:00

1321 lines
37 KiB
C++

#include <boostAsioLinkageFix.h>
#include <stdexcept>
#include <iostream>
#include <cstring>
#include <cstddef>
#include <vector>
#include <string>
#include <string_view>
#include <algorithm>
#include <boost/system/error_code.hpp>
#include <boost/asio/deadline_timer.hpp>
#include <asynchronousContinuation.h>
#include <asynchronousBridge.h>
#include <callback.h>
#include <asynchronousLoop.h>
#include <componentThread.h>
#include <user/stimulusFrame.h>
#include <livoxProto1/device.h>
#include "livoxGen1.h"
#include "openClCollatingAndMeshingEngine.h"
#include "pcloudStimulusProducer.h"
#include "openClKernels.h"
#include <user/frameAssemblyDesc.h>
#include "ioUringAssemblyEngine.h"
#include <user/senseApiDesc.h>
extern const smo::stim_buff::SmoCallbacks* smoHooksPtr;
namespace smo {
namespace stim_buff {
OpenClCollatingAndMeshingEngine::OpenClCollatingAndMeshingEngine(
PcloudStimulusProducer& parent_)
: parent(parent_),
computeDevice(nullptr),
slotCompactorProgram(nullptr), collateProgram(nullptr),
slotCompactorKernel(nullptr), collateKernel(nullptr),
clAssemblyBufferClBuffer(nullptr),
clCollationBufferClBuffer(nullptr),
clAverageIntensityBufferClBuffer(nullptr),
clAssemblyBuffer(nullptr),
clCollationBuffer(nullptr),
clAverageIntensityBuffer(nullptr),
shouldAcceptRequests(false),
compactIsRunning(false),
collateIsRunning(false),
currentCompactKernelEvent(nullptr), currentCollateKernelEvent(nullptr),
assemblyBufferPtr(nullptr),
assemblyBufferSize(0),
collationBufferPtr(nullptr),
collationBufferSize(0),
averageIntensityBufferPtr(nullptr),
averageIntensityBufferSize(0),
mappedAssemblyBuffer(nullptr),
mappedCollationBuffer(nullptr),
mappedAverageIntensityBuffer(nullptr),
frameAssemblyDesc(nullptr)
{
}
OpenClCollatingAndMeshingEngine::~OpenClCollatingAndMeshingEngine()
{
finalize();
}
bool OpenClCollatingAndMeshingEngine::setup()
{
// Defensive check to prevent double-calling
{
SpinLock::Guard lock(shouldAcceptRequestsLock);
if (shouldAcceptRequests)
{
throw std::runtime_error(std::string(__func__) + ": setup() called "
"while already set up");
}
}
if (!smoHooksPtr || !smoHooksPtr->ComputeManager_getDevice)
{
std::cerr << __func__ << ": smo hooks not available" << std::endl;
return false;
}
// Get ComputeDevice from smo hooks
computeDevice = smoHooksPtr->ComputeManager_getDevice();
if (!computeDevice)
{
std::cerr << __func__ << ": failed to get compute device" << std::endl;
return false;
}
// Get StagingBuffer memory pointers from parent
struct iovec assemblyIov = parent.assemblyBuffer.getClEngineIovec();
struct iovec collationIov = parent.collationBuffer.getClEngineIovec();
struct iovec averageIntensityIov = parent.averageIntensityBuffer
.getClEngineIovec();
assemblyBufferPtr = assemblyIov.iov_base;
assemblyBufferSize = assemblyIov.iov_len;
collationBufferPtr = collationIov.iov_base;
collationBufferSize = collationIov.iov_len;
averageIntensityBufferPtr = averageIntensityIov.iov_base;
averageIntensityBufferSize = averageIntensityIov.iov_len;
// Get FrameAssemblyDesc from assembly buffer
frameAssemblyDesc = static_cast<std::shared_ptr<FrameAssemblyDesc>>(
parent.assemblyBuffer);
if (!frameAssemblyDesc || frameAssemblyDesc->slots.empty())
{
std::cerr << __func__ << ": invalid frame descriptor" << std::endl;
goto cleanup;
}
// Create OpenCL buffers using smo hooks
if (!smoHooksPtr->ComputeManager_createUseHostPtrBuffer)
{
std::cerr << __func__ << ": createUseHostPtrBuffer hook not available"
<< std::endl;
goto cleanup;
}
clAssemblyBufferClBuffer = smoHooksPtr
->ComputeManager_createUseHostPtrBuffer(
assemblyBufferPtr, assemblyBufferSize, CL_MEM_READ_WRITE);
if (!clAssemblyBufferClBuffer)
{
std::cerr << __func__ << ": failed to create assembly buffer"
<< std::endl;
goto cleanup;
}
clCollationBufferClBuffer = smoHooksPtr
->ComputeManager_createUseHostPtrBuffer(
collationBufferPtr, collationBufferSize, CL_MEM_WRITE_ONLY);
if (!clCollationBufferClBuffer)
{
std::cerr << __func__ << ": failed to create collation buffer"
<< std::endl;
goto cleanup;
}
clAverageIntensityBufferClBuffer = smoHooksPtr
->ComputeManager_createUseHostPtrBuffer(
averageIntensityBufferPtr, averageIntensityBufferSize,
CL_MEM_WRITE_ONLY);
if (!clAverageIntensityBufferClBuffer)
{
std::cerr << __func__ << ": failed to create average intensity buffer"
<< std::endl;
goto cleanup;
}
// Cache cl_mem handles for the device we're using
clAssemblyBuffer = clAssemblyBufferClBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
clCollationBuffer = clCollationBufferClBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
clAverageIntensityBuffer = clAverageIntensityBufferClBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
if (!clAssemblyBuffer || !clCollationBuffer || !clAverageIntensityBuffer)
{
std::cerr << __func__ << ": failed to get buffer handles for device"
<< std::endl;
goto cleanup;
}
// Compile and prepare both kernels
if (!compileAndPrepareKernels()) {
goto cleanup;
}
clFlush(computeDevice->commandQueue);
clFinish(computeDevice->commandQueue);
shouldAcceptRequests = true;
return true;
cleanup:
finalize();
return false;
}
void OpenClCollatingAndMeshingEngine::finalize()
{
// Call stop() to set shouldAcceptRequests to false and get previous state
bool wasAcceptingRequests = stop();
(void)wasAcceptingRequests;
// Complete any running kernels
if (compactIsRunning) { compactKernelComplete(true); }
if (collateIsRunning) {
collateKernelComplete(std::nullopt, std::nullopt, true);
}
{
/** EXPLANATION:
* Calculate the delay as the maximum of the configured delay and any
* future delays. The 0 is a placeholder for any delays that will be
* introduced in the future. When new delays are added, they should be
* included in the std::max() call (e.g., std::max(
* OCLCOLLMESH_ENGN_FINALIZE_DELAY_MS, futureDelay1, futureDelay2, 0)).
*/
int delayMs = std::max(OCLCOLLMESH_ENGN_FINALIZE_DELAY_MS, 0);
auto& ioService = smoHooksPtr->ComponentThread_getSelf()->getIoService();
AsynchronousBridge bridge(ioService);
boost::asio::deadline_timer timeoutTimer(ioService);
/** EXPLANATION:
* We wait for delayMs milliseconds to ensure that any in-flight OpenCL
* kernel operations have definitely finished. OpenCL kernels cannot be
* cancelled once enqueued, so in-flight kernels may still be executing
* when finalize() is called. The delay ensures any running kernels
* complete and their callbacks execute before we destroy resources.
* This prevents use-after-free errors from resumed async continuations
* accessing destroyed state.
*/
timeoutTimer.expires_from_now(
boost::posix_time::milliseconds(delayMs));
timeoutTimer.async_wait(
[&bridge](const boost::system::error_code& error)
{
(void)error;
// Always signal complete, whether timeout expired or was cancelled
bridge.setAsyncOperationComplete();
});
bridge.waitForAsyncOperationCompleteOrIoServiceStopped();
}
// Release OpenCL buffers via smo hooks
if (smoHooksPtr && smoHooksPtr->ComputeManager_releaseUseHostPtrBuffer)
{
if (clAverageIntensityBufferClBuffer)
{
smoHooksPtr->ComputeManager_releaseUseHostPtrBuffer(
clAverageIntensityBufferClBuffer);
clAverageIntensityBufferClBuffer.reset();
}
if (clCollationBufferClBuffer)
{
smoHooksPtr->ComputeManager_releaseUseHostPtrBuffer(
clCollationBufferClBuffer);
clCollationBufferClBuffer.reset();
}
if (clAssemblyBufferClBuffer)
{
smoHooksPtr->ComputeManager_releaseUseHostPtrBuffer(
clAssemblyBufferClBuffer);
clAssemblyBufferClBuffer.reset();
}
}
// Reset cached cl_mem handles
clCollationBuffer = nullptr;
clAverageIntensityBuffer = nullptr;
clAssemblyBuffer = nullptr;
// Release kernels
if (slotCompactorKernel)
{
clReleaseKernel(slotCompactorKernel);
slotCompactorKernel = nullptr;
}
if (collateKernel)
{
clReleaseKernel(collateKernel);
collateKernel = nullptr;
}
// Release programs
if (slotCompactorProgram)
{
clReleaseProgram(slotCompactorProgram);
slotCompactorProgram = nullptr;
}
if (collateProgram)
{
clReleaseProgram(collateProgram);
collateProgram = nullptr;
}
// Release compute device via smo hooks
if (smoHooksPtr && smoHooksPtr->ComputeManager_releaseDevice
&& computeDevice)
{
smoHooksPtr->ComputeManager_releaseDevice(computeDevice);
computeDevice.reset();
}
// Reset state variables
compactIsRunning = false;
collateIsRunning = false;
currentCompactKernelEvent = nullptr;
currentCollateKernelEvent = nullptr;
assemblyBufferPtr = nullptr;
assemblyBufferSize = 0;
collationBufferPtr = nullptr;
collationBufferSize = 0;
averageIntensityBufferPtr = nullptr;
averageIntensityBufferSize = 0;
frameAssemblyDesc = nullptr;
}
// Static callback for compact kernel event
void CL_CALLBACK OpenClCollatingAndMeshingEngine::compactKernelEventCallback(
cl_event /*event*/, cl_int event_command_exec_status, void* user_data)
{
OpenClCollatingAndMeshingEngine* engine =
static_cast<OpenClCollatingAndMeshingEngine*>(user_data);
if (!engine || !engine->compactKernelCb)
{ return; }
// Post to io_service to call callback on the correct thread
if (engine->parent.device && engine->parent.device->componentThread)
{
engine->parent.device->componentThread->getIoService().post(
std::bind(engine->compactKernelCb, event_command_exec_status));
}
}
// Static callback for collate kernel event
void CL_CALLBACK OpenClCollatingAndMeshingEngine::collateKernelEventCallback(
cl_event /*event*/, cl_int event_command_exec_status, void* user_data)
{
OpenClCollatingAndMeshingEngine* engine =
static_cast<OpenClCollatingAndMeshingEngine*>(user_data);
if (!engine || !engine->collateKernelCb)
{ return; }
// Post to io_service to call callback on the correct thread
if (engine->parent.device && engine->parent.device->componentThread)
{
engine->parent.device->componentThread->getIoService().post(
std::bind(engine->collateKernelCb, event_command_exec_status));
}
}
bool OpenClCollatingAndMeshingEngine::startCompactKernel(
StagingBuffer& assemblyBuff, uint32_t nSucceeded,
compactKernelCbFn callback)
{
// Store the caller's callback
compactKernelCb = std::move(callback);
// Validate buffers callable
auto validateBuffers = [this, &assemblyBuff]() {
struct iovec assemblyIov = assemblyBuff.getClEngineIovec();
if (assemblyIov.iov_base != assemblyBufferPtr
|| assemblyIov.iov_len != assemblyBufferSize)
{
throw std::runtime_error(
std::string(__func__) + ": buffer mismatch - buffers have "
"changed");
}
};
// Setup args callable
auto setupArgs = [this, &assemblyBuff, nSucceeded]() {
return setupSlotCompactorsArgs(assemblyBuff, nSucceeded);
};
/** EXPLANAITON:
* Map assembly buffer as WRITE_INVALIDATE_REGION to inform OpenCL that host
* (io_uring) has written data, and that the host doesn't care what the
* prior contents of the device's cache see. The device must invalidate
* its own view of the HOST_PTR and accept our view.
*
* Then immediately unmap to let OpenCL make the changes visible to the GPU
*/
if (!mapAssemblyBuffer(CL_MAP_WRITE_INVALIDATE_REGION))
{
std::cerr << __func__ << ": failed to map assembly buffer" << std::endl;
return false;
}
// Unmap immediately to sync host writes to GPU
unmapAssemblyBuffer();
bool success = startKernel(
slotCompactorKernel,
&currentCompactKernelEvent,
setupArgs,
validateBuffers,
1, // globalWorkSize
compactKernelEventCallback,
"slotCompactor",
compactIsRunning);
if (!success) { return false; }
return true;
}
bool OpenClCollatingAndMeshingEngine::startCollateKernel(
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame,
collateKernelCbFn callback)
{
// Store the caller's callback
collateKernelCb = std::move(callback);
// Validate buffers callable
auto validateBuffers = [this]() {
struct iovec assemblyIov = parent.assemblyBuffer.getClEngineIovec();
struct iovec collationIov = parent.collationBuffer.getClEngineIovec();
struct iovec averageIntensityIov = parent.averageIntensityBuffer
.getClEngineIovec();
if (assemblyIov.iov_base != assemblyBufferPtr
|| assemblyIov.iov_len != assemblyBufferSize
|| collationIov.iov_base != collationBufferPtr
|| collationIov.iov_len != collationBufferSize
|| averageIntensityIov.iov_base != averageIntensityBufferPtr
|| averageIntensityIov.iov_len != averageIntensityBufferSize)
{
throw std::runtime_error(
std::string(__func__) + ": buffer mismatch - buffers have changed");
}
};
// Setup args callable
auto setupArgs = [this, intensityStimFrame, ambienceStimFrame]()
{
return setupCollateDgramsArgs(intensityStimFrame, ambienceStimFrame);
};
/** EXPLANATION:
* It shouldn't be necessary to map the assembly/collation buffers here
* since we don't need to read/write them on the host CPUs (unless we're
* intervening to debug; in which case we should map them as CL_MAP_READ).
*
* Otherwise, the foreign GPU's view of the data in the assembly buffer
* is currently up to date; and the collation buffer's state is undefined...
* and also irrelevant since it's only going to be used for output anyway.
*/
if (!mapAssemblyBuffer(CL_MAP_WRITE_INVALIDATE_REGION))
{
std::cerr << __func__ << ": failed to map assembly buffer" << std::endl;
return false;
}
unmapAssemblyBuffer();
if (!mapCollationBuffer(CL_MAP_WRITE))
{
std::cerr << __func__ << ": failed to map assembly buffer" << std::endl;
return false;
}
unmapCollationBuffer();
if (!mapAverageIntensityBuffer(CL_MAP_WRITE))
{
std::cerr << __func__ << ": failed to map average intensity buffer"
<< std::endl;
return false;
}
unmapAverageIntensityBuffer();
// Map/unmap intensity buffer if it exists
if (intensityStimFrame.has_value())
{
StimulusFrame& intensityFrame = intensityStimFrame->get();
cl_mem intensityClBuffer = intensityFrame.clBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
if (intensityClBuffer)
{
void* mappedIntensityBuffer = nullptr;
if (!mapBuffer(
intensityClBuffer, intensityFrame.slotDesc.nBytes,
CL_MAP_WRITE_INVALIDATE_REGION, mappedIntensityBuffer))
{
std::cerr << __func__ << ": failed to map intensity buffer"
<< std::endl;
return false;
}
unmapBuffer(intensityClBuffer, mappedIntensityBuffer);
}
}
// Calculate global work size (just num slots in the frame)
size_t globalWorkSize = static_cast<uint32_t>(frameAssemblyDesc->numSlots);
bool success = startKernel(
collateKernel,
&currentCollateKernelEvent,
setupArgs,
validateBuffers,
globalWorkSize,
collateKernelEventCallback,
"collateDgrams",
collateIsRunning);
if (!success) { return false; }
return true;
}
bool OpenClCollatingAndMeshingEngine::compileAndPrepareKernel(
const char* kernelSource, size_t kernelSourceLen,
const char* kernelName, cl_program& program, cl_kernel& kernel)
{
cl_int err;
// Create program from source
program = clCreateProgramWithSource(
computeDevice->context, 1, &kernelSource, &kernelSourceLen, &err);
if (err != CL_SUCCESS || !program)
{
std::cerr << __func__ << ": failed to create " << kernelName
<< " program: " << err << std::endl;
return false;
}
// Build program
err = clBuildProgram(program, 1, &computeDevice->device,
nullptr, nullptr, nullptr);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to build " << kernelName
<< " program: " << err << std::endl;
// Print build log if available
size_t logSize = 0;
clGetProgramBuildInfo(
program, computeDevice->device, CL_PROGRAM_BUILD_LOG,
0, nullptr, &logSize);
if (logSize > 0)
{
std::vector<char> log(logSize);
clGetProgramBuildInfo(
program, computeDevice->device, CL_PROGRAM_BUILD_LOG,
logSize, log.data(), nullptr);
std::cerr << kernelName << " build log: " << log.data()
<< std::endl;
}
return false;
}
// Create kernel
kernel = clCreateKernel(program, kernelName, &err);
if (err != CL_SUCCESS || !kernel)
{
std::cerr << __func__ << ": failed to create " << kernelName
<< " kernel: " << err << std::endl;
return false;
}
return true;
}
bool OpenClCollatingAndMeshingEngine::compileAndPrepareKernels()
{
// Compile slotCompactor kernel
if (!compileAndPrepareKernel(
slotCompactorKernelStart, slotCompactorKernelNBytes,
"slotCompactor", slotCompactorProgram, slotCompactorKernel))
{
return false;
}
// Compile collateDgrams kernel
if (!compileAndPrepareKernel(
collateKernelStart, collateKernelNBytes,
"collate", collateProgram, collateKernel))
{
return false;
}
return true;
}
bool OpenClCollatingAndMeshingEngine::setupSlotCompactorsArgs(
StagingBuffer& assemblyBuff, uint32_t nSucceeded)
{
// Extract parameters for slotCompactor kernel
uint32_t numSlots = static_cast<uint32_t>(frameAssemblyDesc->numSlots);
uint32_t slotStride = static_cast<uint32_t>(assemblyBuff.slotStrideNBytes);
uint32_t slotSize = static_cast<uint32_t>(frameAssemblyDesc->slotSizeBytes);
uint32_t nSucceededUint = static_cast<uint32_t>(nSucceeded);
// Set kernel arguments for slotCompactor
cl_int err;
err = clSetKernelArg(
slotCompactorKernel, 0, sizeof(cl_mem), &clAssemblyBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 0: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(slotCompactorKernel, 1, sizeof(uint32_t), &numSlots);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 1: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(slotCompactorKernel, 2, sizeof(uint32_t), &slotStride);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 2: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(slotCompactorKernel, 3, sizeof(uint32_t), &slotSize);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 3: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(
slotCompactorKernel, 4, sizeof(uint32_t), &nSucceededUint);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 4: " << err
<< std::endl;
return false;
}
return true;
}
bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame)
{
// Extract parameters for collateDgrams kernel
uint32_t slotStride = static_cast<uint32_t>(
parent.assemblyBuffer.slotStrideNBytes);
// Calculate nPointsPerSlot from device return mode
if (!parent.device)
{
std::cerr << __func__ << ": device not available" << std::endl;
return false;
}
int returnMode = static_cast<int>(parent.device->currentReturnMode);
uint32_t nPointsPerSlot = static_cast<uint32_t>(
livoxProto1::Device::getNPointsPerDgram(returnMode));
uint32_t nDgramsPerFrame = static_cast<uint32_t>(
frameAssemblyDesc->numSlots);
// Set kernel arguments for collateDgrams
cl_int err;
err = clSetKernelArg(collateKernel, 0, sizeof(cl_mem), &clAssemblyBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 0: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 1, sizeof(cl_mem), &clCollationBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 1: " << err
<< std::endl;
return false;
}
// Set intensity buffer argument (arg 2)
cl_mem intensityClBuffer = nullptr;
if (intensityStimFrame.has_value())
{
intensityClBuffer = intensityStimFrame->get().clBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
}
err = clSetKernelArg(collateKernel, 2, sizeof(cl_mem), &intensityClBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 2: " << err
<< std::endl;
return false;
}
// Set average intensity buffer argument (arg 3)
/** EXPLANATION:
* We only pass the average intensity buffer argument to the collate kernel
* when ambienceStimFrame is present. This is because the collate kernel
* only needs the average intensity buffer if ambience processing is
* requested (i.e., the ambience stimulus buffer is attached). If no
* ambienceStimFrame is supplied, we skip passing the buffer to avoid
* unnecessary work.
*/
cl_mem averageIntensityClBuffer = nullptr;
if (ambienceStimFrame.has_value()) {
averageIntensityClBuffer = clAverageIntensityBuffer;
}
err = clSetKernelArg(
collateKernel, 3, sizeof(cl_mem), &averageIntensityClBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 3: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 4, sizeof(uint32_t), &slotStride);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 4: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 5, sizeof(uint32_t), &nPointsPerSlot);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 5: " << err
<< std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 6, sizeof(uint32_t), &nDgramsPerFrame);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 6: " << err
<< std::endl;
return false;
}
return true;
}
bool OpenClCollatingAndMeshingEngine::stop()
{
// Acquire and release lock tightly around setting the flag
SpinLock::Guard lock(shouldAcceptRequestsLock);
bool wasAcceptingRequests = shouldAcceptRequests;
shouldAcceptRequests = false;
return wasAcceptingRequests;
}
void OpenClCollatingAndMeshingEngine::compactKernelComplete(bool isFinalizing)
{
cl_map_flags mapFlags;
/** EXPLANATION:
* Technically we should only need to do this if we plan to read the
* compacted slots for debugging purposes. Otherwise this is unnecessary.
*/
if (isFinalizing) { mapFlags = CL_MAP_WRITE_INVALIDATE_REGION; }
else { mapFlags = CL_MAP_READ; }
if (mapAssemblyBuffer(mapFlags)) {
unmapAssemblyBuffer();
}
clFlush(computeDevice->commandQueue);
// Stop only compact kernel
if (compactIsRunning && currentCompactKernelEvent)
{
clWaitForEvents(1, &currentCompactKernelEvent);
clReleaseEvent(currentCompactKernelEvent);
currentCompactKernelEvent = nullptr;
}
clFinish(computeDevice->commandQueue);
compactKernelCb = [](cl_int){};
compactIsRunning = false;
}
void OpenClCollatingAndMeshingEngine::collateKernelComplete(
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame,
bool isFinalizing)
{
(void)ambienceStimFrame;
cl_map_flags mapFlags;
/** EXPLANATION:
* Technically we should only need to do this if we plan to read the
* collated dgrams for debugging purposes. Otherwise this is unnecessary.
*/
if (isFinalizing) { mapFlags = CL_MAP_WRITE_INVALIDATE_REGION; }
else { mapFlags = CL_MAP_READ; }
if (mapCollationBuffer(mapFlags)) {
unmapCollationBuffer();
}
if (mapAverageIntensityBuffer(mapFlags)) {
unmapAverageIntensityBuffer();
}
// Map/unmap intensity buffer if it exists
if (intensityStimFrame.has_value())
{
StimulusFrame& intensityFrame = intensityStimFrame->get();
cl_mem intensityClBuffer = intensityFrame.clBuffer
->getAssociatedBufferHandleForDevice(computeDevice);
if (intensityClBuffer)
{
void* mappedIntensityBuffer = nullptr;
if (mapBuffer(
intensityClBuffer, intensityFrame.slotDesc.nBytes,
CL_MAP_READ, mappedIntensityBuffer))
{
unmapBuffer(intensityClBuffer, mappedIntensityBuffer);
}
}
}
clFlush(computeDevice->commandQueue);
// Stop only collate kernel
if (collateIsRunning && currentCollateKernelEvent)
{
clWaitForEvents(1, &currentCollateKernelEvent);
clReleaseEvent(currentCollateKernelEvent);
currentCollateKernelEvent = nullptr;
}
clFinish(computeDevice->commandQueue);
collateKernelCb = [](cl_int){};
collateIsRunning = false;
}
bool OpenClCollatingAndMeshingEngine::mapBuffer(
cl_mem buffer, size_t size, cl_map_flags mapFlags, void*& mappedPtr)
{
if (!computeDevice->commandQueue || !buffer)
{
std::cerr << __func__ << ": engine not set up or invalid buffer"
<< std::endl;
return false;
}
// If already mapped, return early with success.
if (mappedPtr != nullptr) { return true; }
cl_int err;
mappedPtr = clEnqueueMapBuffer(
computeDevice->commandQueue, buffer, CL_TRUE, mapFlags,
0, size, 0, nullptr, nullptr, &err);
if (err != CL_SUCCESS || !mappedPtr)
{
std::cerr << __func__ << ": failed to map buffer: " << err
<< std::endl;
goto cleanup;
}
return true;
cleanup:
mappedPtr = nullptr;
return false;
}
bool OpenClCollatingAndMeshingEngine::unmapBuffer(
cl_mem buffer, void*& mappedPtr
)
{
if (mappedPtr == nullptr)
{
// Already unmapped
return true;
}
if (!computeDevice->commandQueue || !buffer)
{
std::cerr << __func__ << ": engine not set up or invalid buffer.\n";
return false;
}
cl_int err;
cl_event unmapEvent = nullptr;
err = clEnqueueUnmapMemObject(
computeDevice->commandQueue, buffer, mappedPtr,
0, nullptr, &unmapEvent);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to unmap buffer: " << err
<< std::endl;
goto cleanup;
}
// Wait for unmap to complete and release the event
err = clWaitForEvents(1, &unmapEvent);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to wait for unmap event: " << err
<< std::endl;
goto cleanup_unmapEvent;
}
clReleaseEvent(unmapEvent);
mappedPtr = nullptr;
return true;
cleanup_unmapEvent:
clReleaseEvent(unmapEvent);
cleanup:
return false;
}
bool OpenClCollatingAndMeshingEngine::mapAssemblyBuffer(cl_map_flags mapFlags)
{
return mapBuffer(
clAssemblyBuffer, assemblyBufferSize, mapFlags, mappedAssemblyBuffer);
}
bool OpenClCollatingAndMeshingEngine::unmapAssemblyBuffer()
{
unmapBuffer(clAssemblyBuffer, mappedAssemblyBuffer);
mappedAssemblyBuffer = nullptr;
return true;
}
bool OpenClCollatingAndMeshingEngine::mapCollationBuffer(cl_map_flags mapFlags)
{
return mapBuffer(
clCollationBuffer, collationBufferSize, mapFlags,
mappedCollationBuffer);
}
bool OpenClCollatingAndMeshingEngine::unmapCollationBuffer()
{
unmapBuffer(clCollationBuffer, mappedCollationBuffer);
mappedCollationBuffer = nullptr;
return true;
}
bool OpenClCollatingAndMeshingEngine::mapAverageIntensityBuffer(
cl_map_flags mapFlags
)
{
return mapBuffer(
clAverageIntensityBuffer, averageIntensityBufferSize, mapFlags,
mappedAverageIntensityBuffer);
}
bool OpenClCollatingAndMeshingEngine::unmapAverageIntensityBuffer()
{
unmapBuffer(clAverageIntensityBuffer, mappedAverageIntensityBuffer);
mappedAverageIntensityBuffer = nullptr;
return true;
}
void OpenClCollatingAndMeshingEngine::produceAmbienceStimulusFrame(
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame,
uint32_t nSucceeded)
{
if (!ambienceStimFrame.has_value()) { return; }
std::shared_ptr<PcloudAmbienceStimulusBuffer> ambienceBuff =
parent.ambienceStimulusBuffer.load(std::memory_order_acquire);
if (!ambienceBuff) { return; }
uint32_t lowVal = ambienceBuff->ambienceIntensityLowVal;
// Read average intensity values from averageIntensityBuffer
float* averageIntensityAverages = reinterpret_cast<float*>(
averageIntensityBufferPtr);
// Count frames whose average intensity is <= lowVal (postrin only)
uint32_t ambienceCount = 0;
for (uint32_t i = 0; i < nSucceeded; ++i)
{
float avg = averageIntensityAverages[i];
if (avg <= static_cast<float>(lowVal))
{
++ambienceCount;
}
}
// Write the ambience count to the ambienceStimFrame
StimulusFrame& ambienceFrame = ambienceStimFrame->get();
uint32_t* ambienceValue = reinterpret_cast<uint32_t*>(
ambienceFrame.slotDesc.vaddr);
ambienceValue[0] = ambienceCount;
}
class OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq
: public PostedAsynchronousContinuation<compactCollateAndMeshFrameReqCbFn>
{
private:
OpenClCollatingAndMeshingEngine& engine;
AsynchronousLoop frameAssemblyResult;
StimulusFrame& stimulusFrame;
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame;
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame;
public:
CompactCollateAndMeshFrameReq(
OpenClCollatingAndMeshingEngine& engine_,
AsynchronousLoop& asyncLoop,
StimulusFrame& stimulusFrame_,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame_,
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame_,
const std::shared_ptr<ComponentThread>& caller,
Callback<compactCollateAndMeshFrameReqCbFn> cb)
: PostedAsynchronousContinuation<compactCollateAndMeshFrameReqCbFn>(
caller, cb),
engine(engine_),
frameAssemblyResult(asyncLoop), stimulusFrame(stimulusFrame_),
intensityStimFrame(intensityStimFrame_),
ambienceStimFrame(ambienceStimFrame_)
{}
public:
void callOriginalCallback(bool success)
{ callOriginalCb(success, std::ref(stimulusFrame)); }
public:
void compactCollateAndMeshFrameReq1_doCompact_posted(
std::shared_ptr<CompactCollateAndMeshFrameReq> context)
{
SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
if (!engine.shouldAcceptRequests)
{
callOriginalCallback(false);
return;
}
// Record compact kernel start time
engine.compactKernelStartTime = std::chrono::high_resolution_clock::now();
bool success = engine.startCompactKernel(
engine.parent.assemblyBuffer,
static_cast<uint32_t>(context->frameAssemblyResult.nSucceeded.load()),
std::bind(
&CompactCollateAndMeshFrameReq
::compactCollateAndMeshFrameReq2_compactDone_posted,
context.get(), context,
std::placeholders::_1));
if (!success)
{
engine.compactKernelComplete();
callOriginalCallback(false);
return;
}
}
void compactCollateAndMeshFrameReq2_compactDone_posted(
std::shared_ptr<CompactCollateAndMeshFrameReq> context,
cl_int compactStatus)
{
SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
if (!engine.shouldAcceptRequests)
{
/** EXPLANATION:
* We intentionally don't call compactKernelComplete() here because
* if shouldAcceptRequests is false, then the caller that called
* finalize() will also be forced to call compactKernelComplete()
* inside of finalize().
*/
callOriginalCallback(false);
return;
}
engine.compactKernelComplete();
// Record compact kernel end time
engine.compactKernelEndTime = std::chrono::high_resolution_clock::now();
// If compact failed, call callback directly with failure
if (compactStatus != CL_SUCCESS)
{
callOriginalCallback(false);
return;
}
#if 0
// Print first 4 bytes of each slot
if (engine.frameAssemblyDesc)
{
for (size_t i = 0; i < engine.frameAssemblyDesc->numSlots; ++i) {
engine.parent.ioUringAssemblyEngine.printSlotBytes(i, 4);
}
}
#endif
lock.unlockPrematurely();
context->compactCollateAndMeshFrameReq3_doCollate_posted(context);
}
void compactCollateAndMeshFrameReq3_doCollate_posted(
std::shared_ptr<CompactCollateAndMeshFrameReq> context)
{
SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
if (!engine.shouldAcceptRequests)
{
callOriginalCallback(false);
return;
}
// Record collate kernel start time
engine.collateKernelStartTime = std::chrono::high_resolution_clock::now();
bool success = engine.startCollateKernel(
context->intensityStimFrame, context->ambienceStimFrame,
std::bind(
&CompactCollateAndMeshFrameReq
::compactCollateAndMeshFrameReq4_collateDone_maybePosted,
context.get(), context,
std::placeholders::_1));
if (!success)
{
engine.collateKernelComplete(
context->intensityStimFrame, context->ambienceStimFrame);
callOriginalCallback(false);
return;
}
}
void compactCollateAndMeshFrameReq4_collateDone_maybePosted(
[[maybe_unused]] std::shared_ptr<CompactCollateAndMeshFrameReq> context,
cl_int collateStatus)
{
SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
if (!engine.shouldAcceptRequests)
{
/* We intentionally don't call collateKernelComplete() here for the
* same reason as above.
*/
callOriginalCallback(false);
return;
}
/** EXPLANATION:
* The reason we don't call collateKernelComplete before checking
* shouldAcceptRequests is because if shouldAcceptRequests is false, then
* we shouldn't touch any of the collate cycle state...or any state within
* the engine at all, since finalize() may well have been called.
*
* Therefore it's finalize()'s responsibility to ensure that it properly
* completes/cleans up any in-flight operations.
*/
engine.collateKernelComplete(
context->intensityStimFrame, context->ambienceStimFrame);
// Record collate kernel end time
engine.collateKernelEndTime = std::chrono::high_resolution_clock::now();
bool success = (collateStatus == CL_SUCCESS);
// Early callback + return pattern
if (!success)
{
callOriginalCallback(false);
return;
}
uint32_t nSucceeded = context->frameAssemblyResult.nSucceeded.load();
// Produce ambience frame if ambience buffer is attached
if (context->ambienceStimFrame.has_value())
{
engine.produceAmbienceStimulusFrame(
context->ambienceStimFrame, nSucceeded);
}
int returnMode = static_cast<int>(engine.parent.device->currentReturnMode);
size_t pointsPerDgram = livoxProto1::Device::getNPointsPerDgram(
returnMode);
size_t totalPoints = nSucceeded * pointsPerDgram;
// Count points with intensity greater than 116
size_t highIntensityCount = 0;
if (context->intensityStimFrame.has_value())
{
StimulusFrame& intensityFrame = context->intensityStimFrame->get();
float* intensityFloats = reinterpret_cast<float*>(intensityFrame.slotDesc.vaddr);
for (size_t i = 0; i < totalPoints; ++i)
{
float intensity = intensityFloats[i];
if (intensity >= 116.0f)
{
++highIntensityCount;
}
}
}
// Print all averages above thresholds from average intensity buffer
if (context->ambienceStimFrame.has_value())
{
std::shared_ptr<PcloudAmbienceStimulusBuffer> ambienceBuff =
engine.parent.ambienceStimulusBuffer.load(std::memory_order_acquire);
uint32_t lowVal = ambienceBuff->ambienceIntensityLowVal;
uint32_t postrinThreshold = ambienceBuff->postrinInterestThreshold;
float* averageIntensityAverages = reinterpret_cast<float*>(
engine.averageIntensityBufferPtr);
// Count frames that met the postrin threshold
uint32_t framesMetThreshold = 0;
for (uint32_t i = 0; i < nSucceeded; ++i)
{
float avg = averageIntensityAverages[i];
if (avg <= static_cast<float>(lowVal))
{
++framesMetThreshold;
}
}
// Read the stimFrame value (ambience count)
StimulusFrame& ambienceFrame = context->ambienceStimFrame->get();
uint32_t* ambienceValue = reinterpret_cast<uint32_t*>(
ambienceFrame.slotDesc.vaddr);
uint32_t stimFrameValue = ambienceValue[0];
bool meetsPostrinThreshold = (framesMetThreshold >= postrinThreshold);
std::cout << __func__ << ": frames met threshold=" << framesMetThreshold
<< ", stimFrame value=" << stimFrameValue
<< ", postrin threshold=" << postrinThreshold
<< ", meets postrin=" << (meetsPostrinThreshold ? "yes" : "no")
<< std::endl;
}
std::cout << __func__ << ": intensityRingBufferIndex="
<< (context->intensityStimFrame.has_value() ?
context->intensityStimFrame->get().ringBufferIndex : SIZE_MAX)
<< ", ambienceRingBufferIndex="
<< (context->ambienceStimFrame.has_value() ?
context->ambienceStimFrame->get().ringBufferIndex : SIZE_MAX)
<< ", pointsPerDgram=" << pointsPerDgram
<< ", nSucceeded=" << nSucceeded
<< ", totalPoints=" << totalPoints
<< ", highIntensityCount=" << highIntensityCount << std::endl;
callOriginalCallback(success);
}
};
void OpenClCollatingAndMeshingEngine::compactCollateAndMeshFrameReq(
AsynchronousLoop& asyncLoop, StimulusFrame& stimulusFrame,
std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame,
Callback<compactCollateAndMeshFrameReqCbFn> callback)
{
{
SpinLock::Guard lock(shouldAcceptRequestsLock);
if (!shouldAcceptRequests)
{
callback.callbackFn(false, stimulusFrame);
return;
}
}
auto caller = smoHooksPtr->ComponentThread_getSelf();
auto request = std::make_shared<CompactCollateAndMeshFrameReq>(
*this, asyncLoop, stimulusFrame, intensityStimFrame, ambienceStimFrame,
caller,
std::move(callback));
// Check if compaction is needed
bool needsCompaction = IoUringAssemblyEngine::compactionIsNeeded(
asyncLoop.nSucceeded.load(), asyncLoop.nTotal);
// Start with compaction if needed, then chain to collation
if (needsCompaction)
{
parent.device->componentThread->getIoService().post(
STC(std::bind(
&CompactCollateAndMeshFrameReq
::compactCollateAndMeshFrameReq1_doCompact_posted,
request.get(), request)));
}
else
{
// Skip compaction, go straight to collation
parent.device->componentThread->getIoService().post(
STC(std::bind(
&CompactCollateAndMeshFrameReq
::compactCollateAndMeshFrameReq3_doCollate_posted,
request.get(), request)));
}
}
std::chrono::milliseconds OpenClCollatingAndMeshingEngine::getCompactKernelDuration() const
{
auto duration = compactKernelEndTime - compactKernelStartTime;
if (duration.count() < 0)
{
return std::chrono::milliseconds(0);
}
return std::chrono::duration_cast<std::chrono::milliseconds>(duration);
}
std::chrono::milliseconds OpenClCollatingAndMeshingEngine::getCollateKernelDuration() const
{
auto duration = collateKernelEndTime - collateKernelStartTime;
if (duration.count() < 0)
{
return std::chrono::milliseconds(0);
}
return std::chrono::duration_cast<std::chrono::milliseconds>(duration);
}
} // namespace stim_buff
} // namespace smo