c08e075763
For some reason, waiting on the event object returned by clEnqueueUnmapMemObject segfaults, but only when called from within finalize(). Under normal operating conditions when we map and then unmap our HOST_PTR buffers, everything works just fine. I can't discern any relevant difference. Adding a bridged 300ms delay in setup() doesn't help either so it doesn't seem to be solved by allowing the rusticl worker threads to finish initializing. GDB output from the segfault appended. Sadly, no debug symbols for the Ubuntu rusticl package. ``` [New Thread 0xffffdd2ce140 (LWP 1056313)] validateOpenClVersion: OpenCL platform version: OpenCL 3.0 validateOpenClVersion: OpenCL device version: OpenCL 3.0 [New Thread 0xffffdcabe140 (LWP 1056314)] [New Thread 0xffffc9a8f140 (LWP 1056315)] start: Starting stimulus buffer for device 3JEDK380010Z39 attachDeviceReq3: Got return mode (0) for device: 3JEDK380010Z39 discardHeartbeatAck: Lidar not ready for operation: work_state: 0x0 (Initializing), ack_msg: 0x1b discardHeartbeatAck: Lidar not ready for operation: work_state: 0x0 (Initializing), ack_msg: 0x45 discardHeartbeatAck: Lidar not ready for operation: work_state: 0x0 (Initializing), ack_msg: 0x45 discardHeartbeatAck: Lidar not ready for operation: work_state: 0x0 (Initializing), ack_msg: 0x45 attachDeviceReq5: Failed to enable pcloud data for dev 3JEDK380010Z39 newDeviceAttachmentSpecInd2: Attach failed for device spec Device Identifier: avia0, Sensor Type: e, QualeIface API: structural-qualeiface, QualeIface API Params: (), StimBuff API: livoxGen1, StimBuff API Params: (), Provider: livoxProto1, Provider Params: (smo-ip=10.42.0.2 ), Device Selector: 3JEDK380010Z39 attachAllUnattachedDevicesFromReq2: Failed to attach device: avia0 Mrntt: attached 0 of 2 sense devices. Mrntt: Body component initialized. initializeReq2: Failed to initialize globalMind marionetteInitializeReqCb: Failed to initialize Marionette. Shutting down. Mrntt: About to detach all sense devices. 
Mrntt: Successfully detached 0 of 0 sense devices. Mrntt: About to finalize all stim buff api libs. compactKernelCompletecalling w/mapFlags=4. INV=4; READ=1. mapBuffer 1 mapBuffer 2 mapBuffer 3: cmdQ: 0xffffec013d68, buffer: 0xffffec07b6b8, mapflags: 4 mapBuffer 4 mapBuffer 5 unmapBuffer 1 unmapBuffer 2 unmapBuffer 3 unmapBuffer 4 unmapBuffer 4.1 unmapBuffer 5 Thread 9 "rusticl queue t" received signal SIGSEGV, Segmentation fault. [Switching to Thread 0xffffdcabe140 (LWP 1056314)] Download failed: Invalid argument. Continuing without source file ./string/../sysdeps/aarch64/multiarch/../memcpy.S. __memcpy_generic () at ../sysdeps/aarch64/multiarch/../memcpy.S:155 warning: 155 ../sysdeps/aarch64/multiarch/../memcpy.S: No such file or directory (gdb) (gdb) bt (gdb) ```
1100 lines
29 KiB
C++
1100 lines
29 KiB
C++
#include <boostAsioLinkageFix.h>
|
|
#include <stdexcept>
|
|
#include <iostream>
|
|
#include <cstring>
|
|
#include <vector>
|
|
#include <string>
|
|
#include <string_view>
|
|
#include <boost/system/error_code.hpp>
|
|
#include <asynchronousContinuation.h>
|
|
#include <callback.h>
|
|
#include <asynchronousLoop.h>
|
|
#include <componentThread.h>
|
|
#include <user/stimulusFrame.h>
|
|
#include "livoxGen1.h"
|
|
#include "openClCollatingAndMeshingEngine.h"
|
|
#include "pcloudStimulusBuffer.h"
|
|
#include "openClKernels.h"
|
|
#include "frameAssemblyDesc.h"
|
|
#include "ioUringAssemblyEngine.h"
|
|
|
|
namespace smo {
|
|
namespace stim_buff {
|
|
|
|
/**
 * @brief Helper function to parse an OpenCL version string.
 *
 * Expected format: "OpenCL <major>.<minor> <vendor info>". Everything up to
 * and including the first space is skipped. The major number is read from the
 * text before the first '.', the minor number from the text after it;
 * std::stoi stops at the first non-digit, so trailing vendor info is ignored.
 *
 * @param versionStr The OpenCL version string to parse.
 * @return A pair of (major, minor) version numbers.
 *         If parsing fails, or either component is negative, returns (-1, -1).
 */
static std::pair<int, int> parseOpenClVersion(const std::string& versionStr)
{
    const std::pair<int, int> parseFailure{-1, -1};

    const size_t spacePos = versionStr.find(' ');
    if (spacePos == std::string::npos) { return parseFailure; }

    const std::string versionNum = versionStr.substr(spacePos + 1);
    const size_t dotPos = versionNum.find('.');
    if (dotPos == std::string::npos) { return parseFailure; }

    try {
        const int majorNum = std::stoi(versionNum.substr(0, dotPos));
        const int minorNum = std::stoi(versionNum.substr(dotPos + 1));

        // std::stoi accepts a leading sign. A negative component is never a
        // valid version, so fold it into the single failure sentinel rather
        // than handing callers a half-valid pair such as (-1, 5).
        if (majorNum < 0 || minorNum < 0) { return parseFailure; }

        return {majorNum, minorNum};
    } catch (const std::exception&) {
        // std::invalid_argument (no digits) or std::out_of_range.
        return parseFailure;
    }
}
|
|
|
|
/*
|
|
* @brief Validates OpenCL version string and checks if it meets minimum requirement.
|
|
* @param versionStr The OpenCL version string to validate.
|
|
* @param versionType Description of version type (e.g., "platform", "device") for error messages.
|
|
* @param minMajor Minimum major version required.
|
|
* @param minMinor Minimum minor version required (for the given major version).
|
|
* @return true if version is valid and meets minimum requirement, false otherwise.
|
|
*/
|
|
static bool validateOpenClVersion(
|
|
std::string_view versionStr, std::string_view versionType,
|
|
int minMajor, int minMinor)
|
|
{
|
|
auto [major, minor] = parseOpenClVersion(std::string(versionStr));
|
|
|
|
// Early return if version couldn't be parsed
|
|
if (major == -1 && minor == -1)
|
|
{
|
|
std::cerr << __func__ << ": failed to parse OpenCL " << versionType
|
|
<< " version: " << versionStr << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// Require minimum version
|
|
if (major < minMajor || (major == minMajor && minor < minMinor))
|
|
{
|
|
std::cerr << __func__ << ": OpenCL " << versionType << " version "
|
|
<< major << "." << minor << " found, but " << minMajor << "."
|
|
<< minMinor << " or higher is required" << std::endl;
|
|
return false;
|
|
}
|
|
|
|
std::cout << __func__ << ": OpenCL " << versionType << " version: "
|
|
<< versionStr << std::endl;
|
|
return true;
|
|
}
|
|
|
|
/**
 * Constructs an engine bound to its owning PcloudStimulusBuffer.
 *
 * No OpenCL work happens here: every handle, pointer and size starts out
 * null/zero and the engine does not accept requests until setup() succeeds.
 *
 * @param parent_ The stimulus buffer that owns this engine.
 */
OpenClCollatingAndMeshingEngine::OpenClCollatingAndMeshingEngine(
    PcloudStimulusBuffer& parent_)
:   parent(parent_),
    // OpenCL handles
    platform(nullptr),
    device(nullptr),
    context(nullptr),
    commandQueue(nullptr),
    slotCompactorProgram(nullptr),
    collateProgram(nullptr),
    slotCompactorKernel(nullptr),
    collateKernel(nullptr),
    clAssemblyBuffer(nullptr),
    clCollationBuffer(nullptr),
    // Request/kernel state
    shouldAcceptRequests(false),
    compactIsRunning(false),
    collateIsRunning(false),
    currentCompactKernelEvent(nullptr),
    currentCollateKernelEvent(nullptr),
    // Host-side views of the parent's staging buffers
    assemblyBufferPtr(nullptr),
    assemblyBufferSize(0),
    collationBufferPtr(nullptr),
    collationBufferSize(0),
    mappedAssemblyBuffer(nullptr),
    mappedCollationBuffer(nullptr),
    frameAssemblyDesc(nullptr)
{
}
|
|
|
|
/**
 * Destructor: delegates all teardown to finalize(), which releases every
 * OpenCL object that is still held and resets the engine state.
 */
OpenClCollatingAndMeshingEngine::~OpenClCollatingAndMeshingEngine()
{
    finalize();
}
|
|
|
|
bool OpenClCollatingAndMeshingEngine::setup()
|
|
{
|
|
// Defensive check to prevent double-calling
|
|
{
|
|
SpinLock::Guard lock(shouldAcceptRequestsLock);
|
|
if (shouldAcceptRequests)
|
|
{
|
|
throw std::runtime_error(std::string(__func__) + ": setup() called "
|
|
"while already set up");
|
|
}
|
|
}
|
|
|
|
cl_int err;
|
|
cl_command_queue_properties queueProps = 0;
|
|
|
|
// Get platform
|
|
cl_uint numPlatforms;
|
|
err = clGetPlatformIDs(1, &platform, &numPlatforms);
|
|
if (err != CL_SUCCESS || numPlatforms == 0)
|
|
{
|
|
std::cerr << __func__ << ": failed to get OpenCL platform: "
|
|
<< err << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// Get device
|
|
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr);
|
|
if (err != CL_SUCCESS)
|
|
{
|
|
std::cerr << __func__ << ": failed to get GPU device: "
|
|
<< err << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// Check OpenCL version - require 1.2 or higher
|
|
char platformVersion[128];
|
|
err = clGetPlatformInfo(platform, CL_PLATFORM_VERSION,
|
|
sizeof(platformVersion), platformVersion, nullptr);
|
|
if (err == CL_SUCCESS)
|
|
{
|
|
if (!validateOpenClVersion(platformVersion, "platform", 1, 2)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Also check device version
|
|
char deviceVersion[128];
|
|
err = clGetDeviceInfo(device, CL_DEVICE_VERSION,
|
|
sizeof(deviceVersion), deviceVersion, nullptr);
|
|
if (err == CL_SUCCESS)
|
|
{
|
|
if (!validateOpenClVersion(deviceVersion, "device", 1, 2)) {
|
|
return false;
|
|
}
|
|
}
|
|
|
|
// Create context
|
|
context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
|
|
if (err != CL_SUCCESS || !context)
|
|
{
|
|
std::cerr << __func__ << ": failed to create OpenCL context: "
|
|
<< err << std::endl;
|
|
goto cleanup;
|
|
}
|
|
|
|
// Create command queue (OpenCL 1.2 API)
|
|
commandQueue = clCreateCommandQueue(
|
|
context, device, queueProps, &err);
|
|
|
|
if (err != CL_SUCCESS || !commandQueue)
|
|
{
|
|
std::cerr << __func__ << ": failed to create command queue: "
|
|
<< err << std::endl;
|
|
goto cleanup;
|
|
}
|
|
|
|
// Declare variables early to avoid goto crossing initialization
|
|
struct iovec assemblyIov;
|
|
struct iovec collationIov;
|
|
|
|
// Get StagingBuffer memory pointers from parent
|
|
assemblyIov = parent.assemblyBuffer.getClEngineIovec();
|
|
collationIov = parent.collationBuffer.getClEngineIovec();
|
|
|
|
assemblyBufferPtr = assemblyIov.iov_base;
|
|
assemblyBufferSize = assemblyIov.iov_len;
|
|
collationBufferPtr = collationIov.iov_base;
|
|
collationBufferSize = collationIov.iov_len;
|
|
|
|
// Get FrameAssemblyDesc from assembly buffer
|
|
frameAssemblyDesc = static_cast<std::shared_ptr<FrameAssemblyDesc>>(
|
|
parent.assemblyBuffer);
|
|
|
|
if (!frameAssemblyDesc || frameAssemblyDesc->slots.empty())
|
|
{
|
|
std::cerr << __func__ << ": invalid frame descriptor" << std::endl;
|
|
goto cleanup;
|
|
}
|
|
|
|
// Create OpenCL buffers using CL_MEM_USE_HOST_PTR for zero-copy
|
|
clAssemblyBuffer = clCreateBuffer(
|
|
context,
|
|
CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
|
|
assemblyBufferSize, assemblyBufferPtr,
|
|
&err);
|
|
|
|
if (err != CL_SUCCESS || !clAssemblyBuffer)
|
|
{
|
|
std::cerr << __func__ << ": failed to create assembly buffer: "
|
|
<< err << std::endl;
|
|
goto cleanup;
|
|
}
|
|
|
|
clCollationBuffer = clCreateBuffer(
|
|
context,
|
|
CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,
|
|
collationBufferSize, collationBufferPtr,
|
|
&err);
|
|
|
|
if (err != CL_SUCCESS || !clCollationBuffer)
|
|
{
|
|
std::cerr << __func__ << ": failed to create collation buffer: "
|
|
<< err << std::endl;
|
|
goto cleanup;
|
|
}
|
|
|
|
// Compile and prepare both kernels
|
|
if (!compileAndPrepareKernels()) {
|
|
goto cleanup;
|
|
}
|
|
|
|
clFlush(commandQueue);
|
|
clFinish(commandQueue);
|
|
shouldAcceptRequests = true;
|
|
return true;
|
|
|
|
cleanup:
|
|
finalize();
|
|
return false;
|
|
}
|
|
|
|
void OpenClCollatingAndMeshingEngine::finalize()
|
|
{
|
|
// Call stop() to set shouldAcceptRequests to false and get previous state
|
|
bool wasAcceptingRequests = stop();
|
|
(void)wasAcceptingRequests;
|
|
|
|
// Complete any running kernels
|
|
if (compactIsRunning) { compactKernelComplete(true); }
|
|
if (collateIsRunning) { collateKernelComplete(true); }
|
|
|
|
// Release OpenCL buffers in reverse order
|
|
if (clCollationBuffer)
|
|
{
|
|
clReleaseMemObject(clCollationBuffer);
|
|
clCollationBuffer = nullptr;
|
|
}
|
|
if (clAssemblyBuffer)
|
|
{
|
|
clReleaseMemObject(clAssemblyBuffer);
|
|
clAssemblyBuffer = nullptr;
|
|
}
|
|
|
|
// Release kernels
|
|
if (slotCompactorKernel)
|
|
{
|
|
clReleaseKernel(slotCompactorKernel);
|
|
slotCompactorKernel = nullptr;
|
|
}
|
|
if (collateKernel)
|
|
{
|
|
clReleaseKernel(collateKernel);
|
|
collateKernel = nullptr;
|
|
}
|
|
|
|
// Release programs
|
|
if (slotCompactorProgram)
|
|
{
|
|
clReleaseProgram(slotCompactorProgram);
|
|
slotCompactorProgram = nullptr;
|
|
}
|
|
if (collateProgram)
|
|
{
|
|
clReleaseProgram(collateProgram);
|
|
collateProgram = nullptr;
|
|
}
|
|
|
|
// Release command queue
|
|
if (commandQueue)
|
|
{
|
|
clReleaseCommandQueue(commandQueue);
|
|
commandQueue = nullptr;
|
|
}
|
|
|
|
// Release context
|
|
if (context)
|
|
{
|
|
clReleaseContext(context);
|
|
context = nullptr;
|
|
}
|
|
|
|
// Reset state variables
|
|
device = nullptr;
|
|
platform = nullptr;
|
|
compactIsRunning = false;
|
|
collateIsRunning = false;
|
|
currentCompactKernelEvent = nullptr;
|
|
currentCollateKernelEvent = nullptr;
|
|
assemblyBufferPtr = nullptr;
|
|
assemblyBufferSize = 0;
|
|
collationBufferPtr = nullptr;
|
|
collationBufferSize = 0;
|
|
frameAssemblyDesc = nullptr;
|
|
}
|
|
|
|
// Static callback for compact kernel event
|
|
void CL_CALLBACK OpenClCollatingAndMeshingEngine::compactKernelEventCallback(
|
|
cl_event /*event*/, cl_int event_command_exec_status, void* user_data)
|
|
{
|
|
OpenClCollatingAndMeshingEngine* engine =
|
|
static_cast<OpenClCollatingAndMeshingEngine*>(user_data);
|
|
|
|
if (!engine || !engine->compactKernelCb)
|
|
{ return; }
|
|
|
|
// Post to io_service to call callback on the correct thread
|
|
if (engine->parent.device && engine->parent.device->componentThread)
|
|
{
|
|
engine->parent.device->componentThread->getIoService().post(
|
|
std::bind(engine->compactKernelCb, event_command_exec_status));
|
|
}
|
|
}
|
|
|
|
// Static callback for collate kernel event
|
|
void CL_CALLBACK OpenClCollatingAndMeshingEngine::collateKernelEventCallback(
|
|
cl_event /*event*/, cl_int event_command_exec_status, void* user_data)
|
|
{
|
|
OpenClCollatingAndMeshingEngine* engine =
|
|
static_cast<OpenClCollatingAndMeshingEngine*>(user_data);
|
|
|
|
if (!engine || !engine->collateKernelCb)
|
|
{ return; }
|
|
|
|
// Post to io_service to call callback on the correct thread
|
|
if (engine->parent.device && engine->parent.device->componentThread)
|
|
{
|
|
engine->parent.device->componentThread->getIoService().post(
|
|
std::bind(engine->collateKernelCb, event_command_exec_status));
|
|
}
|
|
}
|
|
|
|
/**
 * Stores the caller's completion callback, synchronises the assembly buffer
 * towards the device with a map/unmap pair, and launches the slotCompactor
 * kernel through startKernel().
 *
 * @param assemblyBuff Staging buffer expected to be the one wrapped at setup().
 * @param nSucceeded   Forwarded to the kernel as its last argument.
 * @param callback     Stored as compactKernelCb; compactKernelEventCallback()
 *                     posts it with the kernel event's execution status.
 * @return false if the assembly buffer could not be mapped or startKernel()
 *         reported failure; true otherwise.
 */
bool OpenClCollatingAndMeshingEngine::startCompactKernel(
    StagingBuffer& assemblyBuff, uint32_t nSucceeded,
    compactKernelCbFn callback)
{
    // Inside a lambda __func__ names the closure's operator(), so capture the
    // real function name here for use in the exception message.
    const char* const callerName = __func__;

    // Store the caller's callback
    compactKernelCb = std::move(callback);

    // Validate buffers callable
    auto validateBuffers = [this, &assemblyBuff, callerName]() {
        struct iovec assemblyIov = assemblyBuff.getClEngineIovec();
        if (assemblyIov.iov_base != assemblyBufferPtr
            || assemblyIov.iov_len != assemblyBufferSize)
        {
            throw std::runtime_error(
                std::string(callerName) + ": buffer mismatch - buffers have "
                "changed");
        }
    };

    // Setup args callable
    auto setupArgs = [this, &assemblyBuff, nSucceeded]() {
        return setupSlotCompactorsArgs(assemblyBuff, nSucceeded);
    };

    /** EXPLANATION:
     * Map assembly buffer as WRITE_INVALIDATE_REGION to inform OpenCL that host
     * (io_uring) has written data, and that the host doesn't care what the
     * prior contents of the device's cache are. The device must invalidate
     * its own view of the HOST_PTR and accept our view.
     *
     * Then immediately unmap to let OpenCL make the changes visible to the GPU
     */
    if (!mapAssemblyBuffer(CL_MAP_WRITE_INVALIDATE_REGION))
    {
        std::cerr << __func__ << ": failed to map assembly buffer" << std::endl;
        return false;
    }

    // Unmap immediately to sync host writes to GPU
    unmapAssemblyBuffer();

    return startKernel(
        slotCompactorKernel,
        &currentCompactKernelEvent,
        setupArgs,
        validateBuffers,
        1, // globalWorkSize
        compactKernelEventCallback,
        "slotCompactor",
        compactIsRunning);
}
|
|
|
|
/**
 * Stores the caller's completion callback, runs a map/unmap pair on both the
 * assembly and the collation buffer, and launches the collate kernel through
 * startKernel() with one work item per slot in the frame.
 *
 * @param assemblyBuff  Staging buffer expected to be the assembly buffer
 *                      wrapped at setup().
 * @param collationBuff Staging buffer expected to be the collation buffer
 *                      wrapped at setup().
 * @param callback      Stored as collateKernelCb; collateKernelEventCallback()
 *                      posts it with the kernel event's execution status.
 * @return The result of startKernel().
 */
bool OpenClCollatingAndMeshingEngine::startCollateKernel(
    StagingBuffer& assemblyBuff, StagingBuffer& collationBuff,
    collateKernelCbFn callback)
{
    // Inside a lambda __func__ names the closure's operator(), so capture the
    // real function name here for use in the exception message.
    const char* const callerName = __func__;

    // Store the caller's callback
    collateKernelCb = std::move(callback);

    /** EXPLANATION:
     * It shouldn't strictly be necessary to map the assembly/collation buffers
     * here since we don't need to read/write them on the host CPUs (unless
     * we're intervening to debug; in which case we should map them as
     * CL_MAP_READ): the GPU's view of the assembly buffer should already be
     * up to date, and the collation buffer is only used for output.
     *
     * The map/unmap pairs below are performed anyway; their results are
     * deliberately not checked.
     */
    mapAssemblyBuffer(CL_MAP_WRITE_INVALIDATE_REGION);
    unmapAssemblyBuffer();
    mapCollationBuffer(CL_MAP_WRITE);
    unmapCollationBuffer();

    // Validate buffers callable
    auto validateBuffers = [this, &assemblyBuff, &collationBuff, callerName]() {
        struct iovec assemblyIov = assemblyBuff.getClEngineIovec();
        struct iovec collationIov = collationBuff.getClEngineIovec();
        if (assemblyIov.iov_base != assemblyBufferPtr
            || assemblyIov.iov_len != assemblyBufferSize
            || collationIov.iov_base != collationBufferPtr
            || collationIov.iov_len != collationBufferSize)
        {
            throw std::runtime_error(
                std::string(callerName)
                + ": buffer mismatch - buffers have changed");
        }
    };

    // Setup args callable
    auto setupArgs = [this, &assemblyBuff]() {
        return setupCollateDgramsArgs(assemblyBuff);
    };

    // Calculate global work size (just num slots in the frame)
    size_t globalWorkSize = static_cast<uint32_t>(frameAssemblyDesc->numSlots);

    return startKernel(
        collateKernel,
        &currentCollateKernelEvent,
        setupArgs,
        validateBuffers,
        globalWorkSize,
        collateKernelEventCallback,
        "collateDgrams",
        collateIsRunning);
}
|
|
|
|
bool OpenClCollatingAndMeshingEngine::compileAndPrepareKernel(
|
|
const char* kernelSource, size_t kernelSourceLen,
|
|
const char* kernelName, cl_program& program, cl_kernel& kernel)
|
|
{
|
|
cl_int err;
|
|
|
|
// Create program from source
|
|
program = clCreateProgramWithSource(
|
|
context, 1, &kernelSource, &kernelSourceLen, &err);
|
|
|
|
if (err != CL_SUCCESS || !program)
|
|
{
|
|
std::cerr << __func__ << ": failed to create " << kernelName
|
|
<< " program: " << err << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// Build program
|
|
err = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);
|
|
if (err != CL_SUCCESS)
|
|
{
|
|
std::cerr << __func__ << ": failed to build " << kernelName
|
|
<< " program: " << err << std::endl;
|
|
|
|
// Print build log if available
|
|
size_t logSize = 0;
|
|
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
|
|
0, nullptr, &logSize);
|
|
|
|
if (logSize > 0)
|
|
{
|
|
std::vector<char> log(logSize);
|
|
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
|
|
logSize, log.data(), nullptr);
|
|
std::cerr << kernelName << " build log: " << log.data()
|
|
<< std::endl;
|
|
}
|
|
|
|
return false;
|
|
}
|
|
|
|
// Create kernel
|
|
kernel = clCreateKernel(program, kernelName, &err);
|
|
if (err != CL_SUCCESS || !kernel)
|
|
{
|
|
std::cerr << __func__ << ": failed to create " << kernelName
|
|
<< " kernel: " << err << std::endl;
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
bool OpenClCollatingAndMeshingEngine::compileAndPrepareKernels()
|
|
{
|
|
// Compile slotCompactor kernel
|
|
if (!compileAndPrepareKernel(
|
|
slotCompactorKernelStart, slotCompactorKernelNBytes,
|
|
"slotCompactor", slotCompactorProgram, slotCompactorKernel))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
// Compile collateDgrams kernel
|
|
if (!compileAndPrepareKernel(
|
|
collateKernelStart, collateKernelNBytes,
|
|
"collate", collateProgram, collateKernel))
|
|
{
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
/**
 * Sets the six arguments of the slotCompactor kernel, in order:
 *   0: assembly buffer (cl_mem)
 *   1: number of slots in the frame
 *   2: slot stride in bytes
 *   3: slot size in bytes
 *   4: offset of the first slot in bytes
 *   5: nSucceeded
 *
 * Stops at the first argument that fails to set and logs it to std::cerr.
 *
 * @param assemblyBuff Staging buffer supplying slot stride and first-slot offset.
 * @param nSucceeded   Value passed as kernel argument 5.
 * @return true if every argument was set, false on the first failure.
 */
bool OpenClCollatingAndMeshingEngine::setupSlotCompactorsArgs(
    StagingBuffer& assemblyBuff, uint32_t nSucceeded)
{
    // __func__ inside the lambda would name operator(), so capture it here
    const char* const callerName = __func__;

    // Sets one kernel argument; logs and reports failure
    auto bindArg = [this, callerName](
        cl_uint argIndex, size_t argSize, const void* argValue) -> bool
    {
        const cl_int status = clSetKernelArg(
            slotCompactorKernel, argIndex, argSize, argValue);
        if (status == CL_SUCCESS) { return true; }

        std::cerr << callerName << ": failed to set kernel arg " << argIndex
            << ": " << status << std::endl;
        return false;
    };

    // Scalar parameters, narrowed to the 32-bit width the kernel is given
    const uint32_t slotCount =
        static_cast<uint32_t>(frameAssemblyDesc->numSlots);
    const uint32_t strideNBytes =
        static_cast<uint32_t>(assemblyBuff.slotStrideNBytes);
    const uint32_t slotNBytes =
        static_cast<uint32_t>(frameAssemblyDesc->slotSizeBytes);
    const uint32_t firstSlotNBytes =
        static_cast<uint32_t>(assemblyBuff.firstSlotOffsetNBytes);
    const uint32_t succeededCount = static_cast<uint32_t>(nSucceeded);

    return bindArg(0, sizeof(cl_mem), &clAssemblyBuffer)
        && bindArg(1, sizeof(uint32_t), &slotCount)
        && bindArg(2, sizeof(uint32_t), &strideNBytes)
        && bindArg(3, sizeof(uint32_t), &slotNBytes)
        && bindArg(4, sizeof(uint32_t), &firstSlotNBytes)
        && bindArg(5, sizeof(uint32_t), &succeededCount);
}
|
|
|
|
/**
 * Sets the six arguments of the collate kernel, in order:
 *   0: assembly buffer (cl_mem)
 *   1: collation buffer (cl_mem)
 *   2: slot stride in bytes
 *   3: offset of the first slot in bytes
 *   4: points per slot, derived from the device's current return mode
 *   5: datagrams per frame (number of slots)
 *
 * Stops at the first argument that fails to set and logs it to std::cerr.
 *
 * @param assemblyBuff Staging buffer supplying slot stride and first-slot offset.
 * @return true if every argument was set; false if the parent has no device
 *         or an argument could not be set.
 */
bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
    StagingBuffer& assemblyBuff)
{
    // The return mode lives on the device, so nothing can be done without one
    if (!parent.device)
    {
        std::cerr << __func__ << ": device not available" << std::endl;
        return false;
    }

    // __func__ inside the lambda would name operator(), so capture it here
    const char* const callerName = __func__;

    // Sets one kernel argument; logs and reports failure
    auto bindArg = [this, callerName](
        cl_uint argIndex, size_t argSize, const void* argValue) -> bool
    {
        const cl_int status = clSetKernelArg(
            collateKernel, argIndex, argSize, argValue);
        if (status == CL_SUCCESS) { return true; }

        std::cerr << callerName << ": failed to set kernel arg " << argIndex
            << ": " << status << std::endl;
        return false;
    };

    // Scalar parameters, narrowed to the 32-bit width the kernel is given
    const uint32_t strideNBytes =
        static_cast<uint32_t>(assemblyBuff.slotStrideNBytes);
    const uint32_t firstSlotNBytes =
        static_cast<uint32_t>(assemblyBuff.firstSlotOffsetNBytes);

    const int deviceReturnMode =
        static_cast<int>(parent.device->currentReturnMode);
    const uint32_t pointsPerSlot = static_cast<uint32_t>(
        IoUringAssemblyEngine::computePointsPerDgram(deviceReturnMode));
    const uint32_t dgramsPerFrame =
        static_cast<uint32_t>(frameAssemblyDesc->numSlots);

    return bindArg(0, sizeof(cl_mem), &clAssemblyBuffer)
        && bindArg(1, sizeof(cl_mem), &clCollationBuffer)
        && bindArg(2, sizeof(uint32_t), &strideNBytes)
        && bindArg(3, sizeof(uint32_t), &firstSlotNBytes)
        && bindArg(4, sizeof(uint32_t), &pointsPerSlot)
        && bindArg(5, sizeof(uint32_t), &dgramsPerFrame);
}
|
|
|
|
bool OpenClCollatingAndMeshingEngine::stop()
|
|
{
|
|
// Acquire and release lock tightly around setting the flag
|
|
SpinLock::Guard lock(shouldAcceptRequestsLock);
|
|
bool wasAcceptingRequests = shouldAcceptRequests;
|
|
shouldAcceptRequests = false;
|
|
return wasAcceptingRequests;
|
|
}
|
|
|
|
void OpenClCollatingAndMeshingEngine::compactKernelComplete()
|
|
{
|
|
/** EXPLANATION:
|
|
* Technically we should only need to do this if we plan to read the
|
|
* compacted slots for debugging purposes. Otherwise this is unnecessary.
|
|
*/
|
|
mapAssemblyBuffer(CL_MAP_READ);
|
|
unmapAssemblyBuffer();
|
|
clFlush(commandQueue);
|
|
clFinish(commandQueue);
|
|
|
|
// Stop only compact kernel
|
|
if (compactIsRunning && currentCompactKernelEvent)
|
|
{
|
|
clWaitForEvents(1, ¤tCompactKernelEvent);
|
|
clReleaseEvent(currentCompactKernelEvent);
|
|
currentCompactKernelEvent = nullptr;
|
|
}
|
|
compactKernelCb = [](cl_int){};
|
|
compactIsRunning = false;
|
|
}
|
|
|
|
void OpenClCollatingAndMeshingEngine::collateKernelComplete()
|
|
{
|
|
/** EXPLANATION:
|
|
* Technically we should only need to do this if we plan to read the
|
|
* collated dgrams for debugging purposes. Otherwise this is unnecessary.
|
|
*/
|
|
mapCollationBuffer(CL_MAP_READ);
|
|
unmapCollationBuffer();
|
|
clFlush(commandQueue);
|
|
clFinish(commandQueue);
|
|
// Stop only collate kernel
|
|
if (collateIsRunning && currentCollateKernelEvent)
|
|
{
|
|
clWaitForEvents(1, ¤tCollateKernelEvent);
|
|
clReleaseEvent(currentCollateKernelEvent);
|
|
currentCollateKernelEvent = nullptr;
|
|
}
|
|
collateKernelCb = [](cl_int){};
|
|
collateIsRunning = false;
|
|
}
|
|
|
|
/**
 * Maps an OpenCL buffer into host address space with a blocking
 * clEnqueueMapBuffer call covering [0, size).
 *
 * @param buffer    The OpenCL buffer to map.
 * @param size      Number of bytes to map, starting at offset 0.
 * @param mapFlags  Flags passed through to clEnqueueMapBuffer.
 * @param mappedPtr In/out: if already non-null the buffer is treated as
 *                  mapped and nothing is done; otherwise receives the mapped
 *                  pointer, or nullptr on failure.
 * @return true if the buffer is mapped on return, false otherwise.
 */
bool OpenClCollatingAndMeshingEngine::mapBuffer(
    cl_mem buffer, size_t size, cl_map_flags mapFlags, void*& mappedPtr)
{
    const bool engineReady = commandQueue && buffer;
    if (!engineReady)
    {
        std::cerr << __func__ << ": engine not set up or invalid buffer"
            << std::endl;

        return false;
    }

    // Already mapped: nothing to do
    if (mappedPtr != nullptr) { return true; }

    cl_int status;
    mappedPtr = clEnqueueMapBuffer(
        commandQueue, buffer, CL_TRUE, mapFlags,
        0, size, 0, nullptr, nullptr, &status);

    if (status == CL_SUCCESS && mappedPtr) { return true; }

    std::cerr << __func__ << ": failed to map buffer: " << status
        << std::endl;

    // Never leave a pointer behind that callers could mistake for a mapping
    mappedPtr = nullptr;
    return false;
}
|
|
|
|
/**
 * Unmaps a previously mapped OpenCL buffer and waits for the unmap command
 * to complete.
 *
 * @param buffer    The OpenCL buffer that mappedPtr was obtained from.
 * @param mappedPtr In/out: the mapped pointer. A null pointer is treated as
 *                  already unmapped. Reset to nullptr only when the unmap
 *                  was enqueued and waited on successfully.
 * @return true if the buffer is unmapped on return, false otherwise.
 */
bool OpenClCollatingAndMeshingEngine::unmapBuffer(
    cl_mem buffer, void*& mappedPtr
    )
{
    // Already unmapped
    if (mappedPtr == nullptr) { return true; }

    const bool engineReady = commandQueue && buffer;
    if (!engineReady)
    {
        std::cerr << __func__ << ": engine not set up or invalid buffer.\n";
        return false;
    }

    cl_event unmapDone = nullptr;
    cl_int status = clEnqueueUnmapMemObject(
        commandQueue, buffer, mappedPtr,
        0, nullptr, &unmapDone);

    if (status != CL_SUCCESS)
    {
        std::cerr << __func__ << ": failed to unmap buffer: " << status
            << std::endl;
        return false;
    }

    // Block until the unmap has actually happened
    status = clWaitForEvents(1, &unmapDone);
    if (status != CL_SUCCESS)
    {
        std::cerr << __func__ << ": failed to wait for unmap event: " << status
            << std::endl;
        clReleaseEvent(unmapDone);
        return false;
    }

    clReleaseEvent(unmapDone);
    mappedPtr = nullptr;
    return true;
}
|
|
|
|
/**
 * Maps the whole assembly buffer, tracking the mapping in
 * mappedAssemblyBuffer.
 *
 * @param mapFlags Flags passed through to mapBuffer().
 * @return The result of mapBuffer().
 */
bool OpenClCollatingAndMeshingEngine::mapAssemblyBuffer(cl_map_flags mapFlags)
{
    const bool mapped = mapBuffer(
        clAssemblyBuffer, assemblyBufferSize, mapFlags, mappedAssemblyBuffer);

    return mapped;
}
|
|
|
|
bool OpenClCollatingAndMeshingEngine::unmapAssemblyBuffer()
|
|
{
|
|
unmapBuffer(clAssemblyBuffer, mappedAssemblyBuffer);
|
|
mappedAssemblyBuffer = nullptr;
|
|
return true;
|
|
}
|
|
|
|
/**
 * Maps the whole collation buffer, tracking the mapping in
 * mappedCollationBuffer.
 *
 * @param mapFlags Flags passed through to mapBuffer().
 * @return The result of mapBuffer().
 */
bool OpenClCollatingAndMeshingEngine::mapCollationBuffer(cl_map_flags mapFlags)
{
    const bool mapped = mapBuffer(
        clCollationBuffer, collationBufferSize, mapFlags,
        mappedCollationBuffer);

    return mapped;
}
|
|
|
|
bool OpenClCollatingAndMeshingEngine::unmapCollationBuffer()
|
|
{
|
|
unmapBuffer(clCollationBuffer, mappedCollationBuffer);
|
|
mappedCollationBuffer = nullptr;
|
|
return true;
|
|
}
|
|
|
|
class OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq
|
|
: public PostedAsynchronousContinuation<compactCollateAndMeshFrameReqCbFn>
|
|
{
|
|
private:
|
|
OpenClCollatingAndMeshingEngine& engine;
|
|
AsynchronousLoop frameAssemblyResult;
|
|
StimulusFrame& stimulusFrame;
|
|
|
|
public:
|
|
CompactCollateAndMeshFrameReq(
|
|
OpenClCollatingAndMeshingEngine& engine_,
|
|
AsynchronousLoop& asyncLoop,
|
|
StimulusFrame& stimulusFrame_,
|
|
const std::shared_ptr<ComponentThread>& caller,
|
|
Callback<compactCollateAndMeshFrameReqCbFn> cb)
|
|
: PostedAsynchronousContinuation<compactCollateAndMeshFrameReqCbFn>(
|
|
caller, cb),
|
|
engine(engine_),
|
|
frameAssemblyResult(asyncLoop), stimulusFrame(stimulusFrame_)
|
|
{}
|
|
|
|
public:
|
|
void callOriginalCallback(bool success)
|
|
{ callOriginalCb(success, std::ref(stimulusFrame)); }
|
|
|
|
public:
|
|
void compactCollateAndMeshFrameReq1_doCompact_posted(
|
|
std::shared_ptr<CompactCollateAndMeshFrameReq> context)
|
|
{
|
|
SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
|
|
if (!engine.shouldAcceptRequests)
|
|
{
|
|
callOriginalCallback(false);
|
|
return;
|
|
}
|
|
|
|
// Record compact kernel start time
|
|
engine.compactKernelStartTime = std::chrono::high_resolution_clock::now();
|
|
|
|
bool success = engine.startCompactKernel(
|
|
engine.parent.assemblyBuffer,
|
|
static_cast<uint32_t>(context->frameAssemblyResult.nSucceeded.load()),
|
|
std::bind(
|
|
&CompactCollateAndMeshFrameReq
|
|
::compactCollateAndMeshFrameReq2_compactDone_posted,
|
|
context.get(), context,
|
|
std::placeholders::_1));
|
|
|
|
if (!success)
|
|
{
|
|
engine.compactKernelComplete();
|
|
callOriginalCallback(false);
|
|
return;
|
|
}
|
|
}
|
|
|
|
void compactCollateAndMeshFrameReq2_compactDone_posted(
|
|
std::shared_ptr<CompactCollateAndMeshFrameReq> context,
|
|
cl_int compactStatus)
|
|
{
|
|
SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
|
|
if (!engine.shouldAcceptRequests)
|
|
{
|
|
/** EXPLANATION:
|
|
* We intentionally don't call compactKernelComplete() here because
|
|
* if shouldAcceptRequests is false, then the caller that called
|
|
* finalize() will also be forced to call compactKernelComplete()
|
|
* inside of finalize().
|
|
*/
|
|
callOriginalCallback(false);
|
|
return;
|
|
}
|
|
|
|
engine.compactKernelComplete();
|
|
// Record compact kernel end time
|
|
engine.compactKernelEndTime = std::chrono::high_resolution_clock::now();
|
|
|
|
// If compact failed, call callback directly with failure
|
|
if (compactStatus != CL_SUCCESS)
|
|
{
|
|
callOriginalCallback(false);
|
|
return;
|
|
}
|
|
|
|
#if 0
|
|
// Print first 4 bytes of each slot
|
|
if (engine.frameAssemblyDesc)
|
|
{
|
|
for (size_t i = 0; i < engine.frameAssemblyDesc->numSlots; ++i) {
|
|
engine.parent.ioUringAssemblyEngine.printSlotBytes(i, 4);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
lock.unlockPrematurely();
|
|
context->compactCollateAndMeshFrameReq3_doCollate_posted(context);
|
|
}
|
|
|
|
void compactCollateAndMeshFrameReq3_doCollate_posted(
|
|
std::shared_ptr<CompactCollateAndMeshFrameReq> context)
|
|
{
|
|
SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
|
|
if (!engine.shouldAcceptRequests)
|
|
{
|
|
callOriginalCallback(false);
|
|
return;
|
|
}
|
|
|
|
// Record collate kernel start time
|
|
engine.collateKernelStartTime = std::chrono::high_resolution_clock::now();
|
|
|
|
bool success = engine.startCollateKernel(
|
|
engine.parent.assemblyBuffer, engine.parent.collationBuffer,
|
|
std::bind(
|
|
&CompactCollateAndMeshFrameReq
|
|
::compactCollateAndMeshFrameReq4_collateDone_maybePosted,
|
|
context.get(), context,
|
|
std::placeholders::_1));
|
|
|
|
if (!success)
|
|
{
|
|
engine.collateKernelComplete();
|
|
callOriginalCallback(false);
|
|
return;
|
|
}
|
|
}
|
|
|
|
void compactCollateAndMeshFrameReq4_collateDone_maybePosted(
|
|
[[maybe_unused]] std::shared_ptr<CompactCollateAndMeshFrameReq> context,
|
|
cl_int collateStatus)
|
|
{
|
|
SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
|
|
if (!engine.shouldAcceptRequests)
|
|
{
|
|
/* We intentionally don't call collateKernelComplete() here for the
|
|
* same reason as above.
|
|
*/
|
|
callOriginalCallback(false);
|
|
return;
|
|
}
|
|
|
|
engine.collateKernelComplete();
|
|
// Record collate kernel end time
|
|
engine.collateKernelEndTime = std::chrono::high_resolution_clock::now();
|
|
|
|
bool success = (collateStatus == CL_SUCCESS);
|
|
|
|
// Early callback + return pattern
|
|
if (!success)
|
|
{
|
|
callOriginalCallback(false);
|
|
return;
|
|
}
|
|
|
|
int returnMode = static_cast<int>(engine.parent.device->currentReturnMode);
|
|
size_t pointsPerDgram = IoUringAssemblyEngine::computePointsPerDgram(returnMode);
|
|
uint32_t nSucceeded = context->frameAssemblyResult.nSucceeded.load();
|
|
size_t totalPoints = nSucceeded * pointsPerDgram;
|
|
|
|
// Count points with intensity greater than 116
|
|
float* collationFloats = static_cast<float*>(engine.collationBufferPtr);
|
|
size_t highIntensityCount = 0;
|
|
for (size_t i = 0; i < totalPoints; ++i)
|
|
{
|
|
float intensity = collationFloats[i * 4 + 3];
|
|
if (intensity > 116.0f)
|
|
{
|
|
++highIntensityCount;
|
|
}
|
|
}
|
|
|
|
std::cout << __func__ << ": pointsPerDgram=" << pointsPerDgram
|
|
<< ", nSucceeded=" << nSucceeded
|
|
<< ", totalPoints=" << totalPoints
|
|
<< ", highIntensityCount=" << highIntensityCount << std::endl;
|
|
|
|
callOriginalCallback(success);
|
|
}
|
|
};
|
|
|
|
/**
 * Entry point for processing one assembled frame.
 *
 * If the engine is not accepting requests, the callback is invoked
 * immediately with failure. Otherwise a CompactCollateAndMeshFrameReq is
 * created and its first step is posted to the io_service of the parent
 * device's component thread: the compaction step when
 * IoUringAssemblyEngine::compactionIsNeeded() says so, the collation step
 * otherwise.
 *
 * @param asyncLoop     Frame assembly result (succeeded/total counts).
 * @param stimulusFrame Frame handed back to the callback.
 * @param callback      Invoked with the outcome and the frame.
 */
void OpenClCollatingAndMeshingEngine::compactCollateAndMeshFrameReq(
    AsynchronousLoop& asyncLoop, StimulusFrame& stimulusFrame,
    Callback<compactCollateAndMeshFrameReqCbFn> callback)
{
    {
        SpinLock::Guard lock(shouldAcceptRequestsLock);
        if (!shouldAcceptRequests)
        {
            callback.callbackFn(false, stimulusFrame);
            return;
        }
    }

    auto caller = smoHooksPtr->ComponentThread_getSelf();
    auto request = std::make_shared<CompactCollateAndMeshFrameReq>(
        *this, asyncLoop, stimulusFrame,
        caller,
        std::move(callback));

    // Compaction is optional; collation always runs
    const bool skipCompaction = !IoUringAssemblyEngine::compactionIsNeeded(
        asyncLoop.nSucceeded.load(), asyncLoop.nTotal);

    auto&& ioService = parent.device->componentThread->getIoService();

    if (skipCompaction)
    {
        // Go straight to collation
        ioService.post(
            STC(std::bind(
                &CompactCollateAndMeshFrameReq
                    ::compactCollateAndMeshFrameReq3_doCollate_posted,
                request.get(), request)));
    }
    else
    {
        // Compact first; the request chains to collation by itself
        ioService.post(
            STC(std::bind(
                &CompactCollateAndMeshFrameReq
                    ::compactCollateAndMeshFrameReq1_doCompact_posted,
                request.get(), request)));
    }
}
|
|
|
|
std::chrono::milliseconds OpenClCollatingAndMeshingEngine::getCompactKernelDuration() const
|
|
{
|
|
auto duration = compactKernelEndTime - compactKernelStartTime;
|
|
if (duration.count() < 0)
|
|
{
|
|
return std::chrono::milliseconds(0);
|
|
}
|
|
return std::chrono::duration_cast<std::chrono::milliseconds>(duration);
|
|
}
|
|
|
|
std::chrono::milliseconds OpenClCollatingAndMeshingEngine::getCollateKernelDuration() const
|
|
{
|
|
auto duration = collateKernelEndTime - collateKernelStartTime;
|
|
if (duration.count() < 0)
|
|
{
|
|
return std::chrono::milliseconds(0);
|
|
}
|
|
return std::chrono::duration_cast<std::chrono::milliseconds>(duration);
|
|
}
|
|
|
|
} // namespace stim_buff
|
|
} // namespace smo
|