#include #include #include #include #include #include #include "openClCollatingAndMeshingEngine.h" #include "pcloudStimulusBuffer.h" #include "openClKernels.h" #include "frameAssemblyDesc.h" #include "ioUringAssemblyEngine.h" namespace smo { namespace stim_buff { OpenClCollatingAndMeshingEngine::OpenClCollatingAndMeshingEngine( PcloudStimulusBuffer& parent_) : parent(parent_), platform(nullptr), device(nullptr), context(nullptr), commandQueue(nullptr), slotCompactorProgram(nullptr), collateProgram(nullptr), slotCompactorKernel(nullptr), collateKernel(nullptr), clAssemblyBuffer(nullptr), clCollationBuffer(nullptr), compactIsSetup(false), compactIsRunning(false), collateIsSetup(false), collateIsRunning(false), currentCompactKernelEvent(nullptr), currentCollateKernelEvent(nullptr), assemblyBufferPtr(nullptr), assemblyBufferSize(0), collationBufferPtr(nullptr), collationBufferSize(0), frameAssemblyDesc(nullptr) { } OpenClCollatingAndMeshingEngine::~OpenClCollatingAndMeshingEngine() { finalize(); } bool OpenClCollatingAndMeshingEngine::setup() { if (compactIsSetup && collateIsSetup) { return true; } cl_int err; cl_queue_properties queueProps[] = {CL_QUEUE_PROPERTIES, 0, 0}; // Get platform cl_uint numPlatforms; err = clGetPlatformIDs(1, &platform, &numPlatforms); if (err != CL_SUCCESS || numPlatforms == 0) { std::cerr << __func__ << ": failed to get OpenCL platform: " << err << std::endl; return false; } // Get device err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to get GPU device: " << err << std::endl; return false; } // Create context context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err); if (err != CL_SUCCESS || !context) { std::cerr << __func__ << ": failed to create OpenCL context: " << err << std::endl; goto cleanup; } // Create command queue commandQueue = clCreateCommandQueueWithProperties( context, device, queueProps, &err); if (err != CL_SUCCESS || !commandQueue) { std::cerr << __func__ << ": failed to create command queue: " << err << std::endl; goto cleanup; } // Declare variables early to avoid goto crossing initialization struct iovec assemblyIov; struct iovec collationIov; // Get StagingBuffer memory pointers from parent assemblyIov = parent.assemblyBuffer.getClEngineIovec(); collationIov = parent.collationBuffer.getClEngineIovec(); assemblyBufferPtr = assemblyIov.iov_base; assemblyBufferSize = assemblyIov.iov_len; collationBufferPtr = collationIov.iov_base; collationBufferSize = collationIov.iov_len; // Get FrameAssemblyDesc from assembly buffer frameAssemblyDesc = static_cast>( parent.assemblyBuffer); if (!frameAssemblyDesc || frameAssemblyDesc->slots.empty()) { std::cerr << __func__ << ": invalid frame descriptor" << std::endl; goto cleanup; } // Create OpenCL buffers using CL_MEM_USE_HOST_PTR for zero-copy clAssemblyBuffer = clCreateBuffer( context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, assemblyBufferSize, assemblyBufferPtr, &err); if (err != CL_SUCCESS || !clAssemblyBuffer) { std::cerr << __func__ << ": failed to create assembly buffer: " << err << std::endl; goto cleanup; } clCollationBuffer = clCreateBuffer( context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, collationBufferSize, collationBufferPtr, &err); if (err != CL_SUCCESS || !clCollationBuffer) { std::cerr << __func__ << ": failed to create collation buffer: " << err << std::endl; goto cleanup; } // Compile and prepare both kernels if (!compileAndPrepareKernels()) { goto cleanup; } compactIsSetup = true; collateIsSetup = true; return true; cleanup: finalize(); return false; } void OpenClCollatingAndMeshingEngine::finalize() { // Call stop() first stop(); // Release OpenCL buffers in reverse order if (clCollationBuffer) { clReleaseMemObject(clCollationBuffer); clCollationBuffer = nullptr; } if (clAssemblyBuffer) { clReleaseMemObject(clAssemblyBuffer); clAssemblyBuffer = nullptr; } // Release kernels if (slotCompactorKernel) { clReleaseKernel(slotCompactorKernel); slotCompactorKernel = nullptr; } if (collateKernel) { clReleaseKernel(collateKernel); collateKernel = nullptr; } // Release programs if (slotCompactorProgram) { clReleaseProgram(slotCompactorProgram); slotCompactorProgram = nullptr; } if (collateProgram) { clReleaseProgram(collateProgram); collateProgram = nullptr; } // Release command queue if (commandQueue) { clReleaseCommandQueue(commandQueue); commandQueue = nullptr; } // Release context if (context) { clReleaseContext(context); context = nullptr; } // Reset state variables device = nullptr; platform = nullptr; compactIsSetup = false; compactIsRunning = false; collateIsSetup = false; collateIsRunning = false; currentCompactKernelEvent = nullptr; currentCollateKernelEvent = nullptr; assemblyBufferPtr = nullptr; assemblyBufferSize = 0; collationBufferPtr = nullptr; collationBufferSize = 0; frameAssemblyDesc = nullptr; } // Static callback for compact kernel event void CL_CALLBACK OpenClCollatingAndMeshingEngine::compactKernelEventCallback( cl_event /*event*/, cl_int event_command_exec_status, void* user_data) { OpenClCollatingAndMeshingEngine* engine = static_cast(user_data); if (!engine || !engine->compactIsRunning || !engine->compactKernelCb) { return; } // Post to io_service to call callback on the correct thread if (engine->parent.device && engine->parent.device->componentThread) { engine->parent.device->componentThread->getIoService().post( std::bind(engine->compactKernelCb, event_command_exec_status)); } } // Static callback for collate kernel event void CL_CALLBACK OpenClCollatingAndMeshingEngine::collateKernelEventCallback( cl_event /*event*/, cl_int event_command_exec_status, void* user_data) { OpenClCollatingAndMeshingEngine* engine = static_cast(user_data); if (!engine || !engine->collateIsRunning || !engine->collateKernelCb) { return; } // Post to io_service to call callback on the correct thread if (engine->parent.device && engine->parent.device->componentThread) { engine->parent.device->componentThread->getIoService().post( std::bind(engine->collateKernelCb, event_command_exec_status)); } } bool OpenClCollatingAndMeshingEngine::startCompactKernel( StagingBuffer& assemblyBuff, uint32_t nSucceeded, compactKernelCbFn callback) { // Store the caller's callback compactKernelCb = std::move(callback); // Validate buffers callable auto validateBuffers = [this, &assemblyBuff]() { struct iovec assemblyIov = assemblyBuff.getClEngineIovec(); if (assemblyIov.iov_base != assemblyBufferPtr || assemblyIov.iov_len != assemblyBufferSize) { throw std::runtime_error( std::string(__func__) + ": buffer mismatch - buffers have changed"); } }; // Setup args callable auto setupArgs = [this, &assemblyBuff, nSucceeded]() { return setupSlotCompactorsArgs(assemblyBuff, nSucceeded); }; return startKernel( slotCompactorKernel, ¤tCompactKernelEvent, setupArgs, validateBuffers, 1, // globalWorkSize compactKernelEventCallback, "slotCompactor", compactIsSetup, compactIsRunning); } bool OpenClCollatingAndMeshingEngine::startCollateKernel( StagingBuffer& assemblyBuff, StagingBuffer& collationBuff, collateKernelCbFn callback) { // Store the caller's callback collateKernelCb = std::move(callback); // Validate buffers callable auto validateBuffers = [this, &assemblyBuff, &collationBuff]() { struct iovec assemblyIov = assemblyBuff.getClEngineIovec(); struct iovec collationIov = collationBuff.getClEngineIovec(); if (assemblyIov.iov_base != assemblyBufferPtr || assemblyIov.iov_len != assemblyBufferSize || collationIov.iov_base != collationBufferPtr || collationIov.iov_len != collationBufferSize) { throw std::runtime_error( std::string(__func__) + ": buffer mismatch - buffers have changed"); } }; // Setup args callable auto setupArgs = [this, &assemblyBuff]() { return setupCollateDgramsArgs(assemblyBuff); }; // Calculate global work size uint32_t nDgramsPerFrame = static_cast( frameAssemblyDesc->numSlots); size_t globalWorkSize = nDgramsPerFrame; return startKernel( collateKernel, ¤tCollateKernelEvent, setupArgs, validateBuffers, globalWorkSize, collateKernelEventCallback, "collateDgrams", collateIsSetup, collateIsRunning); } bool OpenClCollatingAndMeshingEngine::compileAndPrepareKernel( const char* kernelSource, size_t kernelSourceLen, const char* kernelName, cl_program& program, cl_kernel& kernel) { cl_int err; // Create program from source program = clCreateProgramWithSource( context, 1, &kernelSource, &kernelSourceLen, &err); if (err != CL_SUCCESS || !program) { std::cerr << __func__ << ": failed to create " << kernelName << " program: " << err << std::endl; return false; } // Build program err = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to build " << kernelName << " program: " << err << std::endl; // Print build log if available size_t logSize = 0; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize); if (logSize > 0) { std::vector log(logSize); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log.data(), nullptr); std::cerr << kernelName << " build log: " << log.data() << std::endl; } return false; } // Create kernel kernel = clCreateKernel(program, kernelName, &err); if (err != CL_SUCCESS || !kernel) { std::cerr << __func__ << ": failed to create " << kernelName << " kernel: " << err << std::endl; return false; } return true; } bool OpenClCollatingAndMeshingEngine::compileAndPrepareKernels() { // Compile slotCompactor kernel if (!compileAndPrepareKernel( slotCompactorKernelStart, slotCompactorKernelNBytes, "slotCompactor", slotCompactorProgram, slotCompactorKernel)) { return false; } // Compile collateDgrams kernel if (!compileAndPrepareKernel( collateKernelStart, collateKernelNBytes, "collate", collateProgram, collateKernel)) { return false; } return true; } bool OpenClCollatingAndMeshingEngine::setupSlotCompactorsArgs( StagingBuffer& assemblyBuff, uint32_t nSucceeded) { // Extract parameters for slotCompactor kernel uint32_t numSlots = static_cast(frameAssemblyDesc->numSlots); uint32_t slotStride = static_cast(assemblyBuff.slotStrideNBytes); uint32_t slotSize = static_cast(frameAssemblyDesc->slotSizeBytes); uint32_t firstSlotOffset = static_cast( assemblyBuff.firstSlotOffsetNBytes); uint32_t nSucceededUint = static_cast(nSucceeded); // Set kernel arguments for slotCompactor cl_int err; err = clSetKernelArg( slotCompactorKernel, 0, sizeof(cl_mem), &clAssemblyBuffer); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 0: " << err << std::endl; return false; } err = clSetKernelArg(slotCompactorKernel, 1, sizeof(uint32_t), &numSlots); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 1: " << err << std::endl; return false; } err = clSetKernelArg(slotCompactorKernel, 2, sizeof(uint32_t), &slotStride); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 2: " << err << std::endl; return false; } err = clSetKernelArg(slotCompactorKernel, 3, sizeof(uint32_t), &slotSize); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 3: " << err << std::endl; return false; } err = clSetKernelArg( slotCompactorKernel, 4, sizeof(uint32_t), &firstSlotOffset); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 4: " << err << std::endl; return false; } err = clSetKernelArg( slotCompactorKernel, 5, sizeof(uint32_t), &nSucceededUint); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 5: " << err << std::endl; return false; } return true; } bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs( StagingBuffer& assemblyBuff) { // Extract parameters for collateDgrams kernel uint32_t slotStride = static_cast(assemblyBuff.slotStrideNBytes); uint32_t firstSlotOffset = static_cast( assemblyBuff.firstSlotOffsetNBytes); // Calculate nPointsPerSlot from device return mode if (!parent.device) { std::cerr << __func__ << ": device not available" << std::endl; return false; } int returnMode = static_cast(parent.device->currentReturnMode); uint32_t nPointsPerSlot = static_cast( IoUringAssemblyEngine::computePointsPerDgram(returnMode)); uint32_t nDgramsPerFrame = static_cast( frameAssemblyDesc->numSlots); // Set kernel arguments for collateDgrams cl_int err; err = clSetKernelArg(collateKernel, 0, sizeof(cl_mem), &clAssemblyBuffer); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 0: " << err << std::endl; return false; } err = clSetKernelArg(collateKernel, 1, sizeof(cl_mem), &clCollationBuffer); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 1: " << err << std::endl; return false; } err = clSetKernelArg(collateKernel, 2, sizeof(uint32_t), &slotStride); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 2: " << err << std::endl; return false; } err = clSetKernelArg(collateKernel, 3, sizeof(uint32_t), &firstSlotOffset); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 3: " << err << std::endl; return false; } err = clSetKernelArg(collateKernel, 4, sizeof(uint32_t), &nPointsPerSlot); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 4: " << err << std::endl; return false; } err = clSetKernelArg(collateKernel, 5, sizeof(uint32_t), &nDgramsPerFrame); if (err != CL_SUCCESS) { std::cerr << __func__ << ": failed to set kernel arg 5: " << err << std::endl; return false; } return true; } void OpenClCollatingAndMeshingEngine::stop() { // Stop compact kernel if running if (compactIsRunning && currentCompactKernelEvent) { clWaitForEvents(1, ¤tCompactKernelEvent); clReleaseEvent(currentCompactKernelEvent); currentCompactKernelEvent = nullptr; compactIsRunning = false; } // Stop collate kernel if running if (collateIsRunning && currentCollateKernelEvent) { clWaitForEvents(1, ¤tCollateKernelEvent); clReleaseEvent(currentCollateKernelEvent); currentCollateKernelEvent = nullptr; collateIsRunning = false; } } void OpenClCollatingAndMeshingEngine::stopCompactKernel() { // Stop only compact kernel if (compactIsRunning && currentCompactKernelEvent) { clWaitForEvents(1, ¤tCompactKernelEvent); clReleaseEvent(currentCompactKernelEvent); currentCompactKernelEvent = nullptr; compactIsRunning = false; } compactKernelCb = [](cl_int){}; } void OpenClCollatingAndMeshingEngine::stopCollateKernel() { // Stop only collate kernel if (collateIsRunning && currentCollateKernelEvent) { clWaitForEvents(1, ¤tCollateKernelEvent); clReleaseEvent(currentCollateKernelEvent); currentCollateKernelEvent = nullptr; collateIsRunning = false; } collateKernelCb = [](cl_int){}; } } // namespace stim_buff } // namespace smo