diff --git a/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.cpp b/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.cpp index 5b17c33..23f8b08 100644 --- a/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.cpp +++ b/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.cpp @@ -3,12 +3,12 @@ #include #include #include -#include #include #include "openClCollatingAndMeshingEngine.h" #include "pcloudStimulusBuffer.h" #include "openClKernels.h" #include "frameAssemblyDesc.h" +#include "ioUringAssemblyEngine.h" namespace smo { namespace stim_buff { @@ -19,14 +19,13 @@ platform(nullptr), device(nullptr), context(nullptr), commandQueue(nullptr), -program(nullptr), -kernel(nullptr), +slotCompactorProgram(nullptr), collateProgram(nullptr), +slotCompactorKernel(nullptr), collateKernel(nullptr), isSetup(false), clAssemblyBuffer(nullptr), clCollationBuffer(nullptr), isRunning(false), currentKernelEvent(nullptr), -memoryPinned(false), assemblyBufferPtr(nullptr), assemblyBufferSize(0), collationBufferPtr(nullptr), @@ -90,8 +89,6 @@ bool OpenClCollatingAndMeshingEngine::setup() // Declare variables early to avoid goto crossing initialization struct iovec assemblyIov; struct iovec collationIov; - const char* kernelSource; - size_t kernelSourceLen; // Get StagingBuffer memory pointers from parent assemblyIov = parent.assemblyBuffer.getClEngineIovec(); @@ -102,24 +99,6 @@ bool OpenClCollatingAndMeshingEngine::setup() collationBufferPtr = collationIov.iov_base; collationBufferSize = collationIov.iov_len; - // Pin memory pages using mlock() - if (mlock(assemblyBufferPtr, assemblyBufferSize) != 0) - { - std::cerr << __func__ << ": failed to pin assembly buffer memory: " - << strerror(errno) << std::endl; - goto cleanup; - } - - if (mlock(collationBufferPtr, collationBufferSize) != 0) - { - std::cerr << __func__ << ": failed to pin collation buffer memory: " - << strerror(errno) << std::endl; - munlock(assemblyBufferPtr, assemblyBufferSize); - goto cleanup; - } - - memoryPinned = true; - // Create OpenCL buffers using CL_MEM_USE_HOST_PTR for zero-copy clAssemblyBuffer = clCreateBuffer( context, @@ -147,46 +126,9 @@ bool OpenClCollatingAndMeshingEngine::setup() goto cleanup; } - // Create program and kernel from external source - kernelSource = slotCompactorKernelStart; - kernelSourceLen = slotCompactorKernelNBytes; - program = clCreateProgramWithSource( - context, 1, &kernelSource, &kernelSourceLen, &err); - - if (err != CL_SUCCESS || !program) + // Compile and prepare both kernels + if (!compileAndPrepareKernels()) { - std::cerr << __func__ << ": failed to create program: " - << err << std::endl; - goto cleanup; - } - - err = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr); - if (err != CL_SUCCESS) - { - std::cerr << __func__ << ": failed to build program: " - << err << std::endl; - - // Print build log if available - size_t logSize = 0; - clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, - 0, nullptr, &logSize); - - if (logSize > 0) - { - std::vector log(logSize); - clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, - logSize, log.data(), nullptr); - std::cerr << "Build log: " << log.data() << std::endl; - } - - goto cleanup; - } - - kernel = clCreateKernel(program, "slotCompactor", &err); - if (err != CL_SUCCESS || !kernel) - { - std::cerr << __func__ << ": failed to create kernel: " - << err << std::endl; goto cleanup; } @@ -203,19 +145,6 @@ void OpenClCollatingAndMeshingEngine::finalize() // Call stop() first stop(); - // Unpin memory pages if they were pinned - if (memoryPinned) - { - if (collationBufferPtr && collationBufferSize > 0) { - munlock(collationBufferPtr, collationBufferSize); - } - if (assemblyBufferPtr && assemblyBufferSize > 0) { - munlock(assemblyBufferPtr, assemblyBufferSize); - } - - memoryPinned = false; - } - // Release OpenCL buffers in reverse order if (clCollationBuffer) { @@ -228,18 +157,28 @@ void OpenClCollatingAndMeshingEngine::finalize() clAssemblyBuffer = nullptr; } - // Release kernel - if (kernel) + // Release kernels + if (slotCompactorKernel) { - clReleaseKernel(kernel); - kernel = nullptr; + clReleaseKernel(slotCompactorKernel); + slotCompactorKernel = nullptr; + } + if (collateKernel) + { + clReleaseKernel(collateKernel); + collateKernel = nullptr; } - // Release program - if (program) + // Release programs + if (slotCompactorProgram) { - clReleaseProgram(program); - program = nullptr; + clReleaseProgram(slotCompactorProgram); + slotCompactorProgram = nullptr; + } + if (collateProgram) + { + clReleaseProgram(collateProgram); + collateProgram = nullptr; } // Release command queue @@ -333,66 +272,24 @@ void OpenClCollatingAndMeshingEngine::start( return; } - // Extract parameters for slotCompactor kernel - uint32_t numSlots = static_cast(frameDesc->numSlots); - uint32_t slotStride = static_cast(assemblyBuff.slotStrideNBytes); - uint32_t slotSize = static_cast(frameDesc->slotSizeBytes); - uint32_t firstSlotOffset = static_cast(assemblyBuff.firstSlotOffsetNBytes); - - // Set kernel arguments for slotCompactor - cl_int err; - err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &clAssemblyBuffer); - if (err != CL_SUCCESS) - { - std::cerr << __func__ << ": failed to set kernel arg 0: " << err << std::endl; + // Set up kernel arguments for slotCompactor + if (!setupSlotCompactorsArgs(assemblyBuff, nSucceeded)) { + return; + } + // Set up kernel arguments for collateDgrams + if (!setupCollateDgramsArgs(assemblyBuff)) { return; } - err = clSetKernelArg(kernel, 1, sizeof(uint32_t), &numSlots); - if (err != CL_SUCCESS) - { - std::cerr << __func__ << ": failed to set kernel arg 1: " << err << std::endl; - return; - } - - err = clSetKernelArg(kernel, 2, sizeof(uint32_t), &slotStride); - if (err != CL_SUCCESS) - { - std::cerr << __func__ << ": failed to set kernel arg 2: " << err << std::endl; - return; - } - - err = clSetKernelArg(kernel, 3, sizeof(uint32_t), &slotSize); - if (err != CL_SUCCESS) - { - std::cerr << __func__ << ": failed to set kernel arg 3: " << err << std::endl; - return; - } - - err = clSetKernelArg(kernel, 4, sizeof(uint32_t), &firstSlotOffset); - if (err != CL_SUCCESS) - { - std::cerr << __func__ << ": failed to set kernel arg 4: " << err << std::endl; - return; - } - - uint32_t nSucceededUint = static_cast(nSucceeded); - err = clSetKernelArg(kernel, 5, sizeof(uint32_t), &nSucceededUint); - if (err != CL_SUCCESS) - { - std::cerr << __func__ << ": failed to set kernel arg 5: " << err << std::endl; - return; - } - - // Enqueue kernel execution (single work item for sequential processing) + // Enqueue slotCompactor kernel execution (single work item for sequential processing) size_t globalWorkSize = 1; - err = clEnqueueNDRangeKernel( - commandQueue, kernel, 1, nullptr, &globalWorkSize, nullptr, + cl_int err = clEnqueueNDRangeKernel( + commandQueue, slotCompactorKernel, 1, nullptr, &globalWorkSize, nullptr, 0, nullptr, ¤tKernelEvent); if (err != CL_SUCCESS) { - std::cerr << __func__ << ": failed to enqueue kernel: " << err << std::endl; + std::cerr << __func__ << ": failed to enqueue slotCompactor kernel: " << err << std::endl; return; } @@ -420,6 +317,219 @@ void OpenClCollatingAndMeshingEngine::start( // The callback will be invoked when the kernel completes } +bool OpenClCollatingAndMeshingEngine::compileAndPrepareKernel( + const char* kernelSource, size_t kernelSourceLen, + const char* kernelName, cl_program& program, cl_kernel& kernel) +{ + cl_int err; + + // Create program from source + program = clCreateProgramWithSource( + context, 1, &kernelSource, &kernelSourceLen, &err); + + if (err != CL_SUCCESS || !program) + { + std::cerr << __func__ << ": failed to create " << kernelName + << " program: " << err << std::endl; + return false; + } + + // Build program + err = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to build " << kernelName + << " program: " << err << std::endl; + + // Print build log if available + size_t logSize = 0; + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, + 0, nullptr, &logSize); + + if (logSize > 0) + { + std::vector log(logSize); + clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, + logSize, log.data(), nullptr); + std::cerr << kernelName << " build log: " << log.data() << std::endl; + } + + return false; + } + + // Create kernel + kernel = clCreateKernel(program, kernelName, &err); + if (err != CL_SUCCESS || !kernel) + { + std::cerr << __func__ << ": failed to create " << kernelName + << " kernel: " << err << std::endl; + return false; + } + + return true; +} + +bool OpenClCollatingAndMeshingEngine::compileAndPrepareKernels() +{ + // Compile slotCompactor kernel + if (!compileAndPrepareKernel( + slotCompactorKernelStart, slotCompactorKernelNBytes, + "slotCompactor", slotCompactorProgram, slotCompactorKernel)) + { + return false; + } + + // Compile collateDgrams kernel + if (!compileAndPrepareKernel( + collateKernelStart, collateKernelNBytes, + "collate", collateProgram, collateKernel)) + { + return false; + } + + return true; +} + +bool OpenClCollatingAndMeshingEngine::setupSlotCompactorsArgs( + StagingBuffer& assemblyBuff, uint32_t nSucceeded) +{ + // Get FrameAssemblyDesc from assembly buffer + std::shared_ptr frameDesc = + static_cast>(assemblyBuff); + if (!frameDesc || frameDesc->slots.empty()) + { + std::cerr << __func__ << ": invalid frame descriptor" << std::endl; + return false; + } + + // Extract parameters for slotCompactor kernel + uint32_t numSlots = static_cast(frameDesc->numSlots); + uint32_t slotStride = static_cast(assemblyBuff.slotStrideNBytes); + uint32_t slotSize = static_cast(frameDesc->slotSizeBytes); + uint32_t firstSlotOffset = static_cast(assemblyBuff.firstSlotOffsetNBytes); + uint32_t nSucceededUint = static_cast(nSucceeded); + + // Set kernel arguments for slotCompactor + cl_int err; + err = clSetKernelArg(slotCompactorKernel, 0, sizeof(cl_mem), &clAssemblyBuffer); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 0: " << err << std::endl; + return false; + } + + err = clSetKernelArg(slotCompactorKernel, 1, sizeof(uint32_t), &numSlots); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 1: " << err << std::endl; + return false; + } + + err = clSetKernelArg(slotCompactorKernel, 2, sizeof(uint32_t), &slotStride); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 2: " << err << std::endl; + return false; + } + + err = clSetKernelArg(slotCompactorKernel, 3, sizeof(uint32_t), &slotSize); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 3: " << err << std::endl; + return false; + } + + err = clSetKernelArg(slotCompactorKernel, 4, sizeof(uint32_t), &firstSlotOffset); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 4: " << err << std::endl; + return false; + } + + err = clSetKernelArg(slotCompactorKernel, 5, sizeof(uint32_t), &nSucceededUint); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 5: " << err << std::endl; + return false; + } + + return true; +} + +bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs( + StagingBuffer& assemblyBuff) +{ + // Get FrameAssemblyDesc from assembly buffer + std::shared_ptr frameDesc = + static_cast>(assemblyBuff); + if (!frameDesc || frameDesc->slots.empty()) + { + std::cerr << __func__ << ": invalid frame descriptor" << std::endl; + return false; + } + + // Extract parameters for collateDgrams kernel + uint32_t slotStride = static_cast(assemblyBuff.slotStrideNBytes); + uint32_t firstSlotOffset = static_cast(assemblyBuff.firstSlotOffsetNBytes); + + // Calculate nPointsPerSlot from device return mode + if (!parent.device) + { + std::cerr << __func__ << ": device not available" << std::endl; + return false; + } + int returnMode = static_cast(parent.device->currentReturnMode); + uint32_t nPointsPerSlot = static_cast( + IoUringAssemblyEngine::computePointsPerDgram(returnMode)); + uint32_t nDgramsPerFrame = static_cast(frameDesc->numSlots); + + // Set kernel arguments for collateDgrams + cl_int err; + err = clSetKernelArg(collateKernel, 0, sizeof(cl_mem), &clAssemblyBuffer); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 0: " << err << std::endl; + return false; + } + + err = clSetKernelArg(collateKernel, 1, sizeof(cl_mem), &clCollationBuffer); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 1: " << err << std::endl; + return false; + } + + err = clSetKernelArg(collateKernel, 2, sizeof(uint32_t), &slotStride); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 2: " << err << std::endl; + return false; + } + + err = clSetKernelArg(collateKernel, 3, sizeof(uint32_t), &firstSlotOffset); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 3: " << err << std::endl; + return false; + } + + err = clSetKernelArg(collateKernel, 4, sizeof(uint32_t), &nPointsPerSlot); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 4: " << err << std::endl; + return false; + } + + err = clSetKernelArg(collateKernel, 5, sizeof(uint32_t), &nDgramsPerFrame); + if (err != CL_SUCCESS) + { + std::cerr << __func__ << ": failed to set kernel arg 5: " << err << std::endl; + return false; + } + + return true; +} + void OpenClCollatingAndMeshingEngine::stop() { if (!isRunning) { diff --git a/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.h b/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.h index 16ac1e0..c06cdce 100644 --- a/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.h +++ b/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.h @@ -51,8 +51,10 @@ private: cl_device_id device; cl_context context; cl_command_queue commandQueue; - cl_program program; - cl_kernel kernel; + cl_program slotCompactorProgram; + cl_program collateProgram; + cl_kernel slotCompactorKernel; + cl_kernel collateKernel; bool isSetup; // OpenCL buffers @@ -62,9 +64,8 @@ private: // State tracking bool isRunning; cl_event currentKernelEvent; - bool memoryPinned; - // Memory pinning tracking + // Memory tracking void* assemblyBufferPtr; size_t assemblyBufferSize; void* collationBufferPtr; @@ -76,6 +77,15 @@ private: // Static callback for OpenCL event static void CL_CALLBACK kernelEventCallback( cl_event event, cl_int event_command_exec_status, void* user_data); + + // Private helper methods + bool compileAndPrepareKernel( + const char* kernelSource, size_t kernelSourceLen, + const char* kernelName, cl_program& program, cl_kernel& kernel); + bool compileAndPrepareKernels(); + bool setupSlotCompactorsArgs( + StagingBuffer& assemblyBuff, uint32_t nSucceeded); + bool setupCollateDgramsArgs(StagingBuffer& assemblyBuff); }; } // namespace stim_buff