OClCollatingMeshingEngn: Compile both kernels side by side

This commit is contained in:
2025-11-09 04:49:37 -04:00
parent 6264a128a8
commit 7977f0bcc9
2 changed files with 260 additions and 140 deletions
@@ -3,12 +3,12 @@
#include <iostream> #include <iostream>
#include <cstring> #include <cstring>
#include <vector> #include <vector>
#include <sys/mman.h>
#include <boost/system/error_code.hpp> #include <boost/system/error_code.hpp>
#include "openClCollatingAndMeshingEngine.h" #include "openClCollatingAndMeshingEngine.h"
#include "pcloudStimulusBuffer.h" #include "pcloudStimulusBuffer.h"
#include "openClKernels.h" #include "openClKernels.h"
#include "frameAssemblyDesc.h" #include "frameAssemblyDesc.h"
#include "ioUringAssemblyEngine.h"
namespace smo { namespace smo {
namespace stim_buff { namespace stim_buff {
@@ -19,14 +19,13 @@ platform(nullptr),
device(nullptr), device(nullptr),
context(nullptr), context(nullptr),
commandQueue(nullptr), commandQueue(nullptr),
program(nullptr), slotCompactorProgram(nullptr), collateProgram(nullptr),
kernel(nullptr), slotCompactorKernel(nullptr), collateKernel(nullptr),
isSetup(false), isSetup(false),
clAssemblyBuffer(nullptr), clAssemblyBuffer(nullptr),
clCollationBuffer(nullptr), clCollationBuffer(nullptr),
isRunning(false), isRunning(false),
currentKernelEvent(nullptr), currentKernelEvent(nullptr),
memoryPinned(false),
assemblyBufferPtr(nullptr), assemblyBufferPtr(nullptr),
assemblyBufferSize(0), assemblyBufferSize(0),
collationBufferPtr(nullptr), collationBufferPtr(nullptr),
@@ -90,8 +89,6 @@ bool OpenClCollatingAndMeshingEngine::setup()
// Declare variables early to avoid goto crossing initialization // Declare variables early to avoid goto crossing initialization
struct iovec assemblyIov; struct iovec assemblyIov;
struct iovec collationIov; struct iovec collationIov;
const char* kernelSource;
size_t kernelSourceLen;
// Get StagingBuffer memory pointers from parent // Get StagingBuffer memory pointers from parent
assemblyIov = parent.assemblyBuffer.getClEngineIovec(); assemblyIov = parent.assemblyBuffer.getClEngineIovec();
@@ -102,24 +99,6 @@ bool OpenClCollatingAndMeshingEngine::setup()
collationBufferPtr = collationIov.iov_base; collationBufferPtr = collationIov.iov_base;
collationBufferSize = collationIov.iov_len; collationBufferSize = collationIov.iov_len;
// Pin memory pages using mlock()
if (mlock(assemblyBufferPtr, assemblyBufferSize) != 0)
{
std::cerr << __func__ << ": failed to pin assembly buffer memory: "
<< strerror(errno) << std::endl;
goto cleanup;
}
if (mlock(collationBufferPtr, collationBufferSize) != 0)
{
std::cerr << __func__ << ": failed to pin collation buffer memory: "
<< strerror(errno) << std::endl;
munlock(assemblyBufferPtr, assemblyBufferSize);
goto cleanup;
}
memoryPinned = true;
// Create OpenCL buffers using CL_MEM_USE_HOST_PTR for zero-copy // Create OpenCL buffers using CL_MEM_USE_HOST_PTR for zero-copy
clAssemblyBuffer = clCreateBuffer( clAssemblyBuffer = clCreateBuffer(
context, context,
@@ -147,46 +126,9 @@ bool OpenClCollatingAndMeshingEngine::setup()
goto cleanup; goto cleanup;
} }
// Create program and kernel from external source // Compile and prepare both kernels
kernelSource = slotCompactorKernelStart; if (!compileAndPrepareKernels())
kernelSourceLen = slotCompactorKernelNBytes;
program = clCreateProgramWithSource(
context, 1, &kernelSource, &kernelSourceLen, &err);
if (err != CL_SUCCESS || !program)
{ {
std::cerr << __func__ << ": failed to create program: "
<< err << std::endl;
goto cleanup;
}
err = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to build program: "
<< err << std::endl;
// Print build log if available
size_t logSize = 0;
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
0, nullptr, &logSize);
if (logSize > 0)
{
std::vector<char> log(logSize);
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
logSize, log.data(), nullptr);
std::cerr << "Build log: " << log.data() << std::endl;
}
goto cleanup;
}
kernel = clCreateKernel(program, "slotCompactor", &err);
if (err != CL_SUCCESS || !kernel)
{
std::cerr << __func__ << ": failed to create kernel: "
<< err << std::endl;
goto cleanup; goto cleanup;
} }
@@ -203,19 +145,6 @@ void OpenClCollatingAndMeshingEngine::finalize()
// Call stop() first // Call stop() first
stop(); stop();
// Unpin memory pages if they were pinned
if (memoryPinned)
{
if (collationBufferPtr && collationBufferSize > 0) {
munlock(collationBufferPtr, collationBufferSize);
}
if (assemblyBufferPtr && assemblyBufferSize > 0) {
munlock(assemblyBufferPtr, assemblyBufferSize);
}
memoryPinned = false;
}
// Release OpenCL buffers in reverse order // Release OpenCL buffers in reverse order
if (clCollationBuffer) if (clCollationBuffer)
{ {
@@ -228,18 +157,28 @@ void OpenClCollatingAndMeshingEngine::finalize()
clAssemblyBuffer = nullptr; clAssemblyBuffer = nullptr;
} }
// Release kernel // Release kernels
if (kernel) if (slotCompactorKernel)
{ {
clReleaseKernel(kernel); clReleaseKernel(slotCompactorKernel);
kernel = nullptr; slotCompactorKernel = nullptr;
}
if (collateKernel)
{
clReleaseKernel(collateKernel);
collateKernel = nullptr;
} }
// Release program // Release programs
if (program) if (slotCompactorProgram)
{ {
clReleaseProgram(program); clReleaseProgram(slotCompactorProgram);
program = nullptr; slotCompactorProgram = nullptr;
}
if (collateProgram)
{
clReleaseProgram(collateProgram);
collateProgram = nullptr;
} }
// Release command queue // Release command queue
@@ -333,66 +272,24 @@ void OpenClCollatingAndMeshingEngine::start(
return; return;
} }
// Extract parameters for slotCompactor kernel // Set up kernel arguments for slotCompactor
uint32_t numSlots = static_cast<uint32_t>(frameDesc->numSlots); if (!setupSlotCompactorsArgs(assemblyBuff, nSucceeded)) {
uint32_t slotStride = static_cast<uint32_t>(assemblyBuff.slotStrideNBytes); return;
uint32_t slotSize = static_cast<uint32_t>(frameDesc->slotSizeBytes); }
uint32_t firstSlotOffset = static_cast<uint32_t>(assemblyBuff.firstSlotOffsetNBytes); // Set up kernel arguments for collateDgrams
if (!setupCollateDgramsArgs(assemblyBuff)) {
// Set kernel arguments for slotCompactor
cl_int err;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &clAssemblyBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 0: " << err << std::endl;
return; return;
} }
err = clSetKernelArg(kernel, 1, sizeof(uint32_t), &numSlots); // Enqueue slotCompactor kernel execution (single work item for sequential processing)
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 1: " << err << std::endl;
return;
}
err = clSetKernelArg(kernel, 2, sizeof(uint32_t), &slotStride);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 2: " << err << std::endl;
return;
}
err = clSetKernelArg(kernel, 3, sizeof(uint32_t), &slotSize);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 3: " << err << std::endl;
return;
}
err = clSetKernelArg(kernel, 4, sizeof(uint32_t), &firstSlotOffset);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 4: " << err << std::endl;
return;
}
uint32_t nSucceededUint = static_cast<uint32_t>(nSucceeded);
err = clSetKernelArg(kernel, 5, sizeof(uint32_t), &nSucceededUint);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 5: " << err << std::endl;
return;
}
// Enqueue kernel execution (single work item for sequential processing)
size_t globalWorkSize = 1; size_t globalWorkSize = 1;
err = clEnqueueNDRangeKernel( cl_int err = clEnqueueNDRangeKernel(
commandQueue, kernel, 1, nullptr, &globalWorkSize, nullptr, commandQueue, slotCompactorKernel, 1, nullptr, &globalWorkSize, nullptr,
0, nullptr, &currentKernelEvent); 0, nullptr, &currentKernelEvent);
if (err != CL_SUCCESS) if (err != CL_SUCCESS)
{ {
std::cerr << __func__ << ": failed to enqueue kernel: " << err << std::endl; std::cerr << __func__ << ": failed to enqueue slotCompactor kernel: " << err << std::endl;
return; return;
} }
@@ -420,6 +317,219 @@ void OpenClCollatingAndMeshingEngine::start(
// The callback will be invoked when the kernel completes // The callback will be invoked when the kernel completes
} }
bool OpenClCollatingAndMeshingEngine::compileAndPrepareKernel(
const char* kernelSource, size_t kernelSourceLen,
const char* kernelName, cl_program& program, cl_kernel& kernel)
{
cl_int err;
// Create program from source
program = clCreateProgramWithSource(
context, 1, &kernelSource, &kernelSourceLen, &err);
if (err != CL_SUCCESS || !program)
{
std::cerr << __func__ << ": failed to create " << kernelName
<< " program: " << err << std::endl;
return false;
}
// Build program
err = clBuildProgram(program, 1, &device, nullptr, nullptr, nullptr);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to build " << kernelName
<< " program: " << err << std::endl;
// Print build log if available
size_t logSize = 0;
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
0, nullptr, &logSize);
if (logSize > 0)
{
std::vector<char> log(logSize);
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
logSize, log.data(), nullptr);
std::cerr << kernelName << " build log: " << log.data() << std::endl;
}
return false;
}
// Create kernel
kernel = clCreateKernel(program, kernelName, &err);
if (err != CL_SUCCESS || !kernel)
{
std::cerr << __func__ << ": failed to create " << kernelName
<< " kernel: " << err << std::endl;
return false;
}
return true;
}
bool OpenClCollatingAndMeshingEngine::compileAndPrepareKernels()
{
// Compile slotCompactor kernel
if (!compileAndPrepareKernel(
slotCompactorKernelStart, slotCompactorKernelNBytes,
"slotCompactor", slotCompactorProgram, slotCompactorKernel))
{
return false;
}
// Compile collateDgrams kernel
if (!compileAndPrepareKernel(
collateKernelStart, collateKernelNBytes,
"collate", collateProgram, collateKernel))
{
return false;
}
return true;
}
bool OpenClCollatingAndMeshingEngine::setupSlotCompactorsArgs(
StagingBuffer& assemblyBuff, uint32_t nSucceeded)
{
// Get FrameAssemblyDesc from assembly buffer
std::shared_ptr<FrameAssemblyDesc> frameDesc =
static_cast<std::shared_ptr<FrameAssemblyDesc>>(assemblyBuff);
if (!frameDesc || frameDesc->slots.empty())
{
std::cerr << __func__ << ": invalid frame descriptor" << std::endl;
return false;
}
// Extract parameters for slotCompactor kernel
uint32_t numSlots = static_cast<uint32_t>(frameDesc->numSlots);
uint32_t slotStride = static_cast<uint32_t>(assemblyBuff.slotStrideNBytes);
uint32_t slotSize = static_cast<uint32_t>(frameDesc->slotSizeBytes);
uint32_t firstSlotOffset = static_cast<uint32_t>(assemblyBuff.firstSlotOffsetNBytes);
uint32_t nSucceededUint = static_cast<uint32_t>(nSucceeded);
// Set kernel arguments for slotCompactor
cl_int err;
err = clSetKernelArg(slotCompactorKernel, 0, sizeof(cl_mem), &clAssemblyBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 0: " << err << std::endl;
return false;
}
err = clSetKernelArg(slotCompactorKernel, 1, sizeof(uint32_t), &numSlots);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 1: " << err << std::endl;
return false;
}
err = clSetKernelArg(slotCompactorKernel, 2, sizeof(uint32_t), &slotStride);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 2: " << err << std::endl;
return false;
}
err = clSetKernelArg(slotCompactorKernel, 3, sizeof(uint32_t), &slotSize);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 3: " << err << std::endl;
return false;
}
err = clSetKernelArg(slotCompactorKernel, 4, sizeof(uint32_t), &firstSlotOffset);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 4: " << err << std::endl;
return false;
}
err = clSetKernelArg(slotCompactorKernel, 5, sizeof(uint32_t), &nSucceededUint);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 5: " << err << std::endl;
return false;
}
return true;
}
bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
StagingBuffer& assemblyBuff)
{
// Get FrameAssemblyDesc from assembly buffer
std::shared_ptr<FrameAssemblyDesc> frameDesc =
static_cast<std::shared_ptr<FrameAssemblyDesc>>(assemblyBuff);
if (!frameDesc || frameDesc->slots.empty())
{
std::cerr << __func__ << ": invalid frame descriptor" << std::endl;
return false;
}
// Extract parameters for collateDgrams kernel
uint32_t slotStride = static_cast<uint32_t>(assemblyBuff.slotStrideNBytes);
uint32_t firstSlotOffset = static_cast<uint32_t>(assemblyBuff.firstSlotOffsetNBytes);
// Calculate nPointsPerSlot from device return mode
if (!parent.device)
{
std::cerr << __func__ << ": device not available" << std::endl;
return false;
}
int returnMode = static_cast<int>(parent.device->currentReturnMode);
uint32_t nPointsPerSlot = static_cast<uint32_t>(
IoUringAssemblyEngine::computePointsPerDgram(returnMode));
uint32_t nDgramsPerFrame = static_cast<uint32_t>(frameDesc->numSlots);
// Set kernel arguments for collateDgrams
cl_int err;
err = clSetKernelArg(collateKernel, 0, sizeof(cl_mem), &clAssemblyBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 0: " << err << std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 1, sizeof(cl_mem), &clCollationBuffer);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 1: " << err << std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 2, sizeof(uint32_t), &slotStride);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 2: " << err << std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 3, sizeof(uint32_t), &firstSlotOffset);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 3: " << err << std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 4, sizeof(uint32_t), &nPointsPerSlot);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 4: " << err << std::endl;
return false;
}
err = clSetKernelArg(collateKernel, 5, sizeof(uint32_t), &nDgramsPerFrame);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 5: " << err << std::endl;
return false;
}
return true;
}
void OpenClCollatingAndMeshingEngine::stop() void OpenClCollatingAndMeshingEngine::stop()
{ {
if (!isRunning) { if (!isRunning) {
@@ -51,8 +51,10 @@ private:
cl_device_id device; cl_device_id device;
cl_context context; cl_context context;
cl_command_queue commandQueue; cl_command_queue commandQueue;
cl_program program; cl_program slotCompactorProgram;
cl_kernel kernel; cl_program collateProgram;
cl_kernel slotCompactorKernel;
cl_kernel collateKernel;
bool isSetup; bool isSetup;
// OpenCL buffers // OpenCL buffers
@@ -62,9 +64,8 @@ private:
// State tracking // State tracking
bool isRunning; bool isRunning;
cl_event currentKernelEvent; cl_event currentKernelEvent;
bool memoryPinned;
// Memory pinning tracking // Memory tracking
void* assemblyBufferPtr; void* assemblyBufferPtr;
size_t assemblyBufferSize; size_t assemblyBufferSize;
void* collationBufferPtr; void* collationBufferPtr;
@@ -76,6 +77,15 @@ private:
// Static callback for OpenCL event // Static callback for OpenCL event
static void CL_CALLBACK kernelEventCallback( static void CL_CALLBACK kernelEventCallback(
cl_event event, cl_int event_command_exec_status, void* user_data); cl_event event, cl_int event_command_exec_status, void* user_data);
// Private helper methods
bool compileAndPrepareKernel(
const char* kernelSource, size_t kernelSourceLen,
const char* kernelName, cl_program& program, cl_kernel& kernel);
bool compileAndPrepareKernels();
bool setupSlotCompactorsArgs(
StagingBuffer& assemblyBuff, uint32_t nSucceeded);
bool setupCollateDgramsArgs(StagingBuffer& assemblyBuff);
}; };
} // namespace stim_buff } // namespace stim_buff