From f58f90836687c63b7ddb602178eec89587c68698 Mon Sep 17 00:00:00 2001 From: Hayodea Hekol Date: Wed, 12 Nov 2025 12:29:19 -0400 Subject: [PATCH] OpenCL checks: Add check for the need to clFlush kernel cmds --- compile/CMakeLists.txt | 5 + compile/clshmemlatency.cpp | 2 +- compile/clshmemlatency_callback.cpp | 300 ++++++++++++++++++++++++++++ 3 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 compile/clshmemlatency_callback.cpp diff --git a/compile/CMakeLists.txt b/compile/CMakeLists.txt index 6d35d97..56ec87e 100644 --- a/compile/CMakeLists.txt +++ b/compile/CMakeLists.txt @@ -39,6 +39,11 @@ if(COMPILE_CL_CHECKS) PUBLIC ${OPENCL_INCLUDE_DIRS}) target_link_libraries(clshmemlatency ${OPENCL_LIBRARIES}) + add_executable(clshmemlatency_callback clshmemlatency_callback.cpp) + target_include_directories(clshmemlatency_callback + PUBLIC ${OPENCL_INCLUDE_DIRS}) + target_link_libraries(clshmemlatency_callback + ${OPENCL_LIBRARIES}) add_executable(clshmemcheck clshmemcheck.cpp) target_include_directories(clshmemcheck PUBLIC ${OPENCL_INCLUDE_DIRS}) diff --git a/compile/clshmemlatency.cpp b/compile/clshmemlatency.cpp index 9eb5439..bb117da 100644 --- a/compile/clshmemlatency.cpp +++ b/compile/clshmemlatency.cpp @@ -121,7 +121,7 @@ int main() { // -------------------- // Run a few iterations - for (int iter = 0; iter < 5; ++iter) { + for (int iter = 0; iter < 10; ++iter) { cl_event evt; auto t0 = std::chrono::high_resolution_clock::now(); diff --git a/compile/clshmemlatency_callback.cpp b/compile/clshmemlatency_callback.cpp new file mode 100644 index 0000000..f06ee42 --- /dev/null +++ b/compile/clshmemlatency_callback.cpp @@ -0,0 +1,300 @@ +#define CL_TARGET_OPENCL_VERSION 300 +#include +#include +#include +#include +#include +#include +#include +#include + +void checkCLError(cl_int err, const char* msg) { + if (err != CL_SUCCESS) { + std::cerr << "OpenCL Error " << err << " at: " << msg << std::endl; + exit(1); + } +} + +// Callback context for waiting on events +struct CallbackContext { + std::mutex mtx; + std::condition_variable cv; + bool completed; + cl_int status; + std::chrono::high_resolution_clock::time_point* timestamp; +}; + +// Helper function to wait for callback completion +void waitForCallback(CallbackContext& ctx) { + std::unique_lock lock(ctx.mtx); + ctx.cv.wait(lock, [&ctx] { return ctx.completed; }); +std::cout <<"waitForCallback cv.wait() returned.\n"; +} + +// Static callback for map buffer event +void CL_CALLBACK mapEventCallback(cl_event /*event*/, cl_int event_command_exec_status, void* user_data) { + CallbackContext* ctx = static_cast(user_data); +std::cout <<"mapEventCallback called and about to lock mutex.\n"; + std::unique_lock lock(ctx->mtx); + ctx->status = event_command_exec_status; + if (ctx->timestamp) { + *ctx->timestamp = std::chrono::high_resolution_clock::now(); + } + ctx->completed = true; + ctx->cv.notify_one(); +std::cout <<"mapEventCallback just notified.\n"; +} + +// Static callback for kernel execution event +void CL_CALLBACK kernelEventCallback(cl_event /*event*/, cl_int event_command_exec_status, void* user_data) { + CallbackContext* ctx = static_cast(user_data); +std::cout <<"mapEventCallback called and about to lock mutex.\n"; + std::unique_lock lock(ctx->mtx); + ctx->status = event_command_exec_status; + if (ctx->timestamp) { + *ctx->timestamp = std::chrono::high_resolution_clock::now(); + } + ctx->completed = true; + ctx->cv.notify_one(); +std::cout <<"mapEventCallback just notified.\n"; +} + +// Static callback for unmap buffer event +void CL_CALLBACK unmapEventCallback(cl_event /*event*/, cl_int event_command_exec_status, void* user_data) { + CallbackContext* ctx = static_cast(user_data); +std::cout <<"mapEventCallback called and about to lock mutex.\n"; + std::unique_lock lock(ctx->mtx); + ctx->status = event_command_exec_status; + if (ctx->timestamp) { + *ctx->timestamp = std::chrono::high_resolution_clock::now(); + } + ctx->completed = true; + ctx->cv.notify_one(); +std::cout <<"mapEventCallback just notified.\n"; +} + +// -------------------- +// Kernel source +// Simple mock kernel that simulates splitting XYZ/I +// Each "point" is 16 bytes (XYZ + Intensity) +const char* kernelSrc = R"CLC( +__kernel void xyz_i_split(__global uchar* assembly, + __global uchar* xyzOut, + __global uchar* iOut, + const uint numPoints) { + uint gid = get_global_id(0); + if (gid >= numPoints) return; + + uint offset = gid * 16; + // Copy XYZ (12 bytes) to xyzOut + for (int i=0; i<12; ++i) + xyzOut[gid*12 + i] = assembly[offset + i]; + + // Copy Intensity (4 bytes) to iOut + for (int i=0; i<4; ++i) + iOut[gid*4 + i] = assembly[offset + 12 + i]; +} +)CLC"; + +int main() { + // -------------------- + // CHANGE THIS VALUE to set number of points per assembly buffer + const size_t numPointsPerAssembly = 100000; // e.g., ~3333 points per fill + const size_t bytesPerPoint = 16; // 12 bytes XYZ + 4 bytes I + + const size_t assemblyBufSize = numPointsPerAssembly * bytesPerPoint; + const size_t xyzBufSize = numPointsPerAssembly * 12; + const size_t iBufSize = numPointsPerAssembly * 4; + + cl_uint numPlatforms = 0; + checkCLError(clGetPlatformIDs(0, nullptr, &numPlatforms), "get num platforms"); + std::vector platforms(numPlatforms); + checkCLError(clGetPlatformIDs(numPlatforms, platforms.data(), nullptr), "get platforms"); + + std::cout << "Found " << numPlatforms << " OpenCL platforms\n\n"; + + for (cl_uint p = 0; p < numPlatforms; ++p) { + char platformName[256]; + clGetPlatformInfo(platforms[p], CL_PLATFORM_NAME, sizeof(platformName), platformName, nullptr); + std::cout << "Platform " << p << ": " << platformName << "\n"; + + cl_uint numDevices = 0; + clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, 0, nullptr, &numDevices); + std::vector devices(numDevices); + clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, numDevices, devices.data(), nullptr); + + for (cl_uint d = 0; d < numDevices; ++d) { + char deviceName[256]; + clGetDeviceInfo(devices[d], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr); + std::cout << " Device " << d << ": " << deviceName << "\n"; + + cl_int err; + cl_context ctx = clCreateContext(nullptr, 1, &devices[d], nullptr, nullptr, &err); + checkCLError(err, "create context"); + + cl_queue_properties queueProps[] = {CL_QUEUE_PROPERTIES, 0, 0}; + cl_command_queue q = clCreateCommandQueueWithProperties(ctx, devices[d], queueProps, &err); + checkCLError(err, "create queue"); + + // -------------------- + // Allocate host buffers + std::vector assemblyHost(assemblyBufSize, 42); + std::vector xyzHost(xyzBufSize, 0); + std::vector iHost(iBufSize, 0); + + std::vector xyzHostCPU(xyzBufSize, 0); + std::vector iHostCPU(iBufSize, 0); + + // Create CL buffers + cl_mem assemblyBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, assemblyBufSize, assemblyHost.data(), &err); + checkCLError(err, "create assembly buffer"); + cl_mem xyzBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, xyzBufSize, xyzHost.data(), &err); + checkCLError(err, "create xyz buffer"); + cl_mem iBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, iBufSize, iHost.data(), &err); + checkCLError(err, "create i buffer"); + + // Build program + cl_program prog = clCreateProgramWithSource(ctx, 1, &kernelSrc, nullptr, &err); + checkCLError(err, "create program"); + + err = clBuildProgram(prog, 1, &devices[d], nullptr, nullptr, nullptr); + if (err != CL_SUCCESS) { + // Print build log + size_t logSize = 0; + clGetProgramBuildInfo(prog, devices[d], CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize); + std::vector log(logSize); + clGetProgramBuildInfo(prog, devices[d], CL_PROGRAM_BUILD_LOG, logSize, log.data(), nullptr); + std::cerr << log.data() << "\n"; + } + checkCLError(err, "build program"); + + cl_kernel kernel = clCreateKernel(prog, "xyz_i_split", &err); + checkCLError(err, "create kernel"); + + // Set kernel args + clSetKernelArg(kernel, 0, sizeof(cl_mem), &assemblyBuf); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &xyzBuf); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &iBuf); + clSetKernelArg(kernel, 3, sizeof(cl_uint), &numPointsPerAssembly); + + const size_t globalWorkSize = numPointsPerAssembly; + + // -------------------- + // Run a few iterations + for (int iter = 0; iter < 10; ++iter) { + auto t0 = std::chrono::high_resolution_clock::now(); + std::chrono::high_resolution_clock::time_point t1, t2, t3; + + cl_event mapEvt; + void* mappedAssembly = clEnqueueMapBuffer(q, assemblyBuf, CL_FALSE, CL_MAP_READ, 0, assemblyBufSize, 0, nullptr, &mapEvt, &err); + checkCLError(err, "map assembly buffer"); + + // Retain event to keep it alive until callback completes + err = clRetainEvent(mapEvt); + checkCLError(err, "retain map event"); + + // Wait for map event using callback + CallbackContext mapCtx; + mapCtx.completed = false; + mapCtx.timestamp = &t1; + err = clSetEventCallback(mapEvt, CL_COMPLETE, mapEventCallback, &mapCtx); + checkCLError(err, "set map event callback"); + // Force queue flush to ensure event processing + err = clFlush(q); + checkCLError(err, "flush queue"); +std::cout <<"About to waitForCalllback for clEnqueueMapBuffer.\n"; + waitForCallback(mapCtx); + checkCLError(mapCtx.status, "map buffer"); + + // Release event after callback completes + err = clReleaseEvent(mapEvt); + checkCLError(err, "release map event"); + + cl_event kernelEvt; + err = clEnqueueNDRangeKernel(q, kernel, 1, nullptr, &globalWorkSize, nullptr, 0, nullptr, &kernelEvt); + checkCLError(err, "enqueue kernel"); + + // Retain event to keep it alive until callback completes + err = clRetainEvent(kernelEvt); + checkCLError(err, "retain kernel event"); + + // Wait for kernel event using callback + CallbackContext kernelCtx; + kernelCtx.completed = false; + kernelCtx.timestamp = &t2; + err = clSetEventCallback(kernelEvt, CL_COMPLETE, kernelEventCallback, &kernelCtx); + checkCLError(err, "set kernel event callback"); + // Force queue flush to ensure event processing + err = clFlush(q); + checkCLError(err, "flush queue"); +std::cout <<"About to waitForCalllback for clEnqueueNDRangeKernel.\n"; + waitForCallback(kernelCtx); + checkCLError(kernelCtx.status, "kernel execution"); + + // Release event after callback completes + err = clReleaseEvent(kernelEvt); + checkCLError(err, "release kernel event"); + + cl_event unmapEvt; + err = clEnqueueUnmapMemObject(q, assemblyBuf, mappedAssembly, 0, nullptr, &unmapEvt); + checkCLError(err, "unmap assembly buffer"); + + // Retain event to keep it alive until callback completes + err = clRetainEvent(unmapEvt); + checkCLError(err, "retain unmap event"); + + // Wait for unmap event using callback + CallbackContext unmapCtx; + unmapCtx.completed = false; + unmapCtx.timestamp = &t3; + err = clSetEventCallback(unmapEvt, CL_COMPLETE, unmapEventCallback, &unmapCtx); + checkCLError(err, "set unmap event callback"); + // Force queue flush to ensure event processing + err = clFlush(q); + checkCLError(err, "flush queue"); +std::cout <<"About to waitForCalllback for clEnqueueUnmapMemObject.\n"; + waitForCallback(unmapCtx); + checkCLError(unmapCtx.status, "unmap buffer"); + + // Release event after callback completes + err = clReleaseEvent(unmapEvt); + checkCLError(err, "release unmap event"); + + // -------------------- + // Host CPU split + auto cpuStart = std::chrono::high_resolution_clock::now(); + for (size_t pt = 0; pt < numPointsPerAssembly; ++pt) { + size_t off = pt * 16; + for (int i = 0; i < 12; ++i) + xyzHostCPU[pt*12 + i] = assemblyHost[off + i]; + for (int i = 0; i < 4; ++i) + iHostCPU[pt*4 + i] = assemblyHost[off + 12 + i]; + } + auto cpuEnd = std::chrono::high_resolution_clock::now(); + + std::chrono::duration mapElapsed = t1 - t0; + std::chrono::duration kernelElapsed = t2 - t1; + std::chrono::duration unmapElapsed = t3 - t2; + std::chrono::duration cpuElapsed = cpuEnd - cpuStart; + + std::cout << "Iteration " << iter + << " | Map: " << mapElapsed.count() + << " ms | Kernel: " << kernelElapsed.count() + << " ms | Unmap: " << unmapElapsed.count() + << " ms | CPU Split: " << cpuElapsed.count() << " ms\n"; + } + + // Cleanup + clReleaseKernel(kernel); + clReleaseProgram(prog); + clReleaseMemObject(assemblyBuf); + clReleaseMemObject(xyzBuf); + clReleaseMemObject(iBuf); + clReleaseCommandQueue(q); + clReleaseContext(ctx); + } + std::cout << std::endl; + } + return 0; +} +