From 0e872042ee4a792ef3f74b97d2e21b916517a03d Mon Sep 17 00:00:00 2001 From: Latent Prion Date: Sat, 25 Oct 2025 03:39:42 -0400 Subject: [PATCH] Add some compile-time CL utilities --- CMakeLists.txt | 1 + compile/CMakeLists.txt | 19 ++++ compile/clshmemcheck.cpp | 90 ++++++++++++++++++ compile/clshmemlatency.cpp | 181 ++++++++++++++++++++++++++++++++++++ compile/clzerocopycheck.cpp | 115 +++++++++++++++++++++++ 5 files changed, 406 insertions(+) create mode 100644 compile/CMakeLists.txt create mode 100644 compile/clshmemcheck.cpp create mode 100644 compile/clshmemlatency.cpp create mode 100644 compile/clzerocopycheck.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 02d06b2..f135fbe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,6 +85,7 @@ endif() if(ENABLE_TESTS) add_subdirectory(third_party) endif() +add_subdirectory(compile) # Add core components add_subdirectory(smocore) add_subdirectory(commonLibs) diff --git a/compile/CMakeLists.txt b/compile/CMakeLists.txt new file mode 100644 index 0000000..0d68400 --- /dev/null +++ b/compile/CMakeLists.txt @@ -0,0 +1,19 @@ +if(COMPILE_CL_CHECKS) + find_package(OpenCL REQUIRED) + + add_executable(clshmemlatency clshmemlatency.cpp) + target_include_directories(clshmemlatency + PUBLIC ${OpenCL_INCLUDE_DIRS}) + target_link_libraries(clshmemlatency + ${OpenCL_LIBRARY}) + add_executable(clshmemcheck clshmemcheck.cpp) + target_include_directories(clshmemcheck + PUBLIC ${OpenCL_INCLUDE_DIRS}) + target_link_libraries(clshmemcheck + ${OpenCL_LIBRARY}) + add_executable(clzerocopycheck clzerocopycheck.cpp) + target_include_directories(clzerocopycheck + PUBLIC ${OpenCL_INCLUDE_DIRS}) + target_link_libraries(clzerocopycheck + ${OpenCL_LIBRARY}) +endif() diff --git a/compile/clshmemcheck.cpp b/compile/clshmemcheck.cpp new file mode 100644 index 0000000..8e291c1 --- /dev/null +++ b/compile/clshmemcheck.cpp @@ -0,0 +1,90 @@ +#include +#include +#include +#include +#include + +void checkCLError(cl_int err, const char* msg) { + if (err != CL_SUCCESS) { + std::cerr << "OpenCL Error " << err << " at: " << msg << std::endl; + exit(1); + } +} + +int main() { + cl_uint numPlatforms = 0; + checkCLError(clGetPlatformIDs(0, nullptr, &numPlatforms), "get num platforms"); + std::vector platforms(numPlatforms); + checkCLError(clGetPlatformIDs(numPlatforms, platforms.data(), nullptr), "get platforms"); + + std::cout << "Found " << numPlatforms << " OpenCL platforms\n\n"; + + for (cl_uint p = 0; p < numPlatforms; ++p) { + char platformName[256]; + clGetPlatformInfo(platforms[p], CL_PLATFORM_NAME, sizeof(platformName), platformName, nullptr); + std::cout << "Platform " << p << ": " << platformName << "\n"; + + cl_uint numDevices = 0; + clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, 0, nullptr, &numDevices); + std::vector devices(numDevices); + clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, numDevices, devices.data(), nullptr); + + for (cl_uint d = 0; d < numDevices; ++d) { + char deviceName[256]; + clGetDeviceInfo(devices[d], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr); + std::cout << " Device " << d << ": " << deviceName << "\n"; + + cl_bool unifiedMem = CL_FALSE; + clGetDeviceInfo(devices[d], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(unifiedMem), &unifiedMem, nullptr); + std::cout << " Host-Device unified memory: " << (unifiedMem ? "Yes" : "No") << "\n"; + +#ifdef CL_DEVICE_SVM_CAPABILITIES + cl_device_svm_capabilities svmCaps = 0; + clGetDeviceInfo(devices[d], CL_DEVICE_SVM_CAPABILITIES, sizeof(svmCaps), &svmCaps, nullptr); + std::cout << " SVM capabilities:\n"; + if (!svmCaps) std::cout << " None\n"; + if (svmCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER) + std::cout << " - Coarse-grain buffer sharing\n"; + if (svmCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER) + std::cout << " - Fine-grain buffer sharing\n"; + if (svmCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM) + std::cout << " - Fine-grain system sharing\n"; + if (svmCaps & CL_DEVICE_SVM_ATOMICS) + std::cout << " - Atomics supported\n"; +#endif + + // Optional runtime test: check if CL_MEM_USE_HOST_PTR buffer reuses pointer + const size_t bufSize = 1024 * 1024; + std::vector hostBuffer(bufSize, 42); + + cl_int err; + cl_context ctx = clCreateContext(nullptr, 1, &devices[d], nullptr, nullptr, &err); + checkCLError(err, "create context"); + + cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, bufSize, hostBuffer.data(), &err); + checkCLError(err, "create buffer"); + + cl_command_queue q = clCreateCommandQueue(ctx, devices[d], 0, &err); + checkCLError(err, "create queue"); + + // Simple host → device → host round-trip test + cl_event evt; + + auto start = std::chrono::high_resolution_clock::now(); + + void* mapped = clEnqueueMapBuffer(q, buf, CL_TRUE, CL_MAP_READ, 0, bufSize, 0, nullptr, &evt, &err); + checkCLError(err, "map buffer"); + clWaitForEvents(1, &evt); + + clReleaseMemObject(buf); + auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed = end - start; + std::cout << " Map latency: " << elapsed.count() << " ms (lower → likely zero-copy)\n"; + + clReleaseCommandQueue(q); + clReleaseContext(ctx); + } + std::cout << std::endl; + } + return 0; +} diff --git a/compile/clshmemlatency.cpp b/compile/clshmemlatency.cpp new file mode 100644 index 0000000..e233dad --- /dev/null +++ b/compile/clshmemlatency.cpp @@ -0,0 +1,181 @@ +#include +#include +#include +#include +#include + +void checkCLError(cl_int err, const char* msg) { + if (err != CL_SUCCESS) { + std::cerr << "OpenCL Error " << err << " at: " << msg << std::endl; + exit(1); + } +} + +// -------------------- +// Kernel source +// Simple mock kernel that simulates splitting XYZ/I +// Each "point" is 16 bytes (XYZ + Intensity) +const char* kernelSrc = R"CLC( +__kernel void xyz_i_split(__global uchar* assembly, + __global uchar* xyzOut, + __global uchar* iOut, + const uint numPoints) { + uint gid = get_global_id(0); + if (gid >= numPoints) return; + + uint offset = gid * 16; + // Copy XYZ (12 bytes) to xyzOut + for (int i=0; i<12; ++i) + xyzOut[gid*12 + i] = assembly[offset + i]; + + // Copy Intensity (4 bytes) to iOut + for (int i=0; i<4; ++i) + iOut[gid*4 + i] = assembly[offset + 12 + i]; +} +)CLC"; + +int main() { + // -------------------- + // CHANGE THIS VALUE to set number of points per assembly buffer + const size_t numPointsPerAssembly = 100000; // e.g., ~3333 points per fill + const size_t bytesPerPoint = 16; // 12 bytes XYZ + 4 bytes I + + const size_t assemblyBufSize = numPointsPerAssembly * bytesPerPoint; + const size_t xyzBufSize = numPointsPerAssembly * 12; + const size_t iBufSize = numPointsPerAssembly * 4; + + cl_uint numPlatforms = 0; + checkCLError(clGetPlatformIDs(0, nullptr, &numPlatforms), "get num platforms"); + std::vector platforms(numPlatforms); + checkCLError(clGetPlatformIDs(numPlatforms, platforms.data(), nullptr), "get platforms"); + + std::cout << "Found " << numPlatforms << " OpenCL platforms\n\n"; + + for (cl_uint p = 0; p < numPlatforms; ++p) { + char platformName[256]; + clGetPlatformInfo(platforms[p], CL_PLATFORM_NAME, sizeof(platformName), platformName, nullptr); + std::cout << "Platform " << p << ": " << platformName << "\n"; + + cl_uint numDevices = 0; + clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, 0, nullptr, &numDevices); + std::vector devices(numDevices); + clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, numDevices, devices.data(), nullptr); + + for (cl_uint d = 0; d < numDevices; ++d) { + char deviceName[256]; + clGetDeviceInfo(devices[d], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr); + std::cout << " Device " << d << ": " << deviceName << "\n"; + + cl_int err; + cl_context ctx = clCreateContext(nullptr, 1, &devices[d], nullptr, nullptr, &err); + checkCLError(err, "create context"); + + cl_command_queue q = clCreateCommandQueue(ctx, devices[d], 0, &err); + checkCLError(err, "create queue"); + + // -------------------- + // Allocate host buffers + std::vector assemblyHost(assemblyBufSize, 42); + std::vector xyzHost(xyzBufSize, 0); + std::vector iHost(iBufSize, 0); + + std::vector xyzHostCPU(xyzBufSize, 0); + std::vector iHostCPU(iBufSize, 0); + + // Create CL buffers + cl_mem assemblyBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, assemblyBufSize, assemblyHost.data(), &err); + checkCLError(err, "create assembly buffer"); + cl_mem xyzBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, xyzBufSize, xyzHost.data(), &err); + checkCLError(err, "create xyz buffer"); + cl_mem iBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, iBufSize, iHost.data(), &err); + checkCLError(err, "create i buffer"); + + // Build program + cl_program prog = clCreateProgramWithSource(ctx, 1, &kernelSrc, nullptr, &err); + checkCLError(err, "create program"); + + err = clBuildProgram(prog, 1, &devices[d], nullptr, nullptr, nullptr); + if (err != CL_SUCCESS) { + // Print build log + size_t logSize = 0; + clGetProgramBuildInfo(prog, devices[d], CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize); + std::vector log(logSize); + clGetProgramBuildInfo(prog, devices[d], CL_PROGRAM_BUILD_LOG, logSize, log.data(), nullptr); + std::cerr << log.data() << "\n"; + } + checkCLError(err, "build program"); + + cl_kernel kernel = clCreateKernel(prog, "xyz_i_split", &err); + checkCLError(err, "create kernel"); + + // Set kernel args + clSetKernelArg(kernel, 0, sizeof(cl_mem), &assemblyBuf); + clSetKernelArg(kernel, 1, sizeof(cl_mem), &xyzBuf); + clSetKernelArg(kernel, 2, sizeof(cl_mem), &iBuf); + clSetKernelArg(kernel, 3, sizeof(cl_uint), &numPointsPerAssembly); + + const size_t globalWorkSize = numPointsPerAssembly; + + // -------------------- + // Run a few iterations + for (int iter = 0; iter < 5; ++iter) { + cl_event evt; + auto t0 = std::chrono::high_resolution_clock::now(); + + void* mappedAssembly = clEnqueueMapBuffer(q, assemblyBuf, CL_TRUE, CL_MAP_READ, 0, assemblyBufSize, 0, nullptr, &evt, &err); + checkCLError(err, "map assembly buffer"); + clWaitForEvents(1, &evt); + + auto t1 = std::chrono::high_resolution_clock::now(); + + err = clEnqueueNDRangeKernel(q, kernel, 1, nullptr, &globalWorkSize, nullptr, 0, nullptr, &evt); + checkCLError(err, "enqueue kernel"); + clWaitForEvents(1, &evt); + + auto t2 = std::chrono::high_resolution_clock::now(); + + cl_event unmapEvt; + err = clEnqueueUnmapMemObject(q, assemblyBuf, mappedAssembly, 0, nullptr, &unmapEvt); + checkCLError(err, "unmap assembly buffer"); + clWaitForEvents(1, &unmapEvt); + + auto t3 = std::chrono::high_resolution_clock::now(); + + // -------------------- + // Host CPU split + auto cpuStart = std::chrono::high_resolution_clock::now(); + for (size_t pt = 0; pt < numPointsPerAssembly; ++pt) { + size_t off = pt * 16; + for (int i = 0; i < 12; ++i) + xyzHostCPU[pt*12 + i] = assemblyHost[off + i]; + for (int i = 0; i < 4; ++i) + iHostCPU[pt*4 + i] = assemblyHost[off + 12 + i]; + } + auto cpuEnd = std::chrono::high_resolution_clock::now(); + + std::chrono::duration mapElapsed = t1 - t0; + std::chrono::duration kernelElapsed = t2 - t1; + std::chrono::duration unmapElapsed = t3 - t2; + std::chrono::duration cpuElapsed = cpuEnd - cpuStart; + + std::cout << "Iteration " << iter + << " | Map: " << mapElapsed.count() + << " ms | Kernel: " << kernelElapsed.count() + << " ms | Unmap: " << unmapElapsed.count() + << " ms | CPU Split: " << cpuElapsed.count() << " ms\n"; + } + + // Cleanup + clReleaseKernel(kernel); + clReleaseProgram(prog); + clReleaseMemObject(assemblyBuf); + clReleaseMemObject(xyzBuf); + clReleaseMemObject(iBuf); + clReleaseCommandQueue(q); + clReleaseContext(ctx); + } + std::cout << std::endl; + } + return 0; +} + diff --git a/compile/clzerocopycheck.cpp b/compile/clzerocopycheck.cpp new file mode 100644 index 0000000..6b219e5 --- /dev/null +++ b/compile/clzerocopycheck.cpp @@ -0,0 +1,115 @@ +#include +#include +#include +#include + +#define CHECK(err, msg) \ + if (err != CL_SUCCESS) { \ + std::cerr << "ERROR: " << msg << " (" << err << ")\n"; \ + return 1; \ + } + +const char *kernelSrc = R"CLC( +__kernel void check_shared(__global const int* in, __global int* out) { + int gid = get_global_id(0); + out[gid] = in[gid] + 42; // simple deterministic transform +} +)CLC"; + +int main() { + cl_int err; + + // Pick first available device + cl_uint numPlatforms; + CHECK(clGetPlatformIDs(0, nullptr, &numPlatforms), "clGetPlatformIDs count"); + std::vector plats(numPlatforms); + CHECK(clGetPlatformIDs(numPlatforms, plats.data(), nullptr), "clGetPlatformIDs"); + + cl_platform_id plat = plats[0]; + cl_device_id dev; + CHECK(clGetDeviceIDs(plat, CL_DEVICE_TYPE_GPU, 1, &dev, nullptr), "clGetDeviceIDs"); + + cl_context ctx = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &err); + CHECK(err, "clCreateContext"); + + cl_command_queue q = clCreateCommandQueue(ctx, dev, 0, &err); + CHECK(err, "clCreateCommandQueue"); + + // Create program and kernel + const size_t srcLen = std::strlen(kernelSrc); + cl_program prog = clCreateProgramWithSource(ctx, 1, &kernelSrc, &srcLen, &err); + CHECK(err, "clCreateProgramWithSource"); + + err = clBuildProgram(prog, 1, &dev, nullptr, nullptr, nullptr); + if (err != CL_SUCCESS) { + size_t logSize; + clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize); + std::vector log(logSize); + clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, logSize, log.data(), nullptr); + std::cerr << "--- Build Log ---\n" << log.data() << "\n"; + return 1; + } + + cl_kernel krn = clCreateKernel(prog, "check_shared", &err); + CHECK(err, "clCreateKernel"); + + const size_t N = 8; + size_t bufSize = N * sizeof(int); + + // Allocate host-visible buffer + cl_mem bufIn = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufSize, nullptr, &err); + CHECK(err, "clCreateBuffer input"); + cl_mem bufOut = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufSize, nullptr, &err); + CHECK(err, "clCreateBuffer output"); + + // Map the buffer (should return pointer to real host memory if unified) + int* hostPtr = (int*)clEnqueueMapBuffer(q, bufIn, CL_TRUE, CL_MAP_WRITE, 0, bufSize, 0, nullptr, nullptr, &err); + CHECK(err, "clEnqueueMapBuffer"); + + std::cout << "Mapped host pointer: " << static_cast(hostPtr) << "\n"; + + // Write pattern directly into mapped memory + for (size_t i = 0; i < N; ++i) + hostPtr[i] = 100 + i; + + // No clEnqueueWriteBuffer call! We rely on shared memory. + clEnqueueUnmapMemObject(q, bufIn, hostPtr, 0, nullptr, nullptr); + clFinish(q); + + // Set kernel args + clSetKernelArg(krn, 0, sizeof(cl_mem), &bufIn); + clSetKernelArg(krn, 1, sizeof(cl_mem), &bufOut); + + size_t global = N; + err = clEnqueueNDRangeKernel(q, krn, 1, nullptr, &global, nullptr, 0, nullptr, nullptr); + CHECK(err, "clEnqueueNDRangeKernel"); + clFinish(q); + + // Read back result + int* outPtr = (int*)clEnqueueMapBuffer(q, bufOut, CL_TRUE, CL_MAP_READ, 0, bufSize, 0, nullptr, nullptr, &err); + CHECK(err, "map output"); + + std::cout << "Result: "; + for (size_t i = 0; i < N; ++i) + std::cout << outPtr[i] << " "; + std::cout << "\n"; + + // Validate + bool ok = true; + for (size_t i = 0; i < N; ++i) + if (outPtr[i] != 142 + i) ok = false; + + std::cout << (ok ? "✅ GPU saw host writes (zero-copy confirmed)\n" + : "❌ GPU did not see host writes (copying or staging occurred)\n"); + + clEnqueueUnmapMemObject(q, bufOut, outPtr, 0, nullptr, nullptr); + clFinish(q); + + clReleaseMemObject(bufIn); + clReleaseMemObject(bufOut); + clReleaseKernel(krn); + clReleaseProgram(prog); + clReleaseCommandQueue(q); + clReleaseContext(ctx); + return 0; +}