Add some compile-time CL utilities
This commit is contained in:
@@ -0,0 +1,19 @@
|
||||
if(COMPILE_CL_CHECKS)
|
||||
find_package(OpenCL REQUIRED)
|
||||
|
||||
add_executable(clshmemlatency clshmemlatency.cpp)
|
||||
target_include_directories(clshmemlatency
|
||||
PUBLIC ${OpenCL_INCLUDE_DIRS})
|
||||
target_link_libraries(clshmemlatency
|
||||
${OpenCL_LIBRARY})
|
||||
add_executable(clshmemcheck clshmemcheck.cpp)
|
||||
target_include_directories(clshmemcheck
|
||||
PUBLIC ${OpenCL_INCLUDE_DIRS})
|
||||
target_link_libraries(clshmemcheck
|
||||
${OpenCL_LIBRARY})
|
||||
add_executable(clzerocopycheck clzerocopycheck.cpp)
|
||||
target_include_directories(clzerocopycheck
|
||||
PUBLIC ${OpenCL_INCLUDE_DIRS})
|
||||
target_link_libraries(clzerocopycheck
|
||||
${OpenCL_LIBRARY})
|
||||
endif()
|
||||
@@ -0,0 +1,90 @@
|
||||
#include <CL/cl.h>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
|
||||
void checkCLError(cl_int err, const char* msg) {
|
||||
if (err != CL_SUCCESS) {
|
||||
std::cerr << "OpenCL Error " << err << " at: " << msg << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
cl_uint numPlatforms = 0;
|
||||
checkCLError(clGetPlatformIDs(0, nullptr, &numPlatforms), "get num platforms");
|
||||
std::vector<cl_platform_id> platforms(numPlatforms);
|
||||
checkCLError(clGetPlatformIDs(numPlatforms, platforms.data(), nullptr), "get platforms");
|
||||
|
||||
std::cout << "Found " << numPlatforms << " OpenCL platforms\n\n";
|
||||
|
||||
for (cl_uint p = 0; p < numPlatforms; ++p) {
|
||||
char platformName[256];
|
||||
clGetPlatformInfo(platforms[p], CL_PLATFORM_NAME, sizeof(platformName), platformName, nullptr);
|
||||
std::cout << "Platform " << p << ": " << platformName << "\n";
|
||||
|
||||
cl_uint numDevices = 0;
|
||||
clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, 0, nullptr, &numDevices);
|
||||
std::vector<cl_device_id> devices(numDevices);
|
||||
clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, numDevices, devices.data(), nullptr);
|
||||
|
||||
for (cl_uint d = 0; d < numDevices; ++d) {
|
||||
char deviceName[256];
|
||||
clGetDeviceInfo(devices[d], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr);
|
||||
std::cout << " Device " << d << ": " << deviceName << "\n";
|
||||
|
||||
cl_bool unifiedMem = CL_FALSE;
|
||||
clGetDeviceInfo(devices[d], CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(unifiedMem), &unifiedMem, nullptr);
|
||||
std::cout << " Host-Device unified memory: " << (unifiedMem ? "Yes" : "No") << "\n";
|
||||
|
||||
#ifdef CL_DEVICE_SVM_CAPABILITIES
|
||||
cl_device_svm_capabilities svmCaps = 0;
|
||||
clGetDeviceInfo(devices[d], CL_DEVICE_SVM_CAPABILITIES, sizeof(svmCaps), &svmCaps, nullptr);
|
||||
std::cout << " SVM capabilities:\n";
|
||||
if (!svmCaps) std::cout << " None\n";
|
||||
if (svmCaps & CL_DEVICE_SVM_COARSE_GRAIN_BUFFER)
|
||||
std::cout << " - Coarse-grain buffer sharing\n";
|
||||
if (svmCaps & CL_DEVICE_SVM_FINE_GRAIN_BUFFER)
|
||||
std::cout << " - Fine-grain buffer sharing\n";
|
||||
if (svmCaps & CL_DEVICE_SVM_FINE_GRAIN_SYSTEM)
|
||||
std::cout << " - Fine-grain system sharing\n";
|
||||
if (svmCaps & CL_DEVICE_SVM_ATOMICS)
|
||||
std::cout << " - Atomics supported\n";
|
||||
#endif
|
||||
|
||||
// Optional runtime test: check if CL_MEM_USE_HOST_PTR buffer reuses pointer
|
||||
const size_t bufSize = 1024 * 1024;
|
||||
std::vector<char> hostBuffer(bufSize, 42);
|
||||
|
||||
cl_int err;
|
||||
cl_context ctx = clCreateContext(nullptr, 1, &devices[d], nullptr, nullptr, &err);
|
||||
checkCLError(err, "create context");
|
||||
|
||||
cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, bufSize, hostBuffer.data(), &err);
|
||||
checkCLError(err, "create buffer");
|
||||
|
||||
cl_command_queue q = clCreateCommandQueue(ctx, devices[d], 0, &err);
|
||||
checkCLError(err, "create queue");
|
||||
|
||||
// Simple host → device → host round-trip test
|
||||
cl_event evt;
|
||||
|
||||
auto start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
void* mapped = clEnqueueMapBuffer(q, buf, CL_TRUE, CL_MAP_READ, 0, bufSize, 0, nullptr, &evt, &err);
|
||||
checkCLError(err, "map buffer");
|
||||
clWaitForEvents(1, &evt);
|
||||
|
||||
clReleaseMemObject(buf);
|
||||
auto end = std::chrono::high_resolution_clock::now();
|
||||
std::chrono::duration<double, std::milli> elapsed = end - start;
|
||||
std::cout << " Map latency: " << elapsed.count() << " ms (lower → likely zero-copy)\n";
|
||||
|
||||
clReleaseCommandQueue(q);
|
||||
clReleaseContext(ctx);
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -0,0 +1,181 @@
|
||||
#include <CL/cl.h>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
#include <cstring>
|
||||
|
||||
void checkCLError(cl_int err, const char* msg) {
|
||||
if (err != CL_SUCCESS) {
|
||||
std::cerr << "OpenCL Error " << err << " at: " << msg << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
// --------------------
|
||||
// Kernel source
|
||||
// Simple mock kernel that simulates splitting XYZ/I
|
||||
// Each "point" is 16 bytes (XYZ + Intensity)
|
||||
const char* kernelSrc = R"CLC(
|
||||
__kernel void xyz_i_split(__global uchar* assembly,
|
||||
__global uchar* xyzOut,
|
||||
__global uchar* iOut,
|
||||
const uint numPoints) {
|
||||
uint gid = get_global_id(0);
|
||||
if (gid >= numPoints) return;
|
||||
|
||||
uint offset = gid * 16;
|
||||
// Copy XYZ (12 bytes) to xyzOut
|
||||
for (int i=0; i<12; ++i)
|
||||
xyzOut[gid*12 + i] = assembly[offset + i];
|
||||
|
||||
// Copy Intensity (4 bytes) to iOut
|
||||
for (int i=0; i<4; ++i)
|
||||
iOut[gid*4 + i] = assembly[offset + 12 + i];
|
||||
}
|
||||
)CLC";
|
||||
|
||||
int main() {
|
||||
// --------------------
|
||||
// CHANGE THIS VALUE to set number of points per assembly buffer
|
||||
const size_t numPointsPerAssembly = 100000; // e.g., ~3333 points per fill
|
||||
const size_t bytesPerPoint = 16; // 12 bytes XYZ + 4 bytes I
|
||||
|
||||
const size_t assemblyBufSize = numPointsPerAssembly * bytesPerPoint;
|
||||
const size_t xyzBufSize = numPointsPerAssembly * 12;
|
||||
const size_t iBufSize = numPointsPerAssembly * 4;
|
||||
|
||||
cl_uint numPlatforms = 0;
|
||||
checkCLError(clGetPlatformIDs(0, nullptr, &numPlatforms), "get num platforms");
|
||||
std::vector<cl_platform_id> platforms(numPlatforms);
|
||||
checkCLError(clGetPlatformIDs(numPlatforms, platforms.data(), nullptr), "get platforms");
|
||||
|
||||
std::cout << "Found " << numPlatforms << " OpenCL platforms\n\n";
|
||||
|
||||
for (cl_uint p = 0; p < numPlatforms; ++p) {
|
||||
char platformName[256];
|
||||
clGetPlatformInfo(platforms[p], CL_PLATFORM_NAME, sizeof(platformName), platformName, nullptr);
|
||||
std::cout << "Platform " << p << ": " << platformName << "\n";
|
||||
|
||||
cl_uint numDevices = 0;
|
||||
clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, 0, nullptr, &numDevices);
|
||||
std::vector<cl_device_id> devices(numDevices);
|
||||
clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, numDevices, devices.data(), nullptr);
|
||||
|
||||
for (cl_uint d = 0; d < numDevices; ++d) {
|
||||
char deviceName[256];
|
||||
clGetDeviceInfo(devices[d], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr);
|
||||
std::cout << " Device " << d << ": " << deviceName << "\n";
|
||||
|
||||
cl_int err;
|
||||
cl_context ctx = clCreateContext(nullptr, 1, &devices[d], nullptr, nullptr, &err);
|
||||
checkCLError(err, "create context");
|
||||
|
||||
cl_command_queue q = clCreateCommandQueue(ctx, devices[d], 0, &err);
|
||||
checkCLError(err, "create queue");
|
||||
|
||||
// --------------------
|
||||
// Allocate host buffers
|
||||
std::vector<unsigned char> assemblyHost(assemblyBufSize, 42);
|
||||
std::vector<unsigned char> xyzHost(xyzBufSize, 0);
|
||||
std::vector<unsigned char> iHost(iBufSize, 0);
|
||||
|
||||
std::vector<unsigned char> xyzHostCPU(xyzBufSize, 0);
|
||||
std::vector<unsigned char> iHostCPU(iBufSize, 0);
|
||||
|
||||
// Create CL buffers
|
||||
cl_mem assemblyBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, assemblyBufSize, assemblyHost.data(), &err);
|
||||
checkCLError(err, "create assembly buffer");
|
||||
cl_mem xyzBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, xyzBufSize, xyzHost.data(), &err);
|
||||
checkCLError(err, "create xyz buffer");
|
||||
cl_mem iBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, iBufSize, iHost.data(), &err);
|
||||
checkCLError(err, "create i buffer");
|
||||
|
||||
// Build program
|
||||
cl_program prog = clCreateProgramWithSource(ctx, 1, &kernelSrc, nullptr, &err);
|
||||
checkCLError(err, "create program");
|
||||
|
||||
err = clBuildProgram(prog, 1, &devices[d], nullptr, nullptr, nullptr);
|
||||
if (err != CL_SUCCESS) {
|
||||
// Print build log
|
||||
size_t logSize = 0;
|
||||
clGetProgramBuildInfo(prog, devices[d], CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize);
|
||||
std::vector<char> log(logSize);
|
||||
clGetProgramBuildInfo(prog, devices[d], CL_PROGRAM_BUILD_LOG, logSize, log.data(), nullptr);
|
||||
std::cerr << log.data() << "\n";
|
||||
}
|
||||
checkCLError(err, "build program");
|
||||
|
||||
cl_kernel kernel = clCreateKernel(prog, "xyz_i_split", &err);
|
||||
checkCLError(err, "create kernel");
|
||||
|
||||
// Set kernel args
|
||||
clSetKernelArg(kernel, 0, sizeof(cl_mem), &assemblyBuf);
|
||||
clSetKernelArg(kernel, 1, sizeof(cl_mem), &xyzBuf);
|
||||
clSetKernelArg(kernel, 2, sizeof(cl_mem), &iBuf);
|
||||
clSetKernelArg(kernel, 3, sizeof(cl_uint), &numPointsPerAssembly);
|
||||
|
||||
const size_t globalWorkSize = numPointsPerAssembly;
|
||||
|
||||
// --------------------
|
||||
// Run a few iterations
|
||||
for (int iter = 0; iter < 5; ++iter) {
|
||||
cl_event evt;
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
void* mappedAssembly = clEnqueueMapBuffer(q, assemblyBuf, CL_TRUE, CL_MAP_READ, 0, assemblyBufSize, 0, nullptr, &evt, &err);
|
||||
checkCLError(err, "map assembly buffer");
|
||||
clWaitForEvents(1, &evt);
|
||||
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
err = clEnqueueNDRangeKernel(q, kernel, 1, nullptr, &globalWorkSize, nullptr, 0, nullptr, &evt);
|
||||
checkCLError(err, "enqueue kernel");
|
||||
clWaitForEvents(1, &evt);
|
||||
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
cl_event unmapEvt;
|
||||
err = clEnqueueUnmapMemObject(q, assemblyBuf, mappedAssembly, 0, nullptr, &unmapEvt);
|
||||
checkCLError(err, "unmap assembly buffer");
|
||||
clWaitForEvents(1, &unmapEvt);
|
||||
|
||||
auto t3 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// --------------------
|
||||
// Host CPU split
|
||||
auto cpuStart = std::chrono::high_resolution_clock::now();
|
||||
for (size_t pt = 0; pt < numPointsPerAssembly; ++pt) {
|
||||
size_t off = pt * 16;
|
||||
for (int i = 0; i < 12; ++i)
|
||||
xyzHostCPU[pt*12 + i] = assemblyHost[off + i];
|
||||
for (int i = 0; i < 4; ++i)
|
||||
iHostCPU[pt*4 + i] = assemblyHost[off + 12 + i];
|
||||
}
|
||||
auto cpuEnd = std::chrono::high_resolution_clock::now();
|
||||
|
||||
std::chrono::duration<double, std::milli> mapElapsed = t1 - t0;
|
||||
std::chrono::duration<double, std::milli> kernelElapsed = t2 - t1;
|
||||
std::chrono::duration<double, std::milli> unmapElapsed = t3 - t2;
|
||||
std::chrono::duration<double, std::milli> cpuElapsed = cpuEnd - cpuStart;
|
||||
|
||||
std::cout << "Iteration " << iter
|
||||
<< " | Map: " << mapElapsed.count()
|
||||
<< " ms | Kernel: " << kernelElapsed.count()
|
||||
<< " ms | Unmap: " << unmapElapsed.count()
|
||||
<< " ms | CPU Split: " << cpuElapsed.count() << " ms\n";
|
||||
}
|
||||
|
||||
// Cleanup
|
||||
clReleaseKernel(kernel);
|
||||
clReleaseProgram(prog);
|
||||
clReleaseMemObject(assemblyBuf);
|
||||
clReleaseMemObject(xyzBuf);
|
||||
clReleaseMemObject(iBuf);
|
||||
clReleaseCommandQueue(q);
|
||||
clReleaseContext(ctx);
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -0,0 +1,115 @@
|
||||
#include <CL/cl.h>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
#define CHECK(err, msg) \
|
||||
if (err != CL_SUCCESS) { \
|
||||
std::cerr << "ERROR: " << msg << " (" << err << ")\n"; \
|
||||
return 1; \
|
||||
}
|
||||
|
||||
const char *kernelSrc = R"CLC(
|
||||
__kernel void check_shared(__global const int* in, __global int* out) {
|
||||
int gid = get_global_id(0);
|
||||
out[gid] = in[gid] + 42; // simple deterministic transform
|
||||
}
|
||||
)CLC";
|
||||
|
||||
int main() {
|
||||
cl_int err;
|
||||
|
||||
// Pick first available device
|
||||
cl_uint numPlatforms;
|
||||
CHECK(clGetPlatformIDs(0, nullptr, &numPlatforms), "clGetPlatformIDs count");
|
||||
std::vector<cl_platform_id> plats(numPlatforms);
|
||||
CHECK(clGetPlatformIDs(numPlatforms, plats.data(), nullptr), "clGetPlatformIDs");
|
||||
|
||||
cl_platform_id plat = plats[0];
|
||||
cl_device_id dev;
|
||||
CHECK(clGetDeviceIDs(plat, CL_DEVICE_TYPE_GPU, 1, &dev, nullptr), "clGetDeviceIDs");
|
||||
|
||||
cl_context ctx = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &err);
|
||||
CHECK(err, "clCreateContext");
|
||||
|
||||
cl_command_queue q = clCreateCommandQueue(ctx, dev, 0, &err);
|
||||
CHECK(err, "clCreateCommandQueue");
|
||||
|
||||
// Create program and kernel
|
||||
const size_t srcLen = std::strlen(kernelSrc);
|
||||
cl_program prog = clCreateProgramWithSource(ctx, 1, &kernelSrc, &srcLen, &err);
|
||||
CHECK(err, "clCreateProgramWithSource");
|
||||
|
||||
err = clBuildProgram(prog, 1, &dev, nullptr, nullptr, nullptr);
|
||||
if (err != CL_SUCCESS) {
|
||||
size_t logSize;
|
||||
clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize);
|
||||
std::vector<char> log(logSize);
|
||||
clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, logSize, log.data(), nullptr);
|
||||
std::cerr << "--- Build Log ---\n" << log.data() << "\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
cl_kernel krn = clCreateKernel(prog, "check_shared", &err);
|
||||
CHECK(err, "clCreateKernel");
|
||||
|
||||
const size_t N = 8;
|
||||
size_t bufSize = N * sizeof(int);
|
||||
|
||||
// Allocate host-visible buffer
|
||||
cl_mem bufIn = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufSize, nullptr, &err);
|
||||
CHECK(err, "clCreateBuffer input");
|
||||
cl_mem bufOut = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufSize, nullptr, &err);
|
||||
CHECK(err, "clCreateBuffer output");
|
||||
|
||||
// Map the buffer (should return pointer to real host memory if unified)
|
||||
int* hostPtr = (int*)clEnqueueMapBuffer(q, bufIn, CL_TRUE, CL_MAP_WRITE, 0, bufSize, 0, nullptr, nullptr, &err);
|
||||
CHECK(err, "clEnqueueMapBuffer");
|
||||
|
||||
std::cout << "Mapped host pointer: " << static_cast<void*>(hostPtr) << "\n";
|
||||
|
||||
// Write pattern directly into mapped memory
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
hostPtr[i] = 100 + i;
|
||||
|
||||
// No clEnqueueWriteBuffer call! We rely on shared memory.
|
||||
clEnqueueUnmapMemObject(q, bufIn, hostPtr, 0, nullptr, nullptr);
|
||||
clFinish(q);
|
||||
|
||||
// Set kernel args
|
||||
clSetKernelArg(krn, 0, sizeof(cl_mem), &bufIn);
|
||||
clSetKernelArg(krn, 1, sizeof(cl_mem), &bufOut);
|
||||
|
||||
size_t global = N;
|
||||
err = clEnqueueNDRangeKernel(q, krn, 1, nullptr, &global, nullptr, 0, nullptr, nullptr);
|
||||
CHECK(err, "clEnqueueNDRangeKernel");
|
||||
clFinish(q);
|
||||
|
||||
// Read back result
|
||||
int* outPtr = (int*)clEnqueueMapBuffer(q, bufOut, CL_TRUE, CL_MAP_READ, 0, bufSize, 0, nullptr, nullptr, &err);
|
||||
CHECK(err, "map output");
|
||||
|
||||
std::cout << "Result: ";
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
std::cout << outPtr[i] << " ";
|
||||
std::cout << "\n";
|
||||
|
||||
// Validate
|
||||
bool ok = true;
|
||||
for (size_t i = 0; i < N; ++i)
|
||||
if (outPtr[i] != 142 + i) ok = false;
|
||||
|
||||
std::cout << (ok ? "✅ GPU saw host writes (zero-copy confirmed)\n"
|
||||
: "❌ GPU did not see host writes (copying or staging occurred)\n");
|
||||
|
||||
clEnqueueUnmapMemObject(q, bufOut, outPtr, 0, nullptr, nullptr);
|
||||
clFinish(q);
|
||||
|
||||
clReleaseMemObject(bufIn);
|
||||
clReleaseMemObject(bufOut);
|
||||
clReleaseKernel(krn);
|
||||
clReleaseProgram(prog);
|
||||
clReleaseCommandQueue(q);
|
||||
clReleaseContext(ctx);
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user