Files

182 lines
7.6 KiB
C++

#include <CL/cl.h>
#include <iostream>
#include <vector>
#include <chrono>
#include <cstring>
void checkCLError(cl_int err, const char* msg) {
if (err != CL_SUCCESS) {
std::cerr << "OpenCL Error " << err << " at: " << msg << std::endl;
exit(1);
}
}
// --------------------
// Kernel source
// Simple mock kernel that simulates splitting XYZ/I
// Each "point" is 16 bytes (XYZ + Intensity)
const char* kernelSrc = R"CLC(
__kernel void xyz_i_split(__global uchar* assembly,
__global uchar* xyzOut,
__global uchar* iOut,
const uint numPoints) {
uint gid = get_global_id(0);
if (gid >= numPoints) return;
uint offset = gid * 16;
// Copy XYZ (12 bytes) to xyzOut
for (int i=0; i<12; ++i)
xyzOut[gid*12 + i] = assembly[offset + i];
// Copy Intensity (4 bytes) to iOut
for (int i=0; i<4; ++i)
iOut[gid*4 + i] = assembly[offset + 12 + i];
}
)CLC";
int main() {
// --------------------
// CHANGE THIS VALUE to set number of points per assembly buffer
const size_t numPointsPerAssembly = 100000; // e.g., ~3333 points per fill
const size_t bytesPerPoint = 16; // 12 bytes XYZ + 4 bytes I
const size_t assemblyBufSize = numPointsPerAssembly * bytesPerPoint;
const size_t xyzBufSize = numPointsPerAssembly * 12;
const size_t iBufSize = numPointsPerAssembly * 4;
cl_uint numPlatforms = 0;
checkCLError(clGetPlatformIDs(0, nullptr, &numPlatforms), "get num platforms");
std::vector<cl_platform_id> platforms(numPlatforms);
checkCLError(clGetPlatformIDs(numPlatforms, platforms.data(), nullptr), "get platforms");
std::cout << "Found " << numPlatforms << " OpenCL platforms\n\n";
for (cl_uint p = 0; p < numPlatforms; ++p) {
char platformName[256];
clGetPlatformInfo(platforms[p], CL_PLATFORM_NAME, sizeof(platformName), platformName, nullptr);
std::cout << "Platform " << p << ": " << platformName << "\n";
cl_uint numDevices = 0;
clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, 0, nullptr, &numDevices);
std::vector<cl_device_id> devices(numDevices);
clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, numDevices, devices.data(), nullptr);
for (cl_uint d = 0; d < numDevices; ++d) {
char deviceName[256];
clGetDeviceInfo(devices[d], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr);
std::cout << " Device " << d << ": " << deviceName << "\n";
cl_int err;
cl_context ctx = clCreateContext(nullptr, 1, &devices[d], nullptr, nullptr, &err);
checkCLError(err, "create context");
cl_command_queue q = clCreateCommandQueue(ctx, devices[d], 0, &err);
checkCLError(err, "create queue");
// --------------------
// Allocate host buffers
std::vector<unsigned char> assemblyHost(assemblyBufSize, 42);
std::vector<unsigned char> xyzHost(xyzBufSize, 0);
std::vector<unsigned char> iHost(iBufSize, 0);
std::vector<unsigned char> xyzHostCPU(xyzBufSize, 0);
std::vector<unsigned char> iHostCPU(iBufSize, 0);
// Create CL buffers
cl_mem assemblyBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, assemblyBufSize, assemblyHost.data(), &err);
checkCLError(err, "create assembly buffer");
cl_mem xyzBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, xyzBufSize, xyzHost.data(), &err);
checkCLError(err, "create xyz buffer");
cl_mem iBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, iBufSize, iHost.data(), &err);
checkCLError(err, "create i buffer");
// Build program
cl_program prog = clCreateProgramWithSource(ctx, 1, &kernelSrc, nullptr, &err);
checkCLError(err, "create program");
err = clBuildProgram(prog, 1, &devices[d], nullptr, nullptr, nullptr);
if (err != CL_SUCCESS) {
// Print build log
size_t logSize = 0;
clGetProgramBuildInfo(prog, devices[d], CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize);
std::vector<char> log(logSize);
clGetProgramBuildInfo(prog, devices[d], CL_PROGRAM_BUILD_LOG, logSize, log.data(), nullptr);
std::cerr << log.data() << "\n";
}
checkCLError(err, "build program");
cl_kernel kernel = clCreateKernel(prog, "xyz_i_split", &err);
checkCLError(err, "create kernel");
// Set kernel args
clSetKernelArg(kernel, 0, sizeof(cl_mem), &assemblyBuf);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &xyzBuf);
clSetKernelArg(kernel, 2, sizeof(cl_mem), &iBuf);
clSetKernelArg(kernel, 3, sizeof(cl_uint), &numPointsPerAssembly);
const size_t globalWorkSize = numPointsPerAssembly;
// --------------------
// Run a few iterations
for (int iter = 0; iter < 5; ++iter) {
cl_event evt;
auto t0 = std::chrono::high_resolution_clock::now();
void* mappedAssembly = clEnqueueMapBuffer(q, assemblyBuf, CL_TRUE, CL_MAP_READ, 0, assemblyBufSize, 0, nullptr, &evt, &err);
checkCLError(err, "map assembly buffer");
clWaitForEvents(1, &evt);
auto t1 = std::chrono::high_resolution_clock::now();
err = clEnqueueNDRangeKernel(q, kernel, 1, nullptr, &globalWorkSize, nullptr, 0, nullptr, &evt);
checkCLError(err, "enqueue kernel");
clWaitForEvents(1, &evt);
auto t2 = std::chrono::high_resolution_clock::now();
cl_event unmapEvt;
err = clEnqueueUnmapMemObject(q, assemblyBuf, mappedAssembly, 0, nullptr, &unmapEvt);
checkCLError(err, "unmap assembly buffer");
clWaitForEvents(1, &unmapEvt);
auto t3 = std::chrono::high_resolution_clock::now();
// --------------------
// Host CPU split
auto cpuStart = std::chrono::high_resolution_clock::now();
for (size_t pt = 0; pt < numPointsPerAssembly; ++pt) {
size_t off = pt * 16;
for (int i = 0; i < 12; ++i)
xyzHostCPU[pt*12 + i] = assemblyHost[off + i];
for (int i = 0; i < 4; ++i)
iHostCPU[pt*4 + i] = assemblyHost[off + 12 + i];
}
auto cpuEnd = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::milli> mapElapsed = t1 - t0;
std::chrono::duration<double, std::milli> kernelElapsed = t2 - t1;
std::chrono::duration<double, std::milli> unmapElapsed = t3 - t2;
std::chrono::duration<double, std::milli> cpuElapsed = cpuEnd - cpuStart;
std::cout << "Iteration " << iter
<< " | Map: " << mapElapsed.count()
<< " ms | Kernel: " << kernelElapsed.count()
<< " ms | Unmap: " << unmapElapsed.count()
<< " ms | CPU Split: " << cpuElapsed.count() << " ms\n";
}
// Cleanup
clReleaseKernel(kernel);
clReleaseProgram(prog);
clReleaseMemObject(assemblyBuf);
clReleaseMemObject(xyzBuf);
clReleaseMemObject(iBuf);
clReleaseCommandQueue(q);
clReleaseContext(ctx);
}
std::cout << std::endl;
}
return 0;
}