#define CL_TARGET_OPENCL_VERSION 300 #include #include #include #include #include #include void checkCLError(cl_int err, const char* msg) { if (err != CL_SUCCESS) { std::cerr << "OpenCL Error " << err << " at: " << msg << std::endl; exit(1); } } // -------------------- // Kernel source // Simple mock kernel that simulates splitting XYZ/I // Each "point" is 16 bytes (XYZ + Intensity) const char* kernelSrc = R"CLC( __kernel void xyz_i_split(__global uchar* assembly, __global uchar* xyzOut, __global uchar* iOut, const uint numPoints) { uint gid = get_global_id(0); if (gid >= numPoints) return; uint offset = gid * 16; // Copy XYZ (12 bytes) to xyzOut for (int i=0; i<12; ++i) xyzOut[gid*12 + i] = assembly[offset + i]; // Copy Intensity (4 bytes) to iOut for (int i=0; i<4; ++i) iOut[gid*4 + i] = assembly[offset + 12 + i]; } )CLC"; int main() { // -------------------- // CHANGE THIS VALUE to set number of points per assembly buffer const size_t numPointsPerAssembly = 100000; // e.g., ~3333 points per fill const size_t bytesPerPoint = 16; // 12 bytes XYZ + 4 bytes I const size_t assemblyBufSize = numPointsPerAssembly * bytesPerPoint; const size_t xyzBufSize = numPointsPerAssembly * 12; const size_t iBufSize = numPointsPerAssembly * 4; cl_uint numPlatforms = 0; checkCLError(clGetPlatformIDs(0, nullptr, &numPlatforms), "get num platforms"); std::vector platforms(numPlatforms); checkCLError(clGetPlatformIDs(numPlatforms, platforms.data(), nullptr), "get platforms"); std::cout << "Found " << numPlatforms << " OpenCL platforms\n\n"; for (cl_uint p = 0; p < numPlatforms; ++p) { char platformName[256]; clGetPlatformInfo(platforms[p], CL_PLATFORM_NAME, sizeof(platformName), platformName, nullptr); std::cout << "Platform " << p << ": " << platformName << "\n"; cl_uint numDevices = 0; clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, 0, nullptr, &numDevices); std::vector devices(numDevices); clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, numDevices, devices.data(), nullptr); for (cl_uint d = 0; d < numDevices; ++d) { char deviceName[256]; clGetDeviceInfo(devices[d], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr); std::cout << " Device " << d << ": " << deviceName << "\n"; cl_int err; cl_context ctx = clCreateContext(nullptr, 1, &devices[d], nullptr, nullptr, &err); checkCLError(err, "create context"); cl_queue_properties queueProps[] = {CL_QUEUE_PROPERTIES, 0, 0}; cl_command_queue q = clCreateCommandQueueWithProperties(ctx, devices[d], queueProps, &err); checkCLError(err, "create queue"); // -------------------- // Allocate host buffers std::vector assemblyHost(assemblyBufSize, 42); std::vector xyzHost(xyzBufSize, 0); std::vector iHost(iBufSize, 0); std::vector xyzHostCPU(xyzBufSize, 0); std::vector iHostCPU(iBufSize, 0); // Create CL buffers cl_mem assemblyBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, assemblyBufSize, assemblyHost.data(), &err); checkCLError(err, "create assembly buffer"); cl_mem xyzBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, xyzBufSize, xyzHost.data(), &err); checkCLError(err, "create xyz buffer"); cl_mem iBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, iBufSize, iHost.data(), &err); checkCLError(err, "create i buffer"); // Build program cl_program prog = clCreateProgramWithSource(ctx, 1, &kernelSrc, nullptr, &err); checkCLError(err, "create program"); err = clBuildProgram(prog, 1, &devices[d], nullptr, nullptr, nullptr); if (err != CL_SUCCESS) { // Print build log size_t logSize = 0; clGetProgramBuildInfo(prog, devices[d], CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize); std::vector log(logSize); clGetProgramBuildInfo(prog, devices[d], CL_PROGRAM_BUILD_LOG, logSize, log.data(), nullptr); std::cerr << log.data() << "\n"; } checkCLError(err, "build program"); cl_kernel kernel = clCreateKernel(prog, "xyz_i_split", &err); checkCLError(err, "create kernel"); // Set kernel args clSetKernelArg(kernel, 0, sizeof(cl_mem), &assemblyBuf); clSetKernelArg(kernel, 1, sizeof(cl_mem), &xyzBuf); clSetKernelArg(kernel, 2, sizeof(cl_mem), &iBuf); clSetKernelArg(kernel, 3, sizeof(cl_uint), &numPointsPerAssembly); const size_t globalWorkSize = numPointsPerAssembly; // -------------------- // Run a few iterations for (int iter = 0; iter < 10; ++iter) { cl_event evt; auto t0 = std::chrono::high_resolution_clock::now(); void* mappedAssembly = clEnqueueMapBuffer(q, assemblyBuf, CL_TRUE, CL_MAP_READ, 0, assemblyBufSize, 0, nullptr, &evt, &err); checkCLError(err, "map assembly buffer"); clWaitForEvents(1, &evt); auto t1 = std::chrono::high_resolution_clock::now(); err = clEnqueueNDRangeKernel(q, kernel, 1, nullptr, &globalWorkSize, nullptr, 0, nullptr, &evt); checkCLError(err, "enqueue kernel"); clWaitForEvents(1, &evt); auto t2 = std::chrono::high_resolution_clock::now(); cl_event unmapEvt; err = clEnqueueUnmapMemObject(q, assemblyBuf, mappedAssembly, 0, nullptr, &unmapEvt); checkCLError(err, "unmap assembly buffer"); clWaitForEvents(1, &unmapEvt); auto t3 = std::chrono::high_resolution_clock::now(); // -------------------- // Host CPU split auto cpuStart = std::chrono::high_resolution_clock::now(); for (size_t pt = 0; pt < numPointsPerAssembly; ++pt) { size_t off = pt * 16; for (int i = 0; i < 12; ++i) xyzHostCPU[pt*12 + i] = assemblyHost[off + i]; for (int i = 0; i < 4; ++i) iHostCPU[pt*4 + i] = assemblyHost[off + 12 + i]; } auto cpuEnd = std::chrono::high_resolution_clock::now(); std::chrono::duration mapElapsed = t1 - t0; std::chrono::duration kernelElapsed = t2 - t1; std::chrono::duration unmapElapsed = t3 - t2; std::chrono::duration cpuElapsed = cpuEnd - cpuStart; std::cout << "Iteration " << iter << " | Map: " << mapElapsed.count() << " ms | Kernel: " << kernelElapsed.count() << " ms | Unmap: " << unmapElapsed.count() << " ms | CPU Split: " << cpuElapsed.count() << " ms\n"; } // Cleanup clReleaseKernel(kernel); clReleaseProgram(prog); clReleaseMemObject(assemblyBuf); clReleaseMemObject(xyzBuf); clReleaseMemObject(iBuf); clReleaseCommandQueue(q); clReleaseContext(ctx); } std::cout << std::endl; } return 0; }