Files
salmanoff/compile/clzerocopycheck.cpp
T

116 lines
3.9 KiB
C++

#include <CL/cl.h>
#include <iostream>
#include <vector>
#include <cstring>
#define CHECK(err, msg) \
if (err != CL_SUCCESS) { \
std::cerr << "ERROR: " << msg << " (" << err << ")\n"; \
return 1; \
}
const char *kernelSrc = R"CLC(
__kernel void check_shared(__global const int* in, __global int* out) {
int gid = get_global_id(0);
out[gid] = in[gid] + 42; // simple deterministic transform
}
)CLC";
int main() {
cl_int err;
// Pick first available device
cl_uint numPlatforms;
CHECK(clGetPlatformIDs(0, nullptr, &numPlatforms), "clGetPlatformIDs count");
std::vector<cl_platform_id> plats(numPlatforms);
CHECK(clGetPlatformIDs(numPlatforms, plats.data(), nullptr), "clGetPlatformIDs");
cl_platform_id plat = plats[0];
cl_device_id dev;
CHECK(clGetDeviceIDs(plat, CL_DEVICE_TYPE_GPU, 1, &dev, nullptr), "clGetDeviceIDs");
cl_context ctx = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &err);
CHECK(err, "clCreateContext");
cl_command_queue q = clCreateCommandQueue(ctx, dev, 0, &err);
CHECK(err, "clCreateCommandQueue");
// Create program and kernel
const size_t srcLen = std::strlen(kernelSrc);
cl_program prog = clCreateProgramWithSource(ctx, 1, &kernelSrc, &srcLen, &err);
CHECK(err, "clCreateProgramWithSource");
err = clBuildProgram(prog, 1, &dev, nullptr, nullptr, nullptr);
if (err != CL_SUCCESS) {
size_t logSize;
clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize);
std::vector<char> log(logSize);
clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, logSize, log.data(), nullptr);
std::cerr << "--- Build Log ---\n" << log.data() << "\n";
return 1;
}
cl_kernel krn = clCreateKernel(prog, "check_shared", &err);
CHECK(err, "clCreateKernel");
const size_t N = 8;
size_t bufSize = N * sizeof(int);
// Allocate host-visible buffer
cl_mem bufIn = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufSize, nullptr, &err);
CHECK(err, "clCreateBuffer input");
cl_mem bufOut = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufSize, nullptr, &err);
CHECK(err, "clCreateBuffer output");
// Map the buffer (should return pointer to real host memory if unified)
int* hostPtr = (int*)clEnqueueMapBuffer(q, bufIn, CL_TRUE, CL_MAP_WRITE, 0, bufSize, 0, nullptr, nullptr, &err);
CHECK(err, "clEnqueueMapBuffer");
std::cout << "Mapped host pointer: " << static_cast<void*>(hostPtr) << "\n";
// Write pattern directly into mapped memory
for (size_t i = 0; i < N; ++i)
hostPtr[i] = 100 + i;
// No clEnqueueWriteBuffer call! We rely on shared memory.
clEnqueueUnmapMemObject(q, bufIn, hostPtr, 0, nullptr, nullptr);
clFinish(q);
// Set kernel args
clSetKernelArg(krn, 0, sizeof(cl_mem), &bufIn);
clSetKernelArg(krn, 1, sizeof(cl_mem), &bufOut);
size_t global = N;
err = clEnqueueNDRangeKernel(q, krn, 1, nullptr, &global, nullptr, 0, nullptr, nullptr);
CHECK(err, "clEnqueueNDRangeKernel");
clFinish(q);
// Read back result
int* outPtr = (int*)clEnqueueMapBuffer(q, bufOut, CL_TRUE, CL_MAP_READ, 0, bufSize, 0, nullptr, nullptr, &err);
CHECK(err, "map output");
std::cout << "Result: ";
for (size_t i = 0; i < N; ++i)
std::cout << outPtr[i] << " ";
std::cout << "\n";
// Validate
bool ok = true;
for (size_t i = 0; i < N; ++i)
if (outPtr[i] != 142 + i) ok = false;
std::cout << (ok ? "✅ GPU saw host writes (zero-copy confirmed)\n"
: "❌ GPU did not see host writes (copying or staging occurred)\n");
clEnqueueUnmapMemObject(q, bufOut, outPtr, 0, nullptr, nullptr);
clFinish(q);
clReleaseMemObject(bufIn);
clReleaseMemObject(bufOut);
clReleaseKernel(krn);
clReleaseProgram(prog);
clReleaseCommandQueue(q);
clReleaseContext(ctx);
return 0;
}