116 lines
3.9 KiB
C++
116 lines
3.9 KiB
C++
|
|
#include <CL/cl.h>
|
||
|
|
#include <iostream>
|
||
|
|
#include <vector>
|
||
|
|
#include <cstring>
|
||
|
|
|
||
|
|
#define CHECK(err, msg) \
|
||
|
|
if (err != CL_SUCCESS) { \
|
||
|
|
std::cerr << "ERROR: " << msg << " (" << err << ")\n"; \
|
||
|
|
return 1; \
|
||
|
|
}
|
||
|
|
|
||
|
|
const char *kernelSrc = R"CLC(
|
||
|
|
__kernel void check_shared(__global const int* in, __global int* out) {
|
||
|
|
int gid = get_global_id(0);
|
||
|
|
out[gid] = in[gid] + 42; // simple deterministic transform
|
||
|
|
}
|
||
|
|
)CLC";
|
||
|
|
|
||
|
|
int main() {
|
||
|
|
cl_int err;
|
||
|
|
|
||
|
|
// Pick first available device
|
||
|
|
cl_uint numPlatforms;
|
||
|
|
CHECK(clGetPlatformIDs(0, nullptr, &numPlatforms), "clGetPlatformIDs count");
|
||
|
|
std::vector<cl_platform_id> plats(numPlatforms);
|
||
|
|
CHECK(clGetPlatformIDs(numPlatforms, plats.data(), nullptr), "clGetPlatformIDs");
|
||
|
|
|
||
|
|
cl_platform_id plat = plats[0];
|
||
|
|
cl_device_id dev;
|
||
|
|
CHECK(clGetDeviceIDs(plat, CL_DEVICE_TYPE_GPU, 1, &dev, nullptr), "clGetDeviceIDs");
|
||
|
|
|
||
|
|
cl_context ctx = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &err);
|
||
|
|
CHECK(err, "clCreateContext");
|
||
|
|
|
||
|
|
cl_command_queue q = clCreateCommandQueue(ctx, dev, 0, &err);
|
||
|
|
CHECK(err, "clCreateCommandQueue");
|
||
|
|
|
||
|
|
// Create program and kernel
|
||
|
|
const size_t srcLen = std::strlen(kernelSrc);
|
||
|
|
cl_program prog = clCreateProgramWithSource(ctx, 1, &kernelSrc, &srcLen, &err);
|
||
|
|
CHECK(err, "clCreateProgramWithSource");
|
||
|
|
|
||
|
|
err = clBuildProgram(prog, 1, &dev, nullptr, nullptr, nullptr);
|
||
|
|
if (err != CL_SUCCESS) {
|
||
|
|
size_t logSize;
|
||
|
|
clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize);
|
||
|
|
std::vector<char> log(logSize);
|
||
|
|
clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, logSize, log.data(), nullptr);
|
||
|
|
std::cerr << "--- Build Log ---\n" << log.data() << "\n";
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
|
||
|
|
cl_kernel krn = clCreateKernel(prog, "check_shared", &err);
|
||
|
|
CHECK(err, "clCreateKernel");
|
||
|
|
|
||
|
|
const size_t N = 8;
|
||
|
|
size_t bufSize = N * sizeof(int);
|
||
|
|
|
||
|
|
// Allocate host-visible buffer
|
||
|
|
cl_mem bufIn = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufSize, nullptr, &err);
|
||
|
|
CHECK(err, "clCreateBuffer input");
|
||
|
|
cl_mem bufOut = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, bufSize, nullptr, &err);
|
||
|
|
CHECK(err, "clCreateBuffer output");
|
||
|
|
|
||
|
|
// Map the buffer (should return pointer to real host memory if unified)
|
||
|
|
int* hostPtr = (int*)clEnqueueMapBuffer(q, bufIn, CL_TRUE, CL_MAP_WRITE, 0, bufSize, 0, nullptr, nullptr, &err);
|
||
|
|
CHECK(err, "clEnqueueMapBuffer");
|
||
|
|
|
||
|
|
std::cout << "Mapped host pointer: " << static_cast<void*>(hostPtr) << "\n";
|
||
|
|
|
||
|
|
// Write pattern directly into mapped memory
|
||
|
|
for (size_t i = 0; i < N; ++i)
|
||
|
|
hostPtr[i] = 100 + i;
|
||
|
|
|
||
|
|
// No clEnqueueWriteBuffer call! We rely on shared memory.
|
||
|
|
clEnqueueUnmapMemObject(q, bufIn, hostPtr, 0, nullptr, nullptr);
|
||
|
|
clFinish(q);
|
||
|
|
|
||
|
|
// Set kernel args
|
||
|
|
clSetKernelArg(krn, 0, sizeof(cl_mem), &bufIn);
|
||
|
|
clSetKernelArg(krn, 1, sizeof(cl_mem), &bufOut);
|
||
|
|
|
||
|
|
size_t global = N;
|
||
|
|
err = clEnqueueNDRangeKernel(q, krn, 1, nullptr, &global, nullptr, 0, nullptr, nullptr);
|
||
|
|
CHECK(err, "clEnqueueNDRangeKernel");
|
||
|
|
clFinish(q);
|
||
|
|
|
||
|
|
// Read back result
|
||
|
|
int* outPtr = (int*)clEnqueueMapBuffer(q, bufOut, CL_TRUE, CL_MAP_READ, 0, bufSize, 0, nullptr, nullptr, &err);
|
||
|
|
CHECK(err, "map output");
|
||
|
|
|
||
|
|
std::cout << "Result: ";
|
||
|
|
for (size_t i = 0; i < N; ++i)
|
||
|
|
std::cout << outPtr[i] << " ";
|
||
|
|
std::cout << "\n";
|
||
|
|
|
||
|
|
// Validate
|
||
|
|
bool ok = true;
|
||
|
|
for (size_t i = 0; i < N; ++i)
|
||
|
|
if (outPtr[i] != 142 + i) ok = false;
|
||
|
|
|
||
|
|
std::cout << (ok ? "✅ GPU saw host writes (zero-copy confirmed)\n"
|
||
|
|
: "❌ GPU did not see host writes (copying or staging occurred)\n");
|
||
|
|
|
||
|
|
clEnqueueUnmapMemObject(q, bufOut, outPtr, 0, nullptr, nullptr);
|
||
|
|
clFinish(q);
|
||
|
|
|
||
|
|
clReleaseMemObject(bufIn);
|
||
|
|
clReleaseMemObject(bufOut);
|
||
|
|
clReleaseKernel(krn);
|
||
|
|
clReleaseProgram(prog);
|
||
|
|
clReleaseCommandQueue(q);
|
||
|
|
clReleaseContext(ctx);
|
||
|
|
return 0;
|
||
|
|
}
|