OClCollMeshEngn: hide StagingBuffer's firstSlotOffset

This commit is contained in:
2025-11-20 01:48:59 -04:00
parent 51d2a70a3f
commit e233dc51d6
4 changed files with 13 additions and 38 deletions
+4 -2
View File
@@ -119,12 +119,14 @@ public:
/** EXPLANATION: /** EXPLANATION:
* Returns an iovec for OpenCL engine buffer access. * Returns an iovec for OpenCL engine buffer access.
* The buffer is mmap()-allocated and suitable for CL_MEM_USE_HOST_PTR. * The buffer is mmap()-allocated and suitable for CL_MEM_USE_HOST_PTR.
* The returned iov_base points at the first slot (buffer start plus
* firstSlotOffsetNBytes); iov_len is the byte count from the first slot to the end of the buffer.
*/ */
struct iovec getClEngineIovec() const struct iovec getClEngineIovec() const
{ {
struct iovec iov; struct iovec iov;
iov.iov_base = buffer.get(); iov.iov_base = buffer.get() + firstSlotOffsetNBytes;
iov.iov_len = bufferNBytes; iov.iov_len = bufferNBytes - firstSlotOffsetNBytes;
return iov; return iov;
} }
+1 -3
View File
@@ -25,7 +25,6 @@ __kernel void collate(
__global uchar* assembly, __global uchar* assembly,
__global float* collation, __global float* collation,
uint slotStride, uint slotStride,
uint firstSlotOffset,
uint nPointsPerSlot, uint nPointsPerSlot,
uint nDgramsPerFrame) uint nDgramsPerFrame)
{ {
@@ -36,8 +35,7 @@ __kernel void collate(
if (slotIndex >= nDgramsPerFrame) { return; } if (slotIndex >= nDgramsPerFrame) { return; }
// Calculate slot address // Calculate slot address
__global uchar* slotStart = assembly + firstSlotOffset __global uchar* slotStart = assembly + (slotIndex * slotStride);
+ (slotIndex * slotStride);
// Read data_type from offset 9 (1 byte) // Read data_type from offset 9 (1 byte)
uchar dataType = slotStart[9]; uchar dataType = slotStart[9];
@@ -476,8 +476,6 @@ bool OpenClCollatingAndMeshingEngine::setupSlotCompactorsArgs(
uint32_t numSlots = static_cast<uint32_t>(frameAssemblyDesc->numSlots); uint32_t numSlots = static_cast<uint32_t>(frameAssemblyDesc->numSlots);
uint32_t slotStride = static_cast<uint32_t>(assemblyBuff.slotStrideNBytes); uint32_t slotStride = static_cast<uint32_t>(assemblyBuff.slotStrideNBytes);
uint32_t slotSize = static_cast<uint32_t>(frameAssemblyDesc->slotSizeBytes); uint32_t slotSize = static_cast<uint32_t>(frameAssemblyDesc->slotSizeBytes);
uint32_t firstSlotOffset = static_cast<uint32_t>(
assemblyBuff.firstSlotOffsetNBytes);
uint32_t nSucceededUint = static_cast<uint32_t>(nSucceeded); uint32_t nSucceededUint = static_cast<uint32_t>(nSucceeded);
// Set kernel arguments for slotCompactor // Set kernel arguments for slotCompactor
@@ -517,7 +515,7 @@ bool OpenClCollatingAndMeshingEngine::setupSlotCompactorsArgs(
} }
err = clSetKernelArg( err = clSetKernelArg(
slotCompactorKernel, 4, sizeof(uint32_t), &firstSlotOffset); slotCompactorKernel, 4, sizeof(uint32_t), &nSucceededUint);
if (err != CL_SUCCESS) if (err != CL_SUCCESS)
{ {
@@ -526,16 +524,6 @@ bool OpenClCollatingAndMeshingEngine::setupSlotCompactorsArgs(
return false; return false;
} }
err = clSetKernelArg(
slotCompactorKernel, 5, sizeof(uint32_t), &nSucceededUint);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 5: " << err
<< std::endl;
return false;
}
return true; return true;
} }
@@ -544,8 +532,6 @@ bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
{ {
// Extract parameters for collateDgrams kernel // Extract parameters for collateDgrams kernel
uint32_t slotStride = static_cast<uint32_t>(assemblyBuff.slotStrideNBytes); uint32_t slotStride = static_cast<uint32_t>(assemblyBuff.slotStrideNBytes);
uint32_t firstSlotOffset = static_cast<uint32_t>(
assemblyBuff.firstSlotOffsetNBytes);
// Calculate nPointsPerSlot from device return mode // Calculate nPointsPerSlot from device return mode
if (!parent.device) if (!parent.device)
@@ -585,7 +571,7 @@ bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
return false; return false;
} }
err = clSetKernelArg(collateKernel, 3, sizeof(uint32_t), &firstSlotOffset); err = clSetKernelArg(collateKernel, 3, sizeof(uint32_t), &nPointsPerSlot);
if (err != CL_SUCCESS) if (err != CL_SUCCESS)
{ {
std::cerr << __func__ << ": failed to set kernel arg 3: " << err std::cerr << __func__ << ": failed to set kernel arg 3: " << err
@@ -593,7 +579,7 @@ bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
return false; return false;
} }
err = clSetKernelArg(collateKernel, 4, sizeof(uint32_t), &nPointsPerSlot); err = clSetKernelArg(collateKernel, 4, sizeof(uint32_t), &nDgramsPerFrame);
if (err != CL_SUCCESS) if (err != CL_SUCCESS)
{ {
std::cerr << __func__ << ": failed to set kernel arg 4: " << err std::cerr << __func__ << ": failed to set kernel arg 4: " << err
@@ -601,14 +587,6 @@ bool OpenClCollatingAndMeshingEngine::setupCollateDgramsArgs(
return false; return false;
} }
err = clSetKernelArg(collateKernel, 5, sizeof(uint32_t), &nDgramsPerFrame);
if (err != CL_SUCCESS)
{
std::cerr << __func__ << ": failed to set kernel arg 5: " << err
<< std::endl;
return false;
}
return true; return true;
} }
+5 -8
View File
@@ -10,7 +10,6 @@ __kernel void slotCompactor(
uint numSlots, uint numSlots,
uint slotStride, uint slotStride,
uint slotSize, uint slotSize,
uint firstSlotOffset,
uint nSucceeded) uint nSucceeded)
{ {
// Sequential processing: single work item processes all slots // Sequential processing: single work item processes all slots
@@ -22,8 +21,8 @@ __kernel void slotCompactor(
// 3. Exit early once we've moved nFailed dummy slots // 3. Exit early once we've moved nFailed dummy slots
DBG_PRINTF("slotCompactor: KERNEL STARTED\n"); DBG_PRINTF("slotCompactor: KERNEL STARTED\n");
DBG_PRINTF("slotCompactor: numSlots=%u, slotStride=%u, slotSize=%u, firstSlotOffset=%u, nSucceeded=%u\n", DBG_PRINTF("slotCompactor: numSlots=%u, slotStride=%u, slotSize=%u, nSucceeded=%u\n",
numSlots, slotStride, slotSize, firstSlotOffset, nSucceeded); numSlots, slotStride, slotSize, nSucceeded);
uint nFailed = numSlots - nSucceeded; // Calculate number of failed slots uint nFailed = numSlots - nSucceeded; // Calculate number of failed slots
uint nonDummiesSeen = 0; // Track how many non-dummy slots we've seen uint nonDummiesSeen = 0; // Track how many non-dummy slots we've seen
@@ -53,8 +52,7 @@ __kernel void slotCompactor(
} }
// Calculate slot address // Calculate slot address
__global uchar* slotAddr = assembly + firstSlotOffset __global uchar* slotAddr = assembly + (i * slotStride);
+ (i * slotStride);
// Check if slot is dummy: first 4 bytes should all be 0xFF // Check if slot is dummy: first 4 bytes should all be 0xFF
bool isDummy = (slotAddr[0] == 0xFF) && (slotAddr[1] == 0xFF) bool isDummy = (slotAddr[0] == 0xFF) && (slotAddr[1] == 0xFF)
@@ -78,8 +76,7 @@ __kernel void slotCompactor(
bool foundNonDummy = false; bool foundNonDummy = false;
for (int j = (int)rightmostNonDummy; j > (int)i; --j) for (int j = (int)rightmostNonDummy; j > (int)i; --j)
{ {
__global uchar* checkSlotAddr = assembly + firstSlotOffset __global uchar* checkSlotAddr = assembly + (j * slotStride);
+ (j * slotStride);
bool checkIsDummy = (checkSlotAddr[0] == 0xFF) bool checkIsDummy = (checkSlotAddr[0] == 0xFF)
&& (checkSlotAddr[1] == 0xFF) && (checkSlotAddr[2] == 0xFF) && (checkSlotAddr[1] == 0xFF) && (checkSlotAddr[2] == 0xFF)
@@ -99,7 +96,7 @@ __kernel void slotCompactor(
if (foundNonDummy) if (foundNonDummy)
{ {
DBG_PRINTF("slotCompactor: Moving slot from %u to %u\n", rightmostNonDummy, i); DBG_PRINTF("slotCompactor: Moving slot from %u to %u\n", rightmostNonDummy, i);
__global uchar* srcAddr = assembly + firstSlotOffset __global uchar* srcAddr = assembly
+ (rightmostNonDummy * slotStride); + (rightmostNonDummy * slotStride);
// Copy slot data (byte-by-byte copy) // Copy slot data (byte-by-byte copy)