diff --git a/stimBuffApis/livoxGen1/openClKernels.cl.S b/stimBuffApis/livoxGen1/openClKernels.cl.S index 80cf2f4..d4280dc 100644 --- a/stimBuffApis/livoxGen1/openClKernels.cl.S +++ b/stimBuffApis/livoxGen1/openClKernels.cl.S @@ -14,4 +14,20 @@ collateKernelNBytes: .long .collateKernelEnd - collateKernelStart .size collateKernelNBytes, 4 + .section .rodata + .global slotCompactorKernelStart + .global slotCompactorKernelNBytes + .type slotCompactorKernelStart, @object + .type slotCompactorKernelNBytes, @object + +slotCompactorKernelStart: + .incbin "slotCompactor.cl" + .size slotCompactorKernelStart, . - slotCompactorKernelStart +.slotCompactorKernelEnd: + + .section .data +slotCompactorKernelNBytes: + .long .slotCompactorKernelEnd - slotCompactorKernelStart + .size slotCompactorKernelNBytes, 4 + .section .note.GNU-stack,"",@progbits diff --git a/stimBuffApis/livoxGen1/openClKernels.h b/stimBuffApis/livoxGen1/openClKernels.h index 84e549c..56720c0 100644 --- a/stimBuffApis/livoxGen1/openClKernels.h +++ b/stimBuffApis/livoxGen1/openClKernels.h @@ -11,6 +11,10 @@ extern "C" { extern const char collateKernelStart[]; extern const uint32_t collateKernelNBytes; +// External symbols for slotCompactor kernel (unmangled, not namespaced) +extern const char slotCompactorKernelStart[]; +extern const uint32_t slotCompactorKernelNBytes; + #ifdef __cplusplus } // extern "C" #endif diff --git a/stimBuffApis/livoxGen1/slotCompactor.cl b/stimBuffApis/livoxGen1/slotCompactor.cl new file mode 100644 index 0000000..1e343a0 --- /dev/null +++ b/stimBuffApis/livoxGen1/slotCompactor.cl @@ -0,0 +1,83 @@ +__kernel void slotCompactor( + __global uchar* assembly, + uint numSlots, + uint slotStride, + uint slotSize, + uint firstSlotOffset, + uint nSucceeded) +{ + // Sequential processing: single work item processes all slots + // Compact non-dummy slots to the beginning (lowest addresses) + // Dummy slots will remain at the end (highest addresses) + // Optimizations: + // 1. For each dummy, find rightmost non-dummy and copy it there + // 2. Exit early once we've seen nSucceeded non-dummy slots + // 3. Exit early once we've moved nFailed dummy slots + + uint nFailed = numSlots - nSucceeded; // Calculate number of failed slots + uint nonDummiesSeen = 0; // Track how many non-dummy slots we've seen + uint dummiesMoved = 0; // Track how many dummy slots we've moved + + // Process slots from beginning to end + for (uint i = 0; i < numSlots; ++i) + { + // Optimization 2: Exit early once we've seen nSucceeded non-dummy slots + if (nonDummiesSeen >= nSucceeded) { break; } + // Optimization 3: Exit early once we've moved nFailed dummy slots + if (dummiesMoved >= nFailed) { break; } + + // Calculate slot address + __global uchar* slotAddr = assembly + firstSlotOffset + + (i * slotStride); + + // Check if slot is dummy: first 4 bytes should all be 0xFF + bool isDummy = (slotAddr[0] == 0xFF) && (slotAddr[1] == 0xFF) + && (slotAddr[2] == 0xFF) && (slotAddr[3] == 0xFF); + + if (isDummy) + { + // Optimization 1: Find rightmost non-dummy slot and copy it here + uint rightmostNonDummy = numSlots; + for (int j = (int)numSlots - 1; j > (int)i; --j) + { + __global uchar* checkSlotAddr = assembly + firstSlotOffset + + (j * slotStride); + + bool checkIsDummy = (checkSlotAddr[0] == 0xFF) + && (checkSlotAddr[1] == 0xFF) && (checkSlotAddr[2] == 0xFF) + && (checkSlotAddr[3] == 0xFF); + + if (!checkIsDummy) + { + rightmostNonDummy = (uint)j; + break; + } + } + + // If we found a non-dummy slot to the right, copy it here + if (rightmostNonDummy < numSlots) + { + __global uchar* srcAddr = assembly + firstSlotOffset + + (rightmostNonDummy * slotStride); + + // Copy slot data (byte-by-byte copy) + for (uint j = 0; j < slotSize; ++j) { + slotAddr[j] = srcAddr[j]; + } + + // Mark the source slot as dummy (move it to the end) + for (uint j = 0; j < 4; ++j) { + srcAddr[j] = 0xFF; + } + + ++dummiesMoved; + ++nonDummiesSeen; // We just moved a non-dummy to this position + } + } + else + { + // Slot is non-dummy - it's already in the right place + ++nonDummiesSeen; + } + } +}