diff --git a/stimBuffApis/livoxGen1/CMakeLists.txt b/stimBuffApis/livoxGen1/CMakeLists.txt index 40e732a..5a5020e 100644 --- a/stimBuffApis/livoxGen1/CMakeLists.txt +++ b/stimBuffApis/livoxGen1/CMakeLists.txt @@ -50,10 +50,11 @@ if(ENABLE_STIMBUFFAPI_livoxGen1) ) # Set assembler working directory so .incbin can find the .cl file - # Also declare dependency on collateDgrams.cl + # Also declare dependency on collateDgrams.cl and slotCompactor.cl set_source_files_properties(openClKernels.cl.S PROPERTIES COMPILE_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}" - OBJECT_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/collateDgrams.cl" + OBJECT_DEPENDS + "${CMAKE_CURRENT_SOURCE_DIR}/collateDgrams.cl;${CMAKE_CURRENT_SOURCE_DIR}/slotCompactor.cl" ) target_include_directories(livoxGen1 PUBLIC diff --git a/stimBuffApis/livoxGen1/slotCompactor.cl b/stimBuffApis/livoxGen1/slotCompactor.cl index 1e343a0..7ad3ba7 100644 --- a/stimBuffApis/livoxGen1/slotCompactor.cl +++ b/stimBuffApis/livoxGen1/slotCompactor.cl @@ -18,6 +18,10 @@ __kernel void slotCompactor( uint nonDummiesSeen = 0; // Track how many non-dummy slots we've seen uint dummiesMoved = 0; // Track how many dummy slots we've moved + // Initialize rightmostNonDummy to start from the end + // We'll decrement it each time we use it to avoid re-selecting the same slot + uint rightmostNonDummy = numSlots - 1; + // Process slots from beginning to end for (uint i = 0; i < numSlots; ++i) { @@ -34,50 +38,57 @@ __kernel void slotCompactor( bool isDummy = (slotAddr[0] == 0xFF) && (slotAddr[1] == 0xFF) && (slotAddr[2] == 0xFF) && (slotAddr[3] == 0xFF); - if (isDummy) + // Early continue for non-dummy slots (already in the right place) + if (!isDummy) { - // Optimization 1: Find rightmost non-dummy slot and copy it here - uint rightmostNonDummy = numSlots; - for (int j = (int)numSlots - 1; j > (int)i; --j) - { - __global uchar* checkSlotAddr = assembly + firstSlotOffset - + (j * slotStride); - - bool checkIsDummy = (checkSlotAddr[0] == 0xFF) - && (checkSlotAddr[1] == 0xFF) && (checkSlotAddr[2] == 0xFF) - && (checkSlotAddr[3] == 0xFF); - - if (!checkIsDummy) - { - rightmostNonDummy = (uint)j; - break; - } - } - - // If we found a non-dummy slot to the right, copy it here - if (rightmostNonDummy < numSlots) - { - __global uchar* srcAddr = assembly + firstSlotOffset - + (rightmostNonDummy * slotStride); - - // Copy slot data (byte-by-byte copy) - for (uint j = 0; j < slotSize; ++j) { - slotAddr[j] = srcAddr[j]; - } - - // Mark the source slot as dummy (move it to the end) - for (uint j = 0; j < 4; ++j) { - srcAddr[j] = 0xFF; - } - - ++dummiesMoved; - ++nonDummiesSeen; // We just moved a non-dummy to this position - } - } - else - { - // Slot is non-dummy - it's already in the right place ++nonDummiesSeen; + continue; + } + + // Optimization 1: Find rightmost non-dummy slot starting from where we left off + // Search backwards from rightmostNonDummy until we find a non-dummy slot + // or reach the current position + bool foundNonDummy = false; + for (int j = (int)rightmostNonDummy; j > (int)i; --j) + { + __global uchar* checkSlotAddr = assembly + firstSlotOffset + + (j * slotStride); + + bool checkIsDummy = (checkSlotAddr[0] == 0xFF) + && (checkSlotAddr[1] == 0xFF) && (checkSlotAddr[2] == 0xFF) + && (checkSlotAddr[3] == 0xFF); + + // Early continue for dummy slots + if (checkIsDummy) + { continue; } + + // Found a non-dummy slot + rightmostNonDummy = (uint)j; + foundNonDummy = true; + break; + } + + // If we found a non-dummy slot to the right, copy it here + if (foundNonDummy) + { + __global uchar* srcAddr = assembly + firstSlotOffset + + (rightmostNonDummy * slotStride); + + // Copy slot data (byte-by-byte copy) + for (uint j = 0; j < slotSize; ++j) { + slotAddr[j] = srcAddr[j]; + } + + // Mark the source slot as dummy (move it to the end) + for (uint j = 0; j < 4; ++j) { + srcAddr[j] = 0xFF; + } + + // Decrement rightmostNonDummy to avoid re-selecting the same slot + --rightmostNonDummy; + + ++dummiesMoved; + ++nonDummiesSeen; // We just moved a non-dummy to this position } } }