// Debug macro: define DEBUG_SLOT_COMPACTOR to enable printf statements #ifdef DEBUG_SLOT_COMPACTOR #define DBG_PRINTF(...) printf(__VA_ARGS__) #else #define DBG_PRINTF(...) #endif __kernel void slotCompactor( __global uchar* assembly, uint numSlots, uint slotStride, uint slotSize, uint nSucceeded) { // Sequential processing: single work item processes all slots // Compact non-dummy slots to the beginning (lowest addresses) // Dummy slots will remain at the end (highest addresses) // Optimizations: // 1. For each dummy, find rightmost non-dummy and copy it there // 2. Exit early once we've seen nSucceeded non-dummy slots // 3. Exit early once we've moved nFailed dummy slots DBG_PRINTF("slotCompactor: KERNEL STARTED\n"); DBG_PRINTF("slotCompactor: numSlots=%u, slotStride=%u, slotSize=%u, nSucceeded=%u\n", numSlots, slotStride, slotSize, nSucceeded); uint nFailed = numSlots - nSucceeded; // Calculate number of failed slots uint nonDummiesSeen = 0; // Track how many non-dummy slots we've seen uint dummiesMoved = 0; // Track how many dummy slots we've moved DBG_PRINTF("slotCompactor: nFailed=%u\n", nFailed); // Initialize rightmostNonDummy to start from the end // We'll decrement it each time we use it to avoid re-selecting the same slot uint rightmostNonDummy = numSlots - 1; // Process slots from beginning to end DBG_PRINTF("slotCompactor: Starting loop, numSlots=%u\n", numSlots); for (uint i = 0; i < numSlots; ++i) { // Optimization 2: Exit early once we've seen nSucceeded non-dummy slots if (nonDummiesSeen >= nSucceeded) { DBG_PRINTF("slotCompactor: Early exit at i=%u, nonDummiesSeen=%u >= nSucceeded=%u\n", i, nonDummiesSeen, nSucceeded); break; } // Optimization 3: Exit early once we've moved nFailed dummy slots if (dummiesMoved >= nFailed) { DBG_PRINTF("slotCompactor: Early exit at i=%u, dummiesMoved=%u >= nFailed=%u\n", i, dummiesMoved, nFailed); break; } // Calculate slot address __global uchar* slotAddr = assembly + (i * slotStride); // Check if slot is dummy: first 4 bytes should all be 0xFF bool isDummy = (slotAddr[0] == 0xFF) && (slotAddr[1] == 0xFF) && (slotAddr[2] == 0xFF) && (slotAddr[3] == 0xFF); if (i < 5 || i == numSlots - 1) { DBG_PRINTF("slotCompactor: i=%u, slot[0-3]=0x%02X%02X%02X%02X, isDummy=%d\n", i, slotAddr[0], slotAddr[1], slotAddr[2], slotAddr[3], isDummy ? 1 : 0); } // Early continue for non-dummy slots (already in the right place) if (!isDummy) { ++nonDummiesSeen; continue; } // Optimization 1: Find rightmost non-dummy slot starting from where we left off // Search backwards from rightmostNonDummy until we find a non-dummy slot // or reach the current position bool foundNonDummy = false; for (int j = (int)rightmostNonDummy; j > (int)i; --j) { __global uchar* checkSlotAddr = assembly + (j * slotStride); bool checkIsDummy = (checkSlotAddr[0] == 0xFF) && (checkSlotAddr[1] == 0xFF) && (checkSlotAddr[2] == 0xFF) && (checkSlotAddr[3] == 0xFF); // Early continue for dummy slots if (checkIsDummy) { continue; } // Found a non-dummy slot rightmostNonDummy = (uint)j; foundNonDummy = true; break; } // If we found a non-dummy slot to the right, copy it here if (foundNonDummy) { DBG_PRINTF("slotCompactor: Moving slot from %u to %u\n", rightmostNonDummy, i); __global uchar* srcAddr = assembly + (rightmostNonDummy * slotStride); // Copy slot data (byte-by-byte copy) for (uint j = 0; j < slotSize; ++j) { slotAddr[j] = srcAddr[j]; } // Mark the source slot as dummy (move it to the end) for (uint j = 0; j < 4; ++j) { srcAddr[j] = 0xFF; } // Decrement rightmostNonDummy to avoid re-selecting the same slot --rightmostNonDummy; ++dummiesMoved; ++nonDummiesSeen; // We just moved a non-dummy to this position } else { if (i < 5) { DBG_PRINTF("slotCompactor: i=%u, no non-dummy found to move\n", i); } } } DBG_PRINTF("slotCompactor: Loop complete, nonDummiesSeen=%u, dummiesMoved=%u\n", nonDummiesSeen, dummiesMoved); DBG_PRINTF("slotCompactor: KERNEL FINISHED\n"); }