livoxG1: Add new OpenCl kernel to compact dgrams before collation
This commit is contained in:
@@ -14,4 +14,20 @@ collateKernelNBytes:
|
|||||||
.long .collateKernelEnd - collateKernelStart
|
.long .collateKernelEnd - collateKernelStart
|
||||||
.size collateKernelNBytes, 4
|
.size collateKernelNBytes, 4
|
||||||
|
|
||||||
|
.section .rodata
|
||||||
|
.global slotCompactorKernelStart
|
||||||
|
.global slotCompactorKernelNBytes
|
||||||
|
.type slotCompactorKernelStart, @object
|
||||||
|
.type slotCompactorKernelNBytes, @object
|
||||||
|
|
||||||
|
slotCompactorKernelStart:
|
||||||
|
.incbin "slotCompactor.cl"
|
||||||
|
.size slotCompactorKernelStart, . - slotCompactorKernelStart
|
||||||
|
.slotCompactorKernelEnd:
|
||||||
|
|
||||||
|
.section .data
|
||||||
|
slotCompactorKernelNBytes:
|
||||||
|
.long .slotCompactorKernelEnd - slotCompactorKernelStart
|
||||||
|
.size slotCompactorKernelNBytes, 4
|
||||||
|
|
||||||
.section .note.GNU-stack,"",@progbits
|
.section .note.GNU-stack,"",@progbits
|
||||||
|
|||||||
@@ -11,6 +11,10 @@ extern "C" {
|
|||||||
extern const char collateKernelStart[];
|
extern const char collateKernelStart[];
|
||||||
extern const uint32_t collateKernelNBytes;
|
extern const uint32_t collateKernelNBytes;
|
||||||
|
|
||||||
|
// External symbols for slotCompactor kernel (unmangled, not namespaced)
|
||||||
|
extern const char slotCompactorKernelStart[];
|
||||||
|
extern const uint32_t slotCompactorKernelNBytes;
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -0,0 +1,83 @@
|
|||||||
|
__kernel void slotCompactor(
|
||||||
|
__global uchar* assembly,
|
||||||
|
uint numSlots,
|
||||||
|
uint slotStride,
|
||||||
|
uint slotSize,
|
||||||
|
uint firstSlotOffset,
|
||||||
|
uint nSucceeded)
|
||||||
|
{
|
||||||
|
// Sequential processing: single work item processes all slots
|
||||||
|
// Compact non-dummy slots to the beginning (lowest addresses)
|
||||||
|
// Dummy slots will remain at the end (highest addresses)
|
||||||
|
// Optimizations:
|
||||||
|
// 1. For each dummy, find rightmost non-dummy and copy it there
|
||||||
|
// 2. Exit early once we've seen nSucceeded non-dummy slots
|
||||||
|
// 3. Exit early once we've moved nFailed dummy slots
|
||||||
|
|
||||||
|
uint nFailed = numSlots - nSucceeded; // Calculate number of failed slots
|
||||||
|
uint nonDummiesSeen = 0; // Track how many non-dummy slots we've seen
|
||||||
|
uint dummiesMoved = 0; // Track how many dummy slots we've moved
|
||||||
|
|
||||||
|
// Process slots from beginning to end
|
||||||
|
for (uint i = 0; i < numSlots; ++i)
|
||||||
|
{
|
||||||
|
// Optimization 2: Exit early once we've seen nSucceeded non-dummy slots
|
||||||
|
if (nonDummiesSeen >= nSucceeded) { break; }
|
||||||
|
// Optimization 3: Exit early once we've moved nFailed dummy slots
|
||||||
|
if (dummiesMoved >= nFailed) { break; }
|
||||||
|
|
||||||
|
// Calculate slot address
|
||||||
|
__global uchar* slotAddr = assembly + firstSlotOffset
|
||||||
|
+ (i * slotStride);
|
||||||
|
|
||||||
|
// Check if slot is dummy: first 4 bytes should all be 0xFF
|
||||||
|
bool isDummy = (slotAddr[0] == 0xFF) && (slotAddr[1] == 0xFF)
|
||||||
|
&& (slotAddr[2] == 0xFF) && (slotAddr[3] == 0xFF);
|
||||||
|
|
||||||
|
if (isDummy)
|
||||||
|
{
|
||||||
|
// Optimization 1: Find rightmost non-dummy slot and copy it here
|
||||||
|
uint rightmostNonDummy = numSlots;
|
||||||
|
for (int j = (int)numSlots - 1; j > (int)i; --j)
|
||||||
|
{
|
||||||
|
__global uchar* checkSlotAddr = assembly + firstSlotOffset
|
||||||
|
+ (j * slotStride);
|
||||||
|
|
||||||
|
bool checkIsDummy = (checkSlotAddr[0] == 0xFF)
|
||||||
|
&& (checkSlotAddr[1] == 0xFF) && (checkSlotAddr[2] == 0xFF)
|
||||||
|
&& (checkSlotAddr[3] == 0xFF);
|
||||||
|
|
||||||
|
if (!checkIsDummy)
|
||||||
|
{
|
||||||
|
rightmostNonDummy = (uint)j;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// If we found a non-dummy slot to the right, copy it here
|
||||||
|
if (rightmostNonDummy < numSlots)
|
||||||
|
{
|
||||||
|
__global uchar* srcAddr = assembly + firstSlotOffset
|
||||||
|
+ (rightmostNonDummy * slotStride);
|
||||||
|
|
||||||
|
// Copy slot data (byte-by-byte copy)
|
||||||
|
for (uint j = 0; j < slotSize; ++j) {
|
||||||
|
slotAddr[j] = srcAddr[j];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Mark the source slot as dummy (move it to the end)
|
||||||
|
for (uint j = 0; j < 4; ++j) {
|
||||||
|
srcAddr[j] = 0xFF;
|
||||||
|
}
|
||||||
|
|
||||||
|
++dummiesMoved;
|
||||||
|
++nonDummiesSeen; // We just moved a non-dummy to this position
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Slot is non-dummy - it's already in the right place
|
||||||
|
++nonDummiesSeen;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user