Files
salmanoff/include/user/stagingBuffer.h
T
hayodea 1d64ce0c7e StagingBuff: support both Mlock & IOUring pin; Use in IoUAssmEngn
We use io_uring_register_buffers() for IoUringAssemblyEngine instead
of using mlock(). This __appears__ to have reduced CPU utilization on
the Dell laptop. Could also be that we recently upgraded total RAM
from 8GiB to 32GiB.
2026-04-02 03:51:22 -04:00

267 lines
7.0 KiB
C++

#ifndef STAGINGBUFFER_H
#define STAGINGBUFFER_H
#include <memory>
#include <cstdint>
#include <atomic>
#include <string>
#include <sstream>
#include <sys/mman.h>
#include <sys/uio.h>
namespace smo {
namespace stim_buff {

// Forward declaration: this header only refers to FrameAssemblyDesc through
// std::shared_ptr, so the full definition is not required here.
class FrameAssemblyDesc;

} // namespace stim_buff
} // namespace smo

// Opaque io_uring handle type; only pointers to it are used in this header.
struct io_uring;

namespace smo {
namespace stim_buff {

/**
 * StagingBuffer manages a large buffer to guide io_uring in assembling some
 * number of Livox Avia pcloud UDP dgrams into a single stim frame.
 *
 * The buffer operates in a cycle:
 *  1. io_uring assembles UDP dgrams into the buffer until it's full.
 *  2. Buffer is handed off to the stimbuff layer to be appended to the
 *     stimbuff.
 *  3. When the stimbuff layer has appended the current assembled frame, the
 *     assembly buffer is reset and the cycle repeats.
 */
class StagingBuffer
{
public:
    /** Which mechanism (if any) currently pins the buffer's pages. */
    enum class PinningMechanism
    {
        NONE,
        MLOCK,
        IO_URING
    };

    /**
     * Alignment/padding requirements an I/O engine imposes on the slots and
     * on the frame held in the staging buffer.
     */
    class IOEngineConstraints
    {
    public:
        /** EXPLANATION:
         * Default constructor yields all-zero constraints. The members used
         * to be left indeterminate, which made stringify() (and therefore
         * StagingBuffer::stringify()) undefined behaviour on a
         * default-constructed object.
         */
        IOEngineConstraints() = default;

        IOEngineConstraints(
            size_t slotStartAlignmentByteVal_,
            size_t slotPadToNBytes_,
            size_t frameStartAlignmentByteVal_,
            size_t framePadToNBytes_)
        : slotStartAlignmentByteVal(slotStartAlignmentByteVal_),
          slotPadToNBytes(slotPadToNBytes_),
          frameStartAlignmentByteVal(frameStartAlignmentByteVal_),
          framePadToNBytes(framePadToNBytes_)
        {}

        ~IOEngineConstraints() = default;

        // Declaration order is preserved; it fixes the member layout and the
        // initialisation order used by the constructor above.
        size_t slotStartAlignmentByteVal = 0;
        size_t slotPadToNBytes = 0;
        size_t frameStartAlignmentByteVal = 0;
        size_t framePadToNBytes = 0;

        // Static defaults for io_uring and OpenCL (defined outside this header).
        static const IOEngineConstraints ioUringConstraints;
        static const IOEngineConstraints openClInputConstraints;

        /** Returns a human-readable dump of the four constraint values. */
        inline std::string stringify() const
        {
            std::ostringstream oss;
            oss << "IOEngineConstraints{"
                << "slotStartAlignmentByteVal=" << slotStartAlignmentByteVal
                << ", slotPadToNBytes=" << slotPadToNBytes
                << ", frameStartAlignmentByteVal=" << frameStartAlignmentByteVal
                << ", framePadToNBytes=" << framePadToNBytes
                << "}";
            return oss.str();
        }
    };

public:
    /** EXPLANATION:
     * Default constructor creates an empty buffer (no mapping, zero sizes).
     * Must be properly initialized using placement new with the
     * parameterized constructor.
     */
    StagingBuffer() = default;

    /** EXPLANATION:
     * We use the input and output engine constraints to determine the total
     * amount of memory required internally to assemble a single frame with
     * the given number of points per frame.
     */
    explicit StagingBuffer(
        const IOEngineConstraints& inputEngineConstraints,
        const IOEngineConstraints& outputEngineConstraints,
        size_t nSlots);

    ~StagingBuffer();

    /** EXPLANATION:
     * Non-copyable AND non-movable.
     * The move operations were previously declared `= default`, but the
     * std::atomic members below are neither copyable nor movable, so the
     * compiler defined those defaulted operations as deleted anyway. Deleting
     * them explicitly states the real contract (and avoids the compiler
     * warning about a defaulted function being implicitly deleted). Pinner
     * objects also keep a reference to their parent buffer, which a move
     * would leave dangling.
     */
    StagingBuffer(const StagingBuffer&) = delete;
    StagingBuffer& operator=(const StagingBuffer&) = delete;
    StagingBuffer(StagingBuffer&&) = delete;
    StagingBuffer& operator=(StagingBuffer&&) = delete;

    /**
     * Common base of the pin guards. It stores a reference to the
     * StagingBuffer it was created for. Constructor and destructor are
     * protected, so it is only usable as a base class and is never deleted
     * through a Pinner pointer.
     */
    class Pinner
    {
    public:
        Pinner(const Pinner&) = delete;
        Pinner& operator=(const Pinner&) = delete;
        Pinner(Pinner&&) = delete;
        Pinner& operator=(Pinner&&) = delete;

    protected:
        explicit Pinner(StagingBuffer& parent_);
        ~Pinner() = default;

        StagingBuffer& parent;
    };

    /**
     * Pin guard for the MLOCK mechanism.
     * NOTE(review): presumably mlock()s the parent's buffer on construction
     * and releases it on destruction; the definitions are not in this header,
     * so confirm against the implementation file.
     */
    class MlockPinner
    : public Pinner
    {
    public:
        explicit MlockPinner(StagingBuffer& parent);
        ~MlockPinner();

        MlockPinner(const MlockPinner&) = delete;
        MlockPinner& operator=(const MlockPinner&) = delete;
        MlockPinner(MlockPinner&&) = delete;
        MlockPinner& operator=(MlockPinner&&) = delete;
    };

    /**
     * Pin guard for the IO_URING mechanism; additionally remembers the ring
     * it was created with.
     * NOTE(review): presumably registers the parent's buffer with the ring on
     * construction and unregisters it on destruction; confirm against the
     * implementation file.
     */
    class IoUringPinner
    : public Pinner
    {
    public:
        IoUringPinner(StagingBuffer& parent, struct io_uring* ring);
        ~IoUringPinner();

        IoUringPinner(const IoUringPinner&) = delete;
        IoUringPinner& operator=(const IoUringPinner&) = delete;
        IoUringPinner(IoUringPinner&&) = delete;
        IoUringPinner& operator=(IoUringPinner&&) = delete;

    private:
        struct io_uring* ring;
    };

public:
    /** EXPLANATION:
     * Returns an input-engine-agnostic descriptor describing per-frame packet
     * slot layout. Different input engines should be able to convert this into
     * engine-specific metadata. E.g: io_uring's SQE descriptor.
     */
    operator std::shared_ptr<FrameAssemblyDesc>() const { return frameDesc; }
    // operator OpenClSharedBufferDescriptor() const;

    /** Atomically reads the "assembly in progress" flag. */
    bool isAssembling() const { return assemblingFlag.load(); }
    /** Atomically sets the "assembly in progress" flag. */
    void startAssembly() { assemblingFlag.store(true); }
    /** Atomically clears the "assembly in progress" flag. */
    void stopAssembly() { assemblingFlag.store(false); }

    // Factories for the pin guards declared above (defined outside this
    // header).
    std::unique_ptr<MlockPinner> makeMlockPinner();
    std::unique_ptr<IoUringPinner> makeIoUringPinner(struct io_uring* ring);

    /** EXPLANATION:
     * Returns an iovec for io_uring registration.
     * The buffer is mmap()-allocated and suitable for IORING_REGISTER_BUFFERS.
     * The iovec covers the whole buffer: base pointer and bufferNBytes.
     */
    struct iovec getIoUringRegisterIoVec() const
    {
        struct iovec iov;
        iov.iov_base = buffer.get();
        iov.iov_len = bufferNBytes;
        return iov;
    }

    /** EXPLANATION:
     * Returns an iovec for OpenCL engine buffer access.
     * The buffer is mmap()-allocated and suitable for CL_MEM_USE_HOST_PTR.
     * Returns pointer to first slot (offset by firstSlotOffsetNBytes) and
     * size from first slot to end of buffer.
     */
    struct iovec getClEngineIovec() const
    {
        struct iovec iov;
        iov.iov_base = buffer.get() + firstSlotOffsetNBytes;
        iov.iov_len = bufferNBytes - firstSlotOffsetNBytes;
        return iov;
    }

    /** Returns a human-readable dump of the buffer geometry and constraints. */
    inline std::string stringify() const
    {
        std::ostringstream oss;
        oss << "StagingBuffer{"
            << "nSlots=" << nSlots
            << ", bufferNBytes=" << bufferNBytes
            << ", slotStrideNBytes=" << slotStrideNBytes
            << ", constraints=" << inputConstraints.stringify()
            << "}";
        return oss.str();
    }

private:
    // Internal helpers; all defined outside this header.
    void computeSlotStrideAndBufferSize();
    void assertUnpinnedAndMarkPinned(PinningMechanism mechanism);
    static size_t calculateFirstSlotOffsetAndValidate(
        uint8_t* buffer,
        size_t bufferNBytes,
        size_t nSlots,
        size_t slotStrideNBytes,
        const IOEngineConstraints& inputConstraints);

    // Custom deleter for mmap-allocated buffer
    struct MmapDeleter
    {
        size_t size;

        // Default constructor for use with default-constructed StagingBuffer
        MmapDeleter() : size(0) {}
        // Deliberately left non-explicit: the implementation file may rely on
        // the size_t -> MmapDeleter conversion when building the unique_ptr.
        MmapDeleter(size_t s) : size(s) {}

        void operator()(uint8_t* ptr) const
        {
            // Nothing to unmap for a null pointer or a zero-length mapping.
            if (ptr != nullptr && size > 0)
            {
                munmap(ptr, size);
            }
        }
    };

    // Buffer data - mmap-allocated for io_uring registration
    // Using unique_ptr<uint8_t, MmapDeleter> instead of array syntax
    // since we have a custom deleter that knows the size
    std::unique_ptr<uint8_t, MmapDeleter> buffer;
    size_t bufferNBytes = 0;

    // Layout/invariants
    size_t nSlots = 0;

public:
    size_t slotStrideNBytes = 0;
    size_t firstSlotOffsetNBytes = 0; // offset from buffer start to first slot

private:
    IOEngineConstraints inputConstraints;

    // Descriptor (computed once; reused across frames)
    mutable std::shared_ptr<FrameAssemblyDesc> frameDesc;

    // Current state
    std::atomic<size_t> currentNBytes{0};
    std::atomic<bool> assemblingFlag{false};
    bool currentlyPinned = false;
    PinningMechanism currentPinningMechanism = PinningMechanism::NONE;

    friend class MlockPinner;
    friend class IoUringPinner;
};

} // namespace stim_buff
} // namespace smo
#endif // STAGINGBUFFER_H