Add AGENTS.md

Salmanoff: Version increment to v0.01.001
Docs: Document ambience stimbuff and high-val param
2026-04-01 21:55:53 -04:00 · 2025-11-23 07:35:21 -04:00 · 2025-11-23 07:34:59 -04:00 · 2025-11-23 07:28:04 -04:00 · 2025-11-23 07:25:53 -04:00 · 2025-11-23 07:20:55 -04:00
71 changed files with 6356 additions and 1174 deletions
@@ -12,3 +12,4 @@ config.h.in
 configure
 *.swp
 cscope.out
+*.tmp
@@ -1,5 +1,6 @@
 {
    "files.associations": {
+		"*.cl": "c",
 		"cstdint": "cpp",
 		"array": "cpp",
 		"atomic": "cpp",
@@ -82,12 +83,19 @@
 		"strstream": "cpp",
 		"regex": "cpp",
 		"stacktrace": "cpp",
-		"stdfloat": "cpp"
+		"stdfloat": "cpp",
+		"cfenv": "cpp",
+		"expected": "cpp",
+		"valarray": "cpp",
+		"core": "cpp",
+		"nonlinearoptimization": "cpp",
+		"*.txx": "cpp"
 	},
    "editor.rulers": [80, 120],
    "editor.tabSize": 4,
    "editor.insertSpaces": false,
    "editor.detectIndentation": false,
+    "editor.inlayHints.enabled": "off",
    "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools",
    "C_Cpp.default.browse.limitSymbolsToIncludedHeaders": true,
    "C_Cpp.default.browse.path": [
@@ -0,0 +1,10 @@
+# Project Instructions
+
+- Always break functions into logical subfunctions. No long-scrolling functions, in any language. This applies to source code, scripts, build scripts, CMake, Makefiles, and similar project files. Preserve this subfunction splitting discipline during refactors.
+- Modularity is non-negotiable. Always group logically related functions together into a module. Preserve modularity during refactors.
+- Reuse or extend existing abstractions instead of duplicating logic wherever possible. Don't repeat yourself. The goal here is to prevent duplication. Not to discourage appropriate logical separation of prior abstractions into new logical abstractions where sensible.
+- Always isolate configurable behaviour into configuration variables appropriate for the language and framework being used.
+- Never bake in literals; at minimum, declare them at the top of the file with a semantically meaningful name.
+- UI should be responsive. Always prefer to use pre-packaged UI toolkit widgets, containers and colour sets harmoniously, instead of writing custom CSS overrides. Write custom CSS only if there's no UI toolkit mechanism available.
+- Aggressively isolate, split off, deduplicate and reuse code which can be made into common library code. Do the same with UI elements. Do this both when implementing new features and opportunistically while refactoring or changing old code/UI elements.
+- Names of files, functions, classes, abstractions, database fields, etc should be aimed at disambiguating purpose and function, rather than at brevity.
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.16)
-project(salmanoff VERSION 0.01.000 LANGUAGES CXX)
+project(salmanoff VERSION 0.01.001 LANGUAGES CXX)

 include(CMakeDependentOption)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/DAPSS.cmake)
@@ -7,12 +7,13 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/DebugOpts.cmake)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/VerifyBoostDynamic.cmake)

 # Set C++ standard
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)

 # Build type
 if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Debug FORCE)
+	set(CMAKE_BUILD_TYPE Debug FORCE)
 endif()

 # Compiler flags
@@ -116,6 +117,50 @@ find_package(PkgConfig REQUIRED)
 find_package(FLEX REQUIRED)
 find_package(BISON REQUIRED)

+# Find OpenCL 1.2 or higher: try find_package first, fall back to pkg-config
+find_package(OpenCL 1.2 QUIET)
+if(OpenCL_FOUND)
+	# Normalize find_package variables to match pkg_check_modules naming
+	set(OPENCL_FOUND TRUE)
+	set(OPENCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
+	# Handle both OpenCL_LIBRARY (singular) and OpenCL_LIBRARIES (plural)
+	if(OpenCL_LIBRARIES)
+		set(OPENCL_LIBRARIES ${OpenCL_LIBRARIES})
+	else()
+		set(OPENCL_LIBRARIES ${OpenCL_LIBRARY})
+	endif()
+	set(OPENCL_LIBRARY_DIRS "")
+	message(STATUS "Found OpenCL using find_package")
+
+	# Check if version is available and validate
+	if(OpenCL_VERSION)
+		if(OpenCL_VERSION VERSION_LESS "1.2")
+			message(FATAL_ERROR
+				"OpenCL version ${OpenCL_VERSION} found, but 1.2 or higher is required")
+		endif()
+		message(STATUS "OpenCL version: ${OpenCL_VERSION}")
+	else()
+		message(WARNING
+			"OpenCL version could not be determined. "
+			"Version 1.2+ is required at runtime.")
+	endif()
+else()
+	# Fall back to pkg-config
+	pkg_check_modules(OPENCL OpenCL)
+	if(NOT OPENCL_FOUND)
+		message(FATAL_ERROR
+			"Failed to find OpenCL: both find_package and "
+			"pkg_check_modules failed. Try installing the "
+			"'ocl-icd-opencl-dev' package (or the appropriate "
+			"OpenCL development package for your system)."
+		)
+	endif()
+	message(STATUS "Found OpenCL using pkg-config")
+	message(WARNING
+		"OpenCL version could not be determined via pkg-config. "
+		"Version 1.2+ is required at runtime.")
+endif()
+
 # Need dlopen() and dlsym()
 find_library(DL_LIBRARY NAMES dl ldl)
 if(NOT DL_LIBRARY)
@@ -0,0 +1,86 @@
+# Bug somehow related to either OpenClCollateAndMeshingEngine or PcloudStimBuff:
+
+printSlotBytes: Slot 21 vaddr=0xfffff7fb4000 (4 bytes):
+0000: 05 01 01 00                                      |....|
+printSlotBytes: Slot 22 vaddr=0xfffff7fb5000 (4 bytes):
+0000: 05 01 01 00                                      |....|
+printSlotBytes: Slot 23 vaddr=0xfffff7fb6000 (4 bytes):
+0000: 05 01 01 00                                      |....|
+printSlotBytes: Slot 24 vaddr=0xfffff7fb7000 (4 bytes):
+0000: 05 01 01 00                                      |....|
+printSlotBytes: Slot 25 vaddr=0xfffff7fb8000 (4 bytes):
+0000: 05 01 01 00                                      |....|
+printSlotBytes: Slot 26 vaddr=0xfffff7fb9000 (4 bytes):
+0000: 05 01 01 00                                      |....|
+printSlotBytes: Slot 27 vaddr=0xfffff7fba000 (4 bytes):
+0000: 05 01 01 00                                      |....|
+printSlotBytes: Slot 28 vaddr=0xfffff7fbb000 (4 bytes):
+0000: 05 01 01 00                                      |....|
+printSlotBytes: Slot 29 vaddr=0xfffff7fbc000 (4 bytes):
+0000: 05 01 01 00                                      |....|
+produceFrameReq2_assembleDone: Successfully assembled frame 29 slots succeeded out of 30 total slots
+compactCollateAndMeshFrameReq: Started compact kernel
+startKernel: already running, call stop() first
+produceFrameReq3_compactCollateDone: Failed to compact and collate frame
+Mrntt: About to detach all sense devices.
+xcbWindow_detachDeviceReq: Detached X11 window device:
+Device Identifier: win0, Sensor Type: e, QualeIface API: visual-qualeiface, QualeIface API Params: (), StimBuff API: xcb, StimBuff API Params: (dev-substring ), Provider: xorg, Provider Params: (display=1 screen=0 ), Device Selector: mut
+
+enDisablePcloudDataReq2: Command timeout for device 3JEDK380010Z39
+detachDeviceReq1: Failed to disable pcloud data for stimbuff 3JEDK380010Z39
+stop: Stopped stimulus buffer for device 3JEDK380010Z39
+disconnectReq: Sent disconnect message to 10.42.0.139:65000
+detachDeviceReq2: Successfully detached pcloud stimbuff for device 3JEDK380010Z39 and possibly also destroyed device.
+Mrntt: Successfully detached 2 of 2 sense devices.
+Mrntt: About to finalize all stim buff api libs.
+stop: UDP Command Demuxer stopped
+stop: BroadcastListener stopped
+
+Thread 9 "rusticl queue t" received signal SIGSEGV, Segmentation fault.
+[Switching to Thread 0xffffca4ee140 (LWP 11695)]
+0x0000fffff48517b0 in std::_Bind<void (smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq::*(smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq*, std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, std::_Placeholder<1>))(std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, int)>::_Bind(std::_Bind<void (smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq::*(smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq*, std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, std::_Placeholder<1>))(std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, int)> const&) (
+    this=0xffffdc000c70) at /usr/include/c++/13/functional:581
+581	      _Bind(const _Bind&) = default;
+(gdb) bt
+#0  0x0000fffff48517b0 in std::_Bind<void (smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq::*(smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq*, std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, std::_Placeholder<1>))(std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, int)>::_Bind(std::_Bind<void (smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq::*(smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq*, std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, std::_Placeholder<1>))(std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, int)> const&) (this=0xffffdc000c70) at /usr/include/c++/13/functional:581
+#1  0x0000fffff4851818 in std::_Function_base::_Base_manager<std::_Bind<void (smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq::*(smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq*, std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, std::_Placeholder<1>))(std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, int)> >::_M_create<std::_Bind<void (smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq::*(smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq*, std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, std::_Placeholder<1>))(std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, int)> const&>(std::_Any_data&, std::_Bind<void (smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq::*(smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq*, std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, std::_Placeholder<1>))(std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, int)> const&, std::integral_constant<bool, false>) (__dest=..., __f=...)
+    at /usr/include/c++/13/bits/std_function.h:161
+#2  0x0000fffff4850704 in std::_Function_base::_Base_manager<std::_Bind<void (smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq::*(smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq*, std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, std::_Placeholder<1>))(std::shared_ptr--Type <RET> for more, q to quit, c to continue without paging--c
+<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, int)> >::_M_init_functor<std::_Bind<void (smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq::*(smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq*, std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, std::_Placeholder<1>))(std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, int)> const&>(std::_Any_data&, std::_Bind<void (smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq::*(smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq*, std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, std::_Placeholder<1>))(std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, int)> const&) (__functor=..., __f=...) at /usr/include/c++/13/bits/std_function.h:215
+#3  0x0000fffff484fbf0 in std::_Function_base::_Base_manager<std::_Bind<void (smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq::*(smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq*, std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, std::_Placeholder<1>))(std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, int)> >::_M_manager(std::_Any_data&, std::_Any_data const&, std::_Manager_operation) (__dest=..., 
+    __source=..., __op=std::__clone_functor) at /usr/include/c++/13/bits/std_function.h:198
+#4  0x0000fffff484f0bc in std::_Function_handler<void (int), std::_Bind<void (smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq::*(smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq*, std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, std::_Placeholder<1>))(std::shared_ptr<smo::stim_buff::OpenClCollatingAndMeshingEngine::CompactCollateAndMeshFrameReq>, int)> >::_M_manager(std::_Any_data&, std::_Any_data const&, std::_Manager_operation) (__dest=..., 
+    __source=..., __op=std::__clone_functor) at /usr/include/c++/13/bits/std_function.h:282
+#5  0x0000fffff484f2b0 in std::function<void (int)>::function(std::function<void (int)> const&) (this=0xffffca4ecd40, __x=...) at /usr/include/c++/13/bits/std_function.h:391
+#6  0x0000fffff484e9c0 in std::_Bind<std::function<void (int)> (int)>::_Bind<int&>(std::function<void (int)> const&, int&) (this=0xffffca4ecd40, __f=...)
+    at /usr/include/c++/13/functional:572
+#7  0x0000fffff484e170 in std::bind<std::function<void (int)>&, int&>(std::function<void (int)>&, int&) (__f=...) at /usr/include/c++/13/functional:885
+#8  0x0000fffff484aa68 in smo::stim_buff::OpenClCollatingAndMeshingEngine::compactKernelEventCallback (event_command_exec_status=0, user_data=0xffffe4009e80)
+    at /home/latentprion/gits/salmanoff-git/stimBuffApis/livoxGen1/openClCollatingAndMeshingEngine.cpp:249
+#9  0x0000ffffcb3e34b4 in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#10 0x0000ffffcb3d173c in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#11 0x0000ffffcb3d1d28 in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#12 0x0000ffffcb3b0b34 in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#13 0x0000ffffcb40886c in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#14 0x0000ffffcb39a728 in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#15 0x0000ffffcb39a7b0 in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#16 0x0000ffffcb3b0a40 in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#17 0x0000ffffcb3b130c in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#18 0x0000ffffcb3d2dfc in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#19 0x0000ffffcb371148 in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#20 0x0000ffffcb3f9b40 in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#21 0x0000ffffcb3713c8 in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#22 0x0000ffffcb378988 in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#23 0x0000ffffcb37120c in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#24 0x0000ffffcb371000 in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#25 0x0000ffffcb392888 in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#26 0x0000ffffcb45f23c in ?? () from /lib/aarch64-linux-gnu/libRusticlOpenCL.so.1
+#27 0x0000fffff7ac595c in start_thread (arg=0xfffff58cf880) at ./nptl/pthread_create.c:447
+#28 0x0000fffff7b2bb0c in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone3.S:76
+(gdb) 
+
+## Race conditions in OClCollMeshEngn:
+
+engine not set up or invalid
+
+
@@ -24,14 +24,21 @@ set(CPACK_RESOURCE_FILE_README "${CMAKE_CURRENT_SOURCE_DIR}/README.md")
 # Enable deb and rpm generators
 set(CPACK_GENERATOR "DEB;RPM")

-# DEB package specific settings
+# DEB package specific settings (Ubuntu)
 set(CPACK_DEBIAN_PACKAGE_MAINTAINER
    "Salmanoff Project <maintainer@salmanoff.org>")
 set(CPACK_DEBIAN_PACKAGE_SECTION "science")
 set(CPACK_DEBIAN_PACKAGE_PRIORITY "optional")
+# Target Ubuntu distribution
+set(CPACK_DEBIAN_PACKAGE_DISTRIBUTION "ubuntu")
+# Build dependencies (from builddeps file)
+# These are needed to build the package from source
+set(CPACK_DEBIAN_PACKAGE_BUILD_DEPENDS
+    "build-essential, cmake (>= 3.16), libboost-all-dev, flex, bison, ocl-icd-opencl-dev, liburing-dev")
+
+# Runtime dependencies (from builddeps file - runtime equivalents)
 set(CPACK_DEBIAN_PACKAGE_DEPENDS
-    "libboost-system1.74.0 | libboost-system1.73.0 | libboost-system1.72.0, "
-    "libc6, libstdc++6")
+    "libboost-system1.74.0 | libboost-system1.73.0 | libboost-system1.72.0, libboost-log1.74.0 | libboost-log1.73.0 | libboost-log1.72.0, libc6, libstdc++6, ocl-icd-libopencl1 | libopencl1, liburing2 | liburing1")
 set(CPACK_DEBIAN_PACKAGE_RECOMMENDS "libxcb1, libx11-6")
 set(CPACK_DEBIAN_PACKAGE_SUGGESTS "livox-sdk")

@@ -39,13 +46,17 @@ set(CPACK_DEBIAN_PACKAGE_SUGGESTS "livox-sdk")
 set(CPACK_RPM_PACKAGE_LICENSE "Proprietary")
 set(CPACK_RPM_PACKAGE_GROUP "Applications/Engineering")
 set(CPACK_RPM_PACKAGE_URL "https://github.com/salmanoff/salmanoff")
-set(CPACK_RPM_PACKAGE_REQUIRES "boost-system >= 1.72.0, glibc, libstdc++")
+set(CPACK_RPM_PACKAGE_REQUIRES "boost-system >= 1.72.0, boost-log >= 1.72.0, glibc, libstdc++, ocl-icd, liburing")
 set(CPACK_RPM_PACKAGE_SUGGESTS "xcb, libX11, livox-sdk")

 # Package file naming using project variables
 set(CPACK_PACKAGE_FILE_NAME
    "${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${CMAKE_SYSTEM_PROCESSOR}")

+# Enable automatic dependency detection for Debian packages
+# This uses dpkg-shlibdeps to automatically detect shared library dependencies
+set(CPACK_DEBIAN_PACKAGE_SHLIBDEPS ON)
+
 # Set compression
 set(CPACK_DEB_COMPONENT_INSTALL ON)
 set(CPACK_RPM_COMPONENT_INSTALL ON)
@@ -1,5 +1,7 @@
 add_library(attachmentSupport SHARED
-	stimulusBuffer.cpp
+	compute.cpp
+	stimulusProducer.cpp
+	stagingBuffer.cpp
 )

 target_include_directories(attachmentSupport PUBLIC
@@ -0,0 +1,147 @@
+#include <user/compute.h>
+#include <stdexcept>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <string_view>
+
+namespace smo {
+namespace compute {
+
+// Helper function to parse OpenCL version string
+static std::pair<int, int> parseOpenClVersion(const std::string& versionStr)
+{
+	size_t spacePos = versionStr.find(' ');
+	if (spacePos == std::string::npos) { return {-1, -1}; }
+
+	std::string versionNum = versionStr.substr(spacePos + 1);
+	size_t dotPos = versionNum.find('.');
+	if (dotPos == std::string::npos) { return {-1, -1}; }
+
+	try {
+		int major = std::stoi(versionNum.substr(0, dotPos));
+		int minor = std::stoi(versionNum.substr(dotPos + 1));
+		return {major, minor};
+	} catch (const std::exception&) {
+		return {-1, -1};
+	}
+}
+
+// Implementation of validateOpenClVersion (declared in user/compute.h)
+bool validateOpenClVersion(
+	std::string_view versionStr, std::string_view versionType,
+	int minMajor, int minMinor)
+{
+	auto [major, minor] = parseOpenClVersion(std::string(versionStr));
+
+	if (major == -1 && minor == -1)
+	{
+		std::cerr << __func__ << ": failed to parse OpenCL " << versionType
+			<< " version: " << versionStr << std::endl;
+		return false;
+	}
+
+	if (major < minMajor || (major == minMajor && minor < minMinor))
+	{
+		std::cerr << __func__ << ": OpenCL " << versionType << " version "
+			<< major << "." << minor << " found, but " << minMajor << "."
+			<< minMinor << " or higher is required" << std::endl;
+		return false;
+	}
+
+	std::cout << __func__ << ": OpenCL " << versionType << " version: "
+		<< versionStr << std::endl;
+	return true;
+}
+
+ComputeDevice::ComputeDevice(cl_platform_id platformId, cl_device_id deviceId)
+: platform(platformId), device(deviceId),
+context(nullptr), commandQueue(nullptr)
+{
+	cl_int err;
+
+	// Create context for this device
+	context = clCreateContext(
+		nullptr, 1, &device,
+		nullptr, nullptr, &err);
+
+	if (err != CL_SUCCESS || !context)
+	{
+		throw std::runtime_error(
+			std::string(__func__) + ": failed to create context for device: " +
+			std::to_string(err));
+	}
+
+	// Create command queue
+	cl_command_queue_properties queueProps = 0;
+	commandQueue = clCreateCommandQueue(
+		context, device, queueProps, &err);
+
+	if (err != CL_SUCCESS || !commandQueue)
+	{
+		clReleaseContext(context);
+		context = nullptr;
+		throw std::runtime_error(
+			std::string(__func__) + ": failed to create command queue for "
+			"device: " + std::to_string(err));
+	}
+}
+
+ClBuffer::ClBuffer(void* hostPtr, size_t size, cl_mem_flags flags,
+	const std::vector<std::shared_ptr<ComputeDevice>>& devices)
+	: hostPtr(hostPtr), size(size), flags(flags)
+{
+	associations.reserve(devices.size());
+
+	// Create a buffer for each device's context
+	for (const auto& device : devices)
+	{
+		if (!device->context) { continue; }
+
+		cl_int err;
+		cl_mem_flags bufferFlags = CL_MEM_USE_HOST_PTR | flags;
+		cl_mem buffer = clCreateBuffer(
+			device->context,
+			bufferFlags,
+			size, hostPtr,
+			&err);
+
+		if (err != CL_SUCCESS || !buffer)
+		{
+			// Release any buffers already created before throwing
+			for (auto& assoc : associations)
+			{
+				if (assoc.buffer) {
+					clReleaseMemObject(assoc.buffer);
+				}
+			}
+			throw std::runtime_error(
+				std::string(__func__) + ": failed to create buffer for "
+				"device: " + std::to_string(err));
+		}
+
+		associations.emplace_back(buffer, device);
+	}
+}
+
+cl_mem ClBuffer::getAssociatedBufferHandleForDevice(
+	const std::shared_ptr<ComputeDevice>& device) const
+{
+	if (!device)
+	{
+		throw std::invalid_argument(std::string(__func__)
+			+ ": device is nullptr");
+	}
+
+	for (const auto& assoc : associations)
+	{
+		if (assoc.device == device) {
+			return assoc.buffer;
+		}
+	}
+
+	return nullptr;
+}
+
+} // namespace compute
+} // namespace smo
@@ -0,0 +1,236 @@
+#include <user/stagingBuffer.h>
+#include <unistd.h>
+#include <cstdint>
+#include <stdexcept>
+#include <sys/mman.h>
+#include <vector>
+
+#include <user/frameAssemblyDesc.h>
+
+namespace smo {
+namespace stim_buff {
+
+// Static defaults for io_uring
+const StagingBuffer::IOEngineConstraints
+StagingBuffer::IOEngineConstraints::ioUringConstraints(
+	// slotStartAlignmentByteVal (page alignment for DMA)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
+	// slotPadToNBytes (MTU 1500 - UDP/IP header 28)
+	1472,
+	// frameStartAlignmentByteVal (page alignment for DMA)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
+	// framePadToNBytes (MTU 1500 - UDP/IP header 28)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE))
+);
+
+// Static defaults for OpenCL input
+const StagingBuffer::IOEngineConstraints
+StagingBuffer::IOEngineConstraints::openClInputConstraints(
+	// slotStartAlignmentByteVal (page alignment)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
+	// slotPadToNBytes (XYZI point size)
+	16,
+	// frameStartAlignmentByteVal (page alignment)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
+	// framePadToNBytes (pointer size)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE))
+);
+
+// Helper function to calculate maximum alignment needed for first slot
+// (must satisfy both frame and slot alignment)
+static size_t calculateMaxAlignment(
+	size_t frameStartAlignmentByteVal,
+	size_t slotStartAlignmentByteVal)
+{
+	if (frameStartAlignmentByteVal >= slotStartAlignmentByteVal)
+	{
+		if (frameStartAlignmentByteVal % slotStartAlignmentByteVal == 0)
+			{ return frameStartAlignmentByteVal; }
+		else
+		{
+			// Need LCM, but for simplicity use the larger alignment
+			// In practice, alignments are usually powers of 2, so this should work
+			return std::max(
+				frameStartAlignmentByteVal, slotStartAlignmentByteVal);
+		}
+	}
+	else
+	{
+		if (slotStartAlignmentByteVal % frameStartAlignmentByteVal == 0)
+			{ return slotStartAlignmentByteVal; }
+		else
+		{
+			return std::max(
+				frameStartAlignmentByteVal, slotStartAlignmentByteVal);
+		}
+	}
+}
+
+void StagingBuffer::computeSlotStrideAndBufferSize()
+{
+	// Slot stride is the maximum of alignment and padding, rounded up to a multiple of alignment
+	size_t minSlotStride = std::max(
+		inputConstraints.slotStartAlignmentByteVal,
+		inputConstraints.slotPadToNBytes);
+	slotStrideNBytes = ((minSlotStride + inputConstraints.slotStartAlignmentByteVal - 1)
+		/ inputConstraints.slotStartAlignmentByteVal)
+		* inputConstraints.slotStartAlignmentByteVal;
+
+	// Calculate maximum alignment needed for first slot (must satisfy both frame and slot alignment)
+	size_t maxAlignment = calculateMaxAlignment(
+		inputConstraints.frameStartAlignmentByteVal,
+		inputConstraints.slotStartAlignmentByteVal);
+
+	// Calculate minimum buffer size
+	size_t minBufferSize = std::max(
+		inputConstraints.framePadToNBytes,
+		inputConstraints.slotPadToNBytes);
+
+	// Calculate total size needed for nSlots slots
+	size_t slotAreaSize = nSlots * slotStrideNBytes;
+
+	// Add padding space at buffer start for alignment offset (worst case: max alignment - 1)
+	size_t alignmentPadding = maxAlignment - 1;
+
+	// Total size needed: alignment padding + slot area, then ensure minimum is met
+	size_t rawSize = alignmentPadding + slotAreaSize;
+	if (rawSize < minBufferSize)
+		{ rawSize = minBufferSize; }
+
+	// Align up to the maximum alignment to ensure we can always find a valid offset
+	bufferNBytes = ((rawSize + maxAlignment - 1) / maxAlignment) * maxAlignment;
+}
+
+// Static member function to calculate offset and validate invariants
+size_t StagingBuffer::calculateFirstSlotOffsetAndValidate(
+	uint8_t* buffer,
+	size_t bufferNBytes,
+	size_t nSlots,
+	size_t slotStrideNBytes,
+	const StagingBuffer::IOEngineConstraints& inputConstraints)
+{
+	// Calculate maximum alignment needed for first slot
+	size_t maxAlignment = calculateMaxAlignment(
+		inputConstraints.frameStartAlignmentByteVal,
+		inputConstraints.slotStartAlignmentByteVal);
+
+	// Calculate offset to align first slot to both frame and slot alignment
+	uintptr_t bufferAddr = reinterpret_cast<uintptr_t>(buffer);
+	uintptr_t alignedAddr = ((bufferAddr + maxAlignment - 1) / maxAlignment)
+		* maxAlignment;
+	size_t firstSlotOffsetNBytes = alignedAddr - bufferAddr;
+
+	// Validate invariants with exceptions
+	uint8_t* firstSlotAddr = buffer + firstSlotOffsetNBytes;
+	if (
+		reinterpret_cast<uintptr_t>(firstSlotAddr)
+		% inputConstraints.frameStartAlignmentByteVal != 0)
+	{
+		throw std::runtime_error(std::string(__func__)
+			+ ": StagingBuffer: first slot address not aligned to "
+			+ std::to_string(inputConstraints.frameStartAlignmentByteVal));
+	}
+
+	if (
+		reinterpret_cast<uintptr_t>(firstSlotAddr)
+		% inputConstraints.slotStartAlignmentByteVal != 0)
+	{
+		throw std::runtime_error(std::string(__func__)
+			+ ": StagingBuffer: first slot address not aligned to "
+			+ std::to_string(inputConstraints.slotStartAlignmentByteVal));
+	}
+
+	size_t minBufferSize = std::max(
+		inputConstraints.framePadToNBytes,
+		inputConstraints.slotPadToNBytes);
+	if (bufferNBytes < minBufferSize)
+	{
+		throw std::runtime_error(std::string(__func__)
+			+ ": StagingBuffer: buffer size less than minimum required (max of "
+			+ std::to_string(inputConstraints.framePadToNBytes)
+			+ " and "
+			+ std::to_string(inputConstraints.slotPadToNBytes)
+			+ ")");
+	}
+
+	if (firstSlotOffsetNBytes + nSlots * slotStrideNBytes
+		> bufferNBytes)
+	{
+		throw std::runtime_error(std::string(__func__)
+			+ ": StagingBuffer: buffer size insufficient to hold "
+			+ std::to_string(nSlots)
+			+ " slots with proper alignment and padding");
+	}
+
+	return firstSlotOffsetNBytes;
+}
+
+StagingBuffer::StagingBuffer(
+	const IOEngineConstraints& inputEngineConstraints_,
+	const IOEngineConstraints& /*outputEngineConstraints*/,
+	size_t nSlots)
+: buffer(nullptr, MmapDeleter(0)), bufferNBytes(0),
+nSlots(nSlots), slotStrideNBytes(0),
+firstSlotOffsetNBytes(0),
+inputConstraints(inputEngineConstraints_),
+assemblingFlag(false)
+{
+	if (nSlots == 0)
+	{
+		throw std::invalid_argument(std::string(__func__)
+			+ ": StagingBuffer: nSlots must be > 0");
+	}
+
+	computeSlotStrideAndBufferSize();
+
+	/* Allocate buffer using mmap() for io_uring registration
+	 * MAP_ANONYMOUS | MAP_PRIVATE creates anonymous, non-file-backed memory
+	 */
+	void* mmapped = mmap(
+		nullptr, bufferNBytes,
+		PROT_READ | PROT_WRITE,
+		MAP_ANONYMOUS | MAP_PRIVATE,
+		-1, 0);
+
+	if (mmapped == MAP_FAILED)
+	{
+		throw std::runtime_error(std::string(__func__)
+			+ ": StagingBuffer: mmap() failed");
+	}
+
+	buffer = std::unique_ptr<uint8_t, MmapDeleter>(
+		static_cast<uint8_t*>(mmapped), MmapDeleter(bufferNBytes));
+	currentNBytes.store(0);
+
+	// Lock the buffer in memory to prevent swapping
+	if (mlock(buffer.get(), bufferNBytes) != 0)
+	{
+		throw std::runtime_error(std::string(__func__)
+			+ ": StagingBuffer: mlock() failed");
+	}
+
+	// Calculate offset and validate invariants (helper function in .cpp)
+	firstSlotOffsetNBytes = StagingBuffer::calculateFirstSlotOffsetAndValidate(
+		buffer.get(), bufferNBytes, nSlots,
+		slotStrideNBytes, inputConstraints);
+
+	// Build FrameAssemblyDesc once
+	std::vector<FrameAssemblyDesc::SlotDesc> slots;
+	slots.reserve(nSlots);
+	uint8_t *frameBase = buffer.get() + firstSlotOffsetNBytes;
+	for (size_t i = 0; i < nSlots; ++i)
+	{
+		size_t off = i * slotStrideNBytes;
+		FrameAssemblyDesc::SlotDesc s{
+			off, frameBase + off, inputConstraints.slotPadToNBytes};
+
+		slots.push_back(s);
+	}
+
+	frameDesc = std::make_shared<FrameAssemblyDesc>(
+		nSlots, inputConstraints.slotPadToNBytes, bufferNBytes,
+		std::move(slots));
+}
+
+} // namespace stim_buff
+} // namespace smo
@@ -1,103 +0,0 @@
-#include <boostAsioLinkageFix.h>
-#include <iostream>
-#include <config.h>
-#include <componentThread.h>
-#include <boost/asio/io_service.hpp>
-#include <boost/asio/deadline_timer.hpp>
-#include <boost/system/error_code.hpp>
-#include <spinLock.h>
-#include <asynchronousBridge.h>
-#include <user/stimulusBuffer.h>
-
-namespace smo {
-namespace stim_buff {
-
-void StimulusBuffer::stop()
-{
-	shouldContinue.store(false);
-
-	// Set up a timeout bridge using the io_service
-	boost::asio::deadline_timer delayTimer(ioService);
-	AsynchronousBridge bridge(ioService);
-
-	// Set up the delay to let in-flight operation finish
-	delayTimer.expires_from_now(
-		boost::posix_time::milliseconds(getStopDelayMs()));
-
-	delayTimer.async_wait(
-		[&bridge](const boost::system::error_code& error)
-		{
-			(void)error;
-
-			// Always signal complete, whether timeout expired or was cancelled
-			bridge.setAsyncOperationComplete();
-		});
-
-	bridge.waitForAsyncOperationCompleteOrIoServiceStopped();
-
-	std::cout << __func__ << ": Stopped stimulus buffer for device "
-		<< deviceAttachmentSpec->deviceSelector << std::endl;
-
-	// After delay, cancel timer and perform cleanup
-	timer.cancel();
-}
-
-void StimulusBuffer::scheduleNextTimeout(int delayMs)
-{
-	if (!shouldContinue.load())
-		{ return; }
-
-	// Schedule the next timeout using the provided delay
-	timer.expires_from_now(
-		boost::posix_time::milliseconds(delayMs));
-
-	timer.async_wait(
-		std::bind(
-			&StimulusBuffer::onTimeout, this, std::placeholders::_1));
-}
-
-void StimulusBuffer::onTimeout(const boost::system::error_code& error)
-{
-	// Timer was cancelled, which is expected when stopping
-	if (error == boost::asio::error::operation_aborted) {
-		return;
-	}
-
-	if (error)
-	{
-		std::cerr << "StimulusBuffer: Timer error: " << error.message()
-			<< std::endl;
-		return;
-	}
-
-	if (!shouldContinue.load())
-		{ return; }
-
-	/**	EXPLANATION:
-	 * We need to ensure that there's only ever one stimframe being produced
-	 * during any CONFIG_STIMBUFF_FRAME_PERIOD_MS period. To guarantee this, we
-	 * use a spinlock.
-	 *
-	 * When a new frame is to be produced, the async producer will first acquire
-	 * the frameAssemblyLimiter spinlock. This way, when the next timeout is
-	 * fired it can check whether its predecessor stimframe has finished being
-	 * produced. If the preceding stimframe is still being produced, then we'll
-	 * sleep for CONFIG_STIMBUFF_FRAME_RETRY_DELAY_MS ms before trying again.
-	 */
-	int nextWakeupDelayMs;
-	if (frameAssemblyRateLimiter.tryAcquire())
-		{ nextWakeupDelayMs = CONFIG_STIMBUFF_FRAME_PERIOD_MS; }
-	else
-		{ nextWakeupDelayMs = CONFIG_STIMBUFF_FRAME_RETRY_DELAY_MS; }
-
-	// Call the derived class's frame production handler
-	stimFrameProductionTimesliceInd();
-	// Note: The lock should be released when frame production completes
-
-	// Schedule next timeout with the pre-determined duration
-	scheduleNextTimeout(nextWakeupDelayMs);
-}
-
-} // namespace stim_buff
-} // namespace smo
-
@@ -0,0 +1,184 @@
+#include <boostAsioLinkageFix.h>
+#include <config.h>
+#include <iostream>
+#include <chrono>
+#include <algorithm>
+#include <boost/asio/io_service.hpp>
+#include <boost/asio/deadline_timer.hpp>
+#include <boost/system/error_code.hpp>
+#include <opts.h>
+#include <componentThread.h>
+#include <spinLock.h>
+#include <user/stimulusProducer.h>
+#include <user/stimulusBuffer.h>
+
+namespace smo {
+namespace stim_buff {
+
+std::shared_ptr<StimulusBuffer> StimulusProducer::getAttachedStimulusBuffer(
+	const std::shared_ptr<device::DeviceAttachmentSpec>& spec) const
+{
+	for (const auto& buffer : attachedStimulusBuffers)
+	{
+		if (buffer && buffer->deviceAttachmentSpec &&
+			*buffer->deviceAttachmentSpec == *spec)
+		{
+			return buffer;
+		}
+	}
+
+	return nullptr;
+}
+
+bool StimulusProducer::hasBufferWithQualeIfaceApi(
+	const std::string& qualeIfaceApi) const
+{
+	for (const auto& buffer : attachedStimulusBuffers)
+	{
+		if (!buffer || !buffer->deviceAttachmentSpec)
+		{
+			throw std::runtime_error(
+				"StimulusProducer::hasBufferWithQualeIfaceApi: encountered "
+				"null buffer or null deviceAttachmentSpec in "
+				"attachedStimulusBuffers (should never happen)");
+		}
+
+		if (buffer->deviceAttachmentSpec->qualeIfaceApi != qualeIfaceApi)
+			{ continue; }
+
+		return true;
+	}
+
+	return false;
+}
+
+void StimulusProducer::destroyAttachedStimulusBuffer(
+	const std::shared_ptr<StimulusBuffer>& buffer)
+{
+	if (!buffer) { return; }
+
+	auto it = std::find(
+		attachedStimulusBuffers.begin(),
+		attachedStimulusBuffers.end(),
+		buffer);
+
+	if (it != attachedStimulusBuffers.end()) {
+		attachedStimulusBuffers.erase(it);
+	}
+}
+
+void StimulusProducer::stop()
+{
+	{
+		SpinLock::Guard lock(shouldContinueLock);
+		shouldContinue = false;
+	}
+
+	// Cancel timer immediately
+	timer.cancel();
+
+	std::cout << __func__ << ": Stopped stimulus producer for device "
+		<< deviceAttachmentSpec->deviceSelector << std::endl;
+}
+
+void StimulusProducer::scheduleNextTimeout(int delayMs)
+{
+	if (!shouldContinue)
+		{ return; }
+
+	// Schedule the next timeout using the provided delay
+	timer.expires_from_now(
+		boost::posix_time::milliseconds(delayMs));
+
+	timer.async_wait(
+		std::bind(
+			&StimulusProducer::onTimeout, this, std::placeholders::_1));
+}
+
+void StimulusProducer::onTimeout(const boost::system::error_code& error)
+{
+	// Timer was cancelled, which is expected when stopping
+	if (error == boost::asio::error::operation_aborted) {
+		return;
+	}
+
+	if (error)
+	{
+		std::cerr << "StimulusProducer: Timer error: " << error.message()
+			<< std::endl;
+		return;
+	}
+
+	SpinLock::Guard lock(shouldContinueLock);
+	if (!shouldContinue)
+		{ return; }
+
+	/**	EXPLANATION:
+	 * We need to ensure that there's only ever one stimframe being produced
+	 * during any CONFIG_STIMBUFF_FRAME_PERIOD_MS period. To guarantee this, we
+	 * use a spinlock.
+	 *
+	 * When a new frame is to be produced, the async producer will first acquire
+	 * the frameAssemblyLimiter spinlock. This way, when the next timeout is
+	 * fired it can check whether its predecessor stimframe has finished being
+	 * produced. If the preceding stimframe is still being produced, then we'll
+	 * sleep for CONFIG_STIMBUFF_FRAME_RETRY_DELAY_MS ms before trying again.
+	 */
+	int nextWakeupDelayMs;
+	bool deferred = false;
+	if (frameAssemblyRateLimiter.tryAcquire())
+	{
+		nextWakeupDelayMs = CONFIG_STIMBUFF_FRAME_PERIOD_MS;
+
+		// Check if we're ending a deferral period
+		if (nDeferrals > 0)
+		{
+			auto deferralEndTime = std::chrono::high_resolution_clock::now();
+			auto duration = deferralEndTime - deferralStartTime;
+			auto durationMs = std::chrono::duration_cast<
+				std::chrono::milliseconds>(duration);
+
+			std::cout << __func__ << ": Deferral period ended. "
+				<< "Total deferrals: " << nDeferrals
+				<< ", Duration: " << durationMs.count() << "ms" << std::endl;
+
+			nDeferrals = 0;
+		}
+
+		/**	EXPLANATION:
+		 * Call the derived class's frame production handler
+		 * Note: The derived class's frame production handler (aka
+		 * its implementation of stimFrameProductionTimesliceInd()) must
+		 * release the lock when frame production completes
+		 */
+		stimFrameProductionTimesliceInd();
+	}
+	else
+	{
+		nextWakeupDelayMs = CONFIG_STIMBUFF_FRAME_RETRY_DELAY_MS;
+		deferred = true;
+
+		++nDeferrals;
+		// If this is first deferral, capture start stamp and print message
+		if (nDeferrals == 1)
+		{
+			deferralStartTime = std::chrono::high_resolution_clock::now();
+			std::cerr << __func__ << ": Deferral period beginning. "
+				"Configured deferral period: " << nextWakeupDelayMs << "ms"
+				<< std::endl;
+		}
+	}
+
+	scheduleNextTimeout(nextWakeupDelayMs);
+
+	// FIXME: We should be able to release the start/stop lock at this point.
+
+	if (deferred && OptionParser::getOptions().verbose)
+	{
+		std::cerr << __func__ << ": Deferring frame by " << nextWakeupDelayMs
+			<< "ms due to rate limit." << std::endl;
+	}
+}
+
+} // namespace stim_buff
+} // namespace smo
@@ -1,8 +1,10 @@
 #include <algorithm>
 #include <iostream>
+#include <functional>
 #include <opts.h>
 #include <componentThread.h>
 #include "broadcastListener.h"
+#include "core.h"

 namespace livoxProto1 {
 namespace comms {
@@ -53,7 +55,6 @@ void BroadcastListener::broadcastMsgInd(
 		return;
 	}

-
 	// Use placement new to construct BroadcastMessage in the buffer
 	BroadcastMessage* msg = new (bcastMsgRecvBuffer) BroadcastMessage;

@@ -93,30 +94,34 @@ void BroadcastListener::broadcastMsgInd(
 		reinterpret_cast<const char*>(msg->broadcast_code));

 	// Early return if device already exists
+	smo::SpinLock::Guard lock(isListeningLock);
+
 	if (deviceExists(broadcastCode))
 	{
 		// Device already exists, just log the update
-		if (OptionParser::getOptions().verbose)
+		if (getProtoState().smoCallbacks.OptionParser_getOptions().verbose)
 		{
 			std::cout << __func__
 				<< ": Received broadcast from known device: "
 				<< broadcastCode << " at " << senderIP << "\n";
 		}
-		return;
+	}
+	else
+	{
+		// Create new DiscoveredDevice using conversion constructor
+		auto device = std::make_shared<DiscoveredDevice>(*msg, senderIP);
+		discoveredDevices.push_back(device);
+		// Output device information using stringify
+		std::cout << __func__ << ": Discovered new Livox device: "
+			<< device->stringify() << "\n";
 	}

-	// Create new DiscoveredDevice using conversion constructor
-	auto device = std::make_shared<DiscoveredDevice>(*msg, senderIP);
-	discoveredDevices.push_back(device);
-
-	// Output device information using stringify
-	std::cout << __func__ << ": Discovered new Livox device: "
-		<< device->stringify() << "\n";
+	startReceive();
 }

 void BroadcastListener::start(void)
 {
-	if (isListening.load()) { return; }
+	if (isListening) { return; }

 	try
 	{
@@ -128,10 +133,15 @@ void BroadcastListener::start(void)
 		 * We should also set up a timer to check for devices that have gone
 		 * away.
 		 */
-		socket.open(boost::asio::ip::udp::v4());
-		socket.bind(listeningEndpoint);
+		{
+			smo::SpinLock::Guard lock(isListeningLock);
+
+			socket.open(boost::asio::ip::udp::v4());
+			socket.bind(listeningEndpoint);
+
+			isListening = true;
+		}

-		isListening.store(true);
 		// Start the first async receive operation
 		startReceive();
 		std::cout << __func__ << ": BroadcastListener started on port "
@@ -139,7 +149,7 @@ void BroadcastListener::start(void)
 	}
 	catch (const boost::system::system_error& e)
 	{
-		isListening.store(false);
+		isListening = false;
 		std::cerr << __func__ << ": Failed to start BroadcastListener: "
 			<< e.what() << std::endl;
 		throw;
@@ -148,27 +158,25 @@ void BroadcastListener::start(void)

 void BroadcastListener::startReceive(void)
 {
-	if (!isListening.load()) { return; }
+	if (!isListening) { return; }

 	socket.async_receive_from(
 		boost::asio::buffer(bcastMsgRecvBuffer, sizeof(bcastMsgRecvBuffer)),
 		senderEndpoint,
-		[this](const boost::system::error_code& ec, std::size_t bytes_received)
-		{
-			broadcastMsgInd(ec, bytes_received);
-
-			// Continue listening for the next packet
-			if (isListening.load())
-				{ startReceive(); }
-		}
+		std::bind(
+			&BroadcastListener::broadcastMsgInd, this,
+			std::placeholders::_1, std::placeholders::_2)
 	);
 }

 void BroadcastListener::stop(void)
 {
-	if (!isListening.load()) { return; }
+	{
+		smo::SpinLock::Guard lock(isListeningLock);
+		if (!isListening) { return; }

-	isListening.store(false);
+		isListening = false;
+	}

 	try
 	{
@@ -8,6 +8,7 @@
 #include <atomic>
 #include <boost/asio/ip/udp.hpp>
 #include <user/senseApiDesc.h>
+#include <spinLock.h>
 #include "device.h"

 namespace livoxProto1 {
@@ -67,7 +68,8 @@ private:

 	boost::asio::ip::udp::socket socket;
 	boost::asio::ip::udp::endpoint listeningEndpoint, senderEndpoint;
-	std::atomic<bool> isListening;
+	smo::SpinLock isListeningLock;
+	bool isListening;

 	uint8_t bcastMsgRecvBuffer[UDP_BCAST_MSG_BUFFER_NBYTES];
 };
@@ -114,7 +114,7 @@ public:

 		// Connection successful, add device to collection
 		context->deviceManager.devices.push_back(context->pendingDevice);
-		if (OptionParser::getOptions().verbose)
+		if (getProtoState().smoCallbacks.OptionParser_getOptions().verbose)
 		{
 			std::cout << __func__ << ": Successfully connected and added device "
 				<< context->pendingDevice->discoveredDevice.deviceIdentifier
@@ -230,7 +230,7 @@ void DeviceManager::destroyDeviceReq(
 	std::shared_ptr<Device> device = getDevice(dev->discoveredDevice).
 		value_or(nullptr);

-	if (!device || device->nAttachedStimBuffs > 0)
+	if (!device || device->nAttachedStimulusProducers > 0)
 	{
 		callback.callbackFn(false);
 		return;
@@ -98,7 +98,7 @@ Device::Device(const std::string &deviceIdentifier,
 	deviceIdentifier, comms::DeviceType::Mid40,
 	// Initialize empty. IP will be set upon successful connection.
 	""),
-nAttachedStimBuffs(0),
+nAttachedStimulusProducers(0),
 componentThread(componentThread),
 commandTimeoutMs(commandTimeoutMs), retryDelayMs(retryDelayMs),
 smoIp(smoIp), detectedSmoListeningIp(""), smoSubnetNbits(smoSubnetNbits),
@@ -150,7 +150,7 @@ public:
 		// Fail early - if handshake failed, try next method
 		if (!success)
 		{
-			if (OptionParser::getOptions().verbose)
+			if (getProtoState().smoCallbacks.OptionParser_getOptions().verbose)
 			{
 				std::cout << __func__ << ": Trying to connect to device by "
 					<< "identifier" << "\n";
@@ -222,7 +222,7 @@ void Device::connectReq(smo::Callback<Device::connectReqCbFn> callback)
 	auto request = std::make_shared<ConnectReq>(*this, std::move(callback));

 	// Try connecting to known device first
-	if (OptionParser::getOptions().verbose) {
+	if (getProtoState().smoCallbacks.OptionParser_getOptions().verbose) {
 		std::cout << __func__ << ": Trying to connect to known device" << "\n";
 	}

@@ -316,7 +316,7 @@ void Device::connectToKnownDeviceReq(
 		return;
 	}

-	if (OptionParser::getOptions().verbose)
+	if (getProtoState().smoCallbacks.OptionParser_getOptions().verbose)
 	{
 		std::cout << __func__ << ": Detected SMO listening IP for known device "
 			<< request->device.discoveredDevice.deviceIdentifier
@@ -402,7 +402,7 @@ void Device::connectByDeviceIdentifierReq(
 	// For heuristic construction, always use the provided smoIp.
 	request->device.detectedSmoListeningIp = request->device.smoIp;

-	if (OptionParser::getOptions().verbose)
+	if (getProtoState().smoCallbacks.OptionParser_getOptions().verbose)
 	{
 		std::cout << __func__ << ": About to try to connect to device by "
 			<< "identifier (" << discoveredDevice.deviceIdentifier << ")"
@@ -720,7 +720,7 @@ private:
 			return;
 		}

-		if (OptionParser::getOptions().verbose)
+		if (getProtoState().smoCallbacks.OptionParser_getOptions().verbose)
 		{
 			std::cout << __func__ << ": Handshake successful with "
 					<< deviceIP << "("
@@ -1042,6 +1042,8 @@ void Device::startHeartbeat()
 	}

 	// Register heartbeat ACK handler (cmd_set=0x00, cmd_id=0x03)
+	smo::SpinLock::Guard lock(heartbeatActiveLock);
+
 	registerUdpCommandHandler(
 		0x00, 0x03, discardHeartbeatAck, discoveredDevice.ipAddr);

@@ -1057,15 +1059,17 @@ void Device::startHeartbeat()

 void Device::stopHeartbeat()
 {
-	if (heartbeatActive.load())
 	{
-		heartbeatActive.store(false);
-		if (heartbeatTimer) {
-			heartbeatTimer->cancel();
-		}
+		smo::SpinLock::Guard lock(heartbeatActiveLock);

+		heartbeatActive.store(false);
 		unregisterUdpCommandHandler(0x00, 0x03, discoveredDevice.ipAddr);
 	}
+
+	if (heartbeatTimer) {
+		heartbeatTimer->cancel();
+		heartbeatTimer.reset();
+	}
 }

 void Device::sendHeartbeat()
@@ -1140,7 +1144,6 @@ void Device::sendHeartbeat()
 	}
 	catch (const std::exception& e)
 	{
-		heartbeatActive.store(false);
 		std::cerr << __func__ << ": Heartbeat send failed for device "
 			<< discoveredDevice.deviceIdentifier
 			<< ": " << e.what() << std::endl;
@@ -1156,15 +1159,21 @@ void Device::onHeartbeatTimer(const boost::system::error_code& error)

 	if (error)
 	{
-		heartbeatActive.store(false);
 		std::cerr << "[" << __func__ << "] Heartbeat timer error for device "
 			<< discoveredDevice.deviceIdentifier
 			<< ": " << error.message() << std::endl;
+
 		return;
 	}

 	// Send next heartbeat
-	sendHeartbeat();
+	{
+		smo::SpinLock::Guard lock(heartbeatActiveLock);
+		if (!heartbeatActive.load())
+			{ return; }
+
+		sendHeartbeat();
+	}
 }

 uint32_t Device::getSubnetMaskFor(uint8_t nbits)
@@ -1512,7 +1521,7 @@ protected:
 		      response->command.cmd_id == 0x04 &&
 		      response->ret_code == 0x00))
 		{
-			if (OptionParser::getOptions().verbose)
+			if (getProtoState().smoCallbacks.OptionParser_getOptions().verbose)
 			{
 				std::cout << __func__ << ": Failed to en/disable pcloud data "
 					"for device "
@@ -4,11 +4,13 @@
 #include <boostAsioLinkageFix.h>
 #include <string>
 #include <cstdint>
+#include <cstddef>
 #include <memory>
 #include <atomic>
 #include <optional>
 #include <functional>
 #include <unordered_map>
+#include <stdexcept>
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
@@ -17,6 +19,7 @@
 #include <boost/asio/posix/stream_descriptor.hpp>
 #include "protocol.h"
 #include <callback.h>
+#include <spinLock.h>

 // Custom hash function for std::pair<uint8_t, uint8_t>
 namespace std {
@@ -112,6 +115,34 @@ public:
 		Triple			= 0x03
 	};

+	/**
+	 * Get the number of points per datagram based on return mode
+	 * @param returnMode The return mode (0=SingleFirst, 1=SingleStrongest, 2=Dual, 3=Triple)
+	 * @return Number of points per datagram
+	 */
+	static inline size_t getNPointsPerDgram(int returnMode)
+	{
+		/*
+		 * Map modes to points per datagram based on Livox docs
+		 * 1: first, 2: strongest -> 96 samples => 96 points
+		 * 3: dual -> 48 samples * 2 points = 96
+		 * 4: triple -> 30 samples * 3 points = 90
+		 */
+		switch (returnMode)
+		{
+		case static_cast<int>(ReturnMode::SingleFirst):
+		case static_cast<int>(ReturnMode::SingleStrongest):
+		case static_cast<int>(ReturnMode::Dual):
+			return 96u;
+		case static_cast<int>(ReturnMode::Triple):
+			return 90u;
+		default:
+			throw std::runtime_error(
+				std::string(__func__) + ": Unknown returnMode "
+				+ std::to_string(returnMode));
+		}
+	}
+
 	// Utility methods
 	std::optional<std::string> getSmoIp(const std::string& deviceIP);

@@ -149,7 +180,7 @@ public:

 public:
 	comms::DiscoveredDevice discoveredDevice;
-	std::atomic<size_t> nAttachedStimBuffs;
+	std::atomic<size_t> nAttachedStimulusProducers;

 	// Configuration
 	std::shared_ptr<smo::ComponentThread> componentThread;
@@ -162,6 +193,7 @@ public:
 	// Heartbeat state
 	std::unique_ptr<boost::asio::deadline_timer> heartbeatTimer;
 	std::atomic<bool> heartbeatActive;
+	smo::SpinLock heartbeatActiveLock;

 	// Point cloud data state
 	std::atomic<bool> pcloudDataActive;
@@ -43,9 +43,13 @@ void UdpCommandDemuxer::start()

 	try
 	{
-		setupSockets();
-		isActive.store(true);
-		shouldStop.store(false);
+		{
+			smo::SpinLock::Guard lock(isActiveAndShouldStopLock);
+
+			setupSockets();
+			isActive.store(true);
+			shouldStop.store(false);
+		}

 		// Start the async receive loop
 		startAsyncReceive();
@@ -66,10 +70,13 @@ void UdpCommandDemuxer::start()

 void UdpCommandDemuxer::stop()
 {
-	if (!isActive.load())
-		{ return; }
+	{
+		smo::SpinLock::Guard lock(isActiveAndShouldStopLock);
+		if (!isActive.load())
+			{ return; }

-	shouldStop.store(true);
+		shouldStop.store(true);
+	}

 	// Close socket and cleanup
 	if (cmdEndpointFdDesc)
@@ -232,6 +239,8 @@ void UdpCommandDemuxer::onDataReady(const boost::system::error_code &error)
 		return;
 	}

+	smo::SpinLock::Guard lock(isActiveAndShouldStopLock);
+
 	if (!isActive.load() || shouldStop.load())
 		{ return; }

@@ -295,26 +304,33 @@ void UdpCommandDemuxer::processIncomingData()
 	{
 		// Extract command set and command ID from the datagram
 		if (bytesReceived >= static_cast<ssize_t>(
-			sizeof(livoxProto1::comms::Header) + sizeof(livoxProto1::comms::Command)))
+			sizeof(livoxProto1::comms::Header)
+			+ sizeof(livoxProto1::comms::Command)))
 		{
-			uint8_t cmd_set = receiveBuffer[sizeof(livoxProto1::comms::Header)];
-			uint8_t cmd_id = receiveBuffer[sizeof(livoxProto1::comms::Header) + 1];
+			uint8_t cmd_set = receiveBuffer[
+				sizeof(livoxProto1::comms::Header)];
+			uint8_t cmd_id = receiveBuffer[
+				sizeof(livoxProto1::comms::Header) + 1];

-			// Found matching device in temporary collection, invoke matching handlers
+			// Found matching dev in temp collection, invoke matching handlers
 			for (const auto& cmdHandler : tempIt->second)
 			{
-				if (cmdHandler.cmd_set != cmd_set || cmdHandler.cmd_id != cmd_id)
-					{ continue; }
+				if (cmdHandler.cmd_set != cmd_set
+					|| cmdHandler.cmd_id != cmd_id)
+				{
+					continue;
+				}

 				try
 				{
-					cmdHandler.handler(receiveBuffer, bytesReceived, senderAddr);
+					cmdHandler.handler(
+						receiveBuffer, bytesReceived, senderAddr);
 				}
 				catch (const std::exception &e)
 				{
-					std::cerr
-						<< __func__ << ": Temporary device handler exception for IP "
-						<< sourceIP << ": " << e.what() << std::endl;
+					std::cerr << __func__ << ": Temporary device handler "
+						"exception for IP " << sourceIP << ": " << e.what()
+						<< std::endl;
 				}
 			}
 		}
@@ -6,6 +6,7 @@
 #include <memory>
 #include <boost/asio/posix/stream_descriptor.hpp>
 #include <componentThread.h>
+#include <spinLock.h>

 namespace livoxProto1 {

@@ -81,6 +82,7 @@ private:
 	uint16_t dataPort;

 	// State management
+	smo::SpinLock isActiveAndShouldStopLock;
 	std::atomic<bool> isActive{false};
 	std::atomic<bool> shouldStop{false};

@@ -1,19 +1,57 @@
-if(COMPILE_CL_CHECKS)
-	find_package(OpenCL REQUIRED)
+option(COMPILE_CL_CHECKS "Compile CL checks" OFF)

+if(COMPILE_CL_CHECKS)
+	# Find OpenCL: try find_package first, fall back to pkg-config
+	find_package(OpenCL QUIET)
+	if(OpenCL_FOUND)
+		# Normalize find_package variables to match pkg_check_modules naming
+		set(OPENCL_FOUND TRUE)
+		set(OPENCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
+		# Handle both OpenCL_LIBRARY (singular) and OpenCL_LIBRARIES (plural)
+		if(OpenCL_LIBRARIES)
+			set(OPENCL_LIBRARIES ${OpenCL_LIBRARIES})
+		else()
+			set(OPENCL_LIBRARIES ${OpenCL_LIBRARY})
+		endif()
+		set(OPENCL_LIBRARY_DIRS "")
+		message(STATUS "Found OpenCL using find_package")
+	else()
+		# Fall back to pkg-config
+		pkg_check_modules(OPENCL OpenCL)
+		if(NOT OPENCL_FOUND)
+			message(FATAL_ERROR
+				"Failed to find OpenCL: both find_package and "
+				"pkg_check_modules failed. Try installing the "
+				"'ocl-icd-opencl-dev' package (or the appropriate "
+				"OpenCL development package for your system)."
+			)
+		endif()
+		message(STATUS "Found OpenCL using pkg-config")
+	endif()
+
+	add_executable(clhostshmemptrcheck clhostshmemptrcheck.cpp)
+	target_include_directories(clhostshmemptrcheck
+		PUBLIC ${OPENCL_INCLUDE_DIRS})
+	target_link_libraries(clhostshmemptrcheck
+		${OPENCL_LIBRARIES})
 	add_executable(clshmemlatency clshmemlatency.cpp)
 	target_include_directories(clshmemlatency
-		PUBLIC ${OpenCL_INCLUDE_DIRS})
+		PUBLIC ${OPENCL_INCLUDE_DIRS})
 	target_link_libraries(clshmemlatency
-		${OpenCL_LIBRARY})
+		${OPENCL_LIBRARIES})
+	add_executable(clshmemlatency_callback clshmemlatency_callback.cpp)
+	target_include_directories(clshmemlatency_callback
+		PUBLIC ${OPENCL_INCLUDE_DIRS})
+	target_link_libraries(clshmemlatency_callback
+		${OPENCL_LIBRARIES})
 	add_executable(clshmemcheck clshmemcheck.cpp)
 	target_include_directories(clshmemcheck
-		PUBLIC ${OpenCL_INCLUDE_DIRS})
+		PUBLIC ${OPENCL_INCLUDE_DIRS})
 	target_link_libraries(clshmemcheck
-		${OpenCL_LIBRARY})
+		${OPENCL_LIBRARIES})
 	add_executable(clzerocopycheck clzerocopycheck.cpp)
 	target_include_directories(clzerocopycheck
-		PUBLIC ${OpenCL_INCLUDE_DIRS})
+		PUBLIC ${OPENCL_INCLUDE_DIRS})
 	target_link_libraries(clzerocopycheck
-		${OpenCL_LIBRARY})
+		${OPENCL_LIBRARIES})
 endif()
@@ -0,0 +1,125 @@
+#define CL_TARGET_OPENCL_VERSION 300
+#include <CL/cl.h>
+#include <iostream>
+#include <vector>
+#include <cstring>
+
+static const char* clErrorToStr(cl_int err)
+{
+    switch(err) {
+        case CL_SUCCESS:                            return "CL_SUCCESS";
+        case CL_INVALID_VALUE:                      return "CL_INVALID_VALUE";
+        case CL_INVALID_CONTEXT:                    return "CL_INVALID_CONTEXT";
+        case CL_INVALID_MEM_OBJECT:                 return "CL_INVALID_MEM_OBJECT";
+        case CL_OUT_OF_HOST_MEMORY:                 return "CL_OUT_OF_HOST_MEMORY";
+        case CL_INVALID_OPERATION:                  return "CL_INVALID_OPERATION";
+        case CL_MEM_OBJECT_ALLOCATION_FAILURE:      return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+        default:                                    return "UNKNOWN_ERROR";
+    }
+}
+
+// Try creating a USE_HOST_PTR buffer on this device
+bool testUseHostPtr(cl_context ctx, cl_device_id dev)
+{
+    const size_t bufSize = 1024;
+    std::vector<char> host(bufSize, 0);
+
+    cl_int err = 0;
+    cl_mem buf = clCreateBuffer(
+        ctx,
+        CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
+        bufSize,
+        host.data(),
+        &err
+    );
+
+    if(err != CL_SUCCESS) {
+        std::cerr << "  clCreateBuffer(CL_MEM_USE_HOST_PTR) failed: "
+                  << clErrorToStr(err) << "\n";
+        return false;
+    }
+
+    // Try to enqueue a trivial write to verify it works
+    cl_queue_properties queueProps[] = {CL_QUEUE_PROPERTIES, 0, 0};
+    cl_command_queue q = clCreateCommandQueueWithProperties(ctx, dev, queueProps, &err);
+    if(err != CL_SUCCESS){
+        std::cerr << "  Failed to create command queue: "
+                  << clErrorToStr(err) << "\n";
+        clReleaseMemObject(buf);
+        return false;
+    }
+
+    err = clEnqueueWriteBuffer(q, buf, CL_TRUE, 0, bufSize, host.data(), 0, nullptr, nullptr);
+    clFinish(q);
+
+    bool ok = (err == CL_SUCCESS);
+
+    if(!ok) {
+        std::cerr << "  clEnqueueWriteBuffer failed: " << clErrorToStr(err) << "\n";
+    }
+
+    clReleaseCommandQueue(q);
+    clReleaseMemObject(buf);
+
+    return ok;
+}
+
+int main()
+{
+    cl_uint numPlatforms = 0;
+    clGetPlatformIDs(0, nullptr, &numPlatforms);
+
+    if(numPlatforms == 0){
+        std::cout << "No OpenCL platforms.\n";
+        return 0;
+    }
+
+    std::vector<cl_platform_id> plats(numPlatforms);
+    clGetPlatformIDs(numPlatforms, plats.data(), nullptr);
+
+    for(cl_uint p = 0; p < numPlatforms; ++p)
+    {
+        char buf[256];
+
+        clGetPlatformInfo(plats[p], CL_PLATFORM_NAME, sizeof(buf), buf, nullptr);
+        std::cout << "Platform: " << buf << "\n";
+
+        cl_uint numDevs = 0;
+        clGetDeviceIDs(plats[p], CL_DEVICE_TYPE_ALL, 0, nullptr, &numDevs);
+
+        if(numDevs == 0) {
+            std::cout << "  No devices found on this platform.\n";
+            continue;
+        }
+
+        std::vector<cl_device_id> devs(numDevs);
+        clGetDeviceIDs(plats[p], CL_DEVICE_TYPE_ALL, numDevs, devs.data(), nullptr);
+
+        for(cl_uint d = 0; d < numDevs; ++d)
+        {
+            clGetDeviceInfo(devs[d], CL_DEVICE_NAME, sizeof(buf), buf, nullptr);
+            std::cout << "  Device: " << buf << "\n";
+
+            // Create a context for this device
+            cl_int err;
+            cl_context ctx = clCreateContext(nullptr, 1, &devs[d], nullptr, nullptr, &err);
+
+            if(err != CL_SUCCESS) {
+                std::cout << "    Failed to create context: "
+                          << clErrorToStr(err) << "\n";
+                continue;
+            }
+
+            bool supported = testUseHostPtr(ctx, devs[d]);
+
+            if(supported)
+                std::cout << "    HOST_PTR appears supported.\n";
+            else
+                std::cout << "    HOST_PTR appears NOT supported.\n";
+
+            clReleaseContext(ctx);
+        }
+    }
+
+    return 0;
+}
@@ -1,8 +1,10 @@
+#define CL_TARGET_OPENCL_VERSION 300
 #include <CL/cl.h>
 #include <iostream>
 #include <vector>
 #include <chrono>
 #include <cstring>
+#include <cstdlib>

 void checkCLError(cl_int err, const char* msg) {
    if (err != CL_SUCCESS) {
@@ -64,7 +66,8 @@ int main() {
            cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, bufSize, hostBuffer.data(), &err);
            checkCLError(err, "create buffer");

-            cl_command_queue q = clCreateCommandQueue(ctx, devices[d], 0, &err);
+            cl_queue_properties queueProps[] = {CL_QUEUE_PROPERTIES, 0, 0};
+            cl_command_queue q = clCreateCommandQueueWithProperties(ctx, devices[d], queueProps, &err);
            checkCLError(err, "create queue");

            // Simple host → device → host round-trip test
@@ -72,10 +75,11 @@ int main() {

            auto start = std::chrono::high_resolution_clock::now();

-	void* mapped = clEnqueueMapBuffer(q, buf, CL_TRUE, CL_MAP_READ, 0, bufSize, 0, nullptr, &evt, &err);
-	checkCLError(err, "map buffer");
-	clWaitForEvents(1, &evt);
+            void* mapped = clEnqueueMapBuffer(q, buf, CL_TRUE, CL_MAP_READ, 0, bufSize, 0, nullptr, &evt, &err);
+            checkCLError(err, "map buffer");
+            clWaitForEvents(1, &evt);

+            clEnqueueUnmapMemObject(q, buf, mapped, 0, nullptr, nullptr);
            clReleaseMemObject(buf);
            auto end = std::chrono::high_resolution_clock::now();
            std::chrono::duration<double, std::milli> elapsed = end - start;
@@ -1,8 +1,10 @@
+#define CL_TARGET_OPENCL_VERSION 300
 #include <CL/cl.h>
 #include <iostream>
 #include <vector>
 #include <chrono>
 #include <cstring>
+#include <cstdlib>

 void checkCLError(cl_int err, const char* msg) {
    if (err != CL_SUCCESS) {
@@ -70,7 +72,8 @@ int main() {
            cl_context ctx = clCreateContext(nullptr, 1, &devices[d], nullptr, nullptr, &err);
            checkCLError(err, "create context");

-            cl_command_queue q = clCreateCommandQueue(ctx, devices[d], 0, &err);
+            cl_queue_properties queueProps[] = {CL_QUEUE_PROPERTIES, 0, 0};
+            cl_command_queue q = clCreateCommandQueueWithProperties(ctx, devices[d], queueProps, &err);
            checkCLError(err, "create queue");

            // --------------------
@@ -118,7 +121,7 @@ int main() {

            // --------------------
            // Run a few iterations
-            for (int iter = 0; iter < 5; ++iter) {
+            for (int iter = 0; iter < 10; ++iter) {
                cl_event evt;
                auto t0 = std::chrono::high_resolution_clock::now();

@@ -0,0 +1,300 @@
+#define CL_TARGET_OPENCL_VERSION 300
+#include <CL/cl.h>
+#include <iostream>
+#include <vector>
+#include <chrono>
+#include <cstring>
+#include <cstdlib>
+#include <mutex>
+#include <condition_variable>
+
+void checkCLError(cl_int err, const char* msg) {
+    if (err != CL_SUCCESS) {
+        std::cerr << "OpenCL Error " << err << " at: " << msg << std::endl;
+        exit(1);
+    }
+}
+
+// Callback context for waiting on events
+struct CallbackContext {
+    std::mutex mtx;
+    std::condition_variable cv;
+    bool completed;
+    cl_int status;
+    std::chrono::high_resolution_clock::time_point* timestamp;
+};
+
+// Helper function to wait for callback completion
+void waitForCallback(CallbackContext& ctx) {
+    std::unique_lock<std::mutex> lock(ctx.mtx);
+    ctx.cv.wait(lock, [&ctx] { return ctx.completed; });
+std::cout <<"waitForCallback cv.wait() returned.\n";
+}
+
+// Static callback for map buffer event
+void CL_CALLBACK mapEventCallback(cl_event /*event*/, cl_int event_command_exec_status, void* user_data) {
+    CallbackContext* ctx = static_cast<CallbackContext*>(user_data);
+std::cout <<"mapEventCallback called and about to lock mutex.\n";
+    std::unique_lock<std::mutex> lock(ctx->mtx);
+    ctx->status = event_command_exec_status;
+    if (ctx->timestamp) {
+        *ctx->timestamp = std::chrono::high_resolution_clock::now();
+    }
+    ctx->completed = true;
+    ctx->cv.notify_one();
+std::cout <<"mapEventCallback just notified.\n";
+}
+
+// Static callback for kernel execution event
+void CL_CALLBACK kernelEventCallback(cl_event /*event*/, cl_int event_command_exec_status, void* user_data) {
+    CallbackContext* ctx = static_cast<CallbackContext*>(user_data);
+std::cout <<"mapEventCallback called and about to lock mutex.\n";
+    std::unique_lock<std::mutex> lock(ctx->mtx);
+    ctx->status = event_command_exec_status;
+    if (ctx->timestamp) {
+        *ctx->timestamp = std::chrono::high_resolution_clock::now();
+    }
+    ctx->completed = true;
+    ctx->cv.notify_one();
+std::cout <<"mapEventCallback just notified.\n";
+}
+
+// Static callback for unmap buffer event
+void CL_CALLBACK unmapEventCallback(cl_event /*event*/, cl_int event_command_exec_status, void* user_data) {
+    CallbackContext* ctx = static_cast<CallbackContext*>(user_data);
+std::cout <<"mapEventCallback called and about to lock mutex.\n";
+    std::unique_lock<std::mutex> lock(ctx->mtx);
+    ctx->status = event_command_exec_status;
+    if (ctx->timestamp) {
+        *ctx->timestamp = std::chrono::high_resolution_clock::now();
+    }
+    ctx->completed = true;
+    ctx->cv.notify_one();
+std::cout <<"mapEventCallback just notified.\n";
+}
+
+// --------------------
+// Kernel source
+// Simple mock kernel that simulates splitting XYZ/I
+// Each "point" is 16 bytes (XYZ + Intensity)
+const char* kernelSrc = R"CLC(
+__kernel void xyz_i_split(__global uchar* assembly,
+                          __global uchar* xyzOut,
+                          __global uchar* iOut,
+                          const uint numPoints) {
+    uint gid = get_global_id(0);
+    if (gid >= numPoints) return;
+
+    uint offset = gid * 16;
+    // Copy XYZ (12 bytes) to xyzOut
+    for (int i=0; i<12; ++i)
+        xyzOut[gid*12 + i] = assembly[offset + i];
+
+    // Copy Intensity (4 bytes) to iOut
+    for (int i=0; i<4; ++i)
+        iOut[gid*4 + i] = assembly[offset + 12 + i];
+}
+)CLC";
+
+int main() {
+    // --------------------
+    // CHANGE THIS VALUE to set number of points per assembly buffer
+    const size_t numPointsPerAssembly = 100000; // e.g., ~3333 points per fill
+    const size_t bytesPerPoint = 16;          // 12 bytes XYZ + 4 bytes I
+
+    const size_t assemblyBufSize = numPointsPerAssembly * bytesPerPoint;
+    const size_t xyzBufSize = numPointsPerAssembly * 12;
+    const size_t iBufSize = numPointsPerAssembly * 4;
+
+    cl_uint numPlatforms = 0;
+    checkCLError(clGetPlatformIDs(0, nullptr, &numPlatforms), "get num platforms");
+    std::vector<cl_platform_id> platforms(numPlatforms);
+    checkCLError(clGetPlatformIDs(numPlatforms, platforms.data(), nullptr), "get platforms");
+
+    std::cout << "Found " << numPlatforms << " OpenCL platforms\n\n";
+
+    for (cl_uint p = 0; p < numPlatforms; ++p) {
+        char platformName[256];
+        clGetPlatformInfo(platforms[p], CL_PLATFORM_NAME, sizeof(platformName), platformName, nullptr);
+        std::cout << "Platform " << p << ": " << platformName << "\n";
+
+        cl_uint numDevices = 0;
+        clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, 0, nullptr, &numDevices);
+        std::vector<cl_device_id> devices(numDevices);
+        clGetDeviceIDs(platforms[p], CL_DEVICE_TYPE_ALL, numDevices, devices.data(), nullptr);
+
+        for (cl_uint d = 0; d < numDevices; ++d) {
+            char deviceName[256];
+            clGetDeviceInfo(devices[d], CL_DEVICE_NAME, sizeof(deviceName), deviceName, nullptr);
+            std::cout << "  Device " << d << ": " << deviceName << "\n";
+
+            cl_int err;
+            cl_context ctx = clCreateContext(nullptr, 1, &devices[d], nullptr, nullptr, &err);
+            checkCLError(err, "create context");
+
+            cl_queue_properties queueProps[] = {CL_QUEUE_PROPERTIES, 0, 0};
+            cl_command_queue q = clCreateCommandQueueWithProperties(ctx, devices[d], queueProps, &err);
+            checkCLError(err, "create queue");
+
+            // --------------------
+            // Allocate host buffers
+            std::vector<unsigned char> assemblyHost(assemblyBufSize, 42);
+            std::vector<unsigned char> xyzHost(xyzBufSize, 0);
+            std::vector<unsigned char> iHost(iBufSize, 0);
+
+            std::vector<unsigned char> xyzHostCPU(xyzBufSize, 0);
+            std::vector<unsigned char> iHostCPU(iBufSize, 0);
+
+            // Create CL buffers
+            cl_mem assemblyBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, assemblyBufSize, assemblyHost.data(), &err);
+            checkCLError(err, "create assembly buffer");
+            cl_mem xyzBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, xyzBufSize, xyzHost.data(), &err);
+            checkCLError(err, "create xyz buffer");
+            cl_mem iBuf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, iBufSize, iHost.data(), &err);
+            checkCLError(err, "create i buffer");
+
+            // Build program
+            cl_program prog = clCreateProgramWithSource(ctx, 1, &kernelSrc, nullptr, &err);
+            checkCLError(err, "create program");
+
+            err = clBuildProgram(prog, 1, &devices[d], nullptr, nullptr, nullptr);
+            if (err != CL_SUCCESS) {
+                // Print build log
+                size_t logSize = 0;
+                clGetProgramBuildInfo(prog, devices[d], CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize);
+                std::vector<char> log(logSize);
+                clGetProgramBuildInfo(prog, devices[d], CL_PROGRAM_BUILD_LOG, logSize, log.data(), nullptr);
+                std::cerr << log.data() << "\n";
+            }
+            checkCLError(err, "build program");
+
+            cl_kernel kernel = clCreateKernel(prog, "xyz_i_split", &err);
+            checkCLError(err, "create kernel");
+
+            // Set kernel args
+            clSetKernelArg(kernel, 0, sizeof(cl_mem), &assemblyBuf);
+            clSetKernelArg(kernel, 1, sizeof(cl_mem), &xyzBuf);
+            clSetKernelArg(kernel, 2, sizeof(cl_mem), &iBuf);
+            clSetKernelArg(kernel, 3, sizeof(cl_uint), &numPointsPerAssembly);
+
+            const size_t globalWorkSize = numPointsPerAssembly;
+
+            // --------------------
+            // Run a few iterations
+            for (int iter = 0; iter < 10; ++iter) {
+                auto t0 = std::chrono::high_resolution_clock::now();
+                std::chrono::high_resolution_clock::time_point t1, t2, t3;
+
+                cl_event mapEvt;
+                void* mappedAssembly = clEnqueueMapBuffer(q, assemblyBuf, CL_FALSE, CL_MAP_READ, 0, assemblyBufSize, 0, nullptr, &mapEvt, &err);
+                checkCLError(err, "map assembly buffer");
+
+                // Retain event to keep it alive until callback completes
+                err = clRetainEvent(mapEvt);
+                checkCLError(err, "retain map event");
+
+                // Wait for map event using callback
+                CallbackContext mapCtx;
+                mapCtx.completed = false;
+                mapCtx.timestamp = &t1;
+                err = clSetEventCallback(mapEvt, CL_COMPLETE, mapEventCallback, &mapCtx);
+                checkCLError(err, "set map event callback");
+                // Force queue flush to ensure event processing
+                err = clFlush(q);
+                checkCLError(err, "flush queue");
+std::cout <<"About to waitForCalllback for clEnqueueMapBuffer.\n";
+                waitForCallback(mapCtx);
+                checkCLError(mapCtx.status, "map buffer");
+
+                // Release event after callback completes
+                err = clReleaseEvent(mapEvt);
+                checkCLError(err, "release map event");
+
+                cl_event kernelEvt;
+                err = clEnqueueNDRangeKernel(q, kernel, 1, nullptr, &globalWorkSize, nullptr, 0, nullptr, &kernelEvt);
+                checkCLError(err, "enqueue kernel");
+
+                // Retain event to keep it alive until callback completes
+                err = clRetainEvent(kernelEvt);
+                checkCLError(err, "retain kernel event");
+
+                // Wait for kernel event using callback
+                CallbackContext kernelCtx;
+                kernelCtx.completed = false;
+                kernelCtx.timestamp = &t2;
+                err = clSetEventCallback(kernelEvt, CL_COMPLETE, kernelEventCallback, &kernelCtx);
+                checkCLError(err, "set kernel event callback");
+                // Force queue flush to ensure event processing
+                err = clFlush(q);
+                checkCLError(err, "flush queue");
+std::cout <<"About to waitForCalllback for clEnqueueNDRangeKernel.\n";
+                waitForCallback(kernelCtx);
+                checkCLError(kernelCtx.status, "kernel execution");
+
+                // Release event after callback completes
+                err = clReleaseEvent(kernelEvt);
+                checkCLError(err, "release kernel event");
+
+                cl_event unmapEvt;
+                err = clEnqueueUnmapMemObject(q, assemblyBuf, mappedAssembly, 0, nullptr, &unmapEvt);
+                checkCLError(err, "unmap assembly buffer");
+
+                // Retain event to keep it alive until callback completes
+                err = clRetainEvent(unmapEvt);
+                checkCLError(err, "retain unmap event");
+
+                // Wait for unmap event using callback
+                CallbackContext unmapCtx;
+                unmapCtx.completed = false;
+                unmapCtx.timestamp = &t3;
+                err = clSetEventCallback(unmapEvt, CL_COMPLETE, unmapEventCallback, &unmapCtx);
+                checkCLError(err, "set unmap event callback");
+                // Force queue flush to ensure event processing
+                err = clFlush(q);
+                checkCLError(err, "flush queue");
+std::cout <<"About to waitForCalllback for clEnqueueUnmapMemObject.\n";
+                waitForCallback(unmapCtx);
+                checkCLError(unmapCtx.status, "unmap buffer");
+
+                // Release event after callback completes
+                err = clReleaseEvent(unmapEvt);
+                checkCLError(err, "release unmap event");
+
+                // --------------------
+                // Host CPU split
+                auto cpuStart = std::chrono::high_resolution_clock::now();
+                for (size_t pt = 0; pt < numPointsPerAssembly; ++pt) {
+                    size_t off = pt * 16;
+                    for (int i = 0; i < 12; ++i)
+                        xyzHostCPU[pt*12 + i] = assemblyHost[off + i];
+                    for (int i = 0; i < 4; ++i)
+                        iHostCPU[pt*4 + i] = assemblyHost[off + 12 + i];
+                }
+                auto cpuEnd = std::chrono::high_resolution_clock::now();
+
+                std::chrono::duration<double, std::milli> mapElapsed = t1 - t0;
+                std::chrono::duration<double, std::milli> kernelElapsed = t2 - t1;
+                std::chrono::duration<double, std::milli> unmapElapsed = t3 - t2;
+                std::chrono::duration<double, std::milli> cpuElapsed = cpuEnd - cpuStart;
+
+                std::cout << "Iteration " << iter
+                          << " | Map: " << mapElapsed.count()
+                          << " ms | Kernel: " << kernelElapsed.count()
+                          << " ms | Unmap: " << unmapElapsed.count()
+                          << " ms | CPU Split: " << cpuElapsed.count() << " ms\n";
+            }
+
+            // Cleanup
+            clReleaseKernel(kernel);
+            clReleaseProgram(prog);
+            clReleaseMemObject(assemblyBuf);
+            clReleaseMemObject(xyzBuf);
+            clReleaseMemObject(iBuf);
+            clReleaseCommandQueue(q);
+            clReleaseContext(ctx);
+        }
+        std::cout << std::endl;
+    }
+    return 0;
+}
+
@@ -1,3 +1,4 @@
+#define CL_TARGET_OPENCL_VERSION 300
 #include <CL/cl.h>
 #include <iostream>
 #include <vector>
@@ -32,8 +33,9 @@ int main() {
    cl_context ctx = clCreateContext(nullptr, 1, &dev, nullptr, nullptr, &err);
    CHECK(err, "clCreateContext");

-    cl_command_queue q = clCreateCommandQueue(ctx, dev, 0, &err);
-    CHECK(err, "clCreateCommandQueue");
+    cl_queue_properties queueProps[] = {CL_QUEUE_PROPERTIES, 0, 0};
+    cl_command_queue q = clCreateCommandQueueWithProperties(ctx, dev, queueProps, &err);
+    CHECK(err, "clCreateCommandQueueWithProperties");

    // Create program and kernel
    const size_t srcLen = std::strlen(kernelSrc);
@@ -97,7 +99,7 @@ int main() {
    // Validate
    bool ok = true;
    for (size_t i = 0; i < N; ++i)
-        if (outPtr[i] != 142 + i) ok = false;
+        if (outPtr[i] != static_cast<int>(142 + i)) ok = false;

    std::cout << (ok ? "✅ GPU saw host writes (zero-copy confirmed)\n"
                     : "❌ GPU did not see host writes (copying or staging occurred)\n");
@@ -1 +1,3 @@
-+edev|avia0|structural-qualeiface()|livoxGen1()|livoxProto1()|3JEDK380010Z39
+edev|avia0|mesh()|livoxGen1()|livoxProto1()|3JEDK380010Z39||
+edev|avia0|pcloudIntensity()|livoxGen1()|livoxProto1()|3JEDK380010Z39||
+edev|avia0|pcloudAmbience()|livoxGen1()|livoxProto1()|3JEDK380010Z39
@@ -31,7 +31,27 @@ Each stim-buff-api is designed to work with specific stim-iface libraries that u
 **Stim-Buff-API**: `livoxGen1-pcloudIntensity`
 **Quale-Iface-API**: `pcloudIntensity` - Processes intensity/reflectivity data from point clouds

-### 2. Point Cloud Coordinate Data Device (Extrospector)
+### 2. Point Cloud Ambience Data Device (Interoceptor)
+
+**Purpose**: Provides ambience data from the LiDAR point cloud, counting high-intensity points per slot.
+
+**Syntax**:
+```
+idev | avia0 | pcloudAmbience | livoxGen1-pcloud(high-val=120) | livoxProto1(command-timeout-ms=1000,retry-delay-ms=3000,smo-ip=192.168.1.50,smo-subnet-nbits=24) | 3JEDK380010Z39
+```
+
+**Stim-Buff-API**: `livoxGen1-pcloud`
+**Quale-Iface-API**: `pcloudAmbience` - Counts points with intensity >= threshold per slot
+
+**Ambience High Value Parameter** (for pcloudAmbience quale-iface-api):
+- **Parameter names**: `high-value` or `high-val` (synonyms)
+- **Purpose**: Threshold value for counting high-intensity points in the ambience buffer
+- **Default value**: 116
+- **Usage**: Points with intensity >= `ambienceHighVal` are counted in the ambience buffer per slot
+- **Configuration**: Specified in `stim-buff-api-params` when using `pcloudAmbience` quale interface
+- **Example**: `high-val=120` or `high-value=120`
+
+### 3. Point Cloud Coordinate Data Device (Extrospector)

 **Purpose**: Provides spatial coordinate data from the LiDAR point cloud.

@@ -40,6 +60,11 @@ Each stim-buff-api is designed to work with specific stim-iface libraries that u
 +edev | avia0 | pcloud(format=xyz) | livoxGen1-pcloud(data-rate-hz=10) | livoxProto1(command-timeout-ms=1000,retry-delay-ms=3000,smo-ip=192.168.1.50,smo-subnet-nbits=24) | 3JEDK380010Z39
 ```

+**Example with n-dgrams-per-frame parameter**:
+```
+edev | avia0 | pcloud(format=xyz) | livoxGen1-pcloud(data-rate-hz=10,n-dgrams-per-frame=84) | livoxProto1(command-timeout-ms=1000,retry-delay-ms=3000,smo-ip=192.168.1.50,smo-subnet-nbits=24) | 3JEDK380010Z39
+```
+
 **Alternative Format Examples**:
 ```
 +edev | avia0 | pcloud(format=spherical) | livoxGen1-pcloud(data-rate-hz=10) | livoxProto1(command-timeout-ms=1000,retry-delay-ms=3000,smo-ip=192.168.1.50,smo-subnet-nbits=24) | 3JEDK380010Z39
@@ -61,7 +86,7 @@ Each stim-buff-api is designed to work with specific stim-iface libraries that u
 **Alternative Format Parameter Names** (synonymous):
 - `format` or `fmt`

-### 3. IMU Gyroscope Data Device (Interoceptor)
+### 4. IMU Gyroscope Data Device (Interoceptor)

 **Purpose**: Provides gyroscope data from the LiDAR's internal IMU.

@@ -73,7 +98,7 @@ Each stim-buff-api is designed to work with specific stim-iface libraries that u
 **Stim-Buff-API**: `livoxGen1-gyro`
 **Quale-Iface-API**: `gyro` - Processes gyroscope angular velocity data

-### 4. IMU Accelerometer Data Device (Interoceptor)
+### 5. IMU Accelerometer Data Device (Interoceptor)

 **Purpose**: Provides accelerometer data from the LiDAR's internal IMU.

@@ -134,6 +159,7 @@ The `livoxProto1` provider accepts the following parameters:
 | Stim Feature | Stim-Buff-API | Quale-Iface-API | Description |
 |--------------|---------------|----------------|-------------|
 | Point Cloud Intensity | `livoxGen1-pcloudIntensity` | `pcloudIntensity` | Light intensity/reflectivity data |
+| Point Cloud Ambience | `livoxGen1-pcloud` | `pcloudAmbience` | High-intensity point count per slot |
 | Point Cloud Coordinates | `livoxGen1-pcloud` | `pcloud` | Spatial coordinate data |
 | Gyroscope | `livoxGen1-gyro` | `gyro` | Angular velocity measurements |
 | Accelerometer | `livoxGen1-accel` | `accel` | Linear acceleration measurements |
@@ -142,9 +168,11 @@ The `livoxProto1` provider accepts the following parameters:

 Each stim-buff-api accepts device-specific parameters:

-| Parameter | Description | Example |
-|-----------|-------------|---------|
-| `data-rate-hz` | Data sampling rate in Hz | `data-rate-hz=10` |
+| Parameter | Description | Default | Example |
+|-----------|-------------|---------|---------|
+| `data-rate-hz` | Data sampling rate in Hz | - | `data-rate-hz=10` |
+| `n-dgrams-per-frame` / `num-dgrams-per-frame` | Number of UDP datagrams per staging buffer frame | 84 | `n-dgrams-per-frame=84` or `num-dgrams-per-frame=84` |
+| `high-value` / `high-val` | Threshold for counting high-intensity points in ambience buffer (for `pcloudAmbience` quale-iface-api) | 116 | `high-val=120` or `high-value=120` |

 ### Quale-Iface-API Parameters

@@ -158,6 +186,8 @@ The `pcloud` quale-iface-api accepts format parameters:
 | `dual-cartesian` | Dual Cartesian coordinate system |
 | `dual-spherical` | Dual spherical coordinate system |

+The `pcloudAmbience` quale-iface-api uses the `high-value` / `high-val` parameter (documented in Stim-Buff-API Parameters above) to determine the intensity threshold for counting high-intensity points per slot.
+
 ## Device Discovery and Connection

 The specification uses a retry-based connection strategy with two different approaches:
@@ -0,0 +1,16 @@
+This guy talks about getting it to work using a fake transform:
+
+https://stackoverflow.com/questions/52420672/ros-rviz-how-to-visualize-a-point-cloud-that-doesnt-have-a-fixed-frame-transfo
+
+This thread contains some info about what a transform is:
+
+https://answers.ros.org/question/328839/
+
+Somewhat useful troubleshooter:
+
+https://www.youtube.com/watch?v=b9YZITmCWe4
+
+Excellent, full-featured explanation:
+
+https://www.youtube.com/watch?v=QyvHhY4Y_Y8
+
@@ -0,0 +1,122 @@
+# The reason why Rusticl behaves so weirdly with USE_HOST_PTR
+
+```
+[18:21] == rusticluser [~oftc-webi@2803:1500:c00:eb3:c450:9864:8f21:f2fb] has joined #rusticl
+[18:22] <rusticluser> Hey guys, I have questions about the implementation of clEnqueueMapBuffer/clEnqueueUnmapMemObject in Rusticl.
+[18:22] <rusticluser> This webpage says I should ping karolherbst
+[18:22] <rusticluser> https://docs.mesa3d.org/rusticl.html
+[18:23] <rusticluser> I am finding some very odd behaviour on the Raspberry Pi 5, when using the v3d GPU via Rusticl
+[18:24] <rusticluser> (Gimme a bit to write up my questions)
+[18:25] == pbrobinson [~pbrobinso@2001:8b0:fb11:2681:e9:f8b:31b:f797] has joined #rusticl
+[18:29] <rusticluser> Here's a dump of the output from running `RUSTICL_ENABLE=v3d clinfo` on my Raspberry Pi 5: https://gist.github.com/latentPrion/9843ff5b98f21b20b9f6d5bce43006b3
+[18:30] <rusticluser> Of particular note is that it says that the V3D GPU has a unified memory architecture with the main ARM CPU complex:
+[18:30] <rusticluser> >   Unified memory for Host and Device              Yes
+[18:32] <rusticluser> Because all of my target platforms seem to have unified memory with the CL GPUs, I decided that I would aim to optimize my program by using CL_MEM_USE_HOST_PTR, and avoiding using clEnqueueRead/WriteBuffer. I have indeed got it working on both the RPi5 and on my x86 laptop, but some of the things that were required to get it working on the RPi5+Rusticl implementation are a bit odd, and I wanted to confirm whether these behaviours and apparent eccentricities are
+[18:32] <rusticluser> intentional
+[18:34] <rusticluser> Here is my code, for your perusal and reference.
+[18:34] <rusticluser> https://gist.github.com/latentPrion/d9fb3f0604a957d2055786a118072482
+[18:36] <rusticluser> So: the long and short of it is: I have an input buffer (called "assemblyBuffer") that was filled with data by io_uring. I create an openCL buffer for assemblyBuff, using CL_MEM_USE_HOST_PTR. I then want to pass this assemblyBuffer into an OpenCL kernel.
+[18:37] <rusticluser>  The OpenCL kernel doesn't see the data that was written into the buffer unless I use CL_MAP_WRITE_INVALIDATE. I can understand the reasoning behind this, if the reasoning is that the cache invalidation op is performed on the GPU side.
+[18:38] <rusticluser> That makes sense because the GPU's caches may hold stale data that prevent it from seeing the data I put into the HOST_PTR buffer. So the need to invalidate the GPU's caches makes perfect sense and I'm not complaining about this.
+[18:39] <rusticluser> It's the next bit that is a bit confusing to me, and which I suspect is a bug in RustIcl or the MESA driver behind it.
+[18:40] <rusticluser> I have a 2nd buffer, called the "collateBuffer", which is distinct from the "assemblyBuffer". I run a 2nd kernel after the first kernel, which takes the assemblyBuffer as input, and produces its output into the collationBuffer.
+[18:42] <rusticluser> Now, since the 1st kernel wrote its output data into the assemblyBuffer, this should mean that the GPU's caches should be up to date with the data that was just written into the assemblyBuffer by the 1st kernel -- because it was the GPU itself which wrote that data into the assembyBuffer
+[18:43] <rusticluser> Yet, for some reason, I'm still required to remap the assemblyBuffer with CL_MEM_WRITE_INVALIDATE_REGION when I want to run the 2nd kernel.
+[18:43] <rusticluser> 1. I have not modified the assemblyBuffer's data at all on the host CPU. The data in the assemblyBuffer is exactly what was written into it by the 1st kernel when it was running on the GPU.
+[18:44] <rusticluser> 2. The 2nd kernel doesn't write into, or modify the assemblyBuffer at all in any way. The 2nd kernel uses the assemblyBuffer as input *ONLY*.
+[18:44] <rusticluser>  
+[18:46] <rusticluser> I guess my question is: why am I required to first map and unmap the assemblyBuffer as CL_MAP_WRITE_INVALIDATE_REGION before the GPU can see the contents of the assemblyBuffer, even though the GPU itself just wrote that data into it, and the GPU's caches should be in sync with it?
+[18:47] <rusticluser> (You can see the remapping with CL_MAP_WRITE_INVALIDATE_REGION for the 2nd kernel's execution here: https://gist.github.com/latentPrion/d9fb3f0604a957d2055786a118072482#file-openclcollatingandmeshingengine-cpp-L343)
+[18:48] <rusticluser> Technically, I should be able to just map it as CL_MAP_WRITE without needing to specify INVALIDATE_REGION -- am I incorrect?
+[18:49] <rusticluser> Basically what you see in that pasted gist is what is required to get this to work on the RPi5, so any decisions you see in the code are constrained by either (1) Rusticl, (2) MESA drivers, (3) the RPi5 hardware
+[18:51] <rusticluser> I downloaded the Mesa source code and asked Cursor to scan it and find out what's going on (I don't know Rust, so I can't read the code myself very well) and Cursor says that there's an interediate layer of "shadow buffering" implemented by Rusticl between the host and GPU
+[18:52] <rusticluser> And that this intermediate shadow buffering layer is the source of the unexpected behaviours
+[19:11] <karolherbst> rusticluser: launching kernels on mapped buffers is undefined behavior
+[19:14] <karolherbst> though not sure if that's what you run into, just sounded like it
+[19:17] <rusticluser> karolherbst: Yea, but I don't keep them mapped -- notice that I map and then immediately unmap
+[19:18] <rusticluser> Literally: mapBuffer(); unmapBuffer() back to back lol -- good pointer though
+[19:18] <karolherbst> I'm a bit confused by the code, how do you verify that the GPU is or isn't reading the correct data?
+[19:18] <karolherbst> or do you access it through the host pointer directly?
+[19:19] <rusticluser> karolherbst: I check using printf() (OpenCL 1.2 extension) inside of the running kernel, and also I check the resulting output after the kernel has been executed
+[19:19] <karolherbst> ahh
+[19:19] <rusticluser> Would you like to see the kernels? They're just clutter for your headspace, but maybe they might give you some kind of information I don't know about
+[19:19] <karolherbst> USE_HOST_PTR is a bit weird, because it doesn't guarnatee coherency
+[19:20] <rusticluser> Yea -- I can understand that: the real thing that a developer who's using USE_HOST_PTR wants from the underlying implementation is something like this workflow:
+[19:22] <rusticluser> (1) clEnqueueMapBuffer(CL_MAP_WRITE) => /* (2) I write stuff into the buffer */ => (3) clEnqueueUnmapMemObject() /* At this point, during the unmap operation, the CL implementation is expected to write-back the host CPU's caches to main memory, and then invalidate the GPU's caches so that the GPU can see the writes that were stored to main memory
+[19:23] <rusticluser> And for the read-side, the workflow that the developer intuitively expects is:
+[19:25] <rusticluser> (1) clEnqueueMapBuffer(CL_MAP_READ) /* This mapping call should cause the GPU to write-back to main memory, and should cause the host CPU to invalidate its caches so it can see what was written by the GPU */ => (2) /* I read the stuff from the buffer */ => (3) clEnqueueUnmapMemObject() /* No special maintenance required here */
+[19:26] <karolherbst> right.. I think it's potentially also an issue with the rpi driver. It's not really well tested, so random bugs could always exist there. Might want to verify that your application behaves correctly on other GPUs
+[19:27] <rusticluser> Yea -- I only have this RPi5 as an ARM testbed, sadly. The other test machine I have is this shitty Intel Core I5 laptop with an Intel HD GPU. The Intel HD GPU doesn't require any mapping/unmapping of any kind -- the cache coherency domain seems to fully cover the GPU on the Intel laptop
+[19:28] <rusticluser> Idk, maybe it's a bug, maybe it's not -- I guess I was checking to see if the behaviour I was seeing was intentional and I just didn't properly understand the memory/execution model of OpenCL; or whether it's actually a bug somewhere in the underlying implementation's stack
+[19:28] <karolherbst> the intel is the only driver that ever added support for actually mapping host memory into the GPU when it's not page aligned
+[19:29] <rusticluser> Ah -- my HOST_PTRs are aligned to _SC_PAGE_SIZE
+[19:29] <karolherbst> I don't think the rpi driver supports mapping host memory at all
+[19:29] <rusticluser> :(
+[19:29] <karolherbst> yeah...
+[19:30] <karolherbst> not sure if it's because of missing kernel interfaces or what's the reason there
+[19:30] <rusticluser> How can I check and see? I have no understanding of GFX drivers and I hear they're a real domain-specific kind of mess to read;
+[19:30] <rusticluser> At minimum, which "module" in the mesa code provides the RPi5 opencl driver/support?
+[19:31] <karolherbst> well I know that it doesn't support it on the mesa side, but I haven't checked if there is in theory a kernel interface for it or not
+[19:31] <karolherbst> `src/gallium/drivers/v3d/` is the drive inside mesa
+[19:31] <rusticluser> *nod*, thanks
+[19:31] <rusticluser> Is this worth filing a ticket/issue for?
+[19:32] <karolherbst> _not_ sure. Maybe if there is a strong interest to also implement the GL/vulkan features allowing for mapping host memory
+[19:33] <rusticluser> Alright -- I'll just keep my eye on it and if it becomes an unmanageable problem, I'll file a ticket and probably also try to add the support myself
+[19:34] <rusticluser> These new LLMs really enable you to extend yourself into new domains and contribute to stuff you otherwise wouldn't have the time/insight to be able to, so if it really becomes unmanageable, I'll probably be able to just fix it and submit a patch
+[19:34] <karolherbst> though it should still work in theory, so not really sure what's going wrong there
+[19:34] <rusticluser> It should lol -- the purpose of the clEnqueueMap/Unmap calls isn't to actually "map" anything -- it's purely to manage the cache synchronization between the host CPU complex and the GPU
+[19:35] <karolherbst> but I'd verify if your application behaves as expected on other hardware/drivers as well, maybe even on discrete GPUs
+[19:35] <rusticluser> AFAICT, it's probably just a bug in the cache management
+[19:35] <rusticluser> It definitely won't work on a GPU that doesn't have shared memory because the design is explicitly for USE_HOST_PTR
+[19:36] <karolherbst> then it's broken also for shared memory systems
+[19:36] <rusticluser> Hmmm -- could you elaborate on that?
+[19:37] <karolherbst> USE_HOST_PTR doesn't really allow for different use csaes as it doesn't really gurantee anything except that the pointer returned by mapBuffer matches the host pointer
+[19:37] <karolherbst> aand that's all the additional guarantee it gives you
+[19:38] <karolherbst> you still have to use it as if it wouldn't be a host ptr allocation, because synchronization points are the same as with non host ptr allocations
+[19:38] <rusticluser> Yes, indeed: but it's also explicitly different from CL_MEM_ALLOC_HOST_PTR, I think? The difference is that CL_MEM_ALLOC_HOST_PTR is likely to be mapping in device MMIO registers
+[19:38] <karolherbst> alloc host ptr just means that the allocation is done in host memory instead of VRAM
+[19:38] <karolherbst> maybe
+[19:38] <karolherbst> it's just a hint
+[19:39] <karolherbst> like it uses GART infrastructure and the GPU just accesses memory over PCIe (if a discrete GPU)
+[19:39] <karolherbst> for unified memory GPU it shouldn't make any difference
+[19:39] <rusticluser> I'm sorry -- am I wrong? CL_MEM_ALLOC_HOST_PTR means only that the buffer returned will be *ACCESSIBLE* by the host. This means that the buffer could be MMIO mapped registers, or some other such memory range
+[19:40] <rusticluser> It doesn't actually mean that the buffer is allocated from host mem
+[19:40] <rusticluser> It just means that the buffer will be *ACCESSIBLE* from host mem, __POTENTIALLY__ without a copy
+[19:40] <karolherbst> it has nothing to do with access
+[19:41] <rusticluser> https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clCreateBuffer.html:
+[19:41] <karolherbst> sure, but it means something else
+[19:41] <rusticluser> > This flag specifies that the application wants the OpenCL implementation to allocate memory from host accessible memory.  CL_MEM_ALLOC_HOST_PTR and CL_MEM_USE_HOST_PTR are mutually exclusive.
+[19:41] <rusticluser> Ah ok lol
+[19:41] <karolherbst> like you can't access the memory allocation either way directly, because you have to map
+[19:42] <karolherbst> though CL_MEM_ALLOC_HOST_PTR is more of a "please don't use VRAM, so that reading out the memory on the host is quick"
+[19:42] <rusticluser> It seems like the reason why they say that ALLOC_HOST_PTR and USE_HOST_PTR are mutually exclusive is *precisely because* ALLOC_HOST_PTR is not guaranteed to be allocated within host memory lol
+[19:42] <karolherbst> well.. you have no control over what address the mapping will have
+[19:43] <karolherbst> USE_HOST_PTR already uses host memory, so alloc_host_ptr is meaningless
+[19:43] <rusticluser> I am fairly certain that MEM_ALLOC_HOST_PTR means, "You may use VRAM if you wish, but ensure that it's a portion of your internal VRAM that can be exposed and mapped as MMIO. You may also use host RAM if you wish -- both are fine"
+[19:43] <rusticluser> [19:43] <karolherbst> USE_HOST_PTR already uses host memory, so alloc_host_ptr is meaningless
+[19:43] <rusticluser> ^ Absolutely correct
+[19:43] <rusticluser> Wait whoa no
+[19:44] <karolherbst> VRAM can always be mapped into host memory, it's just slow
+[19:44] <karolherbst> and you have to fight with PCI bar sizes
+[19:44] <karolherbst> though you can also set different caching hints etc..
+[19:45] <rusticluser> When I say "VRAM" here, I was mimicking your language, but a more accurate term would be "device memory" because there's no guarantee that the OpenCL device is indeed a GPU, or that it exposes all of its global, local or private memory in an MMIO or host-accessible fashion lol
+[19:45] <rusticluser> Ok errm, I don't think arguing over this will go very far lol
+[19:46] <rusticluser> But I really appreciate your pointers -- I'll look for another test board
+[19:46] <rusticluser> Really appreciate your time -- I know this is a volunteer effort on your part
+[19:47] <fdobridge_> <leftmostcat> Heheh. Pointers.
+[19:47] == rusticluser [~oftc-webi@2803:1500:c00:eb3:c450:9864:8f21:f2fb]
+[19:47] ==  realname : OFTC WebIRC Client
+[19:47] ==  channels : #rusticl
+[19:47] ==  server   : weber.oftc.net [Newark, NJ, USA]
+[19:47] ==  realhost :  [ip: actually using host]
+[19:47] ==  idle     : 0 days 0 hours 1 minutes 20 seconds [connected: Wed Nov 12 18:21:37 2025]
+[19:47] == End of WHOIS
+[19:49] <karolherbst> yeah anyway.. on the rpi5 driver might as well not use use_host_ptr because rusticl will have to copy things around to fake host_ptr support anyway. So might as well then not use it. But I also wanted to implement more optimized map/unmap paths for single device context with unified memory, because atm it's asuming worst case and isn't really
+[19:49] <karolherbst> optimized very well anyway
+[19:49] <karolherbst> but those optimizations will also paper over correctness issues
+[19:51] <karolherbst> though I'm also not convinced that the emulation code is 100% correct...
+[19:52] <karolherbst> there _might_ be a bug if the mapping has different accesses, but I never found anything that ran into issues here
+[19:54] <karolherbst> you could run with `RUSTICL_DEBUG=memory` and see if the prints make any sense. It should tell when the memory content is migrated and moved around
+[20:00] <rusticluser> karolherbst: Ah that's awesome info, thanks
+[20:01] <rusticluser> It would be really useful to have an explicit confirmation of whether I'm actually getting zero-copy
+```
@@ -54,6 +54,11 @@ public:
 		return nTotal == 0;
 	}

+	void setRemainingIterationsToFailure()
+	{
+		nFailed.store(nTotal - nSucceeded.load());
+	}
+
 public:
 	unsigned int nTotal;
 	std::atomic<unsigned int> nSucceeded, nFailed;
@@ -80,14 +80,25 @@ public:
 	{
 	public:
 		explicit Guard(SpinLock& lock)
-		: lock_(lock)
+		: lock_(lock), unlocked_(false)
 		{
 			lock_.acquire();
 		}

 		~Guard()
 		{
-			lock_.release();
+			if (!unlocked_) {
+				lock_.release();
+			}
+		}
+
+		void unlockPrematurely()
+		{
+			if (!unlocked_)
+			{
+				lock_.release();
+				unlocked_ = true;
+			}
 		}

 		// Non-copyable, non-movable
@@ -98,6 +109,7 @@ public:

 	private:
 		SpinLock& lock_;
+		bool unlocked_;
 	};

 private:
@@ -0,0 +1,134 @@
+#ifndef _USER_COMPUTE_H
+#define _USER_COMPUTE_H
+
+#include <memory>
+#include <vector>
+#include <string_view>
+#define CL_TARGET_OPENCL_VERSION 120
+#include <CL/cl.h>
+
+namespace smo {
+namespace compute {
+
+// Helper function to validate OpenCL version
+bool validateOpenClVersion(
+	std::string_view versionStr, std::string_view versionType,
+	int minMajor, int minMinor);
+
+/**
+ * @brief OpenCL compute device information
+ *
+ * Manages a single OpenCL device, creating and owning its context and command
+ * queue.
+ */
+class ComputeDevice
+{
+public:
+	/**
+	 * @brief Construct a ComputeDevice from platform and device IDs
+	 *
+	 * Creates the OpenCL context and command queue for the device.
+	 * Throws std::runtime_error if context or queue creation fails.
+	 *
+	 * @param platformId OpenCL platform ID
+	 * @param deviceId OpenCL device ID
+	 */
+	ComputeDevice(cl_platform_id platformId, cl_device_id deviceId);
+
+	~ComputeDevice()
+	{
+		if (commandQueue)
+		{
+			clReleaseCommandQueue(commandQueue);
+			commandQueue = nullptr;
+		}
+		if (context)
+		{
+			clReleaseContext(context);
+			context = nullptr;
+		}
+	}
+
+	// Non-copyable
+	ComputeDevice(const ComputeDevice&) = delete;
+	ComputeDevice& operator=(const ComputeDevice&) = delete;
+
+	cl_platform_id platform;
+	cl_device_id device;
+	cl_context context;
+	cl_command_queue commandQueue;
+};
+
+/**
+ * @brief Association between an OpenCL buffer and a compute device
+ */
+struct ClBufferDeviceAssociation
+{
+	ClBufferDeviceAssociation(
+		cl_mem buf, const std::shared_ptr<ComputeDevice>& dev)
+	: buffer(buf), device(dev)
+	{}
+
+	cl_mem buffer;
+	std::shared_ptr<ComputeDevice> device;
+};
+
+/**
+ * @brief OpenCL buffer created on all compute devices
+ *
+ * Manages a USE_HOST_PTR buffer created on all available compute devices.
+ * The constructor creates buffers for all devices, and the destructor releases
+ * them.
+ */
+class ClBuffer
+{
+public:
+	/**
+	 * @brief Construct a ClBuffer and create buffers on all devices
+	 *
+	 * Creates a USE_HOST_PTR buffer on each device's context.
+	 * Throws std::runtime_error if buffer creation fails for any device.
+	 *
+	 * @param hostPtr Host pointer to use
+	 * @param size Size of buffer in bytes
+	 * @param flags Additional OpenCL memory flags
+	 * @param devices Vector of compute devices to create buffers on
+	 */
+	ClBuffer(
+		void* hostPtr, size_t size, cl_mem_flags flags,
+		const std::vector<std::shared_ptr<ComputeDevice>>& devices);
+
+	~ClBuffer()
+	{
+		for (auto& assoc : associations)
+		{
+			if (assoc.buffer)
+			{
+				clReleaseMemObject(assoc.buffer);
+				assoc.buffer = nullptr;
+			}
+		}
+	}
+
+	// Non-copyable
+	ClBuffer(const ClBuffer&) = delete;
+	ClBuffer& operator=(const ClBuffer&) = delete;
+
+	/**
+	 * @brief Get the cl_mem handle for a specific compute device
+	 * @param device The compute device to find the buffer for
+	 * @return The cl_mem handle for the device, or nullptr if not found
+	 */
+	cl_mem getAssociatedBufferHandleForDevice(
+		const std::shared_ptr<ComputeDevice>& device) const;
+
+	void* hostPtr;
+	size_t size;
+	cl_mem_flags flags;
+	std::vector<ClBufferDeviceAssociation> associations;
+};
+
+} // namespace compute
+} // namespace smo
+
+#endif // _USER_COMPUTE_H
@@ -114,6 +114,36 @@ public:
                + it->second + "' as integer: " + e.what());
        }
    }
+
+    /**
+     * @brief Parse an optional integer parameter from a parameter list using synonyms
+     * @param params The parameter vector to search in
+     * @param synonymNames The collection of synonymous parameter names to try
+     * @param defaultValue The default value to return if no parameter is found
+     * @return The parsed integer value, or defaultValue if none found
+     * @note Synonyms are tried in reverse order; lattermost synonym wins if multiple are present
+     */
+    static int parseOptionalParamAsIntWithSynonyms(
+		const std::vector<std::pair<std::string,std::string>>& params,
+		const std::vector<std::string>& synonymNames,
+		int defaultValue
+		)
+    {
+        // Loop through synonyms in reverse order; lattermost synonym wins.
+        for (auto synIt = synonymNames.rbegin();
+            synIt != synonymNames.rend(); ++synIt)
+        {
+            const auto& paramName = *synIt;
+            try {
+                return parseRequiredParamAsInt(params, paramName);
+            } catch (const std::exception&) {
+                // Parameter not found or parse error, continue to next synonym
+                continue;
+            }
+        }
+
+        return defaultValue;
+    }
 };

 class InteroceptorDevAttachmentSpec : public DeviceAttachmentSpec
@@ -62,4 +62,3 @@ public:

 #endif // _LIVOX_GEN1_FRAME_ASSEMBLY_DESC_H

-
@@ -6,14 +6,24 @@
 #include <string>
 #include <functional>
 #include <memory>
+#include <vector>
 #include <preprocessor.h>
 #include <user/deviceAttachmentSpec.h>
 #include <callback.h>
+#define CL_TARGET_OPENCL_VERSION 120
+#include <CL/cl.h>
+
+class OptionParser;

 namespace smo {

 class ComponentThread;

+namespace compute {
+class ClBuffer;
+class ComputeDevice;
+} // namespace compute
+
 namespace stim_buff {

 /**
@@ -81,6 +91,47 @@ struct SmoCallbacks
     * equivalent to calling ComponentThread::getSelf().
     */
    std::shared_ptr<ComponentThread> (*ComponentThread_getSelf)(void);
+
+    /**
+     * @brief Get the OptionParser singleton instance
+     * @return Reference to the OptionParser singleton
+     *
+     * This function provides access to the OptionParser singleton instance,
+     * equivalent to calling OptionParser::getOptions().
+     */
+    OptionParser& (*OptionParser_getOptions)(void);
+
+    /**
+     * @brief Create a USE_HOST_PTR buffer on all OpenCL contexts
+     * @param hostPtr Host pointer to the memory
+     * @param size Size of the buffer in bytes
+     * @param flags Additional OpenCL memory flags
+     * @return Shared pointer to ClBuffer managing buffers on all devices
+     */
+    std::shared_ptr<smo::compute::ClBuffer>
+	(*ComputeManager_createUseHostPtrBuffer)(
+        void* hostPtr, size_t size, cl_mem_flags flags);
+
+    /**
+     * @brief Release USE_HOST_PTR buffers from all contexts
+     * @param buffer Shared pointer to ClBuffer to release
+     */
+    void (*ComputeManager_releaseUseHostPtrBuffer)(
+        std::shared_ptr<smo::compute::ClBuffer> buffer);
+
+    /**
+     * @brief Get a compute device
+     * @return Shared pointer to ComputeDevice, or nullptr if no devices available
+     */
+    std::shared_ptr<smo::compute::ComputeDevice>
+	(*ComputeManager_getDevice)(void);
+
+    /**
+     * @brief Release a compute device
+     * @param device Shared pointer to ComputeDevice to release
+     */
+    void (*ComputeManager_releaseDevice)(
+		std::shared_ptr<smo::compute::ComputeDevice> device);
 };

 struct Sal_Mgmt_LibOps
@@ -2,13 +2,20 @@
 #define _SP_MC_RING_BUFFER_H

 #include <vector>
-#include <memory>
 #include <cstddef>
 #include <stdexcept>
-#include <algorithm>
+#include <string>
+#include <new>
+#include <memory>
+#include <user/stimulusFrame.h>
+#include <user/frameAssemblyDesc.h>
 #include <user/sequenceLock.h>
+#include <user/senseApiDesc.h>
+#define CL_TARGET_OPENCL_VERSION 120
+#include <CL/cl.h>

 namespace smo {
+namespace stim_buff {

 /**
 * @brief Single-producer, multi-consumer ring buffer w/per-slot sequence locks
@@ -20,120 +27,130 @@ namespace smo {
 */
 class SpMcRingBuffer
 {
-public:
-	class InputEngineConstraints
-	{
-	public:
-		InputEngineConstraints(
-			size_t slotStartAlignmentNBytes_,
-			size_t slotPadToNBytes_)
-		: slotStartAlignmentNBytes(slotStartAlignmentNBytes_),
-		  slotPadToNBytes(slotPadToNBytes_)
-		{}
-
-		~InputEngineConstraints() = default;
-
-		// Input-engine layout/constraints
-		size_t slotStartAlignmentNBytes;  // power-of-2 alignment (e.g., 4096)
-		size_t slotPadToNBytes;       // minimum size per slot
-	};
-
 public:
 	/**	EXPLANATION:
-	 * Constructor initializes the ring buffer with the given constraints and
-	 * number of slots. Calculates stride and allocates data buffer and sequence
-	 * locks array.
+	 * Constructor initializes the ring buffer with FrameAssemblyDesc.
+	 * Allocates frames vector with properly constructed StimulusFrame instances,
+	 * each initialized with a SlotDesc from the FrameAssemblyDesc.
 	 */
 	explicit SpMcRingBuffer(
-		size_t nSlots_,
-		const InputEngineConstraints& constraints_)
-	: nSlots(nSlots_), strideNBytes(0), bufferNBytes(0),
-	  constraints(constraints_)
+		const std::shared_ptr<FrameAssemblyDesc> &frameAssemblyDesc_,
+		const SmoCallbacks& callbacks,
+		cl_mem_flags flags)
+	:
+	nBuffers(frameAssemblyDesc_ ? frameAssemblyDesc_->slots.size() : 0),
+	frameAssemblyDesc(frameAssemblyDesc_),
+	slots(nBuffers),  // Default-construct all frames
+	producerNextUsableIndex(0)
 	{
-		if (nSlots == 0)
+		if (!frameAssemblyDesc)
 		{
 			throw std::invalid_argument(std::string(__func__)
-				+ ": SpMcRingBuffer: nSlots must be > 0");
+				+ ": SpMcRingBuffer: frameAssemblyDesc must not be null");
 		}

-		computeStrideAndBufferSize();
-		// Allocate data buffer: bufferNBytes (aligned up to alignment)
-		data.resize(bufferNBytes);
-		// Initialize sequence locks array: one lock per slot
-		// Use unique_ptr array since SequenceLock is not copyable or movable
-		sequenceLocks = std::make_unique<SequenceLock[]>(nSlots);
+		if (nBuffers == 0)
+		{
+			throw std::invalid_argument(std::string(__func__)
+				+ ": SpMcRingBuffer: frameAssemblyDesc must have at least one "
+				"slot");
+		}
+
+		// Re-invoke constructors w/placement new on default-constructed frames
+		for (size_t i = 0; i < nBuffers; ++i)
+		{
+			slots[i].~StimulusFrame();  // Destroy default-constructed object
+			new (&slots[i]) StimulusFrame(
+				frameAssemblyDesc->slots[i], callbacks, flags, i);
+		}
 	}

 	~SpMcRingBuffer() = default;

-	// Non-copyable, movable
+	// Non-copyable, non-movable (slots are non-movable)
 	SpMcRingBuffer(const SpMcRingBuffer&) = delete;
 	SpMcRingBuffer& operator=(const SpMcRingBuffer&) = delete;
-	SpMcRingBuffer(SpMcRingBuffer&&) = default;
-	SpMcRingBuffer& operator=(SpMcRingBuffer&&) = default;
+	SpMcRingBuffer(SpMcRingBuffer&&) = delete;
+	SpMcRingBuffer& operator=(SpMcRingBuffer&&) = delete;

 public:
 	/**
-	 * @brief Get a reference to data at the specified slot
+	 * @brief Get a reference to the StimulusFrame at the specified slot
 	 *
-	 * @tparam T The type of data stored in the slot
 	 * @param slotIndex The index of the slot (0-based)
-	 * @return Reference to T at the slot
-	 * @throws std::out_of_range if slotIndex >= nSlots
+	 * @return Reference to StimulusFrame at the slot
+	 * @throws std::out_of_range if slotIndex >= nBuffers
 	 */
-	template<typename T>
-	T& getDataAtSlot(size_t slotIndex)
+	StimulusFrame& getDataAtSlot(size_t slotIndex)
 	{
-		if (slotIndex >= nSlots)
+		if (slotIndex >= nBuffers)
 		{
 			throw std::out_of_range(std::string(__func__)
-				+ ": SpMcRingBuffer: slotIndex must be < nSlots");
+				+ ": SpMcRingBuffer: slotIndex must be < nBuffers");
 		}
-
-		size_t offset = slotIndex * strideNBytes;
-		return *reinterpret_cast<T*>(data.data() + offset);
+		return slots[slotIndex];
 	}

 	SequenceLock& getSequenceLockAtSlot(size_t slotIndex)
 	{
-		if (slotIndex >= nSlots)
+		if (slotIndex >= nBuffers)
 		{
 			throw std::out_of_range(std::string(__func__)
-				+ ": SpMcRingBuffer: slotIndex must be < nSlots");
+				+ ": SpMcRingBuffer: slotIndex must be < nBuffers");
 		}
-		return sequenceLocks[slotIndex];
+		return slots[slotIndex].lock;
 	}

-private:
-	void computeStrideAndBufferSize()
+	/**
+	 * @brief Get the next index to produce into, atomically incrementing it
+	 *
+	 * Uses sequence lock to perform an emulated fetch_add with modulo nBuffers
+	 * applied, ensuring the returned index is always < nBuffers.
+	 *
+	 * @return The index to produce into (always < nBuffers)
+	 */
+	size_t getIndexToProduceInto()
 	{
-		// Stride is the maximum of alignment and padding
-		strideNBytes = std::max(
-			constraints.slotStartAlignmentNBytes,
-			constraints.slotPadToNBytes);
-
-		// Buffer size is nSlots * strideNBytes, aligned up to alignment
-		size_t rawSize = nSlots * strideNBytes;
-		bufferNBytes = ((rawSize + constraints.slotStartAlignmentNBytes - 1)
-		                / constraints.slotStartAlignmentNBytes)
-		               * constraints.slotStartAlignmentNBytes;
+		producerNextUsableIndexLock.writeAcquire();
+		size_t currentIndex = producerNextUsableIndex;
+		size_t nextIndex = (currentIndex + 1) % nBuffers;
+		producerNextUsableIndex = nextIndex;
+		producerNextUsableIndexLock.writeRelease();
+		return currentIndex;
 	}

-	// Buffer data
-	std::vector<uint8_t> data;
-
-	// Sequence locks array: one lock per slot
-	// Use unique_ptr array since SequenceLock is not copyable or movable
-	std::unique_ptr<SequenceLock[]> sequenceLocks;
+	/**
+	 * @brief Abort production by setting the producer index to a specific value
+	 *
+	 * @param index The index to set (must be < nBuffers)
+	 * @throws std::out_of_range if index >= nBuffers
+	 */
+	void abortProduction(size_t index)
+	{
+		if (index >= nBuffers)
+		{
+			throw std::out_of_range(std::string(__func__)
+				+ ": SpMcRingBuffer: index must be < nBuffers");
+		}
+		producerNextUsableIndexLock.writeAcquire();
+		producerNextUsableIndex = index;
+		producerNextUsableIndexLock.writeRelease();
+	}

 public:
 	// Layout/invariants
-	size_t nSlots;
-	size_t strideNBytes;
-	size_t bufferNBytes;
-	InputEngineConstraints constraints;
+	size_t nBuffers;
+
+private:
+	// FrameAssemblyDesc describing the memory layout
+	std::shared_ptr<FrameAssemblyDesc> frameAssemblyDesc;
+	// Frames vector: each frame contains a sequence lock and SlotDesc
+	std::vector<StimulusFrame> slots;
+	SequenceLock producerNextUsableIndexLock;
+	size_t producerNextUsableIndex;
 };

+} // namespace stim_buff
 } // namespace smo

 #endif // _SP_MC_RING_BUFFER_H
@@ -3,21 +3,18 @@

 #include <memory>
 #include <cstdint>
-#include <functional>
 #include <atomic>
-#include <vector>
 #include <string>
 #include <sstream>
-#include <algorithm>
 #include <sys/mman.h>
 #include <sys/uio.h>
-#include <unistd.h>
-
-#include "frameAssemblyDesc.h"

 namespace smo {
 namespace stim_buff {

+// Forward declaration
+class FrameAssemblyDesc;
+
 /**
 * StagingBuffer manages a large buffer to guide io_uring in assembling some
 * number of Livox Avia pcloud UDP dgrams into a single stim frame.
@@ -31,53 +28,61 @@ namespace stim_buff {
 class StagingBuffer
 {
 public:
-	class InputEngineConstraints
+	class IOEngineConstraints
 	{
 	public:
-		InputEngineConstraints(
+		// Default constructor creates uninitialized constraints
+		IOEngineConstraints() = default;
+
+		IOEngineConstraints(
 			size_t slotStartAlignmentByteVal_,
-			size_t slotPadToNBytes_)
+			size_t slotPadToNBytes_,
+			size_t frameStartAlignmentByteVal_,
+			size_t framePadToNBytes_)
 		: slotStartAlignmentByteVal(slotStartAlignmentByteVal_),
-		  slotPadToNBytes(slotPadToNBytes_)
+		slotPadToNBytes(slotPadToNBytes_),
+		frameStartAlignmentByteVal(frameStartAlignmentByteVal_),
+		framePadToNBytes(framePadToNBytes_)
 		{}

-		~InputEngineConstraints() = default;
+		~IOEngineConstraints() = default;

-		// Input-engine layout/constraints
-		size_t slotStartAlignmentByteVal;  // power-of-2 alignment (e.g., 4096)
-		size_t slotPadToNBytes;            // minimum size per datagram slot
+		size_t slotStartAlignmentByteVal, slotPadToNBytes,
+			frameStartAlignmentByteVal, framePadToNBytes;

-		// Static defaults for io_uring
-		static const InputEngineConstraints ioUringConstraints;
+		// Static defaults for io_uring and OpenCL
+		static const IOEngineConstraints ioUringConstraints;
+		static const IOEngineConstraints openClInputConstraints;

 		inline std::string stringify() const
 		{
 			std::ostringstream oss;
-			oss << "InputEngineConstraints{"
+			oss << "IOEngineConstraints{"
 				<< "slotStartAlignmentByteVal=" << slotStartAlignmentByteVal
 				<< ", slotPadToNBytes=" << slotPadToNBytes
+				<< ", frameStartAlignmentByteVal=" << frameStartAlignmentByteVal
+				<< ", framePadToNBytes=" << framePadToNBytes
 				<< "}";
 			return oss.str();
 		}
 	};

-	class OutputEngineConstraints
-	{
-	public:
-		OutputEngineConstraints() = default;
-		~OutputEngineConstraints() = default;
-	};
-
 public:
+	/**	EXPLANATION:
+	 * Default constructor creates uninitialized buffer.
+	 * Must be properly initialized using placement new with the parameterized constructor.
+	 */
+	StagingBuffer() = default;
+
 	/**	EXPLANATION:
 	 * We use the input and output engine constraints to determine the total
 	 * amount of memory required internally to assemble a single frame with
 	 * the given number of points per frame.
 	 */
 	explicit StagingBuffer(
-		const InputEngineConstraints& inputEngineConstraints,
-		const OutputEngineConstraints& outputEngineConstraints,
-		size_t nDgramsPerFrame);
+		const IOEngineConstraints& inputEngineConstraints,
+		const IOEngineConstraints& outputEngineConstraints,
+		size_t nSlots);
 	~StagingBuffer() = default;

 	// Non-copyable, movable
@@ -111,11 +116,25 @@ public:
 		return iov;
 	}

+	/**	EXPLANATION:
+	 * Returns an iovec for OpenCL engine buffer access.
+	 * The buffer is mmap()-allocated and suitable for CL_MEM_USE_HOST_PTR.
+	 * Returns pointer to first slot (offset by firstSlotOffsetNBytes) and
+	 * size from first slot to end of buffer.
+	 */
+	struct iovec getClEngineIovec() const
+	{
+		struct iovec iov;
+		iov.iov_base = buffer.get() + firstSlotOffsetNBytes;
+		iov.iov_len = bufferNBytes - firstSlotOffsetNBytes;
+		return iov;
+	}
+
 	inline std::string stringify() const
 	{
 		std::ostringstream oss;
 		oss << "StagingBuffer{"
-			<< "nDgramsPerFrame=" << nDgramsPerFrame
+			<< "nSlots=" << nSlots
 			<< ", bufferNBytes=" << bufferNBytes
 			<< ", slotStrideNBytes=" << slotStrideNBytes
 			<< ", constraints=" << inputConstraints.stringify()
@@ -125,17 +144,26 @@ public:

 private:
 	void computeSlotStrideAndBufferSize();
+	static size_t calculateFirstSlotOffsetAndValidate(
+		uint8_t* buffer,
+		size_t bufferNBytes,
+		size_t nSlots,
+		size_t slotStrideNBytes,
+		const IOEngineConstraints& inputConstraints);

 	// Custom deleter for mmap-allocated buffer
 	struct MmapDeleter
 	{
 		size_t size;
+		// Default constructor for use with default-constructed StagingBuffer
+		MmapDeleter() : size(0) {}
 		MmapDeleter(size_t s) : size(s) {}
 		
 		void operator()(uint8_t* ptr) const
 		{
 			if (ptr != nullptr && size > 0)
 			{
+				munlock(ptr, size);
 				munmap(ptr, size);
 			}
 		}
@@ -148,9 +176,14 @@ private:
 	size_t bufferNBytes;

 	// Layout/invariants
-	size_t nDgramsPerFrame;
+	size_t nSlots;
+
+public:
 	size_t slotStrideNBytes;
-	InputEngineConstraints inputConstraints;
+	size_t firstSlotOffsetNBytes;  // offset from buffer start to first slot
+
+private:
+	IOEngineConstraints inputConstraints;

 	// Descriptor (computed once; reused across frames)
 	mutable std::shared_ptr<FrameAssemblyDesc> frameDesc;
@@ -160,90 +193,6 @@ private:
 	std::atomic<bool> assemblingFlag;
 };

-/** Inline implementations
- ******************************************************************************/
-
-inline StagingBuffer::StagingBuffer(
-	const InputEngineConstraints& inputEngineConstraints_,
-	const OutputEngineConstraints& /*outputEngineConstraints*/,
-	size_t nDgramsPerFrame)
-: buffer(nullptr, MmapDeleter(0)), bufferNBytes(0),
-nDgramsPerFrame(nDgramsPerFrame), slotStrideNBytes(0),
-inputConstraints(inputEngineConstraints_),
-assemblingFlag(false)
-{
-	if (nDgramsPerFrame == 0)
-	{
-		throw std::invalid_argument(std::string(__func__)
-			+ ": StagingBuffer: nDgramsPerFrame must be > 0");
-	}
-
-	computeSlotStrideAndBufferSize();
-
-	/* Allocate buffer using mmap() for io_uring registration
-	 * MAP_ANONYMOUS | MAP_PRIVATE creates anonymous, non-file-backed memory
-	 */
-	void* mmapped = mmap(
-		nullptr, bufferNBytes,
-		PROT_READ | PROT_WRITE,
-		MAP_ANONYMOUS | MAP_PRIVATE,
-		-1, 0);
-
-	if (mmapped == MAP_FAILED)
-	{
-		throw std::runtime_error(std::string(__func__)
-			+ ": StagingBuffer: mmap() failed");
-	}
-
-	buffer = std::unique_ptr<uint8_t, MmapDeleter>(
-		static_cast<uint8_t*>(mmapped), MmapDeleter(bufferNBytes));
-	currentNBytes.store(0);
-
-	// Build FrameAssemblyDesc once
-	std::vector<FrameAssemblyDesc::SlotDesc> slots;
-	slots.reserve(nDgramsPerFrame);
-	uint8_t *frameBase = buffer.get();
-	for (size_t i = 0; i < nDgramsPerFrame; ++i)
-	{
-		size_t off = i * slotStrideNBytes;
-		FrameAssemblyDesc::SlotDesc s{
-			off, frameBase + off, inputConstraints.slotPadToNBytes};
-
-		slots.push_back(s);
-	}
-
-	frameDesc = std::make_shared<FrameAssemblyDesc>(
-		nDgramsPerFrame, inputConstraints.slotPadToNBytes, bufferNBytes,
-		std::move(slots));
-}
-
-inline void StagingBuffer::computeSlotStrideAndBufferSize()
-{
-	// Slot stride is the maximum of alignment and padding
-	slotStrideNBytes = std::max(
-		inputConstraints.slotStartAlignmentByteVal,
-		inputConstraints.slotPadToNBytes);
-
-	// Buffer size is nDgramsPerFrame * slotStrideNBytes, aligned up to alignment
-	size_t rawSize = nDgramsPerFrame * slotStrideNBytes;
-	bufferNBytes = ((rawSize + inputConstraints.slotStartAlignmentByteVal - 1)
-	                / inputConstraints.slotStartAlignmentByteVal)
-	               * inputConstraints.slotStartAlignmentByteVal;
-}
-
-/** Specific input/output engine constraints
- ******************************************************************************/
-
-class OpenClConstraints
-:	public StagingBuffer::OutputEngineConstraints
-{
-public:
-	OpenClConstraints()
-	:	StagingBuffer::OutputEngineConstraints()
-	{}
-	~OpenClConstraints() = default;
-};
-
 } // namespace stim_buff
 } // namespace smo

@@ -1,63 +1,52 @@
 #ifndef _STIMULUS_BUFFER_H
 #define _STIMULUS_BUFFER_H

-#include <boostAsioLinkageFix.h>
+#include <config.h>
 #include <vector>
 #include <memory>
-#include <cstdint>
-#include <atomic>
-#include <mutex>
-#include <functional>
-#include <iostream>
-#include <config.h>
-#include <boost/asio/io_service.hpp>
-#include <boost/asio/deadline_timer.hpp>
-#include <spinLock.h>
-#include <asynchronousBridge.h>
 #include <user/spMcRingBuffer.h>
-#include "stimFrame.h"
+#include <user/stagingBuffer.h>
+#include <user/frameAssemblyDesc.h>
+#include <user/senseApiDesc.h>
+#include "stimulusFrame.h"
 #include "deviceAttachmentSpec.h"
+#define CL_TARGET_OPENCL_VERSION 120
+#include <CL/cl.h>

 namespace smo {
 namespace stim_buff {

+// Forward declaration
+class StimulusProducer;
+
 /**
- * StimulusBuffer manages a collection of stimulus frames with simultaneity stamps.
+ * StimulusBuffer manages a collection of stimulus frames and ring buffer.
 *
- * This buffer is designed to hold stimulus frames that have been assembled
- * from raw sensor data (e.g., Livox Avia point cloud data) and are ready
- * for processing by the mind layer.
- *
- * The buffer provides thread-safe operations for adding frames, retrieving
- * frames, and managing the buffer state.
+ * This buffer holds the actual frame storage and ring buffer for stimulus
+ * data. It maintains a reference to its parent StimulusProducer.
 */
 class StimulusBuffer
 {
-public:
-	class PcloudFormatDesc
-	{
-	public:
-		enum class Format
-		{
-			XYZ,
-			XYZI,
-		};
-
-	public:
-		Format format;
-	};
-
 public:
 	explicit StimulusBuffer(
+		StimulusProducer& parent,
 		const std::shared_ptr<device::DeviceAttachmentSpec>
 			&deviceAttachmentSpec,
-		size_t nSlots,
-		const SpMcRingBuffer::InputEngineConstraints& ringBufferConstraints,
-		boost::asio::io_service& ioService_)
-	: deviceAttachmentSpec(deviceAttachmentSpec),
-	ringBuffer(nSlots, ringBufferConstraints),
-	ioService(ioService_),
-	shouldContinue(false), timer(ioService)
+		int histbuffMs,
+		const StagingBuffer::IOEngineConstraints& inputEngineConstraints,
+		const StagingBuffer::IOEngineConstraints& outputEngineConstraints,
+		const SmoCallbacks& callbacks,
+		cl_mem_flags flags)
+	: parent(parent),
+	deviceAttachmentSpec(deviceAttachmentSpec),
+	histbuffMs(histbuffMs),
+	stagingBuffer(
+		inputEngineConstraints,
+		outputEngineConstraints,
+		static_cast<size_t>(histbuffMs / CONFIG_STIMBUFF_FRAME_PERIOD_MS)),
+	ringBuffer(
+		static_cast<std::shared_ptr<FrameAssemblyDesc>>(stagingBuffer),
+		callbacks, flags)
 	{}

 	virtual ~StimulusBuffer() = default;
@@ -68,44 +57,12 @@ public:
 	StimulusBuffer(StimulusBuffer&&) = default;
 	StimulusBuffer& operator=(StimulusBuffer&&) = default;

-	// Control methods
-	virtual void start()
-	{
-		std::cout << __func__ << ": Starting stimulus buffer for device "
-			<< deviceAttachmentSpec->deviceSelector << std::endl;
-
-		shouldContinue.store(true);
-		scheduleNextTimeout();
-	}
-
-	virtual void stop();
-
-protected:
-	// Virtual functions for derived classes to override
-	virtual int getStopDelayMs() const
-	{
-		return CONFIG_STIMBUFF_FRAME_PERIOD_MS;
-	}
-
-	virtual void stimFrameProductionTimesliceInd() = 0;
-
-private:
-	void onTimeout(const boost::system::error_code& error);
-
 public:
+	StimulusProducer& parent;
 	std::shared_ptr<device::DeviceAttachmentSpec> deviceAttachmentSpec;
-	std::vector<StimFrame> frames_;
-
-protected:
-	SpinLock frameAssemblyRateLimiter;
+	int histbuffMs;
+	StagingBuffer stagingBuffer;
 	SpMcRingBuffer ringBuffer;
-
-private:
-	boost::asio::io_service& ioService;
-	std::atomic<bool> shouldContinue;
-	boost::asio::deadline_timer timer;
-
-	void scheduleNextTimeout(int delayMs = CONFIG_STIMBUFF_FRAME_PERIOD_MS);
 };

 } // namespace stim_buff
@@ -1,7 +1,14 @@
-#ifndef _STIM_FRAME_H
-#define _STIM_FRAME_H
+#ifndef _ATTACHMENT_SUPPORT_STIMULUS_FRAME_H
+#define _ATTACHMENT_SUPPORT_STIMULUS_FRAME_H

 #include <cstdint>
+#include <memory>
+#include <user/frameAssemblyDesc.h>
+#include <user/sequenceLock.h>
+#include <user/compute.h>
+#include <user/senseApiDesc.h>
+#define CL_TARGET_OPENCL_VERSION 120
+#include <CL/cl.h>

 namespace smo {
 namespace stim_buff {
@@ -58,13 +65,59 @@ namespace stim_buff {
 */
 typedef uint64_t SimultaneityStamp;

-class StimFrame
+class StimulusFrame
 {
 public:
+	/**	EXPLANATION:
+	 * Default constructor creates uninitialized frame.
+	 * Must be properly initialized using placement new with the parameterized constructor.
+	 */
+	StimulusFrame() = default;
+
+	StimulusFrame(
+		const FrameAssemblyDesc::SlotDesc& slotDesc_,
+		const SmoCallbacks& callbacks,
+		cl_mem_flags flags,
+		size_t ringBufferIndex_)
+	: slotDesc(slotDesc_),
+	ringBufferIndex(ringBufferIndex_)
+	{
+		if (!callbacks.ComputeManager_createUseHostPtrBuffer)
+		{
+			throw std::runtime_error(std::string(__func__)
+				+ ": StimulusFrame: ComputeManager_createUseHostPtrBuffer "
+				"callback is null");
+		}
+
+		clBuffer = callbacks.ComputeManager_createUseHostPtrBuffer(
+			slotDesc.vaddr, slotDesc.nBytes, flags);
+
+		if (!clBuffer)
+		{
+			throw std::runtime_error(std::string(__func__)
+				+ ": StimulusFrame: failed to create clBuffer");
+		}
+
+//		std::cout << __func__ << ": StimulusFrame: created clBuffer with size " << slotDesc.nBytes << " bytes @ " << (const void*)slotDesc.vaddr << std::endl;
+	}
+
+	~StimulusFrame() = default;
+
+	// Non-copyable, movable
+	StimulusFrame(const StimulusFrame&) = delete;
+	StimulusFrame& operator=(const StimulusFrame&) = delete;
+	StimulusFrame(StimulusFrame&&) = default;
+	StimulusFrame& operator=(StimulusFrame&&) = default;
+
+public:
+	SequenceLock lock;
 	SimultaneityStamp simultaneityStamp;
+	FrameAssemblyDesc::SlotDesc slotDesc;
+	std::shared_ptr<smo::compute::ClBuffer> clBuffer;
+	size_t ringBufferIndex;
 };

 } // namespace stim_buff
 } // namespace smo

-#endif // _STIM_FRAME_H
+#endif // _ATTACHMENT_SUPPORT_STIMULUS_FRAME_H
@@ -0,0 +1,119 @@
+#ifndef _STIMULUS_PRODUCER_H
+#define _STIMULUS_PRODUCER_H
+
+#include <boostAsioLinkageFix.h>
+#include <vector>
+#include <memory>
+#include <cstdint>
+#include <atomic>
+#include <mutex>
+#include <functional>
+#include <iostream>
+#include <chrono>
+#include <config.h>
+#include <boost/asio/io_service.hpp>
+#include <boost/asio/deadline_timer.hpp>
+#include <spinLock.h>
+#include "deviceAttachmentSpec.h"
+
+namespace smo {
+namespace stim_buff {
+
+// Forward declaration
+class StimulusBuffer;
+
+/**
+ * StimulusProducer manages a collection of stimulus frames with simultaneity stamps.
+ *
+ * This producer is designed to hold stimulus frames that have been assembled
+ * from raw sensor data (e.g., Livox Avia point cloud data) and are ready
+ * for processing by the mind layer.
+ *
+ * The producer provides thread-safe operations for adding frames, retrieving
+ * frames, and managing the producer state.
+ */
+class StimulusProducer
+{
+public:
+	explicit StimulusProducer(
+		const std::shared_ptr<device::DeviceAttachmentSpec>
+			&deviceAttachmentSpec,
+		boost::asio::io_service& ioService_)
+	: deviceAttachmentSpec(deviceAttachmentSpec),
+	ioService(ioService_),
+	shouldContinue(false), timer(ioService),
+	nDeferrals(0)
+	{}
+
+	virtual ~StimulusProducer() = default;
+
+	// Non-copyable, movable
+	StimulusProducer(const StimulusProducer&) = delete;
+	StimulusProducer& operator=(const StimulusProducer&) = delete;
+	StimulusProducer(StimulusProducer&&) = default;
+	StimulusProducer& operator=(StimulusProducer&&) = default;
+
+	// Control methods
+	virtual void start()
+	{
+		std::cout << __func__ << ": Starting stimulus producer for device "
+			<< deviceAttachmentSpec->deviceSelector << std::endl;
+
+		shouldContinue = true;
+		nDeferrals = 0;
+		scheduleNextTimeout();
+	}
+
+	virtual void stop();
+
+	void allowNextStimulusFrame()
+		{ frameAssemblyRateLimiter.release(); }
+
+	virtual std::shared_ptr<StimulusBuffer> getAttachedStimulusBuffer(
+		const std::shared_ptr<device::DeviceAttachmentSpec>& spec) const;
+
+	virtual std::shared_ptr<StimulusBuffer> getOrCreateAttachedStimulusBuffer(
+		const std::shared_ptr<device::DeviceAttachmentSpec>
+			&deviceAttachmentSpec) = 0;
+
+	virtual void destroyAttachedStimulusBuffer(
+		const std::shared_ptr<StimulusBuffer>& buffer);
+
+	// Check if any attached buffer has the specified qualeIfaceApi
+	bool hasBufferWithQualeIfaceApi(const std::string& qualeIfaceApi) const;
+
+protected:
+	SpinLock frameAssemblyRateLimiter;
+
+	// Virtual functions for derived classes to override
+	virtual int getStopDelayMs() const
+	{
+		return CONFIG_STIMBUFF_FRAME_PERIOD_MS;
+	}
+
+	virtual void stimFrameProductionTimesliceInd() = 0;
+
+private:
+	void onTimeout(const boost::system::error_code& error);
+
+public:
+	std::shared_ptr<device::DeviceAttachmentSpec> deviceAttachmentSpec;
+	std::vector<std::shared_ptr<StimulusBuffer>> attachedStimulusBuffers;
+
+private:
+	boost::asio::io_service& ioService;
+protected:
+	SpinLock shouldContinueLock;
+	bool shouldContinue;
+private:
+	boost::asio::deadline_timer timer;
+	size_t nDeferrals;
+	std::chrono::high_resolution_clock::time_point deferralStartTime;
+
+	void scheduleNextTimeout(int delayMs = CONFIG_STIMBUFF_FRAME_PERIOD_MS);
+};
+
+} // namespace stim_buff
+} // namespace smo
+
+#endif // _STIMULUS_PRODUCER_H
@@ -34,6 +34,9 @@ add_library(smocore STATIC
    # SenseApis
    stimBuffApis/stimBuffApiManager.cpp

+    # ComputeManager
+    computeManager/computeManager.cpp
+
    # MindManager
    mindManager/mindManager.cpp
 )
@@ -47,6 +50,7 @@ target_include_directories(smocore PUBLIC
    ${CMAKE_CURRENT_SOURCE_DIR}/include
    ${CMAKE_CURRENT_BINARY_DIR}
 	${Boost_INCLUDE_DIRS}
+	${OPENCL_INCLUDE_DIRS}
 )

 # Link against pthread for CPU affinity functions
@@ -55,4 +59,9 @@ target_link_libraries(smocore PRIVATE
 	Threads::Threads
 	Boost::system
 	Boost::log
+	${OPENCL_LIBRARIES}
+	attachmentSupport
+)
+target_link_directories(smocore PRIVATE
+	${OPENCL_LIBRARY_DIRS}
 )
@@ -106,9 +106,7 @@ void MindThread::main(MindThread& self)

 		if (sendExceptionInd)
 		{
-			mrntt::mrntt.finalizeReq(
-				{nullptr, std::bind(
-				&mrntt::marionetteFinalizeReqCb, std::placeholders::_1)});
+			mrntt::mrntt.exceptionInd();
 		}
 	}

@@ -0,0 +1,192 @@
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <algorithm>
+#include <computeManager/computeManager.h>
+#include <user/compute.h>
+
+namespace smo {
+namespace compute {
+
+void ComputeManager::initialize()
+{
+	if (initialized) { return; }
+
+	cl_int err;
+
+	// Get number of platforms
+	cl_uint numPlatforms = 0;
+	err = clGetPlatformIDs(0, nullptr, &numPlatforms);
+	if (err != CL_SUCCESS)
+	{
+		throw std::runtime_error(
+			std::string(__func__) + ": failed to get OpenCL platforms: " +
+			std::to_string(err));
+	}
+	if (numPlatforms == 0)
+	{
+		throw std::runtime_error(
+			std::string(__func__) + ": no OpenCL platforms found");
+	}
+
+	// Get all platforms
+	std::vector<cl_platform_id> platforms(numPlatforms);
+	err = clGetPlatformIDs(numPlatforms, platforms.data(), nullptr);
+	if (err != CL_SUCCESS)
+	{
+		throw std::runtime_error(
+			std::string(__func__) + ": failed to enumerate OpenCL platforms: " +
+			std::to_string(err));
+	}
+
+	// Enumerate devices for each platform
+	for (cl_uint p = 0; p < numPlatforms; ++p)
+	{
+		cl_platform_id platform = platforms[p];
+
+		// Check platform version
+		char platformVersion[128];
+		err = clGetPlatformInfo(
+			platform, CL_PLATFORM_VERSION,
+			sizeof(platformVersion), platformVersion, nullptr);
+
+		if (err == CL_SUCCESS)
+		{
+			if (!validateOpenClVersion(platformVersion, "platform", 1, 2))
+			{
+				std::cout << __func__ << ": skipping platform " << p
+					<< " with incompatible OpenCL version "
+					<< std::string(platformVersion) << std::endl;
+				continue;
+			}
+		}
+
+		// Get number of devices
+		cl_uint numDevices = 0;
+		err = clGetDeviceIDs(
+			platform, CL_DEVICE_TYPE_ALL, 0, nullptr, &numDevices);
+
+		if (err != CL_SUCCESS || numDevices == 0)
+		{
+			std::cout << __func__ << ": skipping platform " << p
+				<< " with no devices" << std::endl;
+			continue;
+		}
+
+		// Get all devices
+		std::vector<cl_device_id> platformDevices(numDevices);
+		err = clGetDeviceIDs(
+			platform, CL_DEVICE_TYPE_ALL, numDevices,
+			platformDevices.data(), nullptr);
+
+		if (err != CL_SUCCESS)
+		{
+			throw std::runtime_error(
+				std::string(__func__) + ": failed to enumerate devices for "
+				"platform " + std::to_string(p) + ": " + std::to_string(err));
+		}
+
+		// Create ComputeDevice for each device
+		for (cl_uint d = 0; d < numDevices; ++d)
+		{
+			cl_device_id device = platformDevices[d];
+
+			// Check device version
+			char deviceVersion[128];
+			err = clGetDeviceInfo(
+				device, CL_DEVICE_VERSION,
+				sizeof(deviceVersion), deviceVersion, nullptr);
+
+			if (err == CL_SUCCESS)
+			{
+				if (!validateOpenClVersion(deviceVersion, "device", 1, 2))
+				{
+					std::cout << __func__ << ": skipping device " << d
+						<< " with incompatible OpenCL version "
+						<< std::string(deviceVersion) << std::endl;
+					continue;
+				}
+			}
+
+			// Create ComputeDevice (constructor creates context and queue)
+			try
+			{
+				auto deviceObj = std::make_shared<ComputeDevice>(
+					platform, device);
+				devices.push_back(deviceObj);
+			}
+			catch (const std::runtime_error& e)
+			{
+				// Re-throw with more context about which device/platform
+				throw std::runtime_error(
+					std::string(__func__) + ": failed to create ComputeDevice "
+					"for device " + std::to_string(d) + " on platform " +
+					std::to_string(p) + ": " + e.what());
+			}
+		}
+	}
+
+	if (devices.empty())
+	{
+		throw std::runtime_error(
+			std::string(__func__) + ": no compatible OpenCL devices found");
+	}
+
+	initialized = true;
+	std::cout << __func__ << ": Initialized with " << devices.size()
+		<< " compute device(s)" << std::endl;
+}
+
+void ComputeManager::finalize()
+{
+	if (!initialized) { return; }
+
+	// Release all devices (their shared_ptrs will clean up contexts/queues)
+	devices.clear();
+	initialized = false;
+	std::cout << __func__ << ": Finalized" << std::endl;
+}
+
+std::shared_ptr<ClBuffer>
+ComputeManager::createUseHostPtrBuffer(
+	void* hostPtr, size_t size, cl_mem_flags flags)
+{
+	if (!initialized)
+	{
+		std::cerr << __func__ << ": ComputeManager not initialized"
+			<< std::endl;
+		throw std::runtime_error(
+			std::string(__func__) + ": ComputeManager not initialized");
+	}
+
+	return std::make_shared<ClBuffer>(hostPtr, size, flags, devices);
+}
+
+void ComputeManager::releaseUseHostPtrBuffer(std::shared_ptr<ClBuffer> buffer)
+{
+	// No-op: ClBuffer's destructor handles cleanup automatically
+	// This function exists for API compatibility
+	(void)buffer;
+}
+
+std::shared_ptr<ComputeDevice> ComputeManager::getDevice()
+{
+	if (!initialized || devices.empty()) {
+		return nullptr;
+	}
+
+	// Return first available device
+	// In the future, this will filter based on ComputeDeviceConstraints
+	return devices[0];
+}
+
+void ComputeManager::releaseDevice(std::shared_ptr<ComputeDevice> device)
+{
+	// Placeholder for future refcounting implementation
+	// Devices are only removed in finalize()
+	(void)device;
+}
+
+} // namespace compute
+} // namespace smo
@@ -0,0 +1,105 @@
+#ifndef _COMPUTE_MANAGER_H
+#define _COMPUTE_MANAGER_H
+
+#include <memory>
+#include <vector>
+#include <user/compute.h>
+
+namespace smo {
+namespace compute {
+
+/**
+ * @brief Centralized OpenCL platform and device management
+ *
+ * Enumerates all OpenCL platforms and devices, maintains contexts and command
+ * queues, and provides methods to create buffers and access devices.
+ */
+class ComputeManager
+{
+public:
+	static ComputeManager& getInstance()
+	{
+		static ComputeManager instance;
+		return instance;
+	}
+
+	/**
+	 * @brief Initialize ComputeManager by enumerating platforms and devices
+	 *
+	 * Enumerates all OpenCL platforms, then all devices on each platform,
+	 * creating contexts and command queues for each device.
+	 * Idempotent - can be called multiple times safely.
+	 */
+	void initialize();
+
+	/**
+	 * @brief Finalize ComputeManager, releasing all resources
+	 *
+	 * Releases all contexts, command queues, and clears device list.
+	 * Safe to call even if not initialized.
+	 */
+	void finalize();
+
+	/**
+	 * @brief Create USE_HOST_PTR buffers on all contexts
+	 *
+	 * Creates a buffer using CL_MEM_USE_HOST_PTR on each device's context.
+	 *
+	 * @param hostPtr Host pointer to use
+	 * @param size Size of buffer in bytes
+	 * @param flags Additional OpenCL memory flags
+	 * @return Shared pointer to ClBuffer managing buffers on all devices
+	 */
+	std::shared_ptr<ClBuffer> createUseHostPtrBuffer(
+		void* hostPtr, size_t size, cl_mem_flags flags);
+
+	/**
+	 * @brief Release USE_HOST_PTR buffers
+	 *
+	 * Releases all buffers. This is a no-op since ClBuffer's destructor
+	 * handles cleanup automatically.
+	 *
+	 * @param buffer Shared pointer to ClBuffer to release
+	 */
+	void releaseUseHostPtrBuffer(std::shared_ptr<ClBuffer> buffer);
+
+	/**
+	 * @brief Get a compute device
+	 *
+	 * Returns the first available device. Later will accept
+	 * ComputeDeviceConstraints for filtering.
+	 *
+	 * @return Shared pointer to ComputeDevice, or nullptr if no devices available
+	 */
+	std::shared_ptr<ComputeDevice> getDevice();
+
+	/**
+	 * @brief Release a compute device
+	 *
+	 * Placeholder for future refcounting implementation.
+	 * Currently a no-op - devices are only removed in finalize().
+	 *
+	 * @param device Shared pointer to ComputeDevice to release
+	 */
+	void releaseDevice(std::shared_ptr<ComputeDevice> device);
+
+private:
+	ComputeManager() : initialized(false) {}
+	~ComputeManager() {}
+
+	// Non-copyable, non-movable
+	ComputeManager(const ComputeManager&) = delete;
+	ComputeManager& operator=(const ComputeManager&) = delete;
+	ComputeManager(ComputeManager&&) = delete;
+	ComputeManager& operator=(ComputeManager&&) = delete;
+
+	bool initialized;
+	std::vector<std::shared_ptr<ComputeDevice>> devices;
+};
+
+} // namespace compute
+} // namespace smo
+
+#endif // _COMPUTE_MANAGER_H
+
+
@@ -23,9 +23,12 @@ public:
 	typedef std::function<void(bool)> mrnttLifetimeMgmtOpCbFn;
 	void initializeReq(Callback<mrnttLifetimeMgmtOpCbFn> callback);
 	void finalizeReq(Callback<mrnttLifetimeMgmtOpCbFn> callback);
+	// Intentionally doesn't take a callback.
+	void exceptionInd();

 private:
 	class MrnttLifetimeMgmtOp;
+	class TerminationEvent;
 };

 extern std::atomic<int> exitCode;
@@ -117,6 +117,34 @@ public:
 	}
 };

+class MarionetteComponent::TerminationEvent
+:	public PostedAsynchronousContinuation<mrnttLifetimeMgmtOpCbFn>
+{
+public:
+	TerminationEvent(
+		const std::shared_ptr<ComponentThread> &caller)
+	:	PostedAsynchronousContinuation<mrnttLifetimeMgmtOpCbFn>(
+			caller, {nullptr, nullptr})
+	{}
+
+public:
+	void exceptionInd1_posted(
+		[[maybe_unused]] std::shared_ptr<TerminationEvent> context
+		)
+	{
+		auto self = ComponentThread::getSelf();
+		if (self->id != ComponentThread::MRNTT)
+		{
+			throw std::runtime_error(std::string(__func__)
+				+ ": Must be executed on Marionette thread");
+		}
+
+		mrntt::mrntt.finalizeReq({nullptr, std::bind(
+			&mrntt::marionetteFinalizeReqCb,
+			std::placeholders::_1)});
+	}
+};
+
 void MarionetteComponent::initializeReq(
 	Callback<mrnttLifetimeMgmtOpCbFn> callback)
 {
@@ -157,5 +185,19 @@ void MarionetteComponent::finalizeReq(
 			request.get(), request)));
 }

+void MarionetteComponent::exceptionInd()
+{
+	auto faultyThread = ComponentThread::getSelf();
+	auto mrntt = ComponentThread::getMrntt();
+
+	auto request = std::make_shared<TerminationEvent>(
+		faultyThread);
+
+	mrntt->getIoService().post(
+		STC(std::bind(
+			&TerminationEvent::exceptionInd1_posted,
+			request.get(), request)));
+}
+
 } // namespace mrntt
 } // namespace smo
@@ -184,9 +184,7 @@ void MarionetteThread::main(MarionetteThread& self)
 			if (sendExceptionInd)
 			{
 				mrntt::exitCode = EXIT_FAILURE;
-				mrntt::mrntt.finalizeReq({nullptr, std::bind(
-					&mrntt::marionetteFinalizeReqCb,
-					std::placeholders::_1)});
+				mrntt::mrntt.exceptionInd();
 			}
        }

@@ -2,6 +2,7 @@
 #include <mindManager/mindManager.h>
 #include <deviceManager/deviceManager.h>
 #include <stimBuffApis/stimBuffApiManager.h>
+#include <computeManager/computeManager.h>
 #include <salmanoff.h>


@@ -11,6 +12,7 @@ void initializeSalmanoff(void)
 {
 	std::cout << __func__ << ": Entered." << std::endl;

+	compute::ComputeManager::getInstance().initialize();
 	mind::MindManager::getInstance().initialize();
 	stim_buff::StimBuffApiManager::getInstance().initialize();
 	device::DeviceManager::getInstance().initialize();
@@ -25,6 +27,7 @@ void shutdownSalmanoff(void)
 	device::DeviceManager::getInstance().finalize();
 	stim_buff::StimBuffApiManager::getInstance().finalize();
 	mind::MindManager::getInstance().finalize();
+	compute::ComputeManager::getInstance().finalize();
 }

 } // namespace smo
@@ -13,6 +13,7 @@
 #include <mind.h>
 #include <deviceManager/deviceManager.h>
 #include <marionette/marionette.h>
+#include <computeManager/computeManager.h>


 namespace fs = std::filesystem;
@@ -79,13 +80,56 @@ static std::shared_ptr<ComponentThread> ComponentThread_getSelf()
 	return ComponentThread::getSelf();
 }

+/* Local static function to wrap OptionParser::getOptions for SmoCallbacks */
+static OptionParser& OptionParser_getOptions()
+{
+	return OptionParser::getOptions();
+}
+
+/* Local static functions to wrap ComputeManager methods for SmoCallbacks */
+static std::shared_ptr<smo::compute::ClBuffer>
+ComputeManager_createUseHostPtrBuffer(
+	void* hostPtr, size_t size, cl_mem_flags flags
+	)
+{
+	return smo::compute::ComputeManager::getInstance().createUseHostPtrBuffer(
+		hostPtr, size, flags);
+}
+
+static void ComputeManager_releaseUseHostPtrBuffer(
+	std::shared_ptr<smo::compute::ClBuffer> buffer
+	)
+{
+	smo::compute::ComputeManager::getInstance().releaseUseHostPtrBuffer(
+		buffer);
+}
+
+static std::shared_ptr<smo::compute::ComputeDevice> ComputeManager_getDevice()
+{
+	return smo::compute::ComputeManager::getInstance().getDevice();
+}
+
+static void ComputeManager_releaseDevice(
+	std::shared_ptr<smo::compute::ComputeDevice> device
+	)
+{
+	smo::compute::ComputeManager::getInstance().releaseDevice(device);
+}
+
 /* Hooks to be provided to stimBuffApiLibs, enabling them to call into Salmanoff
 * code.
 */
 static SmoCallbacks smoCallbacks =
 {
 	.searchForLibInSmoSearchPaths = searchForLibInSmoSearchPaths,
-	.ComponentThread_getSelf = ComponentThread_getSelf
+	.ComponentThread_getSelf = ComponentThread_getSelf,
+	.OptionParser_getOptions = OptionParser_getOptions,
+	.ComputeManager_createUseHostPtrBuffer =
+		ComputeManager_createUseHostPtrBuffer,
+	.ComputeManager_releaseUseHostPtrBuffer =
+		ComputeManager_releaseUseHostPtrBuffer,
+	.ComputeManager_getDevice = ComputeManager_getDevice,
+	.ComputeManager_releaseDevice = ComputeManager_releaseDevice
 };

 /* Static file-scope threading model object for senseApi libraries */
@@ -9,40 +9,23 @@ if(ENABLE_STIMBUFFAPI_livoxGen1)
 	# Find liburing using pkg-config
 	pkg_check_modules(URING REQUIRED liburing)

-	# Find OpenCL: try find_package first, fall back to pkg-config
-	find_package(OpenCL QUIET)
-	if(OpenCL_FOUND)
-		# Normalize find_package variables to match pkg_check_modules naming
-		set(OPENCL_FOUND TRUE)
-		set(OPENCL_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS})
-		# Handle both OpenCL_LIBRARY (singular) and OpenCL_LIBRARIES (plural)
-		if(OpenCL_LIBRARIES)
-			set(OPENCL_LIBRARIES ${OpenCL_LIBRARIES})
-		else()
-			set(OPENCL_LIBRARIES ${OpenCL_LIBRARY})
-		endif()
-		set(OPENCL_LIBRARY_DIRS "")
-		message(STATUS "Found OpenCL using find_package")
-	else()
-		# Fall back to pkg-config
-		pkg_check_modules(OPENCL OpenCL)
-		if(NOT OPENCL_FOUND)
-			message(FATAL_ERROR
-				"Failed to find OpenCL: both find_package and "
-				"pkg_check_modules failed. Try installing the "
-				"'ocl-icd-opencl-dev' package (or the appropriate "
-				"OpenCL development package for your system)."
-			)
-		endif()
-		message(STATUS "Found OpenCL using pkg-config")
-	endif()
+	# Enable assembly language
+	enable_language(ASM)

 	add_library(livoxGen1 SHARED
 		livoxGen1.cpp
-		stagingBuffer.cpp
-		pcloudStimulusBuffer.cpp
+		pcloudStimulusProducer.cpp
 		ioUringAssemblyEngine.cpp
-		openClSplittingEngine.cpp
+		openClCollatingAndMeshingEngine.cpp
+		openClKernels.cl.S
+	)
+
+	# Set assembler working directory so .incbin can find the .cl file
+	# Also declare dependency on collateDgrams.cl and slotCompactor.cl
+	set_source_files_properties(openClKernels.cl.S PROPERTIES
+		COMPILE_FLAGS "-I${CMAKE_CURRENT_SOURCE_DIR}"
+		OBJECT_DEPENDS
+			"${CMAKE_CURRENT_SOURCE_DIR}/collateDgrams.cl;${CMAKE_CURRENT_SOURCE_DIR}/slotCompactor.cl"
 	)

 	target_include_directories(livoxGen1 PUBLIC
@@ -0,0 +1,360 @@
+// Debug macro: define DEBUG_COLLATE_DGRAMS to enable printf statements
+// #define DEBUG_COLLATE_DGRAMS
+#ifdef DEBUG_COLLATE_DGRAMS
+#define DBG_PRINTF(...) printf(__VA_ARGS__)
+#else
+#define DBG_PRINTF(...)
+#endif
+
+// Helper function to read a little-endian int32 from unaligned memory
+inline int readInt32LE(__global uchar* ptr)
+{
+	// Read 4 bytes in little-endian order and assemble into int
+	// Handle sign extension correctly for signed int
+	int b0 = (int)ptr[0];
+	int b1 = (int)ptr[1];
+	int b2 = (int)ptr[2];
+	int b3 = (int)ptr[3];
+
+	// Assemble little-endian: b0 is LSB, b3 is MSB
+	int value = b0 | (b1 << 8) | (b2 << 16) | (b3 << 24);
+	return value;
+}
+
+__kernel void collate(
+	__global uchar* assembly,
+	__global float* collation,
+	__global float* intensityBuffer,
+	__global uint* ambienceBuffer,
+	uint ambienceHighVal,
+	uint slotStride,
+	uint nPointsPerSlot,
+	uint nDgramsPerFrame)
+{
+	// Get work item index (slot index)
+	uint slotIndex = get_global_id(0);
+
+	// Bounds check
+	if (slotIndex >= nDgramsPerFrame) { return; }
+
+	// Calculate slot address
+	__global uchar* slotStart = assembly + (slotIndex * slotStride);
+
+	// Read data_type from offset 9 (1 byte)
+	uchar dataType = slotStart[9];
+
+	// Get points array pointer (after 18-byte header)
+	__global uchar* pointsArray = slotStart + 18;
+
+	// Base offset in collation buffer for this slot (in floats)
+	// Each PointXYZ is 3 floats (x, y, z)
+	#define FLOATS_PER_POINT 3
+	uint collationBaseOffset = slotIndex * nPointsPerSlot * FLOATS_PER_POINT;
+	// Base offset in intensity buffer for this slot (in floats)
+	// Each intensity is 1 float
+	uint intensityBaseOffset = slotIndex * nPointsPerSlot;
+	DBG_PRINTF("Running kernel: about to process points in slot.\n");
+
+	// Initialize ambience counter for this work item
+	uint ambienceCount = 0;
+
+	// Process based on data type using nested ifs (outer) with loops (inner)
+	if (dataType == 0)
+	{
+		// Type 0: LivoxRawPoint - 13 bytes per point
+		// Structure: int32_t x, y, z (mm), uint8_t reflectivity
+		for (uint i = 0; i < nPointsPerSlot; ++i)
+		{
+			__global uchar* pointPtr = pointsArray + (i * 13);
+
+			// Read int coordinates (little-endian, unaligned-safe)
+			int x_mm = readInt32LE(pointPtr + 0);
+			int y_mm = readInt32LE(pointPtr + 4);
+			int z_mm = readInt32LE(pointPtr + 8);
+			uchar reflectivity = pointPtr[12];
+
+			DBG_PRINTF("collate[slot=%u,point=%u]: x_mm=%d, y_mm=%d, z_mm=%d, reflectivity=%d\n",
+				slotIndex, i, x_mm, y_mm, z_mm, reflectivity);
+			// Convert to PointXYZI (meters, float)
+			float x = (float)x_mm / 1000.0f;
+			float y = (float)y_mm / 1000.0f;
+			float z = (float)z_mm / 1000.0f;
+			float intensity = (float)reflectivity;
+
+			// Print intensity if above 5
+			if (intensity > 5.0f)
+			{
+				DBG_PRINTF("collate[slot=%u,point=%u]: intensity=%.1f\n",
+					slotIndex, i, intensity);
+			}
+
+			// Write XYZ to collation buffer
+			uint offset = collationBaseOffset + (i * FLOATS_PER_POINT);
+			collation[offset + 0] = x;
+			collation[offset + 1] = y;
+			collation[offset + 2] = z;
+			// Write intensity conditionally - divert to intensity buffer if attached, else discard
+			if (intensityBuffer != NULL) {
+				intensityBuffer[intensityBaseOffset + i] = intensity;
+			}
+			// Count high intensity values for ambience buffer
+			if (intensity >= (float)ambienceHighVal) {
+				++ambienceCount;
+			}
+			// Don't write intensity to collation buffer
+		}
+	}
+	else if (dataType == 2)
+	{
+		// Type 2: LivoxExtendRawPoint - 14 bytes per point
+		// Structure: int32_t x, y, z (mm), uint8_t reflectivity, uint8_t tag (ignored)
+		for (uint i = 0; i < nPointsPerSlot; ++i)
+		{
+			__global uchar* pointPtr = pointsArray + (i * 14);
+
+			// Read int coordinates (little-endian, unaligned-safe)
+			int x_mm = readInt32LE(pointPtr + 0);
+			int y_mm = readInt32LE(pointPtr + 4);
+			int z_mm = readInt32LE(pointPtr + 8);
+			uchar reflectivity = pointPtr[12];
+			// tag at offset 13 is ignored
+
+			DBG_PRINTF("collate[slot=%u,point=%u]: x_mm=%d, y_mm=%d, z_mm=%d, reflectivity=%d\n",
+				slotIndex, i, x_mm, y_mm, z_mm, reflectivity);
+			// Convert to PointXYZI (meters, float)
+			float x = (float)x_mm / 1000.0f;
+			float y = (float)y_mm / 1000.0f;
+			float z = (float)z_mm / 1000.0f;
+			float intensity = (float)reflectivity;
+
+			// Print intensity if above 5
+			if (intensity > 5.0f)
+			{
+				DBG_PRINTF("collate[slot=%u,point=%u]: intensity=%.1f\n",
+					slotIndex, i, intensity);
+			}
+
+			// Write XYZ to collation buffer
+			uint offset = collationBaseOffset + (i * FLOATS_PER_POINT);
+			collation[offset + 0] = x;
+			collation[offset + 1] = y;
+			collation[offset + 2] = z;
+			// Write intensity conditionally - divert to intensity buffer if attached, else discard
+			if (intensityBuffer != NULL) {
+				intensityBuffer[intensityBaseOffset + i] = intensity;
+			}
+			// Count high intensity values for ambience buffer
+			if (intensity >= (float)ambienceHighVal) {
+				++ambienceCount;
+			}
+			// Don't write intensity to collation buffer
+		}
+	}
+	else if (dataType == 4)
+	{
+		// Type 4: LivoxDualExtendRawPoint - 28 bytes per sample (2 points)
+		// Structure: point1 (x1,y1,z1,reflectivity1,tag1), point2 (x2,y2,z2,reflectivity2,tag2)
+		// nPointsPerSlot should be 96, but we have 48 samples * 2 points = 96 points
+		uint nSamples = nPointsPerSlot / 2;
+		uint pointIndex = 0;
+
+		for (uint i = 0; i < nSamples; ++i)
+		{
+			__global uchar* samplePtr = pointsArray + (i * 28);
+
+			// Process first point
+			int x1_mm = readInt32LE(samplePtr + 0);
+			int y1_mm = readInt32LE(samplePtr + 4);
+			int z1_mm = readInt32LE(samplePtr + 8);
+			uchar reflectivity1 = samplePtr[12];
+			// tag1 at offset 13 is ignored
+
+			float x1 = (float)x1_mm / 1000.0f;
+			float y1 = (float)y1_mm / 1000.0f;
+			float z1 = (float)z1_mm / 1000.0f;
+			float intensity1 = (float)reflectivity1;
+
+			// Print intensity if above 5
+			if (intensity1 > 5.0f)
+			{
+				DBG_PRINTF("collate[slot=%u,point=%u]: intensity=%.1f\n",
+					slotIndex, pointIndex, intensity1);
+			}
+
+			uint offset1 = collationBaseOffset
+				+ (pointIndex * FLOATS_PER_POINT);
+			collation[offset1 + 0] = x1;
+			collation[offset1 + 1] = y1;
+			collation[offset1 + 2] = z1;
+			// Write intensity conditionally - divert to intensity buffer if attached, else discard
+			if (intensityBuffer != NULL) {
+				intensityBuffer[intensityBaseOffset + pointIndex] = intensity1;
+			}
+			// Count high intensity values for ambience buffer
+			if (intensity1 >= (float)ambienceHighVal) {
+				++ambienceCount;
+			}
+			// Don't write intensity to collation buffer
+			++pointIndex;
+
+			// Process second point
+			int x2_mm = readInt32LE(samplePtr + 14);
+			int y2_mm = readInt32LE(samplePtr + 18);
+			int z2_mm = readInt32LE(samplePtr + 22);
+			uchar reflectivity2 = samplePtr[26];
+			// tag2 at offset 27 is ignored
+
+			float x2 = (float)x2_mm / 1000.0f;
+			float y2 = (float)y2_mm / 1000.0f;
+			float z2 = (float)z2_mm / 1000.0f;
+			float intensity2 = (float)reflectivity2;
+
+			// Print intensity if above 5
+			if (intensity2 > 5.0f)
+			{
+				DBG_PRINTF("collate[slot=%u,point=%u]: intensity=%.1f\n",
+					slotIndex, pointIndex, intensity2);
+			}
+
+			uint offset2 = collationBaseOffset
+				+ (pointIndex * FLOATS_PER_POINT);
+			collation[offset2 + 0] = x2;
+			collation[offset2 + 1] = y2;
+			collation[offset2 + 2] = z2;
+			// Write intensity conditionally - divert to intensity buffer if attached, else discard
+			if (intensityBuffer != NULL) {
+				intensityBuffer[intensityBaseOffset + pointIndex] = intensity2;
+			}
+			// Count high intensity values for ambience buffer
+			if (intensity2 >= (float)ambienceHighVal) {
+				++ambienceCount;
+			}
+			// Don't write intensity to collation buffer
+			++pointIndex;
+		}
+	}
+	else if (dataType == 7)
+	{
+		// Type 7: LivoxTripleExtendRawPoint - 42 bytes per sample (3 points)
+		// Structure: point1, point2, point3 (each: x,y,z,reflectivity,tag)
+		// nPointsPerSlot should be 90, but we have 30 samples * 3 points = 90 points
+		uint nSamples = nPointsPerSlot / 3;
+		uint pointIndex = 0;
+
+		for (uint i = 0; i < nSamples; ++i)
+		{
+			__global uchar* samplePtr = pointsArray + (i * 42);
+
+			// Process first point
+			int x1_mm = readInt32LE(samplePtr + 0);
+			int y1_mm = readInt32LE(samplePtr + 4);
+			int z1_mm = readInt32LE(samplePtr + 8);
+			uchar reflectivity1 = samplePtr[12];
+			// tag1 at offset 13 is ignored
+
+			float x1 = (float)x1_mm / 1000.0f;
+			float y1 = (float)y1_mm / 1000.0f;
+			float z1 = (float)z1_mm / 1000.0f;
+			float intensity1 = (float)reflectivity1;
+
+			// Print intensity if above 5
+			if (intensity1 > 5.0f)
+			{
+				DBG_PRINTF("collate[slot=%u,point=%u]: intensity=%.1f\n",
+					slotIndex, pointIndex, intensity1);
+			}
+
+			uint offset1 = collationBaseOffset
+				+ (pointIndex * FLOATS_PER_POINT);
+			collation[offset1 + 0] = x1;
+			collation[offset1 + 1] = y1;
+			collation[offset1 + 2] = z1;
+			// Write intensity conditionally - divert to intensity buffer if attached, else discard
+			if (intensityBuffer != NULL) {
+				intensityBuffer[intensityBaseOffset + pointIndex] = intensity1;
+			}
+			// Count high intensity values for ambience buffer
+			if (intensity1 >= (float)ambienceHighVal) {
+				++ambienceCount;
+			}
+			// Don't write intensity to collation buffer
+			++pointIndex;
+
+			// Process second point
+			int x2_mm = readInt32LE(samplePtr + 14);
+			int y2_mm = readInt32LE(samplePtr + 18);
+			int z2_mm = readInt32LE(samplePtr + 22);
+			uchar reflectivity2 = samplePtr[26];
+			// tag2 at offset 27 is ignored
+
+			float x2 = (float)x2_mm / 1000.0f;
+			float y2 = (float)y2_mm / 1000.0f;
+			float z2 = (float)z2_mm / 1000.0f;
+			float intensity2 = (float)reflectivity2;
+
+			// Print intensity if above 5
+			if (intensity2 > 5.0f)
+			{
+				DBG_PRINTF("collate[slot=%u,point=%u]: intensity=%.1f\n",
+					slotIndex, pointIndex, intensity2);
+			}
+
+			uint offset2 = collationBaseOffset
+				+ (pointIndex * FLOATS_PER_POINT);
+			collation[offset2 + 0] = x2;
+			collation[offset2 + 1] = y2;
+			collation[offset2 + 2] = z2;
+			// Write intensity conditionally - divert to intensity buffer if attached, else discard
+			if (intensityBuffer != NULL) {
+				intensityBuffer[intensityBaseOffset + pointIndex] = intensity2;
+			}
+			// Count high intensity values for ambience buffer
+			if (intensity2 >= (float)ambienceHighVal) {
+				++ambienceCount;
+			}
+			// Don't write intensity to collation buffer
+			++pointIndex;
+
+			// Process third point
+			int x3_mm = readInt32LE(samplePtr + 28);
+			int y3_mm = readInt32LE(samplePtr + 32);
+			int z3_mm = readInt32LE(samplePtr + 36);
+			uchar reflectivity3 = samplePtr[40];
+			// tag3 at offset 41 is ignored
+
+			float x3 = (float)x3_mm / 1000.0f;
+			float y3 = (float)y3_mm / 1000.0f;
+			float z3 = (float)z3_mm / 1000.0f;
+			float intensity3 = (float)reflectivity3;
+
+			// Print intensity if above 5
+			if (intensity3 > 5.0f)
+			{
+				DBG_PRINTF("collate[slot=%u,point=%u]: intensity=%.1f\n",
+					slotIndex, pointIndex, intensity3);
+			}
+
+			uint offset3 = collationBaseOffset
+				+ (pointIndex * FLOATS_PER_POINT);
+			collation[offset3 + 0] = x3;
+			collation[offset3 + 1] = y3;
+			collation[offset3 + 2] = z3;
+			// Write intensity conditionally - divert to intensity buffer if attached, else discard
+			if (intensityBuffer != NULL) {
+				intensityBuffer[intensityBaseOffset + pointIndex] = intensity3;
+			}
+			// Count high intensity values for ambience buffer
+			if (intensity3 >= (float)ambienceHighVal) {
+				++ambienceCount;
+			}
+			// Don't write intensity to collation buffer
+			++pointIndex;
+		}
+	}
+	// Unsupported data types are silently ignored
+
+	// Write ambience count for this work item (once at the end)
+	if (ambienceBuffer != NULL) {
+		ambienceBuffer[slotIndex] = ambienceCount;
+	}
+}
@@ -1,9 +1,13 @@
 #include <boostAsioLinkageFix.h>
 #include <config.h>
 #include <opts.h>
+#include <algorithm>
+#include <iostream>
+#include <iomanip>
 #include <cstring>
 #include <stdexcept>
 #include <functional>
+#include <random>
 #include <sys/socket.h>
 #include <sys/eventfd.h>
 #include <sys/uio.h>
@@ -20,9 +24,11 @@
 #include <callableTracer.h>
 #include <spinLock.h>
 #include "ioUringAssemblyEngine.h"
-#include "pcloudStimulusBuffer.h"
+#include "pcloudStimulusProducer.h"
 #include "livoxGen1.h"

+// #define REGISTER_IOURING_BUFFERS
+
 namespace smo {
 namespace stim_buff {

@@ -30,13 +36,18 @@ inline LivoxProto1DllState& getLivoxProto1State() { return livoxProto1; }

 struct DummyLivoxEthHeader
 {
-	enum : uint32_t {
-		INVALID_ERR_CODE = 0xFFFFFFFFu
-	};
-	enum : uint8_t {
-		INVALID_TIMESTAMP_TYPE = 0xFFu,
-		INVALID_DATA_TYPE = 0xFFu
-	};
+	DummyLivoxEthHeader()
+	: version(0xFF), slot(0xFF), id(0xFF), rsvd(0xFF)
+	{}
+
+	static bool isDummy(const DummyLivoxEthHeader& hdr)
+	{
+		return hdr.version == 0xFF || hdr.slot == 0xFF || hdr.id == 0xFF
+			|| hdr.rsvd == 0xFF;
+	}
+
+	static bool isValid(const DummyLivoxEthHeader& hdr)
+		{ return !isDummy(hdr); }

 	uint8_t version, slot, id, rsvd;
 	uint32_t err_code;
@@ -44,19 +55,29 @@ struct DummyLivoxEthHeader
 	uint8_t timestamp[8];
 };

-IoUringAssemblyEngine::IoUringAssemblyEngine(PcloudStimulusBuffer& parent_)
+IoUringAssemblyEngine::IoUringAssemblyEngine(
+	PcloudStimulusProducer& parent_, size_t nDgramsPerStagingBufferFrame_)
 : parent(parent_),
 frameAssemblyDesc(nullptr), ring{},
-isSetup(false),
 eventfdFd(-1), eventfdDesc(nullptr), eventfd_value(0),
 stallTimer(parent_.device->componentThread->getIoService()),
-isAssembling(false)
+shouldAcceptRequests(false),
+nDgramsPerStagingBufferFrame(nDgramsPerStagingBufferFrame_),
+assembledSlotsTracker(nDgramsPerStagingBufferFrame_),
+randomDevice(), randomGenerator(randomDevice())
 {}

 bool IoUringAssemblyEngine::setup()
 {
-	if (isSetup)
-		{ return false; }
+	// Defensive check to prevent double-calling
+	{
+		SpinLock::Guard lock(shouldAcceptRequestsLock);
+		if (shouldAcceptRequests)
+		{
+			throw std::runtime_error(std::string(__func__) + ": setup() called "
+				"while already set up");
+		}
+	}

 	// Get FrameAssemblyDesc from staging buffer
 	frameAssemblyDesc = static_cast<std::shared_ptr<FrameAssemblyDesc>>(
@@ -78,8 +99,27 @@ bool IoUringAssemblyEngine::setup()
 	if (udpFd < 0)
 		{ return false; }

+	// Set up iovecs for each slot
+	for (size_t i = 0; i < frameAssemblyDesc->numSlots; ++i)
+	{
+		assembledSlotsTracker[i].assembled = false;
+		assembledSlotsTracker[i].msgHdr = {};
+		assembledSlotsTracker[i].msgHdr.msg_iov =
+			&assembledSlotsTracker[i].ioVec;
+		assembledSlotsTracker[i].msgHdr.msg_iovlen = 1;
+	}
+
+	for (size_t i = 0; i < frameAssemblyDesc->numSlots; ++i)
+	{
+		const auto& slot = frameAssemblyDesc->slots[i];
+		assembledSlotsTracker[i].ioVec.iov_base = slot.vaddr;
+		assembledSlotsTracker[i].ioVec.iov_len = slot.nBytes;
+	}
+
 	// Declare iovec early to avoid goto crossing initialization
+#ifdef REGISTER_IOURING_BUFFERS
 	struct iovec iov;
+#endif
 	int ret;

 	/**	EXPLANATION:
@@ -94,30 +134,40 @@ bool IoUringAssemblyEngine::setup()
 	if (ret < 0)
 		{ goto cleanup; }

+#ifdef REGISTER_IOURING_BUFFERS
 	// Register staging buffer with io_uring for DMA-apt I/O
 	iov = parent.assemblyBuffer.getIoUringRegisterIoVec();
 	ret = io_uring_register_buffers(&ring, &iov, 1);
 	if (ret < 0)
 		{ goto cleanup_ring; }
+#endif

 	// Create eventfd for CQE notifications (used with boost's unified loop)
 	eventfdFd = eventfd(0, EFD_NONBLOCK);
 	if (eventfdFd < 0)
-		{ goto cleanup_buffers; }
+	{
+#ifdef REGISTER_IOURING_BUFFERS
+		goto cleanup_buffers;
+#else
+		goto cleanup_ring;
+#endif
+	}

 	// Register eventfd with io_uring
 	ret = io_uring_register_eventfd(&ring, eventfdFd);
 	if (ret < 0)
 		{ goto cleanup_eventfd; }

-	isSetup = true;
+	shouldAcceptRequests = true;
 	return true;

 cleanup_eventfd:
 	close(eventfdFd);
 	eventfdFd = -1;
+#ifdef REGISTER_IOURING_BUFFERS
 cleanup_buffers:
 	io_uring_unregister_buffers(&ring);
+#endif
 cleanup_ring:
 	io_uring_queue_exit(&ring);
 cleanup:
@@ -126,8 +176,7 @@ cleanup:

 void IoUringAssemblyEngine::finalize()
 {
-	// Call stop() to cancel in-flight operations (stop() already cancels the timer)
-	stop();
+	bool wasAcceptingRequests = stop();

 	if (eventfdFd >= 0)
 	{
@@ -136,11 +185,12 @@ void IoUringAssemblyEngine::finalize()
 		eventfdFd = -1;
 	}

-	if (isSetup)
+	if (wasAcceptingRequests)
 	{
+#ifdef REGISTER_IOURING_BUFFERS
 		io_uring_unregister_buffers(&ring);
+#endif
 		io_uring_queue_exit(&ring);
-		isSetup = false;
 	}

 	// Reset state to allow setup() to be called again
@@ -156,13 +206,10 @@ void IoUringAssemblyEngine::resetAndAssembleFrame(
 			+ ": onCqeReady callback is invalid");
 	}

-	if (!frameAssemblyDesc || !pcloudDataFdDesc || eventfdFd < 0)
+	if (!shouldAcceptRequests)
 	{
 		throw std::runtime_error(std::string(__func__)
-			+ ": invalid state: "
-			+ ( !frameAssemblyDesc ? "frameAssemblyDesc is null; " : "" )
-			+ ( !pcloudDataFdDesc ? "pcloudDataFdDesc is null; " : "" )
-			+ ( eventfdFd < 0 ? "eventfdFd is invalid." : "" ));
+			+ ": engine is not accepting requests");
 	}

 	// eventfdDesc should not be valid when resetAndAssembleFrame is called
@@ -175,6 +222,11 @@ void IoUringAssemblyEngine::resetAndAssembleFrame(
 	// Store the callback for re-arming
 	onCqeReadyCallback = std::move(onCqeReady);

+	// Reset all assembled slots tracker to false
+	for (auto& slotDesc : assembledSlotsTracker) {
+		slotDesc.assembled = false;
+	}
+
 	/**	EXPLANATION:
 	 * Flush eventfd state: poll and read any pending events before creating
 	 * descriptor.
@@ -221,17 +273,7 @@ void IoUringAssemblyEngine::resetAndAssembleFrame(
 				+ ": failed to get SQE for slot " + std::to_string(i));
 		}

-		const auto& slot = frameAssemblyDesc->slots[i];
-
-		// Prepare recvmsg SQE for this slot
-		struct msghdr msg = {};
-		struct iovec iov;
-		iov.iov_base = slot.vaddr;
-		iov.iov_len = slot.nBytes;
-		msg.msg_iov = &iov;
-		msg.msg_iovlen = 1;
-
-		io_uring_prep_recvmsg(sqe, udpFd, &msg, 0);
+		io_uring_prep_recvmsg(sqe, udpFd, &assembledSlotsTracker[i].msgHdr, 0);
 		// Set user_data to slot index for tracking
 		io_uring_sqe_set_data(sqe, reinterpret_cast<void*>(i));
 	}
@@ -245,8 +287,6 @@ void IoUringAssemblyEngine::resetAndAssembleFrame(
 			+ " (errno=" + std::to_string(errno) + ")");
 	}

-	// Set assembly flag
-	isAssembling = true;
 	// Start listening for CQE notifications on eventfd
 	eventfdDesc->async_read_some(
 		boost::asio::buffer(&eventfd_value, sizeof(eventfd_value)),
@@ -256,40 +296,22 @@ void IoUringAssemblyEngine::resetAndAssembleFrame(
 			std::placeholders::_2));
 }

-void IoUringAssemblyEngine::stop(bool doAcquireLock)
+bool IoUringAssemblyEngine::stop()
 {
-	// Clear assembly flag first to signal onEventfdRead to stop re-arming
 	// Acquire and release lock tightly around setting the flag
-	if (doAcquireLock)
-	{
-		SpinLock::Guard lock(isAssemblingLock);
-		isAssembling = false;
-	} else {
-		isAssembling = false;
-	}
-
-	/**	FIXME:
-	 * There's a problem with this bridge here.
-	 *
-	 * We can't delay during every call to stop because under normal operating
-	 * conditions, this whole assembly process should be able to move as fast
-	 * as possible and to receive as much data as possible without maximum
-	 * throughput.
-	 *
-	 * Yet we need to delay briefly here to ensure that the onEventfdRead loop
-	 * has a chance to see the flag and halt.
-	 *
-	 * We need to analyze this carefully and figure out what the correct
-	 * conditions are for being certain that we aren't destroying state while
-	 * the eventfdRead loop is still running; and we need to figure out how to
-	 * ensure that we only delay when absolutely necessary.
-	 */
+	SpinLock::Guard lock(shouldAcceptRequestsLock);
+	bool wasAcceptingRequests = shouldAcceptRequests;
+	shouldAcceptRequests = false;
+	return wasAcceptingRequests;
+}

+void IoUringAssemblyEngine::assemblyCycleComplete()
+{
 	// Cancel in-flight stall timeout timer
 	stallTimer.cancel();
 	onCqeReadyCallback = std::move([](void *, int){});

-	if (isSetup)
+	if (frameAssemblyDesc)
 	{
 		struct io_uring_sqe *sqe = io_uring_get_sqe(&ring);
 		if (!sqe)
@@ -329,7 +351,7 @@ void IoUringAssemblyEngine::stop(bool doAcquireLock)
 			}
 		}

-		if (!sawCancelCqe && OptionParser::getOptions().verbose) {
+		if (!sawCancelCqe && smoHooksPtr->OptionParser_getOptions().verbose) {
 			std::cerr << __func__ << ": no CQE seen for cancel operation\n";
 		}
 	}
@@ -339,8 +361,9 @@ cleanup_eventfd:
 	{
 		/**	EXPLANATION:
 		 * The goal here is to ensure that our io_service's event loop will not
-		 * get any events from the eventfd after we've called stop(). So we
-		 * completely deinitialize the eventfd descriptor.
+		 * get any events from the eventfd after we've called
+		 * assemblyCycleComplete(). So we completely deinitialize the eventfd
+		 * descriptor.
 		 *
 		 * But we still want to reuse the underlying eventfd file descriptor,
 		 * itself in the next resetAndAssembleFrame() cycle, so we call
@@ -351,6 +374,18 @@ cleanup_eventfd:
 		 * io_service before releasing it, otherwise Boost.Asio will complain
 		 * when we try to create a new descriptor with the same fd.
 		 */
+		/**	CAVEAT:
+		 * There's a rare but real race condition here where the eventfd gets an
+		 * event signaled on it, and while boost is internally processing that
+		 * event to enqeue our handler, we call cancel() and release() here.
+		 * If boost internally has locking on the stream_descriptor object,
+		 * this should be fine. But just in case it doesn't, I'm just
+		 * documenting that possibility here.
+		 *
+		 * There's nothing we can really do about it except know that it would
+		 * be very rarely happen; and that we can't do anything about it short
+		 * of modifying the boost.Asio code.
+		 */
 		eventfdDesc->cancel();
 		eventfdDesc->release();
 		/* Destroy the descriptor object (now that it's unregistered, destroying
@@ -377,19 +412,29 @@ public:
 	timerFired(false), handlerExecuted(false)
 	{}

+	void callOriginalCallback(bool success, AsynchronousLoop loop)
+	{
+		callOriginalCb(success, loop);
+	}
+
 public:
 	void assembleFrameReq1_posted(
 		std::shared_ptr<AssembleFrameReq> context)
 	{
-		if (!engine.frameAssemblyDesc)
+		SpinLock::Guard lock(engine.shouldAcceptRequestsLock);
+
+		if (!engine.shouldAcceptRequests)
 		{
-			throw std::runtime_error(std::string(__func__)
-				+ ": frameAssemblyDesc is null");
+			context->callOriginalCallback(false, AsynchronousLoop(0));
+			return;
 		}

 		// Initialize loop with number of slots
 		context->loop = AsynchronousLoop(engine.frameAssemblyDesc->numSlots);

+		// Record assembly start time
+		engine.assemblyStartTime = std::chrono::high_resolution_clock::now();
+
 		/**	FIXME:
 		 * I'm suspicious of this std::bind return object here. What if us
 		 * setting it to null inside of stop() doesn't actually cause the
@@ -418,6 +463,28 @@ public:
 		// Check if timer was cancelled (ignore if operation_aborted)
 		if (error == boost::asio::error::operation_aborted) { return; }

+		/**	EXPLANATION:
+		 * This lock acquisition here will also cover the call to
+		 * assembleFrameReq3 below. Because of that, it means that the
+		 * requirement that the lock be held while accessing
+		 * the metadata that's destroyed in stop() is satisfied.
+		 *
+		 * In theory though, we shouldn't need to hold the lock into
+		 * assembleFrameReq3 below because that function doesn't really access
+		 * any state that's destroyed in stop()? But I'm not sure, and we have
+		 * indeed seen a SEGFAULT even in the current code with locking, so
+		 * I'm going to hold the lock here for now.
+		 */
+		SpinLock::Guard lock(context->engine.shouldAcceptRequestsLock);
+
+		if (!context->engine.shouldAcceptRequests)
+		{
+			context->engine.assemblyCycleComplete();
+			context->loop.setRemainingIterationsToFailure();
+			context->callOriginalCallback(false, context->loop);
+			return;
+		}
+
 		// Set timer fired flag
 		context->timerFired.store(true);
 		context->assembleFrameReq3(context);
@@ -427,10 +494,24 @@ public:
 		std::shared_ptr<AssembleFrameReq> context,
 		void *user_data, int cqe_result)
 	{
-		(void)user_data;  // Not used - we just track success/failure counts
+		// NB: The lock was acquired by onEventFdRead before calling this func
+		if (!context->engine.shouldAcceptRequests)
+		{
+			context->engine.assemblyCycleComplete();
+			context->loop.setRemainingIterationsToFailure();
+			context->callOriginalCallback(false, context->loop);
+			return;
+		}
+
+		// Extract index from user_data and mark slot as assembled if successful
+		size_t index = reinterpret_cast<size_t>(user_data);
+		bool success = (cqe_result >= 0);
+
+		if (success && index < context->engine.assembledSlotsTracker.size()) {
+			context->engine.assembledSlotsTracker[index].assembled = true;
+		}

 		// Caller decides success: result >= 0 means success
-		bool success = (cqe_result >= 0);
 		if (context->loop.incrementSuccessOrFailureAndTestForCompletionDueTo(
 			success))
 		{
@@ -439,12 +520,23 @@ public:
 		}
 	}

-	void assembleFrameReq3(std::shared_ptr<AssembleFrameReq> context)
+	void assembleFrameReq3(
+		std::shared_ptr<AssembleFrameReq> context
+		)
 	{
+		/**	EXPLANATION:
+		 * All branch paths that invoke this unifyig oracle function are
+		 * expected to already hold the shouldAcceptRequestsLock before calling
+		 * it.
+		 */
 		// Ensure we only execute once using atomic exchange
 		if (context->handlerExecuted.exchange(true)) { return; }
+		// Record assembly end time
+		context->engine.assemblyEndTime =
+			std::chrono::high_resolution_clock::now();
+
 		// Cancel the timer, stop the engine and process frame, if any.
-		context->engine.stop(false);
+		context->engine.assemblyCycleComplete();

 		/**	EXPLANATION:
 		 * Timeout doesn't necessarily mean error.
@@ -457,10 +549,28 @@ public:
 		// Error: no slots succeeded - no data received successfully.
 		if (context->loop.nSucceeded.load() == 0)
 		{
-			context->callOriginalCb(false, context->loop);
+			context->callOriginalCallback(false, context->loop);
 			return;
 		}

+#if 0
+		// Artificially create random dummy slots for testing
+		context->engine.randomDummySlotFiller(context->loop);
+#endif
+
+		// Fill un-assembled slots with dummy datagrams
+		context->engine.fillUnAssembledSlotsWithDummyDgrams();
+
+#if 0
+		// Print first 4 bytes of each slot (whether assembled or not)
+		if (context->engine.frameAssemblyDesc)
+		{
+			for (size_t i = 0; i < context->engine.frameAssemblyDesc->numSlots; ++i) {
+				context->engine.printSlotBytes(i, 4);
+			}
+		}
+#endif
+
 		if (context->loop.nSucceeded.load() >= context->loop.nTotal)
 		{
 			// Success: all or more slots succeeded
@@ -471,7 +581,7 @@ public:
 					<< ") > nTotal (" << context->loop.nTotal << ")\n";
 			}

-			context->callOriginalCb(true, context->loop);
+			context->callOriginalCallback(true, context->loop);
 			return;
 		}

@@ -479,18 +589,18 @@ public:
 		{
 			// Success: some slots succeeded (less than total)
 			// Note: dummy fill for un-assembled slots will be implemented later
-			context->callOriginalCb(true, context->loop);
+			context->callOriginalCallback(true, context->loop);
 			return;
 		}

-		if (OptionParser::getOptions().verbose)
+		if (smoHooksPtr->OptionParser_getOptions().verbose)
 		{
 			std::cerr << __func__ << ": Invalid state: nSucceeded ("
 				<< context->loop.nSucceeded.load()
 				<< ") < nTotal (" << context->loop.nTotal << ")" << std::endl;
 		}

-		context->callOriginalCb(false, context->loop);
+		context->callOriginalCallback(false, context->loop);
 		return;
 	}

@@ -504,10 +614,13 @@ public:
 void IoUringAssemblyEngine::assembleFrameReq(
 	Callback<assembleFrameReqCbFn> cb)
 {
-	if (!frameAssemblyDesc)
 	{
-		throw std::runtime_error(std::string(__func__)
-			+ ": frameAssemblyDesc is null");
+		SpinLock::Guard lock(shouldAcceptRequestsLock);
+		if (!shouldAcceptRequests)
+		{
+			cb.callbackFn(false, AsynchronousLoop(0));
+			return;
+		}
 	}

 	const auto& caller = smoHooksPtr->ComponentThread_getSelf();
@@ -534,8 +647,20 @@ void IoUringAssemblyEngine::onEventfdRead(
 	 * IoUringAssemblyEngine's per-assembly state isn't destroyed while this
 	 * handler is running.
 	 */
-	SpinLock::Guard lock(isAssemblingLock);
-	if (!isAssembling) { return; }
+	SpinLock::Guard lock(shouldAcceptRequestsLock);
+	/**	EXPLANATION:
+	 * You'd think we should put check for shouldAcceptRequests here and
+	 * `return` here if !shouldAcceptRequests, but we shouldn't because
+	 * that would mean that we can't invoke the caller's callback. This would
+	 * make the caller freeze forever.
+	 *
+	 * Instead we just let the onCqeReadyCallback check for
+	 * shouldAcceptRequests. That way the onCqeReadyCallback can actually
+	 * invoke the caller's callback, as it should. We have no knowledge of the
+	 * caller's callback because we don't have access to the caller's
+	 * continuation object. The onCqeReadyCallback does have access to it,
+	 * so we leave that up to it.
+	 */

 	/**	FIXME:
 	 * It may be necessary to specifically check for and handle the cancel op
@@ -570,56 +695,177 @@ void IoUringAssemblyEngine::onEventfdRead(
 		}
 	}

-	// Re-arm the eventfd read for next CQE notification
-	// Only re-arm if assembly is still active (stop() hasn't been called)
-	if (eventfdDesc && eventfdFd >= 0)
+	/**	EXPLANATION:
+	 * But we do put a `return` here because we know that at this point, the
+	 * caller's callback has already been invoked.
+	 */
+	if (!shouldAcceptRequests
+		|| eventfdDesc == nullptr || !eventfdDesc->is_open())
 	{
-		eventfdDesc->async_read_some(
-			boost::asio::buffer(&eventfd_value, sizeof(eventfd_value)),
-			std::bind(
-				&IoUringAssemblyEngine::onEventfdRead, this,
-				std::placeholders::_1,
-				std::placeholders::_2));
+		return;
 	}
+
+	// Re-arm the eventfd read for next CQE notification
+	eventfdDesc->async_read_some(
+		boost::asio::buffer(&eventfd_value, sizeof(eventfd_value)),
+		std::bind(
+			&IoUringAssemblyEngine::onEventfdRead, this,
+			std::placeholders::_1,
+			std::placeholders::_2));
 }

-void IoUringAssemblyEngine::cancelIncompleteAndFillDummies()
+void IoUringAssemblyEngine::fillUnAssembledSlotsWithDummyDgrams()
 {
 	if (!frameAssemblyDesc)
 		{ return; }

 	for (size_t i = 0; i < frameAssemblyDesc->numSlots; ++i)
 	{
-		// In the real path, decide from CQE accounting whether slot i completed.
-		// Here, demonstrate dummy header insertion API.
-		auto* hdr = reinterpret_cast<DummyLivoxEthHeader*>(frameAssemblyDesc->slots[i].vaddr);
-		hdr->err_code = DummyLivoxEthHeader::INVALID_ERR_CODE;
-		hdr->timestamp_type = DummyLivoxEthHeader::INVALID_TIMESTAMP_TYPE;
-		hdr->data_type = DummyLivoxEthHeader::INVALID_DATA_TYPE;
+		// Only fill slots that were not successfully assembled
+		if (i >= assembledSlotsTracker.size()
+			|| assembledSlotsTracker[i].assembled)
+		{
+			continue;
+		}
+
+		// Placement construct DummyLivoxEthHeader in the slot
+		new (frameAssemblyDesc->slots[i].vaddr) DummyLivoxEthHeader();
 	}
 }

-size_t IoUringAssemblyEngine::computePointsPerDgram(int returnMode)
+void IoUringAssemblyEngine::randomDummySlotFiller(AsynchronousLoop& loop)
 {
-	/*
-	 * Map modes to points per datagram based on Livox docs
-	 * 1: first, 2: strongest -> 96 samples => 96 points
-	 * 3: dual -> 48 samples * 2 points = 96
-	 * 4: triple -> 30 samples * 3 points = 90
-	 */
-	switch (returnMode)
+	if (!frameAssemblyDesc)
+		{ return; }
+
+	// Check if there are already dummies (natural dummy instance)
+	uint32_t nSucceeded = loop.nSucceeded.load();
+	uint32_t nTotal = loop.nTotal;
+	uint32_t nFailed = loop.nFailed.load();
+
+	if (nFailed > 0 || nSucceeded < nTotal)
 	{
-	case static_cast<int>(livoxProto1::Device::ReturnMode::SingleFirst):
-	case static_cast<int>(livoxProto1::Device::ReturnMode::SingleStrongest):
-	case static_cast<int>(livoxProto1::Device::ReturnMode::Dual):
-		return 96u;
-	case static_cast<int>(livoxProto1::Device::ReturnMode::Triple):
-		return 90u;
-	default:
-		throw std::runtime_error(
-			std::string(__func__) + ": Unknown returnMode "
-			+ std::to_string(returnMode));
+		std::cout << __func__ << ": Natural dummy instance detected (nSucceeded="
+			<< nSucceeded << ", nTotal=" << nTotal << ", nFailed=" << nFailed
+			<< "), skipping artificial dummy creation" << std::endl;
+		return;
 	}
+
+	// Randomly select a number of slots to make into dummies (less than total)
+	std::uniform_int_distribution<size_t> numDummiesDist(1, nTotal - 1);
+	size_t numDummiesToCreate = numDummiesDist(randomGenerator);
+
+	std::uniform_int_distribution<size_t> slotIndexDist(0, nTotal - 1);
+	size_t dummiesCreated = 0;
+	size_t maxAttempts = nTotal * 10; // Safety limit to prevent infinite loop
+	size_t attempts = 0;
+
+	// Mark random slots as unassembled
+	while (dummiesCreated < numDummiesToCreate && attempts < maxAttempts)
+	{
+		++attempts;
+		size_t randomIndex = slotIndexDist(randomGenerator);
+
+		// Skip if already unassembled, re-roll
+		if (randomIndex >= assembledSlotsTracker.size()
+			|| !assembledSlotsTracker[randomIndex].assembled)
+		{
+			continue;
+		}
+
+		// Mark as unassembled
+		assembledSlotsTracker[randomIndex].assembled = false;
+		++dummiesCreated;
+	}
+
+	if (dummiesCreated < numDummiesToCreate)
+	{
+		std::cerr << __func__ << ": Warning: Could only create " << dummiesCreated
+			<< " dummy slots out of " << numDummiesToCreate
+			<< " requested (max attempts reached)" << std::endl;
+		numDummiesToCreate = dummiesCreated;
+	}
+
+	// Update the AsynchronousLoop to reflect the new number of dummies
+	// Since we only reach here when nSucceeded == nTotal and nFailed == 0,
+	// we can directly calculate the new values
+	uint32_t newSucceeded = nTotal - static_cast<uint32_t>(numDummiesToCreate);
+	uint32_t newFailed = static_cast<uint32_t>(numDummiesToCreate);
+
+	loop.nSucceeded.store(newSucceeded);
+	loop.nFailed.store(newFailed);
+
+	std::cout << __func__ << ": Artificially created " << numDummiesToCreate
+		<< " dummy slots (nSucceeded: " << nTotal << " -> "
+		<< newSucceeded << ", nFailed: 0 -> " << newFailed << ")" << std::endl;
+}
+
+void IoUringAssemblyEngine::printSlotBytes(size_t slotIndex, size_t nBytes)
+{
+	if (!frameAssemblyDesc)
+	{
+		std::cerr << __func__ << ": frameAssemblyDesc is null" << std::endl;
+		return;
+	}
+
+	if (slotIndex >= frameAssemblyDesc->numSlots)
+	{
+		std::cerr << __func__ << ": slotIndex " << slotIndex
+			<< " out of range (numSlots=" << frameAssemblyDesc->numSlots
+			<< ")" << std::endl;
+		return;
+	}
+
+	const auto& slot = frameAssemblyDesc->slots[slotIndex];
+	size_t bytesToPrint = std::min(nBytes, static_cast<size_t>(slot.nBytes));
+	const uint8_t* data = reinterpret_cast<const uint8_t*>(slot.vaddr);
+
+	std::cout << __func__ << ": Slot " << slotIndex << " vaddr=" << (void*)slot.vaddr
+		<< " (" << bytesToPrint
+		<< " bytes):" << std::endl;
+
+	// Print hex dump format: offset | hex bytes | ASCII
+	const size_t bytesPerLine = 16;
+	for (size_t offset = 0; offset < bytesToPrint; offset += bytesPerLine)
+	{
+		// Print offset
+		std::cout << std::hex << std::setfill('0') << std::setw(4)
+			<< offset << ": ";
+
+		// Print hex bytes
+		for (size_t i = 0; i < bytesPerLine; ++i)
+		{
+			if (offset + i < bytesToPrint)
+			{
+				std::cout << std::setw(2) << static_cast<unsigned>(data[offset + i])
+					<< " ";
+			}
+			else
+			{
+				std::cout << "   ";
+			}
+		}
+
+		// Print ASCII representation
+		std::cout << " |";
+		for (size_t i = 0; i < bytesPerLine && offset + i < bytesToPrint; ++i)
+		{
+			uint8_t byte = data[offset + i];
+			char c = (byte >= 32 && byte < 127) ? static_cast<char>(byte) : '.';
+			std::cout << c;
+		}
+		std::cout << "|" << std::dec << std::endl;
+	}
+}
+
+std::chrono::milliseconds IoUringAssemblyEngine::getAssemblyDuration() const
+{
+	auto duration = assemblyEndTime - assemblyStartTime;
+	if (duration.count() < 0)
+	{
+		return std::chrono::milliseconds(0);
+	}
+	return std::chrono::duration_cast<std::chrono::milliseconds>(duration);
 }

 } // namespace stim_buff
@@ -9,6 +9,7 @@
 #include <vector>
 #include <chrono>
 #include <atomic>
+#include <random>
 #include <liburing.h>
 #include <boost/asio/io_service.hpp>
 #include <boost/asio/deadline_timer.hpp>
@@ -18,43 +19,49 @@
 #include <asynchronousLoop.h>
 #include <callback.h>
 #include <spinLock.h>
-#include "frameAssemblyDesc.h"
+#include <user/frameAssemblyDesc.h>

 namespace smo {
 namespace stim_buff {

-class PcloudStimulusBuffer;
+class PcloudStimulusProducer;

 class IoUringAssemblyEngine
 {
 public:
-	explicit IoUringAssemblyEngine(PcloudStimulusBuffer& parent);
+	explicit IoUringAssemblyEngine(
+		PcloudStimulusProducer& parent, size_t nDgramsPerStagingBufferFrame);
 	~IoUringAssemblyEngine() = default;

 	bool setup();
 	void finalize();

-	typedef std::function<void(void*, int)> resetAndAssembleFrameCbFn;
-	void resetAndAssembleFrame(resetAndAssembleFrameCbFn onCqeReady);
-	void stop(bool doAcquireLock = true);
-
 	typedef std::function<void(bool, AsynchronousLoop)> assembleFrameReqCbFn;
 	void assembleFrameReq(Callback<assembleFrameReqCbFn> cb);

 	// Telemetry helpers
-	static size_t computePointsPerDgram(int returnMode);
 	static size_t computePointsPerFrame(int returnMode, size_t nDgramsPerFrame)
-		{ return computePointsPerDgram(returnMode) * nDgramsPerFrame; }
+		{ return livoxProto1::Device::getNPointsPerDgram(returnMode) * nDgramsPerFrame; }
+	static bool compactionIsNeeded(uint32_t nSucceeded, uint32_t nTotal)
+		{ return nSucceeded != 0 && nTotal != 0 && nSucceeded != nTotal; }
+
+	// Get assembly execution duration in milliseconds
+	std::chrono::milliseconds getAssemblyDuration() const;

 private:
-	PcloudStimulusBuffer& parent;
+	typedef std::function<void(void*, int)> resetAndAssembleFrameCbFn;
+	void resetAndAssembleFrame(resetAndAssembleFrameCbFn onCqeReady);
+	void assemblyCycleComplete();
+	bool stop();
+
+private:
+	PcloudStimulusProducer& parent;

 	// Cached descriptor for reuse across iterations
 	std::shared_ptr<FrameAssemblyDesc> frameAssemblyDesc;

 	// io_uring infrastructure
 	struct io_uring ring;
-	bool isSetup;

 	// Eventfd for CQE notifications (used with boost's unified loop)
 	int eventfdFd;
@@ -67,17 +74,43 @@ private:
 	boost::asio::deadline_timer stallTimer;
 	// Callback for CQE ntfns (called with user_data+result from each CQE)
 	resetAndAssembleFrameCbFn onCqeReadyCallback;
-	// Flag to indicate assembly is in progress (cleared by stop())
-	// Protected by isAssemblingLock
-	SpinLock isAssemblingLock;
-	bool isAssembling;
+	/**	EXPLANATION:
+	 * Flag to indicate whether engine should accept new requests.
+	 * Set by setup(), cleared by stop().
+	 */
+	SpinLock shouldAcceptRequestsLock;
+	bool shouldAcceptRequests;

-	void cancelIncompleteAndFillDummies();
+	size_t nDgramsPerStagingBufferFrame;
+
+	struct SlotAssemblyDesc
+	{
+		bool assembled;
+		struct msghdr msgHdr;
+		struct iovec ioVec;
+	};
+
+	// Track which slots have been successfully assembled and maintain persistent iovecs
+	std::vector<SlotAssemblyDesc> assembledSlotsTracker;
+
+	// Random number generation for dummy slot creation
+	std::random_device randomDevice;
+	std::mt19937 randomGenerator;
+
+	// Timestamp tracking for assembly execution
+	std::chrono::high_resolution_clock::time_point assemblyStartTime;
+	std::chrono::high_resolution_clock::time_point assemblyEndTime;
+
+	void fillUnAssembledSlotsWithDummyDgrams();
+	void randomDummySlotFiller(AsynchronousLoop& loop);
 	void onEventfdRead(
 		const boost::system::error_code& error, std::size_t bytes_transferred);

 	class AssembleFrameReq;
 	friend class AssembleFrameReq;
+
+public:
+	void printSlotBytes(size_t slotIndex, size_t nBytes);
 };

 } // namespace stim_buff
@@ -16,7 +16,7 @@
 #include <livoxProto1/protocol.h>
 #include <asynchronousContinuation.h>
 #include <boost/asio/deadline_timer.hpp>
-#include "pcloudStimulusBuffer.h"
+#include "pcloudStimulusProducer.h"
 #include "livoxGen1.h"


@@ -27,26 +27,73 @@ namespace stim_buff {
 const SmoCallbacks* smoHooksPtr = nullptr;
 static SmoThreadingModelDesc smoThreadingModelDesc;

-// Local collection of stimulus buffers
-static std::vector<std::shared_ptr<StimulusBuffer>> attachedStimBuffs;
+// Local collection of stimulus producers
+static std::vector<std::shared_ptr<StimulusProducer>> attachedStimulusProducers;

-// Get stimulus buffer by device attachment spec
-static std::shared_ptr<StimulusBuffer>
-getStimBuff(const std::shared_ptr<smo::device::DeviceAttachmentSpec>& spec)
+static bool isSupportedQualeIfaceApi(const std::string& qualeIfaceApi);
+
+// Check if a StimulusProducer matches the requested stim feature
+static bool isProducerForStimFeature(
+	const std::shared_ptr<StimulusProducer>& stimProducer,
+	const std::string& qualeIfaceApi)
 {
-	for (const auto& stimBuff : attachedStimBuffs)
+	// Check if the qualeIfaceApi requires a PcloudStimulusProducer
+	if (qualeIfaceApi == "mesh" || qualeIfaceApi == "pcloudIntensity" ||
+		qualeIfaceApi == "pcloudAmbience")
+	{
+		// Attempt to upcast to PcloudStimulusProducer
+		auto pcloudProducer = std::dynamic_pointer_cast<PcloudStimulusProducer>(
+			stimProducer);
+
+		return pcloudProducer != nullptr;
+	}
+	else if (qualeIfaceApi == "gyro" || qualeIfaceApi == "accel")
+	{
+		/**	TODO:
+		 * Add upcast mappings for gyro and accel later when we implement
+		 * ImuStimulusProducer.
+		 */
+		return false;
+	}
+
+	return false;
+}
+
+// Get stimulus producer by device attachment spec
+static std::shared_ptr<StimulusProducer>
+getStimulusProducer(
+	const std::shared_ptr<smo::device::DeviceAttachmentSpec>& spec
+	)
+{
+	for (const auto& stimProducer : attachedStimulusProducers)
 	{
 		// Compare device selectors to find matching buffer
-		if (stimBuff->deviceAttachmentSpec->deviceSelector
-			== spec->deviceSelector)
+		if (livoxProto1::comms::deviceIdentifiersEqual(
+				stimProducer->deviceAttachmentSpec->deviceSelector,
+				spec->deviceSelector)
+			&& isProducerForStimFeature(stimProducer, spec->qualeIfaceApi))
 		{
-			return stimBuff;
+			return stimProducer;
 		}
 	}

 	return nullptr;
 }

+// Helper function to parse n-dgrams-per-frame from stim-buff-api params
+static size_t parseNDgramsPerFrame(
+	const std::shared_ptr<smo::device::DeviceAttachmentSpec>& spec)
+{
+	const std::vector<std::string> nDgramsPerFrameParamNames = {
+		"n-dgrams-per-frame",
+		"num-dgrams-per-frame"
+	};
+
+	return static_cast<size_t>(
+		smo::device::DeviceAttachmentSpec::parseOptionalParamAsIntWithSynonyms(
+			spec->stimBuffApiParams, nDgramsPerFrameParamNames, 84));
+}
+
 // LivoxProto1DllState constructor implementation
 LivoxProto1DllState::LivoxProto1DllState()
 : dlopenHandle(nullptr, DlCloser),
@@ -85,12 +132,48 @@ public:

 public:
 	const std::shared_ptr<smo::device::DeviceAttachmentSpec> spec;
-	std::shared_ptr<PcloudStimulusBuffer> stimBuff;
+	std::shared_ptr<PcloudStimulusProducer> stimProducer;
 	std::shared_ptr<livoxProto1::Device> deviceTmp;

 private:
 	std::unique_ptr<boost::asio::deadline_timer> delayTimer;

+	// Helper method to ensure StimBuffer is attached
+	// Returns true if successful, false on error
+	bool ensureStimBufferAttached(std::shared_ptr<AttachDeviceReq> context)
+	{
+		if (!context->stimProducer)
+		{
+			std::cerr << __func__ << ": stimProducer is null" << std::endl;
+			return false;
+		}
+
+		// Check for duplicate qualeIfaceApi
+		const std::string& qualeIfaceApi = context->spec->qualeIfaceApi;
+		if (context->stimProducer->hasBufferWithQualeIfaceApi(qualeIfaceApi))
+		{
+			std::cerr << __func__ << ": Buffer with qualeIfaceApi '"
+				<< qualeIfaceApi << "' already exists for this producer. "
+				"Each producer can only have one buffer per qualeIfaceApi."
+				<< std::endl;
+			return false;
+		}
+
+		// Call getOrCreateAttachedStimulusBuffer (may throw, catch and return failure)
+		try {
+			context->stimProducer->getOrCreateAttachedStimulusBuffer(
+				context->spec);
+		} catch (const std::exception& e) {
+			std::cerr << __func__ << ": Failed to create StimBuffer: "
+				<< e.what() << ". Producer is committed, DeviceReattacher will retry."
+				<< std::endl;
+			// Return false so caller can handle error callback
+			return false;
+		}
+
+		return true;
+	}
+
 public:
 	void attachDeviceReq1(
 		std::shared_ptr<AttachDeviceReq> context,
@@ -107,7 +190,7 @@ public:
 		// Stash device pointer until after getReturnMode succeeds
 		context->deviceTmp = dev;

-		if (1 || OptionParser::getOptions().verbose)
+		if (1 || smoHooksPtr->OptionParser_getOptions().verbose)
 		{
 			std::cout << __func__ << ": Successfully attached/found Livox "
 				"device: " << context->spec->deviceSelector << " (ID: "
@@ -123,9 +206,9 @@ public:
 	void delayedGetReturnMode(
 		std::shared_ptr<AttachDeviceReq> context)
 	{
-		// Initialize timer with device's component thread
+		// Initialize timer with LivoxGen1 metadata io_service
 		delayTimer = std::make_unique<boost::asio::deadline_timer>(
-			context->deviceTmp->componentThread->getIoService());
+			smoThreadingModelDesc.componentThread->getIoService());

 		delayTimer->expires_from_now(boost::posix_time::milliseconds(5));
 		delayTimer->async_wait(
@@ -150,12 +233,12 @@ public:
 		(*livoxProto1.livoxProto1_device_getReturnModeReq)(
 			context->deviceTmp,
 			{context, std::bind(
-				&AttachDeviceReq::attachDeviceReq3,
+				&AttachDeviceReq::attachDeviceReq3_doCreateStimProducer,
 				context.get(), context,
 				std::placeholders::_1, std::placeholders::_2)});
 	}

-	void attachDeviceReq3(
+	void attachDeviceReq3_doCreateStimProducer(
 		std::shared_ptr<AttachDeviceReq> context,
 		bool success, uint8_t mode)
 	{
@@ -168,90 +251,120 @@ public:
 			return;
 		}

-		// Parse history buffer duration from quale-iface-api-params
-		int histbuffMs = 30000;  // Default: 30000ms (30 seconds)
-		const std::vector<std::string> histbuffParamNames = {
-			"history-buffer-duration-ms",
-			"hist-buff-duration-ms",
-			"histbuff-duration-ms",
-			"histbuff-ms"
-		};
-
-		// Loop through synonyms in reverse order; lattermost synonym wins.
-		for (auto synIt = histbuffParamNames.rbegin();
-			synIt != histbuffParamNames.rend(); ++synIt)
-		{
-			const auto& paramName = *synIt;
-			try {
-				histbuffMs = smo::device::DeviceAttachmentSpec
-					::parseRequiredParamAsInt(
-						context->spec->qualeIfaceApiParams, paramName);
-				break; // Found and parsed successfully
-			} catch (const std::exception&) {
-				// Parameter not found or parse error, continue to next synonym
-				continue;
-			}
-		}
-
-		// Create and add PcloudStimulusBuffer to collection now that device is ready
-		StimulusBuffer::PcloudFormatDesc formatDesc;
-		formatDesc.format = StimulusBuffer::PcloudFormatDesc::Format::XYZI;
-		auto pcloudStimBuff = std::make_shared<PcloudStimulusBuffer>(
-			context->spec, context->deviceTmp, formatDesc, histbuffMs, 30);
-
-		context->stimBuff = pcloudStimBuff;
-		context->deviceTmp->nAttachedStimBuffs++;
-		attachedStimBuffs.push_back(pcloudStimBuff);
-
-		pcloudStimBuff->start();
-
-		if (1 || OptionParser::getOptions().verbose)
+		if (1 || smoHooksPtr->OptionParser_getOptions().verbose)
 		{
 			std::cout << __func__ << ": Got return mode (" << (int)mode
 				<< ") for device: " << context->spec->deviceSelector
 				<< std::endl;
 		}

-		context->delayedEnablePcloudData(context);
-	}
-
-	// Helper method to delay and then call enablePcloudDataReq
-	void delayedEnablePcloudData(
-		std::shared_ptr<AttachDeviceReq> context)
-	{
-		// Initialize timer with device's component thread
-		delayTimer = std::make_unique<boost::asio::deadline_timer>(
-			context->stimBuff->device->componentThread->getIoService());
-
-		delayTimer->expires_from_now(boost::posix_time::milliseconds(5));
-		delayTimer->async_wait(
-			std::bind(
-				&AttachDeviceReq::attachDeviceReq4,
-				context.get(), context,
-				std::placeholders::_1));
-	}
-
-	void attachDeviceReq4(
-		std::shared_ptr<AttachDeviceReq> context,
-		const boost::system::error_code& error)
-	{
-		if (error)
+		/* Check if PcloudStimulusProducer already exists
+		 * (race condition or double-add)
+		 */
+		auto existingProducer = getStimulusProducer(context->spec);
+		if (existingProducer)
 		{
-			std::cerr << __func__ << ": Timer error: " << error.message()
+			throw std::runtime_error(
+				std::string(__func__) + ": PcloudStimulusProducer already "
+				"exists for device " + context->spec->deviceSelector + " "
+				"(race condition or double-add)");
+		}
+
+		// Create & add PcloudStimulusProducer to collection since dev now ready
+		PcloudStimulusProducer::PcloudFormatDesc formatDesc;
+		formatDesc.format = PcloudStimulusProducer::PcloudFormatDesc::Format
+			::XYZI;
+
+		// Parse n-dgrams-per-frame from stim-buff-api params (default: 84)
+		size_t nDgramsPerFrame = parseNDgramsPerFrame(context->spec);
+
+		auto pcloudDataProducer = std::make_shared<PcloudStimulusProducer>(
+			context->spec, context->deviceTmp, formatDesc, nDgramsPerFrame);
+
+		context->stimProducer = pcloudDataProducer;
+		context->deviceTmp->nAttachedStimulusProducers++;
+		if (context->deviceTmp->nAttachedStimulusProducers > 2)
+		{
+			throw std::runtime_error(
+				std::string(__func__) + ": Each LivoxGen1 device can only have "
+					"at most two StimulusProducers attached to it. Found "
+					+ std::to_string(
+						context->deviceTmp->nAttachedStimulusProducers) + ".");
+		}
+
+		attachedStimulusProducers.push_back(pcloudDataProducer);
+		if (false
+			/*attachedStimulusProducers.size() >= 2*nDevicesKnownToGen1Lib */)
+		{
+			/**	TODO:
+			 * It would be nice to add an nDevicesKnownToGen1Lib counter, and
+			 * then add a check here to ensure that
+			 * attachedStimulusProducers.size() is always less than or equal to
+			 * 2*nDevicesKnownToGen1Lib.
+			 *
+			 * (2 stim producers per device).
+			 */
+#if 0
+			throw std::runtime_error(
+				std::string(__func__) + ": Number of StimulusProducers attached "
+				"to LivoxGen1 devices known to the library ("
+				+ std::to_string(attachedStimulusProducers.size())
+				+ ") is greater than "
+				"expected. Lib knows about "
+				+ std::to_string(nDevicesKnownToGen1Lib) + " devices, "
+				"so there should be at most "
+				+ std::to_string(2*nDevicesKnownToGen1Lib)
+				+ " StimulusProducers attached to devices.");
+#endif
+		}
+
+		pcloudDataProducer->start();
+
+		// Ensure StimBuffer is attached
+		attachDeviceReq4_doCreateStimBuff_maybeDirectlyCalled(context);
+	}
+
+	// Ensure StimBuffer is attached
+	void attachDeviceReq4_doCreateStimBuff_maybeDirectlyCalled(
+		std::shared_ptr<AttachDeviceReq> context
+		)
+	{
+		// Ensure StimBuffer is attached
+		if (!ensureStimBufferAttached(context))
+		{
+			context->callOriginalCb(false, context->spec);
+			return;
+		}
+
+		// Continue to enable pcloud data if needed
+		attachDeviceReq5_doEnablePcloudData_maybeDirectlyCalled(context);
+	}
+
+	// Enable pcloud data if needed
+	void attachDeviceReq5_doEnablePcloudData_maybeDirectlyCalled(
+		std::shared_ptr<AttachDeviceReq> context
+		)
+	{
+		if (!context->stimProducer || !context->stimProducer->device)
+		{
+			std::cerr << __func__ << ": stimProducer or device is null"
 				<< std::endl;
 			context->callOriginalCb(false, context->spec);
 			return;
 		}

+		/* Enable pcloud data. Don't need delay since no commands were
+		 * sent to device prior to us reaching here (or delay already handled).
+		 */
 		(*livoxProto1.livoxProto1_device_enablePcloudDataReq)(
-			context->stimBuff->device,
+			context->stimProducer->device,
 			{context, std::bind(
-			&AttachDeviceReq::attachDeviceReq5,
-			context.get(), context,
-			std::placeholders::_1)});
+				&AttachDeviceReq::attachDeviceReq6,
+				context.get(), context,
+				std::placeholders::_1)});
 	}

-	void attachDeviceReq5(
+	void attachDeviceReq6(
 		std::shared_ptr<AttachDeviceReq> context,
 		bool success)
 	{
@@ -264,7 +377,7 @@ public:
 			return;
 		}

-		if (1 || OptionParser::getOptions().verbose)
+		if (1 || smoHooksPtr->OptionParser_getOptions().verbose)
 		{
 			std::cout << __func__ << ": Enabled pcloud data for device: "
 				<< context->spec->deviceSelector << std::endl;
@@ -280,16 +393,16 @@ class DetachDeviceReq
 public:
 	DetachDeviceReq(
 		const std::shared_ptr<smo::device::DeviceAttachmentSpec>& spec,
-		const std::shared_ptr<PcloudStimulusBuffer>& stimBuff,
+		const std::shared_ptr<StimulusBuffer>& stimBuffer,
 		smo::Callback<sal_mlo_detachDeviceReqCbFn> cb)
 	:	smo::NonPostedAsynchronousContinuation<sal_mlo_detachDeviceReqCbFn>(
 		std::move(cb)),
-	spec(spec), stimBuff(stimBuff)
+	spec(spec), stimBuffer(stimBuffer)
 	{}

 public:
 	const std::shared_ptr<smo::device::DeviceAttachmentSpec> spec;
-	std::shared_ptr<PcloudStimulusBuffer> stimBuff;
+	std::shared_ptr<StimulusBuffer> stimBuffer;
 private:
 	std::unique_ptr<boost::asio::deadline_timer> delayTimer;

@@ -301,7 +414,7 @@ public:
 		if (!success)
 		{
 			std::cerr << __func__ << ": Failed to disable pcloud data for "
-				"stimbuff " << context->spec->deviceSelector << std::endl;
+				"stim producer " << context->spec->deviceSelector << std::endl;
 			// Fallthrough.
 		}

@@ -313,9 +426,9 @@ public:
 	void delayedDestroyDevice(
 		std::shared_ptr<DetachDeviceReq> context)
 	{
-		// Initialize timer with device's component thread
+		// Initialize timer with LivoxGen1 metadata io_service
 		delayTimer = std::make_unique<boost::asio::deadline_timer>(
-			context->stimBuff->device->componentThread->getIoService());
+			smoThreadingModelDesc.componentThread->getIoService());

 		delayTimer->expires_from_now(boost::posix_time::milliseconds(5));
 		delayTimer->async_wait(
@@ -337,17 +450,50 @@ public:
 			// Fallthrough.
 		}

-		context->stimBuff->stop();
-		// Remove stimulus buffer from collection before destroying device
-		context->stimBuff->device->nAttachedStimBuffs--;
-		auto it = std::find(
-			attachedStimBuffs.begin(), attachedStimBuffs.end(),
-			context->stimBuff);
-		if (it != attachedStimBuffs.end())
-			{ attachedStimBuffs.erase(it); }
+		// Remove StimBuffer from collection if it exists
+		if (!context->stimBuffer)
+		{
+			throw std::runtime_error(std::string(__func__)
+				+ ": stimBuffer (API: " + context->spec->stimBuffApi + ") "
+				+ "is missing in detachDeviceReq1_delayed "
+				+ "for device " + context->spec->deviceSelector);
+		}
+
+		// Get the producer from the buffer's parent
+		auto& stimProducer = dynamic_cast<PcloudStimulusProducer&>(
+			context->stimBuffer->parent);
+
+		stimProducer.destroyAttachedStimulusBuffer(context->stimBuffer);
+
+		// Check if StimProducer has other buffers
+		if (!stimProducer.attachedStimulusBuffers.empty())
+		{
+			// Other buffers exist - just remove this buffer, done
+			context->callOriginalCb(true, context->spec);
+			return;
+		}
+
+		// No other buffers - stop and remove StimProducer
+		stimProducer.stop();
+		// Remove stimulus producer from collection before destroying device
+		stimProducer.device->nAttachedStimulusProducers--;
+		// Find and remove the producer from the collection by comparing device
+		auto it2 = std::find_if(
+			attachedStimulusProducers.begin(), attachedStimulusProducers.end(),
+			[&stimProducer](const std::shared_ptr<StimulusProducer>& p)
+			{
+				/**	FIXME:
+				 * When we implement the ImuStimulusProducer, we need to make
+				 * sure we handle that properly here.
+				 */
+				auto pcloudProd = std::dynamic_pointer_cast<PcloudStimulusProducer>(p);
+				return pcloudProd && pcloudProd->device == stimProducer.device;
+			});
+		if (it2 != attachedStimulusProducers.end())
+			{ attachedStimulusProducers.erase(it2); }

 		(*livoxProto1.livoxProto1_destroyDeviceReq)(
-			context->stimBuff->device,
+			stimProducer.device,
 			{context, std::bind(
 				&DetachDeviceReq::detachDeviceReq2,
 				context.get(), context,
@@ -361,21 +507,21 @@ public:
 		if (!success)
 		{
 			std::cerr << __func__ << ": Failed to destroy dev "
-				"device " << context->spec->deviceSelector << " for stimbuff."
-				"\n";
+				"device " << context->spec->deviceSelector << " for stim "
+				"producer.\n";

 			/**	NOTE:
 			 * There's a decent argument for falling through here and still
-			 * removing the stimulus buffer from attachedStimBuffs.
+			 * removing the stimulus producer from attachedStimulusProducers.
 			 */
 			context->callOriginalCb(false, context->spec);
 			return;
 		}

-		if (1 || OptionParser::getOptions().verbose)
+		if (1 || smoHooksPtr->OptionParser_getOptions().verbose)
 		{
-			std::cout << __func__ << ": Successfully detached pcloud stimbuff "
-				"for device " << context->spec->deviceSelector
+			std::cout << __func__ << ": Successfully detached pcloud stim "
+				"producer for device " << context->spec->deviceSelector
 				<< " and possibly also destroyed device.\n";
 		}

@@ -393,8 +539,9 @@ extern "C" sal_mlo_detachDeviceReqFn livoxGen1_detachDeviceReq;
 static const StimBuffApiDesc livoxGen1ApiDesc = {
 	.name = "livoxGen1",
 	.exportedQualeIfaceApis = {
-		{.name = "pcloud"},
+		{.name = "mesh"},
 		{.name = "pcloudIntensity"},
+		{.name = "pcloudAmbience"},
 		{.name = "gyro"},
 		{.name = "accel"}
 	},
@@ -406,6 +553,13 @@ static const StimBuffApiDesc livoxGen1ApiDesc = {
 	}
 };

+static bool isSupportedQualeIfaceApi(const std::string& qualeIfaceApi)
+{
+	// Check if this is a supported (implemented) qualeIfaceApi
+	return qualeIfaceApi == "mesh" || qualeIfaceApi == "pcloudIntensity" ||
+		qualeIfaceApi == "pcloudAmbience";
+}
+
 // Callback function implementations
 extern "C" int livoxGen1_initializeInd(void)
 {
@@ -489,7 +643,7 @@ extern "C" int livoxGen1_initializeInd(void)
 extern "C" int livoxGen1_finalizeInd(void)
 {

-	attachedStimBuffs.clear();
+	attachedStimulusProducers.clear();

 	// Call LivoxProto1 library exit function
 	if (livoxProto1.livoxProto1_exit) {
@@ -515,36 +669,84 @@ extern "C" void livoxGen1_attachDeviceReq(
 			"not available");
 	}

-	auto request = std::make_shared<AttachDeviceReq>(desc, cb);
-
-	// Check if stimulus buffer already exists in the collection
-	auto pcloudStimBuff = std::static_pointer_cast<PcloudStimulusBuffer>(
-		getStimBuff(desc));
-
-	if (pcloudStimBuff)
+	// Validate qualeIfaceApi
+	const std::string& qualeIfaceApi = desc->qualeIfaceApi;
+	if (qualeIfaceApi == "gyro" || qualeIfaceApi == "accel")
 	{
-		request->stimBuff = pcloudStimBuff;
-
-		// Check if device's point cloud data is already active
-		if (pcloudStimBuff->device && pcloudStimBuff->device->pcloudDataActive)
-		{
-			// Point cloud data is already active, call success callback
-			request->callOriginalCb(true, request->spec);
-			return;
-		}
-
-		/* Enable pcloud data first. Don't need delay since no commands were
-		 * sent to device prior to us reaching here.
-		 */
-		(*livoxProto1.livoxProto1_device_enablePcloudDataReq)(
-			pcloudStimBuff->device,
-			{request, std::bind(
-				&AttachDeviceReq::attachDeviceReq5,
-				request.get(), request,
-				std::placeholders::_1)});
+		// These are for ImuStimulusProducer (not yet implemented)
+		std::cerr << __func__ << ": qualeIfaceApi '" << qualeIfaceApi
+			<< "' requires ImuStimulusProducer which is not yet implemented"
+			<< std::endl;
+		cb.callbackFn(false, desc);
 		return;
 	}

+	if (!isSupportedQualeIfaceApi(qualeIfaceApi))
+	{
+		// Unknown qualeIfaceApi
+		std::cerr << __func__ << ": Unsupported qualeIfaceApi '"
+			<< qualeIfaceApi << "' for LivoxGen1. "
+			"Supported values: mesh, pcloudIntensity, pcloudAmbience"
+			<< std::endl;
+		cb.callbackFn(false, desc);
+		return;
+	}
+
+	auto request = std::make_shared<AttachDeviceReq>(desc, cb);
+
+	// Case 1: Check if StimBuffer already exists
+	auto stimProducer = std::dynamic_pointer_cast<PcloudStimulusProducer>(
+		getStimulusProducer(desc));
+	if (stimProducer)
+	{
+		auto existingBuffer = stimProducer->getAttachedStimulusBuffer(desc);
+		if (existingBuffer)
+		{
+			// StimBuffer exists, check if pcloud data is active
+			if (stimProducer->device && stimProducer->device->pcloudDataActive)
+			{
+				// Both StimBuffer and pcloud data are active, early return with success
+				request->callOriginalCb(true, request->spec);
+				return;
+			}
+
+			// StimBuffer exists but pcloud data is not active, enable it
+			request->stimProducer = stimProducer;
+			request->attachDeviceReq5_doEnablePcloudData_maybeDirectlyCalled(
+				request);
+
+			return;
+		}
+		else
+		{
+			/**	EXPLANATION:
+			 * StimProducer exists, StimBuffer doesn't (DASpec doesn't match)
+			 * Check if producer already has a buffer with the requested
+			 * qualeIfaceApi but different DASpec - this is not allowed.
+			 */
+			if (stimProducer->hasBufferWithQualeIfaceApi(desc->qualeIfaceApi))
+			{
+				std::cerr << __func__ << ": Producer already has a buffer with "
+					"qualeIfaceApi '" << desc->qualeIfaceApi
+					<< "' but with a different DeviceAttachmentSpec. "
+					"A single LivoxGen1 device cannot support multiple DASpecs "
+					"with the same qualeIfaceApi." << std::endl;
+
+				cb.callbackFn(false, desc);
+				return;
+			}
+
+			request->stimProducer = stimProducer;
+			// Ensure StimBuffer is attached and enable pcloud data if needed
+			request->attachDeviceReq4_doCreateStimBuff_maybeDirectlyCalled(
+				request);
+
+			return;
+		}
+	}
+
+	// StimProducer doesn't exist - need to create device first
+
 	// Parse integer parameters from provider params with defaults
 	/**	EXPLANATION:
 	 * We may want to add a new param here called "command-delay-ms" to control
@@ -648,22 +850,41 @@ extern "C" void livoxGen1_detachDeviceReq(
 	Callback<smo::stim_buff::sal_mlo_detachDeviceReqCbFn> cb
 	)
 {
-	// Check if stimulus buffer exists in the collection
-	auto stimBuff = std::static_pointer_cast<PcloudStimulusBuffer>(
-		getStimBuff(desc));
-
-	if (!stimBuff)
+	// Case 1: Check if StimBuffer doesn't exist (early return)
+	auto stimProducerBase = getStimulusProducer(desc);
+	if (!stimProducerBase)
 	{
-		cb.callbackFn(false, desc);
+		// StimProducer doesn't exist, nothing to detach - success
+		cb.callbackFn(true, desc);
 		return;
 	}

+	auto stimProducer = std::dynamic_pointer_cast<PcloudStimulusProducer>(
+		stimProducerBase);
+
+	if (!stimProducer)
+	{
+		throw std::runtime_error(std::string(__func__) +
+			": Failed to cast StimulusProducer to PcloudStimulusProducer "
+			"for device " + desc->deviceSelector);
+	}
+
+	// Check if StimBuffer exists
+	auto stimBuffer = stimProducer->getAttachedStimulusBuffer(desc);
+	if (!stimBuffer)
+	{
+		// StimBuffer doesn't exist, nothing to detach - success
+		cb.callbackFn(true, desc);
+		return;
+	}
+
+	// Case 2: StimBuffer exists - proceed with detach
 	auto request = std::make_shared<DetachDeviceReq>(
-		desc, stimBuff, cb);
+		desc, stimBuffer, cb);

 	// Disable point cloud data first
 	(*livoxProto1.livoxProto1_device_disablePcloudDataReq)(
-		stimBuff->device,
+		stimProducer->device,
 		{request, std::bind(
 			&DetachDeviceReq::detachDeviceReq1,
 			request.get(), request,
@@ -0,0 +1,48 @@
+#ifndef _LIVOX_GEN1_MESH_STIMULUS_BUFFER_H
+#define _LIVOX_GEN1_MESH_STIMULUS_BUFFER_H
+
+#include <memory>
+#include <user/stimulusBuffer.h>
+#include <user/stagingBuffer.h>
+
+namespace smo {
+namespace stim_buff {
+
+// Forward declaration
+class StimulusProducer;
+
+/**
+ * MeshStimulusBuffer is a specialized StimulusBuffer for mesh data.
+ */
+class MeshStimulusBuffer
+:	public StimulusBuffer
+{
+public:
+	explicit MeshStimulusBuffer(
+		StimulusProducer& parent,
+		const std::shared_ptr<device::DeviceAttachmentSpec>& deviceAttachmentSpec,
+		int histbuffMs,
+		const StagingBuffer::IOEngineConstraints& inputEngineConstraints,
+		const StagingBuffer::IOEngineConstraints& outputEngineConstraints,
+		const SmoCallbacks& callbacks,
+		cl_mem_flags flags)
+	:	StimulusBuffer(
+			parent, deviceAttachmentSpec, histbuffMs,
+			inputEngineConstraints, outputEngineConstraints,
+			callbacks, flags)
+	{}
+
+	~MeshStimulusBuffer() = default;
+
+	// Non-copyable, movable
+	MeshStimulusBuffer(const MeshStimulusBuffer&) = delete;
+	MeshStimulusBuffer& operator=(const MeshStimulusBuffer&) = delete;
+	MeshStimulusBuffer(MeshStimulusBuffer&&) = default;
+	MeshStimulusBuffer& operator=(MeshStimulusBuffer&&) = default;
+};
+
+} // namespace stim_buff
+} // namespace smo
+
+#endif // _LIVOX_GEN1_MESH_STIMULUS_BUFFER_H
+
@@ -0,0 +1,234 @@
+#ifndef _LIVOX_GEN1_OPENCL_COLLATING_AND_MESHING_ENGINE_H
+#define _LIVOX_GEN1_OPENCL_COLLATING_AND_MESHING_ENGINE_H
+
+#include <boostAsioLinkageFix.h>
+#include <cstdint>
+#include <cstddef>
+#include <memory>
+#include <functional>
+#include <optional>
+#include <iostream>
+#include <stdexcept>
+#include <chrono>
+#define CL_TARGET_OPENCL_VERSION 120
+#include <CL/cl.h>
+#include <asynchronousLoop.h>
+#include <callback.h>
+#include <spinLock.h>
+#include <user/stimulusFrame.h>
+#include <user/stagingBuffer.h>
+#include <user/frameAssemblyDesc.h>
+#include <user/compute.h>
+#include <user/senseApiDesc.h>
+
+namespace smo {
+namespace stim_buff {
+
+class PcloudStimulusProducer;
+
+class OpenClCollatingAndMeshingEngine
+{
+public:
+	explicit OpenClCollatingAndMeshingEngine(PcloudStimulusProducer& parent);
+	~OpenClCollatingAndMeshingEngine();
+
+	// Non-copyable, movable
+	OpenClCollatingAndMeshingEngine(
+		const OpenClCollatingAndMeshingEngine&) = delete;
+	OpenClCollatingAndMeshingEngine& operator=(
+		const OpenClCollatingAndMeshingEngine&) = delete;
+	OpenClCollatingAndMeshingEngine(
+		OpenClCollatingAndMeshingEngine&&) = default;
+	OpenClCollatingAndMeshingEngine& operator=(
+		OpenClCollatingAndMeshingEngine&&) = default;
+
+	bool setup();
+	void finalize();
+
+	typedef std::function<void(bool, StimulusFrame&)>
+		compactCollateAndMeshFrameReqCbFn;
+	void compactCollateAndMeshFrameReq(
+		AsynchronousLoop& asyncLoop, StimulusFrame& stimulusFrame,
+		std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
+		std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame,
+		Callback<compactCollateAndMeshFrameReqCbFn> callback);
+
+private:
+	// Callback function types
+	typedef std::function<void(cl_int)> compactKernelCbFn;
+	typedef std::function<void(cl_int)> collateKernelCbFn;
+
+	bool startCompactKernel(
+		StagingBuffer& assemblyBuff, uint32_t nSucceeded,
+		compactKernelCbFn callback);
+	bool startCollateKernel(
+		StagingBuffer& assemblyBuff, StagingBuffer& collationBuff,
+		std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
+		std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame,
+		collateKernelCbFn callback);
+
+	void compactKernelComplete(bool isFinalizing=false);
+	void collateKernelComplete(
+		std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
+		std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame,
+		bool isFinalizing=false);
+	bool stop();
+
+public:
+	// Get kernel execution durations in milliseconds
+	std::chrono::milliseconds getCompactKernelDuration() const;
+	std::chrono::milliseconds getCollateKernelDuration() const;
+
+private:
+	PcloudStimulusProducer& parent;
+
+	// OpenCL infrastructure (managed by ComputeManager)
+	std::shared_ptr<smo::compute::ComputeDevice> computeDevice;
+	cl_program slotCompactorProgram;
+	cl_program collateProgram;
+	cl_kernel slotCompactorKernel;
+	cl_kernel collateKernel;
+
+	// OpenCL buffers (managed by ComputeManager)
+	std::shared_ptr<smo::compute::ClBuffer> clAssemblyBufferClBuffer;
+	std::shared_ptr<smo::compute::ClBuffer> clCollationBufferClBuffer;
+	// Cached cl_mem handles for the device we're using
+	cl_mem clAssemblyBuffer;
+	cl_mem clCollationBuffer;
+
+	// State tracking
+	SpinLock shouldAcceptRequestsLock;
+	bool shouldAcceptRequests;
+	bool compactIsRunning;
+	bool collateIsRunning;
+	cl_event currentCompactKernelEvent;
+	cl_event currentCollateKernelEvent;
+
+	// Memory tracking
+	void* assemblyBufferPtr;
+	size_t assemblyBufferSize;
+	void* collationBufferPtr;
+	size_t collationBufferSize;
+	// Mapped buffer pointers (for zero-copy synchronization)
+	void* mappedAssemblyBuffer;
+	void* mappedCollationBuffer;
+
+	// Frame descriptor (cached from setup)
+	std::shared_ptr<FrameAssemblyDesc> frameAssemblyDesc;
+
+	// Callback storage
+	compactKernelCbFn compactKernelCb;
+	collateKernelCbFn collateKernelCb;
+
+	// Timestamp tracking for kernel execution
+	std::chrono::high_resolution_clock::time_point compactKernelStartTime;
+	std::chrono::high_resolution_clock::time_point compactKernelEndTime;
+	std::chrono::high_resolution_clock::time_point collateKernelStartTime;
+	std::chrono::high_resolution_clock::time_point collateKernelEndTime;
+
+	// Static callbacks for OpenCL events
+	static void CL_CALLBACK compactKernelEventCallback(
+		cl_event event, cl_int event_command_exec_status, void* user_data);
+	static void CL_CALLBACK collateKernelEventCallback(
+		cl_event event, cl_int event_command_exec_status, void* user_data);
+
+	// Private helper methods
+	bool compileAndPrepareKernel(
+		const char* kernelSource, size_t kernelSourceLen,
+		const char* kernelName, cl_program& program, cl_kernel& kernel);
+	bool compileAndPrepareKernels();
+	bool setupSlotCompactorsArgs(
+		StagingBuffer& assemblyBuff, uint32_t nSucceeded);
+	bool setupCollateDgramsArgs(
+		StagingBuffer& assemblyBuff,
+		std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame,
+		std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame);
+
+	// Generic buffer mapping/unmapping for zero-copy synchronization
+	bool mapBuffer(
+		cl_mem buffer, size_t size, cl_map_flags mapFlags, void*& mappedPtr);
+	bool unmapBuffer(cl_mem buffer, void*& mappedPtr);
+
+	// Wrapper functions for specific buffers
+	bool mapAssemblyBuffer(cl_map_flags mapFlags = CL_MAP_READ);
+	bool unmapAssemblyBuffer();
+	bool mapCollationBuffer(cl_map_flags mapFlags = CL_MAP_READ);
+	bool unmapCollationBuffer();
+
+	// Forward declaration for continuation class
+	class CompactCollateAndMeshFrameReq;
+
+	// Unified kernel start function
+	template<typename SetupArgsFn, typename ValidateBuffersFn>
+	bool startKernel(
+		cl_kernel kernel,
+		cl_event* eventPtr,
+		SetupArgsFn setupArgsFn,
+		ValidateBuffersFn validateBuffersFn,
+		size_t globalWorkSize,
+		void (CL_CALLBACK *eventCallback)(cl_event, cl_int, void*),
+		const char* kernelName,
+		bool& isRunning)
+	{
+		if (isRunning)
+		{
+			std::cerr << __func__ << ": already running, call stop() first"
+				<< std::endl;
+			return false;
+		}
+
+		// Validate buffers
+		validateBuffersFn();
+
+		// Set up kernel arguments
+		if (!setupArgsFn()) {
+			return false;
+		}
+
+		// Enqueue kernel execution
+		cl_int err = clEnqueueNDRangeKernel(
+			computeDevice->commandQueue, kernel,
+			1, nullptr, &globalWorkSize, nullptr,
+			0, nullptr, eventPtr);
+
+		if (err != CL_SUCCESS)
+		{
+			std::cerr << __func__ << ": failed to enqueue " << kernelName
+				<< " kernel: " << err << std::endl;
+			return false;
+		}
+
+		// Set up callback using static member function
+		err = clSetEventCallback(
+			*eventPtr, CL_COMPLETE, eventCallback, this);
+
+		if (err != CL_SUCCESS)
+		{
+			std::cerr << __func__ << ": failed to set event callback: " << err
+				<< std::endl;
+			clReleaseEvent(*eventPtr);
+			*eventPtr = nullptr;
+			return false;
+		}
+
+		// Force queue flush to ensure event processing and callback invocation
+		err = clFlush(computeDevice->commandQueue);
+		if (err != CL_SUCCESS)
+		{
+			std::cerr << __func__ << ": failed to flush queue: " << err
+				<< std::endl;
+			clReleaseEvent(*eventPtr);
+			*eventPtr = nullptr;
+			return false;
+		}
+
+		isRunning = true;
+		return true;
+	}
+};
+
+} // namespace stim_buff
+} // namespace smo
+
+#endif // _LIVOX_GEN1_OPENCL_COLLATING_AND_MESHING_ENGINE_H
+
@@ -0,0 +1,33 @@
+	.section .rodata
+	.global collateKernelStart
+	.global collateKernelNBytes
+	.type collateKernelStart, @object
+	.type collateKernelNBytes, @object
+
+collateKernelStart:
+	.incbin "collateDgrams.cl"
+	.size collateKernelStart, . - collateKernelStart
+.collateKernelEnd:
+
+	.section .data
+collateKernelNBytes:
+	.long .collateKernelEnd - collateKernelStart
+	.size collateKernelNBytes, 4
+
+	.section .rodata
+	.global slotCompactorKernelStart
+	.global slotCompactorKernelNBytes
+	.type slotCompactorKernelStart, @object
+	.type slotCompactorKernelNBytes, @object
+
+slotCompactorKernelStart:
+	.incbin "slotCompactor.cl"
+	.size slotCompactorKernelStart, . - slotCompactorKernelStart
+.slotCompactorKernelEnd:
+
+	.section .data
+slotCompactorKernelNBytes:
+	.long .slotCompactorKernelEnd - slotCompactorKernelStart
+	.size slotCompactorKernelNBytes, 4
+
+	.section .note.GNU-stack,"",@progbits
@@ -0,0 +1,22 @@
+#ifndef _LIVOX_GEN1_OPENCL_KERNELS_H
+#define _LIVOX_GEN1_OPENCL_KERNELS_H
+
+#include <cstdint>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// External symbols for collate kernel (unmangled, not namespaced)
+extern const char collateKernelStart[];
+extern const uint32_t collateKernelNBytes;
+
+// External symbols for slotCompactor kernel (unmangled, not namespaced)
+extern const char slotCompactorKernelStart[];
+extern const uint32_t slotCompactorKernelNBytes;
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // _LIVOX_GEN1_OPENCL_KERNELS_H
@@ -1,119 +0,0 @@
-#include <stdexcept>
-#include <iostream>
-#include <cstring>
-#include "openClSplittingEngine.h"
-#include "pcloudStimulusBuffer.h"
-
-namespace smo {
-namespace stim_buff {
-
-OpenClSplittingEngine::OpenClSplittingEngine(PcloudStimulusBuffer& parent_)
-: parent(parent_),
-platform(nullptr),
-device(nullptr),
-context(nullptr),
-commandQueue(nullptr),
-program(nullptr),
-kernel(nullptr),
-isSetup(false),
-assemblyBuffer(nullptr),
-xyzBuffer(nullptr),
-intensityBuffer(nullptr)
-{
-}
-
-OpenClSplittingEngine::~OpenClSplittingEngine()
-{
-	finalize();
-}
-
-bool OpenClSplittingEngine::setup()
-{
-	if (isSetup) {
-		return true;
-	}
-
-	cl_int err;
-
-	// Get platform
-	cl_uint numPlatforms;
-	err = clGetPlatformIDs(1, &platform, &numPlatforms);
-	if (err != CL_SUCCESS || numPlatforms == 0) {
-		std::cerr << __func__ << ": failed to get OpenCL platform: "
-			<< err << std::endl;
-		return false;
-	}
-
-	// Get device
-	err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, nullptr);
-	if (err != CL_SUCCESS) {
-		std::cerr << __func__ << ": failed to get GPU device: "
-			<< err << std::endl;
-		return false;
-	}
-
-	// Create context
-	context = clCreateContext(nullptr, 1, &device, nullptr, nullptr, &err);
-	if (err != CL_SUCCESS || !context) {
-		std::cerr << __func__ << ": failed to create OpenCL context: "
-			<< err << std::endl;
-		goto cleanup;
-	}
-
-	// Create command queue
-	commandQueue = clCreateCommandQueue(context, device, 0, &err);
-	if (err != CL_SUCCESS || !commandQueue) {
-		std::cerr << __func__ << ": failed to create command queue: "
-			<< err << std::endl;
-		goto cleanup;
-	}
-
-	// TODO: Create program and kernel
-	// TODO: Create buffers
-
-	isSetup = true;
-	return true;
-
-cleanup:
-	finalize();
-	return false;
-}
-
-void OpenClSplittingEngine::finalize()
-{
-	if (intensityBuffer) {
-		clReleaseMemObject(intensityBuffer);
-		intensityBuffer = nullptr;
-	}
-	if (xyzBuffer) {
-		clReleaseMemObject(xyzBuffer);
-		xyzBuffer = nullptr;
-	}
-	if (assemblyBuffer) {
-		clReleaseMemObject(assemblyBuffer);
-		assemblyBuffer = nullptr;
-	}
-	if (kernel) {
-		clReleaseKernel(kernel);
-		kernel = nullptr;
-	}
-	if (program) {
-		clReleaseProgram(program);
-		program = nullptr;
-	}
-	if (commandQueue) {
-		clReleaseCommandQueue(commandQueue);
-		commandQueue = nullptr;
-	}
-	if (context) {
-		clReleaseContext(context);
-		context = nullptr;
-	}
-
-	device = nullptr;
-	platform = nullptr;
-	isSetup = false;
-}
-
-} // namespace stim_buff
-} // namespace smo
@@ -1,50 +0,0 @@
-#ifndef _LIVOX_GEN1_OPENCL_SPLITTING_ENGINE_H
-#define _LIVOX_GEN1_OPENCL_SPLITTING_ENGINE_H
-
-#include <cstdint>
-#include <cstddef>
-#include <memory>
-#include <CL/cl.h>
-
-namespace smo {
-namespace stim_buff {
-
-class PcloudStimulusBuffer;
-
-class OpenClSplittingEngine
-{
-public:
-	explicit OpenClSplittingEngine(PcloudStimulusBuffer& parent);
-	~OpenClSplittingEngine();
-
-	// Non-copyable, movable
-	OpenClSplittingEngine(const OpenClSplittingEngine&) = delete;
-	OpenClSplittingEngine& operator=(const OpenClSplittingEngine&) = delete;
-	OpenClSplittingEngine(OpenClSplittingEngine&&) = default;
-	OpenClSplittingEngine& operator=(OpenClSplittingEngine&&) = default;
-
-	bool setup();
-	void finalize();
-
-private:
-	PcloudStimulusBuffer& parent;
-
-	// OpenCL infrastructure
-	cl_platform_id platform;
-	cl_device_id device;
-	cl_context context;
-	cl_command_queue commandQueue;
-	cl_program program;
-	cl_kernel kernel;
-	bool isSetup;
-
-	// OpenCL buffers
-	cl_mem assemblyBuffer;
-	cl_mem xyzBuffer;
-	cl_mem intensityBuffer;
-};
-
-} // namespace stim_buff
-} // namespace smo
-
-#endif // _LIVOX_GEN1_OPENCL_SPLITTING_ENGINE_H
@@ -0,0 +1,53 @@
+#ifndef _LIVOX_GEN1_PCLOUD_AMBIENCE_STIMULUS_BUFFER_H
+#define _LIVOX_GEN1_PCLOUD_AMBIENCE_STIMULUS_BUFFER_H
+
+#include <memory>
+#include <cstdint>
+#include <user/stimulusBuffer.h>
+#include <user/stagingBuffer.h>
+
+namespace smo {
+namespace stim_buff {
+
+// Forward declaration
+class StimulusProducer;
+
+/**
+ * PcloudAmbienceStimulusBuffer is a specialized StimulusBuffer for ambience point cloud data.
+ */
+class PcloudAmbienceStimulusBuffer
+:	public StimulusBuffer
+{
+public:
+	explicit PcloudAmbienceStimulusBuffer(
+		StimulusProducer& parent,
+		const std::shared_ptr<device::DeviceAttachmentSpec>& deviceAttachmentSpec,
+		int histbuffMs,
+		const StagingBuffer::IOEngineConstraints& inputEngineConstraints,
+		const StagingBuffer::IOEngineConstraints& outputEngineConstraints,
+		const SmoCallbacks& callbacks,
+		cl_mem_flags flags,
+		uint32_t ambienceHighVal_)
+	:	StimulusBuffer(
+			parent, deviceAttachmentSpec, histbuffMs,
+			inputEngineConstraints, outputEngineConstraints,
+			callbacks, flags),
+	ambienceHighVal(ambienceHighVal_)
+	{}
+
+	~PcloudAmbienceStimulusBuffer() = default;
+
+	// Non-copyable, movable
+	PcloudAmbienceStimulusBuffer(const PcloudAmbienceStimulusBuffer&) = delete;
+	PcloudAmbienceStimulusBuffer& operator=(const PcloudAmbienceStimulusBuffer&) = delete;
+	PcloudAmbienceStimulusBuffer(PcloudAmbienceStimulusBuffer&&) = default;
+	PcloudAmbienceStimulusBuffer& operator=(PcloudAmbienceStimulusBuffer&&) = default;
+
+public:
+	uint32_t ambienceHighVal;
+};
+
+} // namespace stim_buff
+} // namespace smo
+
+#endif // _LIVOX_GEN1_PCLOUD_AMBIENCE_STIMULUS_BUFFER_H
@@ -0,0 +1,52 @@
+#ifndef _LIVOX_GEN1_PCLOUD_INTENSITY_STIMULUS_BUFFER_H
+#define _LIVOX_GEN1_PCLOUD_INTENSITY_STIMULUS_BUFFER_H
+
+#include <memory>
+#include <user/stimulusBuffer.h>
+#include <user/stagingBuffer.h>
+
+namespace smo {
+namespace stim_buff {
+
+// Forward declaration
+class StimulusProducer;
+
+/**
+ * PcloudIntensityStimulusBuffer is a specialized StimulusBuffer for intensity point cloud data.
+ */
+class PcloudIntensityStimulusBuffer
+:	public StimulusBuffer
+{
+public:
+	explicit PcloudIntensityStimulusBuffer(
+		StimulusProducer& parent,
+		const std::shared_ptr<device::DeviceAttachmentSpec>
+			&deviceAttachmentSpec,
+		int histbuffMs,
+		const StagingBuffer::IOEngineConstraints& inputEngineConstraints,
+		const StagingBuffer::IOEngineConstraints& outputEngineConstraints,
+		const SmoCallbacks& callbacks,
+		cl_mem_flags flags)
+	:	StimulusBuffer(
+			parent, deviceAttachmentSpec, histbuffMs,
+			inputEngineConstraints, outputEngineConstraints,
+			callbacks, flags)
+	{}
+
+	~PcloudIntensityStimulusBuffer() = default;
+
+	// Non-copyable, movable
+	PcloudIntensityStimulusBuffer(
+		const PcloudIntensityStimulusBuffer&) = delete;
+	PcloudIntensityStimulusBuffer& operator=(
+		const PcloudIntensityStimulusBuffer&) = delete;
+	PcloudIntensityStimulusBuffer(
+		PcloudIntensityStimulusBuffer&&) = default;
+	PcloudIntensityStimulusBuffer& operator=(
+		PcloudIntensityStimulusBuffer&&) = default;
+};
+
+} // namespace stim_buff
+} // namespace smo
+
+#endif // _LIVOX_GEN1_PCLOUD_INTENSITY_STIMULUS_BUFFER_H
@@ -1,122 +0,0 @@
-#include <config.h>
-#include <opts.h>
-#include <algorithm>
-#include <unistd.h>
-#include <user/spMcRingBuffer.h>
-#include <componentThread.h>
-#include "pcloudStimulusBuffer.h"
-
-namespace smo {
-namespace stim_buff {
-
-extern const SmoCallbacks* smoHooksPtr;
-
-// OpenCL kernels are used to collate and produce our StimFrames.
-static SpMcRingBuffer::InputEngineConstraints openClInputConstraints(
-	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)), sizeof(void *));
-
-PcloudStimulusBuffer::PcloudStimulusBuffer(
-	const std::shared_ptr<device::DeviceAttachmentSpec> &deviceAttachmentSpec,
-	std::shared_ptr<livoxProto1::Device> &device,
-	const PcloudFormatDesc& formatDesc,
-	int histbuffMs,
-	size_t nDgramsPerStagingBufferFrame)
-:	StimulusBuffer(
-		deviceAttachmentSpec,
-		static_cast<size_t>(histbuffMs / CONFIG_STIMBUFF_FRAME_PERIOD_MS),
-		openClInputConstraints,
-		device->componentThread->getIoService()),
-device(device),
-formatDesc(formatDesc), assemblyBuffer(
-	StagingBuffer::InputEngineConstraints::ioUringConstraints,
-	OpenClConstraints(), nDgramsPerStagingBufferFrame),
-ioUringAssemblyEngine(*this)
-{
-	std::cout << __func__ << ": Device's component thread is "
-		<< device->componentThread->name << std::endl;
-
-#ifndef CONFIG_WORLD_USE_BODY_THREAD
-	if (smoHooksPtr->ComponentThread_getSelf()->id != ComponentThread::WORLD)
-#else
-	if (smoHooksPtr->ComponentThread_getSelf()->id != ComponentThread::BODY)
-#endif
-	{
-		std::string errMsg = std::string(__func__) +
-			": PcloudStimulusBuffer constructor called on non-world/body thread " +
-			smoHooksPtr->ComponentThread_getSelf()->name;
-
-		std::cout << errMsg << std::endl;
-	//		throw std::runtime_error(errMsg);
-	}
-}
-
-void PcloudStimulusBuffer::start()
-{
-	// Call ioUringAssemblyEngine setup() as the final step
-	ioUringAssemblyEngine.setup();
-	// Call base class start() as the final step
-	StimulusBuffer::start();
-}
-
-void PcloudStimulusBuffer::stop()
-{
-	// Call base class stop() as the first step
-	StimulusBuffer::stop();
-	// Call ioUringAssemblyEngine stop() as the final step
-	ioUringAssemblyEngine.finalize();
-}
-
-void PcloudStimulusBuffer::stimFrameProductionTimesliceInd()
-{
-	ioUringAssemblyEngine.assembleFrameReq(
-		{nullptr, [this](bool success, AsynchronousLoop loop) {
-			if (!success) {
-				std::cerr << __func__ << ": Failed to assemble frame" << std::endl;
-			} else {
-				std::cout << __func__ << ": Successfully assembled frame "
-					<< loop.nSucceeded.load() << " slots succeeded "
-					<< "out of " << loop.nTotal << " total slots" << std::endl;
-			}
-		}});
-	// Release the spinlock for now
-	frameAssemblyRateLimiter.release();
-}
-
-class PcloudStimulusBuffer::AssembleAndProduceStimulusFrameReq
-:	public smo::PostedAsynchronousContinuation<
-		assembleAndProduceStimulusFrameReqCbFn>
-{
-private:
-	PcloudStimulusBuffer& stimBuff;
-
-public:
-	AssembleAndProduceStimulusFrameReq(
-		PcloudStimulusBuffer& buffer,
-		Callback<assembleAndProduceStimulusFrameReqCbFn> callback)
-	:	PostedAsynchronousContinuation<assembleAndProduceStimulusFrameReqCbFn>(
-			buffer.device->componentThread, std::move(callback)),
-		stimBuff(buffer)
-	{}
-
-	void callOriginalCallback()
-	{
-		stimBuff.frameAssemblyRateLimiter.release();
-		callOriginalCb(static_cast<SimultaneityStamp>(0));
-	}
-
-	void callOriginalCallbackWithFailure()
-	{
-		stimBuff.frameAssemblyRateLimiter.release();
-		callOriginalCb(static_cast<SimultaneityStamp>(0));
-	}
-};
-
-void PcloudStimulusBuffer::assembleAndProduceStimulusFrameReq(
-	smo::Callback<assembleAndProduceStimulusFrameReqCbFn> callback)
-{
-	// Wireframe implementation - do nothing for now
-	(void)callback;
-}
-
-} // namespace stim_buff
-} // namespace smo
@@ -1,70 +0,0 @@
-#ifndef _LIVOX_GEN1_PCLOUD_STIMULUS_BUFFER_H
-#define _LIVOX_GEN1_PCLOUD_STIMULUS_BUFFER_H
-
-#include <functional>
-#include <user/stimulusBuffer.h>
-#include <user/stimFrame.h>
-#include <livoxProto1/device.h>
-#include <asynchronousContinuation.h>
-#include <callback.h>
-#include "stagingBuffer.h"
-#include "ioUringAssemblyEngine.h"
-
-namespace smo {
-namespace stim_buff {
-
-/**
- * PcloudStimulusBuffer is a specialized StimulusBuffer for point cloud data.
- *
- * This class extends StimulusBuffer to handle point cloud-specific stimulus
- * frames, particularly those generated from LiDAR point cloud data. It
- * provides additional functionality for managing point cloud frame metadata
- * and processing.
- */
-class PcloudStimulusBuffer
-:	public StimulusBuffer
-{
-public:
-	explicit PcloudStimulusBuffer(
-		const std::shared_ptr<device::DeviceAttachmentSpec> &deviceAttachmentSpec,
-		std::shared_ptr<livoxProto1::Device> &device,
-		const PcloudFormatDesc& formatDesc,
-		int histbuffMs,
-		size_t nDgramsPerStagingBufferFrame);
-
-	~PcloudStimulusBuffer() = default;
-
-	// Non-copyable, movable
-	PcloudStimulusBuffer(const PcloudStimulusBuffer&) = delete;
-	PcloudStimulusBuffer& operator=(const PcloudStimulusBuffer&) = delete;
-	PcloudStimulusBuffer(PcloudStimulusBuffer&&) = default;
-	PcloudStimulusBuffer& operator=(PcloudStimulusBuffer&&) = default;
-
-	// Control methods
-	void start() override;
-	void stop() override;
-
-protected:
-	void stimFrameProductionTimesliceInd() override;
-
-	// Callback function type for assembleAndProduceStimulusFrameReq
-	typedef std::function<void(SimultaneityStamp)>
-		assembleAndProduceStimulusFrameReqCbFn;
-
-public:
-	void assembleAndProduceStimulusFrameReq(
-		smo::Callback<assembleAndProduceStimulusFrameReqCbFn> callback);
-
-	std::shared_ptr<livoxProto1::Device> device;
-	PcloudFormatDesc formatDesc;
-	StagingBuffer assemblyBuffer;
-	IoUringAssemblyEngine ioUringAssemblyEngine;
-
-private:
-	class AssembleAndProduceStimulusFrameReq;
-};
-
-} // namespace stim_buff
-} // namespace smo
-
-#endif // _LIVOX_GEN1_PCLOUD_STIMULUS_BUFFER_H
@@ -0,0 +1,512 @@
+#include <config.h>
+#include <opts.h>
+#include <algorithm>
+#include <unistd.h>
+#include <iomanip>
+#include <cstddef>
+#include <user/spMcRingBuffer.h>
+#include <componentThread.h>
+#include <asynchronousLoop.h>
+#include <user/stimulusFrame.h>
+#include <user/frameAssemblyDesc.h>
+#include <livoxProto1/device.h>
+#include "pcloudStimulusProducer.h"
+
+namespace smo {
+namespace stim_buff {
+
+extern const SmoCallbacks* smoHooksPtr;
+
+// OpenCL kernels are used to collate and produce our StimFrames.
+static StagingBuffer::IOEngineConstraints openClInputConstraints(
+	/**	FIXME:
+	 * This should eventually be aligned to 4B and padded to 12B.
+	 */
+	// slotStartAlignmentByteVal (page alignment)
+	sizeof(float),
+	// slotPadToNBytes (XYZ = 3 floats per point)
+	sizeof(float) * 3,
+	// frameStartAlignmentByteVal (page alignment)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
+	// framePadToNBytes (pointer size)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)));
+
+// OpenCL kernels are used to collate and produce our StimFrames.
+static StagingBuffer::IOEngineConstraints openClMeshInputConstraints(
+	// slotStartAlignmentByteVal (page alignment)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
+	// slotPadToNBytes: This is dynamically calculated based on the return mode.
+	sizeof(float) * 3,
+	// frameStartAlignmentByteVal (page alignment)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
+	// framePadToNBytes (pointer size)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)));
+
+static StagingBuffer::IOEngineConstraints openClIntensityInputConstraints(
+	// slotStartAlignmentByteVal (page alignment)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
+	// slotPadToNBytes: This is dynamically calculated based on the return mode.
+	sizeof(float),
+	// frameStartAlignmentByteVal (page alignment)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
+	// framePadToNBytes (pointer size)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)));
+
+static StagingBuffer::IOEngineConstraints openClAmbienceInputConstraints(
+	// slotStartAlignmentByteVal (page alignment)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
+	// slotPadToNBytes: This is dynamically calculated based on the return mode.
+	sizeof(uint32_t),
+	// frameStartAlignmentByteVal (page alignment)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)),
+	// framePadToNBytes (pointer size)
+	static_cast<size_t>(sysconf(_SC_PAGE_SIZE)));
+
+PcloudStimulusProducer::PcloudStimulusProducer(
+	const std::shared_ptr<device::DeviceAttachmentSpec> &deviceAttachmentSpec,
+	std::shared_ptr<livoxProto1::Device> &device,
+	const PcloudFormatDesc& formatDesc,
+	size_t nDgramsPerStagingBufferFrame)
+:	StimulusProducer(
+		deviceAttachmentSpec,
+		device->componentThread->getIoService()),
+nDgramsPerStagingBufferFrame(nDgramsPerStagingBufferFrame),
+device(device),
+formatDesc(formatDesc),
+openClCollatingAndMeshingEngine(*this),
+assemblyBuffer(
+	StagingBuffer::IOEngineConstraints::ioUringConstraints,
+	StagingBuffer::IOEngineConstraints::openClInputConstraints,
+	nDgramsPerStagingBufferFrame),
+ioUringAssemblyEngine(*this, nDgramsPerStagingBufferFrame),
+collationBuffer(
+	StagingBuffer::IOEngineConstraints::openClInputConstraints,
+	StagingBuffer::IOEngineConstraints::openClInputConstraints,
+	nDgramsPerStagingBufferFrame),
+tempStimulusFrameMem(0),
+tempStimulusFrame(
+	FrameAssemblyDesc::SlotDesc{
+		0,
+		reinterpret_cast<uint8_t*>(&tempStimulusFrameMem),
+		sizeof(tempStimulusFrameMem)},
+	*smoHooksPtr, 0, SIZE_MAX)
+{
+	if (smoHooksPtr->OptionParser_getOptions().verbose)
+	{
+		std::cout << __func__ << ": assembly buffer : "
+			<< assemblyBuffer.stringify()
+			<< "\n\tcollation buffer : " << collationBuffer.stringify()
+			<< "\n";
+	}
+
+	std::cout << __func__ << ": Device's component thread is "
+		<< device->componentThread->name << std::endl;
+
+#ifndef CONFIG_WORLD_USE_BODY_THREAD
+	if (smoHooksPtr->ComponentThread_getSelf()->id != ComponentThread::WORLD)
+#else
+	if (smoHooksPtr->ComponentThread_getSelf()->id != ComponentThread::BODY)
+#endif
+	{
+		std::string errMsg = std::string(__func__) +
+			": PcloudStimulusProducer constructor called on non-world/body thread " +
+			smoHooksPtr->ComponentThread_getSelf()->name;
+
+		std::cout << errMsg << std::endl;
+	//		throw std::runtime_error(errMsg);
+	}
+}
+
+void PcloudStimulusProducer::start()
+{
+	std::cout << __func__ << ": Starting PcloudStimulusProducer for device "
+		<< device->discoveredDevice.deviceIdentifier << std::endl;
+
+	// Call ioUringAssemblyEngine setup() as the first step
+	if (!ioUringAssemblyEngine.setup())
+	{
+		std::cout <<__func__ <<"Failed to setup() "
+			<<"IOUringAssemblyEngine.\n";
+		return;
+	}
+
+	if (!openClCollatingAndMeshingEngine.setup())
+	{
+		std::cout <<__func__ <<"Failed to setup() "
+			<<"OClCollMeshEngine.\n";
+		return;
+	}
+
+	// Call base class start() as the final step
+	StimulusProducer::start();
+}
+
+void PcloudStimulusProducer::stop()
+{
+	std::cout << __func__ << ": Stopping PcloudStimulusProducer for device "
+		<< device->discoveredDevice.deviceIdentifier << std::endl;
+
+	// Call base class stop() as the first step
+	StimulusProducer::stop();
+	// Call ioUringAssemblyEngine stop() as the final step
+	openClCollatingAndMeshingEngine.finalize();
+	ioUringAssemblyEngine.finalize();
+}
+
+void produceStimFrameAck(void)
+{
+}
+
+// Helper function to parse histbuffMs from device attachment spec
+static int parseHistbuffMs(
+	const std::shared_ptr<device::DeviceAttachmentSpec>& spec)
+{
+	const std::vector<std::string> histbuffParamNames = {
+		"history-buffer-duration-ms",
+		"hist-buff-duration-ms",
+		"histbuff-duration-ms",
+		"histbuff-ms"
+	};
+
+	return device::DeviceAttachmentSpec::parseOptionalParamAsIntWithSynonyms(
+		spec->qualeIfaceApiParams, histbuffParamNames, 30000);
+}
+
+std::shared_ptr<StimulusBuffer>
+PcloudStimulusProducer::getAttachedStimulusBuffer(
+	const std::shared_ptr<device::DeviceAttachmentSpec>& spec) const
+{
+	// Call base class implementation
+	auto buffer = StimulusProducer::getAttachedStimulusBuffer(spec);
+	if (!buffer)
+	{
+		return nullptr;
+	}
+
+	// Optionally validate/upcast the buffer type matches expected type
+	// based on qualeIfaceApi (for type safety)
+	const std::string& qualeIfaceApi = spec->qualeIfaceApi;
+	if (qualeIfaceApi == "mesh")
+	{
+		if (std::dynamic_pointer_cast<MeshStimulusBuffer>(buffer))
+			{ return buffer; }
+	}
+	else if (qualeIfaceApi == "pcloudIntensity")
+	{
+		if (std::dynamic_pointer_cast<PcloudIntensityStimulusBuffer>(buffer))
+			{ return buffer; }
+	}
+	else if (qualeIfaceApi == "pcloudAmbience")
+	{
+		if (std::dynamic_pointer_cast<PcloudAmbienceStimulusBuffer>(buffer))
+			{ return buffer; }
+	}
+
+	// Type mismatch - return nullptr
+	return nullptr;
+}
+
+void PcloudStimulusProducer::destroyAttachedStimulusBuffer(
+	const std::shared_ptr<StimulusBuffer>& buffer)
+{
+	if (!buffer) { return; }
+
+	this->stop();
+
+	// Clear specialized buffer pointers if they match
+	if (meshStimulusBuffer == buffer)
+		{ meshStimulusBuffer.reset(); }
+	if (intensityStimulusBuffer == buffer)
+		{ intensityStimulusBuffer.reset(); }
+	if (ambienceStimulusBuffer == buffer)
+		{ ambienceStimulusBuffer.reset(); }
+
+	// Call base class implementation to remove from attachedStimulusBuffers
+	StimulusProducer::destroyAttachedStimulusBuffer(buffer);
+
+	this->start();
+}
+
+std::shared_ptr<StimulusBuffer>
+PcloudStimulusProducer::getOrCreateAttachedStimulusBuffer(
+	const std::shared_ptr<device::DeviceAttachmentSpec>& deviceAttachmentSpec
+	)
+{
+	// Check if buffer already exists (idempotent)
+	auto existingBuffer = getAttachedStimulusBuffer(deviceAttachmentSpec);
+	if (existingBuffer)
+		{ return existingBuffer; }
+
+	// Parse histbuffMs from device attachment spec
+	int histbuffMs = parseHistbuffMs(deviceAttachmentSpec);
+
+	// Parse qualeIfaceApi to determine buffer type
+	const std::string& qualeIfaceApi = deviceAttachmentSpec->qualeIfaceApi;
+
+	// Calculate nPointsPerDgram based on return mode
+	size_t nPointsPerDgram = livoxProto1::Device::getNPointsPerDgram(
+		static_cast<int>(device->currentReturnMode));
+
+	if (qualeIfaceApi == "mesh")
+	{
+		/* Calculate slotStrideNBytes:
+		 * nDgramsPerStagingBufferFrame * nPointsPerDgram * sizeof(float) * 3
+		 */
+		size_t slotStrideNBytes = this->nDgramsPerStagingBufferFrame
+			* nPointsPerDgram * sizeof(float) * 3;
+		// Reuse openClMeshInputConstraints, only modify slotPadToNBytes
+		openClMeshInputConstraints.slotPadToNBytes = slotStrideNBytes;
+
+std::cout << __func__ << ": $$$$$$$ Creating MeshStimulusBuffer" << std::endl;
+		auto meshBuffer = std::make_shared<MeshStimulusBuffer>(
+			*this, deviceAttachmentSpec, histbuffMs,
+			openClMeshInputConstraints, openClMeshInputConstraints,
+			*smoHooksPtr, CL_MEM_READ_WRITE);
+
+std::cout << __func__ << ": $$$$$$$ Created MeshStimulusBuffer" << std::endl;
+		this->stop();
+		meshStimulusBuffer = meshBuffer;
+		attachedStimulusBuffers.push_back(meshBuffer);
+		this->start();
+		return meshBuffer;
+	}
+	else if (qualeIfaceApi == "pcloudIntensity")
+	{
+		/* Calculate slotStrideNBytes:
+		 * nDgramsPerStagingBufferFrame * nPointsPerDgram * sizeof(float) * 1
+		 */
+		size_t slotStrideNBytes = this->nDgramsPerStagingBufferFrame
+			* nPointsPerDgram * sizeof(float) * 1;
+		// Reuse openClIntensityInputConstraints, only modify slotPadToNBytes
+		openClIntensityInputConstraints.slotPadToNBytes = slotStrideNBytes;
+
+std::cout << __func__ << ": $$$$$$$ Creating PcloudIntensityStimulusBuffer" << std::endl;
+		auto intensityBuffer = std::make_shared<PcloudIntensityStimulusBuffer>(
+			*this, deviceAttachmentSpec, histbuffMs,
+			openClIntensityInputConstraints, openClIntensityInputConstraints,
+			*smoHooksPtr, CL_MEM_READ_WRITE);
+
+std::cout << __func__ << ": $$$$$$$ Created PcloudIntensityStimulusBuffer" << std::endl;
+		this->stop();
+		intensityStimulusBuffer = intensityBuffer;
+		attachedStimulusBuffers.push_back(intensityBuffer);
+		this->start();
+		return intensityBuffer;
+	}
+	else if (qualeIfaceApi == "pcloudAmbience")
+	{
+		// Parse ambienceHighVal from stimBuffApiParams
+		const std::vector<std::string> ambienceHighValParamNames = {
+			"high-value",
+			"high-val"
+		};
+		int ambienceHighValInt = device::DeviceAttachmentSpec
+			::parseOptionalParamAsIntWithSynonyms(
+				deviceAttachmentSpec->stimBuffApiParams,
+				ambienceHighValParamNames, 116);
+		uint32_t ambienceHighVal = static_cast<uint32_t>(ambienceHighValInt);
+
+		/* Calculate slotStrideNBytes:
+		 * nDgramsPerStagingBufferFrame * sizeof(uint32_t)
+		 */
+		size_t slotStrideNBytes = this->nDgramsPerStagingBufferFrame
+			* sizeof(uint32_t);
+		// Reuse openClAmbienceInputConstraints, only modify slotPadToNBytes
+		openClAmbienceInputConstraints.slotPadToNBytes = slotStrideNBytes;
+
+		auto ambienceBuffer = std::make_shared<PcloudAmbienceStimulusBuffer>(
+			*this, deviceAttachmentSpec, histbuffMs,
+			openClAmbienceInputConstraints, openClAmbienceInputConstraints,
+			*smoHooksPtr, CL_MEM_READ_WRITE, ambienceHighVal);
+
+std::cout << __func__ << ": $$$$$$$ Created PcloudAmbienceStimulusBuffer" << std::endl;
+		this->stop();
+		ambienceStimulusBuffer = ambienceBuffer;
+		attachedStimulusBuffers.push_back(ambienceBuffer);
+		this->start();
+		return ambienceBuffer;
+	}
+	else
+	{
+		throw std::runtime_error(
+			"Unsupported qualeIfaceApi: '" + qualeIfaceApi + "' for "
+			"PcloudStimulusProducer. "
+			"Supported values: mesh, pcloudIntensity, pcloudAmbience");
+	}
+}
+
+void PcloudStimulusProducer::stimFrameProductionTimesliceInd()
+{
+	produceFrameReq({nullptr, nullptr});
+}
+
+class PcloudStimulusProducer::ProduceFrameReq
+:	public PostedAsynchronousContinuation<produceFrameReqCbFn>
+{
+private:
+	PcloudStimulusProducer& pcloudProducer;
+	AsynchronousLoop frameAssemblyResult;
+	StimulusFrame& stimulusFrame;
+	std::optional<std::reference_wrapper<StimulusFrame>> intensityStimFrame;
+	std::optional<std::reference_wrapper<StimulusFrame>> ambienceStimFrame;
+
+public:
+	ProduceFrameReq(
+		PcloudStimulusProducer& producer,
+		const std::shared_ptr<ComponentThread>& caller,
+		Callback<produceFrameReqCbFn> cb)
+	:	PostedAsynchronousContinuation<produceFrameReqCbFn>(caller, cb),
+	pcloudProducer(producer),
+	frameAssemblyResult(0),
+	stimulusFrame(producer.tempStimulusFrame)
+	{}
+
+public:
+	void callOriginalCallback()
+	{
+		pcloudProducer.allowNextStimulusFrame();
+		callOriginalCb();
+	}
+
+public:
+	void produceFrameReq1_doAssemble_posted(
+		std::shared_ptr<ProduceFrameReq> context)
+	{
+		SpinLock::Guard lock(pcloudProducer.shouldContinueLock);
+		if (!pcloudProducer.shouldContinue)
+		{
+			callOriginalCallback();
+			return;
+		}
+
+		pcloudProducer.ioUringAssemblyEngine.assembleFrameReq(
+			{context, std::bind(
+				&ProduceFrameReq::produceFrameReq2_assembleDone,
+				context.get(), context,
+				std::placeholders::_1, std::placeholders::_2)});
+	}
+
+	void produceFrameReq2_assembleDone(
+		std::shared_ptr<ProduceFrameReq> context,
+		bool success, AsynchronousLoop loop)
+	{
+		SpinLock::Guard lock(pcloudProducer.shouldContinueLock);
+		if (!pcloudProducer.shouldContinue)
+		{
+			callOriginalCallback();
+			return;
+		}
+
+		if (!success)
+		{
+			std::cerr << __func__ << ": Failed to assemble frame" << std::endl;
+			callOriginalCallback();
+			return;
+		}
+
+		context->frameAssemblyResult = loop;
+
+		// Check if intensity buffer is attached and acquire frame if so
+		if (pcloudProducer.intensityStimulusBuffer)
+		{
+			size_t intensityRingbuffIndex = pcloudProducer
+				.intensityStimulusBuffer->ringBuffer.getIndexToProduceInto();
+
+			StimulusFrame& intensityStimFrame = pcloudProducer
+				.intensityStimulusBuffer->ringBuffer.getDataAtSlot(
+					intensityRingbuffIndex);
+
+			intensityStimFrame.lock.writeAcquire();
+			context->intensityStimFrame = std::make_optional(
+				std::ref(intensityStimFrame));
+		}
+		else {
+			context->intensityStimFrame = std::nullopt;
+		}
+
+		// Check if ambience buffer is attached and acquire frame if so
+		if (pcloudProducer.ambienceStimulusBuffer)
+		{
+			size_t ambienceRingbuffIndex = pcloudProducer
+				.ambienceStimulusBuffer->ringBuffer.getIndexToProduceInto();
+
+			StimulusFrame& ambienceStimFrame = pcloudProducer
+				.ambienceStimulusBuffer->ringBuffer.getDataAtSlot(
+					ambienceRingbuffIndex);
+
+			ambienceStimFrame.lock.writeAcquire();
+			context->ambienceStimFrame = std::make_optional(
+				std::ref(ambienceStimFrame));
+		}
+		else {
+			context->ambienceStimFrame = std::nullopt;
+		}
+
+		pcloudProducer.openClCollatingAndMeshingEngine.compactCollateAndMeshFrameReq(
+			loop, stimulusFrame,
+			context->intensityStimFrame, context->ambienceStimFrame,
+			{context, std::bind(
+				&ProduceFrameReq::produceFrameReq3_compactCollateDone,
+				context.get(), context,
+				std::placeholders::_1, std::placeholders::_2)});
+	}
+
+	void produceFrameReq3_compactCollateDone(
+		[[maybe_unused]] std::shared_ptr<ProduceFrameReq> context,
+		bool success, StimulusFrame& /*stimulusFrame*/)
+	{
+		// Release intensity frame if it was used
+		if (context->intensityStimFrame.has_value()) {
+			context->intensityStimFrame->get().lock.writeRelease();
+		}
+		// Release ambience frame if it was used
+		if (context->ambienceStimFrame.has_value()) {
+			context->ambienceStimFrame->get().lock.writeRelease();
+		}
+
+		SpinLock::Guard lock(pcloudProducer.shouldContinueLock);
+		if (!pcloudProducer.shouldContinue)
+		{
+			callOriginalCallback();
+			return;
+		}
+
+		if (!success) {
+			std::cerr << __func__ << ": Failed to compact and collate frame" << std::endl;
+		} else
+		{
+			// Print execution durations
+			auto assemblyDuration = pcloudProducer.ioUringAssemblyEngine.getAssemblyDuration();
+			auto compactDuration = pcloudProducer.openClCollatingAndMeshingEngine.getCompactKernelDuration();
+			auto collateDuration = pcloudProducer.openClCollatingAndMeshingEngine.getCollateKernelDuration();
+			std::cout << __func__ << ": Successfully compacted and collated frame: assemblyDuration=" << assemblyDuration.count()
+				<< "ms, compactKernelDuration=" << compactDuration.count()
+				<< "ms, collateKernelDuration=" << collateDuration.count() << "ms" << std::endl;
+		}
+
+		callOriginalCallback();
+	}
+};
+
+void PcloudStimulusProducer::produceFrameReq(
+	smo::Callback<produceFrameReqCbFn> callback)
+{
+	/**	EXPLANATION:
+	 * We shouldn't acquire the StimulusProducer::shouldContinueLock here because
+	 * this function is called from
+	 * StimulusProducer::stimFrameProductionTimesliceInd(), which is already
+	 * holding the lock.
+	 */
+	auto caller = smoHooksPtr->ComponentThread_getSelf();
+	auto request = std::make_shared<ProduceFrameReq>(
+		*this, caller, std::move(callback));
+
+	// Post the doAssemble method to the component thread
+	device->componentThread->getIoService().post(
+		STC(std::bind(
+			&ProduceFrameReq::produceFrameReq1_doAssemble_posted,
+			request.get(), request)));
+}
+
+} // namespace stim_buff
+} // namespace smo
@@ -0,0 +1,105 @@
+#ifndef _LIVOX_GEN1_PCLOUD_STIMULUS_PRODUCER_H
+#define _LIVOX_GEN1_PCLOUD_STIMULUS_PRODUCER_H
+
+#include <functional>
+#include <atomic>
+#include <user/stimulusProducer.h>
+#include <user/stimulusFrame.h>
+#include <livoxProto1/device.h>
+#include <asynchronousContinuation.h>
+#include <callback.h>
+#include <user/stagingBuffer.h>
+#include "ioUringAssemblyEngine.h"
+#include "openClCollatingAndMeshingEngine.h"
+#include "meshStimulusBuffer.h"
+#include "pcloudIntensityStimulusBuffer.h"
+#include "pcloudAmbienceStimulusBuffer.h"
+
+namespace smo {
+namespace stim_buff {
+
+/**
+ * PcloudStimulusProducer is a specialized StimulusProducer for point cloud data.
+ *
+ * This class extends StimulusProducer to handle point cloud-specific stimulus
+ * frames, particularly those generated from LiDAR point cloud data. It
+ * provides additional functionality for managing point cloud frame metadata
+ * and processing.
+ */
+class PcloudStimulusProducer
+:	public StimulusProducer
+{
+public:
+	class PcloudFormatDesc
+	{
+	public:
+		enum class Format
+		{
+			XYZ,
+			XYZI,
+		};
+
+	public:
+		Format format;
+	};
+
+public:
+	explicit PcloudStimulusProducer(
+		const std::shared_ptr<device::DeviceAttachmentSpec> &deviceAttachmentSpec,
+		std::shared_ptr<livoxProto1::Device> &device,
+		const PcloudFormatDesc& formatDesc,
+		size_t nDgramsPerStagingBufferFrame);
+
+	~PcloudStimulusProducer() = default;
+
+	// Non-copyable, movable
+	PcloudStimulusProducer(const PcloudStimulusProducer&) = delete;
+	PcloudStimulusProducer& operator=(const PcloudStimulusProducer&) = delete;
+	PcloudStimulusProducer(PcloudStimulusProducer&&) = default;
+	PcloudStimulusProducer& operator=(PcloudStimulusProducer&&) = default;
+
+	// Control methods
+	void start() override;
+	void stop() override;
+
+	std::shared_ptr<StimulusBuffer> getOrCreateAttachedStimulusBuffer(
+		const std::shared_ptr<device::DeviceAttachmentSpec>
+			&deviceAttachmentSpec) override;
+
+	std::shared_ptr<StimulusBuffer> getAttachedStimulusBuffer(
+		const std::shared_ptr<device::DeviceAttachmentSpec>& spec)
+		const override;
+
+	void destroyAttachedStimulusBuffer(
+		const std::shared_ptr<StimulusBuffer>& buffer) override;
+
+protected:
+	void stimFrameProductionTimesliceInd() override;
+
+	// Callback function type for produceFrameReq
+	typedef std::function<void()> produceFrameReqCbFn;
+
+public:
+	void produceFrameReq(smo::Callback<produceFrameReqCbFn> callback);
+
+	size_t nDgramsPerStagingBufferFrame;
+	std::shared_ptr<livoxProto1::Device> device;
+	PcloudFormatDesc formatDesc;
+	OpenClCollatingAndMeshingEngine openClCollatingAndMeshingEngine;
+	StagingBuffer assemblyBuffer;
+	IoUringAssemblyEngine ioUringAssemblyEngine;
+	StagingBuffer collationBuffer;
+	size_t tempStimulusFrameMem;
+	StimulusFrame tempStimulusFrame;
+	std::shared_ptr<MeshStimulusBuffer> meshStimulusBuffer;
+	std::shared_ptr<PcloudIntensityStimulusBuffer> intensityStimulusBuffer;
+	std::shared_ptr<PcloudAmbienceStimulusBuffer> ambienceStimulusBuffer;
+
+private:
+	class ProduceFrameReq;
+};
+
+} // namespace stim_buff
+} // namespace smo
+
+#endif // _LIVOX_GEN1_PCLOUD_STIMULUS_PRODUCER_H
@@ -0,0 +1,126 @@
+// Debug macro: define DEBUG_SLOT_COMPACTOR to enable printf statements
+#ifdef DEBUG_SLOT_COMPACTOR
+#define DBG_PRINTF(...) printf(__VA_ARGS__)
+#else
+#define DBG_PRINTF(...)
+#endif
+
+__kernel void slotCompactor(
+	__global uchar* assembly,
+	uint numSlots,
+	uint slotStride,
+	uint slotSize,
+	uint nSucceeded)
+{
+	// Sequential processing: single work item processes all slots
+	// Compact non-dummy slots to the beginning (lowest addresses)
+	// Dummy slots will remain at the end (highest addresses)
+	// Optimizations:
+	// 1. For each dummy, find rightmost non-dummy and copy it there
+	// 2. Exit early once we've seen nSucceeded non-dummy slots
+	// 3. Exit early once we've moved nFailed dummy slots
+
+	DBG_PRINTF("slotCompactor: KERNEL STARTED\n");
+	DBG_PRINTF("slotCompactor: numSlots=%u, slotStride=%u, slotSize=%u, nSucceeded=%u\n",
+		numSlots, slotStride, slotSize, nSucceeded);
+
+	uint nFailed = numSlots - nSucceeded;  // Calculate number of failed slots
+	uint nonDummiesSeen = 0;  // Track how many non-dummy slots we've seen
+	uint dummiesMoved = 0;     // Track how many dummy slots we've moved
+
+	DBG_PRINTF("slotCompactor: nFailed=%u\n", nFailed);
+
+	// Initialize rightmostNonDummy to start from the end
+	// We'll decrement it each time we use it to avoid re-selecting the same slot
+	uint rightmostNonDummy = numSlots - 1;
+
+	// Process slots from beginning to end
+	DBG_PRINTF("slotCompactor: Starting loop, numSlots=%u\n", numSlots);
+	for (uint i = 0; i < numSlots; ++i)
+	{
+		// Optimization 2: Exit early once we've seen nSucceeded non-dummy slots
+		if (nonDummiesSeen >= nSucceeded) {
+			DBG_PRINTF("slotCompactor: Early exit at i=%u, nonDummiesSeen=%u >= nSucceeded=%u\n",
+				i, nonDummiesSeen, nSucceeded);
+			break;
+		}
+		// Optimization 3: Exit early once we've moved nFailed dummy slots
+		if (dummiesMoved >= nFailed) {
+			DBG_PRINTF("slotCompactor: Early exit at i=%u, dummiesMoved=%u >= nFailed=%u\n",
+				i, dummiesMoved, nFailed);
+			break;
+		}
+
+		// Calculate slot address
+		__global uchar* slotAddr = assembly + (i * slotStride);
+
+		// Check if slot is dummy: first 4 bytes should all be 0xFF
+		bool isDummy = (slotAddr[0] == 0xFF) && (slotAddr[1] == 0xFF)
+			&& (slotAddr[2] == 0xFF) && (slotAddr[3] == 0xFF);
+
+		if (i < 5 || i == numSlots - 1) {
+			DBG_PRINTF("slotCompactor: i=%u, slot[0-3]=0x%02X%02X%02X%02X, isDummy=%d\n",
+				i, slotAddr[0], slotAddr[1], slotAddr[2], slotAddr[3], isDummy ? 1 : 0);
+		}
+
+		// Early continue for non-dummy slots (already in the right place)
+		if (!isDummy)
+		{
+			++nonDummiesSeen;
+			continue;
+		}
+
+		// Optimization 1: Find rightmost non-dummy slot starting from where we left off
+		// Search backwards from rightmostNonDummy until we find a non-dummy slot
+		// or reach the current position
+		bool foundNonDummy = false;
+		for (int j = (int)rightmostNonDummy; j > (int)i; --j)
+		{
+			__global uchar* checkSlotAddr = assembly + (j * slotStride);
+
+			bool checkIsDummy = (checkSlotAddr[0] == 0xFF)
+				&& (checkSlotAddr[1] == 0xFF) && (checkSlotAddr[2] == 0xFF)
+				&& (checkSlotAddr[3] == 0xFF);
+
+			// Early continue for dummy slots
+			if (checkIsDummy)
+				{ continue; }
+
+			// Found a non-dummy slot
+			rightmostNonDummy = (uint)j;
+			foundNonDummy = true;
+			break;
+		}
+
+		// If we found a non-dummy slot to the right, copy it here
+		if (foundNonDummy)
+		{
+			DBG_PRINTF("slotCompactor: Moving slot from %u to %u\n", rightmostNonDummy, i);
+			__global uchar* srcAddr = assembly
+				+ (rightmostNonDummy * slotStride);
+
+			// Copy slot data (byte-by-byte copy)
+			for (uint j = 0; j < slotSize; ++j) {
+				slotAddr[j] = srcAddr[j];
+			}
+
+			// Mark the source slot as dummy (move it to the end)
+			for (uint j = 0; j < 4; ++j) {
+				srcAddr[j] = 0xFF;
+			}
+
+			// Decrement rightmostNonDummy to avoid re-selecting the same slot
+			--rightmostNonDummy;
+
+			++dummiesMoved;
+			++nonDummiesSeen;  // We just moved a non-dummy to this position
+		} else {
+			if (i < 5) {
+				DBG_PRINTF("slotCompactor: i=%u, no non-dummy found to move\n", i);
+			}
+		}
+	}
+	DBG_PRINTF("slotCompactor: Loop complete, nonDummiesSeen=%u, dummiesMoved=%u\n",
+		nonDummiesSeen, dummiesMoved);
+	DBG_PRINTF("slotCompactor: KERNEL FINISHED\n");
+}
@@ -1,16 +0,0 @@
-#include "stagingBuffer.h"
-
-namespace smo {
-namespace stim_buff {
-
-// Static defaults for io_uring
-const StagingBuffer::InputEngineConstraints
-StagingBuffer::InputEngineConstraints::ioUringConstraints(
-	4096,  // slotStartAlignmentByteVal (page alignment for DMA)
-	1472   // slotPadToNBytes (MTU 1500 - UDP/IP header 28)
-);
-
-} // namespace stim_buff
-} // namespace smo
-
-
@@ -14,3 +14,19 @@ add_dependencies(qutex_tests gtest_main)

 # Add the test to CTest
 add_test(NAME qutex_tests COMMAND qutex_tests)
+
+# Create a test executable for StagingBuffer
+add_executable(stagingBuffer_tests commonLibs/attachmentSupport/stagingBuffer_tests.cpp)
+
+# Link against Google Test and the attachmentSupport library
+target_link_libraries(stagingBuffer_tests
+    gtest_main
+    attachmentSupport
+    ${Boost_LIBRARIES}
+)
+
+# Ensure Google Test is built before our test executable
+add_dependencies(stagingBuffer_tests gtest_main)
+
+# Add the test to CTest
+add_test(NAME stagingBuffer_tests COMMAND stagingBuffer_tests)
@@ -0,0 +1,276 @@
+#include <gtest/gtest.h>
+#include <user/stagingBuffer.h>
+#include <user/frameAssemblyDesc.h>
+#include <unistd.h>
+#include <cstdint>
+#include <algorithm>
+#include <string>
+
+namespace smo {
+namespace stim_buff {
+
+// Helper function to create test constraints
+static StagingBuffer::IOEngineConstraints createTestConstraints(
+	size_t slotStartAlignment,
+	size_t slotPadToNBytes,
+	size_t frameStartAlignment,
+	size_t framePadToNBytes)
+{
+	return StagingBuffer::IOEngineConstraints(
+		slotStartAlignment,
+		slotPadToNBytes,
+		frameStartAlignment,
+		framePadToNBytes);
+}
+
+// Helper function to verify alignment
+static void verifyAlignment(
+	uint8_t* addr,
+	size_t alignment,
+	const std::string& description)
+{
+	uintptr_t addrValue = reinterpret_cast<uintptr_t>(addr);
+	EXPECT_EQ(addrValue % alignment, 0u)
+		<< description << ": Address " << (void*)addr
+		<< " is not aligned to " << alignment;
+}
+
+class StagingBufferTest : public ::testing::Test {
+protected:
+	void SetUp() override {
+		pageSize = static_cast<size_t>(sysconf(_SC_PAGE_SIZE));
+	}
+
+	void TearDown() override {
+	}
+
+	size_t pageSize;
+};
+
+// Helper function to verify all slots are page-aligned and have correct stride
+static void verifyAllSlotsPageAligned(
+	const StagingBuffer& buffer,
+	size_t expectedSlotStride,
+	size_t pageSize)
+{
+	auto frameDesc = static_cast<std::shared_ptr<FrameAssemblyDesc>>(buffer);
+	ASSERT_NE(frameDesc, nullptr);
+	ASSERT_GT(frameDesc->slots.size(), 0u);
+
+	// Verify slot stride is a multiple of page size
+	EXPECT_EQ(expectedSlotStride % pageSize, 0u)
+		<< "Slot stride " << expectedSlotStride
+		<< " should be a multiple of page size " << pageSize;
+
+	// Verify first slot is page-aligned
+	verifyAlignment(frameDesc->slots[0].vaddr, pageSize, "First slot");
+
+	// Verify all subsequent slots are page-aligned and have correct stride
+	for (size_t i = 1; i < frameDesc->slots.size(); ++i) {
+		verifyAlignment(frameDesc->slots[i].vaddr, pageSize,
+			"Slot " + std::to_string(i));
+
+		// Verify actual stride matches expected stride
+		size_t actualStride = reinterpret_cast<uintptr_t>(frameDesc->slots[i].vaddr) -
+			reinterpret_cast<uintptr_t>(frameDesc->slots[i-1].vaddr);
+		EXPECT_EQ(actualStride, expectedSlotStride)
+			<< "Slot " << i << " stride mismatch: expected " << expectedSlotStride
+			<< ", got " << actualStride;
+	}
+}
+
+// Test 1: Small slot stride (< page size) - should round up to page size
+TEST_F(StagingBufferTest, SmallSlotStrideRoundsUpToPageSize) {
+	size_t nSlots = 10;
+	size_t smallSlotPad = 256; // Much smaller than typical page size (4096)
+	auto constraints = createTestConstraints(
+		pageSize,                // slotStartAlignment (page size)
+		smallSlotPad,            // slotPadToNBytes (small)
+		pageSize,                // frameStartAlignment
+		pageSize);               // framePadToNBytes
+
+	StagingBuffer buffer(constraints, constraints, nSlots);
+
+	// Slot stride should be rounded up to page size
+	size_t expectedSlotStride = pageSize;
+	EXPECT_EQ(buffer.slotStrideNBytes, expectedSlotStride)
+		<< "Small slot pad should round up to page size";
+
+	verifyAllSlotsPageAligned(buffer, expectedSlotStride, pageSize);
+}
+
+// Test 2: Slot stride equal to page size - should remain page size
+TEST_F(StagingBufferTest, SlotStrideEqualToPageSize) {
+	size_t nSlots = 20;
+	auto constraints = createTestConstraints(
+		pageSize,                // slotStartAlignment
+		pageSize,                // slotPadToNBytes (equal to page size)
+		pageSize,                // frameStartAlignment
+		pageSize);               // framePadToNBytes
+
+	StagingBuffer buffer(constraints, constraints, nSlots);
+
+	size_t expectedSlotStride = pageSize;
+	EXPECT_EQ(buffer.slotStrideNBytes, expectedSlotStride)
+		<< "Slot stride equal to page size should remain unchanged";
+
+	verifyAllSlotsPageAligned(buffer, expectedSlotStride, pageSize);
+}
+
+// Test 3: Slot stride slightly larger than page size (e.g., 336) - should round up
+TEST_F(StagingBufferTest, SlotStrideSlightlyLargerThanPageSize) {
+	size_t nSlots = 50;
+	size_t slotPadToNBytes = 336; // Slightly larger than page size (4096)
+	auto constraints = createTestConstraints(
+		pageSize,                // slotStartAlignment
+		slotPadToNBytes,         // slotPadToNBytes
+		pageSize,                // frameStartAlignment
+		pageSize);               // framePadToNBytes
+
+	StagingBuffer buffer(constraints, constraints, nSlots);
+
+	// Should round up to next multiple of page size (4096)
+	size_t expectedSlotStride = pageSize;
+	EXPECT_EQ(buffer.slotStrideNBytes, expectedSlotStride)
+		<< "Slot stride 336 should round up to page size " << pageSize;
+
+	verifyAllSlotsPageAligned(buffer, expectedSlotStride, pageSize);
+}
+
+// Test 4: Slot stride much larger than page size (e.g., 32256) - should round up to 32768
+TEST_F(StagingBufferTest, SlotStrideMuchLargerThanPageSize) {
+	size_t nSlots = 100;
+	size_t slotPadToNBytes = 32256; // Much larger than page size
+	auto constraints = createTestConstraints(
+		pageSize,                // slotStartAlignment
+		slotPadToNBytes,         // slotPadToNBytes
+		pageSize,                // frameStartAlignment
+		pageSize);               // framePadToNBytes
+
+	StagingBuffer buffer(constraints, constraints, nSlots);
+
+	// Should round up to next multiple of page size
+	// 32256 / 4096 = 7.875, so rounds up to 8 pages = 32768
+	size_t expectedSlotStride = ((slotPadToNBytes + pageSize - 1) / pageSize) * pageSize;
+	EXPECT_EQ(expectedSlotStride, 32768u)
+		<< "32256 should round up to 32768 (8 pages)";
+	EXPECT_EQ(buffer.slotStrideNBytes, expectedSlotStride)
+		<< "Slot stride should be rounded up to " << expectedSlotStride;
+
+	verifyAllSlotsPageAligned(buffer, expectedSlotStride, pageSize);
+}
+
+// Test 5: Slot stride already a multiple of page size - should remain unchanged
+TEST_F(StagingBufferTest, SlotStrideAlreadyMultipleOfPageSize) {
+	size_t nSlots = 30;
+	size_t slotPadToNBytes = pageSize * 3; // Already a multiple (e.g., 12288)
+	auto constraints = createTestConstraints(
+		pageSize,                // slotStartAlignment
+		slotPadToNBytes,         // slotPadToNBytes (already multiple)
+		pageSize,                // frameStartAlignment
+		pageSize);               // framePadToNBytes
+
+	StagingBuffer buffer(constraints, constraints, nSlots);
+
+	// Should remain unchanged
+	size_t expectedSlotStride = slotPadToNBytes;
+	EXPECT_EQ(buffer.slotStrideNBytes, expectedSlotStride)
+		<< "Slot stride already a multiple of page size should remain unchanged";
+
+	verifyAllSlotsPageAligned(buffer, expectedSlotStride, pageSize);
+}
+
+// Test 6: Multiple different slot stride sizes in sequence
+TEST_F(StagingBufferTest, MultipleDifferentSlotStrideSizes) {
+	// Test with various slot pad sizes
+	struct TestCase {
+		size_t slotPadToNBytes;
+		size_t expectedRoundedStride;
+	};
+
+	std::vector<TestCase> testCases = {
+		{256, pageSize},                    // Small: rounds to 1 page
+		{pageSize, pageSize},               // Equal: stays 1 page
+		{pageSize + 1, pageSize * 2},      // Slightly larger: rounds to 2 pages
+		{32256, 32768},                     // Much larger: rounds to 8 pages
+		{pageSize * 5, pageSize * 5},       // Already multiple: stays 5 pages
+		{pageSize * 10 + 100, pageSize * 11}, // Large with remainder: rounds to 11 pages
+	};
+
+	for (const auto& testCase : testCases) {
+		size_t nSlots = 10;
+		auto constraints = createTestConstraints(
+			pageSize,
+			testCase.slotPadToNBytes,
+			pageSize,
+			pageSize);
+
+		StagingBuffer buffer(constraints, constraints, nSlots);
+
+		EXPECT_EQ(buffer.slotStrideNBytes, testCase.expectedRoundedStride)
+			<< "Slot pad " << testCase.slotPadToNBytes
+			<< " should round to " << testCase.expectedRoundedStride;
+
+		verifyAllSlotsPageAligned(buffer, testCase.expectedRoundedStride, pageSize);
+	}
+}
+
+// Test 7: Real-world scenario - PcloudIntensityStimulusBuffer (32256 bytes)
+TEST_F(StagingBufferTest, RealWorldPcloudIntensityScenario) {
+	size_t nSlots = 909; // histbuffMs=30000 / CONFIG_STIMBUFF_FRAME_PERIOD_MS=33
+	size_t nDgramsPerFrame = 84;
+	size_t nPointsPerDgram = 96;
+	size_t slotPadToNBytes = nDgramsPerFrame * nPointsPerDgram * sizeof(float) * 1; // 32256
+
+	auto constraints = createTestConstraints(
+		pageSize,                // slotStartAlignment
+		slotPadToNBytes,         // slotPadToNBytes
+		pageSize,                // frameStartAlignment
+		pageSize);               // framePadToNBytes
+
+	StagingBuffer buffer(constraints, constraints, nSlots);
+
+	// Should round up to 32768 (8 pages)
+	size_t expectedSlotStride = 32768;
+	EXPECT_EQ(buffer.slotStrideNBytes, expectedSlotStride)
+		<< "Real-world 32256-byte slots should round to 32768";
+
+	verifyAllSlotsPageAligned(buffer, expectedSlotStride, pageSize);
+
+	// Verify FrameAssemblyDesc
+	auto frameDesc = static_cast<std::shared_ptr<FrameAssemblyDesc>>(buffer);
+	ASSERT_NE(frameDesc, nullptr);
+	EXPECT_EQ(frameDesc->numSlots, nSlots);
+	EXPECT_EQ(frameDesc->slotSizeBytes, slotPadToNBytes);
+}
+
+// Test 8: Real-world scenario - PcloudAmbienceStimulusBuffer (336 bytes)
+TEST_F(StagingBufferTest, RealWorldPcloudAmbienceScenario) {
+	size_t nSlots = 909;
+	size_t nDgramsPerFrame = 84;
+	size_t slotPadToNBytes = nDgramsPerFrame * sizeof(float); // 336
+
+	auto constraints = createTestConstraints(
+		pageSize,                // slotStartAlignment
+		slotPadToNBytes,         // slotPadToNBytes
+		pageSize,                // frameStartAlignment
+		pageSize);               // framePadToNBytes
+
+	StagingBuffer buffer(constraints, constraints, nSlots);
+
+	// Should round up to page size
+	size_t expectedSlotStride = pageSize;
+	EXPECT_EQ(buffer.slotStrideNBytes, expectedSlotStride)
+		<< "Real-world 336-byte slots should round to page size";
+
+	verifyAllSlotsPageAligned(buffer, expectedSlotStride, pageSize);
+
+	// Verify FrameAssemblyDesc
+	auto frameDesc = static_cast<std::shared_ptr<FrameAssemblyDesc>>(buffer);
+	ASSERT_NE(frameDesc, nullptr);
+	EXPECT_EQ(frameDesc->numSlots, nSlots);
+	EXPECT_EQ(frameDesc->slotSizeBytes, slotPadToNBytes);
+}
+
+} // namespace stim_buff
+} // namespace smo
@@ -9,14 +9,6 @@
  a ref and not by-value. Propagate this upward into
  SerializedAsyncContin and into all derived classes'
  constructors.
-* In classes like udpCommandDemuxer and possibly other such background tasks,
-  use a spinlock to ensure that the stop() function doesn't deallocate the
-  data to be used by the daemon task while the daemon task is executing.
-  * Alternatively we could re-emqueue the message;
-  * Alternatively, if select/poll don't consume the read-data-rdy flag,
-    we can just return and let the next timer invocation run instead.
-  * Alternatively, we can use an xchg'd flag between the udp listener
-    and the timed enforcer.
 * In livoxProto1/device.cpp, migrate the registerUdpCommandHandler() calls
  from using the inProgress collection to the per-device collections.
 * In cases where we use boost deadline_timers and pass in an async
@@ -25,16 +17,6 @@
  after they expire just in case boost doesn't clean up the internal
  callable that we passed it. Or else we'll have circular sh_ptr
  references in our continuations.
-* Think of a unified mechanism or a design pattern which will ensure that
-  timeouts will always either expire or be canceled before program
-  finalization.
-  Or more generically, before their lifetime ends.
-  * One good mechanism is to use synchronous waits on the timeout
-    events. This is fine during program shutdown because we don't
-    need to care about responsiveness during program shutdown.
-  * We can use asynchronous bridging to ensure that asynchronous
-    events are executed in the meantime while we wait for the sync
-    timeout to complete.
 * UdpCommandDemuxer::registerUdpCommandHandler should accept a pointer
  to the io_context of the thread it should post its callbacks to, and
  then post callbacks to those io_contexts when UDP cmd responses
				`@@ -62,4 +62,3 @@ public:`

				`#endif // _LIVOX_GEN1_FRAME_ASSEMBLY_DESC_H`