NXP Backend: Enable prefetching weights from external memory to SRAM in AoT and runtime

jirioc · jirioc · commit fc0f06beb9de · 2026-02-23T12:33:17.000+01:00
diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py
@@ -72,7 +72,19 @@ def verify_target(self, target: str):
                 f"Target `{target}` is not a valid target. Must be one of `{valid_targets}`."
             )
 
-    def convert(self, tflite_model: bytes, target: str) -> bytes:
+    def convert(
+        self, tflite_model: bytes, target: str, fetch_constants_to_sram: bool = False
+    ) -> bytes:
+        """
+        Call Neutron Converter.
+
+        :param tflite_model: A generic TFLite model to be converted.
+        :param target: The target platform.
+        :param fetch_constants_to_sram: If True, add microcode that prefetches weights from external memory to SRAM.
+            This allows running models which do not fit into SRAM. Applies to Neutron-C only (microcontrollers).
+
+        :return: TFLite model with Neutron microcode as bytes.
+        """
         # Neutron converter crashes if we provide invalid target -> verify.
         self.verify_target(target)
 
@@ -82,6 +94,7 @@ def convert(self, tflite_model: bytes, target: str) -> bytes:
         cctx.compilationOpts.excludeGraphPasses = (
             "HoistSliceAboveTranspose,MergeTranspose"
         )
+        cctx.compilationOpts.fetchConstantsToSRAM = fetch_constants_to_sram
 
         # Try to use multiprocessing for isolation, but fall back to direct execution
         # if the environment doesn't support it (e.g., in sandcastle/build environments)
diff --git a/backends/nxp/nxp_backend.py b/backends/nxp/nxp_backend.py
@@ -46,6 +46,7 @@ def __init__(self):
         self.operators_not_to_delegate: List[str] = []
         self.neutron_converter_flavor = None
         self.use_neutron_for_format_conversion = True
+        self.fetch_constants_to_sram = False
 
     def _replace_colons(self, operator: str) -> str:
         """
@@ -60,6 +61,7 @@ def neutron_compile_spec(
         extra_flags: Optional[str] = None,
         operators_not_to_delegate: Optional[List[str]] = None,
         use_neutron_for_format_conversion: bool = True,
+        fetch_constants_to_sram: bool = False,
     ):
         """
         Generate compile spec for Neutron NPU
@@ -73,6 +75,8 @@ def neutron_compile_spec(
             use_neutron_for_format_conversion: If True, the EdgeProgramToIRConverter will insert `Transpose` ops to
                                                 ensure that the IO matches the executorch partition, which will be
                                                 delegated to Neutron.
+            fetch_constants_to_sram: If True, the Neutron Converter will insert microinstructions to prefetch weights
+                                     from FLASH to SRAM. This should be used when the whole model does not fit into SRAM.
         """
 
         self.neutron_converter_flavor = neutron_converter_flavor
@@ -94,6 +98,8 @@ def neutron_compile_spec(
 
         self.use_neutron_for_format_conversion = use_neutron_for_format_conversion
 
+        self.fetch_constants_to_sram = fetch_constants_to_sram
+
         return self
 
     def build(self):
@@ -116,6 +122,10 @@ def build(self):
                     "use_neutron_for_format_conversion",
                     f"{self.use_neutron_for_format_conversion}".encode(),
                 ),
+                CompileSpec(
+                    "fetch_constants_to_sram",
+                    f"{self.fetch_constants_to_sram}".encode(),
+                ),
             ]
 
         return self.compile_spec
@@ -128,6 +138,7 @@ def generate_neutron_compile_spec(
     extra_flags: Optional[str] = None,
     operators_not_to_delegate: Optional[List[str]] = None,
     use_neutron_for_format_conversion: bool = True,
+    fetch_constants_to_sram: bool = False,
 ) -> List[CompileSpec]:
     return (
         NeutronCompileSpecBuilder()
@@ -137,6 +148,7 @@ def generate_neutron_compile_spec(
             extra_flags=extra_flags,
             operators_not_to_delegate=operators_not_to_delegate,
             use_neutron_for_format_conversion=use_neutron_for_format_conversion,
+            fetch_constants_to_sram=fetch_constants_to_sram,
         )
         .build()
     )
@@ -160,6 +172,7 @@ def preprocess(  # noqa C901
         target = ""
         neutron_converter_flavor = ""
         use_neutron_for_format_conversion = None
+        fetch_constants_to_sram = False
         for spec in compile_spec:
             if spec.key == "output_format":
                 output_format = spec.value.decode()
@@ -171,6 +184,8 @@ def preprocess(  # noqa C901
                 neutron_converter_flavor = spec.value.decode()
             if spec.key == "use_neutron_for_format_conversion":
                 use_neutron_for_format_conversion = spec.value.decode() == "True"
+            if spec.key == "fetch_constants_to_sram":
+                fetch_constants_to_sram = spec.value.decode() == "True"
 
         # Check that the output format is set in the compile spec
         if not output_format:
@@ -212,7 +227,7 @@ def preprocess(  # noqa C901
             )
 
             neutron_model = NeutronConverterManager(neutron_converter_flavor).convert(
-                tflite_model, target
+                tflite_model, target, fetch_constants_to_sram
             )
 
             # Dump the tflite file if logging level is enabled
diff --git a/backends/nxp/runtime/NeutronBackend.cpp b/backends/nxp/runtime/NeutronBackend.cpp
@@ -70,6 +70,9 @@ typedef struct {
   int numOutputs = 0;
   int numInputArgs = 0;
   uint32_t scratchSize = 0;
+#ifdef EXTERNAL_MEM
+  uint32_t sramScratchSize = 0;
+#endif
   uint32_t profileSize = 0;
   uint32_t debugSize = 0;
   NeutronModelConfig mcfg;
@@ -79,7 +82,18 @@ typedef struct {
   const uint8_t* outputTranspositionFlags;
   const uint8_t* inputMap;
   const uint8_t* outputMap;
-} NeutronConfig;
+} NeutronExecutorchConfig;
+
+#ifdef EXTERNAL_MEM
+// Neutron compute has no access to FLASH, so the weights of a model converted
+// with --fetch_constants_to_sram are prefetched from FLASH to SRAM through the
+// callbacks below.
+//
+// Both callbacks have internal linkage: they are only referenced through
+// neutronMemCopyConfig, and their generic names should not be visible outside
+// of this translation unit.
+
+// Copies `size` bytes from `src` to `dst` using memcpy. `channel` is ignored.
+static void copy(void* dst, void* src, uint32_t size, uint32_t channel) {
+  (void)channel;
+  memcpy(dst, src, size);
+}
+
+// Nothing to wait for: the memcpy in copy() has completed by the time copy()
+// returns. `channel` is ignored.
+static void wait(uint32_t channel) {
+  (void)channel;
+}
+
+static NeutronConfig neutronMemCopyConfig = {copy, wait};
+#endif
 
 // Applied on outputs.
 template <typename T>
@@ -258,7 +272,7 @@ class NeutronBackend final : public PyTorchBackendInterface {
       ArrayRef<CompileSpec> compile_specs) const override {
     MemoryAllocator* allocator = context.get_runtime_allocator();
 
-    auto* cfg = allocator->allocateInstance<NeutronConfig>();
+    auto* cfg = allocator->allocateInstance<NeutronExecutorchConfig>();
 
     // The following data is read from the "processed" data blob.
     //    cfg->numInputs
@@ -293,15 +307,22 @@ class NeutronBackend final : public PyTorchBackendInterface {
     switch (payloadVersion) {
       case 0:
         cfg->scratchSize = buffer[9];
+#ifdef EXTERNAL_MEM
+        cfg->sramScratchSize = buffer[10];
+#endif
         cfg->profileSize = 0;
         cfg->debugSize = 0;
         cfg->numInputs = buffer[11];
         cfg->numOutputs = buffer[12];
         break;
       case 1:
         cfg->scratchSize = buffer[9];
-        cfg->profileSize = buffer[10];
+        // The highest bit has special meaning in NS >= 2.2.3
+        cfg->profileSize = buffer[10] & 0x7FFFFFFF;
         cfg->debugSize = buffer[11];
+#ifdef EXTERNAL_MEM
+        cfg->sramScratchSize = buffer[12];
+#endif
         cfg->numInputs = buffer[13];
         cfg->numOutputs = buffer[14];
         break;
@@ -351,6 +372,14 @@ class NeutronBackend final : public PyTorchBackendInterface {
       return Error::InvalidProgram;
     }
 
+#ifdef EXTERNAL_MEM
+    neutronRC = neutronSetConfig(&neutronMemCopyConfig);
+    if (neutronRC != ENONE) {
+      ET_LOG(Error, "Neutron set config failed with error code %ld", neutronRC);
+      return Error::InvalidProgram;
+    }
+#endif
+
     return cfg;
   }
 
@@ -365,7 +394,8 @@ class NeutronBackend final : public PyTorchBackendInterface {
       BackendExecutionContext& context,
       DelegateHandle* input_handle,
       Span<EValue*> args) const override {
-    NeutronConfig* cfg = static_cast<NeutronConfig*>(input_handle);
+    NeutronExecutorchConfig* cfg =
+        static_cast<NeutronExecutorchConfig*>(input_handle);
 
     // Allocate place for input and output pointers.
     cfg->dcfg.inputs = static_cast<const void**>(
@@ -381,6 +411,12 @@ class NeutronBackend final : public PyTorchBackendInterface {
     cfg->dcfg.outputs[cfg->numOutputs + 2] =
         static_cast<void*>(context.allocate(cfg->debugSize, 16));
 
+#ifdef EXTERNAL_MEM
+    // Allocate the space in SRAM to prefetch weights from FLASH.
+    cfg->dcfg.scratchWeights =
+        static_cast<void*>(context.allocate(cfg->sramScratchSize, 16));
+#endif
+
     // Set inputs from args.
     // Transpose inputs if needed.
     for (int i = 0; i < cfg->numInputs; i++) {
@@ -527,7 +563,8 @@ class NeutronBackend final : public PyTorchBackendInterface {
   }
 
   void destroy(DelegateHandle* handle) const override {
-    NeutronConfig* cfg = reinterpret_cast<NeutronConfig*>(handle);
+    NeutronExecutorchConfig* cfg =
+        reinterpret_cast<NeutronExecutorchConfig*>(handle);
 
     // Unprepare to free resources in neutron driver.
     NeutronError neutronRC = neutronModelUnprepare(cfg->nmh);
diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py
@@ -99,6 +99,7 @@ def to_quantized_edge_program(
     get_quantizer_fn=None,
     use_neutron_for_format_conversion=True,
     use_quant_state_dict=True,
+    fetch_constants_to_sram=False,
 ) -> EdgeProgramManager:
     _neutron_target_spec = NeutronTargetSpec(target, neutron_converter_flavor)
     if get_quantizer_fn is None:
@@ -128,6 +129,7 @@ def to_quantized_edge_program(
         operators_not_to_delegate=operators_not_to_delegate,
         neutron_converter_flavor=neutron_converter_flavor,
         use_neutron_for_format_conversion=use_neutron_for_format_conversion,
+        fetch_constants_to_sram=fetch_constants_to_sram,
     )
     post_quant_state_dict = (
         exir_program_aten__module_quant.state_dict() if use_quant_state_dict else None
diff --git a/backends/nxp/tests/test_neutron_converter_manager.py b/backends/nxp/tests/test_neutron_converter_manager.py
@@ -1,4 +1,4 @@
-# Copyright 2024-2025 NXP
+# Copyright 2024-2026 NXP
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -14,7 +14,8 @@
     NeutronConverterManager,
 )
 from executorch.backends.nxp.backend.node_format_inference import NodeFormatInference
-from executorch.backends.nxp.tests.models import Conv2dModule
+from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program
+from executorch.backends.nxp.tests.models import Conv2dModule, LinearModule
 
 
 def test_conv2d_neutron_conversion__default_flavor():
@@ -31,7 +32,7 @@ def test_conv2d_neutron_conversion__default_flavor():
     )
 
     neutron_converter_manager = NeutronConverterManager()
-    neutron_model = neutron_converter_manager.convert(tflite_model, "imxrt700")
+    neutron_model = neutron_converter_manager.convert(tflite_model, "imxrt700", False)
 
     assert len(
         neutron_model
@@ -52,8 +53,30 @@ def test__conv2d_neutron_conversion__invalid_flavor():
     )
 
     with pytest.raises(RuntimeError) as excinfo:
-        _ = NeutronConverterManager("bad_flavor").convert(tflite_model, "imxrt700")
+        _ = NeutronConverterManager("bad_flavor").convert(
+            tflite_model, "imxrt700", False
+        )
 
     assert "Neutron Converter module with flavor 'bad_flavor' not found." in str(
         excinfo
     )
+
+
+def test_conv2d_neutron_conversion__prefetching(mocker):
+    """
+    Check that the `fetch_constants_to_sram` flag changes the converted Neutron model.
+
+    The quantized pipeline is run twice on the same model, first with the flag enabled and then with it
+    disabled, and the lengths of the values returned by `NeutronConverterManager.convert` are compared.
+
+    NOTE(review): the test name says `conv2d`, but the model under test is `LinearModule` - confirm the intent.
+
+    :param mocker: Fixture used to spy on `NeutronConverterManager.convert` (presumably pytest-mock's `mocker`).
+    """
+    model = LinearModule(True)
+    input_shape = (1, 1, 32, 32)
+
+    # Spy on `convert` so its return value can be read back after each pipeline run.
+    converter_spy = mocker.spy(NeutronConverterManager, "convert")
+    _ = to_quantized_edge_program(
+        model, input_shape, fetch_constants_to_sram=True
+    ).exported_program()
+    # Read the result now; the second pipeline run below is expected to replace `spy_return`.
+    neutron_model_prefetch = converter_spy.spy_return
+
+    _ = to_quantized_edge_program(
+        model, input_shape, fetch_constants_to_sram=False
+    ).exported_program()
+    neutron_model_regular = converter_spy.spy_return
+
+    # Only the lengths are compared; the contents of the two models are not inspected.
+    assert len(neutron_model_prefetch) != len(
+        neutron_model_regular
+    ), "The weight prefetching flag does not make a difference!"
diff --git a/examples/nxp/aot_neutron_compile.py b/examples/nxp/aot_neutron_compile.py
@@ -229,6 +229,13 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool):
         action="store_true",
         help="The calibration and testing datasets will be generated randomly instead of being downloaded.",
     )
+    parser.add_argument(
+        "--fetch_constants_to_sram",
+        required=False,
+        default=False,
+        action="store_true",
+        help="This feature allows running models which do not fit into SRAM by offloading them to an external memory.",
+    )
 
     args = parser.parse_args()
 
@@ -313,6 +320,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool):
         args.target,
         operators_not_to_delegate=args.operators_not_to_delegate,
         neutron_converter_flavor=args.neutron_converter_flavor,
+        fetch_constants_to_sram=args.fetch_constants_to_sram,
     )
     partitioners = (
         [

Original file line number	Diff line number	Diff line change
`@@ -229,6 +229,13 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool):`
`229`	`229`	`action="store_true",`
`230`	`230`	`help="The calibration and testing datasets will be generated randomly instead of being downloaded.",`
`231`	`231`	`)`
	`232`	`+ parser.add_argument(`
	`233`	`+ "--fetch_constants_to_sram",`
	`234`	`+ required=False,`
	`235`	`+ default=False,`
	`236`	`+ action="store_true",`
	`237`	`+ help="This feature allows running models which do not fit into SRAM by offloading them to an external memory.",`
	`238`	`+ )`
`232`	`239`
`233`	`240`	`args = parser.parse_args()`
`234`	`241`
`@@ -313,6 +320,7 @@ def get_model_and_inputs_from_name(model_name: str, use_random_dataset: bool):`
`313`	`320`	`args.target,`
`314`	`321`	`operators_not_to_delegate=args.operators_not_to_delegate,`
`315`	`322`	`neutron_converter_flavor=args.neutron_converter_flavor,`
	`323`	`+ fetch_constants_to_sram=args.fetch_constants_to_sram,`
`316`	`324`	`)`
`317`	`325`	`partitioners = (`
`318`	`326`	`[`