tests: eager tests

JingyaHuang · JingyaHuang · commit 2480388fb12a · 2026-03-27T17:55:26.000Z
diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py
@@ -2249,64 +2249,6 @@ def _is_pipeline_device_mapped(self):
 
         return not is_device_type_map and isinstance(device_map, dict) and len(device_map) > 1
 
-    def enable_neuron_compile(
-        self,
-        model_names: Optional[List[str]] = None,
-        cache_dir: Optional[str] = None,
-        fullgraph: bool = True,
-    ) -> None:
-        """
-        Compiles the pipeline's nn.Module components with ``torch.compile(backend="neuron")``,
-        enabling whole-graph NEFF compilation for AWS Trainium/Inferentia.
-
-        The first forward call per component triggers neuronx-cc compilation (slow).
-        Use ``neuron_warmup()`` to trigger this explicitly before timed inference.
-
-        Args:
-            model_names (`List[str]`, *optional*):
-                Component names to compile. Defaults to all nn.Module components.
-            cache_dir (`str`, *optional*):
-                Path to persist compiled NEFFs across runs via ``TORCH_NEURONX_NEFF_CACHE_DIR``.
-                Skips recompilation on subsequent runs.
-            fullgraph (`bool`, defaults to `True`):
-                Disallow graph breaks (required for full-graph fusion).
-        """
-        requires_backends(self, "torch_neuronx")
-        import torch_neuronx  # noqa: F401 — registers neuron backend
-        from torch_neuronx.neuron_dynamo_backend import set_model_name
-
-        if cache_dir is not None:
-            os.environ["TORCH_NEURONX_NEFF_CACHE_DIR"] = cache_dir
-
-        if model_names is None:
-            model_names = [
-                name for name, comp in self.components.items() if isinstance(comp, torch.nn.Module)
-            ]
-
-        for name in model_names:
-            component = getattr(self, name, None)
-            if isinstance(component, torch.nn.Module) and not is_compiled_module(component):
-                logger.info(f"Compiling {name} with backend='neuron'")
-                set_model_name(name)
-                setattr(self, name, torch.compile(component, backend="neuron", fullgraph=fullgraph))
-
-    def neuron_warmup(self, *args, **kwargs) -> None:
-        """
-        Runs a single dummy forward pass through the pipeline to trigger neuronx-cc
-        compilation for all components (static-shape NEFF compilation).
-
-        This is equivalent to calling ``__call__`` with the same shapes but discards
-        the output. After warmup, subsequent calls reuse the compiled NEFFs and run fast.
-
-        Pass the same arguments you would use for real inference (height, width,
-        num_inference_steps, batch_size, etc.) so that the compiled shapes match.
-        """
-        logger.info("Running Neuron warmup forward pass to trigger NEFF compilation...")
-        with torch.no_grad():
-            self(*args, **kwargs)
-        logger.info("Neuron warmup complete.")
-
-
 class StableDiffusionMixin:
     r"""
     Helper for DiffusionPipeline with vae and unet.(mainly for LDM such as stable diffusion)
diff --git a/src/diffusers/utils/testing_utils.py b/src/diffusers/utils/testing_utils.py
@@ -46,6 +46,7 @@
     is_peft_available,
     is_timm_available,
     is_torch_available,
+    is_torch_neuronx_available,
     is_torch_version,
     is_torchao_available,
     is_torchsde_available,
@@ -113,6 +114,8 @@
             torch_device = "cuda"
         elif torch.xpu.is_available():
             torch_device = "xpu"
+        elif is_torch_neuronx_available() and hasattr(torch, "neuron") and torch.neuron.is_available():
+            torch_device = torch.neuron.current_device()
         else:
             torch_device = "cpu"
         is_torch_higher_equal_than_1_12 = version.parse(
diff --git a/tests/pipelines/pixart_alpha/test_pixart.py b/tests/pipelines/pixart_alpha/test_pixart.py
@@ -28,6 +28,8 @@
     PixArtTransformer2DModel,
 )
 
+from diffusers.utils.import_utils import is_torch_neuronx_available
+
 from ...testing_utils import (
     backend_empty_cache,
     enable_full_determinism,
@@ -291,7 +293,9 @@ def test_pixart_1024(self):
         expected_slice = np.array([0.0742, 0.0835, 0.2114, 0.0295, 0.0784, 0.2361, 0.1738, 0.2251, 0.3589])
 
         max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
-        self.assertLessEqual(max_diff, 1e-4)
+        # Neuron uses bfloat16 internally which has lower precision than float16 on CUDA
+        atol = 1e-2 if is_torch_neuronx_available() else 1e-4
+        self.assertLessEqual(max_diff, atol)
 
     def test_pixart_512(self):
         generator = torch.Generator("cpu").manual_seed(0)
@@ -307,7 +311,9 @@ def test_pixart_512(self):
         expected_slice = np.array([0.3477, 0.3882, 0.4541, 0.3413, 0.3821, 0.4463, 0.4001, 0.4409, 0.4958])
 
         max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
-        self.assertLessEqual(max_diff, 1e-4)
+        # Neuron uses bfloat16 internally which has lower precision than float16 on CUDA
+        atol = 1e-2 if is_torch_neuronx_available() else 1e-4
+        self.assertLessEqual(max_diff, atol)
 
     def test_pixart_1024_without_resolution_binning(self):
         generator = torch.manual_seed(0)
diff --git a/tests/testing_utils.py b/tests/testing_utils.py
@@ -45,6 +45,7 @@
     is_peft_available,
     is_timm_available,
     is_torch_available,
+    is_torch_neuronx_available,
     is_torch_version,
     is_torchao_available,
     is_torchsde_available,
@@ -109,6 +110,8 @@
             torch_device = "cuda"
         elif torch.xpu.is_available():
             torch_device = "xpu"
+        elif is_torch_neuronx_available() and hasattr(torch, "neuron") and torch.neuron.is_available():
+            torch_device = torch.neuron.current_device()
         else:
             torch_device = "cpu"
         is_torch_higher_equal_than_1_12 = version.parse(
@@ -1427,6 +1430,15 @@ def _is_torch_fp64_available(device):
     # Behaviour flags
     BACKEND_SUPPORTS_TRAINING = {"cuda": True, "xpu": True, "cpu": True, "mps": False, "default": True}
 
+    # Neuron device key: torch.neuron.current_device() returns an int (e.g. 0).
+    # We capture it once at import time if torch_neuronx is available so we can add it
+    # to all dispatch tables using the same key that torch_device is set to.
+    _neuron_device = (
+        torch.neuron.current_device()
+        if (is_torch_neuronx_available() and hasattr(torch, "neuron") and torch.neuron.is_available())
+        else None
+    )
+
     # Function definitions
     BACKEND_EMPTY_CACHE = {
         "cuda": torch.cuda.empty_cache,
@@ -1478,13 +1490,19 @@ def _is_torch_fp64_available(device):
         "default": None,
     }
 
+    if _neuron_device is not None:
+        BACKEND_EMPTY_CACHE[_neuron_device] = None
+        BACKEND_DEVICE_COUNT[_neuron_device] = torch.neuron.device_count
+        BACKEND_MANUAL_SEED[_neuron_device] = torch.manual_seed
+        BACKEND_RESET_PEAK_MEMORY_STATS[_neuron_device] = None
+        BACKEND_RESET_MAX_MEMORY_ALLOCATED[_neuron_device] = None
+        BACKEND_MAX_MEMORY_ALLOCATED[_neuron_device] = 0
+        BACKEND_SYNCHRONIZE[_neuron_device] = torch.neuron.synchronize
+
 
 # This dispatches a defined function according to the accelerator from the function definitions.
 def _device_agnostic_dispatch(device: str, dispatch_table: dict[str, Callable], *args, **kwargs):
-    if device not in dispatch_table:
-        return dispatch_table["default"](*args, **kwargs)
-
-    fn = dispatch_table[device]
+    fn = dispatch_table[device] if device in dispatch_table else dispatch_table["default"]
 
     # Some device agnostic functions return values. Need to guard against 'None' instead at
     # user level