Skip to content

Commit a76953c

Browse files
committed
feat: neuron-specific changes in the pipeline
1 parent 0c51734 commit a76953c

5 files changed

Lines changed: 33 additions & 7 deletions

File tree

src/diffusers/models/unets/unet_2d_condition.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -855,10 +855,11 @@ def get_time_embed(self, sample: torch.Tensor, timestep: torch.Tensor | float |
855855
# This would be a good case for the `match` statement (Python 3.10+)
856856
is_mps = sample.device.type == "mps"
857857
is_npu = sample.device.type == "npu"
858+
is_neuron = sample.device.type == "neuron"
858859
if isinstance(timestep, float):
859-
dtype = torch.float32 if (is_mps or is_npu) else torch.float64
860+
dtype = torch.float32 if (is_mps or is_npu or is_neuron) else torch.float64
860861
else:
861-
dtype = torch.int32 if (is_mps or is_npu) else torch.int64
862+
dtype = torch.int32 if (is_mps or is_npu or is_neuron) else torch.int64
862863
timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
863864
elif len(timesteps.shape) == 0:
864865
timesteps = timesteps[None].to(sample.device)

src/diffusers/pipelines/pipeline_utils.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2273,6 +2273,7 @@ def enable_neuron_compile(
22732273
"""
22742274
requires_backends(self, "torch_neuronx")
22752275
import torch_neuronx # noqa: F401 — registers neuron backend
2276+
from torch_neuronx.neuron_dynamo_backend import set_model_name
22762277

22772278
if cache_dir is not None:
22782279
os.environ["TORCH_NEURONX_NEFF_CACHE_DIR"] = cache_dir
@@ -2286,6 +2287,7 @@ def enable_neuron_compile(
22862287
component = getattr(self, name, None)
22872288
if isinstance(component, torch.nn.Module) and not is_compiled_module(component):
22882289
logger.info(f"Compiling {name} with backend='neuron'")
2290+
set_model_name(name)
22892291
setattr(self, name, torch.compile(component, backend="neuron", fullgraph=fullgraph))
22902292

22912293
def neuron_warmup(self, *args, **kwargs) -> None:

src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py

Lines changed: 22 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1092,7 +1092,11 @@ def __call__(
10921092
)
10931093

10941094
# 4. Prepare timesteps
1095-
if XLA_AVAILABLE:
1095+
# Keep timesteps on CPU for XLA (TPU) and Neuron: both use lazy/XLA execution where
1096+
# dynamic-shape ops like .nonzero() and .item() inside scheduler.index_for_timestep()
1097+
# are incompatible with static-graph compilation.
1098+
is_neuron_device = hasattr(device, "type") and device.type == "neuron"
1099+
if XLA_AVAILABLE or is_neuron_device:
10961100
timestep_device = "cpu"
10971101
else:
10981102
timestep_device = device
@@ -1195,15 +1199,23 @@ def __call__(
11951199
# expand the latents if we are doing classifier free guidance
11961200
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
11971201

1198-
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
1202+
# For Neuron: scale_model_input on CPU to avoid XLA ops outside the compiled UNet region.
1203+
# index_for_timestep() uses .nonzero()/.item() which are incompatible with static graphs.
1204+
if is_neuron_device:
1205+
latent_model_input = self.scheduler.scale_model_input(latent_model_input.to("cpu"), t).to(device)
1206+
else:
1207+
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
11991208

12001209
# predict the noise residual
12011210
added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
12021211
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
12031212
added_cond_kwargs["image_embeds"] = image_embeds
1213+
# For Neuron: pre-cast timestep to float32 on device. Neuron XLA does not support
1214+
# int64 ops; the compiled UNet graph requires a float32 timestep input on-device.
1215+
t_unet = t.to(torch.float32).to(device) if is_neuron_device else t
12041216
noise_pred = self.unet(
12051217
latent_model_input,
1206-
t,
1218+
t_unet,
12071219
encoder_hidden_states=prompt_embeds,
12081220
timestep_cond=timestep_cond,
12091221
cross_attention_kwargs=self.cross_attention_kwargs,
@@ -1222,7 +1234,13 @@ def __call__(
12221234

12231235
# compute the previous noisy sample x_t -> x_t-1
12241236
latents_dtype = latents.dtype
1225-
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
1237+
# For Neuron: scheduler.step on CPU to keep scheduler arithmetic off the XLA device.
1238+
if is_neuron_device:
1239+
latents = self.scheduler.step(
1240+
noise_pred.to("cpu"), t, latents.to("cpu"), **extra_step_kwargs, return_dict=False
1241+
)[0].to(device)
1242+
else:
1243+
latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
12261244
if latents.dtype != latents_dtype:
12271245
if torch.backends.mps.is_available():
12281246
# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272

src/diffusers/utils/import_utils.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -584,6 +584,10 @@ def is_av_available():
584584
"""
585585

586586

587+
TORCH_NEURONX_IMPORT_ERROR = """
588+
{0} requires the torch_neuronx library (AWS Neuron SDK) but it was not found in your environment. Please install it following the AWS Neuron documentation: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/
589+
"""
590+
587591
BACKENDS_MAPPING = OrderedDict(
588592
[
589593
("bs4", (is_bs4_available, BS4_IMPORT_ERROR)),
@@ -614,6 +618,7 @@ def is_av_available():
614618
("pytorch_retinaface", (is_pytorch_retinaface_available, PYTORCH_RETINAFACE_IMPORT_ERROR)),
615619
("better_profanity", (is_better_profanity_available, BETTER_PROFANITY_IMPORT_ERROR)),
616620
("nltk", (is_nltk_available, NLTK_IMPORT_ERROR)),
621+
("torch_neuronx", (is_torch_neuronx_available, TORCH_NEURONX_IMPORT_ERROR)),
617622
]
618623
)
619624

src/diffusers/utils/torch_utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -93,7 +93,7 @@
9393
"xpu": getattr(torch.xpu, "synchronize", None),
9494
"cpu": None,
9595
"mps": None,
96-
"neuron": None,
96+
"neuron": lambda: getattr(getattr(torch, "neuron", None), "synchronize", lambda: None)(),
9797
"default": None,
9898
}
9999
logger = logging.get_logger(__name__) # pylint: disable=invalid-name

0 commit comments

Comments (0)