Arm backend: Raise quantization testing thresholds (pytorch#17617)

AdrianLundell · web-flow · commit 8dc6f36f1905 · 2026-02-23T10:30:39.000+01:00
Raises Frobenius norm and cosine similarity thresholds for comparison
ops and a8w4 ops. These types of ops will always have potentially large
diffs from the original model for random inputs when quantized, and they
have been causing flaky test failures.

Signed-off-by: Adrian Lundell <adrian.lundell@arm.com>
diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py
@@ -510,7 +510,7 @@ def test_convolution_2d_tosa_INT_a8w4(test_data):
         aten_op,
         exir_op,
         tosa_extensions=["int4"],
-        frobenius_threshold=0.3,
+        frobenius_threshold=0.4,
     )
     pipeline.quantizer.set_global(
         get_symmetric_a8w4_quantization_config(is_per_channel=per_channel_quantization)
diff --git a/backends/arm/test/ops/test_conv3d.py b/backends/arm/test/ops/test_conv3d.py
@@ -531,7 +531,7 @@ def test_convolution_3d_tosa_INT_a8w4(test_data):
         exir_op,
         tosa_extensions=["int4"],
         qtol=1,
-        frobenius_threshold=0.2,
+        frobenius_threshold=0.4,
     )
     pipeline.quantizer.set_global(
         get_symmetric_a8w4_quantization_config(is_per_channel=per_channel_quantization)
diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py
@@ -279,6 +279,7 @@ def test_convolution_2d_tosa_INT_a8w4_depthwise(test_data):
         aten_op=[],
         exir_op=exir_op,
         tosa_extensions=["int4"],
+        frobenius_threshold=0.4,
     )
     pipeline.quantizer.set_global(
         get_symmetric_a8w4_quantization_config(is_per_channel=per_channel_quantization)
diff --git a/backends/arm/test/ops/test_eq.py b/backends/arm/test/ops/test_eq.py
@@ -146,7 +146,8 @@ def test_eq_scalar_tosa_INT_tensor(test_module):
         test_module().get_inputs(),
         Equal.aten_op_Tensor,
         Equal.exir_op,
-        frobenius_threshold=0.5,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        frobenius_threshold=0.6,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        cosine_threshold=0.8,
     )
     pipeline.run()
 
@@ -158,7 +159,8 @@ def test_eq_scalar_tosa_INT(test_module):
         test_module().get_inputs(),
         Equal.aten_op_Tensor,
         Equal.exir_op,
-        frobenius_threshold=0.5,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        frobenius_threshold=0.6,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        cosine_threshold=0.8,
     )
     pipeline.run()
 
diff --git a/backends/arm/test/ops/test_ge.py b/backends/arm/test/ops/test_ge.py
@@ -148,7 +148,8 @@ def test_ge_tensor_tosa_INT(test_module):
         test_module().get_inputs(),
         GreaterEqual.aten_op_tensor,
         GreaterEqual.exir_op,
-        frobenius_threshold=0.5,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        frobenius_threshold=0.6,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        cosine_threshold=0.8,
     )
     pipeline.run()
 
@@ -160,7 +161,8 @@ def test_ge_scalar_tosa_INT(test_module):
         test_module().get_inputs(),
         GreaterEqual.aten_op_tensor,
         GreaterEqual.exir_op,
-        frobenius_threshold=0.5,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        frobenius_threshold=0.6,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        cosine_threshold=0.8,
     )
     pipeline.run()
 
diff --git a/backends/arm/test/ops/test_gt.py b/backends/arm/test/ops/test_gt.py
@@ -149,7 +149,8 @@ def test_gt_tensor_tosa_INT(test_module):
         test_module().get_inputs(),
         Greater.aten_op_tensor,
         Greater.exir_op,
-        frobenius_threshold=0.5,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        frobenius_threshold=0.6,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        cosine_threshold=0.8,
     )
     pipeline.run()
 
@@ -161,7 +162,8 @@ def test_gt_scalar_tosa_INT(test_module):
         test_module().get_inputs(),
         Greater.aten_op_tensor,
         Greater.exir_op,
-        frobenius_threshold=0.5,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        frobenius_threshold=0.6,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        cosine_threshold=0.8,
     )
     pipeline.run()
 
diff --git a/backends/arm/test/ops/test_le.py b/backends/arm/test/ops/test_le.py
@@ -125,7 +125,8 @@ def test_le_tensor_tosa_INT(test_module):
         test_module().get_inputs(),
         LessEqual.aten_op_tensor,
         LessEqual.exir_op,
-        frobenius_threshold=0.5,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        frobenius_threshold=0.6,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        cosine_threshold=0.8,
     )
     pipeline.run()
 
@@ -137,7 +138,8 @@ def test_le_scalar_tosa_INT(test_module):
         test_module().get_inputs(),
         LessEqual.aten_op_tensor,
         LessEqual.exir_op,
-        frobenius_threshold=0.5,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        frobenius_threshold=0.6,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        cosine_threshold=0.8,
     )
     pipeline.run()
 
diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py
@@ -217,7 +217,7 @@ def test_linear_tosa_INT_a8w4(test_data: torch.Tensor):
         (test_data,),
         aten_op,
         tosa_extensions=["int4"],
-        frobenius_threshold=0.15,
+        frobenius_threshold=0.4,
     )
     pipeline.quantizer.set_global(
         get_symmetric_a8w4_quantization_config(is_per_channel=per_channel_quantization)
diff --git a/backends/arm/test/ops/test_lt.py b/backends/arm/test/ops/test_lt.py
@@ -125,7 +125,8 @@ def test_lt_tensor_tosa_INT(test_module):
         test_module().get_inputs(),
         LessThan.aten_op_tensor,
         LessThan.exir_op,
-        frobenius_threshold=0.5,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        frobenius_threshold=0.6,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        cosine_threshold=0.8,
     )
     pipeline.run()
 
@@ -137,7 +138,8 @@ def test_lt_scalar_tosa_INT(test_module):
         test_module().get_inputs(),
         LessThan.aten_op_tensor,
         LessThan.exir_op,
-        frobenius_threshold=0.5,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        frobenius_threshold=0.6,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        cosine_threshold=0.8,
     )
     pipeline.run()
 
diff --git a/backends/arm/test/ops/test_ne.py b/backends/arm/test/ops/test_ne.py
@@ -111,7 +111,8 @@ def test_ne_tensor_tosa_INT(test_module):
         test_module.get_inputs(),
         NotEqual.decomposed_ops,
         NotEqual.exir_op,
-        frobenius_threshold=0.5,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        frobenius_threshold=0.6,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        cosine_threshold=0.8,
     )
     pipeline.run()
 
@@ -123,7 +124,8 @@ def test_ne_scalar_tosa_INT(test_module):
         test_module.get_inputs(),
         NotEqual.decomposed_ops,
         NotEqual.exir_op,
-        frobenius_threshold=0.5,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        frobenius_threshold=0.6,  # Quantized comparisons with small diffs can be inaccurate, leading to large errors in unlucky cases.
+        cosine_threshold=0.8,
     )
     pipeline.run()
 

Original file line number	Diff line number	Diff line change
`@@ -510,7 +510,7 @@ def test_convolution_2d_tosa_INT_a8w4(test_data):`
`510`	`510`	`aten_op,`
`511`	`511`	`exir_op,`
`512`	`512`	`tosa_extensions=["int4"],`
`513`		`- frobenius_threshold=0.3,`
	`513`	`+ frobenius_threshold=0.4,`
`514`	`514`	`)`
`515`	`515`	`pipeline.quantizer.set_global(`
`516`	`516`	`get_symmetric_a8w4_quantization_config(is_per_channel=per_channel_quantization)`
Original file line number	Diff line number	Diff line change
`@@ -531,7 +531,7 @@ def test_convolution_3d_tosa_INT_a8w4(test_data):`
`531`	`531`	`exir_op,`
`532`	`532`	`tosa_extensions=["int4"],`
`533`	`533`	`qtol=1,`
`534`		`- frobenius_threshold=0.2,`
	`534`	`+ frobenius_threshold=0.4,`
`535`	`535`	`)`
`536`	`536`	`pipeline.quantizer.set_global(`
`537`	`537`	`get_symmetric_a8w4_quantization_config(is_per_channel=per_channel_quantization)`
Original file line number	Diff line number	Diff line change
`@@ -279,6 +279,7 @@ def test_convolution_2d_tosa_INT_a8w4_depthwise(test_data):`
`279`	`279`	`aten_op=[],`
`280`	`280`	`exir_op=exir_op,`
`281`	`281`	`tosa_extensions=["int4"],`
	`282`	`+ frobenius_threshold=0.4,`
`282`	`283`	`)`
`283`	`284`	`pipeline.quantizer.set_global(`
`284`	`285`	`get_symmetric_a8w4_quantization_config(is_per_channel=per_channel_quantization)`
Original file line number	Diff line number	Diff line change
`@@ -217,7 +217,7 @@ def test_linear_tosa_INT_a8w4(test_data: torch.Tensor):`
`217`	`217`	`(test_data,),`
`218`	`218`	`aten_op,`
`219`	`219`	`tosa_extensions=["int4"],`
`220`		`- frobenius_threshold=0.15,`
	`220`	`+ frobenius_threshold=0.4,`
`221`	`221`	`)`
`222`	`222`	`pipeline.quantizer.set_global(`
`223`	`223`	`get_symmetric_a8w4_quantization_config(is_per_channel=per_channel_quantization)`