Updated more CUDA tests

JPPhoto · JPPhoto · commit 5e3a99f707a3 · 2026-05-26T20:39:11.000-05:00
diff --git a/tests/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/test_all_custom_modules.py b/tests/backend/model_manager/load/model_cache/torch_module_autocast/custom_modules/test_all_custom_modules.py
@@ -316,30 +316,45 @@ def test_inference_autocast_from_cpu_to_device(device: str, layer_under_test: La
     # Move the original layer to the CPU.
     layer_to_device_via_state_dict(orig_layer, "cpu")
 
-    # Inference should fail with an input on the device.
-    with pytest.raises(RuntimeError):
-        _ = orig_layer(x)
+    is_nf4_layer = type(orig_layer).__name__ == "InvokeLinearNF4"
+    # Inference should fail with an input on the device. Do not probe raw NF4 here: with CPU-stored weights and a
+    # single-row CUDA input, some bitsandbytes versions hit an unsafe gemv_4bit path instead of raising safely.
+    if not is_nf4_layer:
+        with pytest.raises((RuntimeError, ValueError)):
+            _ = orig_layer(x)
 
     # Wrap the original layer.
     custom_layer = copy.deepcopy(orig_layer)
     custom_layer = wrap_single_custom_layer(custom_layer)
 
-    # Inference should still fail with autocasting disabled.
+    # Inference should still fail with autocasting disabled. See the raw NF4 note above.
     custom_layer.set_device_autocasting_enabled(False)
-    with pytest.raises(RuntimeError):
-        _ = custom_layer(x)
+    if not is_nf4_layer:
+        with pytest.raises((RuntimeError, ValueError)):
+            _ = custom_layer(x)
 
     # Run inference with the wrapped layer on the device.
     custom_layer.set_device_autocasting_enabled(True)
     custom_output = custom_layer(x)
     assert custom_output.device.type == device
 
-    assert torch.allclose(orig_output, custom_output)
+    if is_nf4_layer:
+        assert torch.allclose(orig_output, custom_output, atol=1e-5)
+    else:
+        assert torch.allclose(orig_output, custom_output)
 
 
 PatchUnderTest = tuple[list[tuple[BaseLayerPatch, float]], torch.Tensor]
 
 
+def _has_dora_patch(patches: list[tuple[BaseLayerPatch, float]]) -> bool:
+    return any(isinstance(patch, DoRALayer) for patch, _ in patches)
+
+
+def _is_bnb_quantized_linear(layer: torch.nn.Module) -> bool:
+    return type(layer).__name__ in {"InvokeLinear8bitLt", "InvokeLinearNF4"}
+
+
 @pytest.fixture(
     params=[
         "single_lora",
@@ -564,6 +579,8 @@ def test_quantized_linear_sidecar_patches(
     patches, input = patch_under_test
 
     linear_layer, quantized_linear_layer = quantized_linear_layer_under_test
+    if _is_bnb_quantized_linear(quantized_linear_layer) and _has_dora_patch(patches):
+        pytest.skip("DoRA patches require readable base weights and are not compatible with bnb quantized layers.")
 
     # Move everything to the device.
     layer_to_device_via_state_dict(linear_layer, device)
@@ -598,6 +615,8 @@ def test_quantized_linear_sidecar_patches_with_autocast_from_cpu_to_device(
     patches, input = patch_under_test
 
     _, quantized_linear_layer = quantized_linear_layer_under_test
+    if _is_bnb_quantized_linear(quantized_linear_layer) and _has_dora_patch(patches):
+        pytest.skip("DoRA patches require readable base weights and are not compatible with bnb quantized layers.")
 
     # Move everything to the device.
     layer_to_device_via_state_dict(quantized_linear_layer, device)