Add CUDA diagnostic script for troubleshooting GPU detection
- Checks PyTorch installation and version
- Detects CUDA availability and GPU info
- Tests CUDA with a simple tensor operation
- Shows device manager detection results
- Provides troubleshooting hints for CPU-only builds

Usage: python check_cuda.py or uv run check_cuda.py
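The CPU-only troubleshooting hint hinges on one distinction: torch.version.cuda is None when the installed PyTorch wheel was built without CUDA, but is a version string when PyTorch has CUDA support and the problem lies with the GPU or driver. A minimal standalone sketch of that check, using only public torch attributes:

import torch

# torch.version.cuda is None for CPU-only wheels; a version string otherwise.
if not torch.cuda.is_available():
    if torch.version.cuda is None:
        print("CPU-only PyTorch build: reinstall a CUDA-enabled wheel.")
    else:
        print("PyTorch has CUDA support, but no usable GPU/driver was found.")
else:
    print(f"CUDA OK: {torch.cuda.device_count()} device(s) visible.")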
check_cuda.py (new file, +97 lines)
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
"""Diagnostic script to check CUDA availability and GPU detection."""

import sys

print("=" * 60)
print("CUDA/GPU Detection Diagnostic")
print("=" * 60)
print()

# Check Python version
print(f"Python version: {sys.version}")
print()

# Check PyTorch
try:
    import torch
    print(f"✓ PyTorch installed: {torch.__version__}")
    print()

    # Check CUDA availability
    print("CUDA Detection:")
    print(f"  CUDA available: {torch.cuda.is_available()}")

    if torch.cuda.is_available():
        print(f"  CUDA version: {torch.version.cuda}")
        print(f"  cuDNN version: {torch.backends.cudnn.version()}")
        print(f"  GPU count: {torch.cuda.device_count()}")

        # List all GPUs
        for i in range(torch.cuda.device_count()):
            print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
            print(f"    Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB")

        # Test CUDA with a simple tensor operation
        try:
            test_tensor = torch.tensor([1.0, 2.0, 3.0]).cuda()
            print(f"\n  ✓ CUDA tensor test successful: {test_tensor.device}")
        except Exception as e:
            print(f"\n  ✗ CUDA tensor test failed: {e}")
    else:
        print("  ℹ No CUDA GPUs detected")
        print()
        print("  Possible reasons:")
        print("  1. No NVIDIA GPU installed")
        print("  2. NVIDIA drivers not installed or outdated")
        print("  3. PyTorch built without CUDA support (CPU-only)")
        print()
        print("  To check if PyTorch has CUDA support:")
        print(f"  Built with CUDA: {torch.version.cuda is not None}")

        if torch.version.cuda is None:
            print()
            print("  ⚠ PyTorch is CPU-only!")
            print("  To enable CUDA, reinstall PyTorch with CUDA support:")
            print("  uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121")

    print()

    # Check MPS (Apple Silicon)
    if hasattr(torch.backends, 'mps'):
        print("MPS Detection (Apple Silicon):")
        print(f"  MPS available: {torch.backends.mps.is_available()}")
        if torch.backends.mps.is_available():
            print(f"  MPS built: {torch.backends.mps.is_built()}")

except ImportError as e:
    print(f"✗ PyTorch not installed: {e}")
    print()
    print("Install with: uv pip install torch")

print()

# Check our device manager
try:
    from client.device_utils import DeviceManager

    print("=" * 60)
    print("Device Manager Detection")
    print("=" * 60)
    print()

    dm = DeviceManager()
    print(f"Available devices: {dm.available_devices}")
    print(f"Current device: {dm.current_device}")
    print(f"GPU available: {dm.is_gpu_available()}")
    print()

    print("Device info:")
    for device, description in dm.get_device_info():
        print(f"  {device}: {description}")

except ImportError as e:
    print(f"Device manager not available: {e}")

print()
print("=" * 60)
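For readers without the rest of the repository: the last section of the script assumes client.device_utils.DeviceManager exposes roughly the interface below. This is a hypothetical sketch inferred only from the attributes and methods check_cuda.py calls; the real module may differ.

import torch

class DeviceManager:
    """Hypothetical stand-in mirroring only what check_cuda.py uses."""

    def __init__(self):
        # Assumed: enumerate the backends PyTorch reports as usable.
        self.available_devices = ["cpu"]
        if torch.cuda.is_available():
            self.available_devices.append("cuda")
        mps = getattr(torch.backends, "mps", None)
        if mps is not None and mps.is_available():
            self.available_devices.append("mps")
        # Assumed: prefer a GPU backend when one exists.
        self.current_device = self.available_devices[-1]

    def is_gpu_available(self):
        # True when any non-CPU backend was detected.
        return any(d != "cpu" for d in self.available_devices)

    def get_device_info(self):
        # Must yield (device, description) pairs to match the unpacking
        # loop in the "Device info:" section of the script.
        for device in self.available_devices:
            if device == "cuda":
                yield device, torch.cuda.get_device_name(0)
            else:
                yield device, f"{device} backend"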