|
4 | 4 | branches: [ nvidia-gpu-runners ] |
5 | 5 | workflow_dispatch: {} |
6 | 6 | jobs: |
7 | | - smoke: |
8 | | - runs-on: gpu-runners |
| 7 | + gpu-test: |
| 8 | + runs-on: [self-hosted, nvidia-docker-b200-8-x86-64] |
| 9 | + |
9 | 10 | steps: |
10 | | - - uses: actions/checkout@v4 |
11 | | - - run: nvidia-smi || true |
12 | | - - run: | |
13 | | - python - <<'PY' |
14 | | - import torch |
15 | | - print("cuda?", torch.cuda.is_available(), "count:", torch.cuda.device_count()) |
16 | | - if torch.cuda.is_available(): |
17 | | - print("name:", torch.cuda.get_device_name(0)) |
18 | | - PY |
| 11 | + - name: Checkout repo |
| 12 | + uses: actions/checkout@v4 |
| 13 | + |
| 14 | + - name: Show GPU info |
| 15 | + run: | |
| 16 | + echo "===== nvidia-smi =====" |
| 17 | + nvidia-smi || echo "nvidia-smi not available" |
| 18 | + echo "======================" |
| 19 | +
|
| 20 | + - name: Run CUDA sanity test with PyTorch |
| 21 | + run: | |
| 22 | + python - << 'EOF' |
| 23 | + import torch, time |
| 24 | +
|
| 25 | + print("PyTorch version:", torch.__version__) |
| 26 | + print("CUDA available:", torch.cuda.is_available()) |
| 27 | + print("CUDA device count:", torch.cuda.device_count()) |
| 28 | +
|
| 29 | + if not torch.cuda.is_available(): |
| 30 | + raise SystemExit("ERROR: CUDA not available on this runner ❌") |
| 31 | +
|
| 32 | + # list all visible GPUs |
| 33 | + for i in range(torch.cuda.device_count()): |
| 34 | + print(f"Device {i}: {torch.cuda.get_device_name(i)}") |
| 35 | +
|
| 36 | + # simple GPU compute test on cuda:0 |
| 37 | + device = torch.device("cuda:0") |
| 38 | + a = torch.randn(4096, 4096, device=device) |
| 39 | + b = torch.randn(4096, 4096, device=device) |
| 40 | +
|
| 41 | + torch.cuda.synchronize() |
| 42 | + t0 = time.time() |
| 43 | + c = a @ b |
| 44 | + torch.cuda.synchronize() |
| 45 | + t1 = time.time() |
| 46 | +
|
| 47 | + print("Matmul result shape:", tuple(c.shape)) |
| 48 | + print(f"Matmul took {t1 - t0:.3f} sec on GPU") |
| 49 | + print("All good ✅") |
| 50 | + EOF |
0 commit comments