Skip to content

Commit d559349

Browse files
committed
tmp
1 parent c61afb3 commit d559349

1 file changed

Lines changed: 43 additions & 11 deletions

File tree

.github/workflows/nvidia-smoke.yml

Lines changed: 43 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -4,15 +4,47 @@ on:
44
branches: [ nvidia-gpu-runners ]
55
workflow_dispatch: {}
66
jobs:
7-
smoke:
8-
runs-on: gpu-runners
7+
gpu-test:
8+
runs-on: [self-hosted, nvidia-docker-b200-8-x86-64]
9+
910
steps:
10-
- uses: actions/checkout@v4
11-
- run: nvidia-smi || true
12-
- run: |
13-
python - <<'PY'
14-
import torch
15-
print("cuda?", torch.cuda.is_available(), "count:", torch.cuda.device_count())
16-
if torch.cuda.is_available():
17-
print("name:", torch.cuda.get_device_name(0))
18-
PY
11+
- name: Checkout repo
12+
uses: actions/checkout@v4
13+
14+
- name: Show GPU info
15+
run: |
16+
echo "===== nvidia-smi ====="
17+
nvidia-smi || echo "nvidia-smi not available"
18+
echo "======================"
19+
20+
- name: Run CUDA sanity test with PyTorch
21+
run: |
22+
python - << 'EOF'
23+
import torch, time
24+
25+
print("PyTorch version:", torch.__version__)
26+
print("CUDA available:", torch.cuda.is_available())
27+
print("CUDA device count:", torch.cuda.device_count())
28+
29+
if not torch.cuda.is_available():
30+
raise SystemExit("ERROR: CUDA not available on this runner ❌")
31+
32+
# list all visible GPUs
33+
for i in range(torch.cuda.device_count()):
34+
print(f"Device {i}: {torch.cuda.get_device_name(i)}")
35+
36+
# simple GPU compute test on cuda:0
37+
device = torch.device("cuda:0")
38+
a = torch.randn(4096, 4096, device=device)
39+
b = torch.randn(4096, 4096, device=device)
40+
41+
torch.cuda.synchronize()
42+
t0 = time.time()
43+
c = a @ b
44+
torch.cuda.synchronize()
45+
t1 = time.time()
46+
47+
print("Matmul result shape:", tuple(c.shape))
48+
print(f"Matmul took {t1 - t0:.3f} sec on GPU")
49+
print("All good ✅")
50+
EOF

0 commit comments

Comments
 (0)