Skip to content

Commit 51af552

Browse files
authored
Initial AMD Profiling (#339)
* rocprof: implement ROCm profiling This uses rocPROF to fetch some interesting data and put it in the profile_data directory, the download link of which is then returned to the user. * rocprof: post-process rocprof results rocPROF generates one trace for every process. Simply combine them together into a single trace for ease of use. Also remove the individual traces, as they are no longer useful afterwards. * rocprof: also output code objects
1 parent 5464d43 commit 51af552

1 file changed

Lines changed: 93 additions & 1 deletion

File tree

src/libkernelbot/run_eval.py

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import json
55
import os
66
import shlex
7+
import shutil
78
import subprocess
89
import tempfile
910
import time
@@ -232,11 +233,18 @@ def compile_cuda_script( # # noqa: C901
232233

233234

234235
def run_program(
235-
args: list[str], seed: Optional[int], timeout: int, multi_gpu: bool = False
236+
args: list[str],
237+
seed: Optional[int],
238+
timeout: int,
239+
multi_gpu: bool = False,
240+
extra_env: Optional[dict[str, str]] = None,
236241
) -> RunResult:
237242
print("[Running]")
238243
# set up a pipe so the tester can communicate its verdict with us
239244
env = os.environ.copy()
245+
if extra_env is not None:
246+
env.update(extra_env)
247+
240248
pipe_read, pipe_write = os.pipe()
241249
env["POPCORN_FD"] = str(pipe_write)
242250
if seed is not None:
@@ -297,6 +305,87 @@ def run_program(
297305
)
298306

299307

308+
def profile_program(
    system: SystemInfo,
    call: list[str],
    seed: Optional[int],
    timeout: int,
    multi_gpu: bool,
) -> tuple[RunResult, Optional[ProfileResult]]:
    """Run ``call``, wrapped in a profiler when the platform supports one.

    On ROCm the command is wrapped in ``rocprofv3`` and its output is written
    to ``./profile_data/profile_data``. After a successful run the per-process
    ``*.pftrace`` files are concatenated into ``combined.pftrace`` (the
    individual files are then deleted) and any ``_code_object*.o`` files found
    in the current working directory are moved next to the traces.

    On every other platform the program is run unprofiled.

    Args:
        system: Description of the machine; only ``system.runtime`` is read.
        call: Command line of the program to execute. Not modified.
        seed: Forwarded unchanged to ``run_program``.
        timeout: Forwarded unchanged to ``run_program``.
        multi_gpu: Forwarded unchanged to ``run_program``.

    Returns:
        ``(run_result, profile_result)``. ``profile_result`` is ``None`` when
        the platform has no profiler support or the run was not successful.
        Otherwise it is a ``ProfileResult`` whose ``download_url`` is ``None``;
        the runner-specific code is expected to fill it in.
    """
    # The runner-specific configuration should implement logic
    # to fetch the data in this directory and return it as
    # ProfileResult.download_url.
    # Insert an extra nested path here so that the resulting zip has all files
    # in the profile_data/ directory rather than directly in the root.
    output_dir = Path(".") / "profile_data" / "profile_data"
    output_dir.mkdir(parents=True, exist_ok=True)

    if system.runtime != "ROCm":
        # TODO: Implement profiling for other platforms
        return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None

    # Wrap program in rocprof
    profiled_call = [
        "rocprofv3",
        "--log-level",
        "fatal",
        "--hip-trace",
        "--kernel-trace",
        "--rccl-trace",
        "--marker-trace",
        "--memory-copy-trace",
        # New? Doesn't work in the runner
        # "--memory-allocation-trace",
        "--scratch-memory-trace",
        # The HSA trace output is very large, so skip it for now
        # "--hsa-trace",
        "--output-format",
        "pftrace",
        "csv",
        "-d",
        str(output_dir),
        # Just store the files as %pid%_tracename.ext instead of putting them in an
        # additional directory named after the hostname.
        "-o",
        "%pid%",
        "--",
    ] + call

    run_result = run_program(
        profiled_call,
        seed=seed,
        timeout=timeout,
        multi_gpu=multi_gpu,
        # Presumably makes the runtime dump the code objects that are
        # collected below -- confirm against the ROCm documentation.
        extra_env={"GPU_DUMP_CODE_OBJECT": "1"},
    )

    if not run_result.success:
        return run_result, None

    # Post-process trace data.
    # rocPROF generates one trace for every process, but it's more useful to
    # have all traces be in the same file. Fortunately we can do that by
    # concatenating.
    combined_path = output_dir / "combined.pftrace"
    # The output directory may survive from an earlier run (exist_ok=True), so
    # a stale combined trace can match the glob. Exclude it: otherwise it would
    # be copied into itself and then deleted together with the inputs.
    # Sorting keeps the concatenation order deterministic.
    traces = sorted(
        path for path in output_dir.glob("*.pftrace") if path.name != combined_path.name
    )
    with combined_path.open("wb") as combined:
        for trace_path in traces:
            with trace_path.open("rb") as trace:
                shutil.copyfileobj(trace, combined)

            # After its contents are in the combined trace, there is no point
            # in keeping the individual trace around.
            trace_path.unlink()

    # Also move the code objects to the profiling output directory.
    # Materialize the glob first since the loop moves the matched files.
    for code_obj in list(Path.cwd().glob("_code_object*.o")):
        code_obj.rename(output_dir / code_obj.name)

    profile_result = ProfileResult(
        profiler="rocPROF",
        download_url=None,
    )
    return run_result, profile_result
388+
300389
def run_single_evaluation(
301390
system: SystemInfo,
302391
call: list[str],
@@ -332,6 +421,9 @@ def run_single_evaluation(
332421

333422
call += [mode, cases.name]
334423

424+
if mode == "profile":
425+
return profile_program(system, call, seed=seed, timeout=timeout, multi_gpu=multi_gpu)
426+
335427
return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None
336428

337429

0 commit comments

Comments
 (0)