|
4 | 4 | import json |
5 | 5 | import os |
6 | 6 | import shlex |
| 7 | +import shutil |
7 | 8 | import subprocess |
8 | 9 | import tempfile |
9 | 10 | import time |
@@ -232,11 +233,18 @@ def compile_cuda_script( # # noqa: C901 |
232 | 233 |
|
233 | 234 |
|
234 | 235 | def run_program( |
235 | | - args: list[str], seed: Optional[int], timeout: int, multi_gpu: bool = False |
| 236 | + args: list[str], |
| 237 | + seed: Optional[int], |
| 238 | + timeout: int, |
| 239 | + multi_gpu: bool = False, |
| 240 | + extra_env: Optional[dict[str, str]] = None, |
236 | 241 | ) -> RunResult: |
237 | 242 | print("[Running]") |
238 | 243 | # set up a pipe so the tester can communicate its verdict with us |
239 | 244 | env = os.environ.copy() |
| 245 | + if extra_env is not None: |
| 246 | + env.update(extra_env) |
| 247 | + |
240 | 248 | pipe_read, pipe_write = os.pipe() |
241 | 249 | env["POPCORN_FD"] = str(pipe_write) |
242 | 250 | if seed is not None: |
@@ -297,6 +305,87 @@ def run_program( |
297 | 305 | ) |
298 | 306 |
|
299 | 307 |
|
def _rocprof_command(call: list[str], output_dir: Path) -> list[str]:
    """Return ``call`` wrapped in a ``rocprofv3`` invocation.

    Args:
        call: The command line of the program to profile.
        output_dir: Directory passed to rocprofv3 via ``-d``.

    Returns:
        A new argument list; ``call`` itself is not modified.
    """
    return [
        "rocprofv3",
        "--log-level",
        "fatal",
        "--hip-trace",
        "--kernel-trace",
        "--rccl-trace",
        "--marker-trace",
        "--memory-copy-trace",
        # New? Doesn't work in the runner
        # "--memory-allocation-trace",
        "--scratch-memory-trace",
        # The HSA trace output is very large, so skip it for now
        # "--hsa-trace",
        "--output-format",
        "pftrace",
        "csv",
        "-d",
        str(output_dir),
        # Just store the files as %pid%_tracename.ext instead of putting them in an
        # additional directory named after the hostname.
        "-o",
        "%pid%",
        "--",
    ] + call


def _combine_pftraces(output_dir: Path) -> None:
    """Concatenate every per-process ``*.pftrace`` in ``output_dir`` into one file.

    The result is written to ``output_dir / "combined.pftrace"`` and each
    individual trace is deleted once it has been appended.
    """
    combined_path = output_dir / "combined.pftrace"
    # Collect the inputs *before* opening the output. A combined trace left
    # behind by an earlier run must not be treated as an input: it would be
    # truncated, copied into itself and then unlinked while still open.
    # Sorting makes the concatenation order independent of directory order.
    traces = sorted(
        path
        for path in output_dir.glob("*.pftrace")
        if path.name != combined_path.name
    )
    with combined_path.open("wb") as combined:
        for trace_path in traces:
            with trace_path.open("rb") as trace:
                shutil.copyfileobj(trace, combined)

            # After we've appended it to the combined trace, there is no
            # point in keeping the individual trace around.
            trace_path.unlink()


def profile_program(
    system: SystemInfo,
    call: list[str],
    seed: Optional[int],
    timeout: int,
    multi_gpu: bool,
) -> tuple[RunResult, Optional[ProfileResult]]:
    """Run ``call`` under a profiler where one is supported for ``system``.

    On the ROCm runtime the program is wrapped in ``rocprofv3`` and, if the run
    succeeds, the generated traces are merged and the dumped code objects are
    moved next to them. On every other runtime the program is simply run
    without profiling.

    Args:
        system: Description of the machine; only ``system.runtime`` is read.
        call: Command line of the program to run.
        seed: Forwarded to ``run_program``.
        timeout: Forwarded to ``run_program``.
        multi_gpu: Forwarded to ``run_program``.

    Returns:
        A ``(run_result, profile_result)`` pair. ``profile_result`` is ``None``
        when profiling is not implemented for the runtime or the run failed.
    """
    if system.runtime != "ROCm":
        # TODO: Implement profiling for other platforms
        return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None

    # The runner-specific configuration should implement logic
    # to fetch the data in this directory and return it as
    # ProfileResult.download_url.
    # Insert an extra nested path here so that the resulting zip has all files
    # in the profile_data/ directory rather than directly in the root.
    output_dir = Path(".") / "profile_data" / "profile_data"
    output_dir.mkdir(parents=True, exist_ok=True)

    run_result = run_program(
        _rocprof_command(call, output_dir),
        seed=seed,
        timeout=timeout,
        multi_gpu=multi_gpu,
        # NOTE(review): presumably this makes the ROCm runtime dump the
        # `_code_object*.o` files collected below into the working
        # directory -- confirm against the ROCm documentation.
        extra_env={
            "GPU_DUMP_CODE_OBJECT": "1",
        },
    )

    profile_result = None

    if run_result.success:
        # Post-process trace data.
        # rocPROF generates one trace for every process, but it's more useful to
        # have all traces be in the same file. Fortunately we can do that by
        # concatenating.
        _combine_pftraces(output_dir)

        # Also move the code objects to the profiling output directory.
        # Materialize the listing first, since the loop renames files out of
        # the directory being globbed.
        for code_obj in list(Path.cwd().glob("_code_object*.o")):
            code_obj.rename(output_dir / code_obj.name)

        profile_result = ProfileResult(
            profiler='rocPROF',
            download_url=None,
        )

    return run_result, profile_result
| 388 | + |
300 | 389 | def run_single_evaluation( |
301 | 390 | system: SystemInfo, |
302 | 391 | call: list[str], |
@@ -332,6 +421,9 @@ def run_single_evaluation( |
332 | 421 |
|
333 | 422 | call += [mode, cases.name] |
334 | 423 |
|
| 424 | + if mode == "profile": |
| 425 | + return profile_program(system, call, seed=seed, timeout=timeout, multi_gpu=multi_gpu) |
| 426 | + |
335 | 427 | return run_program(call, seed=seed, timeout=timeout, multi_gpu=multi_gpu), None |
336 | 428 |
|
337 | 429 |
|
|
0 commit comments