Skip to content

Commit 44c8aeb

Browse files
wenkaiduCopilot
andauthored
[RCCL] Enforce model index matching across nodes (#5377)
## Motivation RCCL relies on a single, consistent Rome / GIO preset topology model index across all ranks so precomputed graphs (rings, trees, etc.) stay in lockstep. If different ranks infer different romeTopoModelIdx, behavior is undefined and jobs should fail fast with actionable logs. ## Technical Details Use majority-style reference (plurality vote) for deciding reference index. For mismatched ranks, one line will be printed per physical host. ## JIRA ID ## Test Plan Manually introduce mismatched nodes and check output. ## Test Result rocm-systems/projects/rccl/build/hipify/src/init.cc:1681 NCCL WARN RCCL FATAL: mismatched Rome preset topology model index across ranks; all ranks must agree for precomputed graphs (voted refIdx 40 from 16 of 24 ranks). rocm-systems/projects/rccl/build/hipify/src/init.cc:1693 NCCL WARN rank 8 host useocpm2m-097-019 romeTopoModelIdx=-1 ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. --------- Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
1 parent 71aeed5 commit 44c8aeb

11 files changed

Lines changed: 230 additions & 0 deletions

File tree

projects/rccl/src/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ set(SRC_FILES
6464
graph/rings.h
6565
graph/rome_models.cc
6666
graph/rome_models.h
67+
graph/rome_topo_consensus.cc
68+
graph/rome_topo_consensus.h
6769
graph/search.cc
6870
graph/topo.cc
6971
graph/topo.h

projects/rccl/src/graph/rome_models.cc

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2080,6 +2080,17 @@ static void parseOptions(struct ncclTopoSystem* system, const char *options) {
20802080
}
20812081
}
20822082

2083+
static ncclResult_t rcclTopoSetPresetRomeModelIdx(struct ncclTopoSystem* system, int idx) {
2084+
if (system->romeTopoModelIdx == RCCL_ROME_TOPO_PRESET_MODEL_IDX_NONE) {
2085+
system->romeTopoModelIdx = idx;
2086+
} else if (system->romeTopoModelIdx != idx) {
2087+
WARN("RCCL: conflicting preset Rome topology model index on this node (already %d, attempted %d).",
2088+
system->romeTopoModelIdx, idx);
2089+
return ncclInvalidUsage;
2090+
}
2091+
return ncclSuccess;
2092+
}
2093+
20832094
static bool checkOption(const char *options, const char *name) {
20842095
if (strcmp(options, "")) {
20852096
char *str_temp = (char *)malloc(strlen(options) + 1);
@@ -2666,6 +2677,10 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
26662677
break;
26672678
}
26682679

2680+
if (graph->nChannels) {
2681+
NCCLCHECK(rcclTopoSetPresetRomeModelIdx(system, i));
2682+
}
2683+
26692684
// clean up
26702685
free(all_gpu_permutations);
26712686
return ncclSuccess;
@@ -2833,6 +2848,9 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
28332848
applyNetOverride(system, romeTopoModels[i].options);
28342849
break;
28352850
}
2851+
if (graph->nChannels) {
2852+
NCCLCHECK(rcclTopoSetPresetRomeModelIdx(system, i));
2853+
}
28362854
return ncclSuccess;
28372855
}
28382856

@@ -2972,6 +2990,9 @@ ncclResult_t parse1H16P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
29722990
NCCLCHECK(parseGraph(romeTopoModels[i].ringBase, system, graph, rdm, nnets > 1 ? n : NULL, false));
29732991

29742992
if (romeTopoModels[i].treeBase != nullptr) NCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, rdm));
2993+
if (graph->nChannels) {
2994+
NCCLCHECK(rcclTopoSetPresetRomeModelIdx(system, i));
2995+
}
29752996
// clean up
29762997
free(all_gpu_permutations);
29772998
return ncclSuccess;
@@ -3092,6 +3113,9 @@ ncclResult_t parse4H4P(struct ncclTopoSystem* system, struct ncclTopoGraph* grap
30923113
parseOptions(system, rome_model_68.options);
30933114
// create 4P4H based on reference and remapped ids
30943115
NCCLCHECK(parseGraph(rome_model_68.ringBase, system, graph, rdm, n_hives.data(), false));
3116+
if (graph->nChannels) {
3117+
NCCLCHECK(rcclTopoSetPresetRomeModelIdx(system, RCCL_ROME_TOPO_PRESET_MODEL_IDX_4H4P));
3118+
}
30953119
return ncclSuccess;
30963120
}
30973121

@@ -3170,6 +3194,9 @@ ncclResult_t parseGIOTopos(struct ncclTopoSystem* system, struct ncclTopoGraph*
31703194
system->type |= RCCL_TOPO_4P2H_ROME;
31713195

31723196
NCCLCHECKGOTO(parseGraph(gio16gColumbaModel.ringBase, system, graph, rdm, NULL, false), r, exit);
3197+
if (graph->nChannels) {
3198+
NCCLCHECK(rcclTopoSetPresetRomeModelIdx(system, RCCL_ROME_TOPO_PRESET_MODEL_IDX_GIO_COLUMBA));
3199+
}
31733200

31743201
exit:
31753202
return ncclSuccess;
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*************************************************************************
2+
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#include "graph/rome_topo_consensus.h"
8+
#include "debug.h"
9+
#include <unordered_map>
10+
11+
#if defined(__GNUC__) || defined(__clang__)
12+
__attribute__((visibility("default")))
13+
#endif
14+
ncclResult_t rcclCheckRomeTopoModelIdxConsensus(
15+
int nranks,
16+
std::function<int(int)> getRomeTopoModelIdx,
17+
std::function<const char*(int)> getHostname,
18+
std::function<uint64_t(int)> getHostHash) {
19+
if (nranks <= 0) return ncclSuccess;
20+
21+
std::unordered_map<int, std::pair<int, int>> tallies; // modelIdx -> (vote count, first rank)
22+
tallies.reserve(nranks);
23+
for (int r = 0; r < nranks; r++) {
24+
int v = getRomeTopoModelIdx(r);
25+
auto it = tallies.find(v);
26+
if (it == tallies.end()) {
27+
tallies.emplace(v, std::make_pair(1, r));
28+
} else {
29+
it->second.first++;
30+
}
31+
}
32+
int refIdx = getRomeTopoModelIdx(0);
33+
int refVotes = -1;
34+
int refFirstRank = nranks;
35+
for (const auto& e : tallies) {
36+
int cnt = e.second.first;
37+
int firstRank = e.second.second;
38+
if (cnt > refVotes || (cnt == refVotes && firstRank < refFirstRank)) {
39+
refVotes = cnt;
40+
refIdx = e.first;
41+
refFirstRank = firstRank;
42+
}
43+
}
44+
if (tallies.size() == 1) return ncclSuccess;
45+
int nDisagree = 0;
46+
for (int r = 0; r < nranks; r++) {
47+
if (getRomeTopoModelIdx(r) != refIdx) nDisagree++;
48+
}
49+
if (nDisagree > 0) {
50+
WARN("RCCL FATAL: mismatched Rome preset topology model index across ranks; all ranks must agree for precomputed graphs (voted refIdx %d from %d of %d ranks).", refIdx, refVotes, nranks);
51+
std::unordered_map<uint64_t, int> lowestMismatchRankByHost;
52+
lowestMismatchRankByHost.reserve(nranks);
53+
for (int r = 0; r < nranks; r++) {
54+
if (getRomeTopoModelIdx(r) == refIdx) continue;
55+
uint64_t h = getHostHash(r);
56+
if (lowestMismatchRankByHost.find(h) == lowestMismatchRankByHost.end()) {
57+
lowestMismatchRankByHost.emplace(h, r);
58+
}
59+
}
60+
for (int r = 0; r < nranks; r++) {
61+
if (getRomeTopoModelIdx(r) == refIdx) continue;
62+
uint64_t h = getHostHash(r);
63+
if (lowestMismatchRankByHost[h] != r) continue;
64+
WARN(" rank %d host %s romeTopoModelIdx=%d", r, getHostname(r), getRomeTopoModelIdx(r));
65+
}
66+
return ncclInvalidUsage;
67+
}
68+
return ncclSuccess;
69+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
/*************************************************************************
2+
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#ifndef RCCL_GRAPH_ROME_TOPO_CONSENSUS_H_
8+
#define RCCL_GRAPH_ROME_TOPO_CONSENSUS_H_
9+
10+
#include "nccl.h"
11+
#include <functional>
12+
13+
/* Plurality vote on romeTopoModelIdx across ranks; all ranks must match the winner.
14+
* Callers supply getters so this stays independent of ncclComm / allGatherInfo. */
15+
ncclResult_t rcclCheckRomeTopoModelIdxConsensus(
16+
int nranks,
17+
std::function<int(int /*rank*/)> getRomeTopoModelIdx,
18+
std::function<const char*(int /*rank*/)> getHostname,
19+
std::function<uint64_t(int /*rank*/)> getHostHash);
20+
21+
#endif

projects/rccl/src/graph/topo.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -757,6 +757,7 @@ ncclResult_t ncclTopoAddC2c(struct ncclXmlNode* node, struct ncclTopoSystem* sys
757757
ncclResult_t ncclTopoGetSystemFromXml(struct ncclXml* xml, struct ncclTopoSystem** topoSystem, const uint64_t localHostHash) {
758758
NCCLCHECK(ncclCalloc(topoSystem, 1));
759759
struct ncclTopoSystem* system = *topoSystem;
760+
system->romeTopoModelIdx = RCCL_ROME_TOPO_PRESET_MODEL_IDX_NONE;
760761
struct ncclXmlNode* topNode;
761762
NCCLCHECK(xmlFindTag(xml, "system", &topNode));
762763
for (int s=0; s<topNode->nSubs; s++) {

projects/rccl/src/graph/topo.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,12 @@ struct ncclTopoLinkList {
139139
#define RCCL_TOPO_FORCE_INTRA 16
140140
#define RCCL_TOPO_XGMI_ALL 32
141141

142+
/* Rome preset graph: index into romeTopoModels[] in rome_models.cc, or sentinel below. */
143+
#define RCCL_ROME_TOPO_PRESET_MODEL_IDX_NONE (-1)
144+
#define RCCL_ROME_TOPO_PRESET_MODEL_IDX_GIO_COLUMBA (1000000)
145+
/* parse4H4P() applies rome_model_68 directly; this tags the preset, not romeTopoModels[]. */
146+
#define RCCL_ROME_TOPO_PRESET_MODEL_IDX_4H4P (1000001)
147+
142148

143149
#define GCN_ARCH_NAME_LEN 16
144150

@@ -215,6 +221,8 @@ struct ncclTopoSystem {
215221
// [RCCL] Track hostIdx to support rail-optimized rings/trees
216222
int hostIdx;
217223
bool useRailOptimizedTrees;
224+
/* RCCL Rome / GIO preset: RCCL_ROME_TOPO_PRESET_MODEL_IDX_* sentinels or romeTopoModels[] index */
225+
int romeTopoModelIdx;
218226
};
219227

220228
ncclResult_t ncclTopoGetNode(struct ncclTopoSystem* system, struct ncclTopoNode** node, int type, uint64_t id);

projects/rccl/src/init.cc

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,14 @@
4040
#include <sys/resource.h>
4141
#include <unistd.h>
4242
#include "graph/topo.h"
43+
#include "graph/rome_topo_consensus.h"
4344
#include "graph/xml.h"
4445
#include "archinfo.h"
4546
#include "param.h"
4647
#include "nvtx_payload_schemas.h"
4748
#include "utils.h"
4849
#include <mutex>
50+
#include <unordered_map>
4951
#include "ce_coll.h"
5052
#include "nvtx.h"
5153

@@ -1245,6 +1247,28 @@ static ncclResult_t initNvlDomainInfo(struct ncclComm* comm) {
12451247
return ncclSuccess;
12461248
}
12471249

1250+
// True when every host has the same number of ranks and those partitions sum to nranks.
1251+
static bool uniformRanksPerHost(const struct ncclComm* comm, int nranks) {
1252+
int ranksPerHost = -1;
1253+
int total = 0;
1254+
for (int i = 0; i < nranks; i++) {
1255+
uint64_t h = comm->peerInfo[i].hostHash;
1256+
bool lowestRankOnHost = true;
1257+
for (int j = 0; j < i; j++) {
1258+
if (comm->peerInfo[j].hostHash == h) { lowestRankOnHost = false; break; }
1259+
}
1260+
if (!lowestRankOnHost) continue;
1261+
int cnt = 0;
1262+
for (int j = 0; j < nranks; j++) {
1263+
if (comm->peerInfo[j].hostHash == h) cnt++;
1264+
}
1265+
total += cnt;
1266+
if (ranksPerHost < 0) ranksPerHost = cnt;
1267+
else if (cnt != ranksPerHost) return false;
1268+
}
1269+
return total == nranks && ranksPerHost > 0;
1270+
}
1271+
12481272
static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* parent, uint64_t timers[TIMERS_INIT_COUNT]) {
12491273
// We use 2 AllGathers
12501274
// 1. { peerInfo, comm, compCap}
@@ -1279,8 +1303,10 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
12791303
int cpuVendor;
12801304
int localRanks;
12811305
int nc;
1306+
int romeTopoModelIdx;
12821307
bool pivotA2AEnabled;
12831308
bool ll128Enabled;
1309+
char hostname[128];
12841310
};
12851311

12861312
int nChannelsOrig;
@@ -1656,6 +1682,8 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
16561682

16571683
allGather3Data[rank].cpuArch = comm->cpuArch;
16581684
allGather3Data[rank].cpuVendor = comm->cpuVendor;
1685+
allGather3Data[rank].romeTopoModelIdx = comm->topo->romeTopoModelIdx;
1686+
(void) getHostName(allGather3Data[rank].hostname, sizeof(allGather3Data[rank].hostname), '\0');
16591687

16601688
comm->nChannels = std::min(treeGraph->nChannels, ringGraph->nChannels);
16611689

@@ -1667,6 +1695,15 @@ static ncclResult_t initTransportsRank(struct ncclComm* comm, struct ncclComm* p
16671695

16681696
NCCLCHECKGOTO(bootstrapAllGather(comm->bootstrap, allGather3Data, sizeof(*allGather3Data)), ret, fail);
16691697

1698+
if (uniformRanksPerHost(comm, nranks)) {
1699+
NCCLCHECKGOTO(rcclCheckRomeTopoModelIdxConsensus(
1700+
nranks,
1701+
[&](int r) { return allGather3Data[r].romeTopoModelIdx; },
1702+
[&](int r) { return allGather3Data[r].hostname; },
1703+
[&](int r) { return comm->peerInfo[r].hostHash; }),
1704+
ret, fail);
1705+
}
1706+
16701707
// Determine nNodes, firstRanks, ...
16711708
NCCLCHECKGOTO(ncclCalloc(&nodesFirstRank, nranks), ret, fail);
16721709
NCCLCHECKGOTO(ncclCalloc(&nodesTreePatterns, nranks), ret, fail);

projects/rccl/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ if(BUILD_TESTS)
199199

200200
set(TEST_FIXTURE_SOURCE_FILES
201201
BitOpsTests.cpp
202+
RomeTopoConsensusTests.cpp
202203
CommTests.cpp
203204
EnqueueCountTests.cpp
204205
device/TestOp128.cpp
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/*************************************************************************
2+
* Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
3+
*
4+
* See LICENSE.txt for license information
5+
************************************************************************/
6+
7+
#include "graph/rome_topo_consensus.h"
8+
#include "gtest/gtest.h"
9+
#include <array>
10+
11+
namespace RcclUnitTesting {
12+
13+
TEST(RomeTopoConsensus, emptyRanks) {
14+
EXPECT_EQ(rcclCheckRomeTopoModelIdxConsensus(
15+
0,
16+
[](int) { return 0; },
17+
[](int) { return ""; },
18+
[](int) { return 0ULL; }),
19+
ncclSuccess);
20+
}
21+
22+
TEST(RomeTopoConsensus, allSameModelIdx) {
23+
constexpr int n = 4;
24+
std::array<int, n> idx{{7, 7, 7, 7}};
25+
std::array<const char*, n> names{{"a", "a", "b", "b"}};
26+
std::array<uint64_t, n> hosts{{1, 1, 2, 2}};
27+
EXPECT_EQ(rcclCheckRomeTopoModelIdxConsensus(
28+
n,
29+
[&](int r) { return idx[r]; },
30+
[&](int r) { return names[r]; },
31+
[&](int r) { return hosts[r]; }),
32+
ncclSuccess);
33+
}
34+
35+
TEST(RomeTopoConsensus, mismatchFails) {
36+
constexpr int n = 2;
37+
std::array<int, n> idx{{1, 2}};
38+
std::array<const char*, n> names{{"h0", "h1"}};
39+
std::array<uint64_t, n> hosts{{10, 11}};
40+
EXPECT_EQ(rcclCheckRomeTopoModelIdxConsensus(
41+
n,
42+
[&](int r) { return idx[r]; },
43+
[&](int r) { return names[r]; },
44+
[&](int r) { return hosts[r]; }),
45+
ncclInvalidUsage);
46+
}
47+
48+
TEST(RomeTopoConsensus, tieBreakLowerFirstRankWinsThenMinorityFails) {
49+
// idx 2 at ranks 0,1; idx 3 at ranks 2,3 -> plurality tie; first occurrence rank 0 -> refIdx 2
50+
constexpr int n = 4;
51+
std::array<int, n> idx{{2, 2, 3, 3}};
52+
std::array<const char*, n> names{{"a", "a", "a", "a"}};
53+
std::array<uint64_t, n> hosts{{1, 1, 1, 1}};
54+
EXPECT_EQ(rcclCheckRomeTopoModelIdxConsensus(
55+
n,
56+
[&](int r) { return idx[r]; },
57+
[&](int r) { return names[r]; },
58+
[&](int r) { return hosts[r]; }),
59+
ncclInvalidUsage);
60+
}
61+
62+
} // namespace RcclUnitTesting

projects/rccl/tools/topo_expl/include/topo_expl_impl.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ struct allGatherInfo {
2525
struct graphInfo graphInfo[NCCL_NUM_ALGORITHMS];
2626
struct ncclTopoRanks topoRanks;
2727
int nc;
28+
int romeTopoModelIdx;
2829
bool pivotA2AEnabled;
2930
bool ll128Enabled;
3031
bool mscclEnabled;

0 commit comments

Comments
 (0)