From 89748f1ff6c6d0d5b02393dabf1ad83e3effbf77 Mon Sep 17 00:00:00 2001
From: JackieTien97
Date: Sun, 17 May 2026 08:49:41 +0800
Subject: [PATCH 1/3] Shard Windows IT jobs to speed up 1C1D and Table 1C1D CI

The Windows runners for Cluster IT - 1C1D and Table Cluster IT - 1C1D are
67-77% slower than their Ubuntu counterparts, making them the bottleneck
of the entire PR check pipeline (87 min and 65 min wall clock
respectively).

Split each pipeline's Windows job into 3 parallel matrix shards:
- LocalStandaloneIT test classes (276) split for Cluster IT - 1C1D
- TableLocalStandaloneIT test classes (231) split for Table Cluster IT - 1C1D

Each shard uses failsafe.includesFile reading from a generated file,
avoiding command-line length limits regardless of how the test suite
grows. Ubuntu jobs stay as a single job since they were already fast
enough.

Expected wall clock reduction:
- Cluster IT - 1C1D: 87 min -> ~49 min (capped by Ubuntu)
- Table Cluster IT - 1C1D: 65 min -> ~39 min (capped by Ubuntu)
---
 .github/workflows/cluster-it-1c1d.yml       | 78 +++++++++++++++++----
 .github/workflows/table-cluster-it-1c1d.yml | 78 +++++++++++++++++----
 2 files changed, 126 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/cluster-it-1c1d.yml b/.github/workflows/cluster-it-1c1d.yml
index 4ab201450e695..477cf33e21ea1 100644
--- a/.github/workflows/cluster-it-1c1d.yml
+++ b/.github/workflows/cluster-it-1c1d.yml
@@ -30,13 +30,52 @@ env:
   DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}

 jobs:
-  Simple:
+  # Ubuntu runs all ITs in a single job (already fast at ~49 min)
+  Ubuntu:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v5
+      - name: Set up JDK
+        uses: actions/setup-java@v5
+        with:
+          distribution: corretto
+          java-version: 17
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Cache Maven packages
+        uses: actions/cache@v5
+        with:
+          path: ~/.m2
+          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
+          restore-keys: ${{ runner.os }}-m2-
+      - name: Adjust Linux kernel somaxconn
+        shell: bash
+        run: sudo sysctl -w net.core.somaxconn=65535
+      - name: IT/UT Test
+        shell: bash
+        run: |
+          mvn clean verify \
+          -P with-integration-tests \
+          -DskipUTs \
+          -DintegrationTest.forkCount=2 \
+          -pl integration-test \
+          -am
+      - name: Upload Artifact
+        if: failure()
+        uses: actions/upload-artifact@v6
+        with:
+          name: standalone-log-Linux
+          path: integration-test/target/cluster-logs
+          retention-days: 1
+
+  # Windows is ~77% slower than Ubuntu, so split into 3 shards to parallelize
+  Windows:
     strategy:
       fail-fast: false
-      max-parallel: 15
       matrix:
-        os: [ubuntu-latest, windows-latest]
-    runs-on: ${{ matrix.os }}
+        shard: [0, 1, 2]
+    runs-on: windows-latest

     steps:
       - uses: actions/checkout@v5
@@ -54,36 +93,45 @@ jobs:
           key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
           restore-keys: ${{ runner.os }}-m2-
       - name: Adjust network dynamic TCP ports range
-        if: ${{ runner.os == 'Windows' }}
         shell: pwsh
         run: |
           netsh int ipv4 set dynamicport tcp start=32768 num=32768
           netsh int ipv4 set dynamicport udp start=32768 num=32768
           netsh int ipv6 set dynamicport tcp start=32768 num=32768
           netsh int ipv6 set dynamicport udp start=32768 num=32768
-      - name: Adjust Linux kernel somaxconn
-        if: ${{ runner.os == 'Linux' }}
+      - name: Build IT shard list
         shell: bash
-        run: sudo sysctl -w net.core.somaxconn=65535
-      # - name: Adjust Mac kernel somaxconn
-      #   if: ${{ runner.os == 'macOS' }}
-      #   shell: bash
-      #   run: sudo sysctl -w kern.ipc.somaxconn=65535
+        # Distribute LocalStandaloneIT test classes across 3 shards round-robin over the sorted class list.
+        # The list is written to a file so failsafe.includesFile can read it without command-line length limits.
+        run: |
+          set -euo pipefail
+          SHARD=${{ matrix.shard }}
+          TOTAL=3
+          mkdir -p integration-test
+          find integration-test/src/test/java -name '*IT.java' -print0 \
+            | xargs -0 grep -lE '\bLocalStandaloneIT\b' \
+            | awk -F'/' '{print $NF}' | sed 's/\.java$//' \
+            | sort \
+            | awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \
+            > integration-test/it-shard.txt
+          echo "Shard $SHARD/$TOTAL contains $(wc -l < integration-test/it-shard.txt) test classes"
+          head -5 integration-test/it-shard.txt
       - name: IT/UT Test
         shell: bash
-        # we do not compile client-cpp for saving time, it is tested in client.yml
-        # we can skip influxdb-protocol because it has been tested separately in influxdb-protocol.yml
         run: |
           mvn clean verify \
           -P with-integration-tests \
           -DskipUTs \
           -DintegrationTest.forkCount=2 \
+          -Dfailsafe.includesFile="$(pwd)/integration-test/it-shard.txt" \
+          -DfailIfNoTests=false \
+          -Dfailsafe.failIfNoSpecifiedTests=false \
           -pl integration-test \
           -am
       - name: Upload Artifact
         if: failure()
         uses: actions/upload-artifact@v6
         with:
-          name: standalone-log-java${{ matrix.java }}-${{ runner.os }}
+          name: standalone-log-Windows-shard${{ matrix.shard }}
           path: integration-test/target/cluster-logs
           retention-days: 1
diff --git a/.github/workflows/table-cluster-it-1c1d.yml b/.github/workflows/table-cluster-it-1c1d.yml
index 782bafa4ddbe6..149b6a3fd2732 100644
--- a/.github/workflows/table-cluster-it-1c1d.yml
+++ b/.github/workflows/table-cluster-it-1c1d.yml
@@ -31,13 +31,52 @@ env:
   DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}

 jobs:
-  Simple:
+  # Ubuntu runs all ITs in a single job (already fast at ~39 min)
+  Ubuntu:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v5
+      - name: Set up JDK
+        uses: actions/setup-java@v5
+        with:
+          distribution: corretto
+          java-version: 17
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Cache Maven packages
+        uses: actions/cache@v5
+        with:
+          path: ~/.m2
+          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
+          restore-keys: ${{ runner.os }}-m2-
+      - name: Adjust Linux kernel somaxconn
+        shell: bash
+        run: sudo sysctl -w net.core.somaxconn=65535
+      - name: IT/UT Test
+        shell: bash
+        run: |
+          mvn clean verify \
+          -P with-integration-tests \
+          -DskipUTs \
+          -DintegrationTest.forkCount=2 -DDataNodeMaxHeapSize=1024 \
+          -pl integration-test \
+          -am -PTableSimpleIT
+      - name: Upload Artifact
+        if: failure()
+        uses: actions/upload-artifact@v6
+        with:
+          name: table-standalone-log-Linux
+          path: integration-test/target/cluster-logs
+          retention-days: 1
+
+  # Windows is ~67% slower than Ubuntu, so split into 3 shards to parallelize
+  Windows:
     strategy:
       fail-fast: false
-      max-parallel: 15
       matrix:
-        os: [ubuntu-latest, windows-latest]
-    runs-on: ${{ matrix.os }}
+        shard: [0, 1, 2]
+    runs-on: windows-latest

     steps:
       - uses: actions/checkout@v5
@@ -55,36 +94,45 @@ jobs:
           key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
           restore-keys: ${{ runner.os }}-m2-
       - name: Adjust network dynamic TCP ports range
-        if: ${{ runner.os == 'Windows' }}
         shell: pwsh
         run: |
           netsh int ipv4 set dynamicport tcp start=32768 num=32768
           netsh int ipv4 set dynamicport udp start=32768 num=32768
           netsh int ipv6 set dynamicport tcp start=32768 num=32768
           netsh int ipv6 set dynamicport udp start=32768 num=32768
-      - name: Adjust Linux kernel somaxconn
-        if: ${{ runner.os == 'Linux' }}
+      - name: Build IT shard list
         shell: bash
-        run: sudo sysctl -w net.core.somaxconn=65535
-      # - name: Adjust Mac kernel somaxconn
-      #   if: ${{ runner.os == 'macOS' }}
-      #   shell: bash
-      #   run: sudo sysctl -w kern.ipc.somaxconn=65535
+        # Distribute TableLocalStandaloneIT test classes across 3 shards round-robin over the sorted class list.
+        # The list is written to a file so failsafe.includesFile can read it without command-line length limits.
+        run: |
+          set -euo pipefail
+          SHARD=${{ matrix.shard }}
+          TOTAL=3
+          mkdir -p integration-test
+          find integration-test/src/test/java -name '*IT.java' -print0 \
+            | xargs -0 grep -l 'TableLocalStandaloneIT' \
+            | awk -F'/' '{print $NF}' | sed 's/\.java$//' \
+            | sort \
+            | awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \
+            > integration-test/it-shard.txt
+          echo "Shard $SHARD/$TOTAL contains $(wc -l < integration-test/it-shard.txt) test classes"
+          head -5 integration-test/it-shard.txt
       - name: IT/UT Test
         shell: bash
-        # we do not compile client-cpp for saving time, it is tested in client.yml
-        # we can skip influxdb-protocol because it has been tested separately in influxdb-protocol.yml
         run: |
           mvn clean verify \
           -P with-integration-tests \
           -DskipUTs \
           -DintegrationTest.forkCount=2 -DDataNodeMaxHeapSize=1024 \
+          -Dfailsafe.includesFile="$(pwd)/integration-test/it-shard.txt" \
+          -DfailIfNoTests=false \
+          -Dfailsafe.failIfNoSpecifiedTests=false \
           -pl integration-test \
           -am -PTableSimpleIT
       - name: Upload Artifact
         if: failure()
         uses: actions/upload-artifact@v6
         with:
-          name: table-standalone-log-java${{ matrix.java }}-${{ runner.os }}
+          name: table-standalone-log-Windows-shard${{ matrix.shard }}
           path: integration-test/target/cluster-logs
           retention-days: 1

From a343cf50e32faee0bf6318818ffa1a5f8ec61411 Mon Sep 17 00:00:00 2001
From: JackieTien97
Date: Sun, 17 May 2026 08:55:29 +0800
Subject: [PATCH 2/3] Replace find | xargs grep with grep -rl to avoid xargs
 exit 123 on Windows

On Windows Git Bash ARG_MAX is much smaller than on Linux, so `xargs -0`
splits the file list into many batches. Batches with no matching files
make grep return 1, which makes xargs return 123, and `set -o pipefail`
turns that into a hard failure for the whole shard step.

Replace the pipeline with a single `grep -rl --include='*IT.java'` call.
That uses one grep invocation, so its exit status reflects whether any
match was found across the entire tree (the status is always 0 here,
because matching test classes exist).

Local counts on macOS confirm the logic is preserved:
- LocalStandaloneIT: 276 classes
- TableLocalStandaloneIT: 231 classes
---
 .github/workflows/cluster-it-1c1d.yml       | 6 ++++--
 .github/workflows/table-cluster-it-1c1d.yml | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/cluster-it-1c1d.yml b/.github/workflows/cluster-it-1c1d.yml
index 477cf33e21ea1..bd5e3eaf825f2 100644
--- a/.github/workflows/cluster-it-1c1d.yml
+++ b/.github/workflows/cluster-it-1c1d.yml
@@ -108,8 +108,10 @@ jobs:
           SHARD=${{ matrix.shard }}
           TOTAL=3
           mkdir -p integration-test
-          find integration-test/src/test/java -name '*IT.java' -print0 \
-            | xargs -0 grep -lE '\bLocalStandaloneIT\b' \
+          # Use a single grep -rl call instead of `find | xargs grep`: on Windows Git Bash,
+          # ARG_MAX is small so xargs batches the file list, and any batch with no matches
+          # makes grep exit 1, which makes xargs exit 123 and trips `set -o pipefail`.
+          grep -rlE --include='*IT.java' '\bLocalStandaloneIT\b' integration-test/src/test/java \
             | awk -F'/' '{print $NF}' | sed 's/\.java$//' \
             | sort \
             | awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \
diff --git a/.github/workflows/table-cluster-it-1c1d.yml b/.github/workflows/table-cluster-it-1c1d.yml
index 149b6a3fd2732..e4e0ada6d625e 100644
--- a/.github/workflows/table-cluster-it-1c1d.yml
+++ b/.github/workflows/table-cluster-it-1c1d.yml
@@ -109,8 +109,10 @@ jobs:
           SHARD=${{ matrix.shard }}
           TOTAL=3
           mkdir -p integration-test
-          find integration-test/src/test/java -name '*IT.java' -print0 \
-            | xargs -0 grep -l 'TableLocalStandaloneIT' \
+          # Use a single grep -rl call instead of `find | xargs grep`: on Windows Git Bash,
+          # ARG_MAX is small so xargs batches the file list, and any batch with no matches
+          # makes grep exit 1, which makes xargs exit 123 and trips `set -o pipefail`.
+          grep -rl --include='*IT.java' 'TableLocalStandaloneIT' integration-test/src/test/java \
             | awk -F'/' '{print $NF}' | sed 's/\.java$//' \
             | sort \
             | awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \

From 02ef20af29529c8cc39ca51620cbb1444b06bca6 Mon Sep 17 00:00:00 2001
From: JackieTien97
Date: Sun, 17 May 2026 09:29:04 +0800
Subject: [PATCH 3/3] Write IT shard file to $RUNNER_TEMP so Apache RAT doesn't
 flag it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous attempt wrote the generated shard list to
integration-test/it-shard.txt. That path is inside the repo and not
covered by the root pom.xml's RAT excludes (which only exclude
**/target/**), so the license check started warning:

  Files with unapproved licenses:
    D:/a/iotdb/iotdb/integration-test/it-shard.txt

We can't use a target/ subdirectory because `mvn clean verify` wipes it
before our shard file would be read. Instead, write the file to
$RUNNER_TEMP/it-shard.txt — the runner-scoped tmp dir is outside the
repository entirely, so RAT never sees it.

Update both -Dfailsafe.includesFile invocations to match.
---
 .github/workflows/cluster-it-1c1d.yml       | 12 ++++++------
 .github/workflows/table-cluster-it-1c1d.yml | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/cluster-it-1c1d.yml b/.github/workflows/cluster-it-1c1d.yml
index bd5e3eaf825f2..caf1de91c6827 100644
--- a/.github/workflows/cluster-it-1c1d.yml
+++ b/.github/workflows/cluster-it-1c1d.yml
@@ -107,17 +107,17 @@ jobs:
           set -euo pipefail
           SHARD=${{ matrix.shard }}
           TOTAL=3
-          mkdir -p integration-test
-          # Use a single grep -rl call instead of `find | xargs grep`: on Windows Git Bash,
+          # Write outside the repo so Apache RAT (license check) doesn't flag the file.
+          # Using a single grep -rl call instead of `find | xargs grep`: on Windows Git Bash,
           # ARG_MAX is small so xargs batches the file list, and any batch with no matches
           # makes grep exit 1, which makes xargs exit 123 and trips `set -o pipefail`.
           grep -rlE --include='*IT.java' '\bLocalStandaloneIT\b' integration-test/src/test/java \
             | awk -F'/' '{print $NF}' | sed 's/\.java$//' \
             | sort \
             | awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \
-            > integration-test/it-shard.txt
-          echo "Shard $SHARD/$TOTAL contains $(wc -l < integration-test/it-shard.txt) test classes"
-          head -5 integration-test/it-shard.txt
+            > "$RUNNER_TEMP/it-shard.txt"
+          echo "Shard $SHARD/$TOTAL contains $(wc -l < "$RUNNER_TEMP/it-shard.txt") test classes"
+          head -5 "$RUNNER_TEMP/it-shard.txt"
       - name: IT/UT Test
         shell: bash
         run: |
@@ -125,7 +125,7 @@ jobs:
           -P with-integration-tests \
           -DskipUTs \
           -DintegrationTest.forkCount=2 \
-          -Dfailsafe.includesFile="$(pwd)/integration-test/it-shard.txt" \
+          -Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
           -DfailIfNoTests=false \
           -Dfailsafe.failIfNoSpecifiedTests=false \
           -pl integration-test \
diff --git a/.github/workflows/table-cluster-it-1c1d.yml b/.github/workflows/table-cluster-it-1c1d.yml
index e4e0ada6d625e..a0058fe3219a4 100644
--- a/.github/workflows/table-cluster-it-1c1d.yml
+++ b/.github/workflows/table-cluster-it-1c1d.yml
@@ -108,17 +108,17 @@ jobs:
           set -euo pipefail
           SHARD=${{ matrix.shard }}
           TOTAL=3
-          mkdir -p integration-test
-          # Use a single grep -rl call instead of `find | xargs grep`: on Windows Git Bash,
+          # Write outside the repo so Apache RAT (license check) doesn't flag the file.
+          # Using a single grep -rl call instead of `find | xargs grep`: on Windows Git Bash,
           # ARG_MAX is small so xargs batches the file list, and any batch with no matches
           # makes grep exit 1, which makes xargs exit 123 and trips `set -o pipefail`.
           grep -rl --include='*IT.java' 'TableLocalStandaloneIT' integration-test/src/test/java \
             | awk -F'/' '{print $NF}' | sed 's/\.java$//' \
             | sort \
             | awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \
-            > integration-test/it-shard.txt
-          echo "Shard $SHARD/$TOTAL contains $(wc -l < integration-test/it-shard.txt) test classes"
-          head -5 integration-test/it-shard.txt
+            > "$RUNNER_TEMP/it-shard.txt"
+          echo "Shard $SHARD/$TOTAL contains $(wc -l < "$RUNNER_TEMP/it-shard.txt") test classes"
+          head -5 "$RUNNER_TEMP/it-shard.txt"
       - name: IT/UT Test
         shell: bash
         run: |
@@ -126,7 +126,7 @@ jobs:
           -P with-integration-tests \
           -DskipUTs \
           -DintegrationTest.forkCount=2 -DDataNodeMaxHeapSize=1024 \
-          -Dfailsafe.includesFile="$(pwd)/integration-test/it-shard.txt" \
+          -Dfailsafe.includesFile="$RUNNER_TEMP/it-shard.txt" \
           -DfailIfNoTests=false \
           -Dfailsafe.failIfNoSpecifiedTests=false \
           -pl integration-test \