From eb38dd3d91ecfad90ca3ba0ca8bdf696c33bcf19 Mon Sep 17 00:00:00 2001 From: SS-JIA Date: Thu, 28 May 2026 11:38:10 -0400 Subject: [PATCH] CI: identify hanging tests in macOS unittest job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: The macOS unittest job has been hitting its timeout for several runs in a row with the progress bar frozen partway through pytest. Add `pytest-timeout` so a stuck test fails with a nodeid and per-thread stack trace, and set `faulthandler_timeout=180` so an xdist worker dumps all of its threads once any single test has been running for more than 3 minutes — this surfaces the hung test as it develops, not just at termination. Drop the job timeout to 30 minutes so we can iterate quickly on this diagnostic loop. Co-Authored-By: Claude --- .ci/docker/requirements-ci.txt | 1 + .ci/scripts/unittest-macos-cmake.sh | 9 +++++++-- .github/workflows/_unittest.yml | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index c82882d56e6..a29232c6386 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -11,6 +11,7 @@ zstd==1.5.5.1 pandas>=2.2.2; python_version >= '3.10' pytest==7.2.0 pytest-cov==4.1.0 +pytest-timeout==2.2.0 expecttest==0.1.6 hypothesis==6.84.2 parameterized==0.9.0 diff --git a/.ci/scripts/unittest-macos-cmake.sh b/.ci/scripts/unittest-macos-cmake.sh index 43eb1f21c3c..03ddb3067b3 100755 --- a/.ci/scripts/unittest-macos-cmake.sh +++ b/.ci/scripts/unittest-macos-cmake.sh @@ -12,8 +12,13 @@ set -eux export TORCHINDUCTOR_CACHE_DIR="$(mktemp -d "${RUNNER_TEMP:-/tmp}/torchinductor_cache_XXXXXX")" trap 'rm -rf "${TORCHINDUCTOR_CACHE_DIR}"' EXIT -# Run pytest with coverage -${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml +# Run pytest with coverage. 
--timeout surfaces hung tests with a thread dump +# and faulthandler_timeout makes a worker dump all of its threads once a single +# test has been running that long, so we can see which test is dragging before +# it trips the hard timeout. +${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml \ + --timeout=600 --timeout-method=thread \ + -o faulthandler_timeout=180 # Run gtest LLVM_PROFDATA="xcrun llvm-profdata" LLVM_COV="xcrun llvm-cov" \ ${CONDA_RUN} test/run_oss_cpp_tests.sh diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index 15c87bd79e4..e00c3812adc 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -49,6 +49,7 @@ jobs: python-version: '3.11' submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 30 script: | set -eux # This is needed to get the prebuilt PyTorch wheel from S3