diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index c82882d56e6..a29232c6386 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -11,6 +11,7 @@ zstd==1.5.5.1 pandas>=2.2.2; python_version >= '3.10' pytest==7.2.0 pytest-cov==4.1.0 +pytest-timeout==2.2.0 expecttest==0.1.6 hypothesis==6.84.2 parameterized==0.9.0 diff --git a/.ci/scripts/unittest-macos-cmake.sh b/.ci/scripts/unittest-macos-cmake.sh index 43eb1f21c3c..8c1d5df0879 100755 --- a/.ci/scripts/unittest-macos-cmake.sh +++ b/.ci/scripts/unittest-macos-cmake.sh @@ -12,8 +12,15 @@ set -eux export TORCHINDUCTOR_CACHE_DIR="$(mktemp -d "${RUNNER_TEMP:-/tmp}/torchinductor_cache_XXXXXX")" trap 'rm -rf "${TORCHINDUCTOR_CACHE_DIR}"' EXIT -# Run pytest with coverage -${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml +# EXPERIMENT: drop xdist (`-n 1`) on macOS to test whether AOTI hangs are +# caused by parallel-worker contention (clang/ld, dlopen lock, libomp +# oversubscription) rather than a true deadlock. AOTI skips removed so we +# can observe whether the previously-hung tests now pass serially. +# --timeout surfaces hung tests with a thread dump and faulthandler_timeout +# periodically dumps every worker's threads while tests are still running. +${CONDA_RUN} pytest -n 1 --cov=./ --cov-report=xml \ + --timeout=1500 --timeout-method=thread \ + -o faulthandler_timeout=180 # Run gtest LLVM_PROFDATA="xcrun llvm-profdata" LLVM_COV="xcrun llvm-cov" \ ${CONDA_RUN} test/run_oss_cpp_tests.sh diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index 15c87bd79e4..e63a6bc518c 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -49,6 +49,7 @@ jobs: python-version: '3.11' submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 90 script: | set -eux # This is needed to get the prebuilt PyTorch wheel from S3