diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index c82882d56e6..a29232c6386 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -11,6 +11,7 @@ zstd==1.5.5.1 pandas>=2.2.2; python_version >= '3.10' pytest==7.2.0 pytest-cov==4.1.0 +pytest-timeout==2.2.0 expecttest==0.1.6 hypothesis==6.84.2 parameterized==0.9.0 diff --git a/.ci/scripts/unittest-macos-cmake.sh b/.ci/scripts/unittest-macos-cmake.sh index 43eb1f21c3c..5ad76452950 100755 --- a/.ci/scripts/unittest-macos-cmake.sh +++ b/.ci/scripts/unittest-macos-cmake.sh @@ -12,8 +12,27 @@ set -eux export TORCHINDUCTOR_CACHE_DIR="$(mktemp -d "${RUNNER_TEMP:-/tmp}/torchinductor_cache_XXXXXX")" trap 'rm -rf "${TORCHINDUCTOR_CACHE_DIR}"' EXIT -# Run pytest with coverage -${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml +# AOTI-packaged .so invocation (torch._inductor.package._package.__call__) +# hangs on macOS CI runners. Skip every test that loads and calls an +# AOTI-packaged module on macOS until the hang is root-caused. +# TODO(SS-JIA): re-enable once AOTI hang is root-caused. +AOTI_SKIPS=( + --ignore=examples/models/llama3_2_vision/preprocess/test_preprocess.py + --ignore=examples/models/llama3_2_vision/vision_encoder/test/test_vision_encoder.py + --ignore=examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py + --deselect=extension/llm/modules/test/test_position_embeddings.py::TilePositionalEmbeddingTest::test_tile_positional_embedding_aoti + --deselect=extension/llm/modules/test/test_position_embeddings.py::TiledTokenPositionalEmbeddingTest::test_tiled_token_positional_embedding_aoti + --deselect=extension/llm/modules/test/test_attention.py::AttentionTest::test_attention_aoti +) + +# Run pytest with coverage. --timeout surfaces hung tests with a thread dump +# and faulthandler_timeout dumps the threads of a worker once a test in it +# has been running for 180s, so we can see which test is dragging before it +# trips the hard timeout. 
+${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml \ + --timeout=1500 --timeout-method=thread \ + -o faulthandler_timeout=180 \ + "${AOTI_SKIPS[@]}" # Run gtest LLVM_PROFDATA="xcrun llvm-profdata" LLVM_COV="xcrun llvm-cov" \ ${CONDA_RUN} test/run_oss_cpp_tests.sh diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index 15c87bd79e4..e00c3812adc 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -49,6 +49,7 @@ jobs: python-version: '3.11' submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 30 script: | set -eux # This is needed to get the prebuilt PyTorch wheel from S3