diff --git a/.ci/scripts/unittest-macos-cmake.sh b/.ci/scripts/unittest-macos-cmake.sh
index 43eb1f21c3c..9fd4a994bf2 100755
--- a/.ci/scripts/unittest-macos-cmake.sh
+++ b/.ci/scripts/unittest-macos-cmake.sh
@@ -6,14 +6,139 @@
 # LICENSE file in the root directory of this source tree.
 set -eux
 
-# Keep AOTInductor precompiled headers scoped to this job. The default cache
-# location can persist across macOS self-hosted runner jobs and produce stale
-# PCH failures after PyTorch is reinstalled.
+# =============================================================================
+# AOTI HANG DIAGNOSIS
+#
+# Run a single AOTI test that is known to hang on macOS CI.  A background
+# watchdog samples the native call stack every 60 s so we can see exactly
+# which C/C++ function the thread is blocked in (faulthandler only shows
+# Python frames and cannot fire when the GIL is held by native code).
+# =============================================================================
+
 export TORCHINDUCTOR_CACHE_DIR="$(mktemp -d "${RUNNER_TEMP:-/tmp}/torchinductor_cache_XXXXXX")"
 trap 'rm -rf "${TORCHINDUCTOR_CACHE_DIR}"' EXIT
 
-# Run pytest with coverage
-${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml
-# Run gtest
-LLVM_PROFDATA="xcrun llvm-profdata" LLVM_COV="xcrun llvm-cov" \
-${CONDA_RUN} test/run_oss_cpp_tests.sh
+# Force unbuffered output so every print appears immediately in the CI log.
+export PYTHONUNBUFFERED=1
+
+# ---------- instrumented test wrapper ----------
+cat > /tmp/aoti_diag.py << 'PYEOF'
+"""Run a single AOTI test with step-by-step timing instrumentation."""
+import json, os, sys, tempfile, time
+
+def log(msg):
+    elapsed = time.time() - _t0
+    print(f"[AOTI-DIAG +{elapsed:7.1f}s] {msg}", flush=True)
+
+_t0 = time.time()
+log("start")
+
+import torch
+log(f"torch {torch.__version__} loaded")
+
+from executorch.examples.models.llama3_2_vision.text_decoder.model import Llama3_2Decoder
+log("Llama3_2Decoder imported")
+
+params = {
+    "dim": 2048,
+    "ffn_dim_multiplier": 1.3,
+    "fusion_interval": 2,
+    "intermediate_dim": 14336,
+    "multiple_of": 1024,
+    "n_heads": 32,
+    "n_kv_heads": 8,
+    "n_layers": 2,
+    "n_special_tokens": 8,
+    "norm_eps": 1e-05,
+    "rope_theta": 500000.0,
+    "use_scaled_rope": True,
+    "vision_chunk_size": 560,
+    "vision_max_num_chunks": 4,
+    "vocab_size": 21008,
+    "vision_num_cross_attention_layers": 1,
+}
+
+with tempfile.NamedTemporaryFile(mode="w", suffix=".json") as f:
+    json.dump(params, f, indent=2); f.flush()
+    model = Llama3_2Decoder(
+        encoder_max_seq_len=6404,
+        generate_full_logits=True,
+        enable_dynamic_shape=True,
+        use_kv_cache=True,
+        params=f.name,
+        dtype=torch.float32,
+    )
+log("model constructed")
+
+encoder = model.get_eager_model().eval()
+for p in encoder.parameters():
+    p.requires_grad_(False)
+log("model eval + no_grad")
+
+example_inputs = model.get_example_inputs()
+example_kwargs = model.get_example_kwarg_inputs()
+
+# Step 1: torch.export
+log("step 1/4: torch.export.export ...")
+t = time.time()
+with torch.no_grad(), torch.inference_mode():
+    ep = torch.export.export(encoder, example_inputs, kwargs=example_kwargs, strict=True)
+log(f"step 1/4: torch.export.export done ({time.time()-t:.1f}s)")
+
+# Step 2: aoti_compile_and_package
+tmpdir = tempfile.mkdtemp()
+pkg_path = os.path.join(tmpdir, "text_decoder.pt2")
+log(f"step 2/4: aoti_compile_and_package -> {pkg_path} ...")
+t = time.time()
+path = torch._inductor.aoti_compile_and_package(ep, package_path=pkg_path)
+log(f"step 2/4: aoti_compile_and_package done ({time.time()-t:.1f}s)")
+
+# Step 3: aoti_load_package
+log("step 3/4: aoti_load_package ...")
+t = time.time()
+encoder_aoti = torch._inductor.aoti_load_package(path)
+log(f"step 3/4: aoti_load_package done ({time.time()-t:.1f}s)")
+
+# Step 4: inference
+log("step 4/4: inference ...")
+t = time.time()
+y = encoder_aoti(*example_inputs, **example_kwargs)
+log(f"step 4/4: inference done ({time.time()-t:.1f}s)")
+
+# Verify
+eager_res = encoder.forward(*example_inputs, **example_kwargs)
+torch.testing.assert_close(y, eager_res, rtol=1e-4, atol=1e-4)
+log("PASS — results match")
+PYEOF
+
+# ---------- run with background watchdog ----------
+# Start the test
+${CONDA_RUN} --no-capture-output python /tmp/aoti_diag.py &
+TEST_PID=$!
+
+# Watchdog: every 60s, if the test is still running, sample the native stack.
+(
+  while kill -0 "$TEST_PID" 2>/dev/null; do
+    sleep 60
+    if kill -0 "$TEST_PID" 2>/dev/null; then
+      echo ""
+      echo "===== WATCHDOG: native stack sample ($(date)) ====="
+      # sample captures C/C++ call stacks on macOS
+      sample "$TEST_PID" 1 2>&1 | head -200 || true
+      echo "===== END WATCHDOG ====="
+      echo ""
+    fi
+  done
+) &
+WATCHDOG_PID=$!
+
+# Wait for test, propagate exit code
+wait "$TEST_PID"
+EXIT_CODE=$?
+
+# Clean up watchdog
+kill "$WATCHDOG_PID" 2>/dev/null || true
+wait "$WATCHDOG_PID" 2>/dev/null || true
+
+echo "Test exited with code $EXIT_CODE"
+exit $EXIT_CODE
diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml
index 15c87bd79e4..71966db2038 100644
--- a/.github/workflows/_unittest.yml
+++ b/.github/workflows/_unittest.yml
@@ -26,20 +26,11 @@ on:
         default: '3.10'
 
 jobs:
-  linux:
-    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
-    permissions:
-      id-token: write
-      contents: read
-    with:
-      runner: linux.2xlarge.memory
-      docker-image: ${{ inputs.docker-image }}
-      submodules: 'recursive'
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 90
-      script: |
-        set -eux
-        .ci/scripts/unittest-linux.sh --build-tool "${{ inputs.build-tool }}" --build-mode "${{ inputs.build-mode }}" --editable "${{ inputs.editable }}"
+  # linux and windows disabled for AOTI hang diagnosis
+  # linux:
+  #   ...
+  # windows:
+  #   ...
 
   macos:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
@@ -54,32 +45,3 @@ jobs:
         # This is needed to get the prebuilt PyTorch wheel from S3
         ${CONDA_RUN} --no-capture-output pip install awscli==1.37.21
         .ci/scripts/unittest-macos.sh --build-tool "${{ inputs.build-tool }}" --build-mode "${{ inputs.build-mode }}" --editable "${{ inputs.editable }}"
-
-  windows:
-    if: ${{ inputs.build-tool == 'cmake' }}
-    uses: pytorch/test-infra/.github/workflows/windows_job.yml@main
-    with:
-      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      timeout: 120
-      script: |
-        git config --global http.sslBackend openssl
-        git submodule update --init --recursive 
-        conda init powershell
-
-        powershell -Command "& {
-          Set-PSDebug -Trace 1
-          \$ErrorActionPreference = 'Stop'
-          \$PSNativeCommandUseErrorActionPreference = \$true
-
-          .ci/scripts/setup-windows.ps1 -editable "${{ inputs.editable }}"
-          if (\$LASTEXITCODE -ne 0) {
-              Write-Host "Setup failed. Exit code: \$LASTEXITCODE."
-              exit \$LASTEXITCODE
-          }
-
-          .ci/scripts/unittest-windows.ps1 -buildMode "${{ inputs.build-mode }}"
-          if (\$LASTEXITCODE -ne 0) {
-              Write-Host "Unit tests failed. Exit code: \$LASTEXITCODE."
-              exit \$LASTEXITCODE
-          }
-        }"